Mirror of https://github.com/xtekky/gpt4free.git (synced 2025-12-15 14:51:19 -08:00)

Add project files
Commit 90715e702b (parent 8321dca121)
17 changed files with 4301 additions and 20 deletions
105  projects/text_to_speech/worker.js  Normal file
@@ -0,0 +1,105 @@
import { env, Tensor, AutoTokenizer, SpeechT5ForTextToSpeech, SpeechT5HifiGan } from '@xenova/transformers';
import { encodeWAV } from './utils';

// Disable local model checks
env.allowLocalModels = false;

// Use the Singleton pattern to enable lazy construction of the pipeline.
class MyTextToSpeechPipeline {

    static BASE_URL = 'https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/';

    static model_id = 'Xenova/speecht5_tts';
    static vocoder_id = 'Xenova/speecht5_hifigan';

    static tokenizer_instance = null;
    static model_instance = null;
    static vocoder_instance = null;

    static async getInstance(progress_callback = null) {
        if (this.tokenizer_instance === null) {
            // Assign to tokenizer_instance (not this.tokenizer) so the null check above actually caches it.
            this.tokenizer_instance = AutoTokenizer.from_pretrained(this.model_id, { progress_callback });
        }

        if (this.model_instance === null) {
            this.model_instance = SpeechT5ForTextToSpeech.from_pretrained(this.model_id, {
                quantized: false,
                progress_callback,
            });
        }

        if (this.vocoder_instance === null) {
            this.vocoder_instance = SpeechT5HifiGan.from_pretrained(this.vocoder_id, {
                quantized: false,
                progress_callback,
            });
        }

        // Wait for the tokenizer, model, and vocoder to finish loading, then notify the main thread.
        const result = await Promise.all([
            this.tokenizer_instance,
            this.model_instance,
            this.vocoder_instance,
        ]);
        self.postMessage({
            status: 'ready',
        });
        return result;
    }

    static async getSpeakerEmbeddings(speaker_id) {
        // e.g., `cmu_us_awb_arctic-wav-arctic_a0001`
        const speaker_embeddings_url = `${this.BASE_URL}${speaker_id}.bin`;
        const speaker_embeddings = new Tensor(
            'float32',
            new Float32Array(await (await fetch(speaker_embeddings_url)).arrayBuffer()),
            [1, 512],
        );
        return speaker_embeddings;
    }
}

// Mapping of cached speaker embeddings
const speaker_embeddings_cache = new Map();

// Listen for messages from the main thread
self.addEventListener('message', async (event) => {
    // Load the pipeline, forwarding model-loading progress to the main thread.
    const [tokenizer, model, vocoder] = await MyTextToSpeechPipeline.getInstance(x => {
        self.postMessage(x);
    });

    // Tokenize the input
    const { input_ids } = tokenizer(event.data.text);

    // Load the speaker embeddings, caching them per speaker_id
    let speaker_embeddings = speaker_embeddings_cache.get(event.data.speaker_id);
    if (speaker_embeddings === undefined) {
        speaker_embeddings = await MyTextToSpeechPipeline.getSpeakerEmbeddings(event.data.speaker_id);
        speaker_embeddings_cache.set(event.data.speaker_id, speaker_embeddings);
    }

    // Generate the waveform
    let response;
    try {
        response = await model.generate_speech(input_ids, speaker_embeddings, { vocoder });
    } catch (e) {
        self.postMessage({
            status: 'error',
            exception: e,
        });
        throw e;
    }
    const { waveform } = response;

    // Encode the waveform as a WAV file
    const wav = encodeWAV(waveform.data);

    // Send the output back to the main thread
    self.postMessage({
        status: 'complete',
        output: new Blob([wav], { type: 'audio/wav' }),
    });
});
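The encodeWAV helper imported from './utils' belongs to the same commit but is not shown in this excerpt. A minimal sketch of such a helper for this worker's output (a mono Float32Array, assuming SpeechT5's 16 kHz sample rate and a 32-bit float WAV container) might look like the following; the sample rate, format choice, and function body are assumptions, not the committed implementation.

// Hypothetical sketch of an encodeWAV helper: wraps Float32 samples in a
// mono, 32-bit IEEE-float WAV container (format code 3) at 16 kHz.
function encodeWAV(samples, sampleRate = 16000) {
    const bytesPerSample = 4;
    const buffer = new ArrayBuffer(44 + samples.length * bytesPerSample);
    const view = new DataView(buffer);

    const writeString = (offset, str) => {
        for (let i = 0; i < str.length; ++i) {
            view.setUint8(offset + i, str.charCodeAt(i));
        }
    };

    writeString(0, 'RIFF');                                        // RIFF identifier
    view.setUint32(4, 36 + samples.length * bytesPerSample, true); // RIFF chunk length
    writeString(8, 'WAVE');                                        // RIFF type
    writeString(12, 'fmt ');                                       // format chunk identifier
    view.setUint32(16, 16, true);                                  // format chunk length
    view.setUint16(20, 3, true);                                   // sample format: IEEE float
    view.setUint16(22, 1, true);                                   // channel count: mono
    view.setUint32(24, sampleRate, true);                          // sample rate
    view.setUint32(28, sampleRate * bytesPerSample, true);         // byte rate
    view.setUint16(32, bytesPerSample, true);                      // block align
    view.setUint16(34, 32, true);                                  // bits per sample
    writeString(36, 'data');                                       // data chunk identifier
    view.setUint32(40, samples.length * bytesPerSample, true);     // data chunk length

    // Write the raw samples after the 44-byte header.
    for (let i = 0; i < samples.length; ++i) {
        view.setFloat32(44 + i * bytesPerSample, samples[i], true);
    }
    return buffer;
}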
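For context, a main thread could drive this worker roughly as sketched below. The message shapes mirror what the worker reads and posts above ({ text, speaker_id } in; progress events, 'ready', then 'complete' with a WAV Blob or 'error' out), but the page-side wiring itself is not part of this file and is only an assumed example.

// Hypothetical main-thread usage of the worker above.
const worker = new Worker(new URL('./worker.js', import.meta.url), { type: 'module' });

worker.addEventListener('message', (event) => {
    const data = event.data;
    switch (data.status) {
        case 'ready':
            console.log('Models loaded');
            break;
        case 'complete': {
            // Play the generated speech from the returned WAV Blob.
            const audio = new Audio(URL.createObjectURL(data.output));
            audio.play();
            break;
        }
        case 'error':
            console.error('Speech generation failed', data.exception);
            break;
        default:
            // Model-loading progress events forwarded by the progress callback.
            console.log('Loading progress', data);
    }
});

// Request speech for some text with one of the CMU ARCTIC speaker embeddings.
worker.postMessage({
    text: 'Hello from the text to speech worker.',
    speaker_id: 'cmu_us_awb_arctic-wav-arctic_a0001',
});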