added voice cloning to piper

2026-01-09 13:58:14 -05:00 · 2025-08-08 01:32:37 -04:00
parent 6cb8f7ceaa
commit 3a702a0f2e
1 changed files with 133 additions and 18 deletions
--- a/lib/classes/tts_engines/piper.py
+++ b/lib/classes/tts_engines/piper.py
@@ -63,7 +63,8 @@ class Coqui:
            self.npz_data = None
            self.sentences_total_time = 0.0
            self.sentence_idx = 1
-            self.params = {TTS_ENGINES['PIPER']: {}}
+            # Add semitones cache to support voice cloning pitch adaptation (like VITS path)
+            self.params = {TTS_ENGINES['PIPER']: {"semitones": {}}}
            self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
            self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')    
            self.resampler_cache = {}
@@ -86,6 +87,15 @@ class Coqui:
                    else:
                        model_path = self._get_default_model_path()
                        tts = self._load_api(self.tts_key, model_path, self.session['device'])
+
+            # If a reference voice is provided, attempt to load the voice conversion engine
+            # to enable "cloning" via VC, similar to the VITS path.
+            if self.session.get('voice'):
+                tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
+                if not tts_vc:
+                    # Load VC model using a generic TTS API loader (separate from Piper)
+                    tts_vc = self._load_vc_engine(self.tts_vc_key, default_vc_model, self.session['device'])
+
            return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
        except Exception as e:
            error = f'build() error: {e}'
@@ -115,6 +125,7 @@ class Coqui:
        return model_name

    def _load_api(self, key, model_path, device):
+        # Piper TTS model loader
        global lock
        try:
            if key in loaded_tts.keys():
@@ -137,6 +148,35 @@ class Coqui:
            print(error)
        return False

+    def _load_vc_engine(self, key, model_path, device):
+        # Generic TTS/VC model loader (used for voice conversion engine), similar to coqui.py
+        global lock
+        try:
+            if key in loaded_tts.keys():
+                return loaded_tts[key]['engine']
+            unload_tts(device, [self.tts_vc_key])
+            with lock:
+                try:
+                    from TTS.api import TTS as CoquiAPI  # Lazy import to avoid hard dependency if not needed
+                except Exception as e:
+                    print(f"Voice conversion dependencies not available: {e}. Install Coqui TTS to enable cloning.")
+                    return False
+                tts = CoquiAPI(model_path)
+                if tts:
+                    if device == 'cuda':
+                        tts.cuda()
+                    else:
+                        tts.to(device)
+                    loaded_tts[key] = {"engine": tts, "config": None}
+                    msg = f'{model_path} (VC) Loaded!'
+                    print(msg)
+                    return tts
+                else:
+                    print('VC engine could not be created!')
+        except Exception as e:
+            print(f'_load_vc_engine() error: {e}')
+        return False
+
    def _download_model(self, model_name):
        """Download Piper model files from HuggingFace"""
        try:
@@ -167,6 +207,7 @@ class Coqui:
            else:
                # Fallback for unexpected format
                hf_path = f"en/en_US/lessac/medium/"
+                lang_code = "en"
            
            repo_id = "rhasspy/piper-voices"
            
@@ -270,6 +311,17 @@ class Coqui:
        sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
        return tmp_path

+    def _synthesize_with_piper(self, tts, sentence):
+        """Synthesize a sentence with Piper and return a numpy array."""
+        audio_chunks = list(tts.synthesize(sentence))
+        if audio_chunks:
+            audio_arrays = []
+            for chunk in audio_chunks:
+                audio_arrays.append(chunk.audio_float_array)
+            if audio_arrays:
+                return np.concatenate(audio_arrays)
+        return np.array([])
+
    def convert(self, sentence_number, sentence):
        try:
            speaker = None
@@ -278,10 +330,13 @@ class Coqui:
            settings = self.params[self.session['tts_engine']]
            final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
            sentence = sentence.strip()
-            
+
+            # Reference voice path (if provided) enables VC-based cloning akin to VITS path
+            settings['voice_path'] = self.session.get('voice', None)
+
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if tts:
-                if sentence[-1].isalnum():
+                if sentence and sentence[-1].isalnum():
                    sentence = f'{sentence} —'
                if sentence == TTS_SML['break']:
                    break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100))) # 0.4 to 0.7 seconds
@@ -292,22 +347,82 @@ class Coqui:
                    self.audio_segments.append(pause_tensor.clone())
                    return True
                else:
-                    if self.session['tts_engine'] == TTS_ENGINES['PIPER']:
-                        # Use Piper TTS to synthesize audio
-                        audio_chunks = list(tts.synthesize(sentence))
-                        if audio_chunks:
-                            # Concatenate audio chunks
-                            audio_arrays = []
-                            for chunk in audio_chunks:
-                                audio_arrays.append(chunk.audio_float_array)
-                            
-                            if audio_arrays:
-                                audio_sentence = np.concatenate(audio_arrays)
+                    # First, synthesize with Piper
+                    audio_sentence = self._synthesize_with_piper(tts, sentence)
+
+                    # If a reference voice is provided, perform VC-based cloning like VITS path
+                    if settings.get('voice_path'):
+                        try:
+                            proc_dir = os.path.join(self.session['voice_dir'], 'proc')
+                            os.makedirs(proc_dir, exist_ok=True)
+                            tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
+                            tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
+
+                            # Save Piper synthesis to tmp_in_wav
+                            if is_audio_data_valid(audio_sentence):
+                                sf.write(tmp_in_wav, audio_sentence, settings['samplerate'], subtype="PCM_16")
                            else:
-                                audio_sentence = np.array([])
-                        else:
-                            audio_sentence = np.array([])
-                            
+                                # Nothing synthesized, fall back to silent segment
+                                sf.write(tmp_in_wav, np.zeros(int(settings['samplerate'] * 0.1)), settings['samplerate'], subtype="PCM_16")
+
+                            # Determine and cache semitones shift based on gender detection
+                            if settings['voice_path'] in settings['semitones'].keys():
+                                semitones = settings['semitones'][settings['voice_path']]
+                            else:
+                                voice_path_gender = detect_gender(settings['voice_path'])
+                                voice_builtin_gender = detect_gender(tmp_in_wav)
+                                msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
+                                print(msg)
+                                if voice_builtin_gender != voice_path_gender and voice_path_gender in ['male', 'female'] and voice_builtin_gender in ['male', 'female']:
+                                    semitones = -4 if voice_path_gender == 'male' else 4
+                                    msg = f"Adapting builtin voice frequencies from the clone voice..."
+                                    print(msg)
+                                else:
+                                    semitones = 0
+                                settings['semitones'][settings['voice_path']] = semitones
+
+                            # Pitch shift if needed using sox
+                            if semitones > 0:
+                                try:
+                                    cmd = [
+                                        shutil.which('sox'), tmp_in_wav,
+                                        "-r", str(settings['samplerate']), tmp_out_wav,
+                                        "pitch", str(semitones * 100)
+                                    ]
+                                    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False)
+                                except Exception as e:
+                                    print(f"Pitch shift failed: {e}")
+                                    tmp_out_wav = tmp_in_wav
+                            else:
+                                tmp_out_wav = tmp_in_wav
+
+                            # Load VC engine and perform conversion
+                            tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
+                            if not tts_vc:
+                                tts_vc = self._load_vc_engine(self.tts_vc_key, default_vc_model, self.session['device'])
+                            if tts_vc:
+                                # Use VC samplerate
+                                settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
+                                source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
+                                target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
+                                audio_sentence = tts_vc.voice_conversion(
+                                    source_wav=source_wav,
+                                    target_wav=target_wav
+                                )
+                                # Cleanup
+                                if os.path.exists(source_wav) and source_wav != tmp_out_wav:
+                                    os.remove(source_wav)
+                            else:
+                                print(f'Engine {self.tts_vc_key} is None; skipping voice cloning.')
+
+                            # Cleanup temp files
+                            if os.path.exists(tmp_in_wav):
+                                os.remove(tmp_in_wav)
+                            if os.path.exists(tmp_out_wav) and tmp_out_wav != tmp_in_wav:
+                                os.remove(tmp_out_wav)
+                        except Exception as e:
+                            print(f"Piper VC cloning path failed, proceeding with Piper voice only. Error: {e}")
+
                    if is_audio_data_valid(audio_sentence):
                        sourceTensor = self._tensor_type(audio_sentence)
                        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()