From 29d040e19d5a4db0b7d70add534d7fe8850ac428 Mon Sep 17 00:00:00 2001
From: unknown
Date: Mon, 29 Dec 2025 04:12:44 -0800
Subject: [PATCH] ...

---
 lib/classes/tts_engines/bark.py     | 6 +++---
 lib/classes/tts_engines/fairseq.py  | 8 +++++---
 lib/classes/tts_engines/tacotron.py | 8 +++++---
 lib/classes/tts_engines/vits.py     | 8 +++++---
 lib/classes/tts_engines/xtts.py     | 6 +++---
 lib/classes/tts_engines/yourtts.py  | 6 +++---
 6 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/lib/classes/tts_engines/bark.py b/lib/classes/tts_engines/bark.py
index a0a36029..b8f153d1 100644
--- a/lib/classes/tts_engines/bark.py
+++ b/lib/classes/tts_engines/bark.py
@@ -124,7 +124,6 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -187,6 +186,7 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                     **fine_tuned_params
                 )
                 """
+                self.engine.to(device)
                 audio_sentence = self.engine.tts(
                     text=sentence,
                     speaker=speaker,
@@ -195,8 +195,8 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                     **fine_tuned_params
                 )
                 if is_audio_data_valid(audio_sentence):
-                    sourceTensor = self._tensor_type(audio_sentence)
-                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    src_tensor = self._tensor_type(audio_sentence)
+                    audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                     if audio_tensor is not None and audio_tensor.numel() > 0:
                         if sentence[-1].isalnum() or sentence[-1] == '—':
                             audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
diff --git a/lib/classes/tts_engines/fairseq.py b/lib/classes/tts_engines/fairseq.py
index 4cbee35a..2c5f5d97 100644
--- a/lib/classes/tts_engines/fairseq.py
+++ b/lib/classes/tts_engines/fairseq.py
@@ -77,7 +77,6 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -102,6 +101,7 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                 tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 with torch.no_grad():
+                    self.engine.to(device)
                     self.engine.tts_to_file(
                         text=re.sub(not_supported_punc_pattern, ' ', sentence),
                         file_path=tmp_in_wav,
@@ -145,6 +145,7 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                     self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                     source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                     target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                    self.engine_zs.to(device)
                     audio_sentence = self.engine_zs.voice_conversion(
                         source_wav=source_wav,
                         target_wav=target_wav
@@ -161,13 +162,14 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                         os.remove(source_wav)
             else:
                 with torch.no_grad():
+                    self.engine.to(device)
                     audio_sentence = self.engine.tts(
                         text=re.sub(not_supported_punc_pattern, ' ', sentence),
                         **speaker_argument
                     )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
diff --git a/lib/classes/tts_engines/tacotron.py b/lib/classes/tts_engines/tacotron.py
index faf37060..0347c8e6 100644
--- a/lib/classes/tts_engines/tacotron.py
+++ b/lib/classes/tts_engines/tacotron.py
@@ -102,7 +102,6 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -130,6 +129,7 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                 tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 with torch.no_grad():
+                    self.engine.to(device)
                     self.engine.tts_to_file(
                         text=re.sub(not_supported_punc_pattern, ' ', sentence),
                         file_path=tmp_in_wav,
@@ -173,6 +173,7 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                     self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                     source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                     target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                    self.engine_zs.to(device)
                     audio_sentence = self.engine_zs.voice_conversion(
                         source_wav=source_wav,
                         target_wav=target_wav
@@ -189,13 +190,14 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                         os.remove(source_wav)
             else:
                 with torch.no_grad():
+                    self.engine.to(device)
                     audio_sentence = self.engine.tts(
                         text=re.sub(not_supported_punc_pattern, ' ', sentence),
                         **speaker_argument
                     )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
diff --git a/lib/classes/tts_engines/vits.py b/lib/classes/tts_engines/vits.py
index 1bd1954b..e04be172 100644
--- a/lib/classes/tts_engines/vits.py
+++ b/lib/classes/tts_engines/vits.py
@@ -85,7 +85,6 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -115,6 +114,7 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                 tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                 with torch.no_grad():
+                    self.engine.to(device)
                     self.engine.tts_to_file(
                         text=sentence,
                         file_path=tmp_in_wav,
@@ -158,6 +158,7 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                     self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                     source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                     target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                    self.engine_zs.to(device)
                     audio_sentence = self.engine_zs.voice_conversion(
                         source_wav=source_wav,
                         target_wav=target_wav
@@ -174,13 +175,14 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                         os.remove(source_wav)
             else:
                 with torch.no_grad():
+                    self.engine.to(device)
                     audio_sentence = self.engine.tts(
                         text=sentence,
                         **speaker_argument
                     )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
diff --git a/lib/classes/tts_engines/xtts.py b/lib/classes/tts_engines/xtts.py
index e9ac0171..42d8738e 100644
--- a/lib/classes/tts_engines/xtts.py
+++ b/lib/classes/tts_engines/xtts.py
@@ -85,7 +85,6 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -132,6 +131,7 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                     if self.session.get(key) is not None
                 }
                 with torch.no_grad():
+                    self.engine.to(device)
                     result = self.engine.inference(
                         text=sentence,
                         language=self.session['language_iso1'],
@@ -141,8 +141,8 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                     )
                 audio_sentence = result.get('wav')
                 if is_audio_data_valid(audio_sentence):
-                    sourceTensor = self._tensor_type(audio_sentence)
-                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    src_tensor = self._tensor_type(audio_sentence)
+                    audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                     if audio_tensor is not None and audio_tensor.numel() > 0:
                         if sentence[-1].isalnum() or sentence[-1] == '—':
                             audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
diff --git a/lib/classes/tts_engines/yourtts.py b/lib/classes/tts_engines/yourtts.py
index 22686162..9f03793d 100644
--- a/lib/classes/tts_engines/yourtts.py
+++ b/lib/classes/tts_engines/yourtts.py
@@ -76,7 +76,6 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
             return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -103,14 +102,15 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                     voice_key = default_engine_settings[self.session['tts_engine']]['voices']['ElectroMale-2']
                     speaker_argument = {"speaker": voice_key}
             with torch.no_grad():
+                self.engine.to(device)
                 audio_sentence = self.engine.tts(
                     text=re.sub(not_supported_punc_pattern, ' ', sentence),
                     language=language,
                     **speaker_argument
                 )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
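
The change is the same in all six engines: the unconditional self.engine.to(device) at the top of the synthesis method is deferred until inference is actually about to run, and the zero-shot voice-conversion model (self.engine_zs) is moved to the device only on the branch that uses it, so neither model occupies GPU memory on paths that never invoke it (for example, the TTS_SML['break'] branch, which only emits silence). Below is a minimal sketch of that lazy-placement pattern; the class and argument names are illustrative stand-ins, not the project's real API, though the tts/voice_conversion method names mirror the Coqui-style calls seen in the patch.

    import torch

    class LazyPlacementSketch:
        # Illustrative only: engine and engine_zs stand in for the TTS and
        # voice-conversion models used in the patch.
        def __init__(self, engine, engine_zs, device):
            self.engine = engine
            self.engine_zs = engine_zs
            self.device = device

        def synthesize(self, sentence, use_voice_conversion=False):
            # Before the patch, self.engine.to(self.device) ran here, so the
            # model was placed on the GPU even for branches (such as emitting
            # a silence break) that never touch it.
            if sentence == '[break]':
                return torch.zeros(1, 24000)  # one second of silence; no GPU work
            with torch.no_grad():
                self.engine.to(self.device)  # placed just-in-time, as in the patch
                audio = self.engine.tts(text=sentence)
            if use_voice_conversion:
                # The VC model is placed on the device only when this branch runs.
                self.engine_zs.to(self.device)
                audio = self.engine_zs.voice_conversion(
                    source_wav=audio, target_wav='target.wav'
                )
            return audio

Since torch's Module.to() is a no-op when the model is already on the requested device, repeating the call per sentence costs little, while deferring it keeps idle engines off the GPU.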