From 4f701f6caa94003aeaf1ca961064717e6c23d31e Mon Sep 17 00:00:00 2001
From: unknown
Date: Sun, 28 Dec 2025 20:01:33 -0800
Subject: [PATCH] ...

---
 lib/classes/tts_engines/bark.py     | 21 ++++-----------------
 lib/classes/tts_engines/fairseq.py  | 18 ++++--------------
 lib/classes/tts_engines/tacotron.py | 18 ++++--------------
 lib/classes/tts_engines/vits.py     | 18 ++++--------------
 lib/classes/tts_engines/xtts.py     |  5 ++---
 lib/classes/tts_engines/yourtts.py  | 18 ++++--------------
 6 files changed, 22 insertions(+), 76 deletions(-)

diff --git a/lib/classes/tts_engines/bark.py b/lib/classes/tts_engines/bark.py
index 6bda8963..a0a36029 100644
--- a/lib/classes/tts_engines/bark.py
+++ b/lib/classes/tts_engines/bark.py
@@ -194,25 +194,12 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                 **tts_dyn_params,
                 **fine_tuned_params
             )
-            #audio_sentence = result.get('wav')
-            #if is_audio_data_valid(audio_sentence):
-            #    audio_sentence = audio_sentence.tolist()
             if is_audio_data_valid(audio_sentence):
-                if isinstance(audio_sentence, torch.Tensor):
-                    audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                elif isinstance(audio_sentence, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                elif isinstance(audio_sentence, (list, tuple)):
-                    audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                else:
-                    error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                    print(error)
-                    return False
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                sourceTensor = self._tensor_type(audio_sentence)
+                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
diff --git a/lib/classes/tts_engines/fairseq.py b/lib/classes/tts_engines/fairseq.py
index ab800c8f..4cbee35a 100644
--- a/lib/classes/tts_engines/fairseq.py
+++ b/lib/classes/tts_engines/fairseq.py
@@ -166,21 +166,11 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                 **speaker_argument
             )
             if is_audio_data_valid(audio_sentence):
-                if isinstance(audio_sentence, torch.Tensor):
-                    audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                elif isinstance(audio_sentence, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                elif isinstance(audio_sentence, (list, tuple)):
-                    audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                else:
-                    error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                    print(error)
-                    return False
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                sourceTensor = self._tensor_type(audio_sentence)
+                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
diff --git a/lib/classes/tts_engines/tacotron.py b/lib/classes/tts_engines/tacotron.py
index b22ed9f0..faf37060 100644
--- a/lib/classes/tts_engines/tacotron.py
+++ b/lib/classes/tts_engines/tacotron.py
@@ -194,21 +194,11 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                 **speaker_argument
             )
             if is_audio_data_valid(audio_sentence):
-                if isinstance(audio_sentence, torch.Tensor):
-                    audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                elif isinstance(audio_sentence, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                elif isinstance(audio_sentence, (list, tuple)):
-                    audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                else:
-                    error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                    print(error)
-                    return False
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                sourceTensor = self._tensor_type(audio_sentence)
+                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
diff --git a/lib/classes/tts_engines/vits.py b/lib/classes/tts_engines/vits.py
index 3fecf17d..1bd1954b 100644
--- a/lib/classes/tts_engines/vits.py
+++ b/lib/classes/tts_engines/vits.py
@@ -179,21 +179,11 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                 **speaker_argument
             )
             if is_audio_data_valid(audio_sentence):
-                if isinstance(audio_sentence, torch.Tensor):
-                    audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                elif isinstance(audio_sentence, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                elif isinstance(audio_sentence, (list, tuple)):
-                    audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                else:
-                    error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                    print(error)
-                    return False
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                sourceTensor = self._tensor_type(audio_sentence)
+                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
diff --git a/lib/classes/tts_engines/xtts.py b/lib/classes/tts_engines/xtts.py
index 0afe2411..e9ac0171 100644
--- a/lib/classes/tts_engines/xtts.py
+++ b/lib/classes/tts_engines/xtts.py
@@ -141,12 +141,11 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
             )
             audio_sentence = result.get('wav')
             if is_audio_data_valid(audio_sentence):
-                audio_sentence = result.get('wav')
                 sourceTensor = self._tensor_type(audio_sentence)
                 audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
diff --git a/lib/classes/tts_engines/yourtts.py b/lib/classes/tts_engines/yourtts.py
index 7da2d6a9..22686162 100644
--- a/lib/classes/tts_engines/yourtts.py
+++ b/lib/classes/tts_engines/yourtts.py
@@ -109,21 +109,11 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                 **speaker_argument
             )
             if is_audio_data_valid(audio_sentence):
-                if isinstance(audio_sentence, torch.Tensor):
-                    audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                elif isinstance(audio_sentence, np.ndarray):
-                    audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                elif isinstance(audio_sentence, (list, tuple)):
-                    audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                    audio_tensor = audio_tensor.cpu()
-                else:
-                    error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                    print(error)
-                    return False
-                if sentence[-1].isalnum() or sentence[-1] == '—':
-                    audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                sourceTensor = self._tensor_type(audio_sentence)
+                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
+                    if sentence[-1].isalnum() or sentence[-1] == '—':
+                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
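
Note (not part of the commit): the per-engine isinstance chains deleted above
are presumably what the shared TTSUtils._tensor_type helper centralizes; the
helper already existed before this patch (xtts.py was calling it, per the
unchanged context lines in its hunk), but its implementation is not shown in
this diff. A minimal sketch of such a helper, reconstructed from the deleted
branches rather than taken from the actual TTSUtils source:

    # Hypothetical reconstruction of TTSUtils._tensor_type; the real method
    # is not part of this patch. Logic mirrors the deleted isinstance chains.
    import numpy as np
    import torch

    class TTSUtils:
        def _tensor_type(self, audio_data):
            # Normalize whatever the TTS backend returned into a torch.Tensor.
            # Callers then apply .clone().detach().unsqueeze(0).cpu() to it.
            if isinstance(audio_data, torch.Tensor):
                return audio_data
            if isinstance(audio_data, np.ndarray):
                return torch.from_numpy(audio_data)
            if isinstance(audio_data, (list, tuple)):
                return torch.tensor(audio_data, dtype=torch.float32)
            # The deleted code printed an error and returned False instead of
            # raising; a TypeError is used here only to keep the sketch
            # self-contained.
            raise TypeError(f"Unsupported wav type: {type(audio_data)}")

Two behavioral details of the new code path are worth noting. First,
sourceTensor.clone().detach() is the idiom PyTorch itself recommends for
making an independent copy of an existing tensor. Second, the trim_audio call
now runs inside the numel() > 0 guard, so empty synthesis outputs are no
longer squeezed and trimmed before being discarded.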