mirror of https://github.com/DrewThomasson/ebook2audiobook.git
@@ -124,7 +124,6 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
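Note: the break branch synthesizes a pause rather than speech. A minimal sketch of what the silence_time value drives, assuming a NumPy buffer and an illustrative 24 kHz rate (the engines read the real rate from self.params['samplerate']):

    import numpy as np

    # Random pause between 0.3 s and 0.6 s, truncated to two decimal
    # places (the *100 then /100 mirrors the line in the hunk above).
    silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
    samplerate = 24000  # illustrative only
    silence = np.zeros(int(silence_time * samplerate), dtype=np.float32)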
@@ -187,6 +186,7 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                     **fine_tuned_params
                 )
                 """
+                self.engine.to(device)
                 audio_sentence = self.engine.tts(
                     text=sentence,
                     speaker=speaker,
@@ -195,8 +195,8 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                     **fine_tuned_params
                 )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
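Note: taken together, the Bark hunks show the pattern this commit applies to every engine: the unconditional self.engine.to(device) done right after the device lookup is removed, and the move happens immediately before inference instead (wrapped in torch.no_grad() in the other engines). The variable rename sourceTensor to src_tensor is cosmetic. A minimal sketch of the resulting call shape, with engine standing in for any torch.nn.Module-backed Coqui wrapper (an assumption):

    import torch

    def synthesize(engine, sentence, device, **kwargs):
        # Device placement sits right next to the forward pass, so a model
        # offloaded in the meantime is always back on the right device;
        # no_grad keeps autograd state out of pure inference.
        with torch.no_grad():
            engine.to(device)
            return engine.tts(text=sentence, **kwargs)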
@@ -77,7 +77,6 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
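Note: every engine resolves its target device with the same conditional seen above. A sketch with illustrative stand-ins for the devices table and session (their real contents live elsewhere in the repo):

    # 'cuda' and 'jetson' both resolve to the CUDA processor string;
    # anything else ('cpu', 'mps', ...) passes through unchanged.
    devices = {'CUDA': {'proc': 'cuda:0'}}  # illustrative value
    session = {'device': 'jetson'}

    device = devices['CUDA']['proc'] if session['device'] in ['cuda', 'jetson'] else session['device']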
@@ -102,6 +101,7 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                     tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     with torch.no_grad():
+                        self.engine.to(device)
                         self.engine.tts_to_file(
                             text=re.sub(not_supported_punc_pattern, ' ', sentence),
                             file_path=tmp_in_wav,
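Note: the two uuid4-based paths give the synth-then-convert pipeline collision-free scratch files, one for raw TTS output and one for the processed result. A self-contained sketch (proc_dir is a placeholder for the session's processing directory):

    import os
    import uuid

    proc_dir = os.path.join('/tmp', 'proc')  # placeholder path
    os.makedirs(proc_dir, exist_ok=True)
    tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")   # raw TTS output
    tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")  # processed result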
@@ -145,6 +145,7 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                         self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                         source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                         target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                        self.engine_zs.to(device)
                         audio_sentence = self.engine_zs.voice_conversion(
                             source_wav=source_wav,
                             target_wav=target_wav
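Note: the zero-shot branch resamples both waveforms to the converter's native rate before mapping the source timbre onto the reference voice. A sketch of the same flow, with a librosa-based stand-in for the wrapper's _resample_wav helper (the stand-in is an assumption; voice_conversion(source_wav=..., target_wav=...) follows Coqui's API):

    import librosa
    import soundfile as sf

    def resample_wav(path, samplerate):
        # Load at the target rate and write a sibling file, roughly what
        # the _resample_wav helper called in the diff is expected to do.
        audio, _ = librosa.load(path, sr=samplerate)
        out_path = f"{path}.{samplerate}.wav"
        sf.write(out_path, audio, samplerate)
        return out_path

    def convert_voice(engine_zs, source_path, target_path, samplerate):
        # engine_zs is assumed to be a Coqui voice-conversion model wrapper.
        source_wav = resample_wav(source_path, samplerate)
        target_wav = resample_wav(target_path, samplerate)
        return engine_zs.voice_conversion(source_wav=source_wav, target_wav=target_wav)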
@@ -161,13 +162,14 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                         os.remove(source_wav)
                 else:
                     with torch.no_grad():
+                        self.engine.to(device)
                         audio_sentence = self.engine.tts(
                             text=re.sub(not_supported_punc_pattern, ' ', sentence),
                             **speaker_argument
                         )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
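Note: the trailing lines of each engine normalize whatever the model returned into a batched CPU float tensor, then trim trailing silence only when the sentence does not already end in pause punctuation. A sketch, with torch.as_tensor standing in for the wrapper's _tensor_type helper (an assumption):

    import torch

    def to_audio_tensor(audio_sentence):
        # Detached, batched (1, n) CPU copy of the raw engine output.
        src_tensor = torch.as_tensor(audio_sentence, dtype=torch.float32)
        return src_tensor.clone().detach().unsqueeze(0).cpu()

    def ends_mid_speech(sentence):
        # A final letter/digit or em-dash means no natural pause follows,
        # so synthesized trailing silence should be cut.
        return sentence[-1].isalnum() or sentence[-1] == '—'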
@@ -102,7 +102,6 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -130,6 +129,7 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                     tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     with torch.no_grad():
+                        self.engine.to(device)
                         self.engine.tts_to_file(
                             text=re.sub(not_supported_punc_pattern, ' ', sentence),
                             file_path=tmp_in_wav,
@@ -173,6 +173,7 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                         self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                         source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                         target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                        self.engine_zs.to(device)
                         audio_sentence = self.engine_zs.voice_conversion(
                             source_wav=source_wav,
                             target_wav=target_wav
@@ -189,13 +190,14 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                         os.remove(source_wav)
                 else:
                     with torch.no_grad():
+                        self.engine.to(device)
                         audio_sentence = self.engine.tts(
                             text=re.sub(not_supported_punc_pattern, ' ', sentence),
                             **speaker_argument
                         )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
@@ -85,7 +85,6 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -115,6 +114,7 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                     tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                     with torch.no_grad():
+                        self.engine.to(device)
                         self.engine.tts_to_file(
                             text=sentence,
                             file_path=tmp_in_wav,
@@ -158,6 +158,7 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                         self.params['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
                         source_wav = self._resample_wav(tmp_out_wav, self.params['samplerate'])
                         target_wav = self._resample_wav(self.params['voice_path'], self.params['samplerate'])
+                        self.engine_zs.to(device)
                         audio_sentence = self.engine_zs.voice_conversion(
                             source_wav=source_wav,
                             target_wav=target_wav
@@ -174,13 +175,14 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                         os.remove(source_wav)
                 else:
                     with torch.no_grad():
+                        self.engine.to(device)
                         audio_sentence = self.engine.tts(
                             text=sentence,
                             **speaker_argument
                         )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
@@ -85,7 +85,6 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -132,6 +131,7 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                     if self.session.get(key) is not None
                 }
                 with torch.no_grad():
+                    self.engine.to(device)
                     result = self.engine.inference(
                         text=sentence,
                         language=self.session['language_iso1'],
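Note: unlike the engines whose tts() returns the waveform directly, XTTS inference() returns a dict, so the waveform is fetched with result.get('wav') and simply comes back as None on failure. A sketch of the call shape (the conditioning argument names follow Coqui's XTTS API; engine construction is elided):

    def xtts_sentence(engine, sentence, language, gpt_cond_latent, speaker_embedding):
        result = engine.inference(
            text=sentence,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
        )
        # .get('wav') rather than ['wav']: a failed synthesis yields None
        # instead of raising, which is_audio_data_valid then rejects.
        return result.get('wav')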
@@ -141,8 +141,8 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                     )
                     audio_sentence = result.get('wav')
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
@@ -76,7 +76,6 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                 return False
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
-            self.engine.to(device)
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -103,14 +102,15 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                     voice_key = default_engine_settings[self.session['tts_engine']]['voices']['ElectroMale-2']
                     speaker_argument = {"speaker": voice_key}
                 with torch.no_grad():
+                    self.engine.to(device)
                     audio_sentence = self.engine.tts(
                         text=re.sub(not_supported_punc_pattern, ' ', sentence),
                         language=language,
                         **speaker_argument
                     )
             if is_audio_data_valid(audio_sentence):
-                sourceTensor = self._tensor_type(audio_sentence)
-                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                src_tensor = self._tensor_type(audio_sentence)
+                audio_tensor = src_tensor.clone().detach().unsqueeze(0).cpu()
                 if audio_tensor is not None and audio_tensor.numel() > 0:
                     if sentence[-1].isalnum() or sentence[-1] == '—':
                         audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
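Note: YourTTS falls back to a bundled speaker when no cloning sample is supplied: a key is looked up in the engine's default settings and forwarded through **speaker_argument. A sketch with an illustrative settings table (the real one lives in the repo's configuration):

    default_engine_settings = {
        'yourtts': {'voices': {'ElectroMale-2': 'male-en-2'}}  # illustrative value
    }

    voice_key = default_engine_settings['yourtts']['voices']['ElectroMale-2']
    speaker_argument = {"speaker": voice_key}  # expanded into engine.tts(**speaker_argument)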