Mirror of https://github.com/DrewThomasson/ebook2audiobook.git, synced 2026-01-08 21:38:12 -05:00
v25.12.29
@@ -1 +1 @@
-25.12.28
+25.12.29
@@ -194,25 +194,12 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                     **tts_dyn_params,
                     **fine_tuned_params
                 )
-                #audio_sentence = result.get('wav')
-                #if is_audio_data_valid(audio_sentence):
-                #    audio_sentence = audio_sentence.tolist()
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
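
Note: each engine in this commit replaces its per-type conversion ladder with a single call to self._tensor_type(). The helper itself is outside this diff; a minimal sketch of what it plausibly does, inferred from the branches it replaces (the behavior shown here is an assumption, not the repository's actual code):

    import numpy as np
    import torch

    def _tensor_type(self, audio_data):
        # Hypothetical sketch: coerce whatever the engine returned
        # (torch.Tensor, np.ndarray, or a list/tuple of samples) into
        # a torch.Tensor, mirroring the removed isinstance chain.
        if isinstance(audio_data, torch.Tensor):
            return audio_data
        if isinstance(audio_data, np.ndarray):
            return torch.from_numpy(audio_data).float()
        if isinstance(audio_data, (list, tuple)):
            return torch.tensor(audio_data, dtype=torch.float32)
        raise TypeError(f"Unsupported audio data type: {type(audio_data)}")

The call sites then do sourceTensor.clone().detach().unsqueeze(0).cpu(), so the helper only needs to return a tensor on any device; batching and device placement stay with the caller.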
@@ -1,4 +1,5 @@
 import os, threading, gc, torch, torchaudio, shutil, tempfile, regex as re, soundfile as sf, numpy as np
+from lib.classes.tts_engines.common.audio import is_audio_data_valid
 from typing import Any, Union, Dict
 from huggingface_hub import hf_hub_download
@@ -210,36 +211,27 @@ class TTSUtils:
                    **fine_tuned_params,
                )
                audio_sentence = result.get('wav')
-               if isinstance(audio_sentence, torch.Tensor):
-                   audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-               elif isinstance(audio_sentence, np.ndarray):
-                   audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                   audio_tensor = audio_tensor.cpu()
-               elif isinstance(audio_sentence, (list, tuple)):
-                   audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                   audio_tensor = audio_tensor.cpu()
-               else:
-                   error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                   print(error)
-                   return False
-               if audio_tensor is not None and audio_tensor.numel() > 0:
-                   # CON is a reserved name on windows
-                   lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
-                   new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
-                   proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
-                   torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[xtts]['samplerate'], format='wav')
-                   if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
-                       del audio_sentence, audio_tensor
-                       Path(proc_voice_path).unlink(missing_ok=True)
-                       gc.collect()
-                       self.engine = loaded_tts.get(self.tts_key, False)
-                       if not self.engine:
-                           self._load_engine()
-                       return new_voice_path
-                   else:
-                       error = 'normalize_audio() error:'
-               else:
-                   error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
+               if is_audio_data_valid(audio_sentence):
+                   sourceTensor = self._tensor_type(audio_sentence)
+                   audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                   if audio_tensor is not None and audio_tensor.numel() > 0:
+                       # CON is a reserved name on windows
+                       lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
+                       new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
+                       proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
+                       torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[xtts]['samplerate'], format='wav')
+                       if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
+                           del audio_sentence, sourceTensor, audio_tensor
+                           Path(proc_voice_path).unlink(missing_ok=True)
+                           gc.collect()
+                           self.engine = loaded_tts.get(self.tts_key, False)
+                           if not self.engine:
+                               self._load_engine()
+                           return new_voice_path
+                       else:
+                           error = 'normalize_audio() error:'
+                   else:
+                       error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
                else:
                    error = f"_check_xtts_builtin_speakers() error: {xtts} is False"
            else:
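
is_audio_data_valid() is imported here from lib.classes.tts_engines.common.audio, but its body is not part of this diff. Judging from how it gates the conversion, a guard of roughly this shape would fit (an assumption, not the actual implementation):

    import numpy as np
    import torch

    def is_audio_data_valid(audio_data):
        # Hypothetical guard: reject None and zero-length waveforms
        # before any tensor conversion is attempted.
        if audio_data is None:
            return False
        if isinstance(audio_data, torch.Tensor):
            return audio_data.numel() > 0
        if isinstance(audio_data, np.ndarray):
            return audio_data.size > 0
        if isinstance(audio_data, (list, tuple)):
            return len(audio_data) > 0
        return False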
@@ -166,21 +166,11 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
                     **speaker_argument
                 )
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
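
trim_audio() is called as trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer) but is defined outside this diff. A sketch of a trailing-silence trimmer with that signature (the implementation details are assumptions):

    import torch

    def trim_audio(audio, samplerate, threshold=0.001, buffer=0.004):
        # Hypothetical: drop trailing samples whose absolute amplitude
        # stays below `threshold`, keeping `buffer` seconds of padding.
        mask = audio.abs() > threshold
        if not mask.any():
            return audio
        last = int(mask.nonzero().max().item())
        pad = int(buffer * samplerate)
        return audio[: min(last + 1 + pad, audio.shape[0])]

The call sites squeeze() to 1-D before trimming and unsqueeze(0) afterwards, so a sketch like this only needs to handle a mono waveform.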
@@ -194,21 +194,11 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
                     **speaker_argument
                 )
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -179,21 +179,11 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
                     **speaker_argument
                 )
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -141,21 +141,11 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                 )
                 audio_sentence = result.get('wav')
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
@@ -109,21 +109,11 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                     **speaker_argument
                 )
                 if is_audio_data_valid(audio_sentence):
-                    if isinstance(audio_sentence, torch.Tensor):
-                        audio_tensor = audio_sentence.detach().cpu().unsqueeze(0)
-                    elif isinstance(audio_sentence, np.ndarray):
-                        audio_tensor = torch.from_numpy(audio_sentence).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    elif isinstance(audio_sentence, (list, tuple)):
-                        audio_tensor = torch.tensor(audio_sentence, dtype=torch.float32).unsqueeze(0)
-                        audio_tensor = audio_tensor.cpu()
-                    else:
-                        error = f"{self.session['tts_engine']}: Unsupported wav type: {type(audio_sentence)}"
-                        print(error)
-                        return False
-                    if sentence[-1].isalnum() or sentence[-1] == '—':
-                        audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
+                    sourceTensor = self._tensor_type(audio_sentence)
+                    audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                    if audio_tensor is not None and audio_tensor.numel() > 0:
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), self.params['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
                     self.audio_segments.append(audio_tensor)
                     if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
                         silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
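
All six engines end a sentence with silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100. The int(x * 100) / 100 idiom truncates (rather than rounds) to two decimal places, so the gap is effectively drawn from 0.30 to 0.59 seconds:

    import numpy as np

    x = 0.4567
    print(int(x * 100) / 100)  # 0.45 (truncated)
    print(round(x, 2))         # 0.46 (rounding would differ)
    silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100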
@@ -1118,12 +1118,14 @@ def build_interface(args:dict)->gr.Blocks:
                file_pattern = "*.wav"
                eng_options = []
                bark_options = []
+               builtin_dir = Path(os.path.join(voices_dir, lang_dir))
                builtin_options = [
-                   (os.path.splitext(f.name)[0], str(f))
-                   for f in Path(os.path.join(voices_dir, lang_dir)).rglob(file_pattern)
+                   (base, str(f))
+                   for f in builtin_dir.rglob(file_pattern)
+                   for base in [os.path.splitext(f.name)[0]]
                ]
                builtin_names = {t[0]: None for t in builtin_options}
                if session['language'] in default_engine_settings[TTS_ENGINES['XTTSv2']].get('languages', {}):
                    eng_dir = Path(os.path.join(voices_dir, "eng"))
                    eng_options = [
                        (base, str(f))
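
The rewritten comprehension uses a one-element for clause, for base in [os.path.splitext(f.name)[0]], the pre-walrus idiom for binding a name inside a comprehension. A standalone illustration (the paths are made up):

    import os
    from pathlib import Path

    files = [Path('voices/eng/alice.wav'), Path('voices/eng/bob.wav')]  # illustrative
    options = [
        (base, str(f))
        for f in files
        for base in [os.path.splitext(f.name)[0]]  # binds `base` once per file
    ]
    print(options)  # [('alice', 'voices/eng/alice.wav'), ('bob', 'voices/eng/bob.wav')]
    # Equivalent on Python 3.8+ with an assignment expression:
    options = [((base := os.path.splitext(f.name)[0]), str(f)) for f in files]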
@@ -1162,7 +1164,7 @@ def build_interface(args:dict)->gr.Blocks:
                    voice_options = [('Default', None)] + sorted(voice_options, key=lambda x: x[0].lower())
                else:
                    voice_options = sorted(voice_options, key=lambda x: x[0].lower())
-               if session['voice'] is not None:
+               if session['voice'] is not None and isinstance(session.get('voice'), str):
                    if session['voice_dir'] not in session['voice']:
                        if not any(v[1] == session['voice'] for v in voice_options):
                            voice_path = Path(session['voice'])
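
The added isinstance(session.get('voice'), str) hardens the guard against non-string state: the next line does a substring test with `in`, which silently changes meaning on a list and raises on a Path (the values below are hypothetical):

    from pathlib import Path

    voice_dir = 'voices'
    voice = ['voices/eng/alice.wav']   # membership test against list items, not substrings
    print(voice_dir not in voice)      # True, even though the path contains 'voices'
    voice = Path('voices/eng/alice.wav')
    # voice_dir not in voice           # TypeError: 'in' needs an iterable, Path is not
    voice = 'voices/eng/alice.wav'
    if voice is not None and isinstance(voice, str):
        print(voice_dir not in voice)  # False: substring test, as intended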
@@ -1332,7 +1334,7 @@ def build_interface(args:dict)->gr.Blocks:
                bark_visible = visible_gr_tab_bark_params
                return (
                    gr.update(value=show_rating(session['tts_engine'])),
                    gr.update(visible=False),
-                   gr.update(visible=visible_gr_tab_bark_params),
+                   gr.update(visible=bark_visible),
                    gr.update(visible=False),
                    update_gr_fine_tuned_list(id),