v25.12.25

This commit is contained in:
ROBERT MCDOWELL
2025-12-23 20:42:30 -08:00
committed by GitHub
35 changed files with 2898 additions and 4818 deletions

53
.github/languages.yml vendored Normal file
View File

@@ -0,0 +1,53 @@
languages:
- code: ar # Arabic
name: Arabic (ara)
- code: zh-CN # Chinese (Simplified)
name: Chinese (zho)
- code: en # English
name: English (eng)
- code: es # Spanish
name: Spanish (spa)
- code: fr # French
name: French (fra)
- code: de # German
name: German (deu)
- code: it # Italian
name: Italian (ita)
- code: pt # Portuguese
name: Portuguese (por)
- code: pl # Polish
name: Polish (pol)
- code: tr # Turkish
name: Turkish (tur)
- code: ru # Russian
name: Russian (rus)
- code: nl # Dutch
name: Dutch (nld)
- code: cs # Czech
name: Czech (ces)
- code: ja # Japanese
name: Japanese (jpn)
- code: hi # Hindi
name: Hindi (hin)
- code: bn # Bengali
name: Bengali (ben)
- code: hu # Hungarian
name: Hungarian (hun)
- code: ko # Korean
name: Korean (kor)
- code: vi # Vietnamese
name: Vietnamese (vie)
- code: sv # Swedish
name: Swedish (swe)
- code: fa # Persian
name: Persian (fas)
- code: yo # Yoruba
name: Yoruba (yor)
- code: sw # Swahili
name: Swahili (swa)
- code: id # Indonesian
name: Indonesian (ind)
- code: sk # Slovak
name: Slovak (slk)
- code: hr # Croatian
name: Croatian (hrv)

View File

@@ -1 +1 @@
25.12.20
25.12.25

24
app.py
View File

@@ -1,8 +1,8 @@
import argparse, socket, multiprocessing, sys, warnings
from lib.conf import *
from lib.lang import default_language_code
from lib.models import TTS_ENGINES, default_fine_tuned, default_engine_settings
from lib.conf_lang import default_language_code
from lib.conf_models import TTS_ENGINES, default_fine_tuned, default_engine_settings
warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="jieba._compat")
@@ -234,10 +234,10 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
error = f'Error: Could not installed device packages!'
print(error)
sys.exit(1)
import lib.functions as f
f.context = f.SessionContext() if f.context is None else f.context
f.context_tracker = f.SessionTracker() if f.context_tracker is None else f.context_tracker
f.active_sessions = set() if f.active_sessions is None else f.active_sessions
import lib.core as c
c.context = c.SessionContext() if c.context is None else c.context
c.context_tracker = c.SessionTracker() if c.context_tracker is None else c.context_tracker
c.active_sessions = set() if c.active_sessions is None else c.active_sessions
# Conditions based on the --headless flag
if args['headless']:
args['is_gui_process'] = False
@@ -292,7 +292,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
if any(file.endswith(ext) for ext in ebook_formats):
full_path = os.path.abspath(os.path.join(args['ebooks_dir'], file))
args['ebook_list'].append(full_path)
progress_status, passed = f.convert_ebook_batch(args)
progress_status, passed = c.convert_ebook_batch(args)
if passed is False:
error = f'Conversion failed: {progress_status}'
print(error)
@@ -303,7 +303,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
error = f'Error: The provided --ebook "{args["ebook"]}" does not exist.'
print(error)
sys.exit(1)
progress_status, passed = f.convert_ebook(args)
progress_status, passed = c.convert_ebook(args)
if passed is False:
error = f'Conversion failed: {progress_status}'
print(error)
@@ -334,16 +334,16 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
)
except OSError as e:
error = f'Connection error: {e}'
f.alert_exception(error, None)
c.alert_exception(error, None)
except socket.error as e:
error = f'Socket error: {e}'
f.alert_exception(error, None)
c.alert_exception(error, None)
except KeyboardInterrupt:
error = 'Server interrupted by user. Shutting down...'
f.alert_exception(error, None)
c.alert_exception(error, None)
except Exception as e:
error = f'An unexpected error occurred: {e}'
f.alert_exception(error, None)
c.alert_exception(error, None)
else:
error = 'Error: In GUI mode, no option or only --share can be passed'
print(error)

View File

@@ -71,12 +71,14 @@ while (( $# > 0 )); do
case "$1" in
--*)
key="${1#--}"
if [[ -n "$2" && "$2" != --* ]]; then
arguments[$key]="$2"
if (( $# > 1 )) && [[ "$2" != --* ]]; then
arguments["$key"]="$2"
shift 2
continue
else
arguments[$key]=true
arguments["$key"]=true
shift
continue
fi
;;
*)
@@ -84,9 +86,9 @@ while (( $# > 0 )); do
exit 1
;;
esac
shift
done
if [[ -n "${arguments[script_mode]+exists}" ]]; then
if [[ "${arguments[script_mode]}" == "$BUILD_DOCKER" ]]; then
SCRIPT_MODE="${arguments[script_mode]}"
@@ -601,6 +603,8 @@ function check_conda {
# Detect Jetson and select correct Python version
MODEL="$(tr -d '\0' </proc/device-tree/model 2>/dev/null | tr 'A-Z' 'a-z' || true)"
if [[ "$MODEL" == *jetson* ]]; then
	# gfortran is needed to compile the scipy pip package from source.
	# -y answers the apt confirmation prompt so unattended runs
	# (native install or docker build) do not stall or abort here.
	sudo apt-get install -y gfortran
	PYTHON_VERSION="3.10"
fi
else

View File

@@ -1,10 +1,3 @@
from .models import (
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
default_engine_settings, default_vc_model, default_voice_detection_model,
loaded_tts, xtts_builtin_speakers_list, max_custom_model, max_custom_voices,
models, os, voices_dir
)
from .conf import (
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate, max_upload_size,
@@ -18,21 +11,23 @@ from .conf import (
voices_dir, default_output_split, default_output_split_hours
)
from .lang import (
from .conf_lang import (
abbreviations_mapping, chapter_word_mapping, default_language_code,
roman_numbers_tuples, emojis_list, install_info, language_mapping,
language_math_phonemes, language_clock, language_tts, os, punctuation_list,
language_math_phonemes, language_clock, os, punctuation_list,
punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
specialchars_mapping, chars_remove, year_to_decades_languages
specialchars_mapping, chars_remove, year_to_decades_languages,
)
from .conf_models import (
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
default_engine_settings, default_vc_model, default_voice_detection_model,
loaded_tts, xtts_builtin_speakers_list,
max_custom_model, max_custom_voices, voices_dir
)
__all__ = [
# from models
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
"max_custom_voices", "models", "os", "voices_dir",
# from conf
"FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
@@ -46,11 +41,17 @@ __all__ = [
"requirements_file", "components_dir", "tmp_dir", "tmp_expire", "tts_dir",
"voice_formats", "voices_dir", "default_output_split", "default_output_split_hours",
# from lang
# from conf_lang
"abbreviations_mapping", "chapter_word_mapping", "default_language_code",
"roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
"language_math_phonemes", "language_clock", "language_tts", "os", "punctuation_list",
"language_math_phonemes", "language_clock", "os", "punctuation_list",
"punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
"punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
"specialchars_mapping", "chars_remove", "year_to_decades_languages"
"specialchars_mapping", "chars_remove", "year_to_decades_languages",
# from conf_models
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
"max_custom_voices", "voices_dir"
]

View File

@@ -5,7 +5,7 @@ import argostranslate.translate
from iso639 import Lang
from lib.conf import models_dir
from lib.lang import language_mapping
from lib.conf_lang import language_mapping
# NOTE: source_lang and target_lang must be iso639-1 (2 letters)

View File

@@ -5,7 +5,7 @@ import threading
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from lib.conf import tts_dir
from lib.models import default_voice_detection_model
from lib.conf_models import default_voice_detection_model
_PIPELINE_CACHE = {}

View File

@@ -27,7 +27,7 @@ class DeviceInstaller():
os_env = 'linux' if name == 'jetson' else self.check_platform
elif mode == 'build_docker':
os_env = 'linux' if name == 'jetson' else 'manylinux_2_28'
pyvenv = [3,10] if tag in ['jetson51', 'jetson60', 'jetson61'] else pyvenv
pyvenv = [3,10] if tag in ['jetson60', 'jetson61'] else pyvenv
if all([name, tag, os_env, arch, pyvenv]):
device_info = {"name": name, "os": os_env, "arch": arch, "pyvenv": pyvenv, "tag": tag, "note": msg}
return json.dumps(device_info)
@@ -171,27 +171,10 @@ class DeviceInstaller():
rev_major = int(parts[0])
rev_minor = int(parts[1]) if len(parts) > 1 else 0
rev_patch = int(parts[2]) if len(parts) > 2 else 0
if l4t_major < 35:
msg = f'JetPack too old (L4T {l4t_major}). Please upgrade to JetPack 5.1+. Falling back to CPU.'
if l4t_major < 36:
msg = f'JetPack too old (L4T {l4t_major}). Please upgrade to JetPack 6.0+. Falling back to CPU.'
return ('unsupported', msg)
if l4t_major == 35:
if rev_major == 0 and rev_minor <= 1:
msg = 'JetPack 5.0/5.0.1 detected. Please upgrade to JetPack 5.1+ to use the GPU. Failing back to CPU'
return ('cpu', msg)
if rev_major == 0 and rev_minor >= 2:
msg = 'JetPack 5.0.x detected. Please upgrade to JetPack 5.1+ to use the GPU. Failing back to CPU'
return ('cpu', msg)
if rev_major == 1 and rev_minor == 0:
msg = 'JetPack 5.1.0 detected. Please upgrade to JetPack 5.1.2 or newer.'
return ('51', msg)
if rev_major == 1 and rev_minor == 1:
msg = 'JetPack 5.1.1 detected. Please upgrade to JetPack 5.1.2 or newer.'
return ('51', msg)
if (rev_major > 1) or (rev_major == 1 and rev_minor >= 2):
return ('51', msg)
msg = 'Unrecognized JetPack 5.x version. Falling back to CPU.'
return ('unknown', msg)
if l4t_major == 36:
else:
if rev_major == 2:
return ('60', msg)
else:
@@ -734,7 +717,8 @@ class DeviceInstaller():
torchaudio_pkg = f"{url}/v{toolkit_version}/torchaudio-{jetson_torch_version_base[tag]}%2B{tag}-{tag_py}-{os_env}_{arch}.whl"
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--no-cache-dir', torch_pkg])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--no-cache-dir', torchaudio_pkg])
subprocess.check_call([sys.executable, '-m', 'pip', 'uninstall', '-y', 'scikit-learn'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--force', '--no-binary=scikit-learn', 'scikit-learn'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--force', '--no-cache-dir', '--no-binary=scipy', 'scipy'])
elif device_info['name'] == devices['MPS']['proc']:
torch_tag_py = f'cp{default_py_major}{default_py_minor}-none'
torchaudio_tag_py = f'cp{default_py_major}{default_py_minor}-cp{default_py_major}{default_py_minor}'

View File

@@ -20,7 +20,7 @@ class SubprocessPipe:
self.progress_bar((percent / 100), desc=self.msg)
def _on_complete(self)->None:
msg = f"{self.msg} completed!"
msg = f"\n{self.msg} completed!"
print(msg)
if self.is_gui_process:
self.progress_bar(1.0, desc=msg)

View File

@@ -1,280 +1,75 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class Bark(TTSUtils, TTSRegistry, name='bark'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class Bark(TTSRegistry, name='bark'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
global lock
try:
with lock:
from TTS.api import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
engine = TTSEngine(model_path)
if engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _load_checkpoint(self,**kwargs:Any)->Any:
global lock
try:
with lock:
key = kwargs.get('key')
engine = loaded_tts.get(key, False)
if not engine:
engine_name = kwargs.get('tts_engine', None)
if engine_name == TTS_ENGINES['XTTSv2']:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
if not checkpoint_path or not os.path.exists(checkpoint_path):
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
raise FileNotFoundError(error)
return False
if not config_path or not os.path.exists(config_path):
error = f'Missing or invalid config_path: {config_path}'
raise FileNotFoundError(error)
return False
config = XttsConfig()
config.models_dir = os.path.join("models","tts")
config.load_json(config_path)
engine = Xtts.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_path = checkpoint_path,
vocab_path = vocab_path,
eval = True
)
elif engine_name == TTS_ENGINES['BARK']:
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
checkpoint_dir = kwargs.get('checkpoint_dir')
if not checkpoint_dir or not os.path.exists(checkpoint_dir):
error = f'Missing or invalid checkpoint_dir: {checkpoint_dir}'
raise FileNotFoundError(error)
return False
#check_pth = ensure_safe_checkpoint(checkpoint_dir)
#if not check_pth:
# error = f'No valid checkpoint files found or conversion failed in: {checkpoint_dir}'
# raise RuntimeError(error)
# return False
config = BarkConfig()
config.CACHE_DIR = self.cache_dir
config.USE_SMALLER_MODELS = True if os.environ['SUNO_USE_SMALL_MODELS'] == 'True' else False
engine = Bark.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_dir = checkpoint_dir,
eval = True
)
if engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f'_load_checkpoint() error: {e}'
print(error)
return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
else:
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
"""
hf_repo = self.models[self.session['fine_tuned']]['repo']
hf_sub = self.models[self.session['fine_tuned']]['sub']
text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
checkpoint_dir = os.path.dirname(text_model_path)
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir)
if self.engine:
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir)
"""
model_path = self.models[self.session['fine_tuned']]['repo']
engine = self._load_api(self.tts_key, model_path)
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
else:
error = '_load_engine() failed!'
raise ValueError(error)
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
try:
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not self.engine_zs:
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
if self.engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
except Exception as e:
error = f'_load_engine_zs() error: {e}'
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
raise ValueError(error)
"""
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str)->bool:
try:
if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
if self.session['language'] in default_engine_settings[TTS_ENGINES['BARK']].get('languages', {}):
pth_voice_dir = os.path.join(bark_dir, speaker)
pth_voice_file = os.path.join(pth_voice_dir,f'{speaker}.pth')
if os.path.exists(pth_voice_file):
@@ -310,51 +105,14 @@ class Bark(TTSRegistry, name='bark'):
error = f'_check_bark_npz() error: {e}'
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
key=(orig_sr,target_sr)
if key not in self.resampler_cache:
self.resampler_cache[key]=torchaudio.transforms.Resample(
orig_freq = orig_sr,new_freq = target_sr
)
return self.resampler_cache[key]
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
waveform,orig_sr = torchaudio.load(wav_path)
if orig_sr==expected_sr and waveform.size(0)==1:
return wav_path
if waveform.size(0)>1:
waveform = waveform.mean(dim=0,keepdim=True)
if orig_sr!=expected_sr:
resampler = self._get_resampler(orig_sr,expected_sr)
waveform = resampler(waveform)
wav_tensor = waveform.squeeze(0)
wav_numpy = wav_tensor.cpu().numpy()
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
os.makedirs(resample_tmp, exist_ok=True)
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
tmp_path = tmp_fh.name
tmp_fh.close()
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
return tmp_path
"""
def convert(self, sentence_index:int, sentence:str)->bool:
try:
speaker = None
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -398,11 +156,13 @@ class Bark(TTSRegistry, name='bark'):
if speaker in default_engine_settings[self.session['tts_engine']]['voices'].keys():
bark_dir = default_engine_settings[self.session['tts_engine']]['speakers_path']
else:
bark_dir = os.path.join(os.path.dirname(self.params['voice_path']), 'bark')
bark_dir = os.path.join(os.path.dirname(self.params['voice_path']), 'bark')
"""
if not self._check_bark_npz(self.params['voice_path'], bark_dir, speaker):
error = 'Could not create pth voice file!'
print(error)
return False
"""
pth_voice_dir = os.path.join(bark_dir, speaker)
pth_voice_file = os.path.join(bark_dir, speaker, f'{speaker}.pth')
fine_tuned_params = {
@@ -414,6 +174,7 @@ class Bark(TTSRegistry, name='bark'):
if self.session.get(key) is not None
}
with torch.no_grad():
"""
result = self.engine.synthesize(
sentence,
#speaker_wav=self.params['voice_path'],
@@ -421,9 +182,17 @@ class Bark(TTSRegistry, name='bark'):
voice_dir=pth_voice_dir,
**fine_tuned_params
)
audio_sentence = result.get('wav')
if is_audio_data_valid(audio_sentence):
audio_sentence = audio_sentence.tolist()
"""
audio_sentence = self.engine.tts(
text=sentence,
speaker_wav=self.params['voice_path'],
speaker=speaker,
voice_dir=pth_voice_dir,
**fine_tuned_params
)
#audio_sentence = result.get('wav')
#if is_audio_data_valid(audio_sentence):
# audio_sentence = audio_sentence.tolist()
if is_audio_data_valid(audio_sentence):
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
@@ -447,11 +216,11 @@ class Bark(TTSRegistry, name='bark'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -464,7 +233,7 @@ class Bark(TTSRegistry, name='bark'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -0,0 +1,41 @@
import os, torch, torchaudio, random, subprocess, uuid, regex as re, numpy as np
from typing import Any
from pathlib import Path
from multiprocessing.managers import DictProxy
from huggingface_hub import hf_hub_download
from lib.conf import tts_dir, devices, default_audio_proc_format
from lib.conf_models import TTS_ENGINES, TTS_SML, TTS_VOICE_CONVERSION, loaded_tts, default_vc_model, default_engine_settings
from lib.classes.tts_registry import TTSRegistry
from lib.classes.tts_engines.common.utils import TTSUtils
from lib.classes.tts_engines.common.audio import detect_gender, trim_audio, is_audio_data_valid
# Public surface of this module: the names exported to any module that
# star-imports it (`from ... import *`). Every entry is imported at the top
# of this file; a name that is imported here but not listed below is NOT
# re-exported by the star-import.
__all__ = [
"os",
"torch",
"torchaudio",
"random",
"subprocess",
"uuid",
"re",
"np",
"Any",
"Path",
"DictProxy",
"hf_hub_download",
"TTSRegistry",
"TTSUtils",
"detect_gender",
"trim_audio",
"is_audio_data_valid",
"tts_dir",
"devices",
"default_audio_proc_format",
"TTS_ENGINES",
"TTS_SML",
"TTS_VOICE_CONVERSION",
"loaded_tts",
"default_vc_model",
"default_engine_settings"
]

View File

@@ -0,0 +1,20 @@
import importlib
import threading
from typing import Dict, Any
# Guards _presets_cache so concurrent callers import a presets module once.
_lock = threading.Lock()
# engine name -> the `models` object exposed by that engine's presets module.
_presets_cache:Dict[str, Dict[str, Any]] = {}


def load_engine_presets(engine:str)->Dict[str, Any]:
    """Return the ``models`` preset table of a TTS engine, importing it once.

    The presets are read from the module
    ``lib.classes.tts_engines.presets.<engine>_presets``. The module's
    ``models`` attribute is cached per engine name, so later calls skip the
    import. The cached object itself is returned, not a copy.

    Args:
        engine: Engine name used to build the presets module name. It must
            be a non-empty string without dots, so it can only select a
            module that sits directly inside the presets package.

    Returns:
        The ``models`` object defined by the presets module.

    Raises:
        ModuleNotFoundError: If ``engine`` is not a usable name, or no
            presets module exists for it.
        RuntimeError: If the presets module does not define ``models``.
    """
    # Reject names that would change which package gets imported ("a.b") or
    # produce a meaningless target ("" -> "..presets._presets"). The same
    # exception family is raised as a failed import would raise.
    if not isinstance(engine, str) or not engine or "." in engine:
        raise ModuleNotFoundError(
            f"Invalid TTS engine name for presets lookup: {engine!r}"
        )
    with _lock:
        if engine in _presets_cache:
            return _presets_cache[engine]
        module = importlib.import_module(
            f"lib.classes.tts_engines.presets.{engine}_presets"
        )
        try:
            presets = module.models
        except AttributeError:
            raise RuntimeError(
                f"'models' not found in {engine}_presets"
            ) from None
        _presets_cache[engine] = presets
        return presets

View File

@@ -1,8 +1,4 @@
import os
import gc
import torch
import shutil
import regex as re
import os, threading, gc, torch, torchaudio, shutil, tempfile, regex as re, soundfile as sf, numpy as np
from typing import Any, Union, Dict
from huggingface_hub import hf_hub_download
@@ -11,190 +7,307 @@ from pathlib import Path
from torch import Tensor
from torch.nn import Module
from lib.conf import tts_dir
from lib.models import xtts_builtin_speakers_list, TTS_ENGINES, models
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.audio import normalize_audio
from lib import *
def cleanup_memory()->None:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
torch.cuda.synchronize()
_lock = threading.Lock()
def model_size_bytes(model:Module)->int:
total = 0
for t in list(model.parameters()) + list(model.buffers()):
if isinstance(t, Tensor):
total += t.nelement() * t.element_size()
return total
class TTSUtils:
def loaded_tts_size_gb(loaded_tts:Dict[str, Module])->float:
total_bytes = 0
for model in loaded_tts.values():
try:
total_bytes += model_size_bytes(model)
except Exception:
pass
gb = total_bytes / (1024 ** 3)
return round(gb, 2)
def _cleanup_memory(self)->None:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
torch.cuda.synchronize()
def load_xtts_builtin_list()->dict:
    """Return the shared XTTS built-in speakers mapping, filling it on first use.

    When the mapping is empty, ``speakers_xtts.pth`` is downloaded from the
    XTTSv2 'internal' repo and its entries are merged in without overwriting
    existing names.

    Raises:
        RuntimeError: any failure, with the original exception as its cause.
    """
    try:
        registry = xtts_builtin_speakers_list
        if len(registry) > 0:
            return registry
        repo_id = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
        speakers_path = hf_hub_download(repo_id=repo_id, filename='speakers_xtts.pth', cache_dir=tts_dir)
        # weights_only=False performs a full unpickle of the downloaded file.
        loaded = torch.load(speakers_path, weights_only=False)
        if not isinstance(loaded, dict):
            raise TypeError(
                f"Invalid XTTS speakers format: {type(loaded)}"
            )
        for name, data in loaded.items():
            registry.setdefault(name, data)
        return registry
    except Exception as error:
        raise RuntimeError(
            "load_xtts_builtin_list() failed"
        ) from error
def apply_cuda_policy(using_gpu, enough_vram, seed):
    """Configure CUDA memory, cuDNN and matmul precision for this process.

    Args:
        using_gpu: truthy when the session runs on a GPU device.
        enough_vram: truthy when enough free VRAM is available for the fast
            profile (95% memory fraction, cuDNN benchmark, TF32 and fp16
            reduced-precision reductions). Otherwise the conservative
            profile (70% memory fraction, those options off) is applied.
        seed: value passed to ``torch.cuda.manual_seed_all``.

    On a host without a usable CUDA runtime this is a no-op.
    """
    if not torch.cuda.is_available():
        # set_per_process_memory_fraction() initialises CUDA and raises
        # when no device/driver is present, so skip the policy entirely.
        return
    fast_profile = bool(using_gpu and enough_vram)
    torch.cuda.set_per_process_memory_fraction(0.95 if fast_profile else 0.7)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = fast_profile
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.allow_tf32 = fast_profile
    torch.backends.cuda.matmul.allow_tf32 = fast_profile
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = fast_profile
    torch.cuda.manual_seed_all(seed)
def append_sentence2vtt(sentence_obj:dict[str, Any], path:str)->Union[int, bool]:
    """Append one cue to a WebVTT file, creating the file when missing.

    Args:
        sentence_obj: needs "start" and "end" (seconds) and "text"; an
            optional "resume_check" cue index marks the cue as already
            written when it is lower than the next free index.
        path: WebVTT file to append to.

    Returns:
        The index of the next cue after writing; the unchanged next-cue
        index when "resume_check" shows the cue already exists; ``False``
        when an error occurred (the error is printed).
    """
    def format_timestamp(seconds:float)->str:
        # Round to whole milliseconds first: formatting the float seconds
        # directly turns 59.9996 into the invalid "00:00:60.000".
        total_ms = int(round(seconds * 1000))
        total_s, ms = divmod(total_ms, 1000)
        m, s = divmod(total_s, 60)
        h, m = divmod(m, 60)
        return f"{h:02}:{m:02}:{s:02}.{ms:03}"
    try:
        index = 1
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                # Every cue contributes exactly one "-->" timing line.
                index += sum(1 for line in f if "-->" in line)
        if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
            return index # Already written
        if not os.path.exists(path):
            with open(path, "w", encoding="utf-8") as f:
                f.write("WEBVTT\n\n")
        with open(path, "a", encoding="utf-8") as f:
            start = format_timestamp(float(sentence_obj["start"]))
            end = format_timestamp(float(sentence_obj["end"]))
            # Cue text must stay on one line; collapse embedded newlines.
            text = re.sub(r"[\r\n]+", " ", str(sentence_obj["text"])).strip()
            f.write(f"{start} --> {end}\n{text}\n\n")
        return index + 1
    except Exception as e:
        error = f"append_sentence2vtt() error: {e}"
        print(error)
        return False
def is_safetensors_file(path:str)->bool:
    """Return True when the first 32 bytes of ``path`` contain b'safetensors'.

    Any error while opening or reading the file yields False.
    """
    try:
        with open(path, 'rb') as handle:
            return handle.read(32).find(b'safetensors') != -1
    except Exception:
        return False
def convert_pt_to_safetensors(pth_path:str, delete_original:bool=False)->str:
    """Convert a ``.pth``/``.pt`` checkpoint into ``safetensors/<name>.safetensors``.

    The output is written to a ``safetensors`` directory next to the input.
    A top-level "model" entry is unwrapped, nested dicts are flattened one
    level into ``"outer.inner"`` keys, and non-tensor values are dropped.

    Args:
        pth_path: checkpoint file to convert.
        delete_original: remove the input file after a successful save.

    Returns:
        Path of the written safetensors file.

    Raises:
        FileNotFoundError: ``pth_path`` does not exist.
        ValueError: the file suffix is neither ``.pth`` nor ``.pt``.
        Exception: any load/save failure is printed and re-raised.
    """
    pth_path = Path(pth_path)
    if not pth_path.exists():
        error = f'File not found: {pth_path}'
        print(error)
        # Carry the message on the exception too, not only on stdout.
        raise FileNotFoundError(error)
    if pth_path.suffix not in ('.pth', '.pt'):
        error = f'Expected a .pth or .pt file, got: {pth_path.suffix}'
        print(error)
        raise ValueError(error)
    safe_dir = pth_path.parent / "safetensors"
    safe_dir.mkdir(exist_ok=True)
    safe_path = safe_dir / pth_path.with_suffix('.safetensors').name
    msg = f'Converting {pth_path.name} → safetensors/{safe_path.name}'
    print(msg)
    try:
        try:
            state = torch.load(str(pth_path), map_location='cpu', weights_only=True)
        except Exception:
            # NOTE: the fallback performs a full unpickle; only acceptable
            # because the checkpoint is treated as trusted.
            error = f'⚠️ weights_only load failed for {pth_path.name}, retrying unsafely (trusted file).'
            print(error)
            state = torch.load(str(pth_path), map_location='cpu', weights_only=False)
        if isinstance(state, dict) and "model" in state:
            state = state["model"]
        flattened = {}
        for k, v in state.items():
            if isinstance(v, dict):
                for subk, subv in v.items():
                    flattened[f"{k}.{subk}"] = subv
            else:
                flattened[k] = v
        # Keep tensors only, as detached copies.
        state = {
            k: v.clone().detach()
            for k, v in flattened.items()
            if isinstance(v, torch.Tensor)
        }
        save_file(state, str(safe_path))
        if delete_original:
            pth_path.unlink(missing_ok=True)
            msg = f'Deleted original: {pth_path}'
            print(msg)
        msg = f'Saved: {safe_path}'
        print(msg)
        return str(safe_path)
    except Exception as e:
        error = f'Failed to convert {pth_path.name}: {e}'
        print(error)
        raise
def ensure_safe_checkpoint(checkpoint_dir:str)->list[str]:
safe_files = []
if os.path.isfile(checkpoint_dir):
if not (checkpoint_dir.endswith('.pth') or checkpoint_dir.endswith('.pt')):
error = f'Invalid checkpoint file: {checkpoint_dir}'
raise ValueError(error)
if not is_safetensors_file(checkpoint_dir):
def _loaded_tts_size_gb(self, loaded_tts:Dict[str, Module])->float:
total_bytes = 0
for model in loaded_tts.values():
try:
safe_path = convert_pt_to_safetensors(checkpoint_dir, False)
msg = f'Created safetensors version of {os.path.basename(checkpoint_dir)}{safe_path}'
print(msg)
safe_files.append(safe_path)
except Exception as e:
error = f'Failed to convert {os.path.basename(checkpoint_dir)}: {e}'
print(error)
total_bytes += model_size_bytes(model)
except Exception:
pass
gb = total_bytes / (1024 ** 3)
return round(gb, 2)
def _load_xtts_builtin_list(self)->dict:
try:
if len(xtts_builtin_speakers_list) > 0:
return xtts_builtin_speakers_list
speakers_path = hf_hub_download(repo_id=default_engine_settings[TTS_ENGINES['XTTSv2']]['repo'], filename='speakers_xtts.pth', cache_dir=tts_dir)
loaded = torch.load(speakers_path, weights_only=False)
if not isinstance(loaded, dict):
raise TypeError(
f"Invalid XTTS speakers format: {type(loaded)}"
)
for name, data in loaded.items():
if name not in xtts_builtin_speakers_list:
xtts_builtin_speakers_list[name] = data
return xtts_builtin_speakers_list
except Exception as error:
raise RuntimeError(
"self._load_xtts_builtin_list() failed"
) from error
def _apply_cuda_policy(self, using_gpu:bool, enough_vram:bool, seed:int)->None:
torch.cuda.manual_seed_all(0)
if using_gpu and enough_vram:
torch.cuda.set_per_process_memory_fraction(0.95)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.cuda.manual_seed_all(seed)
else:
safe_files.append(checkpoint_dir)
return safe_files
if not os.path.isdir(checkpoint_dir):
raise FileNotFoundError(f"Invalid checkpoint_dir: {checkpoint_dir}")
for root, _, files in os.walk(checkpoint_dir):
for fname in files:
if fname.endswith(".pth") or fname.endswith(".pt"):
pth_path = os.path.join(root, fname)
if is_safetensors_file(pth_path):
safe_files.append(pth_path)
continue
try:
safe_path = convert_pt_to_safetensors(pth_path, False)
msg = f'Created safetensors version of {os.path.relpath(pth_path, checkpoint_dir)}{os.path.relpath(safe_path, checkpoint_dir)}'
torch.cuda.set_per_process_memory_fraction(0.7)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.cuda.manual_seed_all(seed)
def _load_api(self, key:str, model_path:str)->Any:
    """Return the engine registered under ``key``, creating it with ``TTS.api.TTS`` when absent.

    The engine is stored in ``loaded_tts`` only when the freshly detected
    free VRAM exceeds the size of the models already registered. Errors are
    printed and ``None`` is returned.
    """
    try:
        with _lock:
            from TTS.api import TTS as TTSEngine
            engine = loaded_tts.get(key, False) or TTSEngine(model_path)
            if engine:
                detected = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = detected.get('free_vram_gb', 0)
                registered_gb = self._loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > registered_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f"_load_api() error: {e}"
        print(error)
    return None
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the XTTS engine registered under ``key``, building it from a checkpoint when absent.

    Keyword Args:
        key: registry key in ``loaded_tts``.
        checkpoint_path: model checkpoint file (must exist).
        config_path: XTTS JSON config file (must exist).
        vocab_path: optional vocabulary file passed to ``load_checkpoint``.
        tts_engine: accepted for call compatibility; not used here.

    Returns:
        The engine, or ``None`` when anything fails (the error is printed).
        The engine is stored in ``loaded_tts`` only when the detected free
        VRAM exceeds the size of the models already registered.
    """
    try:
        with _lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path',None)
                vocab_path = kwargs.get('vocab_path',None)
                if not checkpoint_path or not os.path.exists(checkpoint_path):
                    error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                    raise FileNotFoundError(error)
                if not config_path or not os.path.exists(config_path):
                    error = f'Missing or invalid config_path: {config_path}'
                    raise FileNotFoundError(error)
                config = XttsConfig()
                config.models_dir = os.path.join("models","tts")
                config.load_json(config_path)
                engine = Xtts.init_from_config(config)
                engine.load_checkpoint(
                    config,
                    checkpoint_path = checkpoint_path,
                    vocab_path = vocab_path,
                    eval = True
                )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = self._loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
def _load_engine_zs(self)->Any:
try:
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
self._cleanup_memory()
engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not engine_zs:
engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
if engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
return engine_zs
except Exception as e:
error = f'_load_engine_zs() error: {e}'
raise ValueError(error)
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
xtts = TTS_ENGINES['XTTSv2']
if self.session['language'] in default_engine_settings[xtts].get('languages', {}):
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
safe_files.append(safe_path)
except Exception as e:
error = f'Failed to convert {fname}: {e}'
print(error)
return safe_files
key = f"{xtts}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
self._cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = self._loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = default_engine_settings[xtts]['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=xtts, key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
if speaker in default_engine_settings[xtts]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[xtts]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path], librosa_trim_db=30, load_sr=24000, sound_norm_refs=True)
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[xtts]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {xtts} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
    """Return the resampler for ``(orig_sr, target_sr)``, creating and caching it on first use."""
    cache = self.resampler_cache
    rate_pair = (orig_sr, target_sr)
    try:
        resampler = cache[rate_pair]
    except KeyError:
        resampler = cache[rate_pair] = torchaudio.transforms.Resample(
            orig_freq = orig_sr,new_freq = target_sr
        )
    return resampler
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
    """Return a path to a mono version of ``wav_path`` at ``expected_sr``.

    ``wav_path`` itself is returned when the file is already mono at the
    expected rate. Otherwise a 16-bit PCM WAV is written under
    ``<process_dir>/tmp`` and its path is returned; that file is not removed
    here.
    """
    audio, source_sr = torchaudio.load(wav_path)
    channels = audio.size(0)
    if channels == 1 and source_sr == expected_sr:
        return wav_path
    if channels > 1:
        # Down-mix by averaging the channels.
        audio = audio.mean(dim=0, keepdim=True)
    if source_sr != expected_sr:
        audio = self._get_resampler(source_sr, expected_sr)(audio)
    samples = audio.squeeze(0).cpu().numpy()
    out_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(out_dir, exist_ok=True)
    with tempfile.NamedTemporaryFile(dir=out_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
def _append_sentence2vtt(self, sentence_obj:dict[str, Any], path:str)->Union[int, bool]:
def format_timestamp(seconds:float)->str:
m, s = divmod(seconds, 60)
h, m = divmod(m, 60)
return f"{int(h):02}:{int(m):02}:{s:06.3f}"
try:
index = 1
if os.path.exists(path):
with open(path, "r", encoding="utf-8") as f:
for line in f:
if "-->" in line:
index += 1
if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
return index # Already written
if not os.path.exists(path):
with open(path, "w", encoding="utf-8") as f:
f.write("WEBVTT\n\n")
with open(path, "a", encoding="utf-8") as f:
start = format_timestamp(float(sentence_obj["start"]))
end = format_timestamp(float(sentence_obj["end"]))
text = re.sub(r"[\r\n]+", " ", str(sentence_obj["text"])).strip()
f.write(f"{start} --> {end}\n{text}\n\n")
return index + 1
except Exception as e:
error = f"self._append_sentence2vtt() error: {e}"
print(error)
return False

View File

@@ -1,882 +0,0 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
import regex as re
import numpy as np
import soundfile as sf
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class Coqui:
def __init__(self, session:DictProxy):
    """Set up engine state from ``session``, apply the torch device policy and load both models.

    Steps: initialise per-engine parameters, make sure the shared XTTS
    built-in speakers mapping is filled, seed the random generators,
    configure CUDA/cuDNN when a CUDA-like device was detected, then load the
    TTS engine and the zero-shot engine.

    Any exception is caught and printed, so a failed initialisation leaves
    the instance partially configured.
    """
    try:
        self.session = session
        self.cache_dir = tts_dir
        self.speakers_path = None
        self.tts_key = self.session['model_cache']
        self.engine = None
        self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
        self.engine_zs = None
        self.pth_voice_file = None
        self.sentences_total_time = 0.0
        self.sentence_idx = 1
        self.params={TTS_ENGINES['XTTSv2']:{"latent_embedding":{}},TTS_ENGINES['BARK']:{},TTS_ENGINES['VITS']:{"semitones":{}},TTS_ENGINES['FAIRSEQ']:{"semitones":{}},TTS_ENGINES['TACOTRON2']:{"semitones":{}},TTS_ENGINES['YOURTTS']:{}}
        self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
        self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
        self.resampler_cache = {}
        self.audio_segments = []
        if not xtts_builtin_speakers_list:
            self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename='speakers_xtts.pth', cache_dir=self.cache_dir)
            # Fill the shared mapping in place. Assigning to the name here
            # would make it local to __init__, so the emptiness test above
            # would raise UnboundLocalError and abort initialisation.
            xtts_builtin_speakers_list.update(torch.load(self.speakers_path, weights_only=False))
        using_gpu = self.session['device'] != devices['CPU']['proc']
        enough_vram = self.session['free_vram_gb'] > 4.0
        seed = 123456
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        fast_profile = using_gpu and enough_vram
        cuda_like_found = devices['CUDA']['found'] or devices['ROCM']['found'] or devices['JETSON']['found']
        if cuda_like_found:
            if fast_profile and devices['JETSON']['found']:
                # Provide minimal stand-ins for torch.distributed pieces
                # that are missing from this torch build.
                if not hasattr(torch, "distributed"):
                    torch.distributed = types.SimpleNamespace()
                if not hasattr(torch.distributed, "ReduceOp"):
                    class _ReduceOp:
                        SUM = None
                        MAX = None
                        MIN = None
                    torch.distributed.ReduceOp = _ReduceOp
                if not hasattr(torch.distributed, "all_reduce"):
                    def _all_reduce(*args, **kwargs):
                        return
                    torch.distributed.all_reduce = _all_reduce
            # Fast profile: 95% memory fraction, cuDNN autotune and reduced
            # precision on. Conservative profile: 70% and all of those off.
            torch.cuda.set_per_process_memory_fraction(0.95 if fast_profile else 0.7)
            torch.backends.cudnn.enabled = True
            torch.backends.cudnn.benchmark = fast_profile
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.allow_tf32 = fast_profile
            torch.backends.cuda.matmul.allow_tf32 = fast_profile
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = fast_profile
            torch.cuda.manual_seed_all(seed)
        self._load_engine()
        self._load_engine_zs()
    except Exception as e:
        error = f'__init__() error: {e}'
        print(error)
def _load_api(self, key:str, model_path:str)->Any:
    """Return the engine registered under ``key``, creating it with ``TTS.api.TTS`` when absent.

    The engine is stored in ``loaded_tts`` only when the freshly detected
    free VRAM exceeds the size of the models already registered. Errors are
    printed and ``None`` is returned.
    """
    try:
        with lock:
            from TTS.api import TTS as TTSEngine
            engine = loaded_tts.get(key, False) or TTSEngine(model_path)
            if engine:
                detected = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = detected.get('free_vram_gb', 0)
                registered_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > registered_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f"_load_api() error: {e}"
        print(error)
    return None
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the engine registered under ``key``, building it from checkpoint files when absent.

    Keyword Args:
        key: registry key in ``loaded_tts``.
        tts_engine: engine name; XTTSv2 and BARK are handled here.
        checkpoint_path, config_path, vocab_path: XTTSv2 files
            (checkpoint and config must exist).
        checkpoint_dir: BARK checkpoint directory (must exist).

    Returns:
        The engine (falsy when ``tts_engine`` is not handled), or ``None``
        when anything fails (the error is printed). The engine is stored in
        ``loaded_tts`` only when the detected free VRAM exceeds the size of
        the models already registered.
    """
    try:
        with lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                engine_name = kwargs.get('tts_engine', None)
                if engine_name == TTS_ENGINES['XTTSv2']:
                    from TTS.tts.configs.xtts_config import XttsConfig
                    from TTS.tts.models.xtts import Xtts
                    checkpoint_path = kwargs.get('checkpoint_path')
                    config_path = kwargs.get('config_path',None)
                    vocab_path = kwargs.get('vocab_path',None)
                    if not checkpoint_path or not os.path.exists(checkpoint_path):
                        error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                        raise FileNotFoundError(error)
                    if not config_path or not os.path.exists(config_path):
                        error = f'Missing or invalid config_path: {config_path}'
                        raise FileNotFoundError(error)
                    config = XttsConfig()
                    config.models_dir = os.path.join("models","tts")
                    config.load_json(config_path)
                    engine = Xtts.init_from_config(config)
                    engine.load_checkpoint(
                        config,
                        checkpoint_path = checkpoint_path,
                        vocab_path = vocab_path,
                        eval = True
                    )
                elif engine_name == TTS_ENGINES['BARK']:
                    from TTS.tts.configs.bark_config import BarkConfig
                    from TTS.tts.models.bark import Bark
                    checkpoint_dir = kwargs.get('checkpoint_dir')
                    if not checkpoint_dir or not os.path.exists(checkpoint_dir):
                        error = f'Missing or invalid checkpoint_dir: {checkpoint_dir}'
                        raise FileNotFoundError(error)
                    config = BarkConfig()
                    config.CACHE_DIR = self.cache_dir
                    # An unset variable means "do not use the smaller
                    # models" rather than failing the load with KeyError.
                    config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS') == 'True'
                    engine = Bark.init_from_config(config)
                    engine.load_checkpoint(
                        config,
                        checkpoint_dir = checkpoint_dir,
                        eval = True
                    )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
def _load_engine(self)->None:
    """Load the TTS model selected by the session into ``self.engine``.

    An engine already registered in ``loaded_tts`` under ``self.tts_key`` is
    reused. Otherwise the model is resolved per engine type: XTTSv2 (custom
    model directory or Hugging Face preset), BARK, VITS, FAIRSEQ, TACOTRON2
    or YOURTTS. For VITS, FAIRSEQ and TACOTRON2 ``self.tts_key`` is replaced
    by the resolved model path. Errors are printed, never raised.
    """
    try:
        msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine = loaded_tts.get(self.tts_key, False)
        if not self.engine:
            tts_engine = self.session['tts_engine']
            custom_model = self.session['custom_model']
            fine_tuned = self.session['fine_tuned']
            api_only_engines = (TTS_ENGINES['BARK'], TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2'], TTS_ENGINES['YOURTTS'])
            if tts_engine == TTS_ENGINES['XTTSv2']:
                if custom_model is not None:
                    model_dir = os.path.join(self.session['custom_model_dir'], tts_engine, custom_model)
                    file_names = default_engine_settings[TTS_ENGINES['XTTSv2']]['files']
                    config_path = os.path.join(model_dir, file_names[0])
                    checkpoint_path = os.path.join(model_dir, file_names[1])
                    vocab_path = os.path.join(model_dir, file_names[2])
                    self.tts_key = f"{tts_engine}-{custom_model}"
                else:
                    preset = models[tts_engine][fine_tuned]
                    hf_repo = preset['repo']
                    if fine_tuned == 'internal':
                        hf_sub = ''
                        if self.speakers_path is None:
                            self.speakers_path = hf_hub_download(repo_id=hf_repo, filename='speakers_xtts.pth', cache_dir=self.cache_dir)
                    else:
                        hf_sub = preset['sub']
                    config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][0]}", cache_dir=self.cache_dir)
                    checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][1]}", cache_dir=self.cache_dir)
                    vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][2]}", cache_dir=self.cache_dir)
                self.engine = self._load_checkpoint(tts_engine=tts_engine, key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
            elif custom_model is not None and tts_engine in api_only_engines:
                msg = f"{tts_engine} custom model not implemented yet!"
                print(msg)
            elif tts_engine == TTS_ENGINES['BARK']:
                preset = models[tts_engine][fine_tuned]
                hf_repo = preset['repo']
                hf_sub = preset['sub']
                # All three files are fetched; the engine is loaded from the
                # directory that holds the first one.
                text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][0]}", cache_dir=self.cache_dir)
                hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][1]}", cache_dir=self.cache_dir)
                hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][2]}", cache_dir=self.cache_dir)
                checkpoint_dir = os.path.dirname(text_model_path)
                self.engine = self._load_checkpoint(tts_engine=tts_engine, key=self.tts_key, checkpoint_dir=checkpoint_dir)
            elif tts_engine == TTS_ENGINES['VITS']:
                preset = models[tts_engine][fine_tuned]
                iso_dir = language_tts[tts_engine][self.session['language']]
                sub = next((name for name, lang_list in preset['sub'].items() if iso_dir in lang_list), None)
                if sub is not None:
                    self.params[tts_engine]['samplerate'] = models[TTS_ENGINES['VITS']][fine_tuned]['samplerate'][sub]
                    model_path = preset['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                    self.tts_key = model_path
                    self.engine = self._load_api(self.tts_key, model_path)
                else:
                    msg = f"{tts_engine} checkpoint for {self.session['language']} not found!"
                    print(msg)
            elif tts_engine == TTS_ENGINES['FAIRSEQ']:
                model_path = models[tts_engine][fine_tuned]['repo'].replace("[lang]", self.session['language'])
                self.tts_key = model_path
                self.engine = self._load_api(self.tts_key, model_path)
            elif tts_engine == TTS_ENGINES['TACOTRON2']:
                preset = models[tts_engine][fine_tuned]
                sub_dict = preset['sub']
                iso_dir = language_tts[tts_engine][self.session['language']]
                sub = next((name for name, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                if sub is None:
                    # Fall back to the session language code itself.
                    iso_dir = self.session['language']
                    sub = next((name for name, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                if sub is not None:
                    # Look the samplerate up only once `sub` is resolved;
                    # doing it earlier failed before the fallback could run.
                    self.params[tts_engine]['samplerate'] = models[TTS_ENGINES['TACOTRON2']][fine_tuned]['samplerate'][sub]
                    model_path = preset['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                    self.tts_key = model_path
                    self.engine = self._load_api(self.tts_key, model_path)
                    if self.engine:
                        m = self.engine.synthesizer.tts_model
                        d = m.decoder
                        # Stability
                        d.prenet_dropout = 0.0
                        d.attention_dropout = 0.0
                        d.decoder_dropout = 0.0
                        m.attention.location_attention.dropout = 0.0
                        # Stop-gate tuning
                        d.gate_threshold = 0.5
                        d.force_gate = True
                        d.gate_delay = 10
                        # Long-sentence fix
                        d.max_decoder_steps = 1000
                        # Prevent attention drift
                        d.attention_keeplast = True
                else:
                    msg = f"{tts_engine} checkpoint for {self.session['language']} not found!"
                    print(msg)
            elif tts_engine == TTS_ENGINES['YOURTTS']:
                model_path = models[tts_engine][fine_tuned]['repo']
                self.engine = self._load_api(self.tts_key, model_path)
        if self.engine:
            # Previously referenced an undefined name and was never printed.
            msg = f'TTS {self.tts_key} Loaded!'
            print(msg)
    except Exception as e:
        error = f'_load_engine() error: {e}'
        print(error)
def _load_engine_zs(self)->Any:
    """Load the zero-shot voice-conversion model into ``self.engine_zs``.

    An engine already registered in ``loaded_tts`` is reused. On success
    the session's 'model_zs_cache' entry is set to the zero-shot key.
    Errors are printed, never raised.
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not self.engine_zs:
            self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if self.engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            # Previously referenced an undefined name and was never printed.
            msg = f'ZeroShot {self.tts_zs_key} Loaded!'
            print(msg)
    except Exception as e:
        error = f'_load_engine_zs() error: {e}'
        print(error)
# Re-voice a built-in English voice sample in the session language with the
# XTTSv2 'internal' model. The original voice_path is returned unchanged when
# the session language already appears in the path, the speaker is a BARK
# voice, the language is 'eng', or XTTSv2 does not list the language.
# Otherwise the translated default.txt is synthesised with the speaker's
# latents, normalised, and the new voice path is returned.
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
# Free VRAM does not exceed the registered models' size: drop the current
# engine from the registry before loading the internal XTTSv2 model.
# NOTE(review): raises KeyError if self.tts_key is not registered - confirm.
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
# Built-in speakers reuse their stored latents; any other voice gets its
# latents computed from the reference audio file.
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
# Collect the xtts_* session settings that are set, cast to the expected
# type and with the "xtts_" prefix removed, as inference keyword arguments.
# The comprehension variable `key` is scoped to the comprehension.
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
# Swap the "eng" path component for the target language directory.
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
# Restore the session's own engine, reloading it if it was dropped above.
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
# NOTE(review): new_voice_path / proc_voice_path are only bound late in the
# try body; if the exception happens earlier these checks would themselves
# raise UnboundLocalError - consider initialising both to None up front.
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str)->bool:
    """
    Make sure the Bark voice file `<bark_dir>/<speaker>/<speaker>.pth` exists.

    When the session language is not listed for the BARK engine, or the file
    is already on disk, nothing is done. Otherwise the engine synthesizes the
    language's default text with `voice_path` as speaker reference and
    `voice_dir` pointing at the speaker folder.

    Returns:
        True on success or when nothing had to be done, False if any step raised.
    """
    try:
        language = self.session['language']
        if language not in language_tts[TTS_ENGINES['BARK']].keys():
            return True
        pth_voice_dir = os.path.join(bark_dir, speaker)
        pth_voice_file = os.path.join(pth_voice_dir, f'{speaker}.pth')
        if os.path.exists(pth_voice_file):
            return True
        os.makedirs(pth_voice_dir, exist_ok=True)
        text_file = os.path.join(voices_dir, language, 'default.txt')
        reference_text = Path(text_file).read_text(encoding="utf-8")
        # Forward only the Bark tuning values that are set in the session,
        # with the "bark_" prefix removed.
        casts = {
            "bark_text_temp": float,
            "bark_waveform_temp": float
        }
        synth_options = {}
        for option, cast_type in casts.items():
            if self.session.get(option) is not None:
                synth_options[option.removeprefix("bark_")] = cast_type(self.session[option])
        with torch.no_grad():
            # The returned audio is not used here; the call is made for the
            # voice file reported below.
            self.engine.synthesize(
                reference_text,
                speaker_wav=voice_path,
                speaker=speaker,
                voice_dir=pth_voice_dir,
                **synth_options
            )
        print(f"Saved file: {pth_voice_file}")
        return True
    except Exception as e:
        print(f'_check_bark_npz() error: {e}')
        return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self, orig_sr:int, target_sr:int)->torchaudio.transforms.Resample:
    """Return the Resample transform for (orig_sr, target_sr), creating and caching it on first use."""
    rate_pair = (orig_sr, target_sr)
    if rate_pair in self.resampler_cache:
        return self.resampler_cache[rate_pair]
    transform = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    self.resampler_cache[rate_pair] = transform
    return transform
def _resample_wav(self, wav_path:str, expected_sr:int)->str:
    """
    Return the path of a mono version of `wav_path` sampled at `expected_sr`.

    If the file is already mono at the expected rate, `wav_path` itself is
    returned and nothing is written. Otherwise channels are averaged, the
    signal is resampled if needed, and the result is written as a 16-bit PCM
    wav into `<session['process_dir']>/tmp`. The caller owns that new file.

    Args:
        wav_path: audio file to read.
        expected_sr: required sample rate in Hz.

    Returns:
        `wav_path` unchanged, or the path of the newly written temporary wav.
    """
    waveform, orig_sr = torchaudio.load(wav_path)
    if orig_sr == expected_sr and waveform.size(0) == 1:
        return wav_path
    if waveform.size(0) > 1:
        # Down-mix by averaging channels, keeping the channel dimension.
        waveform = waveform.mean(dim=0, keepdim=True)
    if orig_sr != expected_sr:
        resampler = self._get_resampler(orig_sr, expected_sr)
        waveform = resampler(waveform)
    wav_numpy = waveform.squeeze(0).cpu().numpy()
    # Keep the joined path: the scratch directory belongs to this session's
    # process dir, not to whatever `tmp_dir` name happens to be in scope.
    resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(resample_tmp, exist_ok=True)
    # delete=False: only the unique name is needed; soundfile reopens the path.
    with tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False) as tmp_fh:
        tmp_path = tmp_fh.name
    sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
    return tmp_path
def convert(self, sentence_index:int, sentence:str)->bool:
    """
    Synthesize one sentence with the session's TTS engine and save it as a sentence audio file.

    Break and pause markers (and blank sentences) only queue silence in
    `self.audio_segments` and return True without writing a file; queued
    segments are concatenated in front of the next synthesized sentence.

    Args:
        sentence_index: number used to name the output file inside session['chapters_dir_sentences'].
        sentence: text to speak, or one of the TTS_SML 'break' / 'pause' markers.

    Returns:
        True when silence was queued or the sentence file exists after saving,
        False when a failure was detected and printed.

    Raises:
        ValueError: wraps any unexpected exception raised while converting.
    """
    try:
        speaker = None
        audio_sentence = False
        engine_name = self.session['tts_engine']
        settings = self.params[engine_name]
        settings['voice_path'] = (
            self.session['voice'] if self.session['voice'] is not None
            else models[engine_name][self.session['fine_tuned']]['voice']
        )
        if settings['voice_path'] is not None:
            speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
            if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['custom_model_dir'] not in settings['voice_path']:
                self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker)
                if not settings['voice_path']:
                    msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
                    print(msg)
                    return False
        if not self.engine:
            error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
            print(error)
            return False
        device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
        self.engine.to(device)
        final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
        if sentence == TTS_SML['break']:
            self._append_silence(settings['samplerate'], 0.3, 0.6) # 0.3 to 0.6 seconds
            return True
        # NOTE(review): the '' literals below are reproduced exactly as they appear
        # here; as written, replace('', '') is a no-op and `== ''` can never match a
        # single character. A marker character may have been lost - confirm.
        if not sentence.replace('', '').strip() or sentence == TTS_SML['pause']:
            self._append_silence(settings['samplerate'], 1.0, 1.8) # 1.0 to 1.8 seconds
            return True
        if sentence.endswith("'"):
            sentence = sentence[:-1]
        if not sentence:
            # The sentence was only an apostrophe: nothing to speak, and the
            # sentence[-1] lookups below would raise IndexError.
            self._append_silence(settings['samplerate'], 1.0, 1.8)
            return True
        if engine_name == TTS_ENGINES['XTTSv2']:
            trim_audio_buffer = 0.008
            sentence = sentence.replace('.', ' ;\n')
            sentence += ' ...' if sentence[-1].isalnum() else ''
            if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
                settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
            else:
                msg = 'Computing speaker latents...'
                print(msg)
                if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
                else:
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
                # Cache the latents so later sentences with the same voice skip this step.
                settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
            # Forward only the tuning values set in the session, without the "xtts_" prefix.
            fine_tuned_params = {
                key.removeprefix("xtts_"): cast_type(self.session[key])
                for key, cast_type in {
                    "xtts_temperature": float,
                    #"xtts_codec_temperature": float,
                    "xtts_length_penalty": float,
                    "xtts_num_beams": int,
                    "xtts_repetition_penalty": float,
                    #"xtts_cvvp_weight": float,
                    "xtts_top_k": int,
                    "xtts_top_p": float,
                    "xtts_speed": float,
                    #"xtts_gpt_cond_len": int,
                    #"xtts_gpt_batch_size": int,
                    "xtts_enable_text_splitting": bool
                }.items()
                if self.session.get(key) is not None
            }
            with torch.no_grad():
                result = self.engine.inference(
                    text=sentence,
                    language=self.session['language_iso1'],
                    gpt_cond_latent=settings['gpt_cond_latent'],
                    speaker_embedding=settings['speaker_embedding'],
                    **fine_tuned_params
                )
            audio_sentence = result.get('wav')
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif engine_name == TTS_ENGINES['BARK']:
            trim_audio_buffer = 0.002
            sentence += '' if sentence[-1].isalnum() else ''
            # Bark text cues:
            #   [laughter] [laughs] [sighs] [music] [gasps] [clears throat]
            #   — or ... for hesitations
            #   ♪ for song lyrics
            #   CAPITALIZATION for emphasis of a word
            #   [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively
            if speaker in default_engine_settings[engine_name]['voices'].keys():
                bark_dir = default_engine_settings[engine_name]['speakers_path']
            else:
                bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
                # NOTE(review): assumed to apply to non-builtin voices only
                # (indentation is not visible here) - confirm.
                if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker):
                    error = 'Could not create pth voice file!'
                    print(error)
                    return False
            pth_voice_dir = os.path.join(bark_dir, speaker)
            fine_tuned_params = {
                key.removeprefix("bark_"): cast_type(self.session[key])
                for key, cast_type in {
                    "bark_text_temp": float,
                    "bark_waveform_temp": float
                }.items()
                if self.session.get(key) is not None
            }
            with torch.no_grad():
                result = self.engine.synthesize(
                    sentence,
                    #speaker_wav=settings['voice_path'],
                    speaker=speaker,
                    voice_dir=pth_voice_dir,
                    **fine_tuned_params
                )
            audio_sentence = result.get('wav')
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif engine_name in (TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']):
            # These engines differ only in text preparation and speaker choice;
            # the synthesis / voice-conversion step after this is shared.
            trim_audio_buffer = 0.004
            speaker_argument = {}
            if engine_name == TTS_ENGINES['VITS']:
                sentence += '' if sentence[-1].isalnum() else ''
                sub_models = models[engine_name]['internal']['sub']
                if self.session['language'] == 'eng' and 'vctk/vits' in sub_models:
                    if self.session['language'] in sub_models['vctk/vits'] or self.session['language_iso1'] in sub_models['vctk/vits']:
                        speaker_argument = {"speaker": 'p262'}
                elif self.session['language'] == 'cat' and 'custom/vits' in sub_models:
                    if self.session['language'] in sub_models['custom/vits'] or self.session['language_iso1'] in sub_models['custom/vits']:
                        speaker_argument = {"speaker": '09901'}
                tts_text = sentence
            elif engine_name == TTS_ENGINES['FAIRSEQ']:
                sentence += '' if sentence[-1].isalnum() else ''
                not_supported_punc_pattern = re.compile(r"[.:—]")
                tts_text = re.sub(not_supported_punc_pattern, ' ', sentence)
            else:
                sentence += '...' if sentence[-1].isalnum() else ''
                if self.session['language'] in ['zho', 'jpn', 'kor', 'tha', 'lao', 'mya', 'khm']:
                    not_supported_punc_pattern = re.compile(r'\p{P}+')
                else:
                    not_supported_punc_pattern = re.compile(r'["—…¡¿]')
                tts_text = re.sub(not_supported_punc_pattern, ' ', sentence)
            if settings['voice_path'] is not None:
                succeeded, audio_sentence = self._tts_with_voice_conversion(tts_text, settings, speaker_argument)
                if not succeeded:
                    return False
            else:
                with torch.no_grad():
                    audio_sentence = self.engine.tts(
                        text=tts_text,
                        **speaker_argument
                    )
        elif engine_name == TTS_ENGINES['YOURTTS']:
            trim_audio_buffer = 0.002
            sentence += '...' if sentence[-1].isalnum() else ''
            speaker_argument = {}
            not_supported_punc_pattern = re.compile(r'[—]')
            language = self.session['language_iso1'] if self.session['language_iso1'] == 'en' else 'fr-fr' if self.session['language_iso1'] == 'fr' else 'pt-br' if self.session['language_iso1'] == 'pt' else 'en'
            if settings['voice_path'] is not None:
                speaker_wav = settings['voice_path']
                speaker_argument = {"speaker_wav": speaker_wav}
            else:
                voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
                speaker_argument = {"speaker": voice_key}
            with torch.no_grad():
                audio_sentence = self.engine.tts(
                    text=re.sub(not_supported_punc_pattern, ' ', sentence),
                    language=language,
                    **speaker_argument
                )
        if not is_audio_data_valid(audio_sentence):
            error = f"audio_sentence not valide"
            print(error)
            return False
        sourceTensor = self._tensor_type(audio_sentence)
        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
        if sentence[-1].isalnum() or sentence[-1] == '':
            audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
        # NOTE(review): everything from here to the final return is assumed to be
        # nested under this check (indentation is not visible here) - confirm.
        if audio_tensor is not None and audio_tensor.numel() > 0:
            self.audio_segments.append(audio_tensor)
            if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '':
                # Sentence ends with punctuation: add a short break after it.
                self._append_silence(settings['samplerate'], 0.3, 0.6)
            if self.audio_segments:
                audio_tensor = torch.cat(self.audio_segments, dim=-1)
                start_time = self.sentences_total_time
                duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
                end_time = start_time + duration
                self.sentences_total_time = end_time
                sentence_obj = {
                    "start": start_time,
                    "end": end_time,
                    "text": sentence,
                    "resume_check": self.sentence_idx
                }
                self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
                if self.sentence_idx:
                    torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                    del audio_tensor
                    cleanup_memory()
                self.audio_segments = []
                if os.path.exists(final_sentence_file):
                    return True
                error = f"Cannot create {final_sentence_file}"
                print(error)
                return False
        # No samples were left to save; return an explicit bool instead of
        # falling off the end of the function.
        return False
    except Exception as e:
        error = f'Coqui.convert(): {e}'
        raise ValueError(error) from e

def _append_silence(self, samplerate:int, shortest:float, longest:float)->None:
    """Queue silence of a random length between `shortest` and `longest` seconds (2-decimal precision)."""
    silence_time = int(np.random.uniform(shortest, longest) * 100) / 100
    self.audio_segments.append(torch.zeros(1, int(samplerate * silence_time)))

def _tts_with_voice_conversion(self, text:str, settings:dict, speaker_argument:dict)->tuple[bool, Any]:
    """
    Speak `text` with the engine's own voice, then convert it to settings['voice_path'].

    The engine output is written to a temporary wav, pitch-shifted with sox when
    the detected genders of the two voices differ, and passed with the reference
    voice to `self.engine_zs.voice_conversion`. All temporary wavs are removed
    on every exit path.

    Returns:
        (True, audio) on success, (False, None) when a failure was printed.
    """
    proc_dir = os.path.join(self.session['voice_dir'], 'proc')
    os.makedirs(proc_dir, exist_ok=True)
    tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    source_wav = None
    target_wav = None
    try:
        with torch.no_grad():
            self.engine.tts_to_file(
                text=text,
                file_path=tmp_in_wav,
                **speaker_argument
            )
        if settings['voice_path'] in settings['semitones'].keys():
            semitones = settings['semitones'][settings['voice_path']]
        else:
            voice_path_gender = detect_gender(settings['voice_path'])
            voice_builtin_gender = detect_gender(tmp_in_wav)
            msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
            print(msg)
            if voice_builtin_gender != voice_path_gender:
                semitones = -4 if voice_path_gender == 'male' else 4
                msg = f"Adapting builtin voice frequencies from the clone voice..."
                print(msg)
            else:
                semitones = 0
            settings['semitones'][settings['voice_path']] = semitones
        # `!= 0`, not `> 0`: the downward shift (-4) must be applied too.
        if semitones != 0:
            try:
                sox_path = shutil.which('sox')
                if sox_path is None:
                    raise FileNotFoundError('sox')
                cmd = [
                    sox_path, tmp_in_wav,
                    "-r", str(settings['samplerate']), tmp_out_wav,
                    "pitch", str(semitones * 100)
                ]
                # check=True so a failing sox run reaches the handler below;
                # stderr is captured so the message has something to show.
                subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True, check=True)
            except subprocess.CalledProcessError as e:
                error = f"Subprocess error: {e.stderr}"
                print(error)
                # NOTE(review): instantiated but not raised, as in the existing code;
                # presumably the constructor does the reporting - confirm.
                DependencyError(e)
                return False, None
            except FileNotFoundError as e:
                error = f"File not found: {e}"
                print(error)
                DependencyError(e)
                return False, None
        else:
            tmp_out_wav = tmp_in_wav
        if not self.engine_zs:
            error = f'Engine {self.tts_zs_key} is None'
            print(error)
            return False, None
        settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
        source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
        target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
        converted = self.engine_zs.voice_conversion(
            source_wav=source_wav,
            target_wav=target_wav
        )
        return True, converted
    finally:
        leftovers = {tmp_in_wav, tmp_out_wav, source_wav}
        # _resample_wav may hand back the reference voice itself; never delete that.
        if target_wav != settings['voice_path']:
            leftovers.add(target_wav)
        for leftover in leftovers:
            if leftover and os.path.exists(leftover):
                os.remove(leftover)

View File

@@ -1,286 +1,63 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class Fairseq(TTSRegistry, name='fairseq'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {"semitones":{}}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {"semitones":{}}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
# Return the TTS API engine for `key`: the entry found in `loaded_tts` if there
# is one, otherwise a new `TTS.api.TTS` built from `model_path`.
# The whole lookup/creation runs under the module-level `lock`.
# Returns None (after printing the error) when any step raises.
# NOTE(review): indentation is not visible here, so it is unclear whether the
# VRAM probe below runs on every call or only after a fresh load - confirm.
global lock
try:
with lock:
# Imported here rather than at module level.
from TTS.api import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
engine = TTSEngine(model_path)
if engine:
# Refresh the session's free-VRAM figure, then register the engine in
# `loaded_tts` only when that figure exceeds loaded_tts_size_gb(loaded_tts).
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _load_checkpoint(self,**kwargs:Any)->Any:
# Return the XTTS engine for kwargs['key']: the entry found in `loaded_tts` if
# there is one, otherwise a new `Xtts` model built from kwargs['config_path'],
# kwargs['checkpoint_path'] and kwargs['vocab_path'].
# The whole lookup/creation runs under the module-level `lock`.
# Returns None (after printing the error) when any step raises, including the
# FileNotFoundError raised below for a missing checkpoint or config path.
# NOTE(review): indentation is not visible here, so it is unclear whether the
# VRAM probe near the end runs on every call or only after a fresh load - confirm.
global lock
try:
with lock:
key = kwargs.get('key')
engine = loaded_tts.get(key, False)
if not engine:
# NOTE(review): engine_name is read but not used in this function.
engine_name = kwargs.get('tts_engine', None)
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
if not checkpoint_path or not os.path.exists(checkpoint_path):
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
raise FileNotFoundError(error)
# NOTE(review): unreachable - it follows a raise.
return False
if not config_path or not os.path.exists(config_path):
error = f'Missing or invalid config_path: {config_path}'
raise FileNotFoundError(error)
# NOTE(review): unreachable - it follows a raise.
return False
config = XttsConfig()
config.models_dir = os.path.join("models","tts")
config.load_json(config_path)
engine = Xtts.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_path = checkpoint_path,
vocab_path = vocab_path,
eval = True
)
if engine:
# Refresh the session's free-VRAM figure, then register the engine in
# `loaded_tts` only when that figure exceeds loaded_tts_size_gb(loaded_tts).
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f'_load_checkpoint() error: {e}'
print(error)
return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
else:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
self.tts_key = model_path
self.engine = self._load_api(self.tts_key, model_path)
if self.engine:
engine = self._load_api(self.tts_key, model_path)
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
else:
error = '_load_engine() failed!'
raise ValueError(error)
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
try:
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not self.engine_zs:
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
if self.engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
except Exception as e:
error = f'_load_engine_zs() error: {e}'
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
    """
    Return a voice file usable for the session language, creating one with XTTSv2 if needed.

    `voice_path` is returned unchanged when the session language is already a
    component of the path, when `speaker` is a Bark builtin voice, when the
    language is 'eng', or when XTTSv2 does not list the language. Otherwise
    the language's default text is synthesized with the internal XTTSv2 model
    in the speaker's voice, normalized, and saved next to the original with
    the `eng` path component replaced by the session language.

    Args:
        voice_path: path of the reference voice wav.
        speaker: voice name (file name without `.wav`).

    Returns:
        The voice path to use, or False when the conversion failed (the reason is printed).
    """
    # Bound before the try block so the except handler can always test them.
    new_voice_path = None
    proc_voice_path = None
    try:
        voice_parts = Path(voice_path).parts
        if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
            return voice_path
        if self.session['language'] not in language_tts[TTS_ENGINES['XTTSv2']].keys():
            return voice_path
        default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
        if not os.path.exists(default_text_file):
            error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
            print(error)
            return False
        msg = f"Converting builtin eng voice to {self.session['language']}..."
        print(msg)
        key = f"{TTS_ENGINES['XTTSv2']}-internal"
        default_text = Path(default_text_file).read_text(encoding="utf-8")
        cleanup_memory()
        engine = loaded_tts.get(key, False)
        if not engine:
            vram_dict = VRAMDetector().detect_vram(self.session['device'])
            self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
            models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
            if self.session['free_vram_gb'] <= models_loaded_size_gb:
                # Drop the current engine to make room; pop() tolerates the
                # key being absent.
                loaded_tts.pop(self.tts_key, None)
            xtts_preset = models[TTS_ENGINES['XTTSv2']]['internal']
            hf_repo = xtts_preset['repo']
            hf_sub = ''
            config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{xtts_preset['files'][0]}", cache_dir=self.cache_dir)
            checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{xtts_preset['files'][1]}", cache_dir=self.cache_dir)
            vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{xtts_preset['files'][2]}", cache_dir=self.cache_dir)
            engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
        if not engine:
            error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
            print(error)
            return False
        if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
            gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
        else:
            gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
        # Forward only the tuning values set in the session, without the "xtts_" prefix.
        fine_tuned_params = {
            key.removeprefix("xtts_"): cast_type(self.session[key])
            for key, cast_type in {
                "xtts_temperature": float,
                #"xtts_codec_temperature": float,
                "xtts_length_penalty": float,
                "xtts_num_beams": int,
                "xtts_repetition_penalty": float,
                #"xtts_cvvp_weight": float,
                "xtts_top_k": int,
                "xtts_top_p": float,
                "xtts_speed": float,
                #"xtts_gpt_cond_len": int,
                #"xtts_gpt_batch_size": int,
                "xtts_enable_text_splitting": bool
            }.items()
            if self.session.get(key) is not None
        }
        with torch.no_grad():
            result = engine.inference(
                text=default_text.strip(),
                language=self.session['language_iso1'],
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                **fine_tuned_params,
            )
        audio_sentence = result.get('wav') if isinstance(result, dict) else None
        if audio_sentence is None:
            error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
            print(error)
            return False
        audio_sentence = audio_sentence.tolist()
        sourceTensor = self._tensor_type(audio_sentence)
        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
        # CON is a reserved name on windows
        lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
        new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
        proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
        torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
        normalized = normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process'])
        # The un-normalized intermediate file is not needed either way.
        Path(proc_voice_path).unlink(missing_ok=True)
        if not normalized:
            error = 'normalize_audio() error:'
            print(error)
            return False
        del audio_sentence, sourceTensor, audio_tensor
        gc.collect()
        # Restore this instance's own engine in case it was dropped above.
        self.engine = loaded_tts.get(self.tts_key, False)
        if not self.engine:
            self._load_engine()
        return new_voice_path
    except Exception as e:
        error = f'_check_xtts_builtin_speakers() error: {e}'
        if new_voice_path:
            Path(new_voice_path).unlink(missing_ok=True)
        if proc_voice_path:
            Path(proc_voice_path).unlink(missing_ok=True)
        print(error)
        return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self, orig_sr:int, target_sr:int)->torchaudio.transforms.Resample:
    """Look up the cached Resample transform for this rate pair, building it on a cache miss."""
    rates = (orig_sr, target_sr)
    try:
        return self.resampler_cache[rates]
    except KeyError:
        self.resampler_cache[rates] = torchaudio.transforms.Resample(
            orig_freq=orig_sr, new_freq=target_sr
        )
        return self.resampler_cache[rates]
def _resample_wav(self, wav_path:str, expected_sr:int)->str:
    """
    Return a path to a mono wav at `expected_sr` holding the audio of `wav_path`.

    When the input is already mono at the expected rate, `wav_path` is
    returned as-is. Otherwise a 16-bit PCM copy is written to a new file
    under `<session['process_dir']>/tmp` and that path is returned.
    """
    samples, source_sr = torchaudio.load(wav_path)
    is_mono = samples.size(0) == 1
    if source_sr == expected_sr and is_mono:
        return wav_path
    if samples.size(0) > 1:
        # Average the channels but keep the channel dimension.
        samples = samples.mean(dim=0, keepdim=True)
    if source_sr != expected_sr:
        samples = self._get_resampler(source_sr, expected_sr)(samples)
    pcm = samples.squeeze(0).cpu().numpy()
    scratch_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(scratch_dir, exist_ok=True)
    # Only the unique file name is needed; the handle is closed before writing.
    with tempfile.NamedTemporaryFile(dir=scratch_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, pcm, expected_sr, subtype="PCM_16")
    return out_path
raise ValueError(error)
def convert(self, sentence_index:int, sentence:str)->bool:
try:
@@ -288,7 +65,7 @@ class Fairseq(TTSRegistry, name='fairseq'):
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -411,11 +188,11 @@ class Fairseq(TTSRegistry, name='fairseq'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -428,7 +205,7 @@ class Fairseq(TTSRegistry, name='fairseq'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -0,0 +1,14 @@
import os
from lib.conf import voices_dir
from lib.conf_models import TTS_ENGINES, default_engine_settings
# Model presets for the BARK engine, keyed by preset name.
# "files" and "samplerate" are taken from the BARK entry of default_engine_settings;
# "voice" is the wav used when the session does not select one.
models = {
"internal": {
"lang": "multi",
"repo": "tts_models/multilingual/multi-dataset/bark", # load_checkpoint => erogol/bark, suno/bark, rsxdalv/suno. load_api => tts_models/multilingual/multi-dataset/bark
"sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
"files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
}
}

View File

@@ -0,0 +1,12 @@
from lib.conf_models import TTS_ENGINES, default_engine_settings
# Model presets for the FAIRSEQ engine, keyed by preset name.
# "[lang]" in "repo" is a placeholder; the Fairseq engine replaces it with the
# session language before loading the model.
# "files" and "samplerate" are taken from the FAIRSEQ entry of default_engine_settings.
models = {
"internal": {
"lang": "multi",
"repo": "tts_models/[lang]/fairseq/vits",
"sub": "",
# No default voice file for this preset.
"voice": None,
"files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
}
}

View File

@@ -0,0 +1,24 @@
from lib.conf_models import TTS_ENGINES, default_engine_settings
# Tacotron2 engine presets, keyed by preset name.
_tacotron_defaults = default_engine_settings[TTS_ENGINES['TACOTRON2']]

# Model sub-path -> language codes served by that model. The Tacotron2 loader
# picks the first sub-path whose list contains the session language.
_tacotron_sub = {
    "mai/tacotron2-DDC": ['fr', 'es', 'nl'],
    "thorsten/tacotron2-DDC": ['de'],
    "kokoro/tacotron2-DDC": ['ja'],
    "ljspeech/tacotron2-DDC": ['en'],
    "baker/tacotron2-DDC-GST": ['zh-CN'],
}

models = {
    "internal": {
        "lang": "multi",
        # "[lang_iso1]" and "[xxx]" are replaced by the loader with the
        # language directory and the selected sub-path.
        "repo": "tts_models/[lang_iso1]/[xxx]",
        "sub": _tacotron_sub,
        "voice": None,
        "files": _tacotron_defaults['files'],
        # Sample rate per sub-path; every Tacotron2 model uses the engine default.
        "samplerate": {
            sub_path: _tacotron_defaults['samplerate']
            for sub_path in _tacotron_sub
        },
    },
}

View File

@@ -0,0 +1,34 @@
from lib.conf_models import TTS_ENGINES, default_engine_settings
# VITS engine presets, keyed by preset name.
_vits_defaults = default_engine_settings[TTS_ENGINES['VITS']]

# Model sub-path -> language codes served by that model.
_vits_sub = {
    "css10/vits": ['es','hu','fi','fr','nl','ru','el'],
    "custom/vits": ['ca'],
    "custom/vits-female": ['bn', 'fa'],
    "cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
    "mai/vits": ['uk'],
    "mai_female/vits": ['pl'],
    "mai_male/vits": ['it'],
    "openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
    "vctk/vits": ['en'],
    "thorsten/vits": ['de'],
}

# Sub-paths whose sample rate differs from the engine default.
_vits_samplerate_overrides = {
    "mai_female/vits": 24000,
    "mai_male/vits": 16000,
}

models = {
    "internal": {
        "lang": "multi",
        # "[lang_iso1]" and "[xxx]" are placeholders inside the repo path
        # (presumably substituted by the engine loader; confirm there).
        "repo": "tts_models/[lang_iso1]/[xxx]",
        "sub": _vits_sub,
        "voice": None,
        "files": _vits_defaults['files'],
        # Sample rate per sub-path: engine default unless overridden above.
        "samplerate": {
            sub_path: _vits_samplerate_overrides.get(sub_path, _vits_defaults['samplerate'])
            for sub_path in _vits_sub
        },
    },
}

View File

@@ -0,0 +1,278 @@
import os
from lib.conf import voices_dir
from lib.conf_models import TTS_ENGINES, default_engine_settings
# XTTSv2 engine presets, keyed by preset name: the stock "internal" model
# followed by the fine-tuned voices. Every preset shares the engine's default
# file list and sample rate.
_xtts_defaults = default_engine_settings[TTS_ENGINES['XTTSv2']]
_xtts_fine_tuned_repo = "drewThomasson/fineTunedTTSModels"

# (preset name, language, age group, gender) for each fine-tuned voice.
# Order matters: it is the insertion order of the resulting dict.
_xtts_fine_tuned_voices = (
    ("AiExplained", "eng", "adult", "male"),
    ("AsmrRacoon", "eng", "adult", "male"),
    ("Awkwafina", "eng", "adult", "female"),
    ("BobOdenkirk", "eng", "adult", "male"),
    ("BobRoss", "eng", "adult", "male"),
    ("BrinaPalencia", "eng", "adult", "female"),
    ("BryanCranston", "eng", "adult", "male"),
    ("DavidAttenborough", "eng", "elder", "male"),
    ("DeathPussInBoots", "eng", "adult", "male"),
    ("DermotCrowley", "eng", "elder", "male"),
    ("EvaSeymour", "eng", "adult", "female"),
    ("GideonOfnirEldenRing", "eng", "elder", "male"),
    ("GhostMW2", "eng", "adult", "male"),
    ("JohnButlerASMR", "eng", "elder", "male"),
    ("JohnMulaney", "eng", "adult", "male"),
    ("JillRedfield", "eng", "adult", "female"),
    ("JuliaWhenlan", "eng", "adult", "female"),
    ("LeeHorsley", "eng", "adult", "male"),
    ("MelinaEldenRing", "eng", "adult", "female"),
    ("MorganFreeman", "eng", "adult", "male"),
    ("NeilGaiman", "eng", "adult", "male"),
    ("PeterGriffinFamilyGuy", "eng", "adult", "male"),
    ("RafeBeckley", "eng", "adult", "male"),
    ("RainyDayHeadSpace", "eng", "elder", "male"),
    ("RayPorter", "eng", "adult", "male"),
    ("RelaxForAWhile", "eng", "adult", "female"),
    ("RosamundPike", "eng", "adult", "female"),
    ("ScarlettJohansson", "eng", "adult", "female"),
    ("SladeTeenTitans", "eng", "adult", "male"),
    ("StanleyParable", "eng", "adult", "male"),
    ("Top15s", "eng", "adult", "male"),
    ("WhisperSalemASMR", "eng", "adult", "male"),
    ("Konishev", "rus", "adult", "male"),
)


def _xtts_fine_tuned_preset(name, lang, age, gender):
    """Build the preset entry for one fine-tuned XTTSv2 voice.

    The model lives under ``xtts-v2/<lang>/<name>/`` in the fine-tuned repo
    and its reference voice under ``<voices_dir>/<lang>/<age>/<gender>/<name>.wav``.
    """
    return {
        "lang": lang,
        "repo": _xtts_fine_tuned_repo,
        "sub": f"xtts-v2/{lang}/{name}/",
        "voice": os.path.join(voices_dir, lang, age, gender, f"{name}.wav"),
        "files": _xtts_defaults['files'],
        "samplerate": _xtts_defaults['samplerate'],
    }


models = {
    "internal": {
        "lang": "multi",
        "repo": "coqui/XTTS-v2",
        "sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
        "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
        "files": _xtts_defaults['files'],
        "samplerate": _xtts_defaults['samplerate'],
    },
    **{
        name: _xtts_fine_tuned_preset(name, lang, age, gender)
        for name, lang, age, gender in _xtts_fine_tuned_voices
    },
}

View File

@@ -0,0 +1,12 @@
from lib.conf_models import TTS_ENGINES, default_engine_settings
# YourTTS engine presets, keyed by preset name. The single "internal" preset
# takes its file list and sample rate from the engine defaults and ships
# without a default voice.
_yourtts_defaults = default_engine_settings[TTS_ENGINES['YOURTTS']]

models = {
    "internal": {
        "lang": "multi",
        "repo": "tts_models/multilingual/multi-dataset/your_tts",
        "sub": "",
        "voice": None,
        "files": _yourtts_defaults['files'],
        "samplerate": _yourtts_defaults['samplerate'],
    },
}

View File

@@ -1,151 +1,68 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class Tacotron2(TTSRegistry, name='tacotron'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {"semitones":{}}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {"semitones":{}}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
# Return the TTS engine stored under `key` in `loaded_tts`, creating it with
# TTS.api.TTS(model_path) when no truthy entry exists.
# Any exception is caught, printed as "_load_api() error: ..." and turned
# into a None return value instead of propagating.
# NOTE(review): `lock` is only read here, never rebound, so the `global`
# statement is not strictly required.
global lock
try:
with lock:
# Imported at call time rather than at module import.
from TTS.api import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
engine = TTSEngine(model_path)
if engine:
# Refresh the session's free-VRAM figure, then keep the engine in
# `loaded_tts` only when that figure exceeds the size reported for the
# engines already held there.
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _load_checkpoint(self,**kwargs:Any)->Any:
# Return the XTTS engine stored under kwargs['key'] in `loaded_tts`, building
# it from a config file and checkpoint when no truthy entry exists.
# Keyword arguments read: 'key', 'tts_engine', 'checkpoint_path',
# 'config_path', 'vocab_path'.
# Any exception (including the FileNotFoundError raised below) is caught,
# printed as "_load_checkpoint() error: ..." and turned into a None return.
global lock
try:
with lock:
key = kwargs.get('key')
engine = loaded_tts.get(key, False)
if not engine:
# NOTE(review): `engine_name` is assigned but not used in this function.
engine_name = kwargs.get('tts_engine', None)
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
if not checkpoint_path or not os.path.exists(checkpoint_path):
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
raise FileNotFoundError(error)
# NOTE(review): unreachable, the statement above always raises.
return False
if not config_path or not os.path.exists(config_path):
error = f'Missing or invalid config_path: {config_path}'
raise FileNotFoundError(error)
# NOTE(review): unreachable, the statement above always raises.
return False
config = XttsConfig()
config.models_dir = os.path.join("models","tts")
config.load_json(config_path)
engine = Xtts.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_path = checkpoint_path,
vocab_path = vocab_path,
eval = True
)
if engine:
# Refresh the session's free-VRAM figure, then keep the engine in
# `loaded_tts` only when that figure exceeds the size reported for the
# engines already held there.
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f'_load_checkpoint() error: {e}'
print(error)
return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
else:
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
iso_dir = default_engine_settings[self.session['tts_engine']]['languages'][self.session['language']]
sub_dict = self.models[self.session['fine_tuned']]['sub']
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate'][sub]
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate'][sub]
if sub is None:
iso_dir = self.session['language']
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
if sub is not None:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
self.tts_key = model_path
self.engine = self._load_api(self.tts_key, model_path)
m = self.engine.synthesizer.tts_model
engine = self._load_api(self.tts_key, model_path)
m = engine.synthesizer.tts_model
d = m.decoder
# Stability
d.prenet_dropout = 0.0
d.attention_dropout = 0.0
d.decoder_dropout = 0.0
m.attention.location_attention.dropout = 0.0
# Stop-gate tuning
d.gate_threshold = 0.5
d.force_gate = True
@@ -157,156 +74,15 @@ class Tacotron2(TTSRegistry, name='tacotron'):
else:
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
print(msg)
if self.engine:
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
else:
error = '_load_engine() failed!'
raise ValueError(error)
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
# Make sure the zero-shot engine is available on `self.engine_zs`: reuse the
# entry stored under `self.tts_zs_key` in `loaded_tts`, otherwise load
# `default_vc_model` through `_load_api`. On success the key is recorded in
# the session as 'model_zs_cache'.
# NOTE(review): no `return` statement is visible in this method although it
# is annotated `->Any`; callers get None.
try:
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not self.engine_zs:
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
if self.engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
# NOTE(review): this message is built but no print follows it here.
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
except Exception as e:
# NOTE(review): the error text is built but neither printed nor raised here,
# so a failure in this method is silent.
error = f'_load_engine_zs() error: {e}'
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
# Return a voice file usable for the session language.
# `voice_path` is returned unchanged when the session language is one of its
# path components, when `speaker` is one of the BARK default voices, or when
# the session language is 'eng'. Otherwise, for a language listed for XTTSv2
# that has a `default.txt` under `voices_dir`, that text is synthesized with
# the XTTSv2 "internal" model using the speaker's latents, and the normalized
# result is written to the same path with its `eng` directory component
# replaced by the session language; that new path is returned.
# Returns False after an exception has been caught and printed.
# NOTE(review): the original indentation is not recoverable in this view;
# confirm that every branch assigning `error` reaches a print/return.
try:
voice_parts = Path(voice_path).parts
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
# When free VRAM does not exceed the size of the engines already held,
# drop this session's own engine from `loaded_tts` before loading XTTSv2.
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
# files[0], files[1], files[2] are used as config, checkpoint and vocab respectively.
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
# Built-in XTTSv2 speaker: take the stored latents; otherwise compute
# them from the voice file.
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
# Collect the xtts_* session settings that are not None, strip the
# "xtts_" prefix and cast each to its listed type. The comprehension's
# `key` is local to the comprehension and does not rebind the outer `key`.
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
# Write the raw synthesis to a "_temp" file, then normalize it into the final path.
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
# Re-fetch this session's own engine and reload it when it is no longer in `loaded_tts`.
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
# NOTE(review): `new_voice_path` and `proc_voice_path` are first assigned late
# in the try body; if the exception happened before that point these reads
# would raise UnboundLocalError. Confirm and initialise them up front.
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
key=(orig_sr,target_sr)
if key not in self.resampler_cache:
self.resampler_cache[key]=torchaudio.transforms.Resample(
orig_freq = orig_sr,new_freq = target_sr
)
return self.resampler_cache[key]
def _resample_wav(self, wav_path:str, expected_sr:int)->str:
    """Return a path to a mono version of `wav_path` at `expected_sr`.

    If the file is already mono at the expected rate, `wav_path` itself is
    returned. Otherwise the channels are averaged and/or the audio is
    resampled, and the result is written as 16-bit PCM to a new ``.wav`` file
    inside ``<process_dir>/tmp``; that file's path is returned (it is not
    deleted here).
    """
    audio, source_sr = torchaudio.load(wav_path)
    channels = audio.size(0)
    rate_matches = source_sr == expected_sr
    if rate_matches and channels == 1:
        return wav_path
    if channels > 1:
        # Down-mix to mono, keeping the channel dimension.
        audio = audio.mean(dim=0, keepdim=True)
    if not rate_matches:
        audio = self._get_resampler(source_sr, expected_sr)(audio)
    samples = audio.squeeze(0).cpu().numpy()
    out_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(out_dir, exist_ok=True)
    # Only the unique file name is needed; the handle is closed before writing.
    with tempfile.NamedTemporaryFile(dir=out_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
raise ValueError(error)
def convert(self, sentence_index:int, sentence:str)->bool:
try:
@@ -314,7 +90,7 @@ class Tacotron2(TTSRegistry, name='tacotron'):
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -440,11 +216,11 @@ class Tacotron2(TTSRegistry, name='tacotron'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -457,7 +233,7 @@ class Tacotron2(TTSRegistry, name='tacotron'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -1,294 +1,71 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class Vits(TTSUtils, TTSRegistry, name='vits'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class Vits(TTSRegistry, name='vits'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {"semitones":{}}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {"semitones":{}}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
# Return the TTS engine stored under `key` in `loaded_tts`, creating it with
# TTS.api.TTS(model_path) when no truthy entry exists.
# Any exception is caught, printed as "_load_api() error: ..." and turned
# into a None return value instead of propagating.
# NOTE(review): `lock` is only read here, never rebound, so the `global`
# statement is not strictly required.
global lock
try:
with lock:
# Imported at call time rather than at module import.
from TTS.api import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
engine = TTSEngine(model_path)
if engine:
# Refresh the session's free-VRAM figure, then keep the engine in
# `loaded_tts` only when that figure exceeds the size reported for the
# engines already held there.
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] > models_loaded_size_gb:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the XTTS engine stored under ``kwargs['key']``, loading it from a checkpoint when absent.

    Keyword arguments read here:
        key: lookup key in ``loaded_tts``.
        checkpoint_path: model checkpoint file; must exist.
        config_path: XTTS JSON config file; must exist.
        vocab_path: optional vocabulary file forwarded to ``load_checkpoint``.
        tts_engine: accepted for caller compatibility; not used by this method.

    The engine is registered in ``loaded_tts`` only when the free VRAM
    reported for the session device is greater than the size reported by
    ``loaded_tts_size_gb``; ``self.session['free_vram_gb']`` is refreshed
    as a side effect.

    Returns:
        The engine, or ``None`` after printing the error when anything fails.
        A missing checkpoint/config raises ``FileNotFoundError`` internally,
        which is caught below and therefore also ends in ``None``.
    """
    try:
        # Serialize lookups/loads on the module-level lock.
        with lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path',None)
                vocab_path = kwargs.get('vocab_path',None)
                if not checkpoint_path or not os.path.exists(checkpoint_path):
                    error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                    raise FileNotFoundError(error)
                if not config_path or not os.path.exists(config_path):
                    error = f'Missing or invalid config_path: {config_path}'
                    raise FileNotFoundError(error)
                config = XttsConfig()
                config.models_dir = os.path.join("models","tts")
                config.load_json(config_path)
                engine = Xtts.init_from_config(config)
                engine.load_checkpoint(
                    config,
                    checkpoint_path = checkpoint_path,
                    vocab_path = vocab_path,
                    eval = True
                )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
else:
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
iso_dir = default_engine_settings[self.session['tts_engine']]['languages'][self.session['language']]
sub_dict = self.models[self.session['fine_tuned']]['sub']
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
if sub is not None:
self.params['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate'][sub]
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
self.tts_key = model_path
self.engine = self._load_api(self.tts_key, model_path)
engine = self._load_api(self.tts_key, model_path)
else:
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
print(msg)
if self.engine:
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
else:
error = '_load_engine() failed!'
raise ValueError(error)
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
    """Load the zero-shot voice-conversion engine and return it.

    Looks up ``self.tts_zs_key`` in ``loaded_tts`` and falls back to
    ``self._load_api(self.tts_zs_key, default_vc_model)``. The result is
    stored on ``self.engine_zs``; when it is usable the key is recorded in
    ``self.session['model_zs_cache']``.

    Returns:
        The engine when it is available, otherwise ``None``. Failures are
        printed instead of being dropped silently.
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not self.engine_zs:
            self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if self.engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            # Hand the engine back so callers may assign the result directly.
            return self.engine_zs
    except Exception as e:
        error = f'_load_engine_zs() error: {e}'
        print(error)
    return None
# Produce a voice reference file for the session language from a builtin English voice.
# Returns `voice_path` unchanged when no conversion applies, the path of the newly
# written voice file when the conversion succeeds, and False from the error path at the end.
# NOTE(review): indentation is not visible in this view, so exactly which branches fall
# through to the trailing `print(error)` / `return False` should be confirmed.
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
# Nothing to convert: the language is already a component of the path, the speaker is a BARK voice, or the session is English.
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
# Text that gets synthesized to create the converted voice sample.
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
# Key under which the stock XTTSv2 model is looked up in loaded_tts.
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
# When free VRAM does not exceed the reported size of loaded models, drop the session's engine entry first.
# NOTE(review): assuming dict semantics, this raises KeyError if self.tts_key is not in loaded_tts — confirm.
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
# Builtin XTTSv2 speakers reuse the stored latents; any other voice is analysed from its audio file.
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
# Forward only the xtts_* session settings that are set, cast to the listed type and with the prefix removed.
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
# Same path as the source voice, with the `eng` directory component replaced by the target language.
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
# Put the session's own engine back in place (its entry may have been deleted above).
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
# NOTE(review): new_voice_path / proc_voice_path are first assigned inside the try body; if the
# exception happened before that point these reads raise UnboundLocalError — consider
# initializing both to None ahead of the `try`.
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
key=(orig_sr,target_sr)
if key not in self.resampler_cache:
self.resampler_cache[key]=torchaudio.transforms.Resample(
orig_freq = orig_sr,new_freq = target_sr
)
return self.resampler_cache[key]
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
    """Return a path to a mono version of ``wav_path`` at ``expected_sr``.

    When the file is already single-channel at the expected rate the input
    path is returned untouched. Otherwise channels are averaged into one,
    the signal is resampled if the rates differ, and the result is written
    as 16-bit PCM to a new temporary ``.wav`` under
    ``<process_dir>/tmp``. That file is created with ``delete=False``, so
    this method does not remove it.
    """
    audio, source_sr = torchaudio.load(wav_path)
    rate_differs = source_sr != expected_sr
    if not rate_differs and audio.size(0) == 1:
        return wav_path
    if audio.size(0) > 1:
        # Downmix by averaging across the channel dimension.
        audio = audio.mean(dim=0, keepdim=True)
    if rate_differs:
        audio = self._get_resampler(source_sr, expected_sr)(audio)
    samples = audio.squeeze(0).cpu().numpy()
    scratch_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(scratch_dir, exist_ok=True)
    # Only the unique name is needed; the handle is closed before writing.
    with tempfile.NamedTemporaryFile(dir=scratch_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
raise ValueError(error)
def convert(self, sentence_index:int, sentence:str)->bool:
try:
@@ -296,7 +73,7 @@ class Vits(TTSRegistry, name='vits'):
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -326,11 +103,11 @@ class Vits(TTSRegistry, name='vits'):
trim_audio_buffer = 0.004
sentence += '' if sentence[-1].isalnum() else ''
speaker_argument = {}
if self.session['language'] == 'eng' and 'vctk/vits' in models[self.session['tts_engine']]['internal']['sub']:
if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits']:
if self.session['language'] == 'eng' and 'vctk/vits' in self.models['internal']['sub']:
if self.session['language'] in self.models['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in self.models['internal']['sub']['vctk/vits']:
speaker_argument = {"speaker": 'p262'}
elif self.session['language'] == 'cat' and 'custom/vits' in models[self.session['tts_engine']]['internal']['sub']:
if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits']:
elif self.session['language'] == 'cat' and 'custom/vits' in self.models['internal']['sub']:
if self.session['language'] in self.models['internal']['sub']['custom/vits'] or self.session['language_iso1'] in self.models['internal']['sub']['custom/vits']:
speaker_argument = {"speaker": '09901'}
if self.params['voice_path'] is not None:
proc_dir = os.path.join(self.session['voice_dir'], 'proc')
@@ -424,11 +201,11 @@ class Vits(TTSRegistry, name='vits'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -441,7 +218,7 @@ class Vits(TTSRegistry, name='vits'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -1,295 +1,71 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class XTTSv2(TTSRegistry, name='xtts'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {"latent_embedding":{}}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {"latent_embedding":{}}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
    """Return the TTS API engine stored under ``key``, creating it from ``model_path`` when absent.

    The engine is (re)registered in ``loaded_tts`` only when the free VRAM
    reported for the session device is greater than the size reported by
    ``loaded_tts_size_gb``; it is returned to the caller either way.
    ``self.session['free_vram_gb']`` is refreshed as a side effect.

    Returns:
        The engine, or ``None`` after printing the error if anything raised.
    """
    try:
        # Serialize lookups/loads on the module-level lock.
        with lock:
            from TTS.api import TTS as TTSEngine
            tts_api = loaded_tts.get(key, False) or TTSEngine(model_path)
            if tts_api:
                vram_info = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_info.get('free_vram_gb', 0)
                cached_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > cached_size_gb:
                    loaded_tts[key] = tts_api
            return tts_api
    except Exception as e:
        print(f"_load_api() error: {e}")
    return None
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the XTTS engine stored under ``kwargs['key']``, loading it from a checkpoint when absent.

    Keyword arguments read here:
        key: lookup key in ``loaded_tts``.
        checkpoint_path: model checkpoint file; must exist.
        config_path: XTTS JSON config file; must exist.
        vocab_path: optional vocabulary file forwarded to ``load_checkpoint``.
        tts_engine: accepted for caller compatibility; not used by this method.

    The engine is registered in ``loaded_tts`` only when the free VRAM
    reported for the session device is greater than the size reported by
    ``loaded_tts_size_gb``; ``self.session['free_vram_gb']`` is refreshed
    as a side effect.

    Returns:
        The engine, or ``None`` after printing the error when anything fails.
        A missing checkpoint/config raises ``FileNotFoundError`` internally,
        which is caught below and therefore also ends in ``None``.
    """
    try:
        # Serialize lookups/loads on the module-level lock.
        with lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path',None)
                vocab_path = kwargs.get('vocab_path',None)
                if not checkpoint_path or not os.path.exists(checkpoint_path):
                    error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                    raise FileNotFoundError(error)
                if not config_path or not os.path.exists(config_path):
                    error = f'Missing or invalid config_path: {config_path}'
                    raise FileNotFoundError(error)
                config = XttsConfig()
                config.models_dir = os.path.join("models","tts")
                config.load_json(config_path)
                engine = Xtts.init_from_config(config)
                engine.load_checkpoint(
                    config,
                    checkpoint_path = checkpoint_path,
                    vocab_path = vocab_path,
                    eval = True
                )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
else:
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
hf_repo = self.models[self.session['fine_tuned']]['repo']
if self.session['fine_tuned'] == 'internal':
hf_sub = ''
if self.speakers_path is None:
self.speakers_path = hf_hub_download(repo_id=hf_repo, filename='speakers_xtts.pth', cache_dir=self.cache_dir)
else:
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if self.engine:
hf_sub = self.models[self.session['fine_tuned']]['sub']
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
    """Load the zero-shot voice-conversion engine and return it.

    Looks up ``self.tts_zs_key`` in ``loaded_tts`` and falls back to
    ``self._load_api(self.tts_zs_key, default_vc_model)``. The result is
    stored on ``self.engine_zs``; when it is usable the key is recorded in
    ``self.session['model_zs_cache']``.

    Returns:
        The engine when it is available, otherwise ``None``. Failures are
        printed instead of being dropped silently.
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not self.engine_zs:
            self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if self.engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            # Hand the engine back so callers may assign the result directly.
            return self.engine_zs
    except Exception as e:
        error = f'_load_engine_zs() error: {e}'
        print(error)
    return None
# Produce a voice reference file for the session language from a builtin English voice.
# Returns `voice_path` unchanged when no conversion applies, the path of the newly
# written voice file when the conversion succeeds, and False from the error path at the end.
# NOTE(review): indentation is not visible in this view, so exactly which branches fall
# through to the trailing `print(error)` / `return False` should be confirmed.
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
# Nothing to convert: the language is already a component of the path, the speaker is a BARK voice, or the session is English.
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
# Text that gets synthesized to create the converted voice sample.
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
# Key under which the stock XTTSv2 model is looked up in loaded_tts.
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
# When free VRAM does not exceed the reported size of loaded models, drop the session's engine entry first.
# NOTE(review): assuming dict semantics, this raises KeyError if self.tts_key is not in loaded_tts — confirm.
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
# Builtin XTTSv2 speakers reuse the stored latents; any other voice is analysed from its audio file.
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
# Forward only the xtts_* session settings that are set, cast to the listed type and with the prefix removed.
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
# Same path as the source voice, with the `eng` directory component replaced by the target language.
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
# Put the session's own engine back in place (its entry may have been deleted above).
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
# NOTE(review): new_voice_path / proc_voice_path are first assigned inside the try body; if the
# exception happened before that point these reads raise UnboundLocalError — consider
# initializing both to None ahead of the `try`.
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
key=(orig_sr,target_sr)
if key not in self.resampler_cache:
self.resampler_cache[key]=torchaudio.transforms.Resample(
orig_freq = orig_sr,new_freq = target_sr
)
return self.resampler_cache[key]
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
    """Return a path to a mono version of ``wav_path`` at ``expected_sr``.

    When the file is already single-channel at the expected rate the input
    path is returned untouched. Otherwise channels are averaged into one,
    the signal is resampled if the rates differ, and the result is written
    as 16-bit PCM to a new temporary ``.wav`` under
    ``<process_dir>/tmp``. That file is created with ``delete=False``, so
    this method does not remove it.
    """
    audio, source_sr = torchaudio.load(wav_path)
    rate_differs = source_sr != expected_sr
    if not rate_differs and audio.size(0) == 1:
        return wav_path
    if audio.size(0) > 1:
        # Downmix by averaging across the channel dimension.
        audio = audio.mean(dim=0, keepdim=True)
    if rate_differs:
        audio = self._get_resampler(source_sr, expected_sr)(audio)
    samples = audio.squeeze(0).cpu().numpy()
    scratch_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(scratch_dir, exist_ok=True)
    # Only the unique name is needed; the handle is closed before writing.
    with tempfile.NamedTemporaryFile(dir=scratch_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
raise ValueError(error)
def convert(self, sentence_index:int, sentence:str)->bool:
try:
@@ -297,7 +73,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -335,7 +111,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[self.params['voice_path']])
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[self.params['voice_path']], librosa_trim_db=30, load_sr=24000, sound_norm_refs=True)
self.params['latent_embedding'][self.params['voice_path']] = self.params['gpt_cond_latent'], self.params['speaker_embedding']
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
@@ -389,11 +165,11 @@ class XTTSv2(TTSRegistry, name='xtts'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -406,7 +182,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -1,285 +1,62 @@
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
from lib.classes.tts_engines.common.headers import *
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
import regex as re
import numpy as np
import soundfile as sf
class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
from multiprocessing.managers import DictProxy
from typing import Any
from pathlib import Path
from huggingface_hub import hf_hub_download
from lib.classes.tts_registry import TTSRegistry
from lib.classes.vram_detector import VRAMDetector
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
from lib import *
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
class YourTTS(TTSRegistry, name='yourtts'):
def __init__(self, session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = self.session['model_cache']
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {}
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self.models = load_engine_presets(self.session['tts_engine'])
self.params = {}
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
seed = 123456
random.seed(seed)
np.random.seed(seed)
seed = 0
#random.seed(seed)
#np.random.seed(seed)
torch.manual_seed(seed)
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
if has_cuda:
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = load_xtts_builtin_list()
self._load_engine()
self._load_engine_zs()
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
self.xtts_speakers = self._load_xtts_builtin_list()
self.engine = self._load_engine()
self.engine_zs = self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
raise ValueError(error)
def _load_api(self, key:str, model_path:str)->Any:
    """Return the TTS API engine stored under ``key``, creating it from ``model_path`` when absent.

    The engine is (re)registered in ``loaded_tts`` only when the free VRAM
    reported for the session device is greater than the size reported by
    ``loaded_tts_size_gb``; it is returned to the caller either way.
    ``self.session['free_vram_gb']`` is refreshed as a side effect.

    Returns:
        The engine, or ``None`` after printing the error if anything raised.
    """
    try:
        # Serialize lookups/loads on the module-level lock.
        with lock:
            from TTS.api import TTS as TTSEngine
            tts_api = loaded_tts.get(key, False) or TTSEngine(model_path)
            if tts_api:
                vram_info = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_info.get('free_vram_gb', 0)
                cached_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > cached_size_gb:
                    loaded_tts[key] = tts_api
            return tts_api
    except Exception as e:
        print(f"_load_api() error: {e}")
    return None
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the XTTS engine stored under ``kwargs['key']``, loading it from a checkpoint when absent.

    Keyword arguments read here:
        key: lookup key in ``loaded_tts``.
        checkpoint_path: model checkpoint file; must exist.
        config_path: XTTS JSON config file; must exist.
        vocab_path: optional vocabulary file forwarded to ``load_checkpoint``.
        tts_engine: accepted for caller compatibility; not used by this method.

    The engine is registered in ``loaded_tts`` only when the free VRAM
    reported for the session device is greater than the size reported by
    ``loaded_tts_size_gb``; ``self.session['free_vram_gb']`` is refreshed
    as a side effect.

    Returns:
        The engine, or ``None`` after printing the error when anything fails.
        A missing checkpoint/config raises ``FileNotFoundError`` internally,
        which is caught below and therefore also ends in ``None``.
    """
    try:
        # Serialize lookups/loads on the module-level lock.
        with lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path',None)
                vocab_path = kwargs.get('vocab_path',None)
                if not checkpoint_path or not os.path.exists(checkpoint_path):
                    error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                    raise FileNotFoundError(error)
                if not config_path or not os.path.exists(config_path):
                    error = f'Missing or invalid config_path: {config_path}'
                    raise FileNotFoundError(error)
                config = XttsConfig()
                config.models_dir = os.path.join("models","tts")
                config.load_json(config_path)
                engine = Xtts.init_from_config(config)
                engine.load_checkpoint(
                    config,
                    checkpoint_path = checkpoint_path,
                    vocab_path = vocab_path,
                    eval = True
                )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
def _load_engine(self)->None:
def _load_engine(self)->Any:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_memory()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._cleanup_memory()
engine = loaded_tts.get(self.tts_key, False)
if not engine:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
else:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
self.engine = self._load_api(self.tts_key, model_path)
if self.engine:
model_path = self.models[self.session['fine_tuned']]['repo']
engine = self._load_api(self.tts_key, model_path)
if engine and engine is not None:
msg = f'TTS {self.tts_key} Loaded!'
return engine
else:
error = '_load_engine() failed!'
raise ValueError(error)
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
    """
    Load the zero-shot voice-conversion engine, or reuse the cached one.

    Stores the engine on `self.engine_zs` and also returns it, so callers may
    either ignore the result or assign it. On success the session's
    'model_zs_cache' entry is set to `self.tts_zs_key`.

    Returns the engine; a falsy value when it could not be loaded; None when
    an exception occurred (the error is printed, not re-raised).
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not self.engine_zs:
            self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if self.engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            msg = f'ZeroShot {self.tts_zs_key} Loaded!'
            print(msg)
        return self.engine_zs
    except Exception as e:
        # report the failure instead of discarding it
        error = f'_load_engine_zs() error: {e}'
        print(error)
        return None
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
return voice_path
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_memory()
engine = loaded_tts.get(key, False)
if not engine:
vram_dict = VRAMDetector().detect_vram(self.session['device'])
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
if self.session['free_vram_gb'] <= models_loaded_size_gb:
del loaded_tts[self.tts_key]
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
if engine:
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
#"xtts_codec_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
#"xtts_cvvp_weight": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
#"xtts_gpt_cond_len": int,
#"xtts_gpt_batch_size": int,
"xtts_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
self._load_engine()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
if new_voice_path:
Path(new_voice_path).unlink(missing_ok=True)
if proc_voice_path:
Path(proc_voice_path).unlink(missing_ok=True)
print(error)
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data, torch.Tensor):
return audio_data
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
    """
    Return the Resample transform for an (orig_sr, target_sr) pair.

    Transforms are kept in `self.resampler_cache`, keyed by the rate pair, so
    each pair is constructed once and reused on later calls.
    """
    rate_pair = (orig_sr, target_sr)
    try:
        return self.resampler_cache[rate_pair]
    except KeyError:
        transform = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        self.resampler_cache[rate_pair] = transform
        return transform
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
    """
    Return a path to a single-channel copy of `wav_path` at `expected_sr`.

    When the file already has one channel and the expected sample rate, the
    original path is returned and nothing is written. Otherwise the channels
    are averaged, the audio is resampled if the rate differs, and the result
    is written as 16-bit PCM to a new file under `<process_dir>/tmp`. That
    file is not removed here.
    """
    waveform, orig_sr = torchaudio.load(wav_path)
    channel_count = waveform.size(0)
    rate_matches = orig_sr == expected_sr
    if rate_matches and channel_count == 1:
        return wav_path
    if channel_count > 1:
        # down-mix by averaging the channels
        waveform = waveform.mean(dim=0, keepdim=True)
    if not rate_matches:
        waveform = self._get_resampler(orig_sr, expected_sr)(waveform)
    samples = waveform.squeeze(0).cpu().numpy()
    tmp_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    # reserve a unique file name; the handle is closed before writing to it
    with tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
raise ValueError(error)
def convert(self, sentence_index:int, sentence:str)->bool:
try:
@@ -287,7 +64,7 @@ class YourTTS(TTSRegistry, name='yourtts'):
audio_sentence = False
self.params['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
else self.models[self.session['fine_tuned']]['voice']
)
if self.params['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
@@ -354,11 +131,11 @@ class YourTTS(TTSRegistry, name='yourtts'):
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_memory()
self._cleanup_memory()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
@@ -371,7 +148,7 @@ class YourTTS(TTSRegistry, name='yourtts'):
print(error)
return False
else:
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
error = f"TTS engine {self.session['tts_engine']} failed to load!"
print(error)
return False
except Exception as e:

View File

@@ -1,16 +1,15 @@
import lib.classes.tts_engines
from typing import Any
from lib.classes.tts_registry import TTSRegistry
import lib.classes.tts_engines
class TTSManager:
def __init__(self, session: Any) -> None:
self.session = session
def __init__(self, session:Any)->None:
self.session = session
engine_name = session.get("tts_engine")
if engine_name is None:
raise ValueError("session['tts_engine'] is missing")
try:
engine_cls = TTSRegistry.ENGINES[engine_name]
except KeyError:
@@ -18,8 +17,7 @@ class TTSManager:
f"Invalid tts_engine '{engine_name}'. "
f"Expected one of: {', '.join(TTSRegistry.ENGINES)}"
)
self.engine = engine_cls(session)
def convert_sentence2audio(self, sentence_number:int, sentence:str)->bool:
def convert_sentence2audio(self, sentence_number: int, sentence: str) -> bool:
return self.engine.convert(sentence_number, sentence)

View File

@@ -12,10 +12,9 @@ from io import BytesIO
from pydub import AudioSegment, silence
from pydub.silence import detect_silence
from lib.conf import voice_formats, default_audio_proc_samplerate
from lib.models import TTS_ENGINES, models
from lib.classes.background_detector import BackgroundDetector
from lib.classes.subprocess_pipe import SubprocessPipe
from lib.conf import voice_formats, default_audio_proc_samplerate
class VoiceExtractor:
def __init__(self, session:Any, voice_file:str, voice_name:str):

View File

@@ -48,7 +48,8 @@ os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostran
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32,garbage_collection_threshold:0.6,expandable_segments:True'
os.environ['TORCH_CUDA_ENABLE_CUDA_GRAPH'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,garbage_collection_threshold:0.6,expandable_segments:True'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_CACHE_MAXSIZE"] = "2147483648"
@@ -93,7 +94,6 @@ default_pytorch_url = 'https://download.pytorch.org/whl'
default_jetson_url = 'https://www.e-blokos.com/whl/jetson' # TODO: find a definitive place where to upload the jetpack5 torch
jetson_torch_version_base = {
"jetson51": "2.4.1",
"jetson60": "2.4.0",
"jetson61": "2.5.0"
}
@@ -125,7 +125,6 @@ torch_matrix = {
"xpu": {"url": default_pytorch_url},
# JETSON
"jetson51": {"url": default_jetson_url},
"jetson60": {"url": default_jetson_url},
"jetson61": {"url": default_jetson_url}
}
@@ -134,7 +133,7 @@ cuda_version_range = {"min": (11,8), "max": (12,8)}
rocm_version_range = {"min": (5,5), "max": (6,4)}
mps_version_range = {"min": (0,0), "max": (0,0)}
xpu_version_range = {"min": (0,0), "max": (0,0)}
jetson_version_range = {"min": (5,1), "max": (6,2)}
jetson_version_range = {"min": (6,0), "max": (6,2)}
# ---------------------------------------------------------------------
# Python environment references

File diff suppressed because one or more lines are too long

162
lib/conf_models.py Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,517 +0,0 @@
import os
from lib.conf import tts_dir, voices_dir
# Module-level dict, empty at import time.
# NOTE(review): presumably the shared cache of loaded engines keyed by model key - confirm against the engine loaders.
loaded_tts = {}
# NOTE(review): presumably filled with the XTTS builtin speaker data at runtime - not visible here, confirm.
xtts_builtin_speakers_list = {}
# Symbolic engine name -> engine identifier string used as the key of
# `default_engine_settings` and `models`.
TTS_ENGINES = {
"XTTSv2": "xtts",
"BARK": "bark",
"VITS": "vits",
"FAIRSEQ": "fairseq",
"TACOTRON2": "tacotron",
"YOURTTS": "yourtts"
}
# Voice-conversion models: model path and the sample rate (Hz) listed for each.
TTS_VOICE_CONVERSION = {
"freevc24": {"path": "voice_conversion_models/multilingual/vctk/freevc24", "samplerate": 24000},
"knnvc": {"path": "voice_conversion_models/multilingual/multi-dataset/knnvc", "samplerate": 16000},
"openvoice_v1": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1", "samplerate": 22050},
"openvoice_v2": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2", "samplerate": 22050}
}
# Text markers -> internal tokens; "pause" and "###" map to the same token.
TTS_SML = {
"break": "‡break‡",
"pause": "‡pause‡",
"###": "‡pause‡"
}
# Defaults, derived from the tables above where possible.
default_tts_engine = TTS_ENGINES['XTTSv2']
default_fine_tuned = 'internal'
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
default_voice_detection_model = 'drewThomasson/segmentation'
# NOTE(review): presumably upper bounds on user-supplied models and voices - enforcement is not visible here.
max_custom_model = 100
max_custom_voices = 1000
# Per-engine defaults, keyed by the engine identifiers in TTS_ENGINES.
# Every entry has "samplerate", "files", "voices" and "rating"; XTTSv2 and
# BARK add their own generation parameters.
default_engine_settings = {
    TTS_ENGINES['XTTSv2']: {
        "samplerate": 24000,
        "temperature": 0.75,
        #"codec_temperature": 0.3,
        "length_penalty": 1.0,
        "num_beams": 1,
        "repetition_penalty": 2.0,
        #"cvvp_weight": 0.3,
        "top_k": 50,
        "top_p": 0.85,
        "speed": 1.0,
        #"gpt_cond_len": 512,
        #"gpt_batch_size": 1,
        "enable_text_splitting": False,
        "files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav'],
        # key = display name with the space removed, value = display name
        "voices": {
            display_name.replace(" ", ""): display_name
            for display_name in (
                "Claribel Dervla", "Daisy Studious", "Gracie Wise",
                "Tammie Ema", "Alison Dietlinde", "Ana Florence",
                "Annmarie Nele", "Asya Anara", "Brenda Stern",
                "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
                "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie",
                "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
                "Royston Min", "Viktor Eka", "Abrahan Mack",
                "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
                "Damien Black", "Gilberto Mathias", "Ilkin Urbano",
                "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
                "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios",
                "Nova Hogarth", "Maja Ruoho", "Uta Obando",
                "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger",
                "Camilla Holmström", "Lilya Stainthorpe", "Zofija Kendrick",
                "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa",
                "Alma María", "Rosemary Okafor", "Ige Behringer",
                "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro",
                "Aaron Dreschner", "Kumar Dahl", "Eugenio Mataracı",
                "Ferran Simen", "Xavier Hayasaka", "Luis Moray",
                "Marcos Rudaski",
            )
        },
        "rating": {"VRAM": 4, "CPU": 2, "RAM": 4, "Realism": 5}
    },
    TTS_ENGINES['BARK']: {
        "samplerate": 24000,
        "text_temp": 0.22,
        "waveform_temp": 0.44,
        "files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
        "speakers_path": os.path.join(voices_dir, '__bark'),
        # ten speakers (0-9) per language prefix, in this language order
        "voices": {
            f"{lang}_speaker_{idx}": f"Speaker {idx}"
            for lang in ("de", "en", "es", "fr", "hi", "it", "ja", "ko", "pl", "pt", "ru", "tr", "zh")
            for idx in range(10)
        },
        "rating": {"VRAM": 6, "CPU": 1, "RAM": 6, "Realism": 5}
    },
    TTS_ENGINES['VITS']: {
        "samplerate": 22050,
        "files": ['config.json', 'model_file.pth', 'language_ids.json'],
        "voices": {},
        "rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
    },
    TTS_ENGINES['FAIRSEQ']: {
        "samplerate": 16000,
        "files": ['config.json', 'G_100000.pth', 'vocab.json'],
        "voices": {},
        "rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
    },
    TTS_ENGINES['TACOTRON2']: {
        "samplerate": 22050,
        "files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
        "voices": {},
        "rating": {"VRAM": 1, "CPU": 5, "RAM": 2, "Realism": 3}
    },
    TTS_ENGINES['YOURTTS']: {
        "samplerate": 16000,
        "files": ['config.json', 'model_file.pth'],
        # the trailing "\n" on the two pt speaker ids is part of the value
        "voices": {
            "Machinella-5": "female-en-5",
            "ElectroMale-2": "male-en-2",
            "Machinella-4": "female-pt-4\n",
            "ElectroMale-3": "male-pt-3\n",
        },
        "rating": {"VRAM": 1, "CPU": 5, "RAM": 1, "Realism": 2}
    }
}
models = {
TTS_ENGINES['XTTSv2']: {
"internal": {
"lang": "multi",
"repo": "coqui/XTTS-v2",
"sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"AiExplained": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/AiExplained/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"AsmrRacoon": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/AsmrRacoon/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"Awkwafina": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/Awkwafina/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"BobOdenkirk": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/BobOdenkirk/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"BobRoss": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/BobRoss/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"BrinaPalencia": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/BrinaPalencia/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"BryanCranston": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/BryanCranston/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"DavidAttenborough": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/DavidAttenborough/",
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"DeathPussInBoots": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/DeathPussInBoots/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"DermotCrowley": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/DermotCrowley/",
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"EvaSeymour": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/EvaSeymour/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"GideonOfnirEldenRing": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/GideonOfnirEldenRing/",
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"GhostMW2": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/GhostMW2/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"JohnButlerASMR": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/JohnButlerASMR/",
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JohnButlerASMR.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"JohnMulaney": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/JohnMulaney/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JohnMulaney.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"JillRedfield": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/JillRedfield/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"JuliaWhenlan": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/JuliaWhenlan/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"LeeHorsley": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/LeeHorsley/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"MelinaEldenRing": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/MelinaEldenRing/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"MorganFreeman": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/MorganFreeman/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"NeilGaiman": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/NeilGaiman/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"PeterGriffinFamilyGuy": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/PeterGriffinFamilyGuy/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'PeterGriffinFamilyGuy.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RafeBeckley": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/RafeBeckley/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RafeBeckley.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RainyDayHeadSpace": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/RainyDayHeadSpace/",
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RayPorter": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/RayPorter/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RelaxForAWhile": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/RelaxForAWhile/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RosamundPike": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/RosamundPike/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"ScarlettJohansson": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/ScarlettJohansson/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"SladeTeenTitans": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/SladeTeenTitans/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"StanleyParable": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/StanleyParable/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"Top15s": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/Top15s/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'Top15s.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"WhisperSalemASMR": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/WhisperSalemASMR/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"Konishev": {
"lang": "rus",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/rus/Konishev/",
"voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
}
},
# BARK engine presets: a single built-in ("internal") entry with "lang" set to "multi".
# NOTE(review): the inline comments on "repo" and "sub" look like alternative
# repo ids / sub-folder choices kept for reference — confirm before relying on them.
TTS_ENGINES['BARK']: {
"internal": {
"lang": "multi",
"repo": "erogol/bark", # erogol/bark, suno/bark, rsxdalv/suno, tts_models/multilingual/multi-dataset/bark
"sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
# Voice file path built under voices_dir: eng/adult/male/KumarDahl.wav.
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
# File list and sample rate are read from the BARK entry of default_engine_settings.
"files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
}
},
# VITS engine presets: a single built-in ("internal") entry with "lang" set to "multi".
# "repo" is a template string containing the placeholders [lang_iso1] and [xxx].
# "sub" and "samplerate" are parallel dicts that share the same keys.
# NOTE(review): presumably [xxx] is replaced by the "sub" key whose list contains
# the requested language code — confirm against the code that consumes this dict.
TTS_ENGINES['VITS']: {
"internal": {
"lang": "multi",
"repo": "tts_models/[lang_iso1]/[xxx]",
# Maps each "<dataset>/<model>" key to the list of language codes listed for it.
"sub": {
"css10/vits": ['es','hu','fi','fr','nl','ru','el'],
"custom/vits": ['ca'],
"custom/vits-female": ['bn', 'fa'],
"cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
"mai/vits": ['uk'],
"mai_female/vits": ['pl'],
"mai_male/vits": ['it'],
# NOTE(review): these codes are not two-letter codes even though the repo
# placeholder is named [lang_iso1] — confirm how they are substituted.
"openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
"vctk/vits": ['en'],
"thorsten/vits": ['de']
},
# No voice file is set for this entry.
"voice": None,
"files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
# Sample rate per "sub" key: every key uses the VITS default from
# default_engine_settings except the two hard-coded values below.
"samplerate": {
"css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"mai_female/vits": 24000, # hard-coded override of the engine default
"mai_male/vits": 16000, # hard-coded override of the engine default
"openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
"thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
}
}
},
# FAIRSEQ engine presets: a single built-in ("internal") entry with "lang" set to "multi".
# "repo" is a template string containing the placeholder [lang].
# NOTE(review): presumably [lang] is replaced with the selected language code
# at load time — confirm against the code that consumes this dict.
TTS_ENGINES['FAIRSEQ']: {
"internal": {
"lang": "multi",
"repo": "tts_models/[lang]/fairseq/vits",
"sub": "",
# No voice file is set for this entry.
"voice": None,
# File list and sample rate are read from the FAIRSEQ entry of default_engine_settings.
"files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
}
},
# TACOTRON2 engine presets: a single built-in ("internal") entry with "lang" set to "multi".
# "repo" is a template string containing the placeholders [lang_iso1] and [xxx].
# "sub" and "samplerate" are parallel dicts that share the same keys.
# NOTE(review): presumably [xxx] is replaced by the "sub" key whose list contains
# the requested language code — confirm against the code that consumes this dict.
TTS_ENGINES['TACOTRON2']: {
"internal": {
"lang": "multi",
"repo": "tts_models/[lang_iso1]/[xxx]",
# Maps each "<dataset>/<model>" key to the list of language codes listed for it.
"sub": {
"mai/tacotron2-DDC": ['fr', 'es', 'nl'],
"thorsten/tacotron2-DDC": ['de'],
"kokoro/tacotron2-DDC": ['ja'],
"ljspeech/tacotron2-DDC": ['en'],
# NOTE(review): 'zh-CN' carries a region suffix, unlike the other codes here.
"baker/tacotron2-DDC-GST": ['zh-CN']
},
# No voice file is set for this entry.
"voice": None,
"files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
# Sample rate per "sub" key: every key uses the TACOTRON2 default from
# default_engine_settings.
"samplerate": {
"mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
"thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
"kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
"ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
"baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
},
}
},
# YOURTTS engine presets: a single built-in ("internal") entry with "lang" set to "multi".
# Unlike the templated repos of some other engines, "repo" here is a fixed string
# with no placeholders.
TTS_ENGINES['YOURTTS']: {
"internal": {
"lang": "multi",
"repo": "tts_models/multilingual/multi-dataset/your_tts",
"sub": "",
# No voice file is set for this entry.
"voice": None,
# File list and sample rate are read from the YOURTTS entry of default_engine_settings.
"files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
}
}
}