mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-09 13:58:14 -05:00
v25.12.25
This commit is contained in:
53
.github/languages.yml
vendored
Normal file
53
.github/languages.yml
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
languages:
|
||||
- code: ar # Arabic
|
||||
name: Arabic (ara)
|
||||
- code: zh-CN # Chinese (Simplified)
|
||||
name: Chinese (zho)
|
||||
- code: en # English
|
||||
name: English (eng)
|
||||
- code: es # Spanish
|
||||
name: Spanish (spa)
|
||||
- code: fr # French
|
||||
name: French (fra)
|
||||
- code: de # German
|
||||
name: German (deu)
|
||||
- code: it # Italian
|
||||
name: Italian (ita)
|
||||
- code: pt # Portuguese
|
||||
name: Portuguese (por)
|
||||
- code: pl # Polish
|
||||
name: Polish (pol)
|
||||
- code: tr # Turkish
|
||||
name: Turkish (tur)
|
||||
- code: ru # Russian
|
||||
name: Russian (rus)
|
||||
- code: nl # Dutch
|
||||
name: Dutch (nld)
|
||||
- code: cs # Czech
|
||||
name: Czech (ces)
|
||||
- code: ja # Japanese
|
||||
name: Japanese (jpn)
|
||||
- code: hi # Hindi
|
||||
name: Hindi (hin)
|
||||
- code: bn # Bengali
|
||||
name: Bengali (ben)
|
||||
- code: hu # Hungarian
|
||||
name: Hungarian (hun)
|
||||
- code: ko # Korean
|
||||
name: Korean (kor)
|
||||
- code: vi # Vietnamese
|
||||
name: Vietnamese (vie)
|
||||
- code: sv # Swedish
|
||||
name: Swedish (swe)
|
||||
- code: fa # Persian
|
||||
name: Persian (fas)
|
||||
- code: yo # Yoruba
|
||||
name: Yoruba (yor)
|
||||
- code: sw # Swahili
|
||||
name: Swahili (swa)
|
||||
- code: id # Indonesian
|
||||
name: Indonesian (ind)
|
||||
- code: sk # Slovak
|
||||
name: Slovak (slk)
|
||||
- code: hr # Croatian
|
||||
name: Croatian (hrv)
|
||||
0
tmp/.gitkeep → .github/scripts/.gitkeep
vendored
0
tmp/.gitkeep → .github/scripts/.gitkeep
vendored
@@ -1 +1 @@
|
||||
25.12.20
|
||||
25.12.25
|
||||
24
app.py
24
app.py
@@ -1,8 +1,8 @@
|
||||
import argparse, socket, multiprocessing, sys, warnings
|
||||
|
||||
from lib.conf import *
|
||||
from lib.lang import default_language_code
|
||||
from lib.models import TTS_ENGINES, default_fine_tuned, default_engine_settings
|
||||
from lib.conf_lang import default_language_code
|
||||
from lib.conf_models import TTS_ENGINES, default_fine_tuned, default_engine_settings
|
||||
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning)
|
||||
warnings.filterwarnings("ignore", category=UserWarning, module="jieba._compat")
|
||||
@@ -234,10 +234,10 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
error = f'Error: Could not installed device packages!'
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
import lib.functions as f
|
||||
f.context = f.SessionContext() if f.context is None else f.context
|
||||
f.context_tracker = f.SessionTracker() if f.context_tracker is None else f.context_tracker
|
||||
f.active_sessions = set() if f.active_sessions is None else f.active_sessions
|
||||
import lib.core as c
|
||||
c.context = c.SessionContext() if c.context is None else c.context
|
||||
c.context_tracker = c.SessionTracker() if c.context_tracker is None else c.context_tracker
|
||||
c.active_sessions = set() if c.active_sessions is None else c.active_sessions
|
||||
# Conditions based on the --headless flag
|
||||
if args['headless']:
|
||||
args['is_gui_process'] = False
|
||||
@@ -292,7 +292,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
if any(file.endswith(ext) for ext in ebook_formats):
|
||||
full_path = os.path.abspath(os.path.join(args['ebooks_dir'], file))
|
||||
args['ebook_list'].append(full_path)
|
||||
progress_status, passed = f.convert_ebook_batch(args)
|
||||
progress_status, passed = c.convert_ebook_batch(args)
|
||||
if passed is False:
|
||||
error = f'Conversion failed: {progress_status}'
|
||||
print(error)
|
||||
@@ -303,7 +303,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
error = f'Error: The provided --ebook "{args["ebook"]}" does not exist.'
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
progress_status, passed = f.convert_ebook(args)
|
||||
progress_status, passed = c.convert_ebook(args)
|
||||
if passed is False:
|
||||
error = f'Conversion failed: {progress_status}'
|
||||
print(error)
|
||||
@@ -334,16 +334,16 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
)
|
||||
except OSError as e:
|
||||
error = f'Connection error: {e}'
|
||||
f.alert_exception(error, None)
|
||||
c.alert_exception(error, None)
|
||||
except socket.error as e:
|
||||
error = f'Socket error: {e}'
|
||||
f.alert_exception(error, None)
|
||||
c.alert_exception(error, None)
|
||||
except KeyboardInterrupt:
|
||||
error = 'Server interrupted by user. Shutting down...'
|
||||
f.alert_exception(error, None)
|
||||
c.alert_exception(error, None)
|
||||
except Exception as e:
|
||||
error = f'An unexpected error occurred: {e}'
|
||||
f.alert_exception(error, None)
|
||||
c.alert_exception(error, None)
|
||||
else:
|
||||
error = 'Error: In GUI mode, no option or only --share can be passed'
|
||||
print(error)
|
||||
|
||||
@@ -71,12 +71,14 @@ while (( $# > 0 )); do
|
||||
case "$1" in
|
||||
--*)
|
||||
key="${1#--}"
|
||||
if [[ -n "$2" && "$2" != --* ]]; then
|
||||
arguments[$key]="$2"
|
||||
if (( $# > 1 )) && [[ "$2" != --* ]]; then
|
||||
arguments["$key"]="$2"
|
||||
shift 2
|
||||
continue
|
||||
else
|
||||
arguments[$key]=true
|
||||
arguments["$key"]=true
|
||||
shift
|
||||
continue
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
@@ -84,9 +86,9 @@ while (( $# > 0 )); do
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
|
||||
if [[ -n "${arguments[script_mode]+exists}" ]]; then
|
||||
if [[ "${arguments[script_mode]}" == "$BUILD_DOCKER" ]]; then
|
||||
SCRIPT_MODE="${arguments[script_mode]}"
|
||||
@@ -601,6 +603,8 @@ function check_conda {
|
||||
# Detect Jetson and select correct Python version
|
||||
MODEL="$(tr -d '\0' </proc/device-tree/model 2>/dev/null | tr 'A-Z' 'a-z' || true)"
|
||||
if [[ "$MODEL" == *jetson* ]]; then
|
||||
# needed gfortran to compile pip scipy pkg
|
||||
sudo apt-get install gfortran
|
||||
PYTHON_VERSION="3.10"
|
||||
fi
|
||||
else
|
||||
|
||||
@@ -1,10 +1,3 @@
|
||||
from .models import (
|
||||
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
|
||||
default_engine_settings, default_vc_model, default_voice_detection_model,
|
||||
loaded_tts, xtts_builtin_speakers_list, max_custom_model, max_custom_voices,
|
||||
models, os, voices_dir
|
||||
)
|
||||
|
||||
from .conf import (
|
||||
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
|
||||
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate, max_upload_size,
|
||||
@@ -18,21 +11,23 @@ from .conf import (
|
||||
voices_dir, default_output_split, default_output_split_hours
|
||||
)
|
||||
|
||||
from .lang import (
|
||||
from .conf_lang import (
|
||||
abbreviations_mapping, chapter_word_mapping, default_language_code,
|
||||
roman_numbers_tuples, emojis_list, install_info, language_mapping,
|
||||
language_math_phonemes, language_clock, language_tts, os, punctuation_list,
|
||||
language_math_phonemes, language_clock, os, punctuation_list,
|
||||
punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
|
||||
punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
|
||||
specialchars_mapping, chars_remove, year_to_decades_languages
|
||||
specialchars_mapping, chars_remove, year_to_decades_languages,
|
||||
)
|
||||
|
||||
from .conf_models import (
|
||||
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
|
||||
default_engine_settings, default_vc_model, default_voice_detection_model,
|
||||
loaded_tts, xtts_builtin_speakers_list,
|
||||
max_custom_model, max_custom_voices, voices_dir
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# from models
|
||||
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
|
||||
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
|
||||
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
|
||||
"max_custom_voices", "models", "os", "voices_dir",
|
||||
|
||||
# from conf
|
||||
"FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
|
||||
@@ -46,11 +41,17 @@ __all__ = [
|
||||
"requirements_file", "components_dir", "tmp_dir", "tmp_expire", "tts_dir",
|
||||
"voice_formats", "voices_dir", "default_output_split", "default_output_split_hours",
|
||||
|
||||
# from lang
|
||||
# from conf_lang
|
||||
"abbreviations_mapping", "chapter_word_mapping", "default_language_code",
|
||||
"roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
|
||||
"language_math_phonemes", "language_clock", "language_tts", "os", "punctuation_list",
|
||||
"language_math_phonemes", "language_clock", "os", "punctuation_list",
|
||||
"punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
|
||||
"punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
|
||||
"specialchars_mapping", "chars_remove", "year_to_decades_languages"
|
||||
"specialchars_mapping", "chars_remove", "year_to_decades_languages",
|
||||
|
||||
# from conf_models
|
||||
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
|
||||
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
|
||||
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
|
||||
"max_custom_voices", "voices_dir"
|
||||
]
|
||||
|
||||
@@ -5,7 +5,7 @@ import argostranslate.translate
|
||||
|
||||
from iso639 import Lang
|
||||
from lib.conf import models_dir
|
||||
from lib.lang import language_mapping
|
||||
from lib.conf_lang import language_mapping
|
||||
|
||||
# NOTE: source_lang and target_lang must be iso639-1 (2 letters)
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ import threading
|
||||
from pyannote.audio import Model
|
||||
from pyannote.audio.pipelines import VoiceActivityDetection
|
||||
from lib.conf import tts_dir
|
||||
from lib.models import default_voice_detection_model
|
||||
from lib.conf_models import default_voice_detection_model
|
||||
|
||||
|
||||
_PIPELINE_CACHE = {}
|
||||
|
||||
@@ -27,7 +27,7 @@ class DeviceInstaller():
|
||||
os_env = 'linux' if name == 'jetson' else self.check_platform
|
||||
elif mode == 'build_docker':
|
||||
os_env = 'linux' if name == 'jetson' else 'manylinux_2_28'
|
||||
pyvenv = [3,10] if tag in ['jetson51', 'jetson60', 'jetson61'] else pyvenv
|
||||
pyvenv = [3,10] if tag in ['jetson60', 'jetson61'] else pyvenv
|
||||
if all([name, tag, os_env, arch, pyvenv]):
|
||||
device_info = {"name": name, "os": os_env, "arch": arch, "pyvenv": pyvenv, "tag": tag, "note": msg}
|
||||
return json.dumps(device_info)
|
||||
@@ -171,27 +171,10 @@ class DeviceInstaller():
|
||||
rev_major = int(parts[0])
|
||||
rev_minor = int(parts[1]) if len(parts) > 1 else 0
|
||||
rev_patch = int(parts[2]) if len(parts) > 2 else 0
|
||||
if l4t_major < 35:
|
||||
msg = f'JetPack too old (L4T {l4t_major}). Please upgrade to JetPack 5.1+. Falling back to CPU.'
|
||||
if l4t_major < 36:
|
||||
msg = f'JetPack too old (L4T {l4t_major}). Please upgrade to JetPack 6.0+. Falling back to CPU.'
|
||||
return ('unsupported', msg)
|
||||
if l4t_major == 35:
|
||||
if rev_major == 0 and rev_minor <= 1:
|
||||
msg = 'JetPack 5.0/5.0.1 detected. Please upgrade to JetPack 5.1+ to use the GPU. Failing back to CPU'
|
||||
return ('cpu', msg)
|
||||
if rev_major == 0 and rev_minor >= 2:
|
||||
msg = 'JetPack 5.0.x detected. Please upgrade to JetPack 5.1+ to use the GPU. Failing back to CPU'
|
||||
return ('cpu', msg)
|
||||
if rev_major == 1 and rev_minor == 0:
|
||||
msg = 'JetPack 5.1.0 detected. Please upgrade to JetPack 5.1.2 or newer.'
|
||||
return ('51', msg)
|
||||
if rev_major == 1 and rev_minor == 1:
|
||||
msg = 'JetPack 5.1.1 detected. Please upgrade to JetPack 5.1.2 or newer.'
|
||||
return ('51', msg)
|
||||
if (rev_major > 1) or (rev_major == 1 and rev_minor >= 2):
|
||||
return ('51', msg)
|
||||
msg = 'Unrecognized JetPack 5.x version. Falling back to CPU.'
|
||||
return ('unknown', msg)
|
||||
if l4t_major == 36:
|
||||
else:
|
||||
if rev_major == 2:
|
||||
return ('60', msg)
|
||||
else:
|
||||
@@ -734,7 +717,8 @@ class DeviceInstaller():
|
||||
torchaudio_pkg = f"{url}/v{toolkit_version}/torchaudio-{jetson_torch_version_base[tag]}%2B{tag}-{tag_py}-{os_env}_{arch}.whl"
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--no-cache-dir', torch_pkg])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', '--no-cache-dir', torchaudio_pkg])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'uninstall', '-y', 'scikit-learn'])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--force', '--no-binary=scikit-learn', 'scikit-learn'])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--force', '--no-cache-dir', '--no-binary=scipy', 'scipy'])
|
||||
elif device_info['name'] == devices['MPS']['proc']:
|
||||
torch_tag_py = f'cp{default_py_major}{default_py_minor}-none'
|
||||
torchaudio_tag_py = f'cp{default_py_major}{default_py_minor}-cp{default_py_major}{default_py_minor}'
|
||||
|
||||
@@ -20,7 +20,7 @@ class SubprocessPipe:
|
||||
self.progress_bar((percent / 100), desc=self.msg)
|
||||
|
||||
def _on_complete(self)->None:
|
||||
msg = f"{self.msg} completed!"
|
||||
msg = f"\n{self.msg} completed!"
|
||||
print(msg)
|
||||
if self.is_gui_process:
|
||||
self.progress_bar(1.0, desc=msg)
|
||||
|
||||
@@ -1,280 +1,75 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class Bark(TTSUtils, TTSRegistry, name='bark'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class Bark(TTSRegistry, name='bark'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
if engine_name == TTS_ENGINES['XTTSv2']:
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
elif engine_name == TTS_ENGINES['BARK']:
|
||||
from TTS.tts.configs.bark_config import BarkConfig
|
||||
from TTS.tts.models.bark import Bark
|
||||
checkpoint_dir = kwargs.get('checkpoint_dir')
|
||||
if not checkpoint_dir or not os.path.exists(checkpoint_dir):
|
||||
error = f'Missing or invalid checkpoint_dir: {checkpoint_dir}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
#check_pth = ensure_safe_checkpoint(checkpoint_dir)
|
||||
#if not check_pth:
|
||||
# error = f'No valid checkpoint files found or conversion failed in: {checkpoint_dir}'
|
||||
# raise RuntimeError(error)
|
||||
# return False
|
||||
config = BarkConfig()
|
||||
config.CACHE_DIR = self.cache_dir
|
||||
config.USE_SMALLER_MODELS = True if os.environ['SUNO_USE_SMALL_MODELS'] == 'True' else False
|
||||
engine = Bark.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_dir = checkpoint_dir,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
else:
|
||||
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
|
||||
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
"""
|
||||
hf_repo = self.models[self.session['fine_tuned']]['repo']
|
||||
hf_sub = self.models[self.session['fine_tuned']]['sub']
|
||||
text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
|
||||
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
checkpoint_dir = os.path.dirname(text_model_path)
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir)
|
||||
if self.engine:
|
||||
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir)
|
||||
"""
|
||||
model_path = self.models[self.session['fine_tuned']]['repo']
|
||||
engine = self._load_api(self.tts_key, model_path)
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
else:
|
||||
error = '_load_engine() failed!'
|
||||
raise ValueError(error)
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
raise ValueError(error)
|
||||
"""
|
||||
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str)->bool:
|
||||
try:
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
|
||||
if self.session['language'] in default_engine_settings[TTS_ENGINES['BARK']].get('languages', {}):
|
||||
pth_voice_dir = os.path.join(bark_dir, speaker)
|
||||
pth_voice_file = os.path.join(pth_voice_dir,f'{speaker}.pth')
|
||||
if os.path.exists(pth_voice_file):
|
||||
@@ -310,51 +105,14 @@ class Bark(TTSRegistry, name='bark'):
|
||||
error = f'_check_bark_npz() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
|
||||
"""
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
speaker = None
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -398,11 +156,13 @@ class Bark(TTSRegistry, name='bark'):
|
||||
if speaker in default_engine_settings[self.session['tts_engine']]['voices'].keys():
|
||||
bark_dir = default_engine_settings[self.session['tts_engine']]['speakers_path']
|
||||
else:
|
||||
bark_dir = os.path.join(os.path.dirname(self.params['voice_path']), 'bark')
|
||||
bark_dir = os.path.join(os.path.dirname(self.params['voice_path']), 'bark')
|
||||
"""
|
||||
if not self._check_bark_npz(self.params['voice_path'], bark_dir, speaker):
|
||||
error = 'Could not create pth voice file!'
|
||||
print(error)
|
||||
return False
|
||||
"""
|
||||
pth_voice_dir = os.path.join(bark_dir, speaker)
|
||||
pth_voice_file = os.path.join(bark_dir, speaker, f'{speaker}.pth')
|
||||
fine_tuned_params = {
|
||||
@@ -414,6 +174,7 @@ class Bark(TTSRegistry, name='bark'):
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
"""
|
||||
result = self.engine.synthesize(
|
||||
sentence,
|
||||
#speaker_wav=self.params['voice_path'],
|
||||
@@ -421,9 +182,17 @@ class Bark(TTSRegistry, name='bark'):
|
||||
voice_dir=pth_voice_dir,
|
||||
**fine_tuned_params
|
||||
)
|
||||
audio_sentence = result.get('wav')
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
"""
|
||||
audio_sentence = self.engine.tts(
|
||||
text=sentence,
|
||||
speaker_wav=self.params['voice_path'],
|
||||
speaker=speaker,
|
||||
voice_dir=pth_voice_dir,
|
||||
**fine_tuned_params
|
||||
)
|
||||
#audio_sentence = result.get('wav')
|
||||
#if is_audio_data_valid(audio_sentence):
|
||||
# audio_sentence = audio_sentence.tolist()
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
@@ -447,11 +216,11 @@ class Bark(TTSRegistry, name='bark'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -464,7 +233,7 @@ class Bark(TTSRegistry, name='bark'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
41
lib/classes/tts_engines/common/headers.py
Normal file
41
lib/classes/tts_engines/common/headers.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import os, torch, torchaudio, random, subprocess, uuid, regex as re, numpy as np
|
||||
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from multiprocessing.managers import DictProxy
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.conf import tts_dir, devices, default_audio_proc_format
|
||||
from lib.conf_models import TTS_ENGINES, TTS_SML, TTS_VOICE_CONVERSION, loaded_tts, default_vc_model, default_engine_settings
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.tts_engines.common.utils import TTSUtils
|
||||
from lib.classes.tts_engines.common.audio import detect_gender, trim_audio, is_audio_data_valid
|
||||
|
||||
__all__ = [
|
||||
"os",
|
||||
"torch",
|
||||
"torchaudio",
|
||||
"random",
|
||||
"subprocess",
|
||||
"uuid",
|
||||
"re",
|
||||
"np",
|
||||
"Any",
|
||||
"Path",
|
||||
"DictProxy",
|
||||
"hf_hub_download",
|
||||
"TTSRegistry",
|
||||
"TTSUtils",
|
||||
"detect_gender",
|
||||
"trim_audio",
|
||||
"is_audio_data_valid",
|
||||
"tts_dir",
|
||||
"devices",
|
||||
"default_audio_proc_format",
|
||||
"TTS_ENGINES",
|
||||
"TTS_SML",
|
||||
"TTS_VOICE_CONVERSION",
|
||||
"loaded_tts",
|
||||
"default_vc_model",
|
||||
"default_engine_settings"
|
||||
]
|
||||
20
lib/classes/tts_engines/common/preset_loader.py
Normal file
20
lib/classes/tts_engines/common/preset_loader.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import importlib
|
||||
import threading
|
||||
from typing import Dict, Any
|
||||
|
||||
# Serialises access to the preset cache below.
_lock = threading.Lock()
# engine name -> the `models` object exposed by that engine's presets module.
_presets_cache:Dict[str, Dict[str, Any]] = {}

def load_engine_presets(engine:str)->Dict[str, Any]:
    """Return the ``models`` presets of a TTS engine, importing them on first use.

    The presets live in ``lib.classes.tts_engines.presets.<engine>_presets``.
    The module is imported under the module lock and its ``models`` attribute
    is cached, so later calls for the same engine return the cached object.

    Args:
        engine: Engine name used to build the presets module path.

    Returns:
        The ``models`` attribute of the presets module.

    Raises:
        RuntimeError: The presets module has no ``models`` attribute.
        ImportError: Propagated from ``importlib`` when the module is missing.
    """
    with _lock:
        try:
            return _presets_cache[engine]
        except KeyError:
            pass
        module_name = f"lib.classes.tts_engines.presets.{engine}_presets"
        presets_module = importlib.import_module(module_name)
        # A private sentinel keeps a legitimate falsy/None `models` distinguishable from a missing one.
        missing = object()
        presets = getattr(presets_module, "models", missing)
        if presets is missing:
            raise RuntimeError(
                f"'models' not found in {engine}_presets"
            )
        _presets_cache[engine] = presets
        return presets
|
||||
@@ -1,8 +1,4 @@
|
||||
import os
|
||||
import gc
|
||||
import torch
|
||||
import shutil
|
||||
import regex as re
|
||||
import os, threading, gc, torch, torchaudio, shutil, tempfile, regex as re, soundfile as sf, numpy as np
|
||||
|
||||
from typing import Any, Union, Dict
|
||||
from huggingface_hub import hf_hub_download
|
||||
@@ -11,190 +7,307 @@ from pathlib import Path
|
||||
from torch import Tensor
|
||||
from torch.nn import Module
|
||||
|
||||
from lib.conf import tts_dir
|
||||
from lib.models import xtts_builtin_speakers_list, TTS_ENGINES, models
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.audio import normalize_audio
|
||||
from lib import *
|
||||
|
||||
def cleanup_memory()->None:
    """Run the garbage collector and, when CUDA is available, release cached CUDA memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.synchronize()
|
||||
_lock = threading.Lock()
|
||||
|
||||
def model_size_bytes(model:Module)->int:
    """Return the number of bytes held by a module's parameters and buffers.

    Each tensor contributes ``nelement() * element_size()``; entries that are
    not ``Tensor`` instances are ignored.
    """
    tensors = [*model.parameters(), *model.buffers()]
    return sum(
        t.nelement() * t.element_size()
        for t in tensors
        if isinstance(t, Tensor)
    )
|
||||
class TTSUtils:
|
||||
|
||||
def loaded_tts_size_gb(loaded_tts:Dict[str, Module])->float:
    """Return the combined size of the given models in GiB, rounded to two decimals.

    Sizing is best effort: a model whose size cannot be computed is left out
    of the total instead of aborting the whole measurement.
    """
    byte_count = 0
    for engine in loaded_tts.values():
        try:
            byte_count += model_size_bytes(engine)
        except Exception:
            # Deliberately ignored: one unmeasurable entry must not break the total.
            continue
    return round(byte_count / (1024 ** 3), 2)
|
||||
def _cleanup_memory(self)->None:
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.ipc_collect()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
def load_xtts_builtin_list()->dict:
    """Return the shared XTTS builtin-speaker mapping, downloading it on first use.

    When ``xtts_builtin_speakers_list`` is empty, ``speakers_xtts.pth`` is
    fetched from the XTTSv2 internal repo and its entries are copied into the
    shared mapping in place.

    Raises:
        RuntimeError: Wraps any failure (download, load, unexpected format).
    """
    try:
        if len(xtts_builtin_speakers_list) > 0:
            return xtts_builtin_speakers_list
        repo_id = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
        speakers_path = hf_hub_download(repo_id=repo_id, filename='speakers_xtts.pth', cache_dir=tts_dir)
        # NOTE(review): weights_only=False unpickles arbitrary objects; the file is trusted to come from the configured repo.
        loaded = torch.load(speakers_path, weights_only=False)
        if not isinstance(loaded, dict):
            raise TypeError(
                f"Invalid XTTS speakers format: {type(loaded)}"
            )
        for speaker_name in loaded:
            if speaker_name not in xtts_builtin_speakers_list:
                xtts_builtin_speakers_list[speaker_name] = loaded[speaker_name]
        return xtts_builtin_speakers_list
    except Exception as error:
        raise RuntimeError(
            "load_xtts_builtin_list() failed"
        ) from error
|
||||
|
||||
def apply_cuda_policy(using_gpu, enough_vram, seed):
    """Configure the CUDA memory cap, cuDNN/cuBLAS flags and the CUDA seed.

    Two profiles exist. When both ``using_gpu`` and ``enough_vram`` are
    truthy, the process may use 95% of device memory and cuDNN benchmarking,
    TF32 and fp16 reduced-precision reductions are enabled. Otherwise the cap
    is 70% and those three options are disabled. cuDNN stays enabled and
    deterministic in both profiles.

    Args:
        using_gpu: Whether inference runs on a GPU device.
        enough_vram: Whether the device has enough free memory for the fast profile.
        seed: Seed passed to ``torch.cuda.manual_seed_all``.
    """
    fast = bool(using_gpu and enough_vram)
    # set_per_process_memory_fraction() initialises CUDA and raises on hosts
    # without a usable CUDA device, which is exactly where the conservative
    # profile is selected; only touch the allocator when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.set_per_process_memory_fraction(0.95 if fast else 0.7)
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = fast
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.allow_tf32 = fast
    torch.backends.cuda.matmul.allow_tf32 = fast
    torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = fast
    torch.cuda.manual_seed_all(seed)
|
||||
|
||||
def append_sentence2vtt(sentence_obj:dict[str, Any], path:str)->Union[int, bool]:
    """Append one cue to a WebVTT file and return the next cue index.

    The current index is one more than the number of lines containing ``-->``
    already in the file. When ``sentence_obj`` carries ``resume_check`` and it
    is lower than that index, the cue is treated as already written and
    nothing is appended.

    Args:
        sentence_obj: Needs ``start`` and ``end`` (seconds) and ``text``;
            ``resume_check`` is optional.
        path: VTT file path; created with a ``WEBVTT`` header when missing.

    Returns:
        The index for the next cue, or ``False`` when an error was printed.
    """

    def format_timestamp(seconds:float)->str:
        # Round to whole milliseconds before splitting into fields; formatting
        # the seconds field on its own turns 59.9996 into the invalid "60.000".
        total_ms = int(round(seconds * 1000))
        total_s, ms = divmod(total_ms, 1000)
        m, s = divmod(total_s, 60)
        h, m = divmod(m, 60)
        return f"{h:02}:{m:02}:{s:02}.{ms:03}"

    try:
        index = 1
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                index += sum(1 for line in f if "-->" in line)
        if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
            return index # Already written
        if not os.path.exists(path):
            with open(path, "w", encoding="utf-8") as f:
                f.write("WEBVTT\n\n")
        with open(path, "a", encoding="utf-8") as f:
            start = format_timestamp(float(sentence_obj["start"]))
            end = format_timestamp(float(sentence_obj["end"]))
            # Cue text must stay on one line, so newlines collapse to spaces.
            text = re.sub(r"[\r\n]+", " ", str(sentence_obj["text"])).strip()
            f.write(f"{start} --> {end}\n{text}\n\n")
        return index + 1
    except Exception as e:
        error = f"append_sentence2vtt() error: {e}"
        print(error)
        return False
|
||||
|
||||
def is_safetensors_file(path:str)->bool:
    """Return True when the file starts with a valid safetensors header.

    A safetensors file begins with an 8-byte little-endian header length
    followed by that many bytes of UTF-8 JSON describing the tensors. The
    literal word "safetensors" is not part of the format, so looking for it
    in the first bytes rejects genuine files; the header is parsed instead.

    Any error (missing file, short read, invalid JSON) yields False.
    """
    import json  # local import: the module's import block may not provide it
    max_header_bytes = 100_000_000  # upper bound on the header size to read
    try:
        with open(path, 'rb') as f:
            prefix = f.read(8)
            if len(prefix) != 8:
                return False
            header_len = int.from_bytes(prefix, 'little')
            if header_len < 2 or header_len > max_header_bytes:
                return False
            header = f.read(header_len)
        if len(header) != header_len:
            return False
        return isinstance(json.loads(header.decode('utf-8')), dict)
    except Exception:
        # The function is a probe: callers only expect a boolean answer.
        return False
|
||||
|
||||
def convert_pt_to_safetensors(pth_path:str, delete_original:bool=False)->str:
    """Convert a ``.pth``/``.pt`` checkpoint to ``safetensors/<name>.safetensors``.

    The output is written to a ``safetensors`` directory next to the input.
    A top-level ``"model"`` entry is unwrapped, one level of nested dicts is
    flattened to ``"outer.inner"`` keys, and non-tensor values are dropped.

    Args:
        pth_path: Path of the checkpoint to convert.
        delete_original: Remove the source file after a successful save.

    Returns:
        The path of the written safetensors file.

    Raises:
        FileNotFoundError: ``pth_path`` does not exist.
        ValueError: The suffix is neither ``.pth`` nor ``.pt``.
        Exception: Any load/save failure is printed and re-raised.
    """
    pth_path = Path(pth_path)
    if not pth_path.exists():
        error = f'File not found: {pth_path}'
        print(error)
        # Carry the message on the exception so callers that log it see the path.
        raise FileNotFoundError(error)
    if pth_path.suffix not in ('.pth', '.pt'):
        error = f'Expected a .pth or .pt file, got: {pth_path.suffix}'
        print(error)
        raise ValueError(error)
    safe_dir = pth_path.parent / "safetensors"
    safe_dir.mkdir(exist_ok=True)
    safe_path = safe_dir / pth_path.with_suffix('.safetensors').name
    msg = f'Converting {pth_path.name} → safetensors/{safe_path.name}'
    print(msg)
    try:
        try:
            state = torch.load(str(pth_path), map_location='cpu', weights_only=True)
        except Exception:
            # NOTE(review): the fallback unpickles arbitrary objects; only acceptable for trusted checkpoints.
            error = f'⚠️ weights_only load failed for {pth_path.name}, retrying unsafely (trusted file).'
            print(error)
            state = torch.load(str(pth_path), map_location='cpu', weights_only=False)
        if isinstance(state, dict) and "model" in state:
            state = state["model"]
        flattened = {}
        for k, v in state.items():
            if isinstance(v, dict):
                for subk, subv in v.items():
                    flattened[f"{k}.{subk}"] = subv
            else:
                flattened[k] = v
        # Copy every tensor into its own contiguous storage: a plain clone()
        # keeps the source strides, so a transposed weight would stay
        # non-contiguous.
        state = {
            k: v.detach().clone(memory_format=torch.contiguous_format)
            for k, v in flattened.items()
            if isinstance(v, torch.Tensor)
        }
        save_file(state, str(safe_path))
        if delete_original:
            pth_path.unlink(missing_ok=True)
            msg = f'Deleted original: {pth_path}'
            print(msg)
        msg = f'Saved: {safe_path}'
        print(msg)
        return str(safe_path)
    except Exception as e:
        error = f'Failed to convert {pth_path.name}: {e}'
        print(error)
        raise
|
||||
|
||||
def ensure_safe_checkpoint(checkpoint_dir:str)->list[str]:
|
||||
safe_files = []
|
||||
if os.path.isfile(checkpoint_dir):
|
||||
if not (checkpoint_dir.endswith('.pth') or checkpoint_dir.endswith('.pt')):
|
||||
error = f'Invalid checkpoint file: {checkpoint_dir}'
|
||||
raise ValueError(error)
|
||||
if not is_safetensors_file(checkpoint_dir):
|
||||
def _loaded_tts_size_gb(self, loaded_tts:Dict[str, Module])->float:
|
||||
total_bytes = 0
|
||||
for model in loaded_tts.values():
|
||||
try:
|
||||
safe_path = convert_pt_to_safetensors(checkpoint_dir, False)
|
||||
msg = f'Created safetensors version of {os.path.basename(checkpoint_dir)} → {safe_path}'
|
||||
print(msg)
|
||||
safe_files.append(safe_path)
|
||||
except Exception as e:
|
||||
error = f'Failed to convert {os.path.basename(checkpoint_dir)}: {e}'
|
||||
print(error)
|
||||
total_bytes += model_size_bytes(model)
|
||||
except Exception:
|
||||
pass
|
||||
gb = total_bytes / (1024 ** 3)
|
||||
return round(gb, 2)
|
||||
|
||||
def _load_xtts_builtin_list(self)->dict:
    """Return the shared XTTS builtin-speaker mapping, downloading it on first use.

    When ``xtts_builtin_speakers_list`` is empty, ``speakers_xtts.pth`` is
    fetched from the XTTSv2 repo named in ``default_engine_settings`` and its
    entries are copied into the shared mapping in place.

    Raises:
        RuntimeError: Wraps any failure (download, load, unexpected format).
    """
    try:
        if len(xtts_builtin_speakers_list) > 0:
            return xtts_builtin_speakers_list
        repo_id = default_engine_settings[TTS_ENGINES['XTTSv2']]['repo']
        speakers_path = hf_hub_download(repo_id=repo_id, filename='speakers_xtts.pth', cache_dir=tts_dir)
        # NOTE(review): weights_only=False unpickles arbitrary objects; the file is trusted to come from the configured repo.
        loaded = torch.load(speakers_path, weights_only=False)
        if not isinstance(loaded, dict):
            raise TypeError(
                f"Invalid XTTS speakers format: {type(loaded)}"
            )
        for speaker_name in loaded:
            if speaker_name not in xtts_builtin_speakers_list:
                xtts_builtin_speakers_list[speaker_name] = loaded[speaker_name]
        return xtts_builtin_speakers_list
    except Exception as error:
        raise RuntimeError(
            "self._load_xtts_builtin_list() failed"
        ) from error
|
||||
|
||||
def _apply_cuda_policy(self, using_gpu:bool, enough_vram:bool, seed:int)->None:
|
||||
torch.cuda.manual_seed_all(0)
|
||||
if using_gpu and enough_vram:
|
||||
torch.cuda.set_per_process_memory_fraction(0.95)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = True
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
else:
|
||||
safe_files.append(checkpoint_dir)
|
||||
return safe_files
|
||||
if not os.path.isdir(checkpoint_dir):
|
||||
raise FileNotFoundError(f"Invalid checkpoint_dir: {checkpoint_dir}")
|
||||
for root, _, files in os.walk(checkpoint_dir):
|
||||
for fname in files:
|
||||
if fname.endswith(".pth") or fname.endswith(".pt"):
|
||||
pth_path = os.path.join(root, fname)
|
||||
if is_safetensors_file(pth_path):
|
||||
safe_files.append(pth_path)
|
||||
continue
|
||||
try:
|
||||
safe_path = convert_pt_to_safetensors(pth_path, False)
|
||||
msg = f'Created safetensors version of {os.path.relpath(pth_path, checkpoint_dir)} → {os.path.relpath(safe_path, checkpoint_dir)}'
|
||||
torch.cuda.set_per_process_memory_fraction(0.7)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
    """Return the engine cached under ``key``, creating it via ``TTS.api.TTS`` when absent.

    The engine is stored in ``loaded_tts`` only when the detected free VRAM
    exceeds the size reported for the models already loaded; it is returned
    either way. The detected free VRAM is written to ``session['free_vram_gb']``.

    Returns:
        The engine, or ``None`` after printing the error when anything raises.
    """
    try:
        with _lock:
            from TTS.api import TTS as TTSEngine
            engine = loaded_tts.get(key, False) or TTSEngine(model_path)
            if engine:
                vram_info = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_info.get('free_vram_gb', 0)
                loaded_gb = self._loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > loaded_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        print(f"_load_api() error: {e}")
    return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the XTTS engine cached under ``kwargs['key']``, loading it from a checkpoint when absent.

    Keyword arguments read: ``key`` (cache key), ``checkpoint_path`` and
    ``config_path`` (both must exist on disk) and the optional ``vocab_path``.
    ``tts_engine`` is accepted for call compatibility but not used here.

    The engine is stored in ``loaded_tts`` only when the detected free VRAM
    exceeds the size reported for the models already loaded.

    Returns:
        The engine, or ``None`` after printing the error when anything raises,
        including a missing checkpoint or config path.
    """
    try:
        with _lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                from TTS.tts.configs.xtts_config import XttsConfig
                from TTS.tts.models.xtts import Xtts
                checkpoint_path = kwargs.get('checkpoint_path')
                config_path = kwargs.get('config_path',None)
                vocab_path = kwargs.get('vocab_path',None)
                # Both raises are caught by the handler below, so the caller sees None.
                if not checkpoint_path or not os.path.exists(checkpoint_path):
                    error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                    raise FileNotFoundError(error)
                if not config_path or not os.path.exists(config_path):
                    error = f'Missing or invalid config_path: {config_path}'
                    raise FileNotFoundError(error)
                config = XttsConfig()
                config.models_dir = os.path.join("models","tts")
                config.load_json(config_path)
                engine = Xtts.init_from_config(config)
                engine.load_checkpoint(
                    config,
                    checkpoint_path = checkpoint_path,
                    vocab_path = vocab_path,
                    eval = True
                )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = self._loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
|
||||
|
||||
def _load_engine_zs(self)->Any:
    """Return the zero-shot voice-conversion engine, loading it when not cached.

    The engine is looked up in ``loaded_tts`` under ``self.tts_zs_key`` and
    otherwise loaded from ``default_vc_model`` via ``_load_api``. On success
    ``session['model_zs_cache']`` records the key.

    Returns:
        The engine, or the falsy value produced by the lookup/loader.

    Raises:
        ValueError: Wraps any exception raised while loading.
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        self._cleanup_memory()
        engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not engine_zs:
            engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            msg = f'ZeroShot {self.tts_zs_key} Loaded!'
            # The success message was built but never shown.
            print(msg)
        return engine_zs
    except Exception as e:
        error = f'_load_engine_zs() error: {e}'
        # Keep the original exception as the cause for debugging.
        raise ValueError(error) from e
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
xtts = TTS_ENGINES['XTTSv2']
|
||||
if self.session['language'] in default_engine_settings[xtts].get('languages', {}):
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
safe_files.append(safe_path)
|
||||
except Exception as e:
|
||||
error = f'Failed to convert {fname}: {e}'
|
||||
print(error)
|
||||
return safe_files
|
||||
key = f"{xtts}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = self._loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = default_engine_settings[xtts]['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{default_engine_settings[xtts]['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=xtts, key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[xtts]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[xtts]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path], librosa_trim_db=30, load_sr=24000, sound_norm_refs=True)
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[xtts]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {xtts} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
    """Return the ``Resample`` transform for a (source, target) rate pair.

    Transforms are kept in ``self.resampler_cache`` keyed by the rate pair, so
    each pair is constructed once per instance.
    """
    rates = (orig_sr, target_sr)
    try:
        return self.resampler_cache[rates]
    except KeyError:
        resampler = torchaudio.transforms.Resample(
            orig_freq = orig_sr,new_freq = target_sr
        )
        self.resampler_cache[rates] = resampler
        return resampler
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
    """Return a path to a mono version of ``wav_path`` at ``expected_sr``.

    A file that already has one channel and the expected rate is returned
    as is. Otherwise channels are averaged, the signal is resampled when the
    rates differ, and the result is written as 16-bit PCM to a new file under
    ``<process_dir>/tmp``. The new file is not deleted by this method.
    """
    waveform, orig_sr = torchaudio.load(wav_path)
    channels = waveform.size(0)
    if channels == 1 and orig_sr == expected_sr:
        return wav_path
    if channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if orig_sr != expected_sr:
        waveform = self._get_resampler(orig_sr, expected_sr)(waveform)
    samples = waveform.squeeze(0).cpu().numpy()
    out_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(out_dir, exist_ok=True)
    # Only the unique name is needed; the handle is closed before soundfile writes to the path.
    with tempfile.NamedTemporaryFile(dir=out_dir, suffix=".wav", delete=False) as handle:
        out_path = handle.name
    sf.write(out_path, samples, expected_sr, subtype="PCM_16")
    return out_path
|
||||
|
||||
def _append_sentence2vtt(self, sentence_obj:dict[str, Any], path:str)->Union[int, bool]:
|
||||
|
||||
def format_timestamp(seconds:float)->str:
|
||||
m, s = divmod(seconds, 60)
|
||||
h, m = divmod(m, 60)
|
||||
return f"{int(h):02}:{int(m):02}:{s:06.3f}"
|
||||
|
||||
try:
|
||||
index = 1
|
||||
if os.path.exists(path):
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
if "-->" in line:
|
||||
index += 1
|
||||
if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
|
||||
return index # Already written
|
||||
if not os.path.exists(path):
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
f.write("WEBVTT\n\n")
|
||||
with open(path, "a", encoding="utf-8") as f:
|
||||
start = format_timestamp(float(sentence_obj["start"]))
|
||||
end = format_timestamp(float(sentence_obj["end"]))
|
||||
text = re.sub(r"[\r\n]+", " ", str(sentence_obj["text"])).strip()
|
||||
f.write(f"{start} --> {end}\n{text}\n\n")
|
||||
return index + 1
|
||||
except Exception as e:
|
||||
error = f"self._append_sentence2vtt() error: {e}"
|
||||
print(error)
|
||||
return False
|
||||
@@ -1,882 +0,0 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class Coqui:
|
||||
def __init__(self, session:DictProxy):
    """Set up per-session state, seed the RNGs, apply the CUDA policy and load both engines.

    Args:
        session: Shared session mapping; the keys read here include
            ``model_cache``, ``tts_engine``, ``fine_tuned``, ``process_dir``,
            ``final_name``, ``device`` and ``free_vram_gb``.

    Any exception raised during setup is printed, not propagated.
    """
    try:
        self.session = session
        self.cache_dir = tts_dir
        self.speakers_path = None
        self.tts_key = self.session['model_cache']
        self.engine = None
        self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
        self.engine_zs = None
        self.pth_voice_file = None
        self.sentences_total_time = 0.0
        self.sentence_idx = 1
        self.params={TTS_ENGINES['XTTSv2']:{"latent_embedding":{}},TTS_ENGINES['BARK']:{},TTS_ENGINES['VITS']:{"semitones":{}},TTS_ENGINES['FAIRSEQ']:{"semitones":{}},TTS_ENGINES['TACOTRON2']:{"semitones":{}},TTS_ENGINES['YOURTTS']:{}}
        self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
        self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
        self.resampler_cache = {}
        self.audio_segments = []
        if not xtts_builtin_speakers_list:
            self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename='speakers_xtts.pth', cache_dir=self.cache_dir)
            # Fill the shared mapping in place. Assigning to the name here would
            # make it a local of this method, so the emptiness test above would
            # raise UnboundLocalError and the broad handler below would abort setup.
            # NOTE(review): weights_only=False unpickles arbitrary objects; the file is trusted to come from the configured repo.
            xtts_builtin_speakers_list.update(torch.load(self.speakers_path, weights_only=False))
        using_gpu = self.session['device'] != devices['CPU']['proc']
        enough_vram = self.session['free_vram_gb'] > 4.0
        seed = 123456
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        cuda_like_found = devices['CUDA']['found'] or devices['ROCM']['found'] or devices['JETSON']['found']
        if using_gpu and enough_vram:
            if cuda_like_found:
                if devices['JETSON']['found']:
                    # Provide no-op stand-ins for the torch.distributed members that are missing.
                    if not hasattr(torch, "distributed"):
                        torch.distributed = types.SimpleNamespace()
                    if not hasattr(torch.distributed, "ReduceOp"):
                        class _ReduceOp:
                            SUM = None
                            MAX = None
                            MIN = None
                        torch.distributed.ReduceOp = _ReduceOp
                    if not hasattr(torch.distributed, "all_reduce"):
                        def _all_reduce(*args, **kwargs):
                            return
                        torch.distributed.all_reduce = _all_reduce
                torch.cuda.set_per_process_memory_fraction(0.95)
                torch.backends.cudnn.enabled = True
                torch.backends.cudnn.benchmark = True
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.allow_tf32 = True
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
                torch.cuda.manual_seed_all(seed)
        else:
            if cuda_like_found:
                torch.cuda.set_per_process_memory_fraction(0.7)
                torch.backends.cudnn.enabled = True
                torch.backends.cudnn.benchmark = False
                torch.backends.cudnn.deterministic = True
                torch.backends.cudnn.allow_tf32 = False
                torch.backends.cuda.matmul.allow_tf32 = False
                torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
                torch.cuda.manual_seed_all(seed)
        self._load_engine()
        self._load_engine_zs()
    except Exception as e:
        error = f'__init__() error: {e}'
        print(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
    """Return the engine cached under ``key``, creating it via ``TTS.api.TTS`` when absent.

    The engine is stored in ``loaded_tts`` only when the detected free VRAM
    exceeds the size reported for the models already loaded; it is returned
    either way. The detected free VRAM is written to ``session['free_vram_gb']``.

    Returns:
        The engine, or ``None`` after printing the error when anything raises.
    """
    global lock
    try:
        with lock:
            from TTS.api import TTS as TTSEngine
            engine = loaded_tts.get(key, False) or TTSEngine(model_path)
            if engine:
                vram_info = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_info.get('free_vram_gb', 0)
                loaded_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > loaded_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        print(f"_load_api() error: {e}")
    return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
    """Return the engine cached under ``kwargs['key']``, loading it from a checkpoint when absent.

    Keyword arguments read: ``key`` (cache key) and ``tts_engine``, which
    selects the loader. XTTSv2 needs ``checkpoint_path`` and ``config_path``
    (both must exist) plus the optional ``vocab_path``; BARK needs an existing
    ``checkpoint_dir``. For any other engine name nothing is loaded.

    The engine is stored in ``loaded_tts`` only when the detected free VRAM
    exceeds the size reported for the models already loaded.

    Returns:
        The engine (or the falsy cache miss when no loader matched), or
        ``None`` after printing the error when anything raises, including a
        missing path.
    """
    try:
        with lock:
            key = kwargs.get('key')
            engine = loaded_tts.get(key, False)
            if not engine:
                engine_name = kwargs.get('tts_engine', None)
                # Every raise below is caught by the handler at the bottom, so the caller sees None.
                if engine_name == TTS_ENGINES['XTTSv2']:
                    from TTS.tts.configs.xtts_config import XttsConfig
                    from TTS.tts.models.xtts import Xtts
                    checkpoint_path = kwargs.get('checkpoint_path')
                    config_path = kwargs.get('config_path',None)
                    vocab_path = kwargs.get('vocab_path',None)
                    if not checkpoint_path or not os.path.exists(checkpoint_path):
                        error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
                        raise FileNotFoundError(error)
                    if not config_path or not os.path.exists(config_path):
                        error = f'Missing or invalid config_path: {config_path}'
                        raise FileNotFoundError(error)
                    config = XttsConfig()
                    config.models_dir = os.path.join("models","tts")
                    config.load_json(config_path)
                    engine = Xtts.init_from_config(config)
                    engine.load_checkpoint(
                        config,
                        checkpoint_path = checkpoint_path,
                        vocab_path = vocab_path,
                        eval = True
                    )
                elif engine_name == TTS_ENGINES['BARK']:
                    from TTS.tts.configs.bark_config import BarkConfig
                    from TTS.tts.models.bark import Bark
                    checkpoint_dir = kwargs.get('checkpoint_dir')
                    if not checkpoint_dir or not os.path.exists(checkpoint_dir):
                        error = f'Missing or invalid checkpoint_dir: {checkpoint_dir}'
                        raise FileNotFoundError(error)
                    config = BarkConfig()
                    config.CACHE_DIR = self.cache_dir
                    # An unset variable raises KeyError here, which is reported as a load failure.
                    config.USE_SMALLER_MODELS = os.environ['SUNO_USE_SMALL_MODELS'] == 'True'
                    engine = Bark.init_from_config(config)
                    engine.load_checkpoint(
                        config,
                        checkpoint_dir = checkpoint_dir,
                        eval = True
                    )
            if engine:
                vram_dict = VRAMDetector().detect_vram(self.session['device'])
                self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
                models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
                if self.session['free_vram_gb'] > models_loaded_size_gb:
                    loaded_tts[key] = engine
            return engine
    except Exception as e:
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return None
|
||||
|
||||
def _load_engine(self)->None:
    """
    Populate ``self.engine`` for the TTS engine selected in the session.

    The shared ``loaded_tts`` cache is consulted first under ``self.tts_key``.
    On a miss the engine-specific branch resolves model files (custom model
    directory or Hugging Face download) and delegates to ``_load_checkpoint``
    (XTTSv2, BARK) or ``_load_api`` (VITS, FAIRSEQ, TACOTRON2, YOURTTS).
    Several branches rewrite ``self.tts_key`` to the key actually loaded.

    Nothing is returned and nothing is raised: failures are printed and leave
    ``self.engine`` falsy, which callers must check.
    """
    try:
        msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine = loaded_tts.get(self.tts_key, False)
        if not self.engine:
            tts_engine = self.session['tts_engine']
            custom_model = self.session['custom_model']
            if tts_engine == TTS_ENGINES['XTTSv2']:
                if custom_model is not None:
                    model_dir = os.path.join(self.session['custom_model_dir'], tts_engine, custom_model)
                    files = default_engine_settings[TTS_ENGINES['XTTSv2']]['files']
                    config_path = os.path.join(model_dir, files[0])
                    checkpoint_path = os.path.join(model_dir, files[1])
                    vocab_path = os.path.join(model_dir, files[2])
                    self.tts_key = f"{tts_engine}-{custom_model}"
                else:
                    preset = models[tts_engine][self.session['fine_tuned']]
                    hf_repo = preset['repo']
                    if self.session['fine_tuned'] == 'internal':
                        hf_sub = ''
                        if self.speakers_path is None:
                            self.speakers_path = hf_hub_download(repo_id=hf_repo, filename='speakers_xtts.pth', cache_dir=self.cache_dir)
                    else:
                        hf_sub = preset['sub']
                    config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][0]}", cache_dir=self.cache_dir)
                    checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][1]}", cache_dir=self.cache_dir)
                    vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][2]}", cache_dir=self.cache_dir)
                self.engine = self._load_checkpoint(tts_engine=tts_engine, key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
            elif tts_engine == TTS_ENGINES['BARK']:
                if custom_model is not None:
                    msg = f"{tts_engine} custom model not implemented yet!"
                    print(msg)
                else:
                    preset = models[tts_engine][self.session['fine_tuned']]
                    hf_repo = preset['repo']
                    hf_sub = preset['sub']
                    # All three model files are downloaded; the checkpoint directory
                    # is taken from the location of the first one.
                    downloaded = [
                        hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{preset['files'][idx]}", cache_dir=self.cache_dir)
                        for idx in range(3)
                    ]
                    checkpoint_dir = os.path.dirname(downloaded[0])
                    self.engine = self._load_checkpoint(tts_engine=tts_engine, key=self.tts_key, checkpoint_dir=checkpoint_dir)
            elif tts_engine == TTS_ENGINES['VITS']:
                if custom_model is not None:
                    msg = f"{tts_engine} custom model not implemented yet!"
                    print(msg)
                else:
                    preset = models[tts_engine][self.session['fine_tuned']]
                    iso_dir = language_tts[tts_engine][self.session['language']]
                    sub = next((name for name, lang_list in preset['sub'].items() if iso_dir in lang_list), None)
                    if sub is not None:
                        self.params[tts_engine]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
                        model_path = preset['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                        self.tts_key = model_path
                        self.engine = self._load_api(self.tts_key, model_path)
                    else:
                        msg = f"{tts_engine} checkpoint for {self.session['language']} not found!"
                        print(msg)
            elif tts_engine == TTS_ENGINES['FAIRSEQ']:
                if custom_model is not None:
                    msg = f"{tts_engine} custom model not implemented yet!"
                    print(msg)
                else:
                    model_path = models[tts_engine][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
                    self.tts_key = model_path
                    self.engine = self._load_api(self.tts_key, model_path)
            elif tts_engine == TTS_ENGINES['TACOTRON2']:
                if custom_model is not None:
                    msg = f"{tts_engine} custom model not implemented yet!"
                    print(msg)
                else:
                    preset = models[tts_engine][self.session['fine_tuned']]
                    sub_dict = preset['sub']
                    iso_dir = language_tts[tts_engine][self.session['language']]
                    sub = next((name for name, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                    if sub is None:
                        # Fall back to the raw session language code before giving up.
                        iso_dir = self.session['language']
                        sub = next((name for name, lang_list in sub_dict.items() if iso_dir in lang_list), None)
                    if sub is not None:
                        # The samplerate lookup needs a resolved sub, so it runs after the fallback.
                        self.params[tts_engine]['samplerate'] = models[TTS_ENGINES['TACOTRON2']][self.session['fine_tuned']]['samplerate'][sub]
                        model_path = preset['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
                        self.tts_key = model_path
                        self.engine = self._load_api(self.tts_key, model_path)
                        if self.engine:
                            m = self.engine.synthesizer.tts_model
                            d = m.decoder
                            # Stability
                            d.prenet_dropout = 0.0
                            d.attention_dropout = 0.0
                            d.decoder_dropout = 0.0
                            m.attention.location_attention.dropout = 0.0
                            # Stop-gate tuning
                            d.gate_threshold = 0.5
                            d.force_gate = True
                            d.gate_delay = 10
                            # Long-sentence fix
                            d.max_decoder_steps = 1000
                            # Prevent attention drift
                            d.attention_keeplast = True
                    else:
                        msg = f"{tts_engine} checkpoint for {self.session['language']} not found!"
                        print(msg)
            elif tts_engine == TTS_ENGINES['YOURTTS']:
                if custom_model is not None:
                    msg = f"{tts_engine} custom model not implemented yet!"
                    print(msg)
                else:
                    model_path = models[tts_engine][self.session['fine_tuned']]['repo']
                    self.engine = self._load_api(self.tts_key, model_path)
        if self.engine:
            msg = f'TTS {self.tts_key} Loaded!'
            print(msg)
    except Exception as e:
        # Report the failure instead of dropping it; self.engine stays falsy.
        error = f'_load_engine() error: {e}'
        print(error)
|
||||
|
||||
def _load_engine_zs(self)->Any:
    """
    Populate ``self.engine_zs`` with the zero-shot voice-conversion engine.

    The shared ``loaded_tts`` cache is consulted under ``self.tts_zs_key``;
    on a miss the engine is loaded through ``_load_api`` with
    ``default_vc_model``. On success the key is recorded in
    ``session['model_zs_cache']``.

    Returns None in every case; failures are printed and leave
    ``self.engine_zs`` falsy.
    """
    try:
        msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
        print(msg)
        cleanup_memory()
        self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
        if not self.engine_zs:
            self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
        if self.engine_zs:
            self.session['model_zs_cache'] = self.tts_zs_key
            msg = f'ZeroShot {self.tts_zs_key} Loaded!'
            print(msg)
    except Exception as e:
        # Report the failure instead of dropping it.
        error = f'_load_engine_zs() error: {e}'
        print(error)
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
    """
    Return a voice sample path suited to the session language.

    ``voice_path`` is returned unchanged when the session language is 'eng',
    already appears among the path components, is not an XTTSv2 language, or
    when ``speaker`` is a BARK builtin voice. Otherwise the internal XTTSv2
    model speaks the language's ``default.txt`` with that voice, and the
    normalised result is written next to the original with the ``eng`` path
    component replaced by the session language.

    Args:
        voice_path: path of the reference voice wav.
        speaker: voice name (file name without ``.wav``).

    Returns:
        The path to use, or ``False`` when the converted voice could not be
        produced (the reason is printed).
    """
    # Bound up front so the except handler can always inspect them.
    new_voice_path = None
    proc_voice_path = None
    try:
        language = self.session['language']
        voice_parts = Path(voice_path).parts
        if (language in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or language == 'eng'):
            return voice_path
        if language not in language_tts[TTS_ENGINES['XTTSv2']].keys():
            return voice_path
        default_text_file = os.path.join(voices_dir, language, 'default.txt')
        if not os.path.exists(default_text_file):
            error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
            print(error)
            return False
        msg = f"Converting builtin eng voice to {language}..."
        print(msg)
        key = f"{TTS_ENGINES['XTTSv2']}-internal"
        default_text = Path(default_text_file).read_text(encoding="utf-8")
        cleanup_memory()
        engine = loaded_tts.get(key, False)
        if not engine:
            vram_dict = VRAMDetector().detect_vram(self.session['device'])
            self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
            models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
            if self.session['free_vram_gb'] <= models_loaded_size_gb:
                # Drop the current engine from the cache before loading another one.
                # It may never have been cached, so a missing key is not an error.
                loaded_tts.pop(self.tts_key, None)
            internal = models[TTS_ENGINES['XTTSv2']]['internal']
            hf_repo = internal['repo']
            config_path = hf_hub_download(repo_id=hf_repo, filename=f"{internal['files'][0]}", cache_dir=self.cache_dir)
            checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{internal['files'][1]}", cache_dir=self.cache_dir)
            vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{internal['files'][2]}", cache_dir=self.cache_dir)
            engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
        if not engine:
            error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
            print(error)
            return False
        xtts_voices = default_engine_settings[TTS_ENGINES['XTTSv2']]['voices']
        if speaker in xtts_voices.keys():
            gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[xtts_voices[speaker]].values()
        else:
            gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
        # Forward only the xtts_* options present in the session, without their prefix.
        fine_tuned_params = {
            option.removeprefix("xtts_"): cast_type(self.session[option])
            for option, cast_type in {
                "xtts_temperature": float,
                "xtts_length_penalty": float,
                "xtts_num_beams": int,
                "xtts_repetition_penalty": float,
                "xtts_top_k": int,
                "xtts_top_p": float,
                "xtts_speed": float,
                "xtts_enable_text_splitting": bool
            }.items()
            if self.session.get(option) is not None
        }
        with torch.no_grad():
            result = engine.inference(
                text=default_text.strip(),
                language=self.session['language_iso1'],
                gpt_cond_latent=gpt_cond_latent,
                speaker_embedding=speaker_embedding,
                **fine_tuned_params,
            )
        audio_sentence = result.get('wav') if isinstance(result, dict) else None
        if audio_sentence is None:
            error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
            print(error)
            return False
        audio_sentence = audio_sentence.tolist()
        sourceTensor = self._tensor_type(audio_sentence)
        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
        # CON is a reserved name on windows
        lang_dir = 'con-' if language == 'con' else language
        new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
        proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
        torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
        normalized = normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process'])
        # The intermediate file is no longer needed whether normalisation worked or not.
        Path(proc_voice_path).unlink(missing_ok=True)
        if not normalized:
            error = 'normalize_audio() error:'
            print(error)
            return False
        del audio_sentence, sourceTensor, audio_tensor
        gc.collect()
        # Restore the session's own engine, reloading it if it is no longer cached.
        self.engine = loaded_tts.get(self.tts_key, False)
        if not self.engine:
            self._load_engine()
        return new_voice_path
    except Exception as e:
        error = f'_check_xtts_builtin_speakers() error: {e}'
        if new_voice_path:
            Path(new_voice_path).unlink(missing_ok=True)
        if proc_voice_path:
            Path(proc_voice_path).unlink(missing_ok=True)
        print(error)
        return False
|
||||
|
||||
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str)->bool:
    """
    Make sure a Bark voice file exists for ``speaker`` under ``bark_dir``.

    Returns True straight away when the session language is not a BARK
    language or when ``<bark_dir>/<speaker>/<speaker>.pth`` already exists.
    Otherwise the language's ``default.txt`` is synthesized once with
    ``voice_path`` as the speaker wav and the speaker directory as
    ``voice_dir`` (presumably this is what writes the .pth file; the write
    itself is not visible here).

    Returns False, after printing the error, when anything raises.
    """
    try:
        if self.session['language'] not in language_tts[TTS_ENGINES['BARK']].keys():
            return True
        pth_voice_dir = os.path.join(bark_dir, speaker)
        pth_voice_file = os.path.join(pth_voice_dir, f'{speaker}.pth')
        if os.path.exists(pth_voice_file):
            return True
        os.makedirs(pth_voice_dir, exist_ok=True)
        text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
        sample_text = Path(text_file).read_text(encoding="utf-8")
        # Forward only the bark_* options present in the session, without their prefix.
        option_casts = {
            "bark_text_temp": float,
            "bark_waveform_temp": float
        }
        synth_options = {}
        for option, cast_type in option_casts.items():
            if self.session.get(option) is not None:
                synth_options[option.removeprefix("bark_")] = cast_type(self.session[option])
        with torch.no_grad():
            # The returned audio is not needed, only the call's side effects.
            self.engine.synthesize(
                sample_text,
                speaker_wav=voice_path,
                speaker=speaker,
                voice_dir=pth_voice_dir,
                **synth_options
            )
        print(f"Saved file: {pth_voice_file}")
        return True
    except Exception as e:
        print(f'_check_bark_npz() error: {e}')
        return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self, orig_sr:int, target_sr:int)->torchaudio.transforms.Resample:
    """
    Return the resampler for ``orig_sr`` -> ``target_sr``.

    Resamplers are kept in ``self.resampler_cache`` keyed by the
    ``(orig_sr, target_sr)`` pair, so each pair is built only once.
    """
    rate_pair = (orig_sr, target_sr)
    if rate_pair in self.resampler_cache:
        return self.resampler_cache[rate_pair]
    resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
    self.resampler_cache[rate_pair] = resampler
    return self.resampler_cache[rate_pair]
|
||||
|
||||
def _resample_wav(self, wav_path:str, expected_sr:int)->str:
    """
    Return the path of a mono wav at ``expected_sr`` for ``wav_path``.

    When the file is already single-channel at ``expected_sr`` the original
    path is returned and nothing is written. Otherwise channels are averaged,
    the signal is resampled if needed, and the result is written as PCM_16
    to a new temporary file under ``<session process_dir>/tmp``.

    The caller owns the returned temporary file and must delete it; compare
    the result with ``wav_path`` to tell whether a new file was created.
    """
    waveform, orig_sr = torchaudio.load(wav_path)
    if orig_sr == expected_sr and waveform.size(0) == 1:
        return wav_path
    if waveform.size(0) > 1:
        # Downmix by averaging the channels.
        waveform = waveform.mean(dim=0, keepdim=True)
    if orig_sr != expected_sr:
        resampler = self._get_resampler(orig_sr, expected_sr)
        waveform = resampler(waveform)
    wav_numpy = waveform.squeeze(0).cpu().numpy()
    # Keep temporary audio inside the session's own working directory.
    tmp_dir = os.path.join(self.session['process_dir'], 'tmp')
    os.makedirs(tmp_dir, exist_ok=True)
    # delete=False: only the unique name is reserved here; the handle is closed
    # before soundfile writes to the path.
    with tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False) as tmp_fh:
        tmp_path = tmp_fh.name
    sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
    return tmp_path
|
||||
|
||||
def _clone_with_voice_conversion(self, text:str, settings:dict, speaker_argument:dict)->Any:
    """
    Render ``text`` with the builtin voice, then convert it to the cloned voice.

    Shared by the engines that clone through the zero-shot voice-conversion
    engine (VITS, FAIRSEQ, TACOTRON2). The builtin voice is written to a
    temporary wav, pitch-shifted with sox when the detected genders of the
    builtin and cloned voices differ, and passed to
    ``self.engine_zs.voice_conversion`` together with ``settings['voice_path']``.

    Side effects: caches the pitch shift in ``settings['semitones']`` and sets
    ``settings['samplerate']`` to the voice-conversion model's samplerate.

    Returns:
        The converted audio, or ``False`` when sox failed or the
        voice-conversion engine is not loaded (the reason is printed).
    """
    voice_path = settings['voice_path']
    proc_dir = os.path.join(self.session['voice_dir'], 'proc')
    os.makedirs(proc_dir, exist_ok=True)
    tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
    source_wav = None
    target_wav = None
    try:
        with torch.no_grad():
            self.engine.tts_to_file(
                text=text,
                file_path=tmp_in_wav,
                **speaker_argument
            )
        if voice_path in settings['semitones']:
            semitones = settings['semitones'][voice_path]
        else:
            voice_path_gender = detect_gender(voice_path)
            voice_builtin_gender = detect_gender(tmp_in_wav)
            msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
            print(msg)
            if voice_builtin_gender != voice_path_gender:
                semitones = -4 if voice_path_gender == 'male' else 4
                msg = f"Adapting builtin voice frequencies from the clone voice..."
                print(msg)
            else:
                semitones = 0
            settings['semitones'][voice_path] = semitones
        # Shift in both directions: a negative value lowers the builtin voice.
        if semitones != 0:
            try:
                sox_bin = shutil.which('sox')
                if sox_bin is None:
                    raise FileNotFoundError('sox executable not found in PATH')
                cmd = [
                    sox_bin, tmp_in_wav,
                    "-r", str(settings['samplerate']), tmp_out_wav,
                    "pitch", str(semitones * 100)
                ]
                # check=True makes a failing sox raise instead of leaving no output file;
                # stderr is captured so the error message is meaningful.
                subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True, check=True)
            except subprocess.CalledProcessError as e:
                error = f"Subprocess error: {e.stderr}"
                print(error)
                DependencyError(e)
                return False
            except FileNotFoundError as e:
                error = f"File not found: {e}"
                print(error)
                DependencyError(e)
                return False
        else:
            tmp_out_wav = tmp_in_wav
        if not self.engine_zs:
            error = f'Engine {self.tts_zs_key} is None'
            print(error)
            return False
        settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
        source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
        target_wav = self._resample_wav(voice_path, settings['samplerate'])
        return self.engine_zs.voice_conversion(
            source_wav=source_wav,
            target_wav=target_wav
        )
    finally:
        # Remove every temporary file on all exit paths, never the user's voice file.
        leftovers = {tmp_in_wav, tmp_out_wav}
        if source_wav:
            leftovers.add(source_wav)
        if target_wav and target_wav != voice_path:
            leftovers.add(target_wav)
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.remove(leftover)

def convert(self, sentence_index:int, sentence:str)->bool:
    """
    Synthesize one sentence and write it to the session's sentence directory.

    Break and pause markers (and blank sentences) only queue a random-length
    silence in ``self.audio_segments`` and return True; the silence is written
    together with the next spoken sentence. A spoken sentence is rendered by
    the session's engine, trimmed, concatenated with any queued segments,
    registered in the VTT file and saved as
    ``<chapters_dir_sentences>/<sentence_index>.<default_audio_proc_format>``.

    Args:
        sentence_index: index used for the output file name.
        sentence: text to speak, or one of the ``TTS_SML`` markers.

    Returns:
        True on success, False when the voice, the engine or the audio could
        not be produced (the reason is printed).

    Raises:
        ValueError: wrapping any unexpected exception raised while converting.
    """
    try:
        speaker = None
        audio_sentence = False
        tts_engine = self.session['tts_engine']
        settings = self.params[tts_engine]
        settings['voice_path'] = (
            self.session['voice'] if self.session['voice'] is not None
            else models[tts_engine][self.session['fine_tuned']]['voice']
        )
        if settings['voice_path'] is not None:
            speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
            if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['custom_model_dir'] not in settings['voice_path']:
                self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker)
                if not settings['voice_path']:
                    msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
                    print(msg)
                    return False
        if not self.engine:
            error = f"TTS engine {tts_engine} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
            print(error)
            return False
        device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
        self.engine.to(device)
        final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
        if sentence == TTS_SML['break']:
            silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
            break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.3 to 0.6 seconds
            self.audio_segments.append(break_tensor.clone())
            return True
        if not sentence.replace('—', '').strip() or sentence == TTS_SML['pause']:
            silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
            pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 1.0 to 1.8 seconds
            self.audio_segments.append(pause_tensor.clone())
            return True
        if sentence.endswith("'"):
            sentence = sentence[:-1]
        if tts_engine == TTS_ENGINES['XTTSv2']:
            trim_audio_buffer = 0.008
            sentence = sentence.replace('.', ' ;\n')
            sentence += ' ...' if sentence[-1].isalnum() else ''
            if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
                settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
            else:
                msg = 'Computing speaker latents...'
                print(msg)
                xtts_voices = default_engine_settings[TTS_ENGINES['XTTSv2']]['voices']
                if speaker in xtts_voices.keys():
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[xtts_voices[speaker]].values()
                else:
                    settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
                settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
            # Forward only the xtts_* options present in the session, without their prefix.
            fine_tuned_params = {
                option.removeprefix("xtts_"): cast_type(self.session[option])
                for option, cast_type in {
                    "xtts_temperature": float,
                    "xtts_length_penalty": float,
                    "xtts_num_beams": int,
                    "xtts_repetition_penalty": float,
                    "xtts_top_k": int,
                    "xtts_top_p": float,
                    "xtts_speed": float,
                    "xtts_enable_text_splitting": bool
                }.items()
                if self.session.get(option) is not None
            }
            with torch.no_grad():
                result = self.engine.inference(
                    text=sentence,
                    language=self.session['language_iso1'],
                    gpt_cond_latent=settings['gpt_cond_latent'],
                    speaker_embedding=settings['speaker_embedding'],
                    **fine_tuned_params
                )
            audio_sentence = result.get('wav')
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif tts_engine == TTS_ENGINES['BARK']:
            trim_audio_buffer = 0.002
            sentence += '…' if sentence[-1].isalnum() else ''
            # Bark text cues:
            #   [laughter] [laughs] [sighs] [music] [gasps] [clears throat]
            #   — or ... for hesitations
            #   ♪ for song lyrics
            #   CAPITALIZATION for emphasis of a word
            #   [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively
            if speaker in default_engine_settings[tts_engine]['voices'].keys():
                bark_dir = default_engine_settings[tts_engine]['speakers_path']
            else:
                bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
                if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker):
                    error = 'Could not create pth voice file!'
                    print(error)
                    return False
            pth_voice_dir = os.path.join(bark_dir, speaker)
            fine_tuned_params = {
                option.removeprefix("bark_"): cast_type(self.session[option])
                for option, cast_type in {
                    "bark_text_temp": float,
                    "bark_waveform_temp": float
                }.items()
                if self.session.get(option) is not None
            }
            with torch.no_grad():
                result = self.engine.synthesize(
                    sentence,
                    speaker=speaker,
                    voice_dir=pth_voice_dir,
                    **fine_tuned_params
                )
            audio_sentence = result.get('wav')
            if is_audio_data_valid(audio_sentence):
                audio_sentence = audio_sentence.tolist()
        elif tts_engine == TTS_ENGINES['VITS']:
            trim_audio_buffer = 0.004
            sentence += '—' if sentence[-1].isalnum() else ''
            speaker_argument = {}
            vits_subs = models[tts_engine]['internal']['sub']
            if self.session['language'] == 'eng' and 'vctk/vits' in vits_subs:
                if self.session['language'] in vits_subs['vctk/vits'] or self.session['language_iso1'] in vits_subs['vctk/vits']:
                    speaker_argument = {"speaker": 'p262'}
            elif self.session['language'] == 'cat' and 'custom/vits' in vits_subs:
                if self.session['language'] in vits_subs['custom/vits'] or self.session['language_iso1'] in vits_subs['custom/vits']:
                    speaker_argument = {"speaker": '09901'}
            if settings['voice_path'] is not None:
                audio_sentence = self._clone_with_voice_conversion(sentence, settings, speaker_argument)
                if audio_sentence is False:
                    return False
            else:
                with torch.no_grad():
                    audio_sentence = self.engine.tts(
                        text=sentence,
                        **speaker_argument
                    )
        elif tts_engine == TTS_ENGINES['FAIRSEQ']:
            trim_audio_buffer = 0.004
            sentence += '—' if sentence[-1].isalnum() else ''
            speaker_argument = {}
            not_supported_punc_pattern = re.compile(r"[.:—]")
            spoken_text = re.sub(not_supported_punc_pattern, ' ', sentence)
            if settings['voice_path'] is not None:
                audio_sentence = self._clone_with_voice_conversion(spoken_text, settings, speaker_argument)
                if audio_sentence is False:
                    return False
            else:
                with torch.no_grad():
                    audio_sentence = self.engine.tts(
                        text=spoken_text,
                        **speaker_argument
                    )
        elif tts_engine == TTS_ENGINES['TACOTRON2']:
            trim_audio_buffer = 0.004
            sentence += '...' if sentence[-1].isalnum() else ''
            speaker_argument = {}
            if self.session['language'] in ['zho', 'jpn', 'kor', 'tha', 'lao', 'mya', 'khm']:
                not_supported_punc_pattern = re.compile(r'\p{P}+')
            else:
                not_supported_punc_pattern = re.compile(r'["—…¡¿]')
            spoken_text = re.sub(not_supported_punc_pattern, ' ', sentence)
            if settings['voice_path'] is not None:
                audio_sentence = self._clone_with_voice_conversion(spoken_text, settings, speaker_argument)
                if audio_sentence is False:
                    return False
            else:
                with torch.no_grad():
                    audio_sentence = self.engine.tts(
                        text=spoken_text,
                        **speaker_argument
                    )
        elif tts_engine == TTS_ENGINES['YOURTTS']:
            trim_audio_buffer = 0.002
            sentence += '...' if sentence[-1].isalnum() else ''
            not_supported_punc_pattern = re.compile(r'[—]')
            # Language codes passed to the engine; any other language falls back to English.
            language = {'en': 'en', 'fr': 'fr-fr', 'pt': 'pt-br'}.get(self.session['language_iso1'], 'en')
            if settings['voice_path'] is not None:
                speaker_argument = {"speaker_wav": settings['voice_path']}
            else:
                voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
                speaker_argument = {"speaker": voice_key}
            with torch.no_grad():
                audio_sentence = self.engine.tts(
                    text=re.sub(not_supported_punc_pattern, ' ', sentence),
                    language=language,
                    **speaker_argument
                )
        if not is_audio_data_valid(audio_sentence):
            error = f"audio_sentence not valide"
            print(error)
            return False
        sourceTensor = self._tensor_type(audio_sentence)
        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
        if sentence[-1].isalnum() or sentence[-1] == '—':
            audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
        if audio_tensor is None or audio_tensor.numel() == 0:
            # Nothing left to write for this sentence: report the failure explicitly.
            error = f"Cannot create {final_sentence_file}"
            print(error)
            return False
        self.audio_segments.append(audio_tensor)
        if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
            # Sentence ends on punctuation: add a short random silence after it.
            silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
            break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
            self.audio_segments.append(break_tensor.clone())
        if self.audio_segments:
            audio_tensor = torch.cat(self.audio_segments, dim=-1)
            start_time = self.sentences_total_time
            duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
            end_time = start_time + duration
            self.sentences_total_time = end_time
            sentence_obj = {
                "start": start_time,
                "end": end_time,
                "text": sentence,
                "resume_check": self.sentence_idx
            }
            self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
            if self.sentence_idx:
                torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                del audio_tensor
                cleanup_memory()
        self.audio_segments = []
        if os.path.exists(final_sentence_file):
            return True
        error = f"Cannot create {final_sentence_file}"
        print(error)
        return False
    except Exception as e:
        error = f'Coqui.convert(): {e}'
        raise ValueError(error) from e
|
||||
@@ -1,286 +1,63 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class Fairseq(TTSRegistry, name='fairseq'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
|
||||
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
|
||||
self.tts_key = model_path
|
||||
self.engine = self._load_api(self.tts_key, model_path)
|
||||
if self.engine:
|
||||
engine = self._load_api(self.tts_key, model_path)
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
else:
|
||||
error = '_load_engine() failed!'
|
||||
raise ValueError(error)
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
raise ValueError(error)
|
||||
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
@@ -288,7 +65,7 @@ class Fairseq(TTSRegistry, name='fairseq'):
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -411,11 +188,11 @@ class Fairseq(TTSRegistry, name='fairseq'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -428,7 +205,7 @@ class Fairseq(TTSRegistry, name='fairseq'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
14
lib/classes/tts_engines/presets/bark_presets.py
Normal file
14
lib/classes/tts_engines/presets/bark_presets.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
from lib.conf import voices_dir
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/multilingual/multi-dataset/bark", # load_checkpoint => erogol/bark, suno/bark, rsxdalv/suno. load_api => tts_models/multilingual/multi-dataset/bark
|
||||
"sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
|
||||
}
|
||||
}
|
||||
12
lib/classes/tts_engines/presets/fairseq_presets.py
Normal file
12
lib/classes/tts_engines/presets/fairseq_presets.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang]/fairseq/vits",
|
||||
"sub": "",
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
|
||||
}
|
||||
}
|
||||
24
lib/classes/tts_engines/presets/tacotron_presets.py
Normal file
24
lib/classes/tts_engines/presets/tacotron_presets.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang_iso1]/[xxx]",
|
||||
"sub": {
|
||||
"mai/tacotron2-DDC": ['fr', 'es', 'nl'],
|
||||
"thorsten/tacotron2-DDC": ['de'],
|
||||
"kokoro/tacotron2-DDC": ['ja'],
|
||||
"ljspeech/tacotron2-DDC": ['en'],
|
||||
"baker/tacotron2-DDC-GST": ['zh-CN']
|
||||
},
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
|
||||
"samplerate": {
|
||||
"mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
|
||||
},
|
||||
}
|
||||
}
|
||||
34
lib/classes/tts_engines/presets/vits_presets.py
Normal file
34
lib/classes/tts_engines/presets/vits_presets.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang_iso1]/[xxx]",
|
||||
"sub": {
|
||||
"css10/vits": ['es','hu','fi','fr','nl','ru','el'],
|
||||
"custom/vits": ['ca'],
|
||||
"custom/vits-female": ['bn', 'fa'],
|
||||
"cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
|
||||
"mai/vits": ['uk'],
|
||||
"mai_female/vits": ['pl'],
|
||||
"mai_male/vits": ['it'],
|
||||
"openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
|
||||
"vctk/vits": ['en'],
|
||||
"thorsten/vits": ['de']
|
||||
},
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
|
||||
"samplerate": {
|
||||
"css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"mai_female/vits": 24000,
|
||||
"mai_male/vits": 16000,
|
||||
"openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
|
||||
}
|
||||
}
|
||||
}
|
||||
278
lib/classes/tts_engines/presets/xtts_presets.py
Normal file
278
lib/classes/tts_engines/presets/xtts_presets.py
Normal file
@@ -0,0 +1,278 @@
|
||||
import os
|
||||
from lib.conf import voices_dir
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "coqui/XTTS-v2",
|
||||
"sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"AiExplained": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/AiExplained/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"AsmrRacoon": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/AsmrRacoon/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Awkwafina": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/Awkwafina/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BobOdenkirk": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BobOdenkirk/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BobRoss": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BobRoss/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BrinaPalencia": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BrinaPalencia/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BryanCranston": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BryanCranston/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DavidAttenborough": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DavidAttenborough/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DeathPussInBoots": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DeathPussInBoots/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DermotCrowley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DermotCrowley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"EvaSeymour": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/EvaSeymour/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"GideonOfnirEldenRing": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/GideonOfnirEldenRing/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"GhostMW2": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/GhostMW2/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JohnButlerASMR": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JohnButlerASMR/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JohnButlerASMR.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JohnMulaney": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JohnMulaney/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JohnMulaney.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JillRedfield": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JillRedfield/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JuliaWhenlan": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JuliaWhenlan/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"LeeHorsley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/LeeHorsley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"MelinaEldenRing": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/MelinaEldenRing/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"MorganFreeman": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/MorganFreeman/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"NeilGaiman": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/NeilGaiman/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"PeterGriffinFamilyGuy": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/PeterGriffinFamilyGuy/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'PeterGriffinFamilyGuy.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RafeBeckley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RafeBeckley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RafeBeckley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RainyDayHeadSpace": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RainyDayHeadSpace/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RayPorter": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RayPorter/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RelaxForAWhile": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RelaxForAWhile/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RosamundPike": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RosamundPike/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"ScarlettJohansson": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/ScarlettJohansson/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"SladeTeenTitans": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/SladeTeenTitans/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"StanleyParable": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/StanleyParable/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Top15s": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/Top15s/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'Top15s.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"WhisperSalemASMR": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/WhisperSalemASMR/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Konishev": {
|
||||
"lang": "rus",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/rus/Konishev/",
|
||||
"voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
}
|
||||
}
|
||||
12
lib/classes/tts_engines/presets/yourtts_presets.py
Normal file
12
lib/classes/tts_engines/presets/yourtts_presets.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from lib.conf_models import TTS_ENGINES, default_engine_settings
|
||||
|
||||
models = {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/multilingual/multi-dataset/your_tts",
|
||||
"sub": "",
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
|
||||
}
|
||||
}
|
||||
@@ -1,151 +1,68 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class Tacotron2(TTSRegistry, name='tacotron'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
else:
|
||||
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
|
||||
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
iso_dir = default_engine_settings[self.session['tts_engine']]['languages'][self.session['language']]
|
||||
sub_dict = self.models[self.session['fine_tuned']]['sub']
|
||||
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate'][sub]
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate'][sub]
|
||||
if sub is None:
|
||||
iso_dir = self.session['language']
|
||||
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
|
||||
if sub is not None:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
self.tts_key = model_path
|
||||
self.engine = self._load_api(self.tts_key, model_path)
|
||||
m = self.engine.synthesizer.tts_model
|
||||
engine = self._load_api(self.tts_key, model_path)
|
||||
m = engine.synthesizer.tts_model
|
||||
d = m.decoder
|
||||
# Stability
|
||||
d.prenet_dropout = 0.0
|
||||
d.attention_dropout = 0.0
|
||||
d.decoder_dropout = 0.0
|
||||
m.attention.location_attention.dropout = 0.0
|
||||
# Stop-gate tuning
|
||||
d.gate_threshold = 0.5
|
||||
d.force_gate = True
|
||||
@@ -157,156 +74,15 @@ class Tacotron2(TTSRegistry, name='tacotron'):
|
||||
else:
|
||||
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
|
||||
print(msg)
|
||||
if self.engine:
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
else:
|
||||
error = '_load_engine() failed!'
|
||||
raise ValueError(error)
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
raise ValueError(error)
|
||||
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
@@ -314,7 +90,7 @@ class Tacotron2(TTSRegistry, name='tacotron'):
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -440,11 +216,11 @@ class Tacotron2(TTSRegistry, name='tacotron'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -457,7 +233,7 @@ class Tacotron2(TTSRegistry, name='tacotron'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,294 +1,71 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class Vits(TTSUtils, TTSRegistry, name='vits'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class Vits(TTSRegistry, name='vits'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {"semitones":{}}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
else:
|
||||
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
|
||||
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
iso_dir = default_engine_settings[self.session['tts_engine']]['languages'][self.session['language']]
|
||||
sub_dict = self.models[self.session['fine_tuned']]['sub']
|
||||
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
|
||||
if sub is not None:
|
||||
self.params['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate'][sub]
|
||||
model_path = self.models[self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
self.tts_key = model_path
|
||||
self.engine = self._load_api(self.tts_key, model_path)
|
||||
engine = self._load_api(self.tts_key, model_path)
|
||||
else:
|
||||
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
|
||||
print(msg)
|
||||
if self.engine:
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
else:
|
||||
error = '_load_engine() failed!'
|
||||
raise ValueError(error)
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
raise ValueError(error)
|
||||
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
@@ -296,7 +73,7 @@ class Vits(TTSRegistry, name='vits'):
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -326,11 +103,11 @@ class Vits(TTSRegistry, name='vits'):
|
||||
trim_audio_buffer = 0.004
|
||||
sentence += '—' if sentence[-1].isalnum() else ''
|
||||
speaker_argument = {}
|
||||
if self.session['language'] == 'eng' and 'vctk/vits' in models[self.session['tts_engine']]['internal']['sub']:
|
||||
if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits']:
|
||||
if self.session['language'] == 'eng' and 'vctk/vits' in self.models['internal']['sub']:
|
||||
if self.session['language'] in self.models['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in self.models['internal']['sub']['vctk/vits']:
|
||||
speaker_argument = {"speaker": 'p262'}
|
||||
elif self.session['language'] == 'cat' and 'custom/vits' in models[self.session['tts_engine']]['internal']['sub']:
|
||||
if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits']:
|
||||
elif self.session['language'] == 'cat' and 'custom/vits' in self.models['internal']['sub']:
|
||||
if self.session['language'] in self.models['internal']['sub']['custom/vits'] or self.session['language_iso1'] in self.models['internal']['sub']['custom/vits']:
|
||||
speaker_argument = {"speaker": '09901'}
|
||||
if self.params['voice_path'] is not None:
|
||||
proc_dir = os.path.join(self.session['voice_dir'], 'proc')
|
||||
@@ -424,11 +201,11 @@ class Vits(TTSRegistry, name='vits'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -441,7 +218,7 @@ class Vits(TTSRegistry, name='vits'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,295 +1,71 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class XTTSv2(TTSRegistry, name='xtts'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {"latent_embedding":{}}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {"latent_embedding":{}}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
|
||||
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
|
||||
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
|
||||
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
else:
|
||||
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
hf_repo = self.models[self.session['fine_tuned']]['repo']
|
||||
if self.session['fine_tuned'] == 'internal':
|
||||
hf_sub = ''
|
||||
if self.speakers_path is None:
|
||||
self.speakers_path = hf_hub_download(repo_id=hf_repo, filename='speakers_xtts.pth', cache_dir=self.cache_dir)
|
||||
else:
|
||||
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if self.engine:
|
||||
hf_sub = self.models[self.session['fine_tuned']]['sub']
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{self.models[self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
raise ValueError(error)
|
||||
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
@@ -297,7 +73,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -335,7 +111,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[self.params['voice_path']])
|
||||
self.params['gpt_cond_latent'], self.params['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[self.params['voice_path']], librosa_trim_db=30, load_sr=24000, sound_norm_refs=True)
|
||||
self.params['latent_embedding'][self.params['voice_path']] = self.params['gpt_cond_latent'], self.params['speaker_embedding']
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
@@ -389,11 +165,11 @@ class XTTSv2(TTSRegistry, name='xtts'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -406,7 +182,7 @@ class XTTSv2(TTSRegistry, name='xtts'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,285 +1,62 @@
|
||||
import threading, torch, torchaudio, random, gc, shutil, subprocess, tempfile, uuid, types
|
||||
from lib.classes.tts_engines.common.headers import *
|
||||
from lib.classes.tts_engines.common.preset_loader import load_engine_presets
|
||||
|
||||
import regex as re
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
|
||||
|
||||
from multiprocessing.managers import DictProxy
|
||||
from typing import Any
|
||||
from pathlib import Path
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
from lib.classes.vram_detector import VRAMDetector
|
||||
from lib.classes.tts_engines.common.utils import cleanup_memory, append_sentence2vtt, loaded_tts_size_gb, load_xtts_builtin_list, apply_cuda_policy #, ensure_safe_checkpoint
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
from lib import *
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class YourTTS(TTSRegistry, name='yourtts'):
|
||||
def __init__(self, session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = self.session['model_cache']
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {}
|
||||
self.params['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self.models = load_engine_presets(self.session['tts_engine'])
|
||||
self.params = {}
|
||||
self.params['samplerate'] = self.models[self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
seed = 123456
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
seed = 0
|
||||
#random.seed(seed)
|
||||
#np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
has_cuda = (torch.version.cuda is not None and torch.cuda.is_available())
|
||||
if has_cuda:
|
||||
apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = load_xtts_builtin_list()
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
self._apply_cuda_policy(using_gpu=using_gpu, enough_vram=enough_vram, seed=seed)
|
||||
self.xtts_speakers = self._load_xtts_builtin_list()
|
||||
self.engine = self._load_engine()
|
||||
self.engine_zs = self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
error = f'Missing or invalid checkpoint_path: {checkpoint_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
error = f'Missing or invalid config_path: {config_path}'
|
||||
raise FileNotFoundError(error)
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] > models_loaded_size_gb:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
def _load_engine(self)->Any:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._cleanup_memory()
|
||||
engine = loaded_tts.get(self.tts_key, False)
|
||||
if not engine:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
self.engine = self._load_api(self.tts_key, model_path)
|
||||
if self.engine:
|
||||
model_path = self.models[self.session['fine_tuned']]['repo']
|
||||
engine = self._load_api(self.tts_key, model_path)
|
||||
if engine and engine is not None:
|
||||
msg = f'TTS {self.tts_key} Loaded!'
|
||||
return engine
|
||||
else:
|
||||
error = '_load_engine() failed!'
|
||||
raise ValueError(error)
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_memory()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model)
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {self.tts_zs_key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if (self.session['language'] in voice_parts or speaker in default_engine_settings[TTS_ENGINES['BARK']]['voices'] or self.session['language'] == 'eng'):
|
||||
return voice_path
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_memory()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
vram_dict = VRAMDetector().detect_vram(self.session['device'])
|
||||
self.session['free_vram_gb'] = vram_dict.get('free_vram_gb', 0)
|
||||
models_loaded_size_gb = loaded_tts_size_gb(loaded_tts)
|
||||
if self.session['free_vram_gb'] <= models_loaded_size_gb:
|
||||
del loaded_tts[self.tts_key]
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = self.xtts_speakers[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
#"xtts_codec_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
#"xtts_cvvp_weight": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
#"xtts_gpt_cond_len": int,
|
||||
#"xtts_gpt_batch_size": int,
|
||||
"xtts_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
self._load_engine()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
if new_voice_path:
|
||||
Path(new_voice_path).unlink(missing_ok=True)
|
||||
if proc_voice_path:
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
resample_tmp = os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(resample_tmp, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=resample_tmp, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
raise ValueError(error)
|
||||
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
@@ -287,7 +64,7 @@ class YourTTS(TTSRegistry, name='yourtts'):
|
||||
audio_sentence = False
|
||||
self.params['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
else self.models[self.session['fine_tuned']]['voice']
|
||||
)
|
||||
if self.params['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(self.params['voice_path']))
|
||||
@@ -354,11 +131,11 @@ class YourTTS(TTSRegistry, name='yourtts'):
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
self.sentence_idx = self._append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, self.params['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_memory()
|
||||
self._cleanup_memory()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
@@ -371,7 +148,7 @@ class YourTTS(TTSRegistry, name='yourtts'):
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
error = f"TTS engine {self.session['tts_engine']} failed to load!"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
import lib.classes.tts_engines
|
||||
|
||||
from typing import Any
|
||||
from lib.classes.tts_registry import TTSRegistry
|
||||
|
||||
import lib.classes.tts_engines
|
||||
|
||||
class TTSManager:
|
||||
def __init__(self, session: Any) -> None:
|
||||
self.session = session
|
||||
|
||||
def __init__(self, session:Any)->None:
|
||||
self.session = session
|
||||
engine_name = session.get("tts_engine")
|
||||
if engine_name is None:
|
||||
raise ValueError("session['tts_engine'] is missing")
|
||||
|
||||
try:
|
||||
engine_cls = TTSRegistry.ENGINES[engine_name]
|
||||
except KeyError:
|
||||
@@ -18,8 +17,7 @@ class TTSManager:
|
||||
f"Invalid tts_engine '{engine_name}'. "
|
||||
f"Expected one of: {', '.join(TTSRegistry.ENGINES)}"
|
||||
)
|
||||
|
||||
self.engine = engine_cls(session)
|
||||
|
||||
def convert_sentence2audio(self, sentence_number:int, sentence:str)->bool:
|
||||
def convert_sentence2audio(self, sentence_number: int, sentence: str) -> bool:
|
||||
return self.engine.convert(sentence_number, sentence)
|
||||
@@ -12,10 +12,9 @@ from io import BytesIO
|
||||
from pydub import AudioSegment, silence
|
||||
from pydub.silence import detect_silence
|
||||
|
||||
from lib.conf import voice_formats, default_audio_proc_samplerate
|
||||
from lib.models import TTS_ENGINES, models
|
||||
from lib.classes.background_detector import BackgroundDetector
|
||||
from lib.classes.subprocess_pipe import SubprocessPipe
|
||||
from lib.conf import voice_formats, default_audio_proc_samplerate
|
||||
|
||||
class VoiceExtractor:
|
||||
def __init__(self, session:Any, voice_file:str, voice_name:str):
|
||||
|
||||
@@ -48,7 +48,8 @@ os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostran
|
||||
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
|
||||
os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
|
||||
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32,garbage_collection_threshold:0.6,expandable_segments:True'
|
||||
os.environ['TORCH_CUDA_ENABLE_CUDA_GRAPH'] = '0'
|
||||
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,garbage_collection_threshold:0.6,expandable_segments:True'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
||||
os.environ["CUDA_CACHE_MAXSIZE"] = "2147483648"
|
||||
@@ -93,7 +94,6 @@ default_pytorch_url = 'https://download.pytorch.org/whl'
|
||||
default_jetson_url = 'https://www.e-blokos.com/whl/jetson' # TODO: find a definitive place where to upload the jetpack5 torch
|
||||
|
||||
jetson_torch_version_base = {
|
||||
"jetson51": "2.4.1",
|
||||
"jetson60": "2.4.0",
|
||||
"jetson61": "2.5.0"
|
||||
}
|
||||
@@ -125,7 +125,6 @@ torch_matrix = {
|
||||
"xpu": {"url": default_pytorch_url},
|
||||
|
||||
# JETSON
|
||||
"jetson51": {"url": default_jetson_url},
|
||||
"jetson60": {"url": default_jetson_url},
|
||||
"jetson61": {"url": default_jetson_url}
|
||||
}
|
||||
@@ -134,7 +133,7 @@ cuda_version_range = {"min": (11,8), "max": (12,8)}
|
||||
rocm_version_range = {"min": (5,5), "max": (6,4)}
|
||||
mps_version_range = {"min": (0,0), "max": (0,0)}
|
||||
xpu_version_range = {"min": (0,0), "max": (0,0)}
|
||||
jetson_version_range = {"min": (5,1), "max": (6,2)}
|
||||
jetson_version_range = {"min": (6,0), "max": (6,2)}
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Python environment references
|
||||
|
||||
File diff suppressed because one or more lines are too long
162
lib/conf_models.py
Normal file
162
lib/conf_models.py
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
1112
lib/gradio.py
1112
lib/gradio.py
File diff suppressed because it is too large
Load Diff
517
lib/models.py
517
lib/models.py
@@ -1,517 +0,0 @@
|
||||
import os
|
||||
from lib.conf import tts_dir, voices_dir
|
||||
|
||||
loaded_tts = {}
|
||||
xtts_builtin_speakers_list = {}
|
||||
|
||||
TTS_ENGINES = {
|
||||
"XTTSv2": "xtts",
|
||||
"BARK": "bark",
|
||||
"VITS": "vits",
|
||||
"FAIRSEQ": "fairseq",
|
||||
"TACOTRON2": "tacotron",
|
||||
"YOURTTS": "yourtts"
|
||||
}
|
||||
|
||||
TTS_VOICE_CONVERSION = {
|
||||
"freevc24": {"path": "voice_conversion_models/multilingual/vctk/freevc24", "samplerate": 24000},
|
||||
"knnvc": {"path": "voice_conversion_models/multilingual/multi-dataset/knnvc", "samplerate": 16000},
|
||||
"openvoice_v1": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1", "samplerate": 22050},
|
||||
"openvoice_v2": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2", "samplerate": 22050}
|
||||
}
|
||||
|
||||
TTS_SML = {
|
||||
"break": "‡break‡",
|
||||
"pause": "‡pause‡",
|
||||
"###": "‡pause‡"
|
||||
}
|
||||
|
||||
default_tts_engine = TTS_ENGINES['XTTSv2']
|
||||
default_fine_tuned = 'internal'
|
||||
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
|
||||
default_voice_detection_model = 'drewThomasson/segmentation'
|
||||
|
||||
max_custom_model = 100
|
||||
max_custom_voices = 1000
|
||||
|
||||
default_engine_settings = {
|
||||
TTS_ENGINES['XTTSv2']: {
|
||||
"samplerate": 24000,
|
||||
"temperature": 0.75,
|
||||
#"codec_temperature": 0.3,
|
||||
"length_penalty": 1.0,
|
||||
"num_beams": 1,
|
||||
"repetition_penalty": 2.0,
|
||||
#"cvvp_weight": 0.3,
|
||||
"top_k": 50,
|
||||
"top_p": 0.85,
|
||||
"speed": 1.0,
|
||||
#"gpt_cond_len": 512,
|
||||
#"gpt_batch_size": 1,
|
||||
"enable_text_splitting": False,
|
||||
"files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav'],
|
||||
"voices": {
|
||||
"ClaribelDervla": "Claribel Dervla", "DaisyStudious": "Daisy Studious", "GracieWise": "Gracie Wise",
|
||||
"TammieEma": "Tammie Ema", "AlisonDietlinde": "Alison Dietlinde", "AnaFlorence": "Ana Florence",
|
||||
"AnnmarieNele": "Annmarie Nele", "AsyaAnara": "Asya Anara", "BrendaStern": "Brenda Stern",
|
||||
"GittaNikolina": "Gitta Nikolina", "HenrietteUsha": "Henriette Usha", "SofiaHellen": "Sofia Hellen",
|
||||
"TammyGrit": "Tammy Grit", "TanjaAdelina": "Tanja Adelina", "VjollcaJohnnie": "Vjollca Johnnie",
|
||||
"AndrewChipper": "Andrew Chipper", "BadrOdhiambo": "Badr Odhiambo", "DionisioSchuyler": "Dionisio Schuyler",
|
||||
"RoystonMin": "Royston Min", "ViktorEka": "Viktor Eka", "AbrahanMack": "Abrahan Mack",
|
||||
"AddeMichal": "Adde Michal", "BaldurSanjin": "Baldur Sanjin", "CraigGutsy": "Craig Gutsy",
|
||||
"DamienBlack": "Damien Black", "GilbertoMathias": "Gilberto Mathias", "IlkinUrbano": "Ilkin Urbano",
|
||||
"KazuhikoAtallah": "Kazuhiko Atallah", "LudvigMilivoj": "Ludvig Milivoj", "SuadQasim": "Suad Qasim",
|
||||
"TorcullDiarmuid": "Torcull Diarmuid", "ViktorMenelaos": "Viktor Menelaos", "ZacharieAimilios": "Zacharie Aimilios",
|
||||
"NovaHogarth": "Nova Hogarth", "MajaRuoho": "Maja Ruoho", "UtaObando": "Uta Obando",
|
||||
"LidiyaSzekeres": "Lidiya Szekeres", "ChandraMacFarland": "Chandra MacFarland", "SzofiGranger": "Szofi Granger",
|
||||
"CamillaHolmström": "Camilla Holmström", "LilyaStainthorpe": "Lilya Stainthorpe", "ZofijaKendrick": "Zofija Kendrick",
|
||||
"NarelleMoon": "Narelle Moon", "BarboraMacLean": "Barbora MacLean", "AlexandraHisakawa": "Alexandra Hisakawa",
|
||||
"AlmaMaría": "Alma María", "RosemaryOkafor": "Rosemary Okafor", "IgeBehringer": "Ige Behringer",
|
||||
"FilipTraverse": "Filip Traverse", "DamjanChapman": "Damjan Chapman", "WulfCarlevaro": "Wulf Carlevaro",
|
||||
"AaronDreschner": "Aaron Dreschner", "KumarDahl": "Kumar Dahl", "EugenioMataracı": "Eugenio Mataracı",
|
||||
"FerranSimen": "Ferran Simen", "XavierHayasaka": "Xavier Hayasaka", "LuisMoray": "Luis Moray",
|
||||
"MarcosRudaski": "Marcos Rudaski"
|
||||
},
|
||||
"rating": {"VRAM": 4, "CPU": 2, "RAM": 4, "Realism": 5}
|
||||
},
|
||||
TTS_ENGINES['BARK']: {
|
||||
"samplerate": 24000,
|
||||
"text_temp": 0.22,
|
||||
"waveform_temp": 0.44,
|
||||
"files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
|
||||
"speakers_path": os.path.join(voices_dir, '__bark'),
|
||||
"voices": {
|
||||
"de_speaker_0": "Speaker 0", "de_speaker_1": "Speaker 1", "de_speaker_2": "Speaker 2",
|
||||
"de_speaker_3": "Speaker 3", "de_speaker_4": "Speaker 4", "de_speaker_5": "Speaker 5",
|
||||
"de_speaker_6": "Speaker 6", "de_speaker_7": "Speaker 7", "de_speaker_8": "Speaker 8",
|
||||
"de_speaker_9": "Speaker 9", "en_speaker_0": "Speaker 0", "en_speaker_1": "Speaker 1",
|
||||
"en_speaker_2": "Speaker 2", "en_speaker_3": "Speaker 3", "en_speaker_4": "Speaker 4",
|
||||
"en_speaker_5": "Speaker 5", "en_speaker_6": "Speaker 6", "en_speaker_7": "Speaker 7",
|
||||
"en_speaker_8": "Speaker 8", "en_speaker_9": "Speaker 9", "es_speaker_0": "Speaker 0",
|
||||
"es_speaker_1": "Speaker 1", "es_speaker_2": "Speaker 2", "es_speaker_3": "Speaker 3",
|
||||
"es_speaker_4": "Speaker 4", "es_speaker_5": "Speaker 5", "es_speaker_6": "Speaker 6",
|
||||
"es_speaker_7": "Speaker 7", "es_speaker_8": "Speaker 8", "es_speaker_9": "Speaker 9",
|
||||
"fr_speaker_0": "Speaker 0", "fr_speaker_1": "Speaker 1", "fr_speaker_2": "Speaker 2",
|
||||
"fr_speaker_3": "Speaker 3", "fr_speaker_4": "Speaker 4", "fr_speaker_5": "Speaker 5",
|
||||
"fr_speaker_6": "Speaker 6", "fr_speaker_7": "Speaker 7", "fr_speaker_8": "Speaker 8",
|
||||
"fr_speaker_9": "Speaker 9", "hi_speaker_0": "Speaker 0", "hi_speaker_1": "Speaker 1",
|
||||
"hi_speaker_2": "Speaker 2", "hi_speaker_3": "Speaker 3", "hi_speaker_4": "Speaker 4",
|
||||
"hi_speaker_5": "Speaker 5", "hi_speaker_6": "Speaker 6", "hi_speaker_7": "Speaker 7",
|
||||
"hi_speaker_8": "Speaker 8", "hi_speaker_9": "Speaker 9", "it_speaker_0": "Speaker 0",
|
||||
"it_speaker_1": "Speaker 1", "it_speaker_2": "Speaker 2", "it_speaker_3": "Speaker 3",
|
||||
"it_speaker_4": "Speaker 4", "it_speaker_5": "Speaker 5", "it_speaker_6": "Speaker 6",
|
||||
"it_speaker_7": "Speaker 7", "it_speaker_8": "Speaker 8", "it_speaker_9": "Speaker 9",
|
||||
"ja_speaker_0": "Speaker 0", "ja_speaker_1": "Speaker 1", "ja_speaker_2": "Speaker 2",
|
||||
"ja_speaker_3": "Speaker 3", "ja_speaker_4": "Speaker 4", "ja_speaker_5": "Speaker 5",
|
||||
"ja_speaker_6": "Speaker 6", "ja_speaker_7": "Speaker 7", "ja_speaker_8": "Speaker 8",
|
||||
"ja_speaker_9": "Speaker 9", "ko_speaker_0": "Speaker 0", "ko_speaker_1": "Speaker 1",
|
||||
"ko_speaker_2": "Speaker 2", "ko_speaker_3": "Speaker 3", "ko_speaker_4": "Speaker 4",
|
||||
"ko_speaker_5": "Speaker 5", "ko_speaker_6": "Speaker 6", "ko_speaker_7": "Speaker 7",
|
||||
"ko_speaker_8": "Speaker 8", "ko_speaker_9": "Speaker 9", "pl_speaker_0": "Speaker 0",
|
||||
"pl_speaker_1": "Speaker 1", "pl_speaker_2": "Speaker 2", "pl_speaker_3": "Speaker 3",
|
||||
"pl_speaker_4": "Speaker 4", "pl_speaker_5": "Speaker 5", "pl_speaker_6": "Speaker 6",
|
||||
"pl_speaker_7": "Speaker 7", "pl_speaker_8": "Speaker 8", "pl_speaker_9": "Speaker 9",
|
||||
"pt_speaker_0": "Speaker 0", "pt_speaker_1": "Speaker 1", "pt_speaker_2": "Speaker 2",
|
||||
"pt_speaker_3": "Speaker 3", "pt_speaker_4": "Speaker 4", "pt_speaker_5": "Speaker 5",
|
||||
"pt_speaker_6": "Speaker 6", "pt_speaker_7": "Speaker 7", "pt_speaker_8": "Speaker 8",
|
||||
"pt_speaker_9": "Speaker 9", "ru_speaker_0": "Speaker 0", "ru_speaker_1": "Speaker 1",
|
||||
"ru_speaker_2": "Speaker 2", "ru_speaker_3": "Speaker 3", "ru_speaker_4": "Speaker 4",
|
||||
"ru_speaker_5": "Speaker 5", "ru_speaker_6": "Speaker 6", "ru_speaker_7": "Speaker 7",
|
||||
"ru_speaker_8": "Speaker 8", "ru_speaker_9": "Speaker 9", "tr_speaker_0": "Speaker 0",
|
||||
"tr_speaker_1": "Speaker 1", "tr_speaker_2": "Speaker 2", "tr_speaker_3": "Speaker 3",
|
||||
"tr_speaker_4": "Speaker 4", "tr_speaker_5": "Speaker 5", "tr_speaker_6": "Speaker 6",
|
||||
"tr_speaker_7": "Speaker 7", "tr_speaker_8": "Speaker 8", "tr_speaker_9": "Speaker 9",
|
||||
"zh_speaker_0": "Speaker 0", "zh_speaker_1": "Speaker 1", "zh_speaker_2": "Speaker 2",
|
||||
"zh_speaker_3": "Speaker 3", "zh_speaker_4": "Speaker 4", "zh_speaker_5": "Speaker 5",
|
||||
"zh_speaker_6": "Speaker 6", "zh_speaker_7": "Speaker 7", "zh_speaker_8": "Speaker 8",
|
||||
"zh_speaker_9": "Speaker 9"
|
||||
},
|
||||
"rating": {"VRAM": 6, "CPU": 1, "RAM": 6, "Realism": 5}
|
||||
},
|
||||
TTS_ENGINES['VITS']: {
|
||||
"samplerate": 22050,
|
||||
"files": ['config.json', 'model_file.pth', 'language_ids.json'],
|
||||
"voices": {},
|
||||
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
|
||||
},
|
||||
TTS_ENGINES['FAIRSEQ']: {
|
||||
"samplerate": 16000,
|
||||
"files": ['config.json', 'G_100000.pth', 'vocab.json'],
|
||||
"voices": {},
|
||||
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
|
||||
},
|
||||
TTS_ENGINES['TACOTRON2']: {
|
||||
"samplerate": 22050,
|
||||
"files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
|
||||
"voices": {},
|
||||
"rating": {"VRAM": 1, "CPU": 5, "RAM": 2, "Realism": 3}
|
||||
},
|
||||
TTS_ENGINES['YOURTTS']: {
|
||||
"samplerate": 16000,
|
||||
"files": ['config.json', 'model_file.pth'],
|
||||
"voices": {"Machinella-5": "female-en-5", "ElectroMale-2": "male-en-2", 'Machinella-4': 'female-pt-4\n', 'ElectroMale-3': 'male-pt-3\n'},
|
||||
"rating": {"VRAM": 1, "CPU": 5, "RAM": 1, "Realism": 2}
|
||||
}
|
||||
}
|
||||
models = {
|
||||
TTS_ENGINES['XTTSv2']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "coqui/XTTS-v2",
|
||||
"sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"AiExplained": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/AiExplained/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"AsmrRacoon": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/AsmrRacoon/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Awkwafina": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/Awkwafina/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BobOdenkirk": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BobOdenkirk/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BobRoss": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BobRoss/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BrinaPalencia": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BrinaPalencia/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"BryanCranston": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/BryanCranston/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DavidAttenborough": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DavidAttenborough/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DeathPussInBoots": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DeathPussInBoots/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"DermotCrowley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/DermotCrowley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"EvaSeymour": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/EvaSeymour/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"GideonOfnirEldenRing": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/GideonOfnirEldenRing/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"GhostMW2": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/GhostMW2/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JohnButlerASMR": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JohnButlerASMR/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JohnButlerASMR.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JohnMulaney": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JohnMulaney/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JohnMulaney.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JillRedfield": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JillRedfield/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"JuliaWhenlan": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/JuliaWhenlan/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"LeeHorsley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/LeeHorsley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"MelinaEldenRing": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/MelinaEldenRing/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"MorganFreeman": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/MorganFreeman/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"NeilGaiman": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/NeilGaiman/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"PeterGriffinFamilyGuy": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/PeterGriffinFamilyGuy/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'PeterGriffinFamilyGuy.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RafeBeckley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RafeBeckley/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RafeBeckley.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RainyDayHeadSpace": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RainyDayHeadSpace/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RayPorter": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RayPorter/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RelaxForAWhile": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RelaxForAWhile/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RosamundPike": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/RosamundPike/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"ScarlettJohansson": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/ScarlettJohansson/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"SladeTeenTitans": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/SladeTeenTitans/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"StanleyParable": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/StanleyParable/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Top15s": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/Top15s/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'Top15s.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"WhisperSalemASMR": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/WhisperSalemASMR/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"Konishev": {
|
||||
"lang": "rus",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/rus/Konishev/",
|
||||
"voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
}
|
||||
},
|
||||
TTS_ENGINES['BARK']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "erogol/bark", # erogol/bark, suno/bark, rsxdalv/suno, tts_models/multilingual/multi-dataset/bark
|
||||
"sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
|
||||
}
|
||||
},
|
||||
TTS_ENGINES['VITS']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang_iso1]/[xxx]",
|
||||
"sub": {
|
||||
"css10/vits": ['es','hu','fi','fr','nl','ru','el'],
|
||||
"custom/vits": ['ca'],
|
||||
"custom/vits-female": ['bn', 'fa'],
|
||||
"cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
|
||||
"mai/vits": ['uk'],
|
||||
"mai_female/vits": ['pl'],
|
||||
"mai_male/vits": ['it'],
|
||||
"openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
|
||||
"vctk/vits": ['en'],
|
||||
"thorsten/vits": ['de']
|
||||
},
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
|
||||
"samplerate": {
|
||||
"css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"mai_female/vits": 24000,
|
||||
"mai_male/vits": 16000,
|
||||
"openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
|
||||
"thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
|
||||
}
|
||||
}
|
||||
},
|
||||
TTS_ENGINES['FAIRSEQ']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang]/fairseq/vits",
|
||||
"sub": "",
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
|
||||
}
|
||||
},
|
||||
TTS_ENGINES['TACOTRON2']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/[lang_iso1]/[xxx]",
|
||||
"sub": {
|
||||
"mai/tacotron2-DDC": ['fr', 'es', 'nl'],
|
||||
"thorsten/tacotron2-DDC": ['de'],
|
||||
"kokoro/tacotron2-DDC": ['ja'],
|
||||
"ljspeech/tacotron2-DDC": ['en'],
|
||||
"baker/tacotron2-DDC-GST": ['zh-CN']
|
||||
},
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
|
||||
"samplerate": {
|
||||
"mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
|
||||
"baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
|
||||
},
|
||||
}
|
||||
},
|
||||
TTS_ENGINES['YOURTTS']: {
|
||||
"internal": {
|
||||
"lang": "multi",
|
||||
"repo": "tts_models/multilingual/multi-dataset/your_tts",
|
||||
"sub": "",
|
||||
"voice": None,
|
||||
"files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user