mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-10 14:28:15 -05:00
Implement F5-TTS support following template guidelines
Co-authored-by: DrewThomasson <126999465+DrewThomasson@users.noreply.github.com>
This commit is contained in:
13
.github/workflows/E2A-Test.yml
vendored
13
.github/workflows/E2A-Test.yml
vendored
@@ -150,8 +150,8 @@ jobs:
|
||||
- name: Create Audiobook Output folders for Artifacts
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p ~/ebook2audiobook/audiobooks/{F5TTS,TACOTRON2,FAIRSEQ,UnFAIRSEQ,VITS,YOURTTS,XTTSv2,XTTSv2FineTune,BARK}
|
||||
find ~/ebook2audiobook/audiobooks/{F5TTS,TACOTRON2,FAIRSEQ,UnFAIRSEQ,VITS,YOURTTS,XTTSv2,XTTSv2FineTune,BARK} -mindepth 1 -exec rm -rf {} +
|
||||
mkdir -p ~/ebook2audiobook/audiobooks/{TACOTRON2,FAIRSEQ,UnFAIRSEQ,VITS,YOURTTS,XTTSv2,XTTSv2FineTune,BARK}
|
||||
find ~/ebook2audiobook/audiobooks/{TACOTRON2,FAIRSEQ,UnFAIRSEQ,VITS,YOURTTS,XTTSv2,XTTSv2FineTune,BARK} -mindepth 1 -exec rm -rf {} +
|
||||
|
||||
- name: Add set -e at beginning of ebook2audiobook.sh (for error passing)
|
||||
shell: bash
|
||||
@@ -162,15 +162,6 @@ jobs:
|
||||
conda deactivate
|
||||
sed -i '' '1s;^;set -e\n;' ebook2audiobook.sh
|
||||
|
||||
- name: English F5-TTS Custom-Voice headless single test
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Running English F5-TTS Custom-Voice headless single test..."
|
||||
cd ~/ebook2audiobook
|
||||
source "$(conda info --base)/etc/profile.d/conda.sh"
|
||||
conda deactivate
|
||||
./ebook2audiobook.sh --headless --language eng --ebook "tools/workflow-testing/test1.txt" --tts_engine F5-TTS --voice "voices/eng/elder/male/DavidAttenborough.wav" --output_dir ~/ebook2audiobook/audiobooks/F5TTS
|
||||
|
||||
- name: English TACOTRON2 Custom-Voice headless single test
|
||||
shell: bash
|
||||
run: |
|
||||
|
||||
72
.gitignore
vendored
72
.gitignore
vendored
@@ -1,72 +0,0 @@
|
||||
# Python cache
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
python_env/
|
||||
|
||||
# IDEs
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS generated files
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
# Application specific
|
||||
models/
|
||||
audiobooks/
|
||||
ebooks/
|
||||
voices/
|
||||
tmp/
|
||||
*.log
|
||||
@@ -155,6 +155,14 @@ class Coqui:
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
if load_zeroshot:
|
||||
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
|
||||
if not tts_vc:
|
||||
@@ -174,14 +182,28 @@ class Coqui:
|
||||
if key in loaded_tts.keys():
|
||||
return loaded_tts[key]['engine']
|
||||
unload_tts(device, [self.tts_key, self.tts_vc_key])
|
||||
from TTS.api import TTS as coquiAPI
|
||||
with lock:
|
||||
tts = coquiAPI(model_path)
|
||||
# Handle F5-TTS separately
|
||||
if self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
|
||||
try:
|
||||
from f5_tts.api import F5TTS
|
||||
tts = F5TTS(model=model_path, device=device)
|
||||
except ImportError:
|
||||
error = 'F5-TTS not installed. Please install with: pip install f5-tts'
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
# Use Coqui TTS API for other engines
|
||||
from TTS.api import TTS as coquiAPI
|
||||
tts = coquiAPI(model_path)
|
||||
|
||||
if tts:
|
||||
if device == 'cuda':
|
||||
tts.cuda()
|
||||
else:
|
||||
tts.to(device)
|
||||
if self.session['tts_engine'] != TTS_ENGINES['F5-TTS']:
|
||||
# Apply device settings for Coqui TTS
|
||||
if device == 'cuda':
|
||||
tts.cuda()
|
||||
else:
|
||||
tts.to(device)
|
||||
loaded_tts[key] = {"engine": tts, "config": None}
|
||||
msg = f'{model_path} Loaded!'
|
||||
print(msg)
|
||||
@@ -778,6 +800,38 @@ class Coqui:
|
||||
language=language,
|
||||
**speaker_argument
|
||||
)
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
|
||||
# F5-TTS requires reference text and audio for voice cloning
|
||||
ref_text = "This is a reference audio for voice cloning."
|
||||
if settings['voice_path'] and os.path.exists(settings['voice_path']):
|
||||
# Use provided reference audio
|
||||
ref_audio_path = settings['voice_path']
|
||||
else:
|
||||
# Fallback to default voice
|
||||
ref_audio_path = models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
|
||||
|
||||
# Resample reference audio to expected sample rate if needed
|
||||
if ref_audio_path:
|
||||
ref_audio_path = self._resample_wav(ref_audio_path, settings['samplerate'])
|
||||
|
||||
# Get F5-TTS parameters from models configuration
|
||||
f5_settings = default_engine_settings[TTS_ENGINES['F5-TTS']]
|
||||
|
||||
# Generate audio using F5-TTS API
|
||||
wav, sr, spec = tts.infer(
|
||||
ref_file=ref_audio_path,
|
||||
ref_text=ref_text,
|
||||
gen_text=sentence,
|
||||
target_rms=0.1,
|
||||
cross_fade_duration=f5_settings.get('cross_fade_duration', 0.15),
|
||||
nfe_step=f5_settings.get('nfe_step', 32),
|
||||
cfg_strength=f5_settings.get('cfg_strength', 2),
|
||||
speed=f5_settings.get('speed', 1.0),
|
||||
remove_silence=False
|
||||
)
|
||||
|
||||
# Convert numpy array to tensor
|
||||
audio_sentence = torch.from_numpy(wav).float()
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
|
||||
@@ -1,247 +0,0 @@
|
||||
import hashlib
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
from lib import *
|
||||
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
|
||||
class F5TTS:
    """Wrapper that drives the F5-TTS engine for sentence-by-sentence audiobook synthesis.

    NOTE(review): depends on globals provided by ``from lib import *``
    (``tts_dir``, ``models``, ``TTS_ENGINES``, ``TTS_SML``, ``loaded_tts``,
    ``default_vc_model``, ``default_audio_proc_format``) and on helpers from
    ``lib.classes.tts_engines.common`` (``unload_tts``, ``append_sentence2vtt``,
    ``trim_audio``, ``is_audio_data_valid``) — confirm against lib's API.
    """

    def __init__(self, session):
        """Cache session settings and eagerly load the F5-TTS engine.

        Args:
            session: dict-like job state (keys used here: 'tts_engine',
                'fine_tuned', 'device', 'process_dir', 'final_name', ...).

        Exceptions are printed rather than raised, mirroring the other
        engine wrappers in this package.
        """
        try:
            self.session = session
            self.cache_dir = tts_dir
            self.speakers_path = None
            # Keys under which this engine (and the zero-shot VC model) are
            # registered in the shared loaded_tts cache.
            self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
            self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
            self.is_bf16 = self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported()
            self.npz_path = None
            self.npz_data = None
            self.sentences_total_time = 0.0  # running VTT timeline, in seconds
            self.sentence_idx = 1
            self.params = {TTS_ENGINES['F5-TTS']: {}}
            self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
            self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
            self.resampler_cache = {}  # (orig_sr, target_sr) -> torchaudio Resample
            self.audio_segments = []
            self._build()
        except Exception as e:
            error = f'__init__() error: {e}'
            print(error)

    def _build(self):
        """Ensure the engine for self.tts_key is loaded; return it, or False."""
        try:
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if not tts:
                if self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
                    if self.session['custom_model'] is not None:
                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
                        print(msg)
                        return False
                    else:
                        model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
                        tts = self._load_api(self.tts_key, model_path, self.session['device'])
            return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
        except Exception as e:
            error = f'build() error: {e}'
            print(error)
            return False

    def _load_api(self, key, model_path, device):
        """Load F5-TTS from a HF repo path and register it in loaded_tts.

        Returns the engine instance on success, False on failure.
        """
        global lock
        try:
            if key in loaded_tts.keys():
                return loaded_tts[key]['engine']
            # Free whatever engines currently hold the device before loading.
            unload_tts(device, [self.tts_key, self.tts_vc_key])
            with lock:
                from f5_tts.api import F5TTS
                tts = F5TTS(model=model_path, device=device)
                if tts:
                    loaded_tts[key] = {"engine": tts, "config": None}
                    msg = f'{model_path} Loaded!'
                    print(msg)
                    return tts
                else:
                    error = 'TTS engine could not be created!'
                    print(error)
        except Exception as e:
            error = f'_load_api() error: {e}'
            print(error)
        return False

    def _load_checkpoint(self, **kwargs):
        """Load F5-TTS from a local checkpoint file and register it.

        Kwargs: key, tts_engine, device, checkpoint_dir.
        Returns the engine instance on success, False on failure.
        """
        global lock
        try:
            key = kwargs.get('key')
            if key in loaded_tts.keys():
                return loaded_tts[key]['engine']
            tts_engine = kwargs.get('tts_engine')
            device = kwargs.get('device')
            unload_tts(device, [self.tts_key])
            with lock:
                checkpoint_dir = kwargs.get('checkpoint_dir')
                from f5_tts.api import F5TTS
                tts = F5TTS(ckpt_file=checkpoint_dir, device=device)
                if tts:
                    loaded_tts[key] = {"engine": tts, "config": None}
                    msg = f'{tts_engine} Loaded!'
                    print(msg)
                    return tts
                else:
                    error = 'TTS engine could not be created!'
                    print(error)
        except Exception as e:
            error = f'_load_checkpoint() error: {e}'
            # Fix: the original swallowed this message without printing it.
            print(error)
        return False

    def _tensor_type(self, audio_data):
        """Coerce audio data (tensor, ndarray, or list) to a float torch.Tensor."""
        if isinstance(audio_data, torch.Tensor):
            return audio_data
        elif isinstance(audio_data, np.ndarray):
            return torch.from_numpy(audio_data).float()
        elif isinstance(audio_data, list):
            return torch.tensor(audio_data, dtype=torch.float32)
        else:
            raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")

    def _get_resampler(self, orig_sr, target_sr):
        """Return a cached torchaudio Resample transform for this rate pair."""
        key = (orig_sr, target_sr)
        if key not in self.resampler_cache:
            self.resampler_cache[key] = torchaudio.transforms.Resample(
                orig_freq=orig_sr, new_freq=target_sr
            )
        return self.resampler_cache[key]

    def _resample_wav(self, wav_path, expected_sr):
        """Return a path to a mono PCM_16 wav at expected_sr.

        Returns wav_path unchanged when it is already mono at the expected
        rate; otherwise writes a temp file and returns its path.
        NOTE(review): temp files are not deleted here — presumably cleaned
        up with the session's process dir; confirm.
        """
        waveform, orig_sr = torchaudio.load(wav_path)
        if orig_sr == expected_sr and waveform.size(0) == 1:
            return wav_path
        if waveform.size(0) > 1:
            # Downmix multi-channel audio to mono.
            waveform = waveform.mean(dim=0, keepdim=True)
        if orig_sr != expected_sr:
            resampler = self._get_resampler(orig_sr, expected_sr)
            waveform = resampler(waveform)
        wav_numpy = waveform.squeeze(0).cpu().numpy()
        tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp_fh.name
        tmp_fh.close()
        sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
        return tmp_path

    def convert(self, sentence_number, sentence):
        """Synthesize one sentence (or SML break/pause token) to a numbered file.

        Returns True on success; False (or raises ValueError, preserving the
        original contract) on failure.
        """
        try:
            trim_audio_buffer = 0.004
            settings = self.params[self.session['tts_engine']]
            final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
            sentence = sentence.strip()
            if not sentence:
                # Fix: the original indexed sentence[-1] and crashed with
                # IndexError on empty input; treat it as a successful no-op.
                return True
            settings['voice_path'] = (
                self.session['voice'] if self.session['voice'] is not None
                else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
                else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
            )
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if tts:
                if sentence[-1].isalnum():
                    # Append an em dash so the model doesn't clip the last word.
                    sentence = f'{sentence} —'
                if sentence == TTS_SML['break']:
                    break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))  # 0.3 to 0.6 seconds
                    self.audio_segments.append(break_tensor.clone())
                    return True
                elif sentence == TTS_SML['pause']:
                    pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100)))  # 1.0 to 1.8 seconds
                    self.audio_segments.append(pause_tensor.clone())
                    return True
                else:
                    if self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
                        # F5-TTS clones a voice from a reference clip plus its
                        # transcript. No transcript is available, so a generic
                        # placeholder is used (as in the original).
                        ref_text = "This is a reference audio for voice cloning."
                        if settings['voice_path'] and os.path.exists(settings['voice_path']):
                            ref_audio_path = settings['voice_path']
                        else:
                            # Fall back to the engine's bundled default voice.
                            ref_audio_path = models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
                        if ref_audio_path:
                            # Fix: the original resampled the provided voice
                            # twice; resample exactly once here.
                            ref_audio_path = self._resample_wav(ref_audio_path, settings['samplerate'])
                        wav, sr, spec = tts.infer(
                            ref_file=ref_audio_path,
                            ref_text=ref_text,
                            gen_text=sentence,
                            target_rms=0.1,
                            cross_fade_duration=0.15,
                            nfe_step=32,
                            cfg_strength=2,
                            speed=1.0,
                            remove_silence=False
                        )
                        # F5-TTS returns a numpy array; convert to tensor.
                        audio_sentence = torch.from_numpy(wav).float()
                        if is_audio_data_valid(audio_sentence):
                            sourceTensor = self._tensor_type(audio_sentence)
                            audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                            if sentence[-1].isalnum() or sentence[-1] == '—':
                                audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
                            self.audio_segments.append(audio_tensor)
                            if not re.search(r'\w$', sentence, flags=re.UNICODE):
                                # Sentence ends in punctuation: add a short pause.
                                break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
                                self.audio_segments.append(break_tensor.clone())
                            if self.audio_segments:
                                audio_tensor = torch.cat(self.audio_segments, dim=-1)
                                start_time = self.sentences_total_time
                                duration = audio_tensor.shape[-1] / settings['samplerate']
                                end_time = start_time + duration
                                self.sentences_total_time = end_time
                                sentence_obj = {
                                    "start": start_time,
                                    "end": end_time,
                                    "text": sentence,
                                    "resume_check": self.sentence_idx
                                }
                                self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
                            if self.sentence_idx:
                                torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                                del audio_tensor
                            self.audio_segments = []
                            if os.path.exists(final_sentence_file):
                                return True
                            else:
                                error = f"Cannot create {final_sentence_file}"
                                print(error)
            else:
                error = f"convert() error: {self.session['tts_engine']} is None"
                print(error)
        except Exception as e:
            # Fix: the original built this message but never printed it, and
            # had an unreachable `return False` after the raise.
            print(f'F5TTS.convert(): {e}')
            raise ValueError(e)
        return False
|
||||
@@ -10,12 +10,8 @@ class TTSManager:
|
||||
|
||||
def _build(self):
|
||||
if self.session['tts_engine'] in TTS_ENGINES.values():
|
||||
if self.session['tts_engine'] == TTS_ENGINES['F5-TTS']:
|
||||
from lib.classes.tts_engines.f5tts import F5TTS
|
||||
self.tts = F5TTS(self.session)
|
||||
else:
|
||||
from lib.classes.tts_engines.coqui import Coqui
|
||||
self.tts = Coqui(self.session)
|
||||
from lib.classes.tts_engines.coqui import Coqui
|
||||
self.tts = Coqui(self.session)
|
||||
if self.tts:
|
||||
return True
|
||||
else:
|
||||
|
||||
@@ -1,36 +1,35 @@
|
||||
argostranslate
|
||||
beautifulsoup4
|
||||
cutlet
|
||||
deep_translator
|
||||
demucs
|
||||
docker
|
||||
ebooklib
|
||||
fastapi
|
||||
fugashi
|
||||
gradio>=5.40.0
|
||||
hangul-romanize
|
||||
indic-nlp-library
|
||||
iso-639
|
||||
jieba
|
||||
soynlp
|
||||
num2words
|
||||
pythainlp
|
||||
mutagen
|
||||
nvidia-ml-py
|
||||
phonemizer-fork
|
||||
pydub
|
||||
pyannote-audio
|
||||
PyOpenGL
|
||||
pypinyin
|
||||
ray
|
||||
regex
|
||||
translate
|
||||
tqdm
|
||||
unidic
|
||||
pymupdf4llm
|
||||
sudachipy
|
||||
sudachidict_core
|
||||
argostranslate
|
||||
beautifulsoup4
|
||||
cutlet
|
||||
deep_translator
|
||||
demucs
|
||||
docker
|
||||
ebooklib
|
||||
fastapi
|
||||
fugashi
|
||||
gradio>=5.40.0
|
||||
hangul-romanize
|
||||
indic-nlp-library
|
||||
iso-639
|
||||
jieba
|
||||
soynlp
|
||||
num2words
|
||||
pythainlp
|
||||
mutagen
|
||||
nvidia-ml-py
|
||||
phonemizer-fork
|
||||
pydub
|
||||
pyannote-audio
|
||||
PyOpenGL
|
||||
pypinyin
|
||||
ray
|
||||
regex
|
||||
translate
|
||||
tqdm
|
||||
unidic
|
||||
pymupdf4llm
|
||||
sudachipy
|
||||
sudachidict_core
|
||||
transformers==4.51.3
|
||||
coqui-tts[languages]==0.26.0
|
||||
torchvggish
|
||||
f5-tts
|
||||
torchvggish
|
||||
Reference in New Issue
Block a user