mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-10 06:18:02 -05:00
142 lines
5.6 KiB
Python
142 lines
5.6 KiB
Python
# NOTE: to run this script you must move it to the root of ebook2audiobook
|
|
|
|
import os
|
|
|
|
# --- Process-wide environment configuration --------------------------------
# These variables are exported before the third-party imports further down,
# presumably so that libraries which read them at import time pick them up.

# Settings that do not depend on any project path.
os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False'
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'

# `tts_dir` and `models_dir` are not defined in this file; the wildcard import
# of the project's `lib` package is the only place they can come from, so it
# has to run BEFORE the assignments below (otherwise they raise NameError).
# NOTE(review): if `lib` itself imports huggingface_hub / TTS at import time,
# those libraries may resolve their cache locations before the variables below
# are set -- confirm, and import only the path constants if that is the case.
from lib import *  # noqa: E402,F401,F403

# Cache locations that live under the TTS directory.
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['TRANSFORMERS_CACHE'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir

# Cache locations that live under the models directory.
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
|
|
|
|
import argparse
|
|
import hashlib
|
|
import numpy as np
|
|
import regex as re
|
|
import shutil
|
|
import soundfile as sf
|
|
import subprocess
|
|
import tempfile
|
|
import torch
|
|
import torchaudio
|
|
import threading
|
|
import uuid
|
|
|
|
from iso639 import languages
|
|
from huggingface_hub import hf_hub_download
|
|
from pathlib import Path
|
|
from scipy.io import wavfile as wav
|
|
from scipy.signal import find_peaks
|
|
from TTS.tts.configs.bark_config import BarkConfig
|
|
from TTS.tts.models.bark import Bark
|
|
|
|
from lib import *
|
|
|
|
import logging
|
|
# Emit everything from DEBUG upwards on the root logger.
logging.basicConfig(level=logging.DEBUG)

# Point torch.hub at `models_dir` (a name expected to come from `lib`).
torch.hub.set_dir(models_dir)

# Registry of loaded engines: key -> {"engine": <model>, "config": <config>}.
# Filled by load_checkpoint() and read back by wav_to_npz().
loaded_tts: dict = {}
|
|
|
|
def load_checkpoint(**kwargs):
    """Create a Bark engine, load its checkpoint and register it in ``loaded_tts``.

    Keyword Args:
        key: Key under which the engine and its config are stored in
            ``loaded_tts``.
        tts_engine: Engine label; only used in the "Loaded!" message.
        device: Target device. ``'cuda'`` calls ``tts.cuda()``; any other
            value is passed to ``tts.to(device)``.
        checkpoint_dir: Directory passed to ``tts.load_checkpoint``.

    Returns:
        The loaded engine on success, ``False`` on any failure. Failures are
        reported on stdout rather than raised.
    """
    try:
        key = kwargs.get('key')
        tts_engine = kwargs.get('tts_engine')
        device = kwargs.get('device')
        checkpoint_dir = kwargs.get('checkpoint_dir')

        config = BarkConfig()
        config.CACHE_DIR = tts_dir
        # The environment variable is a string; only the literal "true"
        # (case-insensitive) switches to the smaller models.
        config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'

        tts = Bark.init_from_config(config)
        tts.load_checkpoint(
            config,
            checkpoint_dir=checkpoint_dir,
            eval=True
        )
        if not tts:
            error = 'TTS engine could not be created!'
            print(error)
            return False

        if device == 'cuda':
            tts.cuda()
        else:
            tts.to(device)
        loaded_tts[key] = {"engine": tts, "config": config}
        msg = f'{tts_engine} Loaded!'
        print(msg)
        return tts
    except Exception as e:
        # Previously this message was built but never shown, so every failure
        # was invisible to the user; report it before signalling failure.
        error = f'_load_checkpoint() error: {e}'
        print(error)
    return False
|
|
|
|
def wav_to_npz(bark_dir, wav_dir):
    """Run Bark once per language-prefixed WAV found under ``wav_dir``.

    Downloads the first three internal Bark model files from the Hugging Face
    hub, loads the engine on CPU, then walks ``wav_dir`` recursively. For each
    ``*.wav`` whose name starts with a two-letter lowercase code followed by
    ``_`` and whose code is known to ``iso639``, the language's
    ``default.txt`` (under ``voices_dir/<iso-639-3>/``) is synthesized with
    the file's stem as ``speaker_id`` and ``bark_dir`` as ``voice_dirs``.

    NOTE(review): the returned audio is discarded; the NPZ is presumably
    written by Bark itself as a side effect of the speaker lookup -- confirm
    against the TTS library.

    Args:
        bark_dir: Directory passed to Bark as ``voice_dirs``.
        wav_dir: Directory tree that is scanned for ``.wav`` files.

    Returns:
        None. Any exception is caught and reported on stdout, which also
        stops processing of the remaining files.
    """
    try:
        engine_name = TTS_ENGINES['BARK']
        # The original f-string had no placeholder, so the key was the literal
        # text "TTS_ENGINES['BARK']-internal"; interpolate the engine name.
        tts_internal_key = f"{engine_name}-internal"

        internal = models[engine_name]['internal']
        hf_repo = internal['repo']
        hf_sub = internal['sub']
        # Fetch the first three model files (text, coarse, fine in the
        # original naming). Only the first path is used afterwards, but the
        # other two downloads are kept for their side effect.
        model_paths = [
            hf_hub_download(
                repo_id=hf_repo,
                filename=f"{hf_sub}{internal['files'][idx]}",
                cache_dir=tts_dir,
            )
            for idx in range(3)
        ]
        checkpoint_dir = os.path.dirname(model_paths[0])

        tts = load_checkpoint(
            tts_engine=engine_name,
            key=tts_internal_key,
            checkpoint_dir=checkpoint_dir,
            device='cpu',
        )
        if not tts:
            print('tts bark not loaded')
            return

        settings = default_engine_settings[engine_name]
        fine_tuned_params = {
            "text_temp": settings['text_temp'],
            "waveform_temp": settings['waveform_temp']
        }
        config = loaded_tts[tts_internal_key]['config']
        lang_prefix = re.compile(r"^([a-z]{2})_")

        for _root, _dirs, files in os.walk(wav_dir):
            for file in files:
                if not file.lower().endswith('.wav'):
                    continue
                match = lang_prefix.match(file)
                if not match:
                    continue
                lang_array = languages.get(part1=match.group(1))
                if not lang_array:
                    continue

                speaker = os.path.splitext(file)[0]
                npz_file = f'{speaker}.npz'
                iso3_lang = lang_array.part3
                default_text_file = os.path.join(voices_dir, iso3_lang, 'default.txt')
                default_text = Path(default_text_file).read_text(encoding="utf-8")

                with torch.no_grad():
                    # Fixed seed so repeated runs sample identically.
                    torch.manual_seed(67878789)
                    # The result is intentionally not kept (see docstring).
                    tts.synthesize(
                        default_text,
                        config,
                        speaker_id=speaker,
                        voice_dirs=bark_dir,
                        silent=True,
                        **fine_tuned_params
                    )
                msg = f"Saved NPZ file: {npz_file}"
                print(msg)
    except Exception as e:
        print(f'wav_to_npz() error: {e}')
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both directories are mandatory and are
    # converted to absolute paths before being handed to wav_to_npz().
    cli = argparse.ArgumentParser(description="Convert WAV files to Bark NPZ format.")
    cli.add_argument(
        "--bark_dir",
        type=str,
        required=True,
        help="Path to the Bark asset directory",
    )
    cli.add_argument(
        "--wav_dir",
        type=str,
        required=True,
        help="Path to the output WAV directory",
    )
    options = cli.parse_args()
    wav_to_npz(
        os.path.abspath(options.bark_dir),
        os.path.abspath(options.wav_dir),
    )
|
|
|