Files
ebook2audiobook/tools/wav_to_npz.py
unknown e6920dc462 ...
2025-06-23 07:44:58 -07:00

142 lines
5.6 KiB
Python

# NOTE: to run this script you must move it to the root of ebook2audiobook
import os

# Interpreter / licence / telemetry switches that do not depend on any
# project path.
os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'

# tts_dir and models_dir are provided by the project's `lib` package. They
# have to be bound before the assignments below, otherwise the script stops
# with a NameError before any model code is reached.
# NOTE(review): this assumes importing `lib` this early has no unwanted side
# effects (e.g. importing torch before TORCH_HOME is set) - confirm.
from lib import models_dir, tts_dir

# Redirect every model/download cache used by the TTS stack into the
# project's own directories.
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['TRANSFORMERS_CACHE'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token')

# Torch / Bark runtime flags.
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False'
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'
import argparse
import hashlib
import numpy as np
import regex as re
import shutil
import soundfile as sf
import subprocess
import tempfile
import torch
import torchaudio
import threading
import uuid
from iso639 import languages
from huggingface_hub import hf_hub_download
from pathlib import Path
from scipy.io import wavfile as wav
from scipy.signal import find_peaks
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
from lib import *
import logging
# Configure the root logger at DEBUG level for this tool.
logging.basicConfig(level=logging.DEBUG)
# Use models_dir as the torch.hub directory.
torch.hub.set_dir(models_dir)
# Registry filled by load_checkpoint(): maps the caller-supplied `key` to
# {"engine": <loaded model>, "config": <its config>}.
loaded_tts = {}
def load_checkpoint(**kwargs):
    """Create a Bark model, load its checkpoint and register it in ``loaded_tts``.

    Keyword Args:
        key: Key under which the ``{"engine", "config"}`` pair is stored in
            the module-level ``loaded_tts`` dictionary.
        tts_engine: Engine name; only used in the "Loaded!" status message.
        device: Target device. ``'cuda'`` calls ``tts.cuda()``; any other
            value is passed to ``tts.to(device)``.
        checkpoint_dir: Directory handed to ``Bark.load_checkpoint``.

    Returns:
        The loaded Bark instance on success, ``False`` on any failure.
        Failures are reported with ``print`` and never raised.
    """
    key = kwargs.get('key')
    tts_engine = kwargs.get('tts_engine')
    device = kwargs.get('device')
    checkpoint_dir = kwargs.get('checkpoint_dir')
    try:
        config = BarkConfig()
        config.CACHE_DIR = tts_dir
        # The environment variable holds the text 'True'/'False'; compare
        # case-insensitively to obtain a real boolean.
        config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
        tts = Bark.init_from_config(config)
        tts.load_checkpoint(
            config,
            checkpoint_dir=checkpoint_dir,
            eval=True
        )
        if tts:
            if device == 'cuda':
                tts.cuda()
            else:
                tts.to(device)
            loaded_tts[key] = {"engine": tts, "config": config}
            msg = f'{tts_engine} Loaded!'
            print(msg)
            return tts
        error = 'TTS engine could not be created!'
        print(error)
    except Exception as e:
        # Boundary of this command-line tool: report the failure instead of
        # swallowing it, then fall through to the False return so the caller
        # can decide what to do.
        error = f'load_checkpoint() error: {e}'
        print(error)
    return False
def wav_to_npz(bark_dir, wav_dir):
    """Run Bark once for every speaker WAV file found under ``wav_dir``.

    A file counts as a speaker when its name ends in ``.wav`` (any case) and
    starts with a two-letter lowercase prefix followed by ``_``; the prefix is
    read as an ISO 639-1 language code. For each speaker, the language's
    ``default.txt`` under ``voices_dir/<iso639-3>/`` is synthesized with the
    file's base name as ``speaker_id`` and ``bark_dir`` as ``voice_dirs``.
    NOTE(review): the ``<speaker>.npz`` file is presumably written by Bark
    itself as a side effect of that call - confirm against the Coqui TTS
    Bark implementation; this function never writes the file directly.

    Files with an unknown language prefix or without a ``default.txt`` are
    reported and skipped, so one bad file does not abort the whole batch.

    Args:
        bark_dir: Directory passed to Bark as ``voice_dirs``.
        wav_dir: Directory tree walked recursively for speaker WAV files.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    try:
        bark_engine = TTS_ENGINES['BARK']
        # Interpolate the engine name; the previous f-string had no
        # placeholder and produced the literal text "TTS_ENGINES['BARK']".
        tts_internal_key = f"{bark_engine}-internal"
        internal = models[bark_engine]['internal']
        hf_repo = internal['repo']
        hf_sub = internal['sub']
        # Fetch the first three listed model files (the original bound them
        # as text, coarse and fine); only the directory of the first one is
        # needed afterwards.
        model_paths = [
            hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{name}", cache_dir=tts_dir)
            for name in internal['files'][:3]
        ]
        checkpoint_dir = os.path.dirname(model_paths[0])
        tts = load_checkpoint(tts_engine=bark_engine, key=tts_internal_key, checkpoint_dir=checkpoint_dir, device='cpu')
        if not tts:
            print('tts bark not loaded')
            return
        fine_tuned_params = {
            "text_temp": default_engine_settings[bark_engine]['text_temp'],
            "waveform_temp": default_engine_settings[bark_engine]['waveform_temp']
        }
        config = loaded_tts[tts_internal_key]['config']
        speaker_pattern = re.compile(r"^([a-z]{2})_")
        for _root, _dirs, files in os.walk(wav_dir):
            for file in files:
                if not file.lower().endswith('.wav'):
                    continue
                match = speaker_pattern.match(file)
                if not match:
                    continue
                speaker = os.path.splitext(file)[0]
                npz_file = f'{speaker}.npz'
                iso1_lang = match.group(1)
                try:
                    lang_array = languages.get(part1=iso1_lang)
                except KeyError:
                    # iso-639 raises for unknown codes instead of returning
                    # a falsy value.
                    lang_array = None
                if not lang_array:
                    print(f"Skipping {file}: unknown ISO 639-1 language code '{iso1_lang}'")
                    continue
                iso3_lang = lang_array.part3
                default_text_path = Path(os.path.join(voices_dir, iso3_lang, 'default.txt'))
                if not default_text_path.is_file():
                    print(f"Skipping {file}: default text not found: {default_text_path}")
                    continue
                default_text = default_text_path.read_text(encoding="utf-8")
                with torch.no_grad():
                    # Fixed seed so repeated runs use the same random state.
                    torch.manual_seed(67878789)
                    # The returned audio is not needed; the call is made for
                    # its effect on the speaker's voice files.
                    tts.synthesize(
                        default_text,
                        config,
                        speaker_id=speaker,
                        voice_dirs=bark_dir,
                        silent=True,
                        **fine_tuned_params
                    )
                msg = f"Saved NPZ file: {npz_file}"
                print(msg)
    except Exception as e:
        print(f'wav_to_npz() error: {e}')
if __name__ == "__main__":
    # Command-line entry point: both directories are mandatory and are
    # resolved to absolute paths before the conversion starts.
    cli = argparse.ArgumentParser(description="Convert WAV files to Bark NPZ format.")
    for flag, help_text in (
        ("--bark_dir", "Path to the Bark asset directory"),
        ("--wav_dir", "Path to the output WAV directory"),
    ):
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()
    bark_dir = os.path.abspath(options.bark_dir)
    wav_dir = os.path.abspath(options.wav_dir)
    wav_to_npz(bark_dir, wav_dir)