This commit is contained in:
unknown
2025-11-24 07:13:36 -08:00
parent cf744e473d
commit 3b19821651
8 changed files with 44 additions and 225 deletions

View File

@@ -1 +1 @@
25.11.24
25.11.25

52
app.py
View File

@@ -268,12 +268,12 @@ def detect_gpu()->str:
# ============================================================
return 'cpu'
def parse_torch_version(current:str)->'Version':
    """
    Parse a torch version string into a comparable ``Version`` object.

    The string is first parsed as-is. If ``Version`` rejects it, everything
    from the first ``+`` onwards is dropped and the remainder is parsed
    instead.

    :param current: version string, e.g. the value of ``torch.__version__``.
    :return: the parsed ``Version`` (not a ``str``).
    :raises InvalidVersion: if the string is still invalid once the ``+`` suffix is removed.
    """
    try:
        return Version(current)
    except InvalidVersion:
        # Retry without the "+<suffix>" part that made the first parse fail.
        return Version(current.split('+')[0])

def torch_version_is_leq(current:str, target:str)->bool:
    """
    Tell whether the torch version ``current`` is lower than or equal to ``target``.

    Kept as a thin wrapper around ``parse_torch_version`` so callers of the
    previous helper keep working.

    :param current: version string to test; parsed by ``parse_torch_version``.
    :param target: version string to compare against; must be accepted by ``Version`` as-is.
    :return: ``True`` when ``current <= target``.
    """
    return parse_torch_version(current) <= Version(target)
def check_and_install_requirements(file_path:str)->bool:
if not os.path.exists(file_path):
@@ -281,7 +281,7 @@ def check_and_install_requirements(file_path:str)->bool:
print(error)
return False
try:
backend_specs = {"os": detect_platform_tag(), "arch": detect_arch_tag(), "pyvenv": sys.version_info[:2], "ref": detect_gpu()}
backend_specs = {"os": detect_platform_tag(), "arch": detect_arch_tag(), "pyvenv": sys.version_info[:2], "gpu": detect_gpu()}
print(f'--------------- {backend_specs} -------------')
try:
from packaging.specifiers import SpecifierSet
@@ -425,29 +425,43 @@ def check_and_install_requirements(file_path:str)->bool:
import numpy as np
torch_version = torch.__version__
numpy_version = Version(np.__version__)
if backend_specs['ref'] not in ['cpu', 'unknown', 'unsupported']:
pattern = re.search(r'\+(.+)$', torch_version)
tag = pattern.group(1)
match_tag = re.fullmatch(r'[0-9a-f]{7,40}', tag)
if match_tag is None or match_tag is not None and match_tag != torch_mapping['tag']:
if tag != backend_specs['ref']:
torch_pkg = torch_mapping[backend_specs['ref']]
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', torch_pkg])
if torch_version_is_leq(torch_version, '2.2.2') and numpy_version >= Version('2.0.0'):
torch_version_parsed = parse_torch_version(torch_version)
if torch_version_parsed <= Version('2.2.2') and numpy_version >= Version('2.0.0'):
try:
msg = 'torch version needs nump < 2. downgrading numpy to 1.26.4...'
msg = 'torch version needs numpy < 2. downgrading numpy to 1.26.4...'
print(msg)
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', '--use-pep517', 'numpy<2'])
except subprocess.CalledProcessError as e:
error = f'Failed to downgrade to numpy < 2: {e}'
print(error)
return False
if backend_specs['gpu'] not in ['cpu', 'unknown', 'unsupported']:
current_tag_pattern = re.search(r'\+(.+)$', torch_version)
current_tag = current_tag_pattern.group(1)
non_standard_tag = re.fullmatch(r'[0-9a-f]{7,40}', current_tag)
if non_standard_tag is None and current_tag != backend_specs['gpu'] or
non_standard_tag is not None and backend_specs['gpu'] in ['jetson-jetpack5', 'jetson-60', 'jetson-61'] and non_standard_tag != torch_mapping[backend_specs['gpu']]['tag']:
try:
backend_tag = torch_mapping[backend_specs['gpu']]['tag']
backend_os = backend_specs['os']
backend_arch = backend_specs['arch']
backend_url = torch_mapping[backend_specs['gpu']]['url']
if backend-specs['gpu'] == 'jetson-jetpack5'
torch_pkg = f''
elif backend_specs['gpu'] in ['jetson-60', 'jetson-61']:
jetson_torch_version = default_jetson60_torch if backend_specs['gpu'] == 'jetson-60' else default_jetson61_torch
torch_pkg = f'{backend_url}/v{backend_tag}/pytorch/torch-{jetson_torch_version}-{default_py_tag}-linux_{backend_arch}.whl'
else:
torch_pkg = f'{gpu_url}/{backend_tag}/torch/torch-{torch_version_parsed}+{gpu_tag}-{default_py_tag}-{backend_os}_{backend_arch}.whl'
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', torch_pkg])
import torch
torch_version = torch.__version__
except subprocess.CalledProcessError as e:
error = f'Failed to install {packages}: {e}'
print(error)
return False
devices['CUDA']['found'] = getattr(torch, "cuda", None) is not None and torch.cuda.is_available() and not (hasattr(torch.version, "hip") and torch.version.hip is not None)
devices['ROCM']['found'] = hasattr(torch.version, "hip") and torch.version.hip is not None and torch.cuda.is_available()
devices['MPS']['found'] = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

View File

@@ -33,5 +33,4 @@ pyannote-audio<=3.4.0
argostranslate<=1.10.0
gradio==5.49.1
torch<=2.7.1
torchaudio<=2.7.1
coqui-tts[languages]==0.27.2

View File

@@ -1,6 +1,7 @@
import os
import platform
import tempfile
import sys
# ---------------------------------------------------------------------
# Global configuration
@@ -84,10 +85,19 @@ default_device = devices['CPU']['proc']
default_chapters_preview = False
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">GPU howto wiki</a>'

# Running interpreter version and the matching "cpXY-cpXY" tag.
default_py_major, default_py_minor = sys.version_info[:2]
default_py_tag = '-'.join(['cp' + str(default_py_major) + str(default_py_minor)] * 2)

# Base URLs.
default_pytorch_url = 'https://download.pytorch.org/whl/'
default_jetson_url = 'https://developer.download.nvidia.com/compute/redist/jp/'
# NOTE(review): the value below looks like a placeholder (masked host) - confirm before use.
default_compiled_url = 'https://xxxxxxxxxx/compiled/xxxxxx.whl'

# Default torch build identifiers per Jetson target.
# NOTE(review): the jetson5 entry is empty - presumably not decided yet, confirm.
default_jetson5_torch = ''
default_jetson60_torch = '2.4.0a0+3bcc3cddb5.nv24.07.16234504'
default_jetson61_torch = '2.5.0a0+872d972e41.nv24.08.17622132'
torch_mapping = {
# CUDA

View File

@@ -52,7 +52,6 @@ dependencies = [
"argostranslate<=1.10.0",
"gradio==5.49.1",
"torch<=2.7.1",
"torchaudio<=2.7.1",
"coqui-tts[languages]==0.27.2"
]
readme = "README.md"

View File

@@ -33,5 +33,4 @@ pyannote-audio==3.4.0
argostranslate==1.10.0
gradio==5.49.1
torch<=2.7.1
torchaudio<=2.7.1
coqui-tts[languages]==0.27.2

View File

@@ -1,61 +0,0 @@
import os
import platform
import argparse
# Working directories, resolved against the current working directory
# (the script expects to be launched from a sub-folder of the project).
tmp_dir = os.path.abspath(os.path.join('..', 'tmp'))
models_dir = os.path.abspath(os.path.join('..', 'models'))
tts_dir = os.path.join(models_dir, 'tts')

# Environment must be configured before the heavy imports that follow
# (torch, torchaudio, bark), hence this block sits above them.
# Each key is set exactly once (the original assigned PYTHONIOENCODING twice).
os.environ.update({
    'PYTHONUTF8': '1',
    'PYTHONIOENCODING': 'utf-8',
    'COQUI_TOS_AGREED': '1',
    'CALIBRE_NO_NATIVE_FILEDIALOGS': '1',
    'DO_NOT_TRACK': 'true',
    'CALIBRE_TEMP_DIR': tmp_dir,
    'CALIBRE_CACHE_DIRECTORY': tmp_dir,
    'HUGGINGFACE_HUB_CACHE': tts_dir,
    'HF_HOME': tts_dir,
    'HF_DATASETS_CACHE': tts_dir,
    'BARK_CACHE_DIR': tts_dir,
    'TTS_CACHE': tts_dir,
    'TORCH_HOME': tts_dir,
    'TTS_HOME': models_dir,
    'XDG_CACHE_HOME': models_dir,
    'ARGOS_TRANSLATE_PACKAGE_PATH': os.path.join(models_dir, 'argostranslate'),
    'HF_TOKEN_PATH': os.path.join(os.path.expanduser('~'), '.huggingface_token'),
    'TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD': '1',
    'PYTORCH_ENABLE_MPS_FALLBACK': '1',
    'SUNO_OFFLOAD_CPU': 'False',  # BARK option: False needs A GPU
    'SUNO_USE_SMALL_MODELS': 'False',  # BARK option: False needs a GPU with VRAM > 4GB
})

# Windows only: point eSpeak NG at the data folder of a scoop installation.
if platform.system() == 'Windows':
    os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
import torch
import torchaudio
import numpy as np
from pathlib import Path
from bark import SAMPLE_RATE, preload_models
from bark.generation import codec_decode
def npz_to_wav(npz_path, output_path):
    """
    Decode the ``fine_prompt`` entry of an ``.npz`` file and save it as audio.

    :param npz_path: path of the ``.npz`` archive; it must contain a ``fine_prompt`` entry.
    :param output_path: destination path handed to ``torchaudio.save``.
    :raises KeyError: if the archive has no ``fine_prompt`` entry.
    """
    preload_models()
    # np.load() on an .npz archive keeps a file handle open until the
    # returned object is closed; the context manager releases it as soon as
    # the array has been read into memory.
    with np.load(npz_path) as data:
        fine_prompt = data["fine_prompt"]
    audio_array = codec_decode(fine_prompt)
    # Add a leading dimension before saving.
    audio_tensor = torch.tensor(audio_array).unsqueeze(0)
    torchaudio.save(output_path, audio_tensor, SAMPLE_RATE)
    print(f"✅ Saved: {output_path}")
def process_all_npz_in_folder(folder_path):
    """
    Convert every ``*.npz`` file found under ``folder_path`` (recursively).

    Each output is written next to its source, with the suffix replaced
    by ``.wav``.

    :param folder_path: root folder to scan.
    """
    preload_models()
    search_root = Path(folder_path)
    for source in search_root.rglob("*.npz"):
        target = source.with_suffix(".wav")
        npz_to_wav(str(source), str(target))
if __name__ == "__main__":
    # Command-line entry point: convert every NPZ file below --folder_path.
    cli = argparse.ArgumentParser(description="Process all NPZ files in a folder.")
    cli.add_argument(
        "--folder_path",
        type=str,
        required=True,
        help="Path to the folder containing NPZ files",
    )
    options = cli.parse_args()
    # Work on an absolute path so the result does not depend on later chdir calls.
    process_all_npz_in_folder(os.path.abspath(options.folder_path))

View File

@@ -1,141 +0,0 @@
# NOTE: to run this script you must move it to the root of ebook2audiobook
import os
# tts_dir and models_dir are needed right here, but the original only brought
# them in through the `from lib import *` further down, so the first use
# raised NameError. Import them explicitly before use.
# NOTE(review): assumes importing `lib` has no dependency on the variables set
# below - confirm against lib/__init__.py.
from lib import models_dir, tts_dir

# Environment must be configured before torch / TTS / huggingface_hub are
# imported below. Each key is set exactly once (the original assigned
# PYTHONIOENCODING twice).
os.environ.update({
    'PYTHONUTF8': '1',
    'PYTHONIOENCODING': 'utf-8',
    'COQUI_TOS_AGREED': '1',
    'DO_NOT_TRACK': 'true',
    'HUGGINGFACE_HUB_CACHE': tts_dir,
    'HF_HOME': tts_dir,
    'TRANSFORMERS_CACHE': tts_dir,
    'HF_DATASETS_CACHE': tts_dir,
    'BARK_CACHE_DIR': tts_dir,
    'TTS_CACHE': tts_dir,
    'TORCH_HOME': tts_dir,
    'TTS_HOME': models_dir,
    'XDG_CACHE_HOME': models_dir,
    'HF_TOKEN_PATH': os.path.join(os.path.expanduser('~'), '.huggingface_token'),
    'TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD': '1',
    'PYTORCH_ENABLE_MPS_FALLBACK': '1',
    'SUNO_OFFLOAD_CPU': 'False',
    'SUNO_USE_SMALL_MODELS': 'False',
})
import argparse
import hashlib
import numpy as np
import regex as re
import shutil
import soundfile as sf
import subprocess
import tempfile
import torch
import torchaudio
import threading
import uuid
from iso639 import languages
from huggingface_hub import hf_hub_download
from pathlib import Path
from scipy.io import wavfile as wav
from scipy.signal import find_peaks
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
from lib import *
import logging
logging.basicConfig(level=logging.DEBUG)
torch.hub.set_dir(models_dir)
loaded_tts = {}
def load_checkpoint(**kwargs):
    """
    Create a Bark engine from a checkpoint folder and register it in ``loaded_tts``.

    Keyword arguments (all read with ``kwargs.get``, so a missing one is ``None``):
        key: dictionary key under which the engine and its config are stored in ``loaded_tts``.
        tts_engine: engine name, only used in the success message.
        device: ``'cuda'`` calls ``tts.cuda()``; any other value is passed to ``tts.to()``.
        checkpoint_dir: folder handed to ``tts.load_checkpoint``.

    :return: the loaded engine on success, ``False`` on any failure
        (the failure reason is printed).
    """
    try:
        key = kwargs.get('key')
        tts_engine = kwargs.get('tts_engine')
        device = kwargs.get('device')
        checkpoint_dir = kwargs.get('checkpoint_dir')
        config = BarkConfig()
        config.CACHE_DIR = tts_dir
        # Small models are used only when the environment variable is exactly "true" (case-insensitive).
        config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
        tts = Bark.init_from_config(config)
        tts.load_checkpoint(
            config,
            checkpoint_dir=checkpoint_dir,
            eval=True
        )
        if not tts:
            error = 'TTS engine could not be created!'
            print(error)
            return False
        if device == 'cuda':
            tts.cuda()
        else:
            tts.to(device)
        loaded_tts[key] = {"engine": tts, "config": config}
        msg = f'{tts_engine} Loaded!'
        print(msg)
        return tts
    except Exception as e:
        # Best-effort loader: report the problem instead of raising. The
        # original built this message but never printed it.
        error = f'_load_checkpoint() error: {e}'
        print(error)
        return False
def wav_to_npz(bark_dir, wav_dir):
    """
    Run Bark synthesis once for every ``xx_*.wav`` voice file found under ``wav_dir``.

    For each matching file, the two-letter prefix is mapped to an ISO 639-3
    code, the ``default.txt`` of that language under ``voices_dir`` is read,
    and ``tts.synthesize`` is called with the file stem as ``speaker_id`` and
    ``bark_dir`` as ``voice_dirs``. The returned audio is discarded.
    NOTE(review): the NPZ file is presumably written by ``synthesize`` itself
    as a side effect - confirm against the TTS library.

    Any exception aborts the whole run and is printed.

    :param bark_dir: folder passed to ``synthesize`` as ``voice_dirs``.
    :param wav_dir: folder walked recursively for ``.wav`` files.
    """
    try:
        engine = TTS_ENGINES['BARK']
        internal = models[engine]['internal']
        # The original f-string had no placeholder, so the key was the literal
        # text "TTS_ENGINES['BARK']-internal"; interpolate the engine name.
        tts_internal_key = f"{engine}-internal"
        hf_repo = internal['repo']
        hf_sub = internal['sub']
        # All three model files are downloaded; only the folder holding the
        # first one is used afterwards, as the checkpoint directory.
        model_paths = [
            hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{internal['files'][index]}", cache_dir=tts_dir)
            for index in (0, 1, 2)
        ]
        checkpoint_dir = os.path.dirname(model_paths[0])
        tts = load_checkpoint(tts_engine=engine, key=tts_internal_key, checkpoint_dir=checkpoint_dir, device='cpu')
        if not tts:
            print('tts bark not loaded')
            return
        fine_tuned_params = {
            "text_temp": default_engine_settings[engine]['text_temp'],
            "waveform_temp": default_engine_settings[engine]['waveform_temp']
        }
        for _root, _dirs, files in os.walk(wav_dir):
            for file in files:
                if not file.lower().endswith('.wav'):
                    continue
                # Only files named "<two lowercase letters>_..." are processed.
                match = re.match(r"^([a-z]{2})_", file)
                if not match:
                    continue
                speaker = os.path.splitext(file)[0]
                npz_file = f'{speaker}.npz'
                iso1_lang = match.group(1)
                lang_array = languages.get(part1=iso1_lang)
                if not lang_array:
                    continue
                iso3_lang = lang_array.part3
                default_text_file = os.path.join(voices_dir, iso3_lang, 'default.txt')
                default_text = Path(default_text_file).read_text(encoding="utf-8")
                with torch.no_grad():
                    # Fixed seed, re-applied before every synthesis call.
                    torch.manual_seed(67878789)
                    audio_data = tts.synthesize(
                        default_text,
                        loaded_tts[tts_internal_key]['config'],
                        speaker_id=speaker,
                        voice_dirs=bark_dir,
                        silent=True,
                        **fine_tuned_params
                    )
                # The audio itself is not needed; drop the reference straight away.
                del audio_data
                msg = f"Saved NPZ file: {npz_file}"
                print(msg)
    except Exception as e:
        print(f'wav_to_npz() error: {e}')
if __name__ == "__main__":
    # Command-line entry point: both folders are required and made absolute.
    cli = argparse.ArgumentParser(description="Convert WAV files to Bark NPZ format.")
    for flag, description in (
        ("--bark_dir", "Path to the Bark asset directory"),
        ("--wav_dir", "Path to the output WAV directory"),
    ):
        cli.add_argument(flag, type=str, required=True, help=description)
    options = cli.parse_args()
    wav_to_npz(os.path.abspath(options.bark_dir), os.path.abspath(options.wav_dir))