Files
ebook2audiobook/tools/npz_to_wav.py
unknown 1bbebc4430 ...
2025-06-23 07:50:25 -07:00

62 lines
2.4 KiB
Python

import os
import platform
import argparse
tmp_dir = os.path.abspath(os.path.join('..', 'tmp'))
models_dir = os.path.abspath(os.path.join('..', 'models'))
tts_dir = os.path.join(models_dir, 'tts')
os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False' # BARK option: False needs A GPU
os.environ['SUNO_USE_SMALL_MODELS'] = 'False' # BARK option: False needs a GPU with VRAM > 4GB
if platform.system() == 'Windows':
os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
import torch
import torchaudio
import numpy as np
from pathlib import Path
from bark import SAMPLE_RATE, preload_models
from bark.generation import codec_decode
def npz_to_wav(npz_path, output_path):
preload_models()
data = np.load(npz_path)
fine_prompt = data["fine_prompt"]
audio_array = codec_decode(fine_prompt)
audio_tensor = torch.tensor(audio_array).unsqueeze(0)
torchaudio.save(output_path, audio_tensor, SAMPLE_RATE)
print(f"✅ Saved: {output_path}")
def process_all_npz_in_folder(folder_path):
preload_models()
for npz_file in Path(folder_path).rglob("*.npz"):
output_path = npz_file.with_suffix(".wav")
npz_to_wav(str(npz_file), str(output_path))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process all NPZ files in a folder.")
parser.add_argument("--folder_path", type=str, required=True, help="Path to the folder containing NPZ files")
args = parser.parse_args()
folder_path = os.path.abspath(args.folder_path)
process_all_npz_in_folder(folder_path)