Merge branch 'v25' into DrewThomasson-RafeBeckley-patch-1

This commit is contained in:
Drew Thomasson
2025-11-11 07:05:07 -05:00
committed by GitHub
57 changed files with 4422 additions and 3809 deletions

View File

@@ -16,7 +16,6 @@ RUN apt-get update && \
# Install Rust compiler
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Set the working directory
WORKDIR /app
# Install UniDic (non-torch dependent)
RUN pip install --no-cache-dir unidic-lite unidic && \
@@ -31,74 +30,61 @@ ARG TORCH_VERSION=""
# Add parameter to control whether to skip the XTTS test
ARG SKIP_XTTS_TEST="false"
# Copy the application
WORKDIR /app
COPY . /app
# Extract torch versions from requirements.txt or set to empty strings if not found
RUN TORCH_VERSION_REQ=$(grep -E "^torch==" requirements.txt | cut -d'=' -f3 || echo "") && \
TORCHAUDIO_VERSION_REQ=$(grep -E "^torchaudio==" requirements.txt | cut -d'=' -f3 || echo "") && \
TORCHVISION_VERSION_REQ=$(grep -E "^torchvision==" requirements.txt | cut -d'=' -f3 || echo "") && \
echo "Found in requirements: torch==$TORCH_VERSION_REQ torchaudio==$TORCHAUDIO_VERSION_REQ torchvision==$TORCHVISION_VERSION_REQ"
# Install PyTorch with CUDA support if specified
# Install requirements.txt or PyTorch variants based on TORCH_VERSION
RUN if [ ! -z "$TORCH_VERSION" ]; then \
# Check if we need to use specific versions or get the latest
if [ ! -z "$TORCH_VERSION_REQ" ] && [ ! -z "$TORCHVISION_VERSION_REQ" ] && [ ! -z "$TORCHAUDIO_VERSION_REQ" ]; then \
echo "Using specific versions from requirements.txt" && \
TORCH_SPEC="torch==${TORCH_VERSION_REQ}" && \
TORCHVISION_SPEC="torchvision==${TORCHVISION_VERSION_REQ}" && \
TORCHAUDIO_SPEC="torchaudio==${TORCHAUDIO_VERSION_REQ}"; \
else \
echo "Using latest versions for the selected variant" && \
TORCH_SPEC="torch" && \
TORCHVISION_SPEC="torchvision" && \
TORCHAUDIO_SPEC="torchaudio"; \
fi && \
\
# Check if TORCH_VERSION contains "cuda" and extract version number
if echo "$TORCH_VERSION" | grep -q "cuda"; then \
CUDA_VERSION=$(echo "$TORCH_VERSION" | sed 's/cuda//g') && \
echo "Detected CUDA version: $CUDA_VERSION" && \
echo "Attempting to install PyTorch nightly for CUDA $CUDA_VERSION..." && \
#if ! pip install --no-cache-dir --pre $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
if ! pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
echo "❌ Nightly build for CUDA $CUDA_VERSION not available or failed" && \
echo "🔄 Trying stable release for CUDA $CUDA_VERSION..." && \
#if pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
if pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
echo "✅ Successfully installed stable PyTorch for CUDA $CUDA_VERSION"; \
else \
echo "❌ Both nightly and stable builds failed for CUDA $CUDA_VERSION"; \
echo "💡 This CUDA version may not be supported by PyTorch"; \
exit 1; \
fi; \
\
# Special handling for CUDA 11.8
if [ "$CUDA_VERSION" = "118" ]; then \
echo "Installing PyTorch for CUDA 11.8..." && \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install pyannote-audio==3.4.0 && pip install --no-cache-dir --upgrade torch==2.7.1 torchvision==2.7.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu118; \
elif [ "$CUDA_VERSION" = "128" ]; then \
echo "Installing PyTorch for CUDA 12.8..." && \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch==2.7.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128; \
else \
echo "✅ Successfully installed nightly PyTorch for CUDA $CUDA_VERSION"; \
echo "Attempting to install stable PyTorch for CUDA $CUDA_VERSION..." && \
if ! pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
echo "❌ Stable build for CUDA $CUDA_VERSION not available or failed" && \
echo "🔄 Trying nightly release for CUDA $CUDA_VERSION..." && \
if pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
echo "✅ Successfully installed nightly PyTorch for CUDA $CUDA_VERSION"; \
else \
echo "❌ Both stable and nightly builds failed for CUDA $CUDA_VERSION"; \
echo "💡 This CUDA version may not be supported by PyTorch"; \
exit 1; \
fi; \
else \
echo "✅ Successfully installed stable PyTorch for CUDA $CUDA_VERSION"; \
fi; \
fi; \
else \
# Handle non-CUDA cases (existing functionality)
# Handle non-CUDA cases
case "$TORCH_VERSION" in \
"rocm") \
# Using the correct syntax for ROCm PyTorch installation
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/rocm6.2 \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 \
;; \
"xpu") \
# Install PyTorch with Intel XPU support through IPEX
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC && \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio && \
pip install --no-cache-dir intel-extension-for-pytorch --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
;; \
"cpu") \
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/cpu \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu \
;; \
*) \
pip install --no-cache-dir $TORCH_VERSION \
echo "Installing custom PyTorch specification: $TORCH_VERSION" && \
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade $TORCH_VERSION \
;; \
esac; \
fi && \
# Install remaining requirements, skipping torch packages that might be there
grep -v -E "^torch==|^torchvision==|^torchaudio==|^torchvision$" requirements.txt > requirements_no_torch.txt && \
pip install --no-cache-dir --upgrade -r requirements_no_torch.txt && \
rm requirements_no_torch.txt; \
fi; \
else \
# Install all requirements as specified
echo "No TORCH_VERSION specified, using packages from requirements.txt" && \
pip install --no-cache-dir --upgrade -r requirements.txt; \
fi
@@ -114,9 +100,6 @@ RUN if [ "$SKIP_XTTS_TEST" != "true" ]; then \
echo "Skipping XTTS test run as requested."; \
fi
# Copy the application
COPY . /app
# Expose the required port
EXPOSE 7860
# Start the Gradio app with the required flag
@@ -126,3 +109,12 @@ ENTRYPOINT ["python", "app.py", "--script_mode", "full_docker"]
#docker build --pull --build-arg BASE_IMAGE=athomasson2/ebook2audiobook:latest -t your-image-name .
#The --pull flag forces Docker to always try to pull the latest version of the image, even if it already exists locally.
#Without --pull, Docker will only use the local version if it exists, which might not be the latest.
# Example build commands:
# For CUDA 11.8: docker build --build-arg TORCH_VERSION=cuda118 -t your-image-name .
# For CUDA 12.8: docker build --build-arg TORCH_VERSION=cuda128 -t your-image-name .
# For CUDA 12.1: docker build --build-arg TORCH_VERSION=cuda121 -t your-image-name .
# For ROCm: docker build --build-arg TORCH_VERSION=rocm -t your-image-name .
# For CPU: docker build --build-arg TORCH_VERSION=cpu -t your-image-name .
# For XPU: docker build --build-arg TORCH_VERSION=xpu -t your-image-name .
# Default (no TORCH_VERSION): docker build -t your-image-name .

165
README.md
View File

@@ -1,6 +1,6 @@
# 📚 ebook2audiobook
CPU/GPU Converter from eBooks to audiobooks with chapters and metadata<br/>
using XTTSv2, Bark, Vits, Fairseq, YourTTS, Tacotron and more. Supports voice cloning and +1110 languages!
using XTTSv2, Bark, Vits, Fairseq, YourTTS, Tacotron2 and more. Supports voice cloning and +1110 languages!
> [!IMPORTANT]
**This tool is intended for use with non-DRM, legally acquired eBooks only.** <br>
The authors are not responsible for any misuse of this software or any resulting legal consequences. <br>
@@ -83,18 +83,18 @@ https://github.com/user-attachments/assets/81c4baad-117e-4db5-ac86-efc2b7fea921
- [Basic Headless Usage](#basic--usage)
- [Headless Custom XTTS Model Usage](#example-of-custom-model-zip-upload)
- [Help command output](#help-command-output)
- [Run Remotely](#run-remotely)
- [Run Remotely](#run-remotely)
- [Docker](#docker-compose)
- [Docker Compose (Recommended)](#docker-compose)
- [Docker Compose Headless](#compose-headless)
- [Compose Build Arguments](#compose-build-arguments)
- [Compose container file locations](#compose-container-file-locations)
- [Common Docker issues](#common-docker-issues)
- [Docker Build (Manual)](https://github.com/DrewThomasson/ebook2audiobook/wiki/Manual-Docker-Guide)
- [Fine Tuned TTS models](#fine-tuned-tts-models)
- [Collection of Fine-Tuned TTS Models](#fine-tuned-tts-collection)
- [Train XTTSv2](#fine-tune-your-own-xttsv2-model)
- [Docker](#docker-gpu-options)
- [GPU options](#docker-gpu-options)
- [Docker Run](#running-the-pre-built-docker-container)
- [Docker Build](#building-the-docker-container)
- [Docker Compose](#docker-compose)
- [Docker headless guide](#docker-headless-guide)
- [Docker container file locations](#docker-container-file-locations)
- [Common Docker issues](#common-docker-issues)
- [Supported eBook Formats](#supported-ebook-formats)
- [Output Formats](#output-formats)
- [Updating to Latest Version](#updating-to-latest-version)
@@ -125,7 +125,7 @@ https://github.com/user-attachments/assets/81c4baad-117e-4db5-ac86-efc2b7fea921
## Hardware Requirements
- 4gb RAM minimum, 8GB recommended
- 2gb RAM minimum, 8GB recommended
- Virtualization enabled if running on windows (Docker only)
- CPU (intel, AMD, ARM), GPU (Nvidia, AMD*, Intel*) (Recommended), MPS (Apple Silicon CPU)
*available very soon
@@ -147,16 +147,18 @@ cd ebook2audiobook
```
### Launching Gradio Web Interface
1. **Run ebook2audiobook**:
1. **Run ebook2audiobook**:
- **Linux/MacOS**
```bash
./ebook2audiobook.sh # Run launch script
```
<i>Note for MacOS users: Homebrew is installed in order to install any missing programs.</i>
- **Mac Launcher**
Double click `Mac Ebook2Audiobook Launcher.command`
- **Windows**
```bash
ebook2audiobook.cmd # Run launch script or double click on it
@@ -164,22 +166,12 @@ cd ebook2audiobook
- **Windows Launcher**
Double click `ebook2audiobook.cmd`
- **Manual Python Install**
```bash
# (for experts only!)
REQUIRED_PROGRAMS=("calibre" "ffmpeg" "nodejs" "mecab" "espeak-ng" "rust" "sox")
REQUIRED_PYTHON_VERSION="3.12"
pip install -r requirements.txt # Install Python Requirements
python app.py # Run Ebook2Audiobook
```
1. **Open the Web App**: Click the URL provided in the terminal to access the web app and convert eBooks. `http://localhost:7860/`
2. **For Public Link**:
`python app.py --share` (all OS)
`./ebook2audiobook.sh --share` (Linux/MacOS)
`ebook2audiobook.cmd --share` (Windows)
`python app.py --share` (all OS)
> [!IMPORTANT]
**If the script is stopped and run again, you need to refresh your gradio GUI interface<br>
@@ -341,84 +333,11 @@ NOTE: in gradio/gui mode, to cancel a running conversion, just click on the [X]
TIP: if it needs some more pauses, just add '###' or '[pause]' between the words you wish more pause. one [pause] equals to 1.4 seconds
#### Docker GPU Options
Available pre-build tags: `latest` (CUDA 11.8)
#### Edit: IF GPU isn't detected then you'll have to build the image -> [Building the Docker Container](#building-the-docker-container)
#### Running the pre-built Docker Container
-Run with CPU only
```powershell
docker run --pull always --rm -p 7860:7860 athomasson2/ebook2audiobook
```
-Run with GPU Speedup (NVIDIA compatible only)
```powershell
docker run --pull always --rm --gpus all -p 7860:7860 athomasson2/ebook2audiobook
```
This command will start the Gradio interface on port 7860.(localhost:7860)
- For more options add the parameter `--help`
#### Building the Docker Container
- You can build the docker image with the command:
```powershell
docker build -t athomasson2/ebook2audiobook .
```
#### Available Docker Build Arguments
`--build-arg TORCH_VERSION=cuda118` Available tags: [cuda121, cuda118, cuda128, rocm, xpu, cpu]
All CUDA version numbers should work, Ex: CUDA 11.6-> cuda116
`--build-arg SKIP_XTTS_TEST=true` (Saves space by not baking XTTSv2 model into docker image)
## Docker container file locations
All ebook2audiobooks will have the base dir of `/app/`
For example:
`tmp` = `/app/tmp`
`audiobooks` = `/app/audiobooks`
## Docker headless guide
> [!IMPORTANT]
**For simpler headless setup use the [Compose](#compose-headless).** <br>
- Before you run this you need to create a dir named "input-folder" in your current dir
which will be linked. This is where you can put your input files for the docker image to see
```bash
mkdir input-folder && mkdir Audiobooks
```
- In the command below swap out **YOUR_INPUT_FILE.TXT** with the name of your input file
```bash
docker run --pull always --rm \
-v $(pwd)/input-folder:/app/input_folder \
-v $(pwd)/audiobooks:/app/audiobooks \
athomasson2/ebook2audiobook \
--headless --ebook /input_folder/YOUR_EBOOK_FILE
```
- The output Audiobooks will be found in the Audiobook folder which will also be located
in your local dir you ran this docker command in
## To get the help command for the other parameters this program has you can run this
```bash
docker run --pull always --rm athomasson2/ebook2audiobook --help
```
That will output this
[Help command output](#help-command-output)
### Docker Compose
This project uses Docker Compose to run locally. You can enable or disable GPU support
by setting either `*gpu-enabled` or `*gpu-disabled` in `docker-compose.yml`
For pre-built image enable `#image: docker.io/athomasson2/ebook2audiobook:latest` in `docker-compose.yml`
#### Steps to Run
@@ -429,46 +348,48 @@ by setting either `*gpu-enabled` or `*gpu-disabled` in `docker-compose.yml`
```
2. **Set GPU Support (disabled by default)**
To enable GPU support, modify `docker-compose.yml` and change `*gpu-disabled` to `*gpu-enabled`
3. **Start the service:**
4. **Start the service:**
```bash
# Docker
docker-compose up -d # To rebuild add --build
docker-compose up -d # To rebuild add --build
# To stop -> docker-compose down
# Podman
podman compose -f podman-compose.yml up -d # To rebuild add --build
# To stop -> podman compose -f podman-compose.yml down
```
4. **Access the service:**
5. **Access the service:**
The service will be available at http://localhost:7860.
### Compose Build Arguments
```bash
SKIP_XTTS_TEST: "true" # (Saves space by not baking xtts model into docker image)
TORCH_VERSION: cuda118 # Available tags: [cuda121, cuda118, cuda128, rocm, xpu, cpu] # All CUDA version numbers should work, Ex: CUDA 11.6-> cuda116
```
### Compose Headless
[Headless Wiki for more info](https://github.com/DrewThomasson/ebook2audiobook/wiki/Docker-Compose-Headless-guide)
```bash
A headless example is already contained within the `docker-compose.yml` file.
The `docker-compose.yml` file will act as the base dir for any headless commands added.
```
### Compose container file locations
```bash
By Default: All compose containers share the contents your local `ebook2audiobook` folder
```
## Common Docker Issues
### Common Docker Issues
- My NVIDIA GPU isn't being detected?? -> [GPU ISSUES Wiki Page](https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES)
- `python: can't open file '/home/user/app/app.py': [Errno 2] No such file or directory` (Just remove all post arguments as I replaced the `CMD` with `ENTRYPOINT` in the [Dockerfile](Dockerfile))
- Example: `docker run --pull always athomasson2/ebook2audiobook app.py --script_mode full_docker` - > corrected - > `docker run --pull always athomasson2/ebook2audiobook`
- Arguments can be easily added like this now `docker run --pull always athomasson2/ebook2audiobook --share`
- Docker gets stuck downloading Fine-Tuned models.
(This does not happen for every computer but some appear to run into this issue)
Disabling the progress bar appears to fix the issue,
as discussed [here in #191](https://github.com/DrewThomasson/ebook2audiobook/issues/191)
Example of adding this fix in the `docker run` command
```bash
docker run --pull always --rm --gpus all -e HF_HUB_DISABLE_PROGRESS_BARS=1 -e HF_HUB_ENABLE_HF_TRANSFER=0 \
-p 7860:7860 athomasson2/ebook2audiobook
```
## Fine Tuned TTS models
#### Fine Tune your own XTTSv2 model

View File

@@ -1 +1 @@
25.10.25
25.11.11

221
app.py
View File

@@ -7,10 +7,10 @@ import socket
import subprocess
import sys
import tempfile
import time
import warnings
from importlib.metadata import version, PackageNotFoundError
from typing import Any, Optional, Union, Callable
from pathlib import Path
from lib import *
@@ -52,83 +52,125 @@ def check_and_install_requirements(file_path:str)->bool:
print(error)
return False
try:
from importlib.metadata import version, PackageNotFoundError
try:
from packaging.specifiers import SpecifierSet
from packaging.version import Version
from tqdm import tqdm
from packaging.markers import Marker
except ImportError:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'packaging'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'packaging', 'tqdm'])
from packaging.specifiers import SpecifierSet
from packaging.version import Version
from tqdm import tqdm
from packaging.markers import Marker
import re as regex
from tqdm import tqdm
flexible_packages = {"torch", "torchaudio", "numpy"}
torch_version = False
try:
import torch
torch_version = getattr(torch, '__version__', '')
devices['CUDA']['found'] = getattr(torch, "cuda", None) is not None and torch.cuda.is_available() and not (hasattr(torch.version, "hip") and torch.version.hip is not None)
devices['ROCM']['found'] = hasattr(torch.version, "hip") and torch.version.hip is not None and torch.cuda.is_available()
devices['MPS']['found'] = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
devices['XPU']['found'] = getattr(torch, "xpu", None) is not None and torch.xpu.is_available()
except ImportError:
pass
cuda_only_packages = ('deepspeed')
with open(file_path, 'r') as f:
contents = f.read().replace('\r', '\n')
packages = [
pkg.strip()
for pkg in contents.splitlines()
if pkg.strip() and regex.search(r'[a-zA-Z0-9]', pkg)
]
packages = [pkg.strip() for pkg in contents.splitlines() if pkg.strip() and regex.search(r'[a-zA-Z0-9]', pkg)]
if sys.version_info >= (3, 11):
packages.append("pymupdf-layout")
missing_packages = []
cuda_markers = ('+cu', '+xpu', '+nv', '+git')
for package in packages:
if ';' in package:
pkg_part, marker_part = package.split(';', 1)
marker_part = marker_part.strip()
try:
marker = Marker(marker_part)
if not marker.evaluate():
continue
except Exception as e:
error = f'Warning: Could not evaluate marker {marker_part} for {pkg_part}: {e}'
print(error)
package = pkg_part.strip()
if 'git+' in package or '://' in package:
pkg_name_match = regex.search(r'([\w\-]+)\s*@?\s*git\+', package)
pkg_name = pkg_name_match.group(1) if pkg_name_match else None
if pkg_name:
spec = importlib.util.find_spec(pkg_name)
if spec is None:
msg = f'{pkg_name} (git package) is missing.'
print(msg)
missing_packages.append(package)
else:
error = f'Unrecognized git package: {package}'
print(error)
missing_packages.append(package)
continue
clean_pkg = regex.sub(r'\[.*?\]', '', package)
pkg_name = regex.split(r'[<>=]', clean_pkg, 1)[0].strip()
pkg_name = regex.split(r'[<>=]', clean_pkg, maxsplit=1)[0].strip()
if pkg_name in cuda_only_packages:
has_cuda_build = False
if torch_version:
has_cuda_build = any(marker in torch_version for marker in cuda_markers)
if not has_cuda_build:
continue
try:
installed_version = version(pkg_name)
if pkg_name == 'num2words':
code = "ZH_CN"
spec = importlib.util.find_spec(f"num2words.lang_{code}")
if spec is None:
missing_packages.append(package)
except PackageNotFoundError:
error = f'{package} is missing.'
error = f'{pkg_name} is not installed.'
print(error)
missing_packages.append(package)
continue
if pkg_name in flexible_packages:
continue
if '+' in installed_version:
continue
else:
spec_str = clean_pkg[len(pkg_name):].strip()
if spec_str:
spec = SpecifierSet(spec_str)
# normalize installed version -> major.minor.patch (if available)
norm_match = regex.match(r'^(\d+\.\d+(?:\.\d+)?)', installed_version)
short_version = norm_match.group(1) if norm_match else installed_version
try:
installed_v = Version(short_version)
except Exception:
installed_v = Version("0")
# detect requirement version -> major.minor.patch (if available)
installed_v = Version('0')
req_match = regex.search(r'(\d+\.\d+(?:\.\d+)?)', spec_str)
if req_match:
req_v = Version(req_match.group(1))
imajor, iminor = installed_v.major, installed_v.minor
rmajor, rminor = req_v.major, req_v.minor
if "==" in spec_str:
if '==' in spec_str:
if imajor != rmajor or iminor != rminor:
error = f'{pkg_name} (installed {installed_version}) not in same major.minor as required {req_v}.'
print(error)
missing_packages.append(package)
elif ">=" in spec_str:
elif '>=' in spec_str:
if (imajor < rmajor) or (imajor == rmajor and iminor < rminor):
error = f'{pkg_name} (installed {installed_version}) < required {req_v}.'
print(error)
missing_packages.append(package)
elif "<=" in spec_str:
elif '<=' in spec_str:
if (imajor > rmajor) or (imajor == rmajor and iminor > rminor):
error = f'{pkg_name} (installed {installed_version}) > allowed {req_v}.'
print(error)
missing_packages.append(package)
elif ">" in spec_str:
elif '>' in spec_str:
if (imajor < rmajor) or (imajor == rmajor and iminor <= rminor):
error = f'{pkg_name} (installed {installed_version}) <= required {req_v}.'
print(error)
missing_packages.append(package)
elif "<" in spec_str:
elif '<' in spec_str:
if (imajor > rmajor) or (imajor == rmajor and iminor >= rminor):
error = f'{pkg_name} (installed {installed_version}) >= restricted {req_v}.'
print(error)
missing_packages.append(package)
else:
if installed_v not in spec:
error = (f'{pkg_name} (installed {installed_version}) does not satisfy "{spec_str}".')
error = f'{pkg_name} (installed {installed_version}) does not satisfy {spec_str}.'
print(error)
missing_packages.append(package)
if missing_packages:
@@ -136,25 +178,16 @@ def check_and_install_requirements(file_path:str)->bool:
print(msg)
tmp_dir = tempfile.mkdtemp()
os.environ['TMPDIR'] = tmp_dir
result = subprocess.call([sys.executable, '-m', 'pip', 'cache', 'purge'])
subprocess.call([sys.executable, '-m', 'pip', 'cache', 'purge'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
with tqdm(total=len(packages),
desc='Installation 0.00%',
bar_format='{desc}: {n_fmt}/{total_fmt} ',
unit='step') as t:
for package in tqdm(missing_packages, desc="Installing", unit="pkg"):
with tqdm(total = len(packages), desc = 'Installation 0.00%', bar_format = '{desc}: {n_fmt}/{total_fmt} ', unit = 'step') as t:
for package in tqdm(missing_packages, desc = 'Installing', unit = 'pkg'):
try:
if package == 'num2words':
pkgs = ['git+https://github.com/savoirfairelinux/num2words.git', '--force']
else:
pkgs = [package]
subprocess.check_call([
sys.executable, '-m', 'pip', 'install',
'--no-cache-dir', '--use-pep517',
*pkgs
])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', '--use-pep517', package])
t.update(1)
except subprocess.CalledProcessError as e:
if package in flexible_packages:
continue
error = f'Failed to install {package}: {e}'
print(error)
return False
@@ -175,7 +208,7 @@ def check_dictionary()->bool:
error = 'UniDic dictionary not found or incomplete. Downloading now...'
print(error)
subprocess.run(['python', '-m', 'unidic', 'download'], check=True)
except subprocess.CalledProcessError as e:
except (subprocess.CalledProcessError, ConnectionError, OSError) as e:
error = f'Failed to download UniDic dictionary. Error: {e}. Unable to continue without UniDic. Exiting...'
raise SystemExit(error)
return False
@@ -185,6 +218,26 @@ def is_port_in_use(port:int)->bool:
with socket.socket(socket.AF_INET,socket.SOCK_STREAM) as s:
return s.connect_ex(('0.0.0.0',port))==0
def kill_previous_instances(script_name: str):
current_pid = os.getpid()
this_script_path = os.path.realpath(script_name)
import psutil
for proc in psutil.process_iter(['pid', 'cmdline']):
try:
cmdline = proc.info['cmdline']
if not cmdline:
continue
# unify case and absolute paths for comparison
joined_cmd = ' '.join(cmdline).lower()
if this_script_path.lower().endswith(script_name.lower()) and \
(script_name.lower() in joined_cmd) and \
proc.info['pid'] != current_pid:
print(f"[WARN] Found running instance PID={proc.info['pid']} -> killing it.")
proc.kill()
proc.wait(timeout=3)
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
continue
def main()->None:
# Argument parser to handle optional parameters with descriptions
parser = argparse.ArgumentParser(
@@ -195,12 +248,12 @@ Windows:
Gradio/GUI:
ebook2audiobook.cmd
Headless mode:
ebook2audiobook.cmd --headless --ebook '/path/to/file'
ebook2audiobook.cmd --headless --ebook '/path/to/file' --language eng
Linux/Mac:
Gradio/GUI:
./ebook2audiobook.sh
Headless mode:
./ebook2audiobook.sh --headless --ebook '/path/to/file'
./ebook2audiobook.sh --headless --ebook '/path/to/file' --language eng
Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
''',
@@ -233,35 +286,35 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
headless_optional_group = parser.add_argument_group('optional parameters')
headless_optional_group.add_argument(options[7], type=str, default=None, help='''(Optional) Path to the voice cloning file for TTS engine.
Uses the default voice if not present.''')
headless_optional_group.add_argument(options[8], type=str, default=default_device, choices=device_list, help=f'''(Optional) Pprocessor unit type for the conversion.
Default is set in ./lib/conf.py if not present. Fall back to CPU if GPU not available.''')
headless_optional_group.add_argument(options[8], type=str, default=default_device, choices=list(devices.values()), help=f'''(Optional) Pprocessor unit type for the conversion.
Default is set in ./lib/conf.py if not present. Fall back to CPU if CUDA or MPS is not available.''')
headless_optional_group.add_argument(options[9], type=str, default=None, choices=tts_engine_list_keys+tts_engine_list_values, help=f'''(Optional) Preferred TTS engine (available are: {tts_engine_list_keys+tts_engine_list_values}.
Default depends on the selected language. The tts engine should be compatible with the chosen language''')
headless_optional_group.add_argument(options[10], type=str, default=None, help=f'''(Optional) Path to the custom model zip file cntaining mandatory model files.
Please refer to ./lib/models.py''')
headless_optional_group.add_argument(options[11], type=str, default=default_fine_tuned, help='''(Optional) Fine tuned model path. Default is builtin model.''')
headless_optional_group.add_argument(options[12], type=str, default=default_output_format, help=f'''(Optional) Output audio format. Default is set in ./lib/conf.py''')
headless_optional_group.add_argument(options[13], type=float, default=None, help=f"""(xtts only, optional) Temperature for the model.
headless_optional_group.add_argument(options[13], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['temperature'], help=f"""(xtts only, optional) Temperature for the model.
Default to config.json model. Higher temperatures lead to more creative outputs.""")
headless_optional_group.add_argument(options[14], type=float, default=None, help=f"""(xtts only, optional) A length penalty applied to the autoregressive decoder.
headless_optional_group.add_argument(options[14], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['length_penalty'], help=f"""(xtts only, optional) A length penalty applied to the autoregressive decoder.
Default to config.json model. Not applied to custom models.""")
headless_optional_group.add_argument(options[15], type=int, default=None, help=f"""(xtts only, optional) Controls how many alternative sequences the model explores. Must be equal or greater than length penalty.
headless_optional_group.add_argument(options[15], type=int, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['num_beams'], help=f"""(xtts only, optional) Controls how many alternative sequences the model explores. Must be equal or greater than length penalty.
Default to config.json model.""")
headless_optional_group.add_argument(options[16], type=float, default=None, help=f"""(xtts only, optional) A penalty that prevents the autoregressive decoder from repeating itself.
headless_optional_group.add_argument(options[16], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['repetition_penalty'], help=f"""(xtts only, optional) A penalty that prevents the autoregressive decoder from repeating itself.
Default to config.json model.""")
headless_optional_group.add_argument(options[17], type=int, default=None, help=f"""(xtts only, optional) Top-k sampling.
headless_optional_group.add_argument(options[17], type=int, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['top_k'], help=f"""(xtts only, optional) Top-k sampling.
Lower values mean more likely outputs and increased audio generation speed.
Default to config.json model.""")
headless_optional_group.add_argument(options[18], type=float, default=None, help=f"""(xtts only, optional) Top-p sampling.
headless_optional_group.add_argument(options[18], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['top_p'], help=f"""(xtts only, optional) Top-p sampling.
Lower values mean more likely outputs and increased audio generation speed. Default to config.json model.""")
headless_optional_group.add_argument(options[19], type=float, default=None, help=f"""(xtts only, optional) Speed factor for the speech generation.
headless_optional_group.add_argument(options[19], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['speed'], help=f"""(xtts only, optional) Speed factor for the speech generation.
Default to config.json model.""")
headless_optional_group.add_argument(options[20], action='store_true', help=f"""(xtts only, optional) Enable TTS text splitting. This option is known to not be very efficient.
Default to config.json model.""")
headless_optional_group.add_argument(options[21], type=float, default=None, help=f"""(bark only, optional) Text Temperature for the model.
Default to {default_engine_settings[TTS_ENGINES['BARK']]['text_temp']}. Higher temperatures lead to more creative outputs.""")
headless_optional_group.add_argument(options[22], type=float, default=None, help=f"""(bark only, optional) Waveform Temperature for the model.
Default to {default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp']}. Higher temperatures lead to more creative outputs.""")
headless_optional_group.add_argument(options[21], type=float, default=default_engine_settings[TTS_ENGINES['BARK']]['text_temp'], help=f"""(bark only, optional) Text Temperature for the model.
Default to config.json model.""")
headless_optional_group.add_argument(options[22], type=float, default=default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp'], help=f"""(bark only, optional) Waveform Temperature for the model.
Default to config.json model.""")
headless_optional_group.add_argument(options[23], type=str, help=f'''(Optional) Path to the output directory. Default is set in ./lib/conf.py''')
headless_optional_group.add_argument(options[24], action='version', version=f'ebook2audiobook version {prog_version}', help='''Show the version of the script and exit''')
headless_optional_group.add_argument(options[25], action='store_true', help=argparse.SUPPRESS)
@@ -304,17 +357,30 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
print(error)
sys.exit(1)
from lib.functions import SessionContext, convert_ebook_batch, convert_ebook, web_interface
ctx = SessionContext()
import lib.functions as f
f.context = f.SessionContext() if f.context is None else f.context
f.context_tracker = f.SessionTracker() if f.context_tracker is None else f.context_tracker
f.active_sessions = set() if f.active_sessions is None else f.active_sessions
# Conditions based on the --headless flag
if args['headless']:
args['is_gui_process'] = False
args['chapters_control'] = False
args['chapters_preview'] = False
args['event'] = ''
args['audiobooks_dir'] = os.path.abspath(args['output_dir']) if args['output_dir'] else audiobooks_cli_dir
args['device'] = 'cuda' if args['device'] == 'gpu' else args['device']
args['device'] = devices['CUDA'] if args['device'] == devices['CUDA'] else args['device']
args['tts_engine'] = TTS_ENGINES[args['tts_engine']] if args['tts_engine'] in TTS_ENGINES.keys() else args['tts_engine'] if args['tts_engine'] in TTS_ENGINES.values() else None
args['output_split'] = default_output_split
args['output_split_hours'] = default_output_split_hours
args['xtts_temperature'] = args['temperature']
args['xtts_length_penalty'] = args['length_penalty']
args['xtts_num_beams'] = args['num_beams']
args['xtts_repetition_penalty'] = args['repetition_penalty']
args['xtts_top_k'] = args['top_k']
args['xtts_top_p'] = args['top_p']
args['xtts_speed'] = args['speed']
args['xtts_enable_text_splitting'] = False
args['bark_text_temp'] = args['text_temp']
args['bark_waveform_temp'] = args['waveform_temp']
engine_setting_keys = {engine: list(settings.keys()) for engine, settings in default_engine_settings.items()}
valid_model_keys = engine_setting_keys.get(args['tts_engine'], [])
renamed_args = {}
@@ -349,7 +415,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
if any(file.endswith(ext) for ext in ebook_formats):
full_path = os.path.abspath(os.path.join(args['ebooks_dir'], file))
args['ebook_list'].append(full_path)
progress_status, passed = convert_ebook_batch(args, ctx)
progress_status, passed = f.convert_ebook_batch(args)
if passed is False:
error = f'Conversion failed: {progress_status}'
print(error)
@@ -360,7 +426,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
error = f'Error: The provided --ebook "{args["ebook"]}" does not exist.'
print(error)
sys.exit(1)
progress_status, passed = convert_ebook(args, ctx)
progress_status, passed = f.convert_ebook(args)
if passed is False:
error = f'Conversion failed: {progress_status}'
print(error)
@@ -375,10 +441,37 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
allowed_arguments = {'--share', '--script_mode'}
passed_args_set = {arg for arg in passed_arguments if arg.startswith('--')}
if passed_args_set.issubset(allowed_arguments):
web_interface(args, ctx)
try:
#script_name = os.path.basename(sys.argv[0])
#kill_previous_instances(script_name)
app = f.build_interface(args)
if app is not None:
app.queue(
default_concurrency_limit=interface_concurrency_limit
).launch(
debug=bool(int(os.environ.get('GRADIO_DEBUG', '0'))),
show_error=debug_mode, favicon_path='./favicon.ico',
server_name=interface_host,
server_port=interface_port,
share= args['share'],
max_file_size=max_upload_size
)
except OSError as e:
error = f'Connection error: {e}'
f.alert_exception(error, None)
except socket.error as e:
error = f'Socket error: {e}'
f.alert_exception(error, None)
except KeyboardInterrupt:
error = 'Server interrupted by user. Shutting down...'
f.alert_exception(error, None)
except Exception as e:
error = f'An unexpected error occurred: {e}'
f.alert_exception(error, None)
else:
error = 'Error: In non-headless mode, no option or only --share can be passed'
error = 'Error: In GUI mode, no option or only --share can be passed'
print(error)
sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -17,7 +17,7 @@ set "PYTHONUTF8=1"
set "PYTHONIOENCODING=utf-8"
set "CURRENT_ENV="
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox"
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox tesseract"
set "TMP=%SCRIPT_DIR%\tmp"
set "TEMP=%SCRIPT_DIR%\tmp"
@@ -78,11 +78,11 @@ exit /b
:conda_check
where /Q conda
if %errorlevel% neq 0 (
call rmdir /s /q "%CONDA_INSTALL_DIR%" 2>nul
echo Miniforge3 is not installed.
echo Miniforge3 is not installed.
set "CONDA_CHECK=1"
goto :install_components
)
:: Check if running in a Conda environment
if defined CONDA_DEFAULT_ENV (
set "CURRENT_ENV=%CONDA_PREFIX%"
@@ -158,7 +158,9 @@ if not "%CONDA_CHECK%"=="0" (
echo Conda installation failed.
goto :failed
)
call conda config --set auto_activate_base false
if not exist "%USERPROFILE%\.condarc" (
call conda config --set auto_activate false
)
call conda update conda -y
del "%CONDA_INSTALLER%"
set "CONDA_CHECK=0"
@@ -169,26 +171,66 @@ if not "%CONDA_CHECK%"=="0" (
:: Install missing packages one by one
if not "%PROGRAMS_CHECK%"=="0" (
echo Installing missing programs...
if "%SCOOP_CHECK%"=="0" (
call scoop bucket add muggle b https://github.com/hu3rror/scoop-muggle.git
call scoop bucket add extras
call scoop bucket add versions
)
for %%p in (%missing_prog_array%) do (
call scoop install %%p
set "prog=%%p"
if "%%p"=="nodejs" (
set "prog=node"
)
if "%%p"=="calibre-normal" set "prog=calibre"
where /Q !prog!
if !errorlevel! neq 0 (
echo %%p installation failed...
goto :failed
)
if "%SCOOP_CHECK%"=="0" (
call scoop bucket add muggle b https://github.com/hu3rror/scoop-muggle.git
call scoop bucket add extras
call scoop bucket add versions
)
call powershell -command "[System.Environment]::SetEnvironmentVariable('Path', [System.Environment]::GetEnvironmentVariable('Path', 'User') + '%SCOOP_SHIMS%;%SCOOP_APPS%;%CONDA_PATH%;%NODE_PATH%;', 'User')"
set "SCOOP_CHECK=0"
for %%p in (%missing_prog_array%) do (
set "prog=%%p"
call scoop install %%p
if "%%p"=="tesseract" (
where /Q !prog!
if !errorlevel! equ 0 (
set "syslang=%LANG%"
if not defined syslang set "syslang=en"
set "syslang=!syslang:~0,2!"
set "tesslang=eng"
if /I "!syslang!"=="fr" set "tesslang=fra"
if /I "!syslang!"=="de" set "tesslang=deu"
if /I "!syslang!"=="it" set "tesslang=ita"
if /I "!syslang!"=="es" set "tesslang=spa"
if /I "!syslang!"=="pt" set "tesslang=por"
if /I "!syslang!"=="ar" set "tesslang=ara"
if /I "!syslang!"=="tr" set "tesslang=tur"
if /I "!syslang!"=="ru" set "tesslang=rus"
if /I "!syslang!"=="bn" set "tesslang=ben"
if /I "!syslang!"=="zh" set "tesslang=chi_sim"
if /I "!syslang!"=="fa" set "tesslang=fas"
if /I "!syslang!"=="hi" set "tesslang=hin"
if /I "!syslang!"=="hu" set "tesslang=hun"
if /I "!syslang!"=="id" set "tesslang=ind"
if /I "!syslang!"=="jv" set "tesslang=jav"
if /I "!syslang!"=="ja" set "tesslang=jpn"
if /I "!syslang!"=="ko" set "tesslang=kor"
if /I "!syslang!"=="pl" set "tesslang=pol"
if /I "!syslang!"=="ta" set "tesslang=tam"
if /I "!syslang!"=="te" set "tesslang=tel"
if /I "!syslang!"=="yo" set "tesslang=yor"
echo Detected system language: !syslang! → downloading OCR language: !tesslang!
set "tessdata=%SCOOP_APPS%\tesseract\current\tessdata"
if not exist "!tessdata!\!tesslang!.traineddata" (
powershell -Command "Invoke-WebRequest -Uri https://github.com/tesseract-ocr/tessdata_best/raw/main/!tesslang!.traineddata -OutFile '!tessdata!\!tesslang!.traineddata'"
)
if exist "!tessdata!\!tesslang!.traineddata" (
echo Tesseract OCR language !tesslang! installed in !tessdata!
) else (
echo Failed to install OCR language !tesslang!
)
)
) else if "%%p"=="nodejs" (
set "prog=node"
) else if "%%p"=="calibre-normal" (
set "prog=calibre"
)
where /Q !prog!
if !errorlevel! neq 0 (
echo %%p installation failed...
goto :failed
)
)
call powershell -Command "[System.Environment]::SetEnvironmentVariable('Path', [System.Environment]::GetEnvironmentVariable('Path', 'User') + ';%SCOOP_SHIMS%;%SCOOP_APPS%;%CONDA_PATH%;%NODE_PATH%', 'User')"
set "SCOOP_CHECK=0"
set "PROGRAMS_CHECK=0"
set "missing_prog_array="
)

View File

@@ -1,20 +1,18 @@
torchvggish
numpy<2
num2words @ git+https://github.com/savoirfairelinux/num2words.git
regex
tqdm
cutlet
deep_translator
docker
ebooklib
fastapi
num2words
argostranslate
beautifulsoup4
fugashi
sudachipy
sudachidict_core
ray
PyMuPDF
pytesseract
unidic
pymupdf4llm
translate
hangul-romanize
indic-nlp-library
iso639-lang
@@ -25,14 +23,14 @@ pypinyin
pythainlp
mutagen
PyOpenGL
nvidia-ml-py
phonemizer-fork
pydub
pyannote-audio==3.4.0
demucs==4.0.1
gradio>=5.49
transformers==4.51.3
coqui-tts[languages]==0.26.0
torch>=2.8.0,<2.9
torchaudio>=2.8.0,<2.9
torchvggish
demucs
deepspeed
pyannote-audio<=3.4.0
stanza<=1.10.1
argostranslate<=1.10.0
gradio>=5.49.1
torch<=2.7.1
torchaudio<=2.7.1
coqui-tts[languages]==0.27.2

View File

@@ -1,14 +1,16 @@
#!/usr/bin/env bash
if [[ "$OSTYPE" = "darwin"* && -z "$SWITCHED_TO_ZSH" && "$(ps -p $$ -o comm=)" != "zsh" ]]; then
export SWITCHED_TO_ZSH=1
exec env zsh "$0" "$@"
export SWITCHED_TO_ZSH=1
exec env zsh "$0" "$@"
fi
unset SWITCHED_TO_ZSH
#unset SWITCHED_TO_ZSH
ARCH=$(uname -m)
PYTHON_VERSION="3.12"
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || echo "3.12")
MIN_PYTHON_VERSION="3.10"
MAX_PYTHON_VERSION="3.13"
export PYTHONUTF8="1"
export PYTHONIOENCODING="utf-8"
@@ -48,7 +50,7 @@ SCRIPT_MODE="$NATIVE"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WGET=$(which wget 2>/dev/null)
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox")
REQUIRED_PROGRAMS=("curl" "pkg-config" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox" "tesseract")
PYTHON_ENV="python_env"
CURRENT_ENV=""
@@ -60,9 +62,6 @@ fi
if [[ "$OSTYPE" = "darwin"* ]]; then
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-$(uname -m).sh"
CONFIG_FILE="$HOME/.zshrc"
if [[ "$ARCH" == "x86_64" ]]; then
PYTHON_VERSION="3.11"
fi
elif [[ "$OSTYPE" = "linux"* ]]; then
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
CONFIG_FILE="$HOME/.bashrc"
@@ -76,6 +75,20 @@ CONDA_ENV="$CONDA_INSTALL_DIR/etc/profile.d/conda.sh"
export TMPDIR="$SCRIPT_DIR/.cache"
export PATH="$CONDA_PATH:$PATH"
compare_versions() {
local ver1=$1
local ver2=$2
# Pad each version to 3 parts
IFS='.' read -r v1_major v1_minor <<<"$ver1"
IFS='.' read -r v2_major v2_minor <<<"$ver2"
((v1_major < v2_major)) && return 1
((v1_major > v2_major)) && return 2
((v1_minor < v2_minor)) && return 1
((v1_minor > v2_minor)) && return 2
return 0
}
# Check if the current script is run inside a docker container
if [[ -n "$container" || -f /.dockerenv ]]; then
SCRIPT_MODE="$FULL_DOCKER"
@@ -123,14 +136,37 @@ else
local programs=("$@")
programs_missing=()
for program in "${programs[@]}"; do
bin="$program"
if [ "$program" = "nodejs" ]; then
bin="node"
elif [ "$program" = "rust" ]; then
if command -v apt-get &> /dev/null; then
bin="rustc"
fi
if [ "$program" = "rust" ]; then
if command -v apt-get &>/dev/null; then
program="rustc"
fi
bin="rustc"
fi
if [ "$program" = "tesseract" ]; then
if command -v brew &> /dev/null; then
program="tesseract"
elif command -v emerge &> /dev/null; then
program="tesseract"
elif command -v dnf &> /dev/null; then
program="tesseract"
elif command -v yum &> /dev/null; then
program="tesseract"
elif command -v zypper &> /dev/null; then
program="tesseract-ocr"
elif command -v pacman &> /dev/null; then
program="tesseract"
elif command -v apt-get &> /dev/null; then
program="tesseract-ocr"
elif command -v apk &> /dev/null; then
program="tesseract-ocr"
else
echo "Cannot recognize your applications package manager. Please install the required applications manually."
return 1
fi
else
bin="$program"
fi
if ! command -v "$bin" >/dev/null 2>&1; then
echo -e "\e[33m$program is not installed.\e[0m"
@@ -156,8 +192,9 @@ else
if ! command -v brew &> /dev/null; then
echo -e "\e[33mHomebrew is not installed. Installing Homebrew...\e[0m"
/usr/bin/env bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
echo 'eval "$(/opt/homebrew/bin/brew shellenv)"' >> $HOME/.zprofile
eval "$(/opt/homebrew/bin/brew shellenv)"
echo >> $HOME/.zprofile
echo 'eval "$(/usr/local/bin/brew shellenv)"' >> $HOME/.zprofile
eval "$(/usr/local/bin/brew shellenv)"
fi
else
SUDO="sudo"
@@ -175,7 +212,7 @@ else
PACK_MGR="zypper install"
PACK_MGR_OPTIONS="-y"
elif command -v pacman &> /dev/null; then
PACK_MGR="pacman -Sy"
PACK_MGR="pacman -Sy --noconfirm"
elif command -v apt-get &> /dev/null; then
$SUDO apt-get update
PACK_MGR="apt-get install"
@@ -186,7 +223,6 @@ else
echo "Cannot recognize your applications package manager. Please install the required applications manually."
return 1
fi
fi
if [ -z "$WGET" ]; then
echo -e "\e[33m wget is missing! trying to install it... \e[0m"
@@ -200,9 +236,9 @@ else
fi
fi
for program in "${programs_missing[@]}"; do
if [ "$program" = "calibre" ];then
if [ "$program" = "calibre" ]; then
# avoid conflict with calibre builtin lxml
pip uninstall lxml -y 2>/dev/null
#pip uninstall lxml -y 2>/dev/null
echo -e "\e[33mInstalling Calibre...\e[0m"
if [[ "$OSTYPE" = "darwin"* ]]; then
eval "$PACK_MGR --cask calibre"
@@ -219,21 +255,75 @@ else
echo "$program installation failed."
fi
fi
elif [ "$program" = "rust" ]; then
if command -v apt-get &> /dev/null; then
app="rustc"
else
app="$program"
fi
elif [[ "$program" = "rust" || "$program" = "rustc" ]]; then
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source $HOME/.cargo/env
if command -v $app &>/dev/null; then
if command -v $program &>/dev/null; then
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
else
echo "$program installation failed."
fi
elif [[ "$program" = "tesseract" || "$program" = "tesseract-ocr" ]]; then
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
if command -v $program >/dev/null 2>&1; then
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
sys_lang=$(echo "${LANG:-en}" | cut -d_ -f1 | tr '[:upper:]' '[:lower:]')
case "$sys_lang" in
en) tess_lang="eng" ;;
fr) tess_lang="fra" ;;
de) tess_lang="deu" ;;
it) tess_lang="ita" ;;
es) tess_lang="spa" ;;
pt) tess_lang="por" ;;
ar) tess_lang="ara" ;;
tr) tess_lang="tur" ;;
ru) tess_lang="rus" ;;
bn) tess_lang="ben" ;;
zh) tess_lang="chi_sim" ;;
fa) tess_lang="fas" ;;
hi) tess_lang="hin" ;;
hu) tess_lang="hun" ;;
id) tess_lang="ind" ;;
jv) tess_lang="jav" ;;
ja) tess_lang="jpn" ;;
ko) tess_lang="kor" ;;
pl) tess_lang="pol" ;;
ta) tess_lang="tam" ;;
te) tess_lang="tel" ;;
yo) tess_lang="yor" ;;
*) tess_lang="eng" ;;
esac
echo "Detected system language: $sys_lang → installing Tesseract OCR language: $tess_lang"
langpack=""
if command -v brew &> /dev/null; then
langpack="tesseract-lang-$tess_lang"
elif command -v apt-get &>/dev/null; then
langpack="tesseract-ocr-$tess_lang"
elif command -v dnf &>/dev/null || command -v yum &>/dev/null; then
langpack="tesseract-langpack-$tess_lang"
elif command -v zypper &>/dev/null; then
langpack="tesseract-ocr-$tess_lang"
elif command -v pacman &>/dev/null; then
langpack="tesseract-data-$tess_lang"
elif command -v apk &>/dev/null; then
langpack="tesseract-ocr-$tess_lang"
else
echo "Cannot recognize your applications package manager. Please install the required applications manually."
return 1
fi
if [ -n "$langpack" ]; then
eval "$SUDO $PACK_MGR $langpack $PACK_MGR_OPTIONS"
if tesseract --list-langs | grep -q "$tess_lang"; then
echo "Tesseract OCR language '$tess_lang' successfully installed."
else
echo "Tesseract OCR language '$tess_lang' not installed properly."
fi
fi
else
echo "$program installation failed."
fi
else
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
if command -v $program >/dev/null 2>&1; then
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
else
@@ -251,18 +341,25 @@ else
function conda_check {
if ! command -v conda &> /dev/null || [ ! -f "$CONDA_ENV" ]; then
echo -e "\e[33mDownloading Miniforge3 installer...\e[0m"
if [[ "$OSTYPE" = "darwin"* ]]; then
if [[ "$OSTYPE" == darwin* ]]; then
curl -fsSLo "$CONDA_INSTALLER" "$CONDA_URL"
shell_name="zsh"
else
wget -O "$CONDA_INSTALLER" "$CONDA_URL"
shell_name="bash"
fi
if [[ -f "$CONDA_INSTALLER" ]]; then
echo -e "\e[33mInstalling Miniforge3...\e[0m"
bash "$CONDA_INSTALLER" -b -u -p "$CONDA_INSTALL_DIR"
rm -f "$CONDA_INSTALLER"
if [[ -f "$CONDA_INSTALL_DIR/bin/conda" ]]; then
$CONDA_INSTALL_DIR/bin/conda config --set auto_activate_base false
source $CONDA_ENV
if [ ! -f "$HOME/.condarc" ]; then
$CONDA_INSTALL_DIR/bin/conda config --set auto_activate false
fi
[ -f "$CONFIG_FILE" ] || touch "$CONFIG_FILE"
grep -qxF 'export PATH="$HOME/Miniforge3/bin:$PATH"' "$CONFIG_FILE" || echo 'export PATH="$HOME/Miniforge3/bin:$PATH"' >> "$CONFIG_FILE"
source "$CONFIG_FILE"
conda init "$shell_name"
echo -e "\e[32m===============>>> conda is installed! <<===============\e[0m"
else
echo -e "\e[31mconda installation failed.\e[0m"
@@ -275,8 +372,20 @@ else
fi
fi
if [[ ! -d "$SCRIPT_DIR/$PYTHON_ENV" ]]; then
if [[ "$OSTYPE" = "darwin"* && "$ARCH" = "x86_64" ]]; then
PYTHON_VERSION="3.11"
else
compare_versions "$PYTHON_VERSION" "$MIN_PYTHON_VERSION"
case $? in
1) PYTHON_VERSION="$MIN_PYTHON_VERSION" ;;
esac
compare_versions "$PYTHON_VERSION" "$MAX_PYTHON_VERSION"
case $? in
2) PYTHON_VERSION="$MAX_PYTHON_VERSION" ;;
esac
fi
# Use this condition to chmod writable folders once
chmod -R 777 ./audiobooks ./tmp ./models
chmod -R u+rwX,go+rX ./audiobooks ./tmp ./models
conda create --prefix "$SCRIPT_DIR/$PYTHON_ENV" python=$PYTHON_VERSION -y
conda init > /dev/null 2>&1
source $CONDA_ENV
@@ -286,7 +395,7 @@ else
python -m pip install --upgrade --no-cache-dir --use-pep517 --progress-bar=on -r requirements.txt
tts_version=$(python -c "import importlib.metadata; print(importlib.metadata.version('coqui-tts'))" 2>/dev/null)
if [[ -n "$tts_version" ]]; then
if [[ "$(printf '%s\n' "$tts_version" "0.26.1" | sort -V | tail -n1)" == "0.26.1" ]]; then
if [[ "$(printf '%s\n' "$tts_version" "0.26.1" | sort -V | tail -n1)" = "0.26.1" ]]; then
python -m pip install --no-cache-dir --use-pep517 --progress-bar=on 'transformers<=4.51.3'
fi
fi
@@ -295,24 +404,132 @@ else
return 0
}
function create_macos_app_bundle {
local APP_NAME="ebook2audiobook"
local APP_BUNDLE="$HOME/Applications/$APP_NAME.app"
local CONTENTS="$APP_BUNDLE/Contents"
local MACOS="$CONTENTS/MacOS"
local RESOURCES="$CONTENTS/Resources"
local ICON_PATH="$SCRIPT_DIR/icons/mac/appIcon.icns"
echo "🚀 Creating $APP_NAME.app bundle..."
mkdir -p "$MACOS" "$RESOURCES"
# Create the executable script inside the bundle
cat > "$MACOS/$APP_NAME" << EOF
#!/bin/bash
# Create a temporary script file to run in Terminal
TEMP_SCRIPT=\$(mktemp)
cat > "\$TEMP_SCRIPT" << 'SCRIPT'
#!/bin/bash
cd "$SCRIPT_DIR"
conda deactivate
bash ebook2audiobook.sh
# Wait 10 seconds for the server to start
sleep 10
# Open the browser
open http://localhost:7860/
SCRIPT
chmod +x "\$TEMP_SCRIPT"
# Open Terminal and run the script
open -a Terminal "\$TEMP_SCRIPT"
# Clean up the temp script after 60 seconds
sleep 60
rm "\$TEMP_SCRIPT"
EOF
chmod +x "$MACOS/$APP_NAME"
# Copy the icon to the bundle
if [ -f "$ICON_PATH" ]; then
cp "$ICON_PATH" "$RESOURCES/AppIcon.icns"
echo "✓ Icon copied to bundle"
else
echo "⚠️ Warning: Icon not found at $ICON_PATH"
fi
# Create the Info.plist file (required for macOS app bundles)
cat > "$CONTENTS/Info.plist" << 'PLIST'
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>en</string>
<key>CFBundleExecutable</key>
<string>ebook2audiobook</string>
<key>CFBundleIdentifier</key>
<string>com.local.ebook2audiobook</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>
<string>ebook2audiobook</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>CFBundleShortVersionString</key>
<string>1.0</string>
<key>CFBundleVersion</key>
<string>1</string>
<key>LSMinimumSystemVersion</key>
<string>10.9</string>
<key>NSPrincipalClass</key>
<string>NSApplication</string>
<key>CFBundleIconFile</key>
<string>AppIcon</string>
</dict>
</plist>
PLIST
echo "✓ Info.plist created"
# Update macOS cache to recognize the new app
touch "$APP_BUNDLE"
echo ""
echo "✅ Application bundle created successfully!"
echo "📍 Location: $APP_BUNDLE"
echo ""
}
function create_linux_app_launcher {
# Linux desktop entry creation goes here
return 0
}
function create_app_bundle {
if [[ "$OSTYPE" = "darwin"* ]]; then
create_macos_app_bundle
elif [[ "$OSTYPE" = "linux"* ]]; then
create_linux_app_launcher
fi
}
if [ "$SCRIPT_MODE" = "$FULL_DOCKER" ]; then
python app.py --script_mode "$SCRIPT_MODE" "${ARGS[@]}"
conda deactivate
conda deactivate
elif [ "$SCRIPT_MODE" = "$NATIVE" ]; then
pass=true
if [ "$SCRIPT_MODE" = "$NATIVE" ]; then
if ! required_programs_check "${REQUIRED_PROGRAMS[@]}"; then
if ! install_programs; then
pass=false
fi
pass=true
if ! required_programs_check "${REQUIRED_PROGRAMS[@]}"; then
if ! install_programs; then
pass=false
fi
fi
if [ $pass = true ]; then
if [ "$pass" = true ]; then
if conda_check; then
conda init > /dev/null 2>&1
source $CONDA_ENV
conda activate "$SCRIPT_DIR/$PYTHON_ENV"
create_app_bundle
python app.py --script_mode "$SCRIPT_MODE" "${ARGS[@]}"
conda deactivate
conda deactivate
@@ -323,4 +540,4 @@ else
fi
fi
exit 0
exit 0

View File

@@ -1,15 +1,15 @@
from .models import (
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
default_engine_settings, default_vc_model, default_voice_detection_model,
loaded_tts, max_custom_model, max_custom_voices,
max_tts_in_memory, max_upload_size, models, os, voices_dir
loaded_tts, xtts_builtin_speakers_list, max_custom_model, max_custom_voices,
max_upload_size, models, os, voices_dir
)
from .conf import (
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
default_audio_proc_format, default_device, default_gpu_wiki,
default_chapters_control, default_output_format, device_list, ebook_formats,
default_chapters_preview, default_output_format, devices, ebook_formats,
ebooks_dir, interface_component_options, interface_concurrency_limit,
interface_host, interface_port, interface_shared_tmp_expire,
max_python_version, min_python_version, models_dir, os,
@@ -31,15 +31,15 @@ __all__ = [
# from models
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
"loaded_tts", "max_custom_model",
"max_custom_voices", "max_tts_in_memory", "max_upload_size",
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
"max_custom_voices", "max_upload_size",
"models", "os", "voices_dir",
# from conf
"FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
"audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
"default_audio_proc_format", "default_device", "default_gpu_wiki",
"default_chapters_control", "default_output_format", "device_list", "ebook_formats",
"default_chapters_preview", "default_output_format", "devices", "ebook_formats",
"ebooks_dir", "interface_component_options", "interface_concurrency_limit",
"interface_host", "interface_port", "interface_shared_tmp_expire",
"max_python_version", "min_python_version", "models_dir", "os",

View File

@@ -3,7 +3,6 @@ import tempfile
import argostranslate.package
import argostranslate.translate
from typing import Any, Optional, Union, Callable
from iso639 import Lang
from lib.conf import models_dir
from lib.lang import language_mapping
@@ -50,7 +49,7 @@ class ArgosTranslator:
]
return language_translate_options
def get_all_target_packages(self,source_lang:str)->list[Any]:
def get_all_target_packages(self,source_lang:str)->list:
available_packages=argostranslate.package.get_available_packages()
return [pkg for pkg in available_packages if pkg.from_code==source_lang]
@@ -64,7 +63,7 @@ class ArgosTranslator:
error=f'is_package_installed() error: {e}'
return False
def download_and_install_argos_package(self,source_lang:str,target_lang:str)->tuple[Optional[str],bool]:
def download_and_install_argos_package(self,source_lang:str,target_lang:str)->tuple[str|None,bool]:
try:
if self.is_package_installed(source_lang,target_lang):
print(f"Package for translation from {source_lang} to {target_lang} is already installed.")
@@ -77,6 +76,9 @@ class ArgosTranslator:
target_package=pkg
break
if target_package:
#tmp_dir = os.path.join(session['process_dir'], "tmp")
#os.makedirs(tmp_dir, exist_ok=True)
#with tempfile.TemporaryDirectory(dir=tmp_dir) as tmpdirname:
with tempfile.TemporaryDirectory() as tmpdirname:
print(f"Downloading package for translation from {source_lang} to {target_lang}...")
package_path=target_package.download()
@@ -97,7 +99,7 @@ class ArgosTranslator:
error=f'AgrosTranslator.process() error: {e}'
return error,False
def start(self,source_lang:str,target_lang:str)->tuple[Optional[str],bool]:
def start(self,source_lang:str,target_lang:str)->tuple[str|None,bool]:
try:
if self.neural_machine!="argostranslate":
error=f"Neural machine '{self.neural_machine}' is not supported."

View File

@@ -2,7 +2,6 @@ import os
import numpy as np
import librosa
from typing import Any, Optional, Union, Callable
from pyannote.audio import Model
from pyannote.audio.pipelines import VoiceActivityDetection
from lib.conf import tts_dir

View File

@@ -2,7 +2,7 @@ import time
import logging
from queue import Queue, Empty
from typing import Any, Optional, Union, Callable
from typing import Any
class RedirectConsole:

View File

@@ -1,42 +1,38 @@
import subprocess, re, sys, gradio as gr
from typing import Any, Optional, Union, Callable
class SubprocessPipe:
def __init__(self,cmd:str,session:Any,total_duration:float):
self.cmd=cmd
self.session=session
self.total_duration=total_duration
self.process=None
self._stop_requested=False
self.progress_bar=None
self.start()
def _on_start(self)->None:
print('Export started')
if self.session.get('is_gui_process'):
def __init__(self,cmd:str, is_gui_process:bool, total_duration:float, msg:str='Processing'):
self.cmd = cmd
self.is_gui_process = is_gui_process
self.total_duration = total_duration
self.msg = msg
self.process = None
self._stop_requested = False
self.progress_bar = None
if self.is_gui_process:
self.progress_bar=gr.Progress(track_tqdm=False)
self.progress_bar(0.0,desc='Starting export...')
self._run_process()
def _on_progress(self,percent:float)->None:
sys.stdout.write(f'\rFinal Encoding: {percent:.1f}%')
sys.stdout.write(f'\r{self.msg}: {percent:.1f}%')
sys.stdout.flush()
if self.session.get('is_gui_process'):
self.progress_bar(percent/100,desc='Final Encoding')
if self.is_gui_process:
self.progress_bar(percent/100,desc=self.msg)
def _on_complete(self)->None:
print('\nExport completed successfully')
if self.session.get('is_gui_process'):
self.progress_bar(1.0,desc='Export completed')
msg = f"\n{self.msg} completed"
print(msg)
if self.is_gui_process:
self.progress_bar(1.0,desc=msg)
def _on_error(self,err:Exception)->None:
print(f'\nExport failed: {err}')
if self.session.get('is_gui_process'):
self.progress_bar(0.0,desc='Export failed')
def _on_error(self, err:Exception)->None:
error = f"\n{self.msg} failed: {err}"
print(error)
if self.is_gui_process:
self.progress_bar(0.0,desc=error)
def start(self)->bool:
def _run_process(self)->bool:
try:
self._on_start()
self.process=subprocess.Popen(
self.cmd,
stdout=subprocess.DEVNULL,
@@ -48,14 +44,11 @@ class SubprocessPipe:
last_percent=0.0
for raw_line in self.process.stderr:
line=raw_line.decode(errors='ignore')
if self._stop_requested or self.session.get('cancellation_requested'):
print('\nExport cancelled')
return self.stop()
match=time_pattern.search(raw_line)
if match and self.total_duration>0:
if match and self.total_duration > 0:
current_time=int(match.group(1))/1_000_000
percent=min((current_time/self.total_duration)*100,100)
if abs(percent-last_percent)>=0.5:
if abs(percent-last_percent) >= 0.5:
self._on_progress(percent)
last_percent=percent
elif b'progress=end' in raw_line:

View File

@@ -1,24 +1,26 @@
import hashlib
import math
import os
import shutil
import subprocess
import tempfile
import threading
import uuid
import numpy as np
import regex as re
import soundfile as sf
import torch
import torchaudio
_original_load = torch.load
def patched_torch_load(*args, **kwargs):
kwargs.setdefault("weights_only", False)
return _original_load(*args, **kwargs)
torch.load = patched_torch_load
import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
import numpy as np, regex as re, soundfile as sf, torchaudio
import gc
from typing import Any
from multiprocessing.managers import DictProxy
from torch import Tensor
from huggingface_hub import hf_hub_download
from pathlib import Path
from pprint import pprint
from lib import *
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
from lib.classes.tts_engines.common.utils import cleanup_garbage, unload_tts, append_sentence2vtt
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
#import logging
@@ -27,149 +29,266 @@ from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_aud
lock = threading.Lock()
class Coqui:
def __init__(self, session):
def __init__(self,session:DictProxy):
try:
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
self.npz_path = None
self.npz_data = None
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params = {TTS_ENGINES['NEW_TTS']: {}}
self.params={TTS_ENGINES['XXX']:{}
self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache = {}
self.audio_segments = []
self._build()
if not xtts_builtin_speakers_list:
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XXX']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XXX']]['files'][4], cache_dir=self.cache_dir)
xtts_builtin_speakers_list = torch.load(self.speakers_path)
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
if using_gpu and enough_vram:
if devices['CUDA']['found'] or devices['ROCM']['found']:
torch.cuda.set_per_process_memory_fraction(0.95)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
else:
if devices['CUDA']['found'] or devices['ROCM']['found']:
torch.cuda.set_per_process_memory_fraction(0.7)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
self._load_engine()
self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
print(error)
def _load_api(self, key:str, model_path:str, device:str)->Any:
global lock
try:
with lock:
unload_tts()
from XXX import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
###########
###### Load XXX api
# engine =
###########
if engine:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _build(self):
try:
tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
if not tts:
if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
else:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
tts = self._load_api(self.tts_key, model_path, self.session['device'])
return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
except Exception as e:
error = f'build() error: {e}'
print(error)
return False
def _load_api(self, key, model_path, device):
def _load_checkpoint(self,**kwargs:Any)->Any:
global lock
try:
if key in loaded_tts.keys():
return loaded_tts[key]['engine']
unload_tts(device, [self.tts_key, self.tts_vc_key])
with lock:
tts = NEW_TTS(model_path)
if tts
if device == 'cuda':
NEW_TTS.WITH_CUDA
else:
NEW_TTS.WITHOUT_CUDA
loaded_tts[key] = {"engine": tts, "config": None}
msg = f'{model_path} Loaded!'
print(msg)
return tts
else:
error = 'TTS engine could not be created!'
print(error)
except Exception as e:
error = f'_load_api() error: {e}'
print(error)
return False
def _load_checkpoint(self, **kwargs):
global lock
try:
key = kwargs.get('key')
if key in loaded_tts.keys():
return loaded_tts[key]['engine']
tts_engine = kwargs.get('tts_engine')
device = kwargs.get('device')
unload_tts(device, [self.tts_key])
with lock:
checkpoint_dir = kwargs.get('checkpoint_dir')
NEW_TTS.LOAD_CHECKPOINT(
config,
checkpoint_dir=checkpoint_dir,
eval=True
)
if tts:
if device == 'cuda':
NEW_TTS.WITH_CUDA
else:
NEW_TTS.WITHOUT_CUDA
loaded_tts[key] = {"engine": tts, "config": config}
msg = f'{tts_engine} Loaded!'
print(msg)
return tts
else:
error = 'TTS engine could not be created!'
print(error)
key = kwargs.get('key')
device = kwargs.get('device')
unload_tts()
engine = loaded_tts.get(key, False)
if not engine:
engine_name = kwargs.get('tts_engine', None)
if engine_name == TTS_ENGINES['XXX']:
from XXX import XXXConfig
from XXX import XXXtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
if not checkpoint_path or not os.path.exists(checkpoint_path):
raise FileNotFoundError(f"Missing or invalid checkpoint_path: {checkpoint_path}")
return False
if not config_path or not os.path.exists(config_path):
raise FileNotFoundError(f"Missing or invalid config_path: {config_path}")
return False
###########
###### Load XXX checkpoint
# engine =
###########
)
if engine:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f'_load_checkpoint() error: {e}'
return False
print(error)
return None
def _load_engine(self)->None:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_garbage()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
if self.session['tts_engine'] == TTS_ENGINES['XXX']:
if self.session['custom_model'] is not None:
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
if self.engine:
self.session['model_cache'] = self.tts_key
msg = f'TTS {key} Loaded!'
except Exception as e:
error = f'_load_engine() error: {e}'
def _load_engine_zs(self)->Any:
try:
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
cleanup_garbage()
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not self.engine_zs:
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model, self.session['device'])
if self.engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
msg = f'ZeroShot {key} Loaded!'
except Exception as e:
error = f'_load_engine_zs() error: {e}'
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str, device:str)->str|bool:
try:
voice_parts = Path(voice_path).parts
if(self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng'):
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
cleanup_garbage()
engine = loaded_tts.get(key, False)
if not engine:
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
if engine:
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
"xtts_temperature": float,
"xtts_length_penalty": float,
"xtts_num_beams": int,
"xtts_repetition_penalty": float,
"xtts_top_k": int,
"xtts_top_p": float,
"xtts_speed": float,
"xtts_enable_text_splitting": bool,
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
return new_voice_path
else:
error = 'normalize_audio() error:'
else:
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
else:
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
return False
else:
return voice_path
else:
return voice_path
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
print(error)
return False
def _tensor_type(self, audio_data):
if isinstance(audio_data, torch.Tensor):
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data,torch.Tensor):
return audio_data
elif isinstance(audio_data, np.ndarray):
elif isinstance(audio_data,np.ndarray):
return torch.from_numpy(audio_data).float()
elif isinstance(audio_data, list):
return torch.tensor(audio_data, dtype=torch.float32)
elif isinstance(audio_data,list):
return torch.tensor(audio_data,dtype=torch.float32)
else:
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
def _get_resampler(self, orig_sr, target_sr):
key = (orig_sr, target_sr)
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
key=(orig_sr,target_sr)
if key not in self.resampler_cache:
self.resampler_cache[key] = torchaudio.transforms.Resample(
orig_freq=orig_sr, new_freq=target_sr
self.resampler_cache[key]=torchaudio.transforms.Resample(
orig_freq = orig_sr,new_freq = target_sr
)
return self.resampler_cache[key]
def _resample_wav(self, wav_path, expected_sr):
waveform, orig_sr = torchaudio.load(wav_path)
if orig_sr == expected_sr and waveform.size(0) == 1:
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
waveform,orig_sr = torchaudio.load(wav_path)
if orig_sr==expected_sr and waveform.size(0)==1:
return wav_path
if waveform.size(0) > 1:
waveform = waveform.mean(dim=0, keepdim=True)
if orig_sr != expected_sr:
resampler = self._get_resampler(orig_sr, expected_sr)
if waveform.size(0)>1:
waveform = waveform.mean(dim=0,keepdim=True)
if orig_sr!=expected_sr:
resampler = self._get_resampler(orig_sr,expected_sr)
waveform = resampler(waveform)
wav_tensor = waveform.squeeze(0)
wav_numpy = wav_tensor.cpu().numpy()
tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
os.path.join(self.session['process_dir'], 'tmp')
os.makedirs(tmp_dir, exist_ok=True)
tmp_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False)
tmp_path = tmp_fh.name
tmp_fh.close()
sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
return tmp_path
def convert(self, sentence_number, sentence):
def convert(self, sentence_index:int, sentence:str)->bool:
global xtts_builtin_speakers_list
try:
speaker = None
audio_data = False
trim_audio_buffer = 0.004
audio_sentence = False
settings = self.params[self.session['tts_engine']]
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
sentence = sentence.strip()
settings['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
@@ -177,56 +296,112 @@ class Coqui:
)
if settings['voice_path'] is not None:
speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
if tts:
if sentence[-1].isalnum():
sentence = f'{sentence}'
if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and os.path.basename(settings['voice_path']) != 'ref.wav':
self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker, self.session['device'])
if not settings['voice_path']:
msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
print(msg)
return False
if self.engine:
self.engine.to(self.session['device'])
trim_audio_buffer = 0.004
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
if sentence == TTS_SML['break']:
break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100))) # 0.4 to 0.7 seconds
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
self.audio_segments.append(break_tensor.clone())
return True
elif sentence == TTS_SML['pause']:
pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100))) # 1.0 to 1.8 seconds
elif not sentence.replace('', '').strip() or sentence == TTS_SML['pause']:
silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 1.0 to 1.8 seconds
self.audio_segments.append(pause_tensor.clone())
return True
else:
if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
audio_sentence = NEW_TTS.CONVERT() # audio_sentence must be torch.Tensor or (list, tuple) or np.ndarray
if sentence[-1].isalnum():
sentence = f'{sentence}'
elif sentence.endswith("'"):
sentence = sentence[:-1]
if self.session['tts_engine'] == TTS_ENGINES['XXX']:
trim_audio_buffer = 0.008
if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
else:
msg = 'Computing speaker latents...'
print(msg)
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
fine_tuned_params = {
key.removeprefix("xxx_"): cast_type(self.session[key])
for key, cast_type in {
"xxx_temperature": float,
"xxx_length_penalty": float,
"xxx_num_beams": int,
"xxx_repetition_penalty": float,
"xxx_top_k": int,
"xxx_top_p": float,
"xxx_speed": float,
"xxx_enable_text_splitting": bool
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
result = self.engine.inference(
text=sentence.replace('.', ''),
language=self.session['language_iso1'],
gpt_cond_latent=settings['gpt_cond_latent'],
speaker_embedding=settings['speaker_embedding'],
**fine_tuned_params
)
audio_sentence = result.get('wav')
if is_audio_data_valid(audio_sentence):
audio_sentence = audio_sentence.tolist()
if is_audio_data_valid(audio_sentence):
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
if sentence[-1].isalnum() or sentence[-1] == '':
audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
self.audio_segments.append(audio_tensor)
if not re.search(r'\w$', sentence, flags=re.UNICODE):
break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
self.audio_segments.append(break_tensor.clone())
if self.audio_segments:
audio_tensor = torch.cat(self.audio_segments, dim=-1)
start_time = self.sentences_total_time
duration = audio_tensor.shape[-1] / settings['samplerate']
end_time = start_time + duration
self.sentences_total_time = end_time
sentence_obj = {
"start": start_time,
"end": end_time,
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
del audio_tensor
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
else:
error = f"Cannot create {final_sentence_file}"
print(error)
audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
if audio_tensor is not None and audio_tensor.numel() > 0:
self.audio_segments.append(audio_tensor)
if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '':
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
self.audio_segments.append(break_tensor.clone())
if self.audio_segments:
audio_tensor = torch.cat(self.audio_segments, dim=-1)
start_time = self.sentences_total_time
duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
end_time = start_time + duration
self.sentences_total_time = end_time
sentence_obj = {
"start": start_time,
"end": end_time,
"text": sentence,
"resume_check": self.sentence_idx
}
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_garbage()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
else:
error = f"Cannot create {final_sentence_file}"
print(error)
return False
else:
error = f"audio_sentence not valide"
print(error)
return False
else:
error = f"convert() error: {self.session['tts_engine']} is None"
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
print(error)
return False
except Exception as e:
error = f'Coquit.convert(): {e}'
error = f'XXX.convert(): {e}'
raise ValueError(e)
return False
return False

View File

@@ -2,13 +2,16 @@ import numpy as np
import torch
import subprocess
import shutil
import json
from torch import Tensor
from typing import Any, Optional, Union, Callable
from typing import Any, Union
from scipy.io import wavfile as wav
from scipy.signal import find_peaks
def detect_gender(voice_path:str)->Optional[str]:
from lib.classes.subprocess_pipe import SubprocessPipe
def detect_gender(voice_path:str)->str|None:
try:
samplerate, signal = wav.read(voice_path)
# Ensure mono
@@ -57,7 +60,29 @@ def trim_audio(audio_data: Union[list[float], Tensor], samplerate: int, silence_
raise TypeError(error)
return torch.tensor([], dtype=torch.float32)
def normalize_audio(input_file:str, output_file:str, samplerate:int)->bool:
def get_audio_duration(filepath:str)->float:
try:
ffprobe_cmd = [
shutil.which('ffprobe'),
'-v', 'error',
'-show_entries', 'format=duration',
'-of', 'json',
filepath
]
result = subprocess.run(ffprobe_cmd, capture_output=True, text=True)
try:
return float(json.loads(result.stdout)['format']['duration'])
except Exception:
return 0
except subprocess.CalledProcessError as e:
DependencyError(e)
return 0
except Exception as e:
error = f"get_audio_duration() Error: Failed to process {txt_file}{out_file}: {e}"
print(error)
return 0
def normalize_audio(input_file:str, output_file:str, samplerate:int, is_gui_process:bool)->bool:
filter_complex = (
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
'afftdn=nf=-70,'
@@ -70,24 +95,17 @@ def normalize_audio(input_file:str, output_file:str, samplerate:int)->bool:
'equalizer=f=9000:t=q:w=2:g=-2,'
'highpass=f=63[audio]'
)
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
ffmpeg_cmd += [
cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
cmd += [
'-filter_complex', filter_complex,
'-map', '[audio]',
'-ar', str(samplerate),
'-y', output_file
]
try:
subprocess.run(
ffmpeg_cmd,
env={},
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
encoding='utf-8',
errors='ignore'
)
proc_pipe = SubprocessPipe(cmd, is_gui_process=is_gui_process, total_duration=get_audio_duration(input_file), msg='Normalize')
if proc_pipe:
return True
except subprocess.CalledProcessError as e:
else:
error = f"normalize_audio() error: {input_file}: {e}"
print(error)
return False

View File

@@ -1,31 +1,35 @@
import os
import gc
import torch
import regex as re
import stanza
from typing import Any, Optional, Union, Callable
from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
from typing import Any, Union
from lib.models import loaded_tts, TTS_ENGINES
from lib.functions import context
def unload_tts(device:str, reserved_keys:Optional[list[str]] = None, tts_key:Optional[str] = None)->bool:
def cleanup_garbage():
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
torch.cuda.synchronize()
def unload_tts()->None:
try:
if len(loaded_tts) >= max_tts_in_memory:
if reserved_keys is None:
reserved_keys = []
if tts_key is not None:
if tts_key in loaded_tts:
del loaded_tts[tts_key]
if device == "cuda":
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
else:
for key in list(loaded_tts.keys()):
if key not in reserved_keys:
del loaded_tts[key]
return True
active_models = {
cache
for session in context.sessions.values()
for cache in (session.get('model_cache'), session.get('model_zs_cache'), session.get('stanza_cache'))
if cache is not None
}
for key in list(loaded_tts.keys()):
if key not in active_models:
del loaded_tts[key]
cleanup_garbage()
except Exception as e:
error = f"unload_tts() error: {e}"
print(error)
return False
def append_sentence2vtt(sentence_obj:dict[str, Any], path:str)->Union[int, bool]:

View File

@@ -1,74 +1,170 @@
import torch
from typing import Any, Optional, Union, Callable
_original_load = torch.load
def patched_torch_load(*args, **kwargs):
kwargs.setdefault("weights_only", False)
return _original_load(*args, **kwargs)
torch.load = patched_torch_load
import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
import numpy as np, regex as re, soundfile as sf, torchaudio
import gc
from typing import Any
from multiprocessing.managers import DictProxy
from torch import Tensor
from huggingface_hub import hf_hub_download
from pathlib import Path
from pprint import pprint
from lib import *
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
from lib.classes.tts_engines.common.utils import cleanup_garbage, unload_tts, append_sentence2vtt
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
#import logging
#logging.basicConfig(level=logging.DEBUG)
lock = threading.Lock()
xtts_builtin_speakers_list = None
class Coqui:
def __init__(self,session:Any):
def __init__(self,session:DictProxy):
try:
global xtts_builtin_speakers_list
self.session = session
self.cache_dir = tts_dir
self.speakers_path = None
self.tts = None
self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
self.tts_vc_key = default_vc_model.rsplit('/',1)[-1]
self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported()==True else False
self.npz_path = None
self.npz_data = None
self.engine = None
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
self.engine_zs = None
self.pth_voice_file = None
self.sentences_total_time = 0.0
self.sentence_idx = 1
self.params={TTS_ENGINES['XTTSv2']:{"latent_embedding":{}},TTS_ENGINES['BARK']:{},TTS_ENGINES['VITS']:{"semitones":{}},TTS_ENGINES['FAIRSEQ']:{"semitones":{}},TTS_ENGINES['TACOTRON2']:{"semitones":{}},TTS_ENGINES['YOURTTS']:{}}
self.params[self.session['tts_engine']]['samplerate']=models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
self.resampler_cache={}
self.audio_segments=[]
self._build()
self.resampler_cache = {}
self.audio_segments = []
if not xtts_builtin_speakers_list:
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
xtts_builtin_speakers_list = torch.load(self.speakers_path)
using_gpu = self.session['device'] != devices['CPU']['proc']
enough_vram = self.session['free_vram_gb'] > 4.0
if using_gpu and enough_vram:
if devices['CUDA']['found'] or devices['ROCM']['found']:
torch.cuda.set_per_process_memory_fraction(0.95)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
else:
if devices['CUDA']['found'] or devices['ROCM']['found']:
torch.cuda.set_per_process_memory_fraction(0.7)
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.allow_tf32 = False
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
self._load_engine()
self._load_engine_zs()
except Exception as e:
error = f'__init__() error: {e}'
print(error)
def _build(self)->bool:
def _load_api(self, key:str, model_path:str, device:str)->Any:
global lock
try:
global xtts_builtin_speakers_list
load_zeroshot = True if self.session['tts_engine'] in [TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']] else False
self.tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
if not self.tts:
if xtts_builtin_speakers_list is None:
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
xtts_builtin_speakers_list = torch.load(self.speakers_path)
with lock:
unload_tts()
from TTS.api import TTS as TTSEngine
engine = loaded_tts.get(key, False)
if not engine:
engine = TTSEngine(model_path)
if engine:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return None
def _load_checkpoint(self,**kwargs:Any)->Any:
global lock
try:
with lock:
key = kwargs.get('key')
device = kwargs.get('device')
unload_tts()
engine = loaded_tts.get(key, False)
if not engine:
engine_name = kwargs.get('tts_engine', None)
if engine_name == TTS_ENGINES['XTTSv2']:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
if not checkpoint_path or not os.path.exists(checkpoint_path):
raise FileNotFoundError(f"Missing or invalid checkpoint_path: {checkpoint_path}")
return False
if not config_path or not os.path.exists(config_path):
raise FileNotFoundError(f"Missing or invalid config_path: {config_path}")
return False
config = XttsConfig()
config.models_dir = os.path.join("models","tts")
config.load_json(config_path)
engine = Xtts.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_path = checkpoint_path,
vocab_path = vocab_path,
use_deepspeed = default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'] if self.session['device'] in [devices['CUDA']['proc'], devices['XPU']['proc'], devices['ROCM']['proc']] else False,
eval = True
)
elif engine_name == TTS_ENGINES['BARK']:
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
checkpoint_dir = kwargs.get('checkpoint_dir')
if not checkpoint_dir or not os.path.exists(checkpoint_dir):
raise FileNotFoundError(f"Missing or invalid checkpoint_dir: {checkpoint_dir}")
return False
config = BarkConfig()
config.CACHE_DIR = self.cache_dir
config.USE_SMALLER_MODELS = True if os.environ['SUNO_USE_SMALL_MODELS'] == 'True' else False
engine = Bark.init_from_config(config)
engine.load_checkpoint(
config,
checkpoint_dir = checkpoint_dir,
eval = True
)
if engine:
loaded_tts[key] = engine
return engine
except Exception as e:
error = f'_load_checkpoint() error: {e}'
print(error)
return None
def _load_engine(self)->None:
try:
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
print(msg)
cleanup_garbage()
self.engine = loaded_tts.get(self.tts_key, False)
if not self.engine:
if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
msg = f"Loading TTS {self.session['tts_engine']} model, it takes a while, please be patient..."
print(msg)
if self.session['custom_model'] is not None:
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
else:
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
if self.session['fine_tuned'] == 'internal':
@@ -80,12 +176,11 @@ class Coqui:
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
else:
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
@@ -93,12 +188,11 @@ class Coqui:
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
checkpoint_dir = os.path.dirname(text_model_path)
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
print(msg)
else:
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
@@ -106,28 +200,23 @@ class Coqui:
if sub is not None:
self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
print(msg)
self.tts_key = model_path
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
else:
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
print(msg)
return False
elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
else:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
self.tts_key = model_path
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
print(msg)
else:
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
@@ -138,126 +227,39 @@ class Coqui:
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
if sub is not None:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
print(msg)
self.tts_key = model_path
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
else:
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
print(msg)
return False
elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
if self.session['custom_model'] is not None:
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
print(msg)
return False
else:
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
if load_zeroshot:
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
if not tts_vc:
if self.session['voice'] is not None:
msg = f"Loading TTS {self.tts_vc_key} zeroshot model, it takes a while, please be patient..."
print(msg)
tts_vc = self._load_api(self.tts_vc_key, default_vc_model, self.session['device'])
return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
if self.engine:
self.session['model_cache'] = self.tts_key
msg = f'TTS {key} Loaded!'
except Exception as e:
error = f'build() error: {e}'
print(error)
return False
error = f'_load_engine() error: {e}'
def _load_api(self, key: str, model_path: str, device: str) -> bool | Any:
global lock
def _load_engine_zs(self)->Any:
try:
if key in loaded_tts:
print(f"Reusing cached TTS engine for key: {key}")
tts = loaded_tts[key]['engine']
return tts
unload_tts(device, [self.tts_key, self.tts_vc_key])
from TTS.api import TTS as CoquiAPI
with lock:
print(f"Loading Coqui model from: {model_path}")
tts = CoquiAPI(model_path)
if not tts:
return False
if device == "cuda" and torch.cuda.is_available():
tts.cuda()
elif device == "mps" and torch.backends.mps.is_available():
tts.to(torch.device("mps"))
else:
tts.to(device)
loaded_tts[key] = {"engine": tts, "config": None}
msg = f"Model loaded successfully: {model_path} ({device})"
print(msg)
return tts
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
print(msg)
cleanup_garbage()
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
if not self.engine_zs:
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model, self.session['device'])
if self.engine_zs:
self.session['model_zs_cache'] = self.tts_zs_key
msg = f'ZeroShot {key} Loaded!'
except Exception as e:
error = f"_load_api() error: {e}"
print(error)
return False
def _load_checkpoint(self,**kwargs:Any)->bool|Any:
global lock
try:
key = kwargs.get('key')
if key in loaded_tts.keys():
return loaded_tts[key]['engine']
tts_engine = kwargs.get('tts_engine')
device = kwargs.get('device')
unload_tts(device,[self.tts_key,self.tts_vc_key])
with lock:
if tts_engine==TTS_ENGINES['XTTSv2']:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
checkpoint_path = kwargs.get('checkpoint_path')
config_path = kwargs.get('config_path',None)
vocab_path = kwargs.get('vocab_path',None)
config = XttsConfig()
config.models_dir = os.path.join("models","tts")
config.load_json(config_path)
tts = Xtts.init_from_config(config)
tts.load_checkpoint(
config,
checkpoint_path = checkpoint_path,
vocab_path = vocab_path,
use_deepspeed = default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'],
eval = True
)
elif tts_engine==TTS_ENGINES['BARK']:
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
checkpoint_dir = kwargs.get('checkpoint_dir')
config = BarkConfig()
config.CACHE_DIR = self.cache_dir
config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS','').lower()=='true'
tts = Bark.init_from_config(config)
tts.load_checkpoint(
config,
checkpoint_dir = checkpoint_dir,
eval = True
)
if tts:
if device=='cuda':
tts.cuda()
else:
if device=='mps':
tts.to(torch.device('mps'))
else:
tts.to(device)
loaded_tts[key]={"engine":tts,"config":config}
msg = f'{tts_engine} Loaded!'
print(msg)
return tts
else:
error='TTS engine could not be created!'
print(error)
except Exception as e:
error = f'_load_checkpoint() error: {e}'
return False
error = f'_load_engine_zs() error: {e}'
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str, device:str)->str|bool:
def _valid_tensor(t:Any):
return isinstance(t, torch.Tensor) and not (torch.isnan(t).any() or torch.isinf(t).any())
try:
voice_parts = Path(voice_path).parts
if(self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng'):
@@ -266,23 +268,22 @@ class Coqui:
if os.path.exists(default_text_file):
msg = f"Converting builtin eng voice to {self.session['language']}..."
print(msg)
tts_internal_key = f"{TTS_ENGINES['XTTSv2']}-internal"
key = f"{TTS_ENGINES['XTTSv2']}-internal"
default_text = Path(default_text_file).read_text(encoding="utf-8")
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
self.tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
if not self.tts:
for key in list(loaded_tts.keys()):
unload_tts(device, None, key)
cleanup_garbage()
engine = loaded_tts.get(key, False)
if not engine:
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
hf_sub = ''
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
self.tts = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=tts_internal_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
if self.tts:
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
if engine:
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
gpt_cond_latent, speaker_embedding = self.tts.get_conditioning_latents(audio_path=[voice_path])
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
for key, cast_type in {
@@ -298,27 +299,27 @@ class Coqui:
if self.session.get(key) is not None
}
with torch.no_grad():
result = self.tts.inference(
result = engine.inference(
text=default_text.strip(),
language=self.session['language_iso1'],
gpt_cond_latent=gpt_cond_latent,
speaker_embedding=speaker_embedding,
**fine_tuned_params,
)
audio_data = result.get('wav') if isinstance(result, dict) else None
if audio_data is not None:
audio_data = audio_data.tolist()
sourceTensor = self._tensor_type(audio_data)
audio_sentence = result.get('wav') if isinstance(result, dict) else None
if audio_sentence is not None:
audio_sentence = audio_sentence.tolist()
sourceTensor = self._tensor_type(audio_sentence)
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
# CON is a reserved name on windows
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate):
del audio_data, sourceTensor, audio_tensor
if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
del self.tts
unload_tts(device, None, tts_internal_key)
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
del audio_sentence, sourceTensor, audio_tensor
Path(proc_voice_path).unlink(missing_ok=True)
gc.collect()
return new_voice_path
else:
error = 'normalize_audio() error:'
@@ -329,6 +330,7 @@ class Coqui:
else:
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
print(error)
return False
else:
return voice_path
else:
@@ -336,68 +338,52 @@ class Coqui:
except Exception as e:
error = f'_check_xtts_builtin_speakers() error: {e}'
print(error)
return False
return False
def _check_bark_npz(self,voice_path:str,bark_dir:str,speaker:str,device:str)->bool:
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str, device:str)->bool:
try:
if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
npz_dir = os.path.join(bark_dir,speaker)
npz_file = os.path.join(npz_dir,f'{speaker}.npz')
if os.path.exists(npz_file):
pth_voice_dir = os.path.join(bark_dir, speaker)
pth_voice_file = os.path.join(pth_voice_dir,f'{speaker}.pth')
if os.path.exists(pth_voice_file):
return True
else:
os.makedirs(npz_dir,exist_ok=True)
tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
hf_repo = models[TTS_ENGINES['BARK']]['internal']['repo']
hf_sub = models[TTS_ENGINES['BARK']]['internal']['sub']
self.tts = (loaded_tts.get(tts_internal_key) or {}).get('engine',False)
if not self.tts:
for key in list(loaded_tts.keys()):unload_tts(device,None,key)
text_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][0]}",cache_dir=self.cache_dir)
coarse_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][1]}",cache_dir=self.cache_dir)
fine_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][2]}",cache_dir=self.cache_dir)
checkpoint_dir = os.path.dirname(text_model_path)
self.tts = self._load_checkpoint(tts_engine=TTS_ENGINES['BARK'],key=tts_internal_key,checkpoint_dir=checkpoint_dir,device=device)
if self.tts:
voice_temp=os.path.splitext(npz_file)[0]+'.wav'
shutil.copy(voice_path,voice_temp)
default_text_file = os.path.join(voices_dir,self.session['language'],'default.txt')
default_text = Path(default_text_file).read_text(encoding="utf-8")
fine_tuned_params={
key.removeprefix("bark_"):cast_type(self.session[key])
for key,cast_type in{
"bark_text_temp":float,
"bark_waveform_temp":float
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
torch.manual_seed(67878789)
audio_data = self.tts.synthesize(
default_text,
loaded_tts[tts_internal_key]['config'],
speaker_id=speaker,
voice_dirs=bark_dir,
silent=True,
**fine_tuned_params
)
os.remove(voice_temp)
del audio_data
if self.session['tts_engine']!=TTS_ENGINES['BARK']:
del self.tts
unload_tts(device,None,tts_internal_key)
msg = f"Saved NPZ file: {npz_file}"
print(msg)
return True
else:
error = f'_check_bark_npz() error: {tts_internal_key} is False'
print(error)
os.makedirs(pth_voice_dir,exist_ok=True)
key = f"{TTS_ENGINES['BARK']}-internal"
voice_temp = os.path.splitext(pth_voice_file)[0]+'.wav'
shutil.copy(voice_path,voice_temp)
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
default_text = Path(default_text_file).read_text(encoding="utf-8")
fine_tuned_params = {
key.removeprefix("bark_"):cast_type(self.session[key])
for key,cast_type in{
"bark_text_temp":float,
"bark_waveform_temp":float
}.items()
if self.session.get(key) is not None
}
with torch.no_grad():
#torch.manual_seed(67878789)
audio_sentence = self.engine.synthesize(
default_text,
speaker_wav=voice_path,
speaker=speaker,
voice_dir=pth_voice_dir,
silent=True,
**fine_tuned_params
)
os.remove(voice_temp)
del audio_sentence
msg = f"Saved file: {pth_voice_file}"
print(msg)
gc.collect()
return True
else:
return True
except Exception as e:
error = f'_check_bark_npz() error: {e}'
print(error)
return False
return False
def _tensor_type(self,audio_data:Any)->torch.Tensor:
if isinstance(audio_data,torch.Tensor):
@@ -428,22 +414,19 @@ class Coqui:
waveform = resampler(waveform)
wav_tensor = waveform.squeeze(0)
wav_numpy = wav_tensor.cpu().numpy()
tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav",delete=False)
os.path.join(self.session['process_dir'], 'tmp')
os.makedirs(tmp_dir, exist_ok=True)
tmp_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False)
tmp_path = tmp_fh.name
tmp_fh.close()
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
return tmp_path
def convert(self, s_n:int, s:str)->bool:
global xtts_builtin_speakers_list
def convert(self, sentence_index:int, sentence:str)->bool:
try:
sentence_number = s_n
sentence = s
speaker = None
audio_data = False
trim_audio_buffer = 0.004
audio_sentence = False
settings = self.params[self.session['tts_engine']]
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
settings['voice_path'] = (
self.session['voice'] if self.session['voice'] is not None
else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
@@ -457,8 +440,10 @@ class Coqui:
msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
print(msg)
return False
self.tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
if self.tts:
if self.engine:
self.engine.to(self.session['device'])
trim_audio_buffer = 0.004
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
@@ -484,7 +469,7 @@ class Coqui:
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
else:
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.tts.get_conditioning_latents(audio_path=[settings['voice_path']])
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
fine_tuned_params = {
key.removeprefix("xtts_"): cast_type(self.session[key])
@@ -501,7 +486,7 @@ class Coqui:
if self.session.get(key) is not None
}
with torch.no_grad():
result = self.tts.inference(
result = self.engine.inference(
text=sentence.replace('.', ''),
language=self.session['language_iso1'],
gpt_cond_latent=settings['gpt_cond_latent'],
@@ -530,10 +515,11 @@ class Coqui:
else:
bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker, self.session['device']):
error = 'Could not create npz file!'
error = 'Could not create pth file!'
print(error)
return False
npz_file = os.path.join(bark_dir, speaker, f'{speaker}.npz')
pth_voice_dir = os.path.join(bark_dir, speaker)
pth_voice_file = os.path.join(bark_dir, speaker, f'{speaker}.pth')
fine_tuned_params = {
key.removeprefix("bark_"): cast_type(self.session[key])
for key, cast_type in {
@@ -542,22 +528,16 @@ class Coqui:
}.items()
if self.session.get(key) is not None
}
if self.npz_path is None or self.npz_path != npz_file:
self.npz_path = npz_file
self.npz_data = np.load(self.npz_path, allow_pickle=True)
history_prompt = [
self.npz_data["semantic_prompt"],
self.npz_data["coarse_prompt"],
self.npz_data["fine_prompt"]
]
with torch.no_grad():
torch.manual_seed(67878789)
audio_sentence, _ = self.tts.generate_audio(
#torch.manual_seed(67878789)
result = self.engine.synthesize(
sentence,
history_prompt=history_prompt,
speaker=speaker,
voice_dir=pth_voice_dir,
silent=True,
**fine_tuned_params
)
audio_sentence = result.get('wav')
if is_audio_data_valid(audio_sentence):
audio_sentence = audio_sentence.tolist()
elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
@@ -573,11 +553,12 @@ class Coqui:
os.makedirs(proc_dir, exist_ok=True)
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
self.tts.tts_to_file(
text=sentence,
file_path=tmp_in_wav,
**speaker_argument
)
with torch.no_grad():
self.engine.tts_to_file(
text=sentence,
file_path=tmp_in_wav,
**speaker_argument
)
if settings['voice_path'] in settings['semitones'].keys():
semitones = settings['semitones'][settings['voice_path']]
else:
@@ -612,17 +593,16 @@ class Coqui:
return False
else:
tmp_out_wav = tmp_in_wav
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
if tts_vc:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
if self.engine_zs:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
audio_sentence = tts_vc.voice_conversion(
audio_sentence = self.engine_zs.voice_conversion(
source_wav=source_wav,
target_wav=target_wav
)
else:
error = f'Engine {self.tts_vc_key} is None'
error = f'Engine {self.tts_zs_key} is None'
print(error)
return False
if os.path.exists(tmp_in_wav):
@@ -632,10 +612,11 @@ class Coqui:
if os.path.exists(source_wav):
os.remove(source_wav)
else:
audio_sentence = self.tts.tts(
text=sentence,
**speaker_argument
)
with torch.no_grad():
audio_sentence = self.engine.tts(
text=sentence,
**speaker_argument
)
elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
speaker_argument = {}
not_supported_punc_pattern = re.compile(r"[.:—]")
@@ -644,11 +625,12 @@ class Coqui:
os.makedirs(proc_dir, exist_ok=True)
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
self.tts.tts_to_file(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
file_path=tmp_in_wav,
**speaker_argument
)
with torch.no_grad():
self.engine.tts_to_file(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
file_path=tmp_in_wav,
**speaker_argument
)
if settings['voice_path'] in settings['semitones'].keys():
semitones = settings['semitones'][settings['voice_path']]
else:
@@ -672,26 +654,27 @@ class Coqui:
]
subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except subprocess.CalledProcessError as e:
print(f"Subprocess error: {e.stderr}")
error = f'Subprocess error: {e.stderr}'
print(error)
DependencyError(e)
return False
except FileNotFoundError as e:
print(f"File not found: {e}")
error = f'File not found: {e}'
print(error)
DependencyError(e)
return False
else:
tmp_out_wav = tmp_in_wav
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
if tts_vc:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
if self.engine_zs:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
audio_sentence = tts_vc.voice_conversion(
audio_sentence = self.engine_zs.voice_conversion(
source_wav=source_wav,
target_wav=target_wav
)
else:
error = f'Engine {self.tts_vc_key} is None'
error = f'Engine {self.tts_zs_key} is None'
print(error)
return False
if os.path.exists(tmp_in_wav):
@@ -701,23 +684,28 @@ class Coqui:
if os.path.exists(source_wav):
os.remove(source_wav)
else:
audio_sentence = self.tts.tts(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
**speaker_argument
)
with torch.no_grad():
audio_sentence = self.engine.tts(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
**speaker_argument
)
elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
speaker_argument = {}
not_supported_punc_pattern = re.compile(r'["—…¡¿]')
if self.session['language'] in ['zho', 'jpn', 'kor', 'tha', 'lao', 'mya', 'khm']:
not_supported_punc_pattern = re.compile(r'\p{P}+')
else:
not_supported_punc_pattern = re.compile(r'["—…¡¿]')
if settings['voice_path'] is not None:
proc_dir = os.path.join(self.session['voice_dir'], 'proc')
os.makedirs(proc_dir, exist_ok=True)
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
self.tts.tts_to_file(
text=re.sub(not_supported_punc_pattern, '', sentence),
file_path=tmp_in_wav,
**speaker_argument
)
with torch.no_grad():
self.engine.tts_to_file(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
file_path=tmp_in_wav,
**speaker_argument
)
if settings['voice_path'] in settings['semitones'].keys():
semitones = settings['semitones'][settings['voice_path']]
else:
@@ -752,17 +740,16 @@ class Coqui:
return False
else:
tmp_out_wav = tmp_in_wav
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
if tts_vc:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
if self.engine_zs:
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
audio_sentence = tts_vc.voice_conversion(
audio_sentence = self.engine_zs.voice_conversion(
source_wav=source_wav,
target_wav=target_wav
)
else:
error = f'Engine {self.tts_vc_key} is None'
error = f'Engine {self.tts_zs_key} is None'
print(error)
return False
if os.path.exists(tmp_in_wav):
@@ -772,10 +759,11 @@ class Coqui:
if os.path.exists(source_wav):
os.remove(source_wav)
else:
audio_sentence = self.tts.tts(
text=re.sub(not_supported_punc_pattern, '', sentence),
**speaker_argument
)
with torch.no_grad():
audio_sentence = self.engine.tts(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
**speaker_argument
)
elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
trim_audio_buffer = 0.002
speaker_argument = {}
@@ -788,8 +776,8 @@ class Coqui:
voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
speaker_argument = {"speaker": voice_key}
with torch.no_grad():
audio_sentence = self.tts.tts(
text=re.sub(not_supported_punc_pattern, '', sentence),
audio_sentence = self.engine.tts(
text=re.sub(not_supported_punc_pattern, ' ', sentence),
language=language,
**speaker_argument
)
@@ -820,16 +808,23 @@ class Coqui:
if self.sentence_idx:
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
del audio_tensor
cleanup_garbage()
self.audio_segments = []
if os.path.exists(final_sentence_file):
return True
else:
error = f"Cannot create {final_sentence_file}"
print(error)
return False
else:
error = f"audio_sentence not valide"
print(error)
return False
else:
error = f"convert() error: {self.session['tts_engine']} is None"
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
print(error)
return False
except Exception as e:
error = f'Coquit.convert(): {e}'
raise ValueError(e)
return False
return False

View File

@@ -1,12 +1,12 @@
import os
from typing import Any, Optional, Union, Callable
from typing import Any
from lib.models import TTS_ENGINES
class TTSManager:
def __init__(self, session:Any):
self.session = session
self.engine = None
self.engine = False
self._build()
def _build(self)->None:
@@ -17,9 +17,6 @@ class TTSManager:
#elif self.session['tts_engine'] in [TTS_ENGINES['NEW_TTS']]:
# from lib.classes.tts_engines.new_tts import NewTts
# self.engine = NewTts(self.session)
if not self.engine:
error='TTS engine could not be created!'
print(error)
else:
print('Other TTS engines coming soon!')
@@ -32,4 +29,3 @@ class TTSManager:
except Exception as e:
error=f'convert_sentence2audio(): {e}'
raise ValueError(e)
return False

View File

@@ -5,8 +5,9 @@ import scipy.fftpack
import soundfile as sf
import subprocess
import shutil
import json
from typing import Any, Optional, Union, Callable
from typing import Any
from io import BytesIO
from pydub import AudioSegment, silence
from pydub.silence import detect_silence
@@ -14,6 +15,7 @@ from pydub.silence import detect_silence
from lib.conf import voice_formats, default_audio_proc_samplerate
from lib.models import TTS_ENGINES, models
from lib.classes.background_detector import BackgroundDetector
from lib.classes.subprocess_pipe import SubprocessPipe
class VoiceExtractor:
def __init__(self, session:Any, voice_file:str, voice_name:str):
@@ -30,7 +32,7 @@ class VoiceExtractor:
def _validate_format(self)->tuple[bool,str]:
file_extension = os.path.splitext(self.voice_file)[1].lower()
if file_extension in voice_formats:
msg = 'Input file valid'
msg = 'Input file is valid'
return True,msg
error = f'Unsupported file format: {file_extension}. Supported formats are: {", ".join(voice_formats)}'
return False,error
@@ -38,33 +40,21 @@ class VoiceExtractor:
def _convert2wav(self)->tuple[bool, str]:
try:
self.wav_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
ffmpeg_cmd = [
cmd = [
shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_file,
'-ac', '1', '-y', self.wav_file
]
process = subprocess.Popen(
ffmpeg_cmd,
env={},
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=False # <── raw bytes mode (no implicit UTF-8 decoding)
)
# Decode safely line by line
for raw_line in iter(process.stdout.readline, b''):
try:
line = raw_line.decode('utf-8', errors='replace') # <── replaces invalid bytes
except Exception:
line = raw_line.decode('latin-1', errors='replace')
print(line, end='')
process.wait()
if process.returncode != 0:
error = f'_convert2wav(): process.returncode: {process.returncode}'
elif not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
]
proc_pipe = SubprocessPipe(cmd, is_gui_process=self.session['is_gui_process'], total_duration=self._get_audio_duration(self.voice_file), msg='Convert')
if proc_pipe:
if not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
return False, error
else:
msg = 'Conversion to .wav format for processing successful'
return True, msg
else:
msg = 'Conversion to .wav format for processing successful'
return True, msg
error = f'_convert2wav() error:: {self.wav_file}'
return False, error
except subprocess.CalledProcessError as e:
try:
stderr_text = e.stderr.decode('utf-8', errors='replace')
@@ -201,12 +191,35 @@ class VoiceExtractor:
error = f'_trim_and_clean() error: {e}'
raise ValueError(error)
def _get_audio_duration(self, filepath:str)->float:
try:
cmd = [
shutil.which('ffprobe'),
'-v', 'error',
'-show_entries', 'format=duration',
'-of', 'json',
filepath
]
result = subprocess.run(cmd, capture_output=True, text=True)
try:
duration = json.loads(result.stdout)['format']['duration']
return float(duration)
except Exception:
return 0
except subprocess.CalledProcessError as e:
DependencyError(e)
return 0
except Exception as e:
error = f"get_audio_duration() Error: Failed to process {filepath}: {e}"
print(error)
return 0
def _normalize_audio(self)->tuple[bool, str]:
error = ''
try:
proc_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
final_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
filter_complex = (
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
'afftdn=nf=-70,'
@@ -219,33 +232,26 @@ class VoiceExtractor:
'equalizer=f=9000:t=q:w=2:g=-2,'
'highpass=f=63[audio]'
)
ffmpeg_cmd += [
cmd += [
'-filter_complex', filter_complex,
'-map', '[audio]',
'-ar', f'{default_audio_proc_samplerate}',
'-y', proc_voice_file
]
try:
process = subprocess.Popen(
ffmpeg_cmd,
env = {},
stdout = subprocess.PIPE,
stderr = subprocess.PIPE,
encoding = 'utf-8',
errors = 'ignore'
)
for line in process.stdout:
print(line, end = '')
process.wait()
if process.returncode != 0:
error = f'_normalize_audio(): process.returncode: {process.returncode}'
elif not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
proc_pipe = SubprocessPipe(cmd, is_gui_process=self.session['is_gui_process'], total_duration=self._get_audio_duration(self.voice_track), msg='Normalize')
if proc_pipe:
if not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
return False, error
else:
os.replace(proc_voice_file, final_voice_file)
shutil.rmtree(self.demucs_dir, ignore_errors = True)
msg = 'Audio normalization successful!'
return True, msg
else:
os.replace(proc_voice_file, final_voice_file)
shutil.rmtree(self.demucs_dir, ignore_errors = True)
msg = 'Audio normalization successful!'
return True, msg
error = f'normalize_audio() error: {final_voice_file}'
return False, error
except subprocess.CalledProcessError as e:
error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
except FileNotFoundError as e:

View File

@@ -1,145 +1,110 @@
import os, platform, subprocess, re, json, psutil, tempfile, time
from typing import Any, Optional, Union, Callable
import os, platform, json, psutil, subprocess, re
from typing import Any
class VRAMDetector:
def __init__(self):
self.system:str = platform.system().lower()
self.system = platform.system().lower()
def _run(self, cmd:list[str], timeout:int = 3)->str:
@staticmethod
def _fmt(b:int)->str:
if not b: return 'Unknown'
if b >= 1024**3: return f'{b/1024**3:.2f} GB'
if b >= 1024**2: return f'{b/1024**2:.2f} MB'
if b >= 1024: return f'{b/1024:.2f} KB'
return f'{b} B'
def detect_vram(self, device:str, as_json:bool=False)->Any:
info = {}
# ───────────────────────────── CUDA (NVIDIA)
try:
result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, text = True, timeout = timeout)
return result.stdout.strip()
import torch
if device == 'cuda':
if torch.cuda.is_available():
free, total = torch.cuda.mem_get_info()
alloc = torch.cuda.memory_allocated()
resv = torch.cuda.memory_reserved()
info = {
"os": self.system,
"device_type": "cuda",
"device_name": torch.cuda.get_device_name(0),
"free_bytes": free,
"total_bytes": total,
"allocated_bytes": alloc,
"reserved_bytes": resv,
"free_human": self._fmt(free),
"total_human": self._fmt(total),
"allocated_human": self._fmt(alloc),
"reserved_human": self._fmt(resv),
}
return json.dumps(info, indent=2) if as_json else info
# ─────────────────────────── ROCm (AMD)
if hasattr(torch, 'hip') and torch.hip.is_available():
free, total = torch.hip.mem_get_info()
alloc = torch.hip.memory_allocated()
resv = torch.hip.memory_reserved()
info = {
"os": self.system,
"device_type": "rocm",
"device_name": torch.hip.get_device_name(0),
"free_bytes": free,
"total_bytes": total,
"allocated_bytes": alloc,
"reserved_bytes": resv,
"free_human": self._fmt(free),
"total_human": self._fmt(total),
"allocated_human": self._fmt(alloc),
"reserved_human": self._fmt(resv),
}
return json.dumps(info, indent=2) if as_json else info
# ─────────────────────────── Intel XPU (oneAPI)
if hasattr(torch, 'xpu') and torch.xpu.is_available():
free, total = torch.xpu.mem_get_info()
alloc = torch.xpu.memory_allocated()
resv = torch.xpu.memory_reserved()
info = {
"os": self.system,
"device_type": "xpu",
"device_name": torch.xpu.get_device_name(0),
"free_bytes": free,
"total_bytes": total,
"allocated_bytes": alloc,
"reserved_bytes": resv,
"free_human": self._fmt(free),
"total_human": self._fmt(total),
"allocated_human": self._fmt(alloc),
"reserved_human": self._fmt(resv),
}
return json.dumps(info, indent=2) if as_json else info
# ─────────────────────────── Apple MPS (Metal)
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
info = {
"os": self.system,
"device_type": "mps",
"device_name": "Apple GPU (Metal)",
"note": "PyTorch MPS does not expose memory info; reporting system RAM",
}
mem = psutil.virtual_memory()
info['free_bytes'] = mem.available
info['total_bytes'] = mem.total
info['free_human'] = self._fmt(mem.available)
info['total_human'] = self._fmt(mem.total)
return json.dumps(info, indent=2) if as_json else info
except Exception:
return ""
pass
def _parse_bytes(self, val:str)->int:
if not val:
return 0
val = val.strip().upper()
m = re.findall(r"([\d.]+)", val)
if not m:
return 0
n = float(m[0])
if "GB" in val: return int(n*1024**3)
if "MB" in val: return int(n*1024**2)
if "KB" in val: return int(n*1024)
return int(n)
def _fmt(self, b:int)->str:
if not b: return "Unknown"
if b >= 1024**3: return f"{b/1024**3:.1f} GB"
if b >= 1024**2: return f"{b/1024**2:.1f} MB"
return f"{b} B"
# ---- Windows GPU detection ----
def _get_windows_vram(self)->list[dict[str,Any]]:
gpus = []
out = self._run(["wmic","path","win32_VideoController","get","Name,AdapterRAM","/format:list"])
for block in out.split("\n\n"):
if "Name = " not in block: continue
name = re.search(r"Name = (.*)", block)
vram = re.search(r"AdapterRAM = (\d+)", block)
if name:
val = int(vram.group(1)) if vram else 0
gpus.append({"name":name.group(1).strip(),"vram_bytes":val,"vram":self._fmt(val)})
if any(g["vram_bytes"]>0 for g in gpus):
return gpus
with tempfile.NamedTemporaryFile(delete = False, suffix = ".txt") as tf:
path = tf.name
try:
subprocess.Popen(["dxdiag","/t",path],stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
for _ in range(30):
if os.path.exists(path) and os.path.getsize(path)>0:
break
time.sleep(0.1)
with open(path,encoding = "utf-16",errors = "ignore") as f:
data = f.read()
except Exception:
data = ""
finally:
try: os.remove(path)
except: pass
for m in re.finditer(r"Card name:\s*(.*?)\r?\n.*?(?:Dedicated Memory|Display Memory):\s*([^\r\n]+)", data, re.S):
name,mem = m.groups()
vb = self._parse_bytes(mem)
if vb:
gpus.append({"name":name.strip(),"vram_bytes":vb,"vram":self._fmt(vb)})
return gpus
def _get_windows_shared(self)->int:
try:
with tempfile.NamedTemporaryFile(delete = False, suffix = ".txt") as tf:
path = tf.name
subprocess.Popen(["dxdiag","/t",path],stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
for _ in range(30):
if os.path.exists(path) and os.path.getsize(path)>0:
break
time.sleep(0.1)
with open(path,encoding = "utf-16",errors = "ignore") as f:
data = f.read()
except Exception:
data = ""
finally:
try: os.remove(path)
except: pass
m = re.search(r"Shared Memory:\s*([^\r\n]+)", data)
return self._parse_bytes(m.group(1)) if m else 0
# ---- Linux/macOS simplified ----
def _get_linux_vram(self)->list[dict[str,Any]]:
out = self._run(["nvidia-smi","--query-gpu = name,memory.total","--format = csv,noheader,nounits"])
gpus = []
for line in out.splitlines():
if "," not in line: continue
name,mem = line.split(",",1)
vb = int(mem.strip())*1024**2
gpus.append({"name":name.strip(),"vram_bytes":vb,"vram":self._fmt(vb)})
return gpus
def _get_linux_shared(self)->int:
return psutil.virtual_memory().total//4 if hasattr(psutil,"virtual_memory") else 0
def _get_macos_vram(self)->list[dict[str,Any]]:
out = self._run(["system_profiler","SPDisplaysDataType","-json"])
try:data = json.loads(out)
except: return []
g = []
for gpu in data.get("SPDisplaysDataType",[]):
v = self._parse_bytes(gpu.get("spdisplays_vram",""))
g.append({"name":gpu.get("_name","GPU"),"vram_bytes":v,"vram":self._fmt(v)})
return g
def _get_macos_shared(self)->int:
out = self._run(["system_profiler","SPDisplaysDataType","-json"])
try:data = json.loads(out)
except:return 0
for gpu in data.get("SPDisplaysDataType",[]):
for key in ("spdisplays_vram_shared","spdisplays_vram_dynamic"):
if key in gpu:
return self._parse_bytes(gpu[key])
return 0
# ---- main API ----
def detect_vram(self,as_json:bool = False)->Any:
sys = self.system
if sys == "windows":
g = self._get_windows_vram(); s = self._get_windows_shared()
elif sys == "linux":
g = self._get_linux_vram(); s = self._get_linux_shared()
elif sys == "darwin":
g = self._get_macos_vram(); s = self._get_macos_shared()
else:
g = []; s = 0
total = sum(x.get("vram_bytes",0) for x in g)
res = {
"os":sys,
"gpu_count":len(g),
"gpus":g,
"total_vram_bytes":total,
"total_vram_human":self._fmt(total),
"shared_memory_bytes":s,
"shared_memory_human":self._fmt(s),
"total_combined_human":self._fmt(total+s)
# ─────────────────────────── CPU fallback
mem = psutil.virtual_memory()
info = {
"os": self.system,
"device_type": "cpu",
"device_name": "System RAM",
"free_bytes": mem.available,
"total_bytes": mem.total,
"free_human": self._fmt(mem.available),
"total_human": self._fmt(mem.total),
}
return json.dumps(res,indent = 2) if as_json else res
return json.dumps(info, indent=2) if as_json else info

View File

@@ -1,7 +1,12 @@
import os
import platform
import tempfile
min_python_version = (3,10)
max_python_version = (3,13)
tmp_dir = os.path.abspath('tmp')
tempfile.tempdir = tmp_dir
tmp_expire = 7 # days
models_dir = os.path.abspath('models')
@@ -14,10 +19,10 @@ os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
os.environ['GRADIO_DEBUG'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
os.environ['GRADIO_DEBUG'] = '0'
os.environ['DO_NOT_TRACK'] = 'True'
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
@@ -30,25 +35,27 @@ os.environ['STANZA_RESOURCES_DIR'] = os.path.join(models_dir, 'stanza')
os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False' # BARK option: False needs A GPU
os.environ['SUNO_USE_SMALL_MODELS'] = 'False' # BARK option: False needs a GPU with VRAM > 4GB
os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32,garbage_collection_threshold:0.6,expandable_segments:True'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_CACHE_MAXSIZE"] = "2147483648"
os.environ['SUNO_OFFLOAD_CPU'] = 'False'
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'
if platform.system() == 'Windows':
os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
prog_version = (lambda: open('VERSION.txt').read().strip())()
min_python_version = (3,10)
max_python_version = (3,12)
NATIVE = 'native'
FULL_DOCKER = 'full_docker'
debug_mode = True
debug_mode = False
device_list = ['cpu', 'gpu', 'mps']
default_device = 'cpu'
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">howto wiki</a>'
default_chapters_control = False
devices = {"CPU": {"proc": "cpu", "found": True}, "CUDA": {"proc": "cuda", "found": False}, "MPS": {"proc": "mps", "found": False}, "ROCM": {"proc": "rocm", "found": False}, "XPU": {"proc": "xpu", "found": False}}
default_device = devices['CPU']['proc']
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">GPU howto wiki</a>'
default_chapters_preview = False
python_env_dir = os.path.abspath(os.path.join('.','python_env'))
requirements_file = os.path.abspath(os.path.join('.','requirements.txt'))
@@ -56,7 +63,7 @@ requirements_file = os.path.abspath(os.path.join('.','requirements.txt'))
interface_host = '0.0.0.0'
interface_port = 7860
interface_shared_tmp_expire = 3 # in days
interface_concurrency_limit = 1 # or None for unlimited
interface_concurrency_limit = 1 # or None for unlimited multiple parallel user conversions
interface_component_options = {
"gr_tab_xtts_params": True,

File diff suppressed because it is too large Load Diff

View File

@@ -834,7 +834,6 @@ language_mapping = {
"ben": {"name": "Bengali", "native_name": "বাংলা", "max_chars": 142},
"zho": {"name": "Chinese", "native_name": "中文", "max_chars": 82},
"eng": {"name": "English", "native_name": "English", "max_chars": 250},
"fas": {"name": "Persian", "native_name": "فارسی", "max_chars": 182},
"fra": {"name": "French", "native_name": "Français", "max_chars": 273},
"deu": {"name": "German, Standard", "native_name": "Deutsch", "max_chars": 253},
"hin": {"name": "Hindi", "native_name": "हिन्दी", "max_chars": 142},
@@ -844,6 +843,7 @@ language_mapping = {
"jav": {"name": "Javanese", "native_name": "Basa Jawa", "max_chars": 182},
"jpn": {"name": "Japanese", "native_name": "日本語", "max_chars": 71},
"kor": {"name": "Korean", "native_name": "한국어", "max_chars": 95},
"fas": {"name": "Persian", "native_name": "فارسی", "max_chars": 182},
"pol": {"name": "Polish", "native_name": "Polski", "max_chars": 224},
"por": {"name": "Portuguese", "native_name": "Português", "max_chars": 203},
"rus": {"name": "Russian", "native_name": "Русский", "max_chars": 182},

View File

@@ -1,7 +1,8 @@
import os
from lib.conf import tts_dir, voices_dir
loaded_tts = {}
xtts_builtin_speakers_list = []
TTS_ENGINES = {
"XTTSv2": "xtts",
@@ -30,7 +31,6 @@ default_fine_tuned = 'internal'
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
default_voice_detection_model = 'drewThomasson/segmentation'
max_tts_in_memory = 2 # TTS engines to keep in memory (1 tts engine ~= 4GB to 8GB RAM).
max_custom_model = 100
max_custom_voices = 1000
max_upload_size = '6GB'
@@ -46,10 +46,6 @@ default_engine_settings = {
"top_p": 0.85,
"speed": 1.0,
"enable_text_splitting": False,
# to enable deepspeed, you must install it first:
# conda activate ./python_env (linux/mac) or .\python_env (windows)
# pip install deepspeed
# conda deactivate
"use_deepspeed": False,
"files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav', 'speakers_xtts.pth'],
"voices": {
@@ -74,12 +70,12 @@ default_engine_settings = {
"FerranSimen": "Ferran Simen", "XavierHayasaka": "Xavier Hayasaka", "LuisMoray": "Luis Moray",
"MarcosRudaski": "Marcos Rudaski"
},
"rating": {"GPU VRAM": 4, "CPU": 3, "RAM": 8, "Realism": 5}
"rating": {"VRAM": 2, "CPU": 2, "RAM": 4, "Realism": 5}
},
TTS_ENGINES['BARK']: {
"samplerate": 24000,
"text_temp": 0.50,
"waveform_temp": 0.50,
"text_temp": 0.4,
"waveform_temp": 0.6,
"files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
"speakers_path": os.path.join(voices_dir, '__bark'),
"voices": {
@@ -128,31 +124,31 @@ default_engine_settings = {
"zh_speaker_6": "Speaker 6", "zh_speaker_7": "Speaker 7", "zh_speaker_8": "Speaker 8",
"zh_speaker_9": "Speaker 9"
},
"rating": {"GPU VRAM": 4, "CPU": 1, "RAM": 16, "Realism": 4}
"rating": {"VRAM": 6, "CPU": 1, "RAM": 8, "Realism": 5}
},
TTS_ENGINES['VITS']: {
"samplerate": 22050,
"files": ['config.json', 'model_file.pth', 'language_ids.json'],
"voices": {},
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
},
TTS_ENGINES['FAIRSEQ']: {
"samplerate": 16000,
"files": ['config.json', 'G_100000.pth', 'vocab.json'],
"voices": {},
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
},
TTS_ENGINES['TACOTRON2']: {
"samplerate": 22050,
"files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
"voices": {},
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
"rating": {"VRAM": 1, "CPU": 5, "RAM": 2, "Realism": 3}
},
TTS_ENGINES['YOURTTS']: {
"samplerate": 16000,
"files": ['config.json', 'model_file.pth'],
"voices": {"Machinella-5": "female-en-5", "ElectroMale-2": "male-en-2", 'Machinella-4': 'female-pt-4\n', 'ElectroMale-3': 'male-pt-3\n'},
"rating": {"GPU VRAM": 1, "CPU": 5, "RAM": 4, "Realism": 2}
"rating": {"VRAM": 0, "CPU": 5, "RAM": 1, "Realism": 2}
}
}
models = {
@@ -333,6 +329,14 @@ models = {
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"PeterGriffinFamilyGuy": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",
"sub": "xtts-v2/eng/PeterGriffinFamilyGuy/",
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'PeterGriffinFamilyGuy.wav'),
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
},
"RafeBeckley": {
"lang": "eng",
"repo": "drewThomasson/fineTunedTTSModels",

View File

@@ -1,69 +1,67 @@
[build-system]
name = "ebook2audiobook"
requires = ["setuptools >= 64"]
build-backend = "setuptools.build_meta"
[tool.poetry]
name = "ebook2audiobook"
version = "25.10.25"
[tool.setuptools.dynamic]
version = {file = "VERSION.txt"}
[project]
name = "ebook2audiobook"
description = "Convert eBooks to audiobooks with chapters and metadata"
authors = [
{ name = "Drew Thomasson" }
]
dependencies = [
"regex",
"tqdm",
"cutlet",
"deep_translator",
"docker",
"ebooklib",
"fastapi",
"num2words",
"argostranslate",
"beautifulsoup4",
"fugashi",
"sudachipy",
"sudachidict_core",
"ray",
"unidic",
"pymupdf4llm",
"translate",
"hangul-romanize",
"indic-nlp-library",
"iso639-lang",
"jieba",
"pycantonese",
"soynlp",
"pypinyin",
"pythainlp",
"mutagen",
"PyOpenGL",
"nvidia-ml-py",
"phonemizer-fork",
"pydub",
"pyannote-audio==3.4.0",
"demucs==4.0.1",
"gradio>=5.49",
"transformers==4.51.3",
"coqui-tts[languages]==0.26.0",
"torch>=2.8.0,<2.9",
"torchaudio>=2.8.0,<2.9",
"torchvggish"
]
readme = "README.md"
requires-python = ">3.9,<3.13"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
scripts = { "ebook2audiobook" = "app:main" }
[project.urls]
"Homepage" = "https://github.com/DrewThomasson/ebook2audiobook"
[build-system]
name = "ebook2audiobook"
requires = ["setuptools >= 64"]
build-backend = "setuptools.build_meta"
[tool.poetry]
name = "ebook2audiobook"
version = "25.10.30"
[tool.setuptools.dynamic]
version = {file = "VERSION.txt"}
[project]
name = "ebook2audiobook"
description = "Convert eBooks to audiobooks with chapters and metadata"
authors = [
{ name = "Drew Thomasson" }
]
dependencies = [
"torchvggish",
"numpy<2",
"num2words @ git+https://github.com/savoirfairelinux/num2words.git",
"regex",
"tqdm",
"docker",
"ebooklib",
"fastapi",
"beautifulsoup4",
"fugashi",
"sudachipy",
"sudachidict_core",
"PyMuPDF",
"pytesseract",
"unidic",
"hangul-romanize",
"indic-nlp-library",
"iso639-lang",
"jieba",
"pycantonese",
"soynlp",
"pypinyin",
"pythainlp",
"mutagen",
"PyOpenGL",
"phonemizer-fork",
"pydub",
"demucs",
"deepspeed",
"pyannote-audio<=3.4.0",
"stanza<=1.10.1",
"argostranslate<=1.10.0",
"gradio>=5.49.1",
"torch<=2.7.1",
"torchaudio<=2.7.1",
"coqui-tts[languages]==0.27.2"
]
readme = "README.md"
requires-python = ">3.9,<3.14"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
scripts = { "ebook2audiobook" = "app:main" }
[project.urls]
"Homepage" = "https://github.com/DrewThomasson/ebook2audiobook"

View File

@@ -1,20 +1,18 @@
torchvggish
numpy<2
num2words @ git+https://github.com/savoirfairelinux/num2words.git
regex
tqdm
cutlet
deep_translator
docker
ebooklib
fastapi
num2words
argostranslate
beautifulsoup4
fugashi
sudachipy
sudachidict_core
ray
PyMuPDF
pytesseract
unidic
pymupdf4llm
translate
hangul-romanize
indic-nlp-library
iso639-lang
@@ -25,14 +23,14 @@ pypinyin
pythainlp
mutagen
PyOpenGL
nvidia-ml-py
phonemizer-fork
pydub
pyannote-audio==3.4.0
demucs==4.0.1
gradio>=5.49
transformers==4.51.3
coqui-tts[languages]==0.26.0
torch>=2.8.0,<2.9
torchaudio>=2.8.0,<2.9
torchvggish
demucs
deepspeed
pyannote-audio<=3.4.0
stanza<=1.10.1
argostranslate<=1.10.0
gradio>=5.49.1
torch<=2.7.1
torchaudio<=2.7.1
coqui-tts[languages]==0.27.2

0
tmp/.gitkeep Executable file → Normal file
View File

BIN
tools/icons/appLogo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 461 KiB

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
Multi-platform icon generator
Converts appLogo.png into platform-specific formats and sizes
Requires: Pillow (PIL), cairosvg (optional for SVG)
Installation:
pip install Pillow cairosvg
"""
import os
import sys
from PIL import Image
# Icon sizes for each platform
ICON_SIZES = {
'windows': [16, 24, 32, 48, 256],
'mac': [16, 32, 64, 128, 256, 512, 1024],
'linux': [16, 24, 32, 48, 64, 128, 256]
}
def create_directories():
    """Ensure an output folder exists under icons/ for every target platform."""
    for target in ICON_SIZES:
        # exist_ok makes repeated runs idempotent.
        os.makedirs(f'icons/{target}', exist_ok=True)
    print("✓ Directories created")
def resize_image(source_path, output_dir, sizes):
    """Render the source image at each requested square size.

    Args:
        source_path: path to the source image (PNG).
        output_dir: directory that receives the ``icon-<size>.png`` files.
        sizes: iterable of pixel edge lengths to render.

    Returns:
        True on success, False if the image could not be processed.
    """
    try:
        # Context manager releases the underlying file handle promptly;
        # the original left the opened image file un-closed.
        with Image.open(source_path) as src:
            # RGBA guarantees an alpha channel so transparency survives
            # resizing; convert() fully loads pixel data, so using `img`
            # after the `with` block is safe.
            img = src.convert('RGBA')
        for size in sizes:
            resized = img.resize((size, size), Image.Resampling.LANCZOS)
            resized.save(f'{output_dir}/icon-{size}.png', 'PNG')
            print(f"  ✓ Generated {size}x{size} icon")
        return True
    except Exception as e:
        print(f"✗ Error resizing image: {e}")
        return False
def create_windows_ico(output_dir):
    """Create a multi-resolution Windows ICO file from the generated PNGs.

    Pillow's ICO writer ignores any requested size larger than the source
    image, so the ICO must be built from the LARGEST rendered PNG. The
    original code saved from ``images[0]`` (the 16x16 icon), which silently
    dropped every other resolution; it also never closed the opened images.

    Returns:
        True on success, False on any error.
    """
    try:
        sizes = ICON_SIZES['windows']
        largest = max(sizes)
        with Image.open(f'{output_dir}/icon-{largest}.png') as img:
            # Pillow downscales the largest image to produce each entry.
            img.save(
                f'{output_dir}/appIcon.ico',
                format='ICO',
                sizes=[(size, size) for size in sizes]
            )
        print("✓ Windows ICO file created: icons/windows/appIcon.ico")
        return True
    except Exception as e:
        print(f"✗ Error creating ICO: {e}")
        return False
def create_mac_icns(output_dir):
    """Create a macOS ICNS file from the generated PNGs.

    Builds an ``appIcon.iconset`` directory using Apple's naming convention,
    then invokes ``iconutil`` (macOS only) to compile it into appIcon.icns.
    On other platforms the iconset is still produced and instructions for
    manual conversion are printed.

    Returns:
        True if the ICNS was created, False otherwise (including when
        iconutil is unavailable).
    """
    try:
        import shutil
        import subprocess
        sizes = ICON_SIZES['mac']
        # Create iconset directory
        iconset_dir = f'{output_dir}/appIcon.iconset'
        os.makedirs(iconset_dir, exist_ok=True)
        for size in sizes:
            img_path = f'{output_dir}/icon-{size}.png'
            # Apple iconsets name files icon_<pts>x<pts>[@2x].png; sizes
            # above 256px are expressed as retina (@2x) variants.
            scale = 2 if size > 256 else 1
            icon_name = f'icon_{size // scale}x{size // scale}'
            if scale == 2:
                icon_name += '@2x'
            # shutil.copy is portable and safe with spaces in paths,
            # unlike the previous shelled-out `os.system('cp ...')`.
            shutil.copy(img_path, f'{iconset_dir}/{icon_name}.png')
        # Try to create ICNS using iconutil (macOS only)
        try:
            subprocess.run(['iconutil', '-c', 'icns', '-o',
                           f'{output_dir}/appIcon.icns', iconset_dir],
                          check=True, capture_output=True)
            print("✓ macOS ICNS file created: icons/mac/appIcon.icns")
        except (subprocess.CalledProcessError, FileNotFoundError):
            print("⚠ Note: iconutil not found. ICNS not created.")
            print("  On macOS, run: iconutil -c icns -o icons/mac/appIcon.icns icons/mac/appIcon.iconset")
            return False
        return True
    except Exception as e:
        print(f"✗ Error creating ICNS: {e}")
        return False
def create_svg_copy(source_path, output_dir):
    """Copy an SVG counterpart of the source image into output_dir.

    Linux desktops prefer vector icons, but the SVG is optional: a missing
    vector source is reported as a warning and still treated as success.
    """
    try:
        import shutil
        vector_path = source_path.replace('.png', '.svg')
        # Guard clause: nothing to do when no vector source is present.
        if not os.path.exists(vector_path):
            print("⚠ No SVG source found (optional for Linux)")
            return True
        shutil.copy(vector_path, f'{output_dir}/appIcon.svg')
        print(f"✓ SVG icon copied: icons/linux/appIcon.svg")
        return True
    except Exception as e:
        print(f"✗ Error copying SVG: {e}")
        return False
def main():
    """Entry point: render per-platform icon sets from appLogo.png, then
    build the Windows ICO, the macOS ICNS, and an optional Linux SVG copy.
    Exits with status 1 when the source logo is missing or resizing fails."""
    print("🎨 Multi-Platform Icon Generator\n")
    source_image = 'appLogo.png'
    # The script must be run from the directory containing the logo.
    if not os.path.exists(source_image):
        print(f"✗ Error: {source_image} not found in current directory")
        sys.exit(1)
    print(f"Source: {source_image}\n")
    create_directories()
    print()
    # Render every platform's raster sizes; a failure here is fatal.
    for platform, sizes in ICON_SIZES.items():
        print(f"Generating {platform.upper()} icons...")
        if not resize_image(source_image, f'icons/{platform}', sizes):
            sys.exit(1)
        print()
    # Platform container formats are best-effort: warn and continue.
    print("Creating platform-specific formats...\n")
    if not create_windows_ico('icons/windows'):
        print("⚠ Continuing despite ICO creation issue\n")
    if not create_mac_icns('icons/mac'):
        print("⚠ Continuing despite ICNS creation issue\n")
    if not create_svg_copy(source_image, 'icons/linux'):
        print("⚠ Continuing despite SVG copy issue\n")
    print("✅ Icon generation complete!")
    print("\nOutput structure:")
    print("  icons/")
    print("  ├── windows/")
    print("  │   ├── appIcon.ico")
    print("  │   └── icon-*.png")
    print("  ├── mac/")
    print("  │   ├── appIcon.icns (if created)")
    print("  │   └── icon-*.png")
    print("  └── linux/")
    print("      ├── appIcon.svg (if available)")
    print("      └── icon-*.png")
if __name__ == '__main__':
main()

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 345 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 591 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 876 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 345 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 876 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 465 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 465 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.5 KiB

BIN
tools/icons/mac/icon-16.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 345 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
tools/icons/mac/icon-32.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 876 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

BIN
tools/icons/mac/icon-64.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 367 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 345 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 591 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 876 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.