Merge branch 'v25' into DrewThomasson-RafeBeckley-patch-1
94
Dockerfile
@@ -16,7 +16,6 @@ RUN apt-get update && \
|
||||
# Install Rust compiler
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
# Install UniDic (non-torch dependent)
|
||||
RUN pip install --no-cache-dir unidic-lite unidic && \
|
||||
@@ -31,74 +30,61 @@ ARG TORCH_VERSION=""
|
||||
# Add parameter to control whether to skip the XTTS test
|
||||
ARG SKIP_XTTS_TEST="false"
|
||||
|
||||
# Copy the application
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
|
||||
# Extract torch versions from requirements.txt or set to empty strings if not found
|
||||
RUN TORCH_VERSION_REQ=$(grep -E "^torch==" requirements.txt | cut -d'=' -f3 || echo "") && \
|
||||
TORCHAUDIO_VERSION_REQ=$(grep -E "^torchaudio==" requirements.txt | cut -d'=' -f3 || echo "") && \
|
||||
TORCHVISION_VERSION_REQ=$(grep -E "^torchvision==" requirements.txt | cut -d'=' -f3 || echo "") && \
|
||||
echo "Found in requirements: torch==$TORCH_VERSION_REQ torchaudio==$TORCHAUDIO_VERSION_REQ torchvision==$TORCHVISION_VERSION_REQ"
|
||||
|
||||
# Install PyTorch with CUDA support if specified
|
||||
# Install requirements.txt or PyTorch variants based on TORCH_VERSION
|
||||
RUN if [ ! -z "$TORCH_VERSION" ]; then \
|
||||
# Check if we need to use specific versions or get the latest
|
||||
if [ ! -z "$TORCH_VERSION_REQ" ] && [ ! -z "$TORCHVISION_VERSION_REQ" ] && [ ! -z "$TORCHAUDIO_VERSION_REQ" ]; then \
|
||||
echo "Using specific versions from requirements.txt" && \
|
||||
TORCH_SPEC="torch==${TORCH_VERSION_REQ}" && \
|
||||
TORCHVISION_SPEC="torchvision==${TORCHVISION_VERSION_REQ}" && \
|
||||
TORCHAUDIO_SPEC="torchaudio==${TORCHAUDIO_VERSION_REQ}"; \
|
||||
else \
|
||||
echo "Using latest versions for the selected variant" && \
|
||||
TORCH_SPEC="torch" && \
|
||||
TORCHVISION_SPEC="torchvision" && \
|
||||
TORCHAUDIO_SPEC="torchaudio"; \
|
||||
fi && \
|
||||
\
|
||||
# Check if TORCH_VERSION contains "cuda" and extract version number
|
||||
if echo "$TORCH_VERSION" | grep -q "cuda"; then \
|
||||
CUDA_VERSION=$(echo "$TORCH_VERSION" | sed 's/cuda//g') && \
|
||||
echo "Detected CUDA version: $CUDA_VERSION" && \
|
||||
echo "Attempting to install PyTorch nightly for CUDA $CUDA_VERSION..." && \
|
||||
#if ! pip install --no-cache-dir --pre $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
|
||||
if ! pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
|
||||
echo "❌ Nightly build for CUDA $CUDA_VERSION not available or failed" && \
|
||||
echo "🔄 Trying stable release for CUDA $CUDA_VERSION..." && \
|
||||
#if pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
|
||||
if pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
|
||||
echo "✅ Successfully installed stable PyTorch for CUDA $CUDA_VERSION"; \
|
||||
else \
|
||||
echo "❌ Both nightly and stable builds failed for CUDA $CUDA_VERSION"; \
|
||||
echo "💡 This CUDA version may not be supported by PyTorch"; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
\
|
||||
# Special handling for CUDA 11.8
|
||||
if [ "$CUDA_VERSION" = "118" ]; then \
|
||||
echo "Installing PyTorch for CUDA 11.8..." && \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install pyannote-audio==3.4.0 && pip install --no-cache-dir --upgrade torch==2.7.1 torchvision==2.7.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu118; \
|
||||
elif [ "$CUDA_VERSION" = "128" ]; then \
|
||||
echo "Installing PyTorch for CUDA 12.8..." && \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch==2.7.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu128; \
|
||||
else \
|
||||
echo "✅ Successfully installed nightly PyTorch for CUDA $CUDA_VERSION"; \
|
||||
echo "Attempting to install stable PyTorch for CUDA $CUDA_VERSION..." && \
|
||||
if ! pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu${CUDA_VERSION}; then \
|
||||
echo "❌ Stable build for CUDA $CUDA_VERSION not available or failed" && \
|
||||
echo "🔄 Trying nightly release for CUDA $CUDA_VERSION..." && \
|
||||
if pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION}; then \
|
||||
echo "✅ Successfully installed nightly PyTorch for CUDA $CUDA_VERSION"; \
|
||||
else \
|
||||
echo "❌ Both stable and nightly builds failed for CUDA $CUDA_VERSION"; \
|
||||
echo "💡 This CUDA version may not be supported by PyTorch"; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
else \
|
||||
echo "✅ Successfully installed stable PyTorch for CUDA $CUDA_VERSION"; \
|
||||
fi; \
|
||||
fi; \
|
||||
else \
|
||||
# Handle non-CUDA cases (existing functionality)
|
||||
# Handle non-CUDA cases
|
||||
case "$TORCH_VERSION" in \
|
||||
"rocm") \
|
||||
# Using the correct syntax for ROCm PyTorch installation
|
||||
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/rocm6.2 \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 \
|
||||
;; \
|
||||
"xpu") \
|
||||
# Install PyTorch with Intel XPU support through IPEX
|
||||
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC && \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio && \
|
||||
pip install --no-cache-dir intel-extension-for-pytorch --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
|
||||
;; \
|
||||
"cpu") \
|
||||
pip install --no-cache-dir $TORCH_SPEC $TORCHVISION_SPEC $TORCHAUDIO_SPEC --extra-index-url https://download.pytorch.org/whl/cpu \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu \
|
||||
;; \
|
||||
*) \
|
||||
pip install --no-cache-dir $TORCH_VERSION \
|
||||
echo "Installing custom PyTorch specification: $TORCH_VERSION" && \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt && pip install --no-cache-dir --upgrade $TORCH_VERSION \
|
||||
;; \
|
||||
esac; \
|
||||
fi && \
|
||||
# Install remaining requirements, skipping torch packages that might be there
|
||||
grep -v -E "^torch==|^torchvision==|^torchaudio==|^torchvision$" requirements.txt > requirements_no_torch.txt && \
|
||||
pip install --no-cache-dir --upgrade -r requirements_no_torch.txt && \
|
||||
rm requirements_no_torch.txt; \
|
||||
fi; \
|
||||
else \
|
||||
# Install all requirements as specified
|
||||
echo "No TORCH_VERSION specified, using packages from requirements.txt" && \
|
||||
pip install --no-cache-dir --upgrade -r requirements.txt; \
|
||||
fi
|
||||
|
||||
@@ -114,9 +100,6 @@ RUN if [ "$SKIP_XTTS_TEST" != "true" ]; then \
|
||||
echo "Skipping XTTS test run as requested."; \
|
||||
fi
|
||||
|
||||
# Copy the application
|
||||
COPY . /app
|
||||
|
||||
# Expose the required port
|
||||
EXPOSE 7860
|
||||
# Start the Gradio app with the required flag
|
||||
@@ -126,3 +109,12 @@ ENTRYPOINT ["python", "app.py", "--script_mode", "full_docker"]
|
||||
#docker build --pull --build-arg BASE_IMAGE=athomasson2/ebook2audiobook:latest -t your-image-name .
|
||||
#The --pull flag forces Docker to always try to pull the latest version of the image, even if it already exists locally.
|
||||
#Without --pull, Docker will only use the local version if it exists, which might not be the latest.
|
||||
|
||||
# Example build commands:
|
||||
# For CUDA 11.8: docker build --build-arg TORCH_VERSION=cuda118 -t your-image-name .
|
||||
# For CUDA 12.8: docker build --build-arg TORCH_VERSION=cuda128 -t your-image-name .
|
||||
# For CUDA 12.1: docker build --build-arg TORCH_VERSION=cuda121 -t your-image-name .
|
||||
# For ROCm: docker build --build-arg TORCH_VERSION=rocm -t your-image-name .
|
||||
# For CPU: docker build --build-arg TORCH_VERSION=cpu -t your-image-name .
|
||||
# For XPU: docker build --build-arg TORCH_VERSION=xpu -t your-image-name .
|
||||
# Default (no TORCH_VERSION): docker build -t your-image-name .
|
||||
|
||||
165
README.md
@@ -1,6 +1,6 @@
|
||||
# 📚 ebook2audiobook
|
||||
CPU/GPU Converter from eBooks to audiobooks with chapters and metadata<br/>
|
||||
using XTTSv2, Bark, Vits, Fairseq, YourTTS, Tacotron and more. Supports voice cloning and +1110 languages!
|
||||
using XTTSv2, Bark, Vits, Fairseq, YourTTS, Tacotron2 and more. Supports voice cloning and +1110 languages!
|
||||
> [!IMPORTANT]
|
||||
**This tool is intended for use with non-DRM, legally acquired eBooks only.** <br>
|
||||
The authors are not responsible for any misuse of this software or any resulting legal consequences. <br>
|
||||
@@ -83,18 +83,18 @@ https://github.com/user-attachments/assets/81c4baad-117e-4db5-ac86-efc2b7fea921
|
||||
- [Basic Headless Usage](#basic--usage)
|
||||
- [Headless Custom XTTS Model Usage](#example-of-custom-model-zip-upload)
|
||||
- [Help command output](#help-command-output)
|
||||
- [Run Remotely](#run-remotely)
|
||||
- [Run Remotely](#run-remotely)
|
||||
- [Docker](#docker-compose)
|
||||
- [Docker Compose (Recommended)](#docker-compose)
|
||||
- [Docker Compose Headless](#compose-headless)
|
||||
- [Compose Build Arguments](#compose-build-arguments)
|
||||
- [Compose container file locations](#compose-container-file-locations)
|
||||
- [Common Docker issues](#common-docker-issues)
|
||||
- [Docker Build (Manual)](https://github.com/DrewThomasson/ebook2audiobook/wiki/Manual-Docker-Guide)
|
||||
|
||||
- [Fine Tuned TTS models](#fine-tuned-tts-models)
|
||||
- [Collection of Fine-Tuned TTS Models](#fine-tuned-tts-collection)
|
||||
- [Train XTTSv2](#fine-tune-your-own-xttsv2-model)
|
||||
- [Docker](#docker-gpu-options)
|
||||
- [GPU options](#docker-gpu-options)
|
||||
- [Docker Run](#running-the-pre-built-docker-container)
|
||||
- [Docker Build](#building-the-docker-container)
|
||||
- [Docker Compose](#docker-compose)
|
||||
- [Docker headless guide](#docker-headless-guide)
|
||||
- [Docker container file locations](#docker-container-file-locations)
|
||||
- [Common Docker issues](#common-docker-issues)
|
||||
- [Supported eBook Formats](#supported-ebook-formats)
|
||||
- [Output Formats](#output-formats)
|
||||
- [Updating to Latest Version](#updating-to-latest-version)
|
||||
@@ -125,7 +125,7 @@ https://github.com/user-attachments/assets/81c4baad-117e-4db5-ac86-efc2b7fea921
|
||||
|
||||
|
||||
## Hardware Requirements
|
||||
- 4gb RAM minimum, 8GB recommended
|
||||
- 2gb RAM minimum, 8GB recommended
|
||||
- Virtualization enabled if running on windows (Docker only)
|
||||
- CPU (intel, AMD, ARM), GPU (Nvidia, AMD*, Intel*) (Recommended), MPS (Apple Silicon CPU)
|
||||
*available very soon
|
||||
@@ -147,16 +147,18 @@ cd ebook2audiobook
|
||||
```
|
||||
|
||||
### Launching Gradio Web Interface
|
||||
1. **Run ebook2audiobook**:
|
||||
1. **Run ebook2audiobook**:
|
||||
|
||||
- **Linux/MacOS**
|
||||
```bash
|
||||
./ebook2audiobook.sh # Run launch script
|
||||
```
|
||||
|
||||
<i>Note for MacOS users: homebrew is installed to install missing programs.</i>
|
||||
|
||||
- **Mac Launcher**
|
||||
Double click `Mac Ebook2Audiobook Launcher.command`
|
||||
|
||||
|
||||
|
||||
- **Windows**
|
||||
```bash
|
||||
ebook2audiobook.cmd # Run launch script or double click on it
|
||||
@@ -164,22 +166,12 @@ cd ebook2audiobook
|
||||
|
||||
- **Windows Launcher**
|
||||
Double click `ebook2audiobook.cmd`
|
||||
|
||||
|
||||
- **Manual Python Install**
|
||||
```bash
|
||||
# (for experts only!)
|
||||
REQUIRED_PROGRAMS=("calibre" "ffmpeg" "nodejs" "mecab" "espeak-ng" "rust" "sox")
|
||||
REQUIRED_PYTHON_VERSION="3.12"
|
||||
pip install -r requirements.txt # Install Python Requirements
|
||||
python app.py # Run Ebook2Audiobook
|
||||
```
|
||||
|
||||
1. **Open the Web App**: Click the URL provided in the terminal to access the web app and convert eBooks. `http://localhost:7860/`
|
||||
2. **For Public Link**:
|
||||
`python app.py --share` (all OS)
|
||||
`./ebook2audiobook.sh --share` (Linux/MacOS)
|
||||
`ebook2audiobook.cmd --share` (Windows)
|
||||
`python app.py --share` (all OS)
|
||||
|
||||
> [!IMPORTANT]
|
||||
**If the script is stopped and run again, you need to refresh your gradio GUI interface<br>
|
||||
@@ -341,84 +333,11 @@ NOTE: in gradio/gui mode, to cancel a running conversion, just click on the [X]
|
||||
|
||||
TIP: if it needs some more pauses, just add '###' or '[pause]' between the words you wish more pause. one [pause] equals to 1.4 seconds
|
||||
|
||||
#### Docker GPU Options
|
||||
|
||||
Available pre-build tags: `latest` (CUDA 11.8)
|
||||
#### Edit: IF GPU isn't detected then you'll have to build the image -> [Building the Docker Container](#building-the-docker-container)
|
||||
|
||||
|
||||
|
||||
#### Running the pre-built Docker Container
|
||||
|
||||
-Run with CPU only
|
||||
```powershell
|
||||
docker run --pull always --rm -p 7860:7860 athomasson2/ebook2audiobook
|
||||
```
|
||||
-Run with GPU Speedup (NVIDIA compatible only)
|
||||
```powershell
|
||||
docker run --pull always --rm --gpus all -p 7860:7860 athomasson2/ebook2audiobook
|
||||
```
|
||||
|
||||
This command will start the Gradio interface on port 7860 (localhost:7860).
|
||||
- For more options add the parameter `--help`
|
||||
|
||||
|
||||
#### Building the Docker Container
|
||||
- You can build the docker image with the command:
|
||||
```powershell
|
||||
docker build -t athomasson2/ebook2audiobook .
|
||||
```
|
||||
#### Available Docker Build Arguments
|
||||
|
||||
`--build-arg TORCH_VERSION=cuda118` Available tags: [cuda121, cuda118, cuda128, rocm, xpu, cpu]
|
||||
|
||||
All CUDA version numbers should work, Ex: CUDA 11.6-> cuda116
|
||||
|
||||
`--build-arg SKIP_XTTS_TEST=true` (Saves space by not baking XTTSv2 model into docker image)
|
||||
|
||||
|
||||
## Docker container file locations
|
||||
All ebook2audiobooks will have the base dir of `/app/`
|
||||
For example:
|
||||
`tmp` = `/app/tmp`
|
||||
`audiobooks` = `/app/audiobooks`
|
||||
|
||||
|
||||
## Docker headless guide
|
||||
|
||||
> [!IMPORTANT]
|
||||
**For simpler headless setup use the [Compose](#compose-headless).** <br>
|
||||
|
||||
- Before you run this, you need to create a dir named "input-folder" in your current dir
|
||||
which will be linked, This is where you can put your input files for the docker image to see
|
||||
```bash
|
||||
mkdir input-folder && mkdir Audiobooks
|
||||
```
|
||||
- In the command below swap out **YOUR_INPUT_FILE.TXT** with the name of your input file
|
||||
```bash
|
||||
docker run --pull always --rm \
|
||||
-v $(pwd)/input-folder:/app/input_folder \
|
||||
-v $(pwd)/audiobooks:/app/audiobooks \
|
||||
athomasson2/ebook2audiobook \
|
||||
--headless --ebook /input_folder/YOUR_EBOOK_FILE
|
||||
```
|
||||
- The output Audiobooks will be found in the Audiobook folder which will also be located
|
||||
in your local dir you ran this docker command in
|
||||
|
||||
|
||||
## To get the help command for the other parameters this program has you can run this
|
||||
|
||||
```bash
|
||||
docker run --pull always --rm athomasson2/ebook2audiobook --help
|
||||
|
||||
```
|
||||
That will output this
|
||||
[Help command output](#help-command-output)
|
||||
|
||||
|
||||
### Docker Compose
|
||||
This project uses Docker Compose to run locally. You can enable or disable GPU support
|
||||
by setting either `*gpu-enabled` or `*gpu-disabled` in `docker-compose.yml`
|
||||
|
||||
For pre-built image enable `#image: docker.io/athomasson2/ebook2audiobook:latest` in `docker-compose.yml`
|
||||
|
||||
|
||||
|
||||
#### Steps to Run
|
||||
@@ -429,46 +348,48 @@ by setting either `*gpu-enabled` or `*gpu-disabled` in `docker-compose.yml`
|
||||
```
|
||||
2. **Set GPU Support (disabled by default)**
|
||||
To enable GPU support, modify `docker-compose.yml` and change `*gpu-disabled` to `*gpu-enabled`
|
||||
3. **Start the service:**
|
||||
4. **Start the service:**
|
||||
```bash
|
||||
# Docker
|
||||
docker-compose up -d # To rebuild add --build
|
||||
docker-compose up -d # To rebuild add --build
|
||||
# To stop -> docker-compose down
|
||||
|
||||
# Podman
|
||||
podman compose -f podman-compose.yml up -d # To rebuild add --build
|
||||
# To stop -> podman compose -f podman-compose.yml down
|
||||
```
|
||||
4. **Access the service:**
|
||||
5. **Access the service:**
|
||||
The service will be available at http://localhost:7860.
|
||||
|
||||
|
||||
### Compose Build Arguments
|
||||
|
||||
```bash
|
||||
SKIP_XTTS_TEST: "true" # (Saves space by not baking xtts model into docker image)
|
||||
TORCH_VERSION: cuda118 # Available tags: [cuda121, cuda118, cuda128, rocm, xpu, cpu] # All CUDA version numbers should work, Ex: CUDA 11.6-> cuda116
|
||||
```
|
||||
|
||||
|
||||
### Compose Headless
|
||||
|
||||
[Headless Wiki for more info](https://github.com/DrewThomasson/ebook2audiobook/wiki/Docker-Compose-Headless-guide)
|
||||
|
||||
```bash
|
||||
A headless example is already contained within the `docker-compose.yml` file.
|
||||
|
||||
The `docker-compose.yml` file will act as the base dir for any headless commands added.
|
||||
```
|
||||
|
||||
### Compose container file locations
|
||||
|
||||
```bash
|
||||
By Default: All compose containers share the contents your local `ebook2audiobook` folder
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## Common Docker Issues
|
||||
### Common Docker Issues
|
||||
|
||||
- My NVIDIA GPU isn't being detected?? -> [GPU ISSUES Wiki Page](https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES)
|
||||
|
||||
- `python: can't open file '/home/user/app/app.py': [Errno 2] No such file or directory` (Just remove all post arguments as I replaced the `CMD` with `ENTRYPOINT` in the [Dockerfile](Dockerfile))
|
||||
- Example: `docker run --pull always athomasson2/ebook2audiobook app.py --script_mode full_docker` - > corrected - > `docker run --pull always athomasson2/ebook2audiobook`
|
||||
- Arguments can be easily added like this now `docker run --pull always athomasson2/ebook2audiobook --share`
|
||||
|
||||
- Docker gets stuck downloading Fine-Tuned models.
|
||||
(This does not happen for every computer but some appear to run into this issue)
|
||||
Disabling the progress bar appears to fix the issue,
|
||||
as discussed [here in #191](https://github.com/DrewThomasson/ebook2audiobook/issues/191)
|
||||
Example of adding this fix in the `docker run` command
|
||||
```Dockerfile
|
||||
docker run --pull always --rm --gpus all -e HF_HUB_DISABLE_PROGRESS_BARS=1 -e HF_HUB_ENABLE_HF_TRANSFER=0 \
|
||||
-p 7860:7860 athomasson2/ebook2audiobook
|
||||
```
|
||||
|
||||
|
||||
## Fine Tuned TTS models
|
||||
#### Fine Tune your own XTTSv2 model
|
||||
|
||||
@@ -1 +1 @@
|
||||
25.10.25
|
||||
25.11.11
|
||||
221
app.py
@@ -7,10 +7,10 @@ import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
|
||||
from importlib.metadata import version, PackageNotFoundError
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from pathlib import Path
|
||||
from lib import *
|
||||
|
||||
@@ -52,83 +52,125 @@ def check_and_install_requirements(file_path:str)->bool:
|
||||
print(error)
|
||||
return False
|
||||
try:
|
||||
from importlib.metadata import version, PackageNotFoundError
|
||||
try:
|
||||
from packaging.specifiers import SpecifierSet
|
||||
from packaging.version import Version
|
||||
from tqdm import tqdm
|
||||
from packaging.markers import Marker
|
||||
except ImportError:
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'packaging'])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', 'packaging', 'tqdm'])
|
||||
from packaging.specifiers import SpecifierSet
|
||||
from packaging.version import Version
|
||||
from tqdm import tqdm
|
||||
from packaging.markers import Marker
|
||||
import re as regex
|
||||
from tqdm import tqdm
|
||||
flexible_packages = {"torch", "torchaudio", "numpy"}
|
||||
torch_version = False
|
||||
try:
|
||||
import torch
|
||||
torch_version = getattr(torch, '__version__', '')
|
||||
devices['CUDA']['found'] = getattr(torch, "cuda", None) is not None and torch.cuda.is_available() and not (hasattr(torch.version, "hip") and torch.version.hip is not None)
|
||||
devices['ROCM']['found'] = hasattr(torch.version, "hip") and torch.version.hip is not None and torch.cuda.is_available()
|
||||
devices['MPS']['found'] = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
|
||||
devices['XPU']['found'] = getattr(torch, "xpu", None) is not None and torch.xpu.is_available()
|
||||
except ImportError:
|
||||
pass
|
||||
cuda_only_packages = ('deepspeed')
|
||||
with open(file_path, 'r') as f:
|
||||
contents = f.read().replace('\r', '\n')
|
||||
packages = [
|
||||
pkg.strip()
|
||||
for pkg in contents.splitlines()
|
||||
if pkg.strip() and regex.search(r'[a-zA-Z0-9]', pkg)
|
||||
]
|
||||
packages = [pkg.strip() for pkg in contents.splitlines() if pkg.strip() and regex.search(r'[a-zA-Z0-9]', pkg)]
|
||||
if sys.version_info >= (3, 11):
|
||||
packages.append("pymupdf-layout")
|
||||
missing_packages = []
|
||||
cuda_markers = ('+cu', '+xpu', '+nv', '+git')
|
||||
for package in packages:
|
||||
if ';' in package:
|
||||
pkg_part, marker_part = package.split(';', 1)
|
||||
marker_part = marker_part.strip()
|
||||
try:
|
||||
marker = Marker(marker_part)
|
||||
if not marker.evaluate():
|
||||
continue
|
||||
except Exception as e:
|
||||
error = f'Warning: Could not evaluate marker {marker_part} for {pkg_part}: {e}'
|
||||
print(error)
|
||||
package = pkg_part.strip()
|
||||
if 'git+' in package or '://' in package:
|
||||
pkg_name_match = regex.search(r'([\w\-]+)\s*@?\s*git\+', package)
|
||||
pkg_name = pkg_name_match.group(1) if pkg_name_match else None
|
||||
if pkg_name:
|
||||
spec = importlib.util.find_spec(pkg_name)
|
||||
if spec is None:
|
||||
msg = f'{pkg_name} (git package) is missing.'
|
||||
print(msg)
|
||||
missing_packages.append(package)
|
||||
else:
|
||||
error = f'Unrecognized git package: {package}'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
continue
|
||||
clean_pkg = regex.sub(r'\[.*?\]', '', package)
|
||||
pkg_name = regex.split(r'[<>=]', clean_pkg, 1)[0].strip()
|
||||
pkg_name = regex.split(r'[<>=]', clean_pkg, maxsplit=1)[0].strip()
|
||||
if pkg_name in cuda_only_packages:
|
||||
has_cuda_build = False
|
||||
if torch_version:
|
||||
has_cuda_build = any(marker in torch_version for marker in cuda_markers)
|
||||
if not has_cuda_build:
|
||||
continue
|
||||
try:
|
||||
installed_version = version(pkg_name)
|
||||
if pkg_name == 'num2words':
|
||||
code = "ZH_CN"
|
||||
spec = importlib.util.find_spec(f"num2words.lang_{code}")
|
||||
if spec is None:
|
||||
missing_packages.append(package)
|
||||
except PackageNotFoundError:
|
||||
error = f'{package} is missing.'
|
||||
error = f'{pkg_name} is not installed.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
continue
|
||||
if pkg_name in flexible_packages:
|
||||
continue
|
||||
if '+' in installed_version:
|
||||
continue
|
||||
else:
|
||||
spec_str = clean_pkg[len(pkg_name):].strip()
|
||||
if spec_str:
|
||||
spec = SpecifierSet(spec_str)
|
||||
# normalize installed version -> major.minor.patch (if available)
|
||||
norm_match = regex.match(r'^(\d+\.\d+(?:\.\d+)?)', installed_version)
|
||||
short_version = norm_match.group(1) if norm_match else installed_version
|
||||
try:
|
||||
installed_v = Version(short_version)
|
||||
except Exception:
|
||||
installed_v = Version("0")
|
||||
# detect requirement version -> major.minor.patch (if available)
|
||||
installed_v = Version('0')
|
||||
req_match = regex.search(r'(\d+\.\d+(?:\.\d+)?)', spec_str)
|
||||
if req_match:
|
||||
req_v = Version(req_match.group(1))
|
||||
imajor, iminor = installed_v.major, installed_v.minor
|
||||
rmajor, rminor = req_v.major, req_v.minor
|
||||
if "==" in spec_str:
|
||||
if '==' in spec_str:
|
||||
if imajor != rmajor or iminor != rminor:
|
||||
error = f'{pkg_name} (installed {installed_version}) not in same major.minor as required {req_v}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
elif ">=" in spec_str:
|
||||
elif '>=' in spec_str:
|
||||
if (imajor < rmajor) or (imajor == rmajor and iminor < rminor):
|
||||
error = f'{pkg_name} (installed {installed_version}) < required {req_v}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
elif "<=" in spec_str:
|
||||
elif '<=' in spec_str:
|
||||
if (imajor > rmajor) or (imajor == rmajor and iminor > rminor):
|
||||
error = f'{pkg_name} (installed {installed_version}) > allowed {req_v}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
elif ">" in spec_str:
|
||||
elif '>' in spec_str:
|
||||
if (imajor < rmajor) or (imajor == rmajor and iminor <= rminor):
|
||||
error = f'{pkg_name} (installed {installed_version}) <= required {req_v}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
elif "<" in spec_str:
|
||||
elif '<' in spec_str:
|
||||
if (imajor > rmajor) or (imajor == rmajor and iminor >= rminor):
|
||||
error = f'{pkg_name} (installed {installed_version}) >= restricted {req_v}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
else:
|
||||
if installed_v not in spec:
|
||||
error = (f'{pkg_name} (installed {installed_version}) does not satisfy "{spec_str}".')
|
||||
error = f'{pkg_name} (installed {installed_version}) does not satisfy {spec_str}.'
|
||||
print(error)
|
||||
missing_packages.append(package)
|
||||
if missing_packages:
|
||||
@@ -136,25 +178,16 @@ def check_and_install_requirements(file_path:str)->bool:
|
||||
print(msg)
|
||||
tmp_dir = tempfile.mkdtemp()
|
||||
os.environ['TMPDIR'] = tmp_dir
|
||||
result = subprocess.call([sys.executable, '-m', 'pip', 'cache', 'purge'])
|
||||
subprocess.call([sys.executable, '-m', 'pip', 'cache', 'purge'])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'])
|
||||
with tqdm(total=len(packages),
|
||||
desc='Installation 0.00%',
|
||||
bar_format='{desc}: {n_fmt}/{total_fmt} ',
|
||||
unit='step') as t:
|
||||
for package in tqdm(missing_packages, desc="Installing", unit="pkg"):
|
||||
with tqdm(total = len(packages), desc = 'Installation 0.00%', bar_format = '{desc}: {n_fmt}/{total_fmt} ', unit = 'step') as t:
|
||||
for package in tqdm(missing_packages, desc = 'Installing', unit = 'pkg'):
|
||||
try:
|
||||
if package == 'num2words':
|
||||
pkgs = ['git+https://github.com/savoirfairelinux/num2words.git', '--force']
|
||||
else:
|
||||
pkgs = [package]
|
||||
subprocess.check_call([
|
||||
sys.executable, '-m', 'pip', 'install',
|
||||
'--no-cache-dir', '--use-pep517',
|
||||
*pkgs
|
||||
])
|
||||
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-cache-dir', '--use-pep517', package])
|
||||
t.update(1)
|
||||
except subprocess.CalledProcessError as e:
|
||||
if package in flexible_packages:
|
||||
continue
|
||||
error = f'Failed to install {package}: {e}'
|
||||
print(error)
|
||||
return False
|
||||
@@ -175,7 +208,7 @@ def check_dictionary()->bool:
|
||||
error = 'UniDic dictionary not found or incomplete. Downloading now...'
|
||||
print(error)
|
||||
subprocess.run(['python', '-m', 'unidic', 'download'], check=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
except (subprocess.CalledProcessError, ConnectionError, OSError) as e:
|
||||
error = f'Failed to download UniDic dictionary. Error: {e}. Unable to continue without UniDic. Exiting...'
|
||||
raise SystemExit(error)
|
||||
return False
|
||||
@@ -185,6 +218,26 @@ def is_port_in_use(port:int)->bool:
|
||||
with socket.socket(socket.AF_INET,socket.SOCK_STREAM) as s:
|
||||
return s.connect_ex(('0.0.0.0',port))==0
|
||||
|
||||
def kill_previous_instances(script_name: str):
|
||||
current_pid = os.getpid()
|
||||
this_script_path = os.path.realpath(script_name)
|
||||
import psutil
|
||||
for proc in psutil.process_iter(['pid', 'cmdline']):
|
||||
try:
|
||||
cmdline = proc.info['cmdline']
|
||||
if not cmdline:
|
||||
continue
|
||||
# unify case and absolute paths for comparison
|
||||
joined_cmd = ' '.join(cmdline).lower()
|
||||
if this_script_path.lower().endswith(script_name.lower()) and \
|
||||
(script_name.lower() in joined_cmd) and \
|
||||
proc.info['pid'] != current_pid:
|
||||
print(f"[WARN] Found running instance PID={proc.info['pid']} -> killing it.")
|
||||
proc.kill()
|
||||
proc.wait(timeout=3)
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
||||
continue
|
||||
|
||||
def main()->None:
|
||||
# Argument parser to handle optional parameters with descriptions
|
||||
parser = argparse.ArgumentParser(
|
||||
@@ -195,12 +248,12 @@ Windows:
|
||||
Gradio/GUI:
|
||||
ebook2audiobook.cmd
|
||||
Headless mode:
|
||||
ebook2audiobook.cmd --headless --ebook '/path/to/file'
|
||||
ebook2audiobook.cmd --headless --ebook '/path/to/file' --language eng
|
||||
Linux/Mac:
|
||||
Gradio/GUI:
|
||||
./ebook2audiobook.sh
|
||||
Headless mode:
|
||||
./ebook2audiobook.sh --headless --ebook '/path/to/file'
|
||||
./ebook2audiobook.sh --headless --ebook '/path/to/file' --language eng
|
||||
|
||||
Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
''',
|
||||
@@ -233,35 +286,35 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
headless_optional_group = parser.add_argument_group('optional parameters')
|
||||
headless_optional_group.add_argument(options[7], type=str, default=None, help='''(Optional) Path to the voice cloning file for TTS engine.
|
||||
Uses the default voice if not present.''')
|
||||
headless_optional_group.add_argument(options[8], type=str, default=default_device, choices=device_list, help=f'''(Optional) Pprocessor unit type for the conversion.
|
||||
Default is set in ./lib/conf.py if not present. Fall back to CPU if GPU not available.''')
|
||||
headless_optional_group.add_argument(options[8], type=str, default=default_device, choices=list(devices.values()), help=f'''(Optional) Pprocessor unit type for the conversion.
|
||||
Default is set in ./lib/conf.py if not present. Fall back to CPU if CUDA or MPS is not available.''')
|
||||
headless_optional_group.add_argument(options[9], type=str, default=None, choices=tts_engine_list_keys+tts_engine_list_values, help=f'''(Optional) Preferred TTS engine (available are: {tts_engine_list_keys+tts_engine_list_values}.
|
||||
Default depends on the selected language. The tts engine should be compatible with the chosen language''')
|
||||
headless_optional_group.add_argument(options[10], type=str, default=None, help=f'''(Optional) Path to the custom model zip file cntaining mandatory model files.
|
||||
Please refer to ./lib/models.py''')
|
||||
headless_optional_group.add_argument(options[11], type=str, default=default_fine_tuned, help='''(Optional) Fine tuned model path. Default is builtin model.''')
|
||||
headless_optional_group.add_argument(options[12], type=str, default=default_output_format, help=f'''(Optional) Output audio format. Default is set in ./lib/conf.py''')
|
||||
headless_optional_group.add_argument(options[13], type=float, default=None, help=f"""(xtts only, optional) Temperature for the model.
|
||||
headless_optional_group.add_argument(options[13], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['temperature'], help=f"""(xtts only, optional) Temperature for the model.
|
||||
Default to config.json model. Higher temperatures lead to more creative outputs.""")
|
||||
headless_optional_group.add_argument(options[14], type=float, default=None, help=f"""(xtts only, optional) A length penalty applied to the autoregressive decoder.
|
||||
headless_optional_group.add_argument(options[14], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['length_penalty'], help=f"""(xtts only, optional) A length penalty applied to the autoregressive decoder.
|
||||
Default to config.json model. Not applied to custom models.""")
|
||||
headless_optional_group.add_argument(options[15], type=int, default=None, help=f"""(xtts only, optional) Controls how many alternative sequences the model explores. Must be equal or greater than length penalty.
|
||||
headless_optional_group.add_argument(options[15], type=int, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['num_beams'], help=f"""(xtts only, optional) Controls how many alternative sequences the model explores. Must be equal or greater than length penalty.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[16], type=float, default=None, help=f"""(xtts only, optional) A penalty that prevents the autoregressive decoder from repeating itself.
|
||||
headless_optional_group.add_argument(options[16], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['repetition_penalty'], help=f"""(xtts only, optional) A penalty that prevents the autoregressive decoder from repeating itself.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[17], type=int, default=None, help=f"""(xtts only, optional) Top-k sampling.
|
||||
headless_optional_group.add_argument(options[17], type=int, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['top_k'], help=f"""(xtts only, optional) Top-k sampling.
|
||||
Lower values mean more likely outputs and increased audio generation speed.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[18], type=float, default=None, help=f"""(xtts only, optional) Top-p sampling.
|
||||
headless_optional_group.add_argument(options[18], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['top_p'], help=f"""(xtts only, optional) Top-p sampling.
|
||||
Lower values mean more likely outputs and increased audio generation speed. Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[19], type=float, default=None, help=f"""(xtts only, optional) Speed factor for the speech generation.
|
||||
headless_optional_group.add_argument(options[19], type=float, default=default_engine_settings[TTS_ENGINES['XTTSv2']]['speed'], help=f"""(xtts only, optional) Speed factor for the speech generation.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[20], action='store_true', help=f"""(xtts only, optional) Enable TTS text splitting. This option is known to not be very efficient.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[21], type=float, default=None, help=f"""(bark only, optional) Text Temperature for the model.
|
||||
Default to {default_engine_settings[TTS_ENGINES['BARK']]['text_temp']}. Higher temperatures lead to more creative outputs.""")
|
||||
headless_optional_group.add_argument(options[22], type=float, default=None, help=f"""(bark only, optional) Waveform Temperature for the model.
|
||||
Default to {default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp']}. Higher temperatures lead to more creative outputs.""")
|
||||
headless_optional_group.add_argument(options[21], type=float, default=default_engine_settings[TTS_ENGINES['BARK']]['text_temp'], help=f"""(bark only, optional) Text Temperature for the model.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[22], type=float, default=default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp'], help=f"""(bark only, optional) Waveform Temperature for the model.
|
||||
Default to config.json model.""")
|
||||
headless_optional_group.add_argument(options[23], type=str, help=f'''(Optional) Path to the output directory. Default is set in ./lib/conf.py''')
|
||||
headless_optional_group.add_argument(options[24], action='version', version=f'ebook2audiobook version {prog_version}', help='''Show the version of the script and exit''')
|
||||
headless_optional_group.add_argument(options[25], action='store_true', help=argparse.SUPPRESS)
|
||||
@@ -304,17 +357,30 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
|
||||
from lib.functions import SessionContext, convert_ebook_batch, convert_ebook, web_interface
|
||||
ctx = SessionContext()
|
||||
import lib.functions as f
|
||||
f.context = f.SessionContext() if f.context is None else f.context
|
||||
f.context_tracker = f.SessionTracker() if f.context_tracker is None else f.context_tracker
|
||||
f.active_sessions = set() if f.active_sessions is None else f.active_sessions
|
||||
# Conditions based on the --headless flag
|
||||
if args['headless']:
|
||||
args['is_gui_process'] = False
|
||||
args['chapters_control'] = False
|
||||
args['chapters_preview'] = False
|
||||
args['event'] = ''
|
||||
args['audiobooks_dir'] = os.path.abspath(args['output_dir']) if args['output_dir'] else audiobooks_cli_dir
|
||||
args['device'] = 'cuda' if args['device'] == 'gpu' else args['device']
|
||||
args['device'] = devices['CUDA'] if args['device'] == devices['CUDA'] else args['device']
|
||||
args['tts_engine'] = TTS_ENGINES[args['tts_engine']] if args['tts_engine'] in TTS_ENGINES.keys() else args['tts_engine'] if args['tts_engine'] in TTS_ENGINES.values() else None
|
||||
args['output_split'] = default_output_split
|
||||
args['output_split_hours'] = default_output_split_hours
|
||||
args['xtts_temperature'] = args['temperature']
|
||||
args['xtts_length_penalty'] = args['length_penalty']
|
||||
args['xtts_num_beams'] = args['num_beams']
|
||||
args['xtts_repetition_penalty'] = args['repetition_penalty']
|
||||
args['xtts_top_k'] = args['top_k']
|
||||
args['xtts_top_p'] = args['top_p']
|
||||
args['xtts_speed'] = args['speed']
|
||||
args['xtts_enable_text_splitting'] = False
|
||||
args['bark_text_temp'] = args['text_temp']
|
||||
args['bark_waveform_temp'] = args['waveform_temp']
|
||||
engine_setting_keys = {engine: list(settings.keys()) for engine, settings in default_engine_settings.items()}
|
||||
valid_model_keys = engine_setting_keys.get(args['tts_engine'], [])
|
||||
renamed_args = {}
|
||||
@@ -349,7 +415,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
if any(file.endswith(ext) for ext in ebook_formats):
|
||||
full_path = os.path.abspath(os.path.join(args['ebooks_dir'], file))
|
||||
args['ebook_list'].append(full_path)
|
||||
progress_status, passed = convert_ebook_batch(args, ctx)
|
||||
progress_status, passed = f.convert_ebook_batch(args)
|
||||
if passed is False:
|
||||
error = f'Conversion failed: {progress_status}'
|
||||
print(error)
|
||||
@@ -360,7 +426,7 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
error = f'Error: The provided --ebook "{args["ebook"]}" does not exist.'
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
progress_status, passed = convert_ebook(args, ctx)
|
||||
progress_status, passed = f.convert_ebook(args)
|
||||
if passed is False:
|
||||
error = f'Conversion failed: {progress_status}'
|
||||
print(error)
|
||||
@@ -375,10 +441,37 @@ Tip: to add of silence (1.4 seconds) into your text just use "###" or "[pause]".
|
||||
allowed_arguments = {'--share', '--script_mode'}
|
||||
passed_args_set = {arg for arg in passed_arguments if arg.startswith('--')}
|
||||
if passed_args_set.issubset(allowed_arguments):
|
||||
web_interface(args, ctx)
|
||||
try:
|
||||
#script_name = os.path.basename(sys.argv[0])
|
||||
#kill_previous_instances(script_name)
|
||||
app = f.build_interface(args)
|
||||
if app is not None:
|
||||
app.queue(
|
||||
default_concurrency_limit=interface_concurrency_limit
|
||||
).launch(
|
||||
debug=bool(int(os.environ.get('GRADIO_DEBUG', '0'))),
|
||||
show_error=debug_mode, favicon_path='./favicon.ico',
|
||||
server_name=interface_host,
|
||||
server_port=interface_port,
|
||||
share= args['share'],
|
||||
max_file_size=max_upload_size
|
||||
)
|
||||
except OSError as e:
|
||||
error = f'Connection error: {e}'
|
||||
f.alert_exception(error, None)
|
||||
except socket.error as e:
|
||||
error = f'Socket error: {e}'
|
||||
f.alert_exception(error, None)
|
||||
except KeyboardInterrupt:
|
||||
error = 'Server interrupted by user. Shutting down...'
|
||||
f.alert_exception(error, None)
|
||||
except Exception as e:
|
||||
error = f'An unexpected error occurred: {e}'
|
||||
f.alert_exception(error, None)
|
||||
else:
|
||||
error = 'Error: In non-headless mode, no option or only --share can be passed'
|
||||
error = 'Error: In GUI mode, no option or only --share can be passed'
|
||||
print(error)
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
@@ -17,7 +17,7 @@ set "PYTHONUTF8=1"
|
||||
set "PYTHONIOENCODING=utf-8"
|
||||
set "CURRENT_ENV="
|
||||
|
||||
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox"
|
||||
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox tesseract"
|
||||
|
||||
set "TMP=%SCRIPT_DIR%\tmp"
|
||||
set "TEMP=%SCRIPT_DIR%\tmp"
|
||||
@@ -78,11 +78,11 @@ exit /b
|
||||
:conda_check
|
||||
where /Q conda
|
||||
if %errorlevel% neq 0 (
|
||||
call rmdir /s /q "%CONDA_INSTALL_DIR%" 2>nul
|
||||
echo Miniforge3 is not installed.
|
||||
echo Miniforge3 is not installed.
|
||||
set "CONDA_CHECK=1"
|
||||
goto :install_components
|
||||
)
|
||||
|
||||
:: Check if running in a Conda environment
|
||||
if defined CONDA_DEFAULT_ENV (
|
||||
set "CURRENT_ENV=%CONDA_PREFIX%"
|
||||
@@ -158,7 +158,9 @@ if not "%CONDA_CHECK%"=="0" (
|
||||
echo Conda installation failed.
|
||||
goto :failed
|
||||
)
|
||||
call conda config --set auto_activate_base false
|
||||
if not exist "%USERPROFILE%\.condarc" (
|
||||
call conda config --set auto_activate false
|
||||
)
|
||||
call conda update conda -y
|
||||
del "%CONDA_INSTALLER%"
|
||||
set "CONDA_CHECK=0"
|
||||
@@ -169,26 +171,66 @@ if not "%CONDA_CHECK%"=="0" (
|
||||
:: Install missing packages one by one
|
||||
if not "%PROGRAMS_CHECK%"=="0" (
|
||||
echo Installing missing programs...
|
||||
if "%SCOOP_CHECK%"=="0" (
|
||||
call scoop bucket add muggle b https://github.com/hu3rror/scoop-muggle.git
|
||||
call scoop bucket add extras
|
||||
call scoop bucket add versions
|
||||
)
|
||||
for %%p in (%missing_prog_array%) do (
|
||||
call scoop install %%p
|
||||
set "prog=%%p"
|
||||
if "%%p"=="nodejs" (
|
||||
set "prog=node"
|
||||
)
|
||||
if "%%p"=="calibre-normal" set "prog=calibre"
|
||||
where /Q !prog!
|
||||
if !errorlevel! neq 0 (
|
||||
echo %%p installation failed...
|
||||
goto :failed
|
||||
)
|
||||
if "%SCOOP_CHECK%"=="0" (
|
||||
call scoop bucket add muggle b https://github.com/hu3rror/scoop-muggle.git
|
||||
call scoop bucket add extras
|
||||
call scoop bucket add versions
|
||||
)
|
||||
call powershell -command "[System.Environment]::SetEnvironmentVariable('Path', [System.Environment]::GetEnvironmentVariable('Path', 'User') + '%SCOOP_SHIMS%;%SCOOP_APPS%;%CONDA_PATH%;%NODE_PATH%;', 'User')"
|
||||
set "SCOOP_CHECK=0"
|
||||
for %%p in (%missing_prog_array%) do (
|
||||
set "prog=%%p"
|
||||
call scoop install %%p
|
||||
if "%%p"=="tesseract" (
|
||||
where /Q !prog!
|
||||
if !errorlevel! equ 0 (
|
||||
set "syslang=%LANG%"
|
||||
if not defined syslang set "syslang=en"
|
||||
set "syslang=!syslang:~0,2!"
|
||||
set "tesslang=eng"
|
||||
if /I "!syslang!"=="fr" set "tesslang=fra"
|
||||
if /I "!syslang!"=="de" set "tesslang=deu"
|
||||
if /I "!syslang!"=="it" set "tesslang=ita"
|
||||
if /I "!syslang!"=="es" set "tesslang=spa"
|
||||
if /I "!syslang!"=="pt" set "tesslang=por"
|
||||
if /I "!syslang!"=="ar" set "tesslang=ara"
|
||||
if /I "!syslang!"=="tr" set "tesslang=tur"
|
||||
if /I "!syslang!"=="ru" set "tesslang=rus"
|
||||
if /I "!syslang!"=="bn" set "tesslang=ben"
|
||||
if /I "!syslang!"=="zh" set "tesslang=chi_sim"
|
||||
if /I "!syslang!"=="fa" set "tesslang=fas"
|
||||
if /I "!syslang!"=="hi" set "tesslang=hin"
|
||||
if /I "!syslang!"=="hu" set "tesslang=hun"
|
||||
if /I "!syslang!"=="id" set "tesslang=ind"
|
||||
if /I "!syslang!"=="jv" set "tesslang=jav"
|
||||
if /I "!syslang!"=="ja" set "tesslang=jpn"
|
||||
if /I "!syslang!"=="ko" set "tesslang=kor"
|
||||
if /I "!syslang!"=="pl" set "tesslang=pol"
|
||||
if /I "!syslang!"=="ta" set "tesslang=tam"
|
||||
if /I "!syslang!"=="te" set "tesslang=tel"
|
||||
if /I "!syslang!"=="yo" set "tesslang=yor"
|
||||
echo Detected system language: !syslang! → downloading OCR language: !tesslang!
|
||||
set "tessdata=%SCOOP_APPS%\tesseract\current\tessdata"
|
||||
if not exist "!tessdata!\!tesslang!.traineddata" (
|
||||
powershell -Command "Invoke-WebRequest -Uri https://github.com/tesseract-ocr/tessdata_best/raw/main/!tesslang!.traineddata -OutFile '!tessdata!\!tesslang!.traineddata'"
|
||||
)
|
||||
if exist "!tessdata!\!tesslang!.traineddata" (
|
||||
echo Tesseract OCR language !tesslang! installed in !tessdata!
|
||||
) else (
|
||||
echo Failed to install OCR language !tesslang!
|
||||
)
|
||||
)
|
||||
) else if "%%p"=="nodejs" (
|
||||
set "prog=node"
|
||||
) else if "%%p"=="calibre-normal" (
|
||||
set "prog=calibre"
|
||||
)
|
||||
where /Q !prog!
|
||||
if !errorlevel! neq 0 (
|
||||
echo %%p installation failed...
|
||||
goto :failed
|
||||
)
|
||||
)
|
||||
call powershell -Command "[System.Environment]::SetEnvironmentVariable('Path', [System.Environment]::GetEnvironmentVariable('Path', 'User') + ';%SCOOP_SHIMS%;%SCOOP_APPS%;%CONDA_PATH%;%NODE_PATH%', 'User')"
|
||||
set "SCOOP_CHECK=0"
|
||||
set "PROGRAMS_CHECK=0"
|
||||
set "missing_prog_array="
|
||||
)
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
torchvggish
|
||||
numpy<2
|
||||
num2words @ git+https://github.com/savoirfairelinux/num2words.git
|
||||
regex
|
||||
tqdm
|
||||
cutlet
|
||||
deep_translator
|
||||
docker
|
||||
ebooklib
|
||||
fastapi
|
||||
num2words
|
||||
argostranslate
|
||||
beautifulsoup4
|
||||
fugashi
|
||||
sudachipy
|
||||
sudachidict_core
|
||||
ray
|
||||
PyMuPDF
|
||||
pytesseract
|
||||
unidic
|
||||
pymupdf4llm
|
||||
translate
|
||||
hangul-romanize
|
||||
indic-nlp-library
|
||||
iso639-lang
|
||||
@@ -25,14 +23,14 @@ pypinyin
|
||||
pythainlp
|
||||
mutagen
|
||||
PyOpenGL
|
||||
nvidia-ml-py
|
||||
phonemizer-fork
|
||||
pydub
|
||||
pyannote-audio==3.4.0
|
||||
demucs==4.0.1
|
||||
gradio>=5.49
|
||||
transformers==4.51.3
|
||||
coqui-tts[languages]==0.26.0
|
||||
torch>=2.8.0,<2.9
|
||||
torchaudio>=2.8.0,<2.9
|
||||
torchvggish
|
||||
demucs
|
||||
deepspeed
|
||||
pyannote-audio<=3.4.0
|
||||
stanza<=1.10.1
|
||||
argostranslate<=1.10.0
|
||||
gradio>=5.49.1
|
||||
torch<=2.7.1
|
||||
torchaudio<=2.7.1
|
||||
coqui-tts[languages]==0.27.2
|
||||
@@ -1,14 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
if [[ "$OSTYPE" = "darwin"* && -z "$SWITCHED_TO_ZSH" && "$(ps -p $$ -o comm=)" != "zsh" ]]; then
|
||||
export SWITCHED_TO_ZSH=1
|
||||
exec env zsh "$0" "$@"
|
||||
export SWITCHED_TO_ZSH=1
|
||||
exec env zsh "$0" "$@"
|
||||
fi
|
||||
|
||||
unset SWITCHED_TO_ZSH
|
||||
#unset SWITCHED_TO_ZSH
|
||||
|
||||
ARCH=$(uname -m)
|
||||
PYTHON_VERSION="3.12"
|
||||
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || echo "3.12")
|
||||
MIN_PYTHON_VERSION="3.10"
|
||||
MAX_PYTHON_VERSION="3.13"
|
||||
|
||||
export PYTHONUTF8="1"
|
||||
export PYTHONIOENCODING="utf-8"
|
||||
@@ -48,7 +50,7 @@ SCRIPT_MODE="$NATIVE"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
WGET=$(which wget 2>/dev/null)
|
||||
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox")
|
||||
REQUIRED_PROGRAMS=("curl" "pkg-config" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox" "tesseract")
|
||||
PYTHON_ENV="python_env"
|
||||
CURRENT_ENV=""
|
||||
|
||||
@@ -60,9 +62,6 @@ fi
|
||||
if [[ "$OSTYPE" = "darwin"* ]]; then
|
||||
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-$(uname -m).sh"
|
||||
CONFIG_FILE="$HOME/.zshrc"
|
||||
if [[ "$ARCH" == "x86_64" ]]; then
|
||||
PYTHON_VERSION="3.11"
|
||||
fi
|
||||
elif [[ "$OSTYPE" = "linux"* ]]; then
|
||||
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
|
||||
CONFIG_FILE="$HOME/.bashrc"
|
||||
@@ -76,6 +75,20 @@ CONDA_ENV="$CONDA_INSTALL_DIR/etc/profile.d/conda.sh"
|
||||
export TMPDIR="$SCRIPT_DIR/.cache"
|
||||
export PATH="$CONDA_PATH:$PATH"
|
||||
|
||||
compare_versions() {
|
||||
local ver1=$1
|
||||
local ver2=$2
|
||||
# Pad each version to 3 parts
|
||||
IFS='.' read -r v1_major v1_minor <<<"$ver1"
|
||||
IFS='.' read -r v2_major v2_minor <<<"$ver2"
|
||||
|
||||
((v1_major < v2_major)) && return 1
|
||||
((v1_major > v2_major)) && return 2
|
||||
((v1_minor < v2_minor)) && return 1
|
||||
((v1_minor > v2_minor)) && return 2
|
||||
return 0
|
||||
}
|
||||
|
||||
# Check if the current script is run inside a docker container
|
||||
if [[ -n "$container" || -f /.dockerenv ]]; then
|
||||
SCRIPT_MODE="$FULL_DOCKER"
|
||||
@@ -123,14 +136,37 @@ else
|
||||
local programs=("$@")
|
||||
programs_missing=()
|
||||
for program in "${programs[@]}"; do
|
||||
bin="$program"
|
||||
if [ "$program" = "nodejs" ]; then
|
||||
bin="node"
|
||||
elif [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &> /dev/null; then
|
||||
bin="rustc"
|
||||
fi
|
||||
if [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &>/dev/null; then
|
||||
program="rustc"
|
||||
fi
|
||||
bin="rustc"
|
||||
fi
|
||||
if [ "$program" = "tesseract" ]; then
|
||||
if command -v brew &> /dev/null; then
|
||||
program="tesseract"
|
||||
elif command -v emerge &> /dev/null; then
|
||||
program="tesseract"
|
||||
elif command -v dnf &> /dev/null; then
|
||||
program="tesseract"
|
||||
elif command -v yum &> /dev/null; then
|
||||
program="tesseract"
|
||||
elif command -v zypper &> /dev/null; then
|
||||
program="tesseract-ocr"
|
||||
elif command -v pacman &> /dev/null; then
|
||||
program="tesseract"
|
||||
elif command -v apt-get &> /dev/null; then
|
||||
program="tesseract-ocr"
|
||||
elif command -v apk &> /dev/null; then
|
||||
program="tesseract-ocr"
|
||||
else
|
||||
echo "Cannot recognize your applications package manager. Please install the required applications manually."
|
||||
return 1
|
||||
fi
|
||||
else
|
||||
bin="$program"
|
||||
fi
|
||||
if ! command -v "$bin" >/dev/null 2>&1; then
|
||||
echo -e "\e[33m$program is not installed.\e[0m"
|
||||
@@ -156,8 +192,9 @@ else
|
||||
if ! command -v brew &> /dev/null; then
|
||||
echo -e "\e[33mHomebrew is not installed. Installing Homebrew...\e[0m"
|
||||
/usr/bin/env bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
|
||||
echo 'eval "$(/opt/homebrew/bin/brew shellenv)"' >> $HOME/.zprofile
|
||||
eval "$(/opt/homebrew/bin/brew shellenv)"
|
||||
echo >> $HOME/.zprofile
|
||||
echo 'eval "$(/usr/local/bin/brew shellenv)"' >> $HOME/.zprofile
|
||||
eval "$(/usr/local/bin/brew shellenv)"
|
||||
fi
|
||||
else
|
||||
SUDO="sudo"
|
||||
@@ -175,7 +212,7 @@ else
|
||||
PACK_MGR="zypper install"
|
||||
PACK_MGR_OPTIONS="-y"
|
||||
elif command -v pacman &> /dev/null; then
|
||||
PACK_MGR="pacman -Sy"
|
||||
PACK_MGR="pacman -Sy --noconfirm"
|
||||
elif command -v apt-get &> /dev/null; then
|
||||
$SUDO apt-get update
|
||||
PACK_MGR="apt-get install"
|
||||
@@ -186,7 +223,6 @@ else
|
||||
echo "Cannot recognize your applications package manager. Please install the required applications manually."
|
||||
return 1
|
||||
fi
|
||||
|
||||
fi
|
||||
if [ -z "$WGET" ]; then
|
||||
echo -e "\e[33m wget is missing! trying to install it... \e[0m"
|
||||
@@ -200,9 +236,9 @@ else
|
||||
fi
|
||||
fi
|
||||
for program in "${programs_missing[@]}"; do
|
||||
if [ "$program" = "calibre" ];then
|
||||
if [ "$program" = "calibre" ]; then
|
||||
# avoid conflict with calibre builtin lxml
|
||||
pip uninstall lxml -y 2>/dev/null
|
||||
#pip uninstall lxml -y 2>/dev/null
|
||||
echo -e "\e[33mInstalling Calibre...\e[0m"
|
||||
if [[ "$OSTYPE" = "darwin"* ]]; then
|
||||
eval "$PACK_MGR --cask calibre"
|
||||
@@ -219,21 +255,75 @@ else
|
||||
echo "$program installation failed."
|
||||
fi
|
||||
fi
|
||||
elif [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &> /dev/null; then
|
||||
app="rustc"
|
||||
else
|
||||
app="$program"
|
||||
fi
|
||||
elif [[ "$program" = "rust" || "$program" = "rustc" ]]; then
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
source $HOME/.cargo/env
|
||||
if command -v $app &>/dev/null; then
|
||||
if command -v $program &>/dev/null; then
|
||||
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
|
||||
else
|
||||
echo "$program installation failed."
|
||||
fi
|
||||
elif [[ "$program" = "tesseract" || "$program" = "tesseract-ocr" ]]; then
|
||||
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
|
||||
if command -v $program >/dev/null 2>&1; then
|
||||
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
|
||||
sys_lang=$(echo "${LANG:-en}" | cut -d_ -f1 | tr '[:upper:]' '[:lower:]')
|
||||
case "$sys_lang" in
|
||||
en) tess_lang="eng" ;;
|
||||
fr) tess_lang="fra" ;;
|
||||
de) tess_lang="deu" ;;
|
||||
it) tess_lang="ita" ;;
|
||||
es) tess_lang="spa" ;;
|
||||
pt) tess_lang="por" ;;
|
||||
ar) tess_lang="ara" ;;
|
||||
tr) tess_lang="tur" ;;
|
||||
ru) tess_lang="rus" ;;
|
||||
bn) tess_lang="ben" ;;
|
||||
zh) tess_lang="chi_sim" ;;
|
||||
fa) tess_lang="fas" ;;
|
||||
hi) tess_lang="hin" ;;
|
||||
hu) tess_lang="hun" ;;
|
||||
id) tess_lang="ind" ;;
|
||||
jv) tess_lang="jav" ;;
|
||||
ja) tess_lang="jpn" ;;
|
||||
ko) tess_lang="kor" ;;
|
||||
pl) tess_lang="pol" ;;
|
||||
ta) tess_lang="tam" ;;
|
||||
te) tess_lang="tel" ;;
|
||||
yo) tess_lang="yor" ;;
|
||||
*) tess_lang="eng" ;;
|
||||
esac
|
||||
echo "Detected system language: $sys_lang → installing Tesseract OCR language: $tess_lang"
|
||||
langpack=""
|
||||
if command -v brew &> /dev/null; then
|
||||
langpack="tesseract-lang-$tess_lang"
|
||||
elif command -v apt-get &>/dev/null; then
|
||||
langpack="tesseract-ocr-$tess_lang"
|
||||
elif command -v dnf &>/dev/null || command -v yum &>/dev/null; then
|
||||
langpack="tesseract-langpack-$tess_lang"
|
||||
elif command -v zypper &>/dev/null; then
|
||||
langpack="tesseract-ocr-$tess_lang"
|
||||
elif command -v pacman &>/dev/null; then
|
||||
langpack="tesseract-data-$tess_lang"
|
||||
elif command -v apk &>/dev/null; then
|
||||
langpack="tesseract-ocr-$tess_lang"
|
||||
else
|
||||
echo "Cannot recognize your applications package manager. Please install the required applications manually."
|
||||
return 1
|
||||
fi
|
||||
if [ -n "$langpack" ]; then
|
||||
eval "$SUDO $PACK_MGR $langpack $PACK_MGR_OPTIONS"
|
||||
if tesseract --list-langs | grep -q "$tess_lang"; then
|
||||
echo "Tesseract OCR language '$tess_lang' successfully installed."
|
||||
else
|
||||
echo "Tesseract OCR language '$tess_lang' not installed properly."
|
||||
fi
|
||||
fi
|
||||
else
|
||||
echo "$program installation failed."
|
||||
fi
|
||||
else
|
||||
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
|
||||
eval "$SUDO $PACK_MGR $program $PACK_MGR_OPTIONS"
|
||||
if command -v $program >/dev/null 2>&1; then
|
||||
echo -e "\e[32m===============>>> $program is installed! <<===============\e[0m"
|
||||
else
|
||||
@@ -251,18 +341,25 @@ else
|
||||
function conda_check {
|
||||
if ! command -v conda &> /dev/null || [ ! -f "$CONDA_ENV" ]; then
|
||||
echo -e "\e[33mDownloading Miniforge3 installer...\e[0m"
|
||||
if [[ "$OSTYPE" = "darwin"* ]]; then
|
||||
if [[ "$OSTYPE" == darwin* ]]; then
|
||||
curl -fsSLo "$CONDA_INSTALLER" "$CONDA_URL"
|
||||
shell_name="zsh"
|
||||
else
|
||||
wget -O "$CONDA_INSTALLER" "$CONDA_URL"
|
||||
shell_name="bash"
|
||||
fi
|
||||
if [[ -f "$CONDA_INSTALLER" ]]; then
|
||||
echo -e "\e[33mInstalling Miniforge3...\e[0m"
|
||||
bash "$CONDA_INSTALLER" -b -u -p "$CONDA_INSTALL_DIR"
|
||||
rm -f "$CONDA_INSTALLER"
|
||||
if [[ -f "$CONDA_INSTALL_DIR/bin/conda" ]]; then
|
||||
$CONDA_INSTALL_DIR/bin/conda config --set auto_activate_base false
|
||||
source $CONDA_ENV
|
||||
if [ ! -f "$HOME/.condarc" ]; then
|
||||
$CONDA_INSTALL_DIR/bin/conda config --set auto_activate false
|
||||
fi
|
||||
[ -f "$CONFIG_FILE" ] || touch "$CONFIG_FILE"
|
||||
grep -qxF 'export PATH="$HOME/Miniforge3/bin:$PATH"' "$CONFIG_FILE" || echo 'export PATH="$HOME/Miniforge3/bin:$PATH"' >> "$CONFIG_FILE"
|
||||
source "$CONFIG_FILE"
|
||||
conda init "$shell_name"
|
||||
echo -e "\e[32m===============>>> conda is installed! <<===============\e[0m"
|
||||
else
|
||||
echo -e "\e[31mconda installation failed.\e[0m"
|
||||
@@ -275,8 +372,20 @@ else
|
||||
fi
|
||||
fi
|
||||
if [[ ! -d "$SCRIPT_DIR/$PYTHON_ENV" ]]; then
|
||||
if [[ "$OSTYPE" = "darwin"* && "$ARCH" = "x86_64" ]]; then
|
||||
PYTHON_VERSION="3.11"
|
||||
else
|
||||
compare_versions "$PYTHON_VERSION" "$MIN_PYTHON_VERSION"
|
||||
case $? in
|
||||
1) PYTHON_VERSION="$MIN_PYTHON_VERSION" ;;
|
||||
esac
|
||||
compare_versions "$PYTHON_VERSION" "$MAX_PYTHON_VERSION"
|
||||
case $? in
|
||||
2) PYTHON_VERSION="$MAX_PYTHON_VERSION" ;;
|
||||
esac
|
||||
fi
|
||||
# Use this condition to chmod writable folders once
|
||||
chmod -R 777 ./audiobooks ./tmp ./models
|
||||
chmod -R u+rwX,go+rX ./audiobooks ./tmp ./models
|
||||
conda create --prefix "$SCRIPT_DIR/$PYTHON_ENV" python=$PYTHON_VERSION -y
|
||||
conda init > /dev/null 2>&1
|
||||
source $CONDA_ENV
|
||||
@@ -286,7 +395,7 @@ else
|
||||
python -m pip install --upgrade --no-cache-dir --use-pep517 --progress-bar=on -r requirements.txt
|
||||
tts_version=$(python -c "import importlib.metadata; print(importlib.metadata.version('coqui-tts'))" 2>/dev/null)
|
||||
if [[ -n "$tts_version" ]]; then
|
||||
if [[ "$(printf '%s\n' "$tts_version" "0.26.1" | sort -V | tail -n1)" == "0.26.1" ]]; then
|
||||
if [[ "$(printf '%s\n' "$tts_version" "0.26.1" | sort -V | tail -n1)" = "0.26.1" ]]; then
|
||||
python -m pip install --no-cache-dir --use-pep517 --progress-bar=on 'transformers<=4.51.3'
|
||||
fi
|
||||
fi
|
||||
@@ -295,24 +404,132 @@ else
|
||||
return 0
|
||||
}
|
||||
|
||||
function create_macos_app_bundle {
|
||||
local APP_NAME="ebook2audiobook"
|
||||
local APP_BUNDLE="$HOME/Applications/$APP_NAME.app"
|
||||
local CONTENTS="$APP_BUNDLE/Contents"
|
||||
local MACOS="$CONTENTS/MacOS"
|
||||
local RESOURCES="$CONTENTS/Resources"
|
||||
local ICON_PATH="$SCRIPT_DIR/icons/mac/appIcon.icns"
|
||||
|
||||
echo "🚀 Creating $APP_NAME.app bundle..."
|
||||
mkdir -p "$MACOS" "$RESOURCES"
|
||||
|
||||
# Create the executable script inside the bundle
|
||||
cat > "$MACOS/$APP_NAME" << EOF
|
||||
#!/bin/bash
|
||||
|
||||
# Create a temporary script file to run in Terminal
|
||||
TEMP_SCRIPT=\$(mktemp)
|
||||
|
||||
cat > "\$TEMP_SCRIPT" << 'SCRIPT'
|
||||
#!/bin/bash
|
||||
cd "$SCRIPT_DIR"
|
||||
conda deactivate
|
||||
bash ebook2audiobook.sh
|
||||
|
||||
# Wait 10 seconds for the server to start
|
||||
sleep 10
|
||||
|
||||
# Open the browser
|
||||
open http://localhost:7860/
|
||||
|
||||
SCRIPT
|
||||
|
||||
chmod +x "\$TEMP_SCRIPT"
|
||||
|
||||
# Open Terminal and run the script
|
||||
open -a Terminal "\$TEMP_SCRIPT"
|
||||
|
||||
# Clean up the temp script after 60 seconds
|
||||
sleep 60
|
||||
rm "\$TEMP_SCRIPT"
|
||||
|
||||
EOF
|
||||
|
||||
chmod +x "$MACOS/$APP_NAME"
|
||||
|
||||
# Copy the icon to the bundle
|
||||
if [ -f "$ICON_PATH" ]; then
|
||||
cp "$ICON_PATH" "$RESOURCES/AppIcon.icns"
|
||||
echo "✓ Icon copied to bundle"
|
||||
else
|
||||
echo "⚠️ Warning: Icon not found at $ICON_PATH"
|
||||
fi
|
||||
|
||||
# Create the Info.plist file (required for macOS app bundles)
|
||||
cat > "$CONTENTS/Info.plist" << 'PLIST'
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleDevelopmentRegion</key>
|
||||
<string>en</string>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>ebook2audiobook</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>com.local.ebook2audiobook</string>
|
||||
<key>CFBundleInfoDictionaryVersion</key>
|
||||
<string>6.0</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>ebook2audiobook</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>APPL</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1</string>
|
||||
<key>LSMinimumSystemVersion</key>
|
||||
<string>10.9</string>
|
||||
<key>NSPrincipalClass</key>
|
||||
<string>NSApplication</string>
|
||||
<key>CFBundleIconFile</key>
|
||||
<string>AppIcon</string>
|
||||
</dict>
|
||||
</plist>
|
||||
PLIST
|
||||
|
||||
echo "✓ Info.plist created"
|
||||
|
||||
# Update macOS cache to recognize the new app
|
||||
touch "$APP_BUNDLE"
|
||||
|
||||
echo ""
|
||||
echo "✅ Application bundle created successfully!"
|
||||
echo "📍 Location: $APP_BUNDLE"
|
||||
echo ""
|
||||
}
|
||||
|
||||
function create_linux_app_launcher {
|
||||
# Linux desktop entry creation goes here
|
||||
return 0
|
||||
}
|
||||
|
||||
function create_app_bundle {
|
||||
if [[ "$OSTYPE" = "darwin"* ]]; then
|
||||
create_macos_app_bundle
|
||||
elif [[ "$OSTYPE" = "linux"* ]]; then
|
||||
create_linux_app_launcher
|
||||
fi
|
||||
}
|
||||
|
||||
if [ "$SCRIPT_MODE" = "$FULL_DOCKER" ]; then
|
||||
python app.py --script_mode "$SCRIPT_MODE" "${ARGS[@]}"
|
||||
conda deactivate
|
||||
conda deactivate
|
||||
elif [ "$SCRIPT_MODE" = "$NATIVE" ]; then
|
||||
pass=true
|
||||
if [ "$SCRIPT_MODE" = "$NATIVE" ]; then
|
||||
if ! required_programs_check "${REQUIRED_PROGRAMS[@]}"; then
|
||||
if ! install_programs; then
|
||||
pass=false
|
||||
fi
|
||||
pass=true
|
||||
if ! required_programs_check "${REQUIRED_PROGRAMS[@]}"; then
|
||||
if ! install_programs; then
|
||||
pass=false
|
||||
fi
|
||||
fi
|
||||
if [ $pass = true ]; then
|
||||
if [ "$pass" = true ]; then
|
||||
if conda_check; then
|
||||
conda init > /dev/null 2>&1
|
||||
source $CONDA_ENV
|
||||
conda activate "$SCRIPT_DIR/$PYTHON_ENV"
|
||||
create_app_bundle
|
||||
python app.py --script_mode "$SCRIPT_MODE" "${ARGS[@]}"
|
||||
conda deactivate
|
||||
conda deactivate
|
||||
@@ -323,4 +540,4 @@ else
|
||||
fi
|
||||
fi
|
||||
|
||||
exit 0
|
||||
exit 0
|
||||
@@ -1,15 +1,15 @@
|
||||
from .models import (
|
||||
TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
|
||||
default_engine_settings, default_vc_model, default_voice_detection_model,
|
||||
loaded_tts, max_custom_model, max_custom_voices,
|
||||
max_tts_in_memory, max_upload_size, models, os, voices_dir
|
||||
loaded_tts, xtts_builtin_speakers_list, max_custom_model, max_custom_voices,
|
||||
max_upload_size, models, os, voices_dir
|
||||
)
|
||||
|
||||
from .conf import (
|
||||
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
|
||||
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
|
||||
default_audio_proc_format, default_device, default_gpu_wiki,
|
||||
default_chapters_control, default_output_format, device_list, ebook_formats,
|
||||
default_chapters_preview, default_output_format, devices, ebook_formats,
|
||||
ebooks_dir, interface_component_options, interface_concurrency_limit,
|
||||
interface_host, interface_port, interface_shared_tmp_expire,
|
||||
max_python_version, min_python_version, models_dir, os,
|
||||
@@ -31,15 +31,15 @@ __all__ = [
|
||||
# from models
|
||||
"TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
|
||||
"default_engine_settings", "default_vc_model", "default_voice_detection_model",
|
||||
"loaded_tts", "max_custom_model",
|
||||
"max_custom_voices", "max_tts_in_memory", "max_upload_size",
|
||||
"loaded_tts", "xtts_builtin_speakers_list", "max_custom_model",
|
||||
"max_custom_voices", "max_upload_size",
|
||||
"models", "os", "voices_dir",
|
||||
|
||||
# from conf
|
||||
"FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
|
||||
"audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
|
||||
"default_audio_proc_format", "default_device", "default_gpu_wiki",
|
||||
"default_chapters_control", "default_output_format", "device_list", "ebook_formats",
|
||||
"default_chapters_preview", "default_output_format", "devices", "ebook_formats",
|
||||
"ebooks_dir", "interface_component_options", "interface_concurrency_limit",
|
||||
"interface_host", "interface_port", "interface_shared_tmp_expire",
|
||||
"max_python_version", "min_python_version", "models_dir", "os",
|
||||
|
||||
@@ -3,7 +3,6 @@ import tempfile
|
||||
import argostranslate.package
|
||||
import argostranslate.translate
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from iso639 import Lang
|
||||
from lib.conf import models_dir
|
||||
from lib.lang import language_mapping
|
||||
@@ -50,7 +49,7 @@ class ArgosTranslator:
|
||||
]
|
||||
return language_translate_options
|
||||
|
||||
def get_all_target_packages(self,source_lang:str)->list[Any]:
|
||||
def get_all_target_packages(self,source_lang:str)->list:
|
||||
available_packages=argostranslate.package.get_available_packages()
|
||||
return [pkg for pkg in available_packages if pkg.from_code==source_lang]
|
||||
|
||||
@@ -64,7 +63,7 @@ class ArgosTranslator:
|
||||
error=f'is_package_installed() error: {e}'
|
||||
return False
|
||||
|
||||
def download_and_install_argos_package(self,source_lang:str,target_lang:str)->tuple[Optional[str],bool]:
|
||||
def download_and_install_argos_package(self,source_lang:str,target_lang:str)->tuple[str|None,bool]:
|
||||
try:
|
||||
if self.is_package_installed(source_lang,target_lang):
|
||||
print(f"Package for translation from {source_lang} to {target_lang} is already installed.")
|
||||
@@ -77,6 +76,9 @@ class ArgosTranslator:
|
||||
target_package=pkg
|
||||
break
|
||||
if target_package:
|
||||
#tmp_dir = os.path.join(session['process_dir'], "tmp")
|
||||
#os.makedirs(tmp_dir, exist_ok=True)
|
||||
#with tempfile.TemporaryDirectory(dir=tmp_dir) as tmpdirname:
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
print(f"Downloading package for translation from {source_lang} to {target_lang}...")
|
||||
package_path=target_package.download()
|
||||
@@ -97,7 +99,7 @@ class ArgosTranslator:
|
||||
error=f'AgrosTranslator.process() error: {e}'
|
||||
return error,False
|
||||
|
||||
def start(self,source_lang:str,target_lang:str)->tuple[Optional[str],bool]:
|
||||
def start(self,source_lang:str,target_lang:str)->tuple[str|None,bool]:
|
||||
try:
|
||||
if self.neural_machine!="argostranslate":
|
||||
error=f"Neural machine '{self.neural_machine}' is not supported."
|
||||
|
||||
@@ -2,7 +2,6 @@ import os
|
||||
import numpy as np
|
||||
import librosa
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from pyannote.audio import Model
|
||||
from pyannote.audio.pipelines import VoiceActivityDetection
|
||||
from lib.conf import tts_dir
|
||||
|
||||
@@ -2,7 +2,7 @@ import time
|
||||
import logging
|
||||
|
||||
from queue import Queue, Empty
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from typing import Any
|
||||
|
||||
|
||||
class RedirectConsole:
|
||||
|
||||
@@ -1,42 +1,38 @@
|
||||
import subprocess, re, sys, gradio as gr
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
|
||||
class SubprocessPipe:
|
||||
def __init__(self,cmd:str,session:Any,total_duration:float):
|
||||
self.cmd=cmd
|
||||
self.session=session
|
||||
self.total_duration=total_duration
|
||||
self.process=None
|
||||
self._stop_requested=False
|
||||
self.progress_bar=None
|
||||
self.start()
|
||||
|
||||
def _on_start(self)->None:
|
||||
print('Export started')
|
||||
if self.session.get('is_gui_process'):
|
||||
def __init__(self,cmd:str, is_gui_process:bool, total_duration:float, msg:str='Processing'):
|
||||
self.cmd = cmd
|
||||
self.is_gui_process = is_gui_process
|
||||
self.total_duration = total_duration
|
||||
self.msg = msg
|
||||
self.process = None
|
||||
self._stop_requested = False
|
||||
self.progress_bar = None
|
||||
if self.is_gui_process:
|
||||
self.progress_bar=gr.Progress(track_tqdm=False)
|
||||
self.progress_bar(0.0,desc='Starting export...')
|
||||
self._run_process()
|
||||
|
||||
def _on_progress(self,percent:float)->None:
|
||||
sys.stdout.write(f'\rFinal Encoding: {percent:.1f}%')
|
||||
sys.stdout.write(f'\r{self.msg}: {percent:.1f}%')
|
||||
sys.stdout.flush()
|
||||
if self.session.get('is_gui_process'):
|
||||
self.progress_bar(percent/100,desc='Final Encoding')
|
||||
if self.is_gui_process:
|
||||
self.progress_bar(percent/100,desc=self.msg)
|
||||
|
||||
def _on_complete(self)->None:
|
||||
print('\nExport completed successfully')
|
||||
if self.session.get('is_gui_process'):
|
||||
self.progress_bar(1.0,desc='Export completed')
|
||||
msg = f"\n{self.msg} completed"
|
||||
print(msg)
|
||||
if self.is_gui_process:
|
||||
self.progress_bar(1.0,desc=msg)
|
||||
|
||||
def _on_error(self,err:Exception)->None:
|
||||
print(f'\nExport failed: {err}')
|
||||
if self.session.get('is_gui_process'):
|
||||
self.progress_bar(0.0,desc='Export failed')
|
||||
def _on_error(self, err:Exception)->None:
|
||||
error = f"\n{self.msg} failed: {err}"
|
||||
print(error)
|
||||
if self.is_gui_process:
|
||||
self.progress_bar(0.0,desc=error)
|
||||
|
||||
def start(self)->bool:
|
||||
def _run_process(self)->bool:
|
||||
try:
|
||||
self._on_start()
|
||||
self.process=subprocess.Popen(
|
||||
self.cmd,
|
||||
stdout=subprocess.DEVNULL,
|
||||
@@ -48,14 +44,11 @@ class SubprocessPipe:
|
||||
last_percent=0.0
|
||||
for raw_line in self.process.stderr:
|
||||
line=raw_line.decode(errors='ignore')
|
||||
if self._stop_requested or self.session.get('cancellation_requested'):
|
||||
print('\nExport cancelled')
|
||||
return self.stop()
|
||||
match=time_pattern.search(raw_line)
|
||||
if match and self.total_duration>0:
|
||||
if match and self.total_duration > 0:
|
||||
current_time=int(match.group(1))/1_000_000
|
||||
percent=min((current_time/self.total_duration)*100,100)
|
||||
if abs(percent-last_percent)>=0.5:
|
||||
if abs(percent-last_percent) >= 0.5:
|
||||
self._on_progress(percent)
|
||||
last_percent=percent
|
||||
elif b'progress=end' in raw_line:
|
||||
|
||||
@@ -1,24 +1,26 @@
|
||||
import hashlib
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import soundfile as sf
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
_original_load = torch.load
|
||||
|
||||
def patched_torch_load(*args, **kwargs):
|
||||
kwargs.setdefault("weights_only", False)
|
||||
return _original_load(*args, **kwargs)
|
||||
|
||||
torch.load = patched_torch_load
|
||||
|
||||
import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
|
||||
import numpy as np, regex as re, soundfile as sf, torchaudio
|
||||
import gc
|
||||
|
||||
from typing import Any
|
||||
from multiprocessing.managers import DictProxy
|
||||
from torch import Tensor
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
from lib import *
|
||||
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
|
||||
from lib.classes.tts_engines.common.utils import cleanup_garbage, unload_tts, append_sentence2vtt
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
|
||||
#import logging
|
||||
@@ -27,149 +29,266 @@ from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_aud
|
||||
lock = threading.Lock()
|
||||
|
||||
class Coqui:
|
||||
|
||||
def __init__(self, session):
|
||||
def __init__(self,session:DictProxy):
|
||||
try:
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
|
||||
self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
|
||||
self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
|
||||
self.npz_path = None
|
||||
self.npz_data = None
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params = {TTS_ENGINES['NEW_TTS']: {}}
|
||||
self.params={TTS_ENGINES['XXX']:{}
|
||||
self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
self._build()
|
||||
if not xtts_builtin_speakers_list:
|
||||
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XXX']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XXX']]['files'][4], cache_dir=self.cache_dir)
|
||||
xtts_builtin_speakers_list = torch.load(self.speakers_path)
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
if using_gpu and enough_vram:
|
||||
if devices['CUDA']['found'] or devices['ROCM']['found']:
|
||||
torch.cuda.set_per_process_memory_fraction(0.95)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = True
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
|
||||
|
||||
else:
|
||||
if devices['CUDA']['found'] or devices['ROCM']['found']:
|
||||
torch.cuda.set_per_process_memory_fraction(0.7)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
print(error)
|
||||
|
||||
def _load_api(self, key:str, model_path:str, device:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
unload_tts()
|
||||
from XXX import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
###########
|
||||
###### Load XXX api
|
||||
# engine =
|
||||
###########
|
||||
if engine:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _build(self):
|
||||
try:
|
||||
tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
if not tts:
|
||||
if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
except Exception as e:
|
||||
error = f'build() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _load_api(self, key, model_path, device):
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
if key in loaded_tts.keys():
|
||||
return loaded_tts[key]['engine']
|
||||
unload_tts(device, [self.tts_key, self.tts_vc_key])
|
||||
with lock:
|
||||
tts = NEW_TTS(model_path)
|
||||
if tts
|
||||
if device == 'cuda':
|
||||
NEW_TTS.WITH_CUDA
|
||||
else:
|
||||
NEW_TTS.WITHOUT_CUDA
|
||||
loaded_tts[key] = {"engine": tts, "config": None}
|
||||
msg = f'{model_path} Loaded!'
|
||||
print(msg)
|
||||
return tts
|
||||
else:
|
||||
error = 'TTS engine could not be created!'
|
||||
print(error)
|
||||
except Exception as e:
|
||||
error = f'_load_api() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _load_checkpoint(self, **kwargs):
|
||||
global lock
|
||||
try:
|
||||
key = kwargs.get('key')
|
||||
if key in loaded_tts.keys():
|
||||
return loaded_tts[key]['engine']
|
||||
tts_engine = kwargs.get('tts_engine')
|
||||
device = kwargs.get('device')
|
||||
unload_tts(device, [self.tts_key])
|
||||
with lock:
|
||||
checkpoint_dir = kwargs.get('checkpoint_dir')
|
||||
NEW_TTS.LOAD_CHECKPOINT(
|
||||
config,
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
eval=True
|
||||
)
|
||||
if tts:
|
||||
if device == 'cuda':
|
||||
NEW_TTS.WITH_CUDA
|
||||
else:
|
||||
NEW_TTS.WITHOUT_CUDA
|
||||
loaded_tts[key] = {"engine": tts, "config": config}
|
||||
msg = f'{tts_engine} Loaded!'
|
||||
print(msg)
|
||||
return tts
|
||||
else:
|
||||
error = 'TTS engine could not be created!'
|
||||
print(error)
|
||||
key = kwargs.get('key')
|
||||
device = kwargs.get('device')
|
||||
unload_tts()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
if engine_name == TTS_ENGINES['XXX']:
|
||||
from XXX import XXXConfig
|
||||
from XXX import XXXtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
raise FileNotFoundError(f"Missing or invalid checkpoint_path: {checkpoint_path}")
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
raise FileNotFoundError(f"Missing or invalid config_path: {config_path}")
|
||||
return False
|
||||
###########
|
||||
###### Load XXX checkpoint
|
||||
# engine =
|
||||
###########
|
||||
)
|
||||
if engine:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
return False
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_garbage()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
if self.session['tts_engine'] == TTS_ENGINES['XXX']:
|
||||
if self.session['custom_model'] is not None:
|
||||
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
|
||||
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
|
||||
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
|
||||
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
|
||||
if self.engine:
|
||||
self.session['model_cache'] = self.tts_key
|
||||
msg = f'TTS {key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_garbage()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model, self.session['device'])
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str, device:str)->str|bool:
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if(self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng'):
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
cleanup_garbage()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xtts_temperature": float,
|
||||
"xtts_length_penalty": float,
|
||||
"xtts_num_beams": int,
|
||||
"xtts_repetition_penalty": float,
|
||||
"xtts_top_k": int,
|
||||
"xtts_top_p": float,
|
||||
"xtts_speed": float,
|
||||
"xtts_enable_text_splitting": bool,
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
else:
|
||||
error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
|
||||
else:
|
||||
error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
return voice_path
|
||||
else:
|
||||
return voice_path
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _tensor_type(self, audio_data):
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data,torch.Tensor):
|
||||
return audio_data
|
||||
elif isinstance(audio_data, np.ndarray):
|
||||
elif isinstance(audio_data,np.ndarray):
|
||||
return torch.from_numpy(audio_data).float()
|
||||
elif isinstance(audio_data, list):
|
||||
return torch.tensor(audio_data, dtype=torch.float32)
|
||||
elif isinstance(audio_data,list):
|
||||
return torch.tensor(audio_data,dtype=torch.float32)
|
||||
else:
|
||||
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
||||
|
||||
def _get_resampler(self, orig_sr, target_sr):
|
||||
key = (orig_sr, target_sr)
|
||||
def _get_resampler(self,orig_sr:int,target_sr:int)->torchaudio.transforms.Resample:
|
||||
key=(orig_sr,target_sr)
|
||||
if key not in self.resampler_cache:
|
||||
self.resampler_cache[key] = torchaudio.transforms.Resample(
|
||||
orig_freq=orig_sr, new_freq=target_sr
|
||||
self.resampler_cache[key]=torchaudio.transforms.Resample(
|
||||
orig_freq = orig_sr,new_freq = target_sr
|
||||
)
|
||||
return self.resampler_cache[key]
|
||||
|
||||
def _resample_wav(self, wav_path, expected_sr):
|
||||
waveform, orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr == expected_sr and waveform.size(0) == 1:
|
||||
def _resample_wav(self,wav_path:str,expected_sr:int)->str:
|
||||
waveform,orig_sr = torchaudio.load(wav_path)
|
||||
if orig_sr==expected_sr and waveform.size(0)==1:
|
||||
return wav_path
|
||||
if waveform.size(0) > 1:
|
||||
waveform = waveform.mean(dim=0, keepdim=True)
|
||||
if orig_sr != expected_sr:
|
||||
resampler = self._get_resampler(orig_sr, expected_sr)
|
||||
if waveform.size(0)>1:
|
||||
waveform = waveform.mean(dim=0,keepdim=True)
|
||||
if orig_sr!=expected_sr:
|
||||
resampler = self._get_resampler(orig_sr,expected_sr)
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(tmp_dir, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
|
||||
def convert(self, sentence_number, sentence):
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
global xtts_builtin_speakers_list
|
||||
try:
|
||||
speaker = None
|
||||
audio_data = False
|
||||
trim_audio_buffer = 0.004
|
||||
audio_sentence = False
|
||||
settings = self.params[self.session['tts_engine']]
|
||||
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
|
||||
sentence = sentence.strip()
|
||||
settings['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
|
||||
@@ -177,56 +296,112 @@ class Coqui:
|
||||
)
|
||||
if settings['voice_path'] is not None:
|
||||
speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
|
||||
tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
if tts:
|
||||
if sentence[-1].isalnum():
|
||||
sentence = f'{sentence} —'
|
||||
if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and os.path.basename(settings['voice_path']) != 'ref.wav':
|
||||
self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker, self.session['device'])
|
||||
if not settings['voice_path']:
|
||||
msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
|
||||
print(msg)
|
||||
return False
|
||||
if self.engine:
|
||||
self.engine.to(self.session['device'])
|
||||
trim_audio_buffer = 0.004
|
||||
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
|
||||
if sentence == TTS_SML['break']:
|
||||
break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100))) # 0.4 to 0.7 seconds
|
||||
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
|
||||
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
|
||||
self.audio_segments.append(break_tensor.clone())
|
||||
return True
|
||||
elif sentence == TTS_SML['pause']:
|
||||
pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100))) # 1.0 to 1.8 seconds
|
||||
elif not sentence.replace('—', '').strip() or sentence == TTS_SML['pause']:
|
||||
silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
|
||||
pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 1.0 to 1.8 seconds
|
||||
self.audio_segments.append(pause_tensor.clone())
|
||||
return True
|
||||
else:
|
||||
if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
|
||||
audio_sentence = NEW_TTS.CONVERT() # audio_sentence must be torch.Tensor or (list, tuple) or np.ndarray
|
||||
if sentence[-1].isalnum():
|
||||
sentence = f'{sentence} —'
|
||||
elif sentence.endswith("'"):
|
||||
sentence = sentence[:-1]
|
||||
if self.session['tts_engine'] == TTS_ENGINES['XXX']:
|
||||
trim_audio_buffer = 0.008
|
||||
if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
|
||||
else:
|
||||
msg = 'Computing speaker latents...'
|
||||
print(msg)
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
|
||||
settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xxx_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
"xxx_temperature": float,
|
||||
"xxx_length_penalty": float,
|
||||
"xxx_num_beams": int,
|
||||
"xxx_repetition_penalty": float,
|
||||
"xxx_top_k": int,
|
||||
"xxx_top_p": float,
|
||||
"xxx_speed": float,
|
||||
"xxx_enable_text_splitting": bool
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = self.engine.inference(
|
||||
text=sentence.replace('.', ' —'),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=settings['gpt_cond_latent'],
|
||||
speaker_embedding=settings['speaker_embedding'],
|
||||
**fine_tuned_params
|
||||
)
|
||||
audio_sentence = result.get('wav')
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
if sentence[-1].isalnum() or sentence[-1] == '—':
|
||||
audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
|
||||
self.audio_segments.append(audio_tensor)
|
||||
if not re.search(r'\w$', sentence, flags=re.UNICODE):
|
||||
break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
|
||||
self.audio_segments.append(break_tensor.clone())
|
||||
if self.audio_segments:
|
||||
audio_tensor = torch.cat(self.audio_segments, dim=-1)
|
||||
start_time = self.sentences_total_time
|
||||
duration = audio_tensor.shape[-1] / settings['samplerate']
|
||||
end_time = start_time + duration
|
||||
self.sentences_total_time = end_time
|
||||
sentence_obj = {
|
||||
"start": start_time,
|
||||
"end": end_time,
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
else:
|
||||
error = f"Cannot create {final_sentence_file}"
|
||||
print(error)
|
||||
audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.001, trim_audio_buffer).unsqueeze(0)
|
||||
if audio_tensor is not None and audio_tensor.numel() > 0:
|
||||
self.audio_segments.append(audio_tensor)
|
||||
if not re.search(r'\w$', sentence, flags=re.UNICODE) and sentence[-1] != '—':
|
||||
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
|
||||
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
|
||||
self.audio_segments.append(break_tensor.clone())
|
||||
if self.audio_segments:
|
||||
audio_tensor = torch.cat(self.audio_segments, dim=-1)
|
||||
start_time = self.sentences_total_time
|
||||
duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
|
||||
end_time = start_time + duration
|
||||
self.sentences_total_time = end_time
|
||||
sentence_obj = {
|
||||
"start": start_time,
|
||||
"end": end_time,
|
||||
"text": sentence,
|
||||
"resume_check": self.sentence_idx
|
||||
}
|
||||
self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_garbage()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
else:
|
||||
error = f"Cannot create {final_sentence_file}"
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"audio_sentence not valide"
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"convert() error: {self.session['tts_engine']} is None"
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
error = f'Coquit.convert(): {e}'
|
||||
error = f'XXX.convert(): {e}'
|
||||
raise ValueError(e)
|
||||
return False
|
||||
return False
|
||||
@@ -2,13 +2,16 @@ import numpy as np
|
||||
import torch
|
||||
import subprocess
|
||||
import shutil
|
||||
import json
|
||||
|
||||
from torch import Tensor
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from typing import Any, Union
|
||||
from scipy.io import wavfile as wav
|
||||
from scipy.signal import find_peaks
|
||||
|
||||
def detect_gender(voice_path:str)->Optional[str]:
|
||||
from lib.classes.subprocess_pipe import SubprocessPipe
|
||||
|
||||
def detect_gender(voice_path:str)->str|None:
|
||||
try:
|
||||
samplerate, signal = wav.read(voice_path)
|
||||
# Ensure mono
|
||||
@@ -57,7 +60,29 @@ def trim_audio(audio_data: Union[list[float], Tensor], samplerate: int, silence_
|
||||
raise TypeError(error)
|
||||
return torch.tensor([], dtype=torch.float32)
|
||||
|
||||
def normalize_audio(input_file:str, output_file:str, samplerate:int)->bool:
|
||||
def get_audio_duration(filepath:str)->float:
|
||||
try:
|
||||
ffprobe_cmd = [
|
||||
shutil.which('ffprobe'),
|
||||
'-v', 'error',
|
||||
'-show_entries', 'format=duration',
|
||||
'-of', 'json',
|
||||
filepath
|
||||
]
|
||||
result = subprocess.run(ffprobe_cmd, capture_output=True, text=True)
|
||||
try:
|
||||
return float(json.loads(result.stdout)['format']['duration'])
|
||||
except Exception:
|
||||
return 0
|
||||
except subprocess.CalledProcessError as e:
|
||||
DependencyError(e)
|
||||
return 0
|
||||
except Exception as e:
|
||||
error = f"get_audio_duration() Error: Failed to process {txt_file} → {out_file}: {e}"
|
||||
print(error)
|
||||
return 0
|
||||
|
||||
def normalize_audio(input_file:str, output_file:str, samplerate:int, is_gui_process:bool)->bool:
|
||||
filter_complex = (
|
||||
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
|
||||
'afftdn=nf=-70,'
|
||||
@@ -70,24 +95,17 @@ def normalize_audio(input_file:str, output_file:str, samplerate:int)->bool:
|
||||
'equalizer=f=9000:t=q:w=2:g=-2,'
|
||||
'highpass=f=63[audio]'
|
||||
)
|
||||
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
|
||||
ffmpeg_cmd += [
|
||||
cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
|
||||
cmd += [
|
||||
'-filter_complex', filter_complex,
|
||||
'-map', '[audio]',
|
||||
'-ar', str(samplerate),
|
||||
'-y', output_file
|
||||
]
|
||||
try:
|
||||
subprocess.run(
|
||||
ffmpeg_cmd,
|
||||
env={},
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
encoding='utf-8',
|
||||
errors='ignore'
|
||||
)
|
||||
proc_pipe = SubprocessPipe(cmd, is_gui_process=is_gui_process, total_duration=get_audio_duration(input_file), msg='Normalize')
|
||||
if proc_pipe:
|
||||
return True
|
||||
except subprocess.CalledProcessError as e:
|
||||
else:
|
||||
error = f"normalize_audio() error: {input_file}: {e}"
|
||||
print(error)
|
||||
return False
|
||||
|
||||
@@ -1,31 +1,35 @@
|
||||
import os
|
||||
import gc
|
||||
import torch
|
||||
import regex as re
|
||||
import stanza
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
|
||||
from typing import Any, Union
|
||||
from lib.models import loaded_tts, TTS_ENGINES
|
||||
from lib.functions import context
|
||||
|
||||
def unload_tts(device:str, reserved_keys:Optional[list[str]] = None, tts_key:Optional[str] = None)->bool:
|
||||
def cleanup_garbage():
|
||||
gc.collect()
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.ipc_collect()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
def unload_tts()->None:
|
||||
try:
|
||||
if len(loaded_tts) >= max_tts_in_memory:
|
||||
if reserved_keys is None:
|
||||
reserved_keys = []
|
||||
if tts_key is not None:
|
||||
if tts_key in loaded_tts:
|
||||
del loaded_tts[tts_key]
|
||||
if device == "cuda":
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.ipc_collect()
|
||||
else:
|
||||
for key in list(loaded_tts.keys()):
|
||||
if key not in reserved_keys:
|
||||
del loaded_tts[key]
|
||||
return True
|
||||
active_models = {
|
||||
cache
|
||||
for session in context.sessions.values()
|
||||
for cache in (session.get('model_cache'), session.get('model_zs_cache'), session.get('stanza_cache'))
|
||||
if cache is not None
|
||||
}
|
||||
for key in list(loaded_tts.keys()):
|
||||
if key not in active_models:
|
||||
del loaded_tts[key]
|
||||
cleanup_garbage()
|
||||
except Exception as e:
|
||||
error = f"unload_tts() error: {e}"
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def append_sentence2vtt(sentence_obj:dict[str, Any], path:str)->Union[int, bool]:
|
||||
|
||||
|
||||
@@ -1,74 +1,170 @@
|
||||
import torch
|
||||
from typing import Any, Optional, Union, Callable
|
||||
|
||||
_original_load = torch.load
|
||||
|
||||
def patched_torch_load(*args, **kwargs):
|
||||
kwargs.setdefault("weights_only", False)
|
||||
return _original_load(*args, **kwargs)
|
||||
|
||||
|
||||
torch.load = patched_torch_load
|
||||
|
||||
import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
|
||||
import numpy as np, regex as re, soundfile as sf, torchaudio
|
||||
import gc
|
||||
|
||||
from typing import Any
|
||||
from multiprocessing.managers import DictProxy
|
||||
from torch import Tensor
|
||||
from huggingface_hub import hf_hub_download
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
from lib import *
|
||||
from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
|
||||
from lib.classes.tts_engines.common.utils import cleanup_garbage, unload_tts, append_sentence2vtt
|
||||
from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
|
||||
|
||||
#import logging
|
||||
#logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
lock = threading.Lock()
|
||||
xtts_builtin_speakers_list = None
|
||||
|
||||
class Coqui:
|
||||
def __init__(self,session:Any):
|
||||
def __init__(self,session:DictProxy):
|
||||
try:
|
||||
global xtts_builtin_speakers_list
|
||||
self.session = session
|
||||
self.cache_dir = tts_dir
|
||||
self.speakers_path = None
|
||||
self.tts = None
|
||||
self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
|
||||
self.tts_vc_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported()==True else False
|
||||
self.npz_path = None
|
||||
self.npz_data = None
|
||||
self.engine = None
|
||||
self.tts_zs_key = default_vc_model.rsplit('/',1)[-1]
|
||||
self.engine_zs = None
|
||||
self.pth_voice_file = None
|
||||
self.sentences_total_time = 0.0
|
||||
self.sentence_idx = 1
|
||||
self.params={TTS_ENGINES['XTTSv2']:{"latent_embedding":{}},TTS_ENGINES['BARK']:{},TTS_ENGINES['VITS']:{"semitones":{}},TTS_ENGINES['FAIRSEQ']:{"semitones":{}},TTS_ENGINES['TACOTRON2']:{"semitones":{}},TTS_ENGINES['YOURTTS']:{}}
|
||||
self.params[self.session['tts_engine']]['samplerate']=models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
|
||||
self.vtt_path = os.path.join(self.session['process_dir'],Path(self.session['final_name']).stem+'.vtt')
|
||||
self.resampler_cache={}
|
||||
self.audio_segments=[]
|
||||
self._build()
|
||||
self.resampler_cache = {}
|
||||
self.audio_segments = []
|
||||
if not xtts_builtin_speakers_list:
|
||||
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
|
||||
xtts_builtin_speakers_list = torch.load(self.speakers_path)
|
||||
using_gpu = self.session['device'] != devices['CPU']['proc']
|
||||
enough_vram = self.session['free_vram_gb'] > 4.0
|
||||
if using_gpu and enough_vram:
|
||||
if devices['CUDA']['found'] or devices['ROCM']['found']:
|
||||
torch.cuda.set_per_process_memory_fraction(0.95)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = True
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
|
||||
|
||||
else:
|
||||
if devices['CUDA']['found'] or devices['ROCM']['found']:
|
||||
torch.cuda.set_per_process_memory_fraction(0.7)
|
||||
torch.backends.cudnn.enabled = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_tf32 = False
|
||||
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
|
||||
self._load_engine()
|
||||
self._load_engine_zs()
|
||||
except Exception as e:
|
||||
error = f'__init__() error: {e}'
|
||||
print(error)
|
||||
|
||||
def _build(self)->bool:
|
||||
def _load_api(self, key:str, model_path:str, device:str)->Any:
|
||||
global lock
|
||||
try:
|
||||
global xtts_builtin_speakers_list
|
||||
load_zeroshot = True if self.session['tts_engine'] in [TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']] else False
|
||||
self.tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
if not self.tts:
|
||||
if xtts_builtin_speakers_list is None:
|
||||
self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
|
||||
xtts_builtin_speakers_list = torch.load(self.speakers_path)
|
||||
with lock:
|
||||
unload_tts()
|
||||
from TTS.api import TTS as TTSEngine
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine = TTSEngine(model_path)
|
||||
if engine:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->Any:
|
||||
global lock
|
||||
try:
|
||||
with lock:
|
||||
key = kwargs.get('key')
|
||||
device = kwargs.get('device')
|
||||
unload_tts()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
engine_name = kwargs.get('tts_engine', None)
|
||||
if engine_name == TTS_ENGINES['XTTSv2']:
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
if not checkpoint_path or not os.path.exists(checkpoint_path):
|
||||
raise FileNotFoundError(f"Missing or invalid checkpoint_path: {checkpoint_path}")
|
||||
return False
|
||||
if not config_path or not os.path.exists(config_path):
|
||||
raise FileNotFoundError(f"Missing or invalid config_path: {config_path}")
|
||||
return False
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
engine = Xtts.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
use_deepspeed = default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'] if self.session['device'] in [devices['CUDA']['proc'], devices['XPU']['proc'], devices['ROCM']['proc']] else False,
|
||||
eval = True
|
||||
)
|
||||
elif engine_name == TTS_ENGINES['BARK']:
|
||||
from TTS.tts.configs.bark_config import BarkConfig
|
||||
from TTS.tts.models.bark import Bark
|
||||
checkpoint_dir = kwargs.get('checkpoint_dir')
|
||||
if not checkpoint_dir or not os.path.exists(checkpoint_dir):
|
||||
raise FileNotFoundError(f"Missing or invalid checkpoint_dir: {checkpoint_dir}")
|
||||
return False
|
||||
config = BarkConfig()
|
||||
config.CACHE_DIR = self.cache_dir
|
||||
config.USE_SMALLER_MODELS = True if os.environ['SUNO_USE_SMALL_MODELS'] == 'True' else False
|
||||
engine = Bark.init_from_config(config)
|
||||
engine.load_checkpoint(
|
||||
config,
|
||||
checkpoint_dir = checkpoint_dir,
|
||||
eval = True
|
||||
)
|
||||
if engine:
|
||||
loaded_tts[key] = engine
|
||||
return engine
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
print(error)
|
||||
return None
|
||||
|
||||
def _load_engine(self)->None:
|
||||
try:
|
||||
msg = f"Loading TTS {self.tts_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_garbage()
|
||||
self.engine = loaded_tts.get(self.tts_key, False)
|
||||
if not self.engine:
|
||||
if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
|
||||
msg = f"Loading TTS {self.session['tts_engine']} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
if self.session['custom_model'] is not None:
|
||||
config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
|
||||
checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
|
||||
vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'],default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
|
||||
self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
|
||||
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
|
||||
else:
|
||||
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
if self.session['fine_tuned'] == 'internal':
|
||||
@@ -80,12 +176,11 @@ class Coqui:
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
else:
|
||||
hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
@@ -93,12 +188,11 @@ class Coqui:
|
||||
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
|
||||
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
|
||||
checkpoint_dir = os.path.dirname(text_model_path)
|
||||
self.tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
|
||||
self.engine = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
print(msg)
|
||||
else:
|
||||
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
|
||||
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
@@ -106,28 +200,23 @@ class Coqui:
|
||||
if sub is not None:
|
||||
self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
self.tts_key = model_path
|
||||
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
else:
|
||||
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
|
||||
print(msg)
|
||||
return False
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
|
||||
self.tts_key = model_path
|
||||
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
print(msg)
|
||||
else:
|
||||
iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
|
||||
sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
|
||||
@@ -138,126 +227,39 @@ class Coqui:
|
||||
sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
|
||||
if sub is not None:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
|
||||
msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
self.tts_key = model_path
|
||||
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
else:
|
||||
msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
|
||||
print(msg)
|
||||
return False
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
|
||||
if self.session['custom_model'] is not None:
|
||||
msg = f"{self.session['tts_engine']} custom model not implemented yet!"
|
||||
print(msg)
|
||||
return False
|
||||
else:
|
||||
model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
|
||||
self.tts = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
if load_zeroshot:
|
||||
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
|
||||
if not tts_vc:
|
||||
if self.session['voice'] is not None:
|
||||
msg = f"Loading TTS {self.tts_vc_key} zeroshot model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
tts_vc = self._load_api(self.tts_vc_key, default_vc_model, self.session['device'])
|
||||
return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
self.engine = self._load_api(self.tts_key, model_path, self.session['device'])
|
||||
if self.engine:
|
||||
self.session['model_cache'] = self.tts_key
|
||||
msg = f'TTS {key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f'build() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
error = f'_load_engine() error: {e}'
|
||||
|
||||
def _load_api(self, key: str, model_path: str, device: str) -> bool | Any:
|
||||
global lock
|
||||
def _load_engine_zs(self)->Any:
|
||||
try:
|
||||
if key in loaded_tts:
|
||||
print(f"Reusing cached TTS engine for key: {key}")
|
||||
tts = loaded_tts[key]['engine']
|
||||
return tts
|
||||
unload_tts(device, [self.tts_key, self.tts_vc_key])
|
||||
from TTS.api import TTS as CoquiAPI
|
||||
with lock:
|
||||
print(f"Loading Coqui model from: {model_path}")
|
||||
tts = CoquiAPI(model_path)
|
||||
if not tts:
|
||||
return False
|
||||
if device == "cuda" and torch.cuda.is_available():
|
||||
tts.cuda()
|
||||
elif device == "mps" and torch.backends.mps.is_available():
|
||||
tts.to(torch.device("mps"))
|
||||
else:
|
||||
tts.to(device)
|
||||
loaded_tts[key] = {"engine": tts, "config": None}
|
||||
msg = f"Model loaded successfully: {model_path} ({device})"
|
||||
print(msg)
|
||||
return tts
|
||||
msg = f"Loading ZeroShot {self.tts_zs_key} model, it takes a while, please be patient..."
|
||||
print(msg)
|
||||
cleanup_garbage()
|
||||
self.engine_zs = loaded_tts.get(self.tts_zs_key, False)
|
||||
if not self.engine_zs:
|
||||
self.engine_zs = self._load_api(self.tts_zs_key, default_vc_model, self.session['device'])
|
||||
if self.engine_zs:
|
||||
self.session['model_zs_cache'] = self.tts_zs_key
|
||||
msg = f'ZeroShot {key} Loaded!'
|
||||
except Exception as e:
|
||||
error = f"_load_api() error: {e}"
|
||||
print(error)
|
||||
return False
|
||||
|
||||
def _load_checkpoint(self,**kwargs:Any)->bool|Any:
|
||||
global lock
|
||||
try:
|
||||
key = kwargs.get('key')
|
||||
if key in loaded_tts.keys():
|
||||
return loaded_tts[key]['engine']
|
||||
tts_engine = kwargs.get('tts_engine')
|
||||
device = kwargs.get('device')
|
||||
unload_tts(device,[self.tts_key,self.tts_vc_key])
|
||||
with lock:
|
||||
if tts_engine==TTS_ENGINES['XTTSv2']:
|
||||
from TTS.tts.configs.xtts_config import XttsConfig
|
||||
from TTS.tts.models.xtts import Xtts
|
||||
checkpoint_path = kwargs.get('checkpoint_path')
|
||||
config_path = kwargs.get('config_path',None)
|
||||
vocab_path = kwargs.get('vocab_path',None)
|
||||
config = XttsConfig()
|
||||
config.models_dir = os.path.join("models","tts")
|
||||
config.load_json(config_path)
|
||||
tts = Xtts.init_from_config(config)
|
||||
tts.load_checkpoint(
|
||||
config,
|
||||
checkpoint_path = checkpoint_path,
|
||||
vocab_path = vocab_path,
|
||||
use_deepspeed = default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'],
|
||||
eval = True
|
||||
)
|
||||
elif tts_engine==TTS_ENGINES['BARK']:
|
||||
from TTS.tts.configs.bark_config import BarkConfig
|
||||
from TTS.tts.models.bark import Bark
|
||||
checkpoint_dir = kwargs.get('checkpoint_dir')
|
||||
config = BarkConfig()
|
||||
config.CACHE_DIR = self.cache_dir
|
||||
config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS','').lower()=='true'
|
||||
tts = Bark.init_from_config(config)
|
||||
tts.load_checkpoint(
|
||||
config,
|
||||
checkpoint_dir = checkpoint_dir,
|
||||
eval = True
|
||||
)
|
||||
if tts:
|
||||
if device=='cuda':
|
||||
tts.cuda()
|
||||
else:
|
||||
if device=='mps':
|
||||
tts.to(torch.device('mps'))
|
||||
else:
|
||||
tts.to(device)
|
||||
loaded_tts[key]={"engine":tts,"config":config}
|
||||
msg = f'{tts_engine} Loaded!'
|
||||
print(msg)
|
||||
return tts
|
||||
else:
|
||||
error='TTS engine could not be created!'
|
||||
print(error)
|
||||
except Exception as e:
|
||||
error = f'_load_checkpoint() error: {e}'
|
||||
return False
|
||||
error = f'_load_engine_zs() error: {e}'
|
||||
|
||||
def _check_xtts_builtin_speakers(self, voice_path:str, speaker:str, device:str)->str|bool:
|
||||
def _valid_tensor(t:Any):
|
||||
return isinstance(t, torch.Tensor) and not (torch.isnan(t).any() or torch.isinf(t).any())
|
||||
try:
|
||||
voice_parts = Path(voice_path).parts
|
||||
if(self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng'):
|
||||
@@ -266,23 +268,22 @@ class Coqui:
|
||||
if os.path.exists(default_text_file):
|
||||
msg = f"Converting builtin eng voice to {self.session['language']}..."
|
||||
print(msg)
|
||||
tts_internal_key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
key = f"{TTS_ENGINES['XTTSv2']}-internal"
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
self.tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
|
||||
if not self.tts:
|
||||
for key in list(loaded_tts.keys()):
|
||||
unload_tts(device, None, key)
|
||||
cleanup_garbage()
|
||||
engine = loaded_tts.get(key, False)
|
||||
if not engine:
|
||||
hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
|
||||
hf_sub = ''
|
||||
config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
||||
checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
||||
vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
||||
self.tts = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=tts_internal_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
|
||||
if self.tts:
|
||||
engine = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
|
||||
if engine:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
gpt_cond_latent, speaker_embedding = self.tts.get_conditioning_latents(audio_path=[voice_path])
|
||||
gpt_cond_latent, speaker_embedding = engine.get_conditioning_latents(audio_path=[voice_path])
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
@@ -298,27 +299,27 @@ class Coqui:
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = self.tts.inference(
|
||||
result = engine.inference(
|
||||
text=default_text.strip(),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=gpt_cond_latent,
|
||||
speaker_embedding=speaker_embedding,
|
||||
**fine_tuned_params,
|
||||
)
|
||||
audio_data = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_data is not None:
|
||||
audio_data = audio_data.tolist()
|
||||
sourceTensor = self._tensor_type(audio_data)
|
||||
audio_sentence = result.get('wav') if isinstance(result, dict) else None
|
||||
if audio_sentence is not None:
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
sourceTensor = self._tensor_type(audio_sentence)
|
||||
audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
|
||||
# CON is a reserved name on windows
|
||||
lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
|
||||
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
|
||||
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
|
||||
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate):
|
||||
del audio_data, sourceTensor, audio_tensor
|
||||
if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
|
||||
del self.tts
|
||||
unload_tts(device, None, tts_internal_key)
|
||||
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate, self.session['is_gui_process']):
|
||||
del audio_sentence, sourceTensor, audio_tensor
|
||||
Path(proc_voice_path).unlink(missing_ok=True)
|
||||
gc.collect()
|
||||
return new_voice_path
|
||||
else:
|
||||
error = 'normalize_audio() error:'
|
||||
@@ -329,6 +330,7 @@ class Coqui:
|
||||
else:
|
||||
error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
return voice_path
|
||||
else:
|
||||
@@ -336,68 +338,52 @@ class Coqui:
|
||||
except Exception as e:
|
||||
error = f'_check_xtts_builtin_speakers() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
return False
|
||||
|
||||
def _check_bark_npz(self,voice_path:str,bark_dir:str,speaker:str,device:str)->bool:
|
||||
def _check_bark_npz(self, voice_path:str, bark_dir:str, speaker:str, device:str)->bool:
|
||||
try:
|
||||
if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
|
||||
npz_dir = os.path.join(bark_dir,speaker)
|
||||
npz_file = os.path.join(npz_dir,f'{speaker}.npz')
|
||||
if os.path.exists(npz_file):
|
||||
pth_voice_dir = os.path.join(bark_dir, speaker)
|
||||
pth_voice_file = os.path.join(pth_voice_dir,f'{speaker}.pth')
|
||||
if os.path.exists(pth_voice_file):
|
||||
return True
|
||||
else:
|
||||
os.makedirs(npz_dir,exist_ok=True)
|
||||
tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
|
||||
hf_repo = models[TTS_ENGINES['BARK']]['internal']['repo']
|
||||
hf_sub = models[TTS_ENGINES['BARK']]['internal']['sub']
|
||||
self.tts = (loaded_tts.get(tts_internal_key) or {}).get('engine',False)
|
||||
if not self.tts:
|
||||
for key in list(loaded_tts.keys()):unload_tts(device,None,key)
|
||||
text_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][0]}",cache_dir=self.cache_dir)
|
||||
coarse_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][1]}",cache_dir=self.cache_dir)
|
||||
fine_model_path = hf_hub_download(repo_id=hf_repo,filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][2]}",cache_dir=self.cache_dir)
|
||||
checkpoint_dir = os.path.dirname(text_model_path)
|
||||
self.tts = self._load_checkpoint(tts_engine=TTS_ENGINES['BARK'],key=tts_internal_key,checkpoint_dir=checkpoint_dir,device=device)
|
||||
if self.tts:
|
||||
voice_temp=os.path.splitext(npz_file)[0]+'.wav'
|
||||
shutil.copy(voice_path,voice_temp)
|
||||
default_text_file = os.path.join(voices_dir,self.session['language'],'default.txt')
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
fine_tuned_params={
|
||||
key.removeprefix("bark_"):cast_type(self.session[key])
|
||||
for key,cast_type in{
|
||||
"bark_text_temp":float,
|
||||
"bark_waveform_temp":float
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
torch.manual_seed(67878789)
|
||||
audio_data = self.tts.synthesize(
|
||||
default_text,
|
||||
loaded_tts[tts_internal_key]['config'],
|
||||
speaker_id=speaker,
|
||||
voice_dirs=bark_dir,
|
||||
silent=True,
|
||||
**fine_tuned_params
|
||||
)
|
||||
os.remove(voice_temp)
|
||||
del audio_data
|
||||
if self.session['tts_engine']!=TTS_ENGINES['BARK']:
|
||||
del self.tts
|
||||
unload_tts(device,None,tts_internal_key)
|
||||
msg = f"Saved NPZ file: {npz_file}"
|
||||
print(msg)
|
||||
return True
|
||||
else:
|
||||
error = f'_check_bark_npz() error: {tts_internal_key} is False'
|
||||
print(error)
|
||||
os.makedirs(pth_voice_dir,exist_ok=True)
|
||||
key = f"{TTS_ENGINES['BARK']}-internal"
|
||||
voice_temp = os.path.splitext(pth_voice_file)[0]+'.wav'
|
||||
shutil.copy(voice_path,voice_temp)
|
||||
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
||||
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("bark_"):cast_type(self.session[key])
|
||||
for key,cast_type in{
|
||||
"bark_text_temp":float,
|
||||
"bark_waveform_temp":float
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
#torch.manual_seed(67878789)
|
||||
audio_sentence = self.engine.synthesize(
|
||||
default_text,
|
||||
speaker_wav=voice_path,
|
||||
speaker=speaker,
|
||||
voice_dir=pth_voice_dir,
|
||||
silent=True,
|
||||
**fine_tuned_params
|
||||
)
|
||||
os.remove(voice_temp)
|
||||
del audio_sentence
|
||||
msg = f"Saved file: {pth_voice_file}"
|
||||
print(msg)
|
||||
gc.collect()
|
||||
return True
|
||||
else:
|
||||
return True
|
||||
except Exception as e:
|
||||
error = f'_check_bark_npz() error: {e}'
|
||||
print(error)
|
||||
return False
|
||||
return False
|
||||
|
||||
def _tensor_type(self,audio_data:Any)->torch.Tensor:
|
||||
if isinstance(audio_data,torch.Tensor):
|
||||
@@ -428,22 +414,19 @@ class Coqui:
|
||||
waveform = resampler(waveform)
|
||||
wav_tensor = waveform.squeeze(0)
|
||||
wav_numpy = wav_tensor.cpu().numpy()
|
||||
tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav",delete=False)
|
||||
os.path.join(self.session['process_dir'], 'tmp')
|
||||
os.makedirs(tmp_dir, exist_ok=True)
|
||||
tmp_fh = tempfile.NamedTemporaryFile(dir=tmp_dir, suffix=".wav", delete=False)
|
||||
tmp_path = tmp_fh.name
|
||||
tmp_fh.close()
|
||||
sf.write(tmp_path,wav_numpy,expected_sr,subtype="PCM_16")
|
||||
return tmp_path
|
||||
|
||||
def convert(self, s_n:int, s:str)->bool:
|
||||
global xtts_builtin_speakers_list
|
||||
def convert(self, sentence_index:int, sentence:str)->bool:
|
||||
try:
|
||||
sentence_number = s_n
|
||||
sentence = s
|
||||
speaker = None
|
||||
audio_data = False
|
||||
trim_audio_buffer = 0.004
|
||||
audio_sentence = False
|
||||
settings = self.params[self.session['tts_engine']]
|
||||
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
|
||||
settings['voice_path'] = (
|
||||
self.session['voice'] if self.session['voice'] is not None
|
||||
else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
|
||||
@@ -457,8 +440,10 @@ class Coqui:
|
||||
msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
|
||||
print(msg)
|
||||
return False
|
||||
self.tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
|
||||
if self.tts:
|
||||
if self.engine:
|
||||
self.engine.to(self.session['device'])
|
||||
trim_audio_buffer = 0.004
|
||||
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
|
||||
if sentence == TTS_SML['break']:
|
||||
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
|
||||
break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
|
||||
@@ -484,7 +469,7 @@ class Coqui:
|
||||
if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
|
||||
else:
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.tts.get_conditioning_latents(audio_path=[settings['voice_path']])
|
||||
settings['gpt_cond_latent'], settings['speaker_embedding'] = self.engine.get_conditioning_latents(audio_path=[settings['voice_path']])
|
||||
settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("xtts_"): cast_type(self.session[key])
|
||||
@@ -501,7 +486,7 @@ class Coqui:
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
with torch.no_grad():
|
||||
result = self.tts.inference(
|
||||
result = self.engine.inference(
|
||||
text=sentence.replace('.', ' —'),
|
||||
language=self.session['language_iso1'],
|
||||
gpt_cond_latent=settings['gpt_cond_latent'],
|
||||
@@ -530,10 +515,11 @@ class Coqui:
|
||||
else:
|
||||
bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
|
||||
if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker, self.session['device']):
|
||||
error = 'Could not create npz file!'
|
||||
error = 'Could not create pth file!'
|
||||
print(error)
|
||||
return False
|
||||
npz_file = os.path.join(bark_dir, speaker, f'{speaker}.npz')
|
||||
pth_voice_dir = os.path.join(bark_dir, speaker)
|
||||
pth_voice_file = os.path.join(bark_dir, speaker, f'{speaker}.pth')
|
||||
fine_tuned_params = {
|
||||
key.removeprefix("bark_"): cast_type(self.session[key])
|
||||
for key, cast_type in {
|
||||
@@ -542,22 +528,16 @@ class Coqui:
|
||||
}.items()
|
||||
if self.session.get(key) is not None
|
||||
}
|
||||
if self.npz_path is None or self.npz_path != npz_file:
|
||||
self.npz_path = npz_file
|
||||
self.npz_data = np.load(self.npz_path, allow_pickle=True)
|
||||
history_prompt = [
|
||||
self.npz_data["semantic_prompt"],
|
||||
self.npz_data["coarse_prompt"],
|
||||
self.npz_data["fine_prompt"]
|
||||
]
|
||||
with torch.no_grad():
|
||||
torch.manual_seed(67878789)
|
||||
audio_sentence, _ = self.tts.generate_audio(
|
||||
#torch.manual_seed(67878789)
|
||||
result = self.engine.synthesize(
|
||||
sentence,
|
||||
history_prompt=history_prompt,
|
||||
speaker=speaker,
|
||||
voice_dir=pth_voice_dir,
|
||||
silent=True,
|
||||
**fine_tuned_params
|
||||
)
|
||||
audio_sentence = result.get('wav')
|
||||
if is_audio_data_valid(audio_sentence):
|
||||
audio_sentence = audio_sentence.tolist()
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
|
||||
@@ -573,11 +553,12 @@ class Coqui:
|
||||
os.makedirs(proc_dir, exist_ok=True)
|
||||
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
self.tts.tts_to_file(
|
||||
text=sentence,
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
self.engine.tts_to_file(
|
||||
text=sentence,
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
if settings['voice_path'] in settings['semitones'].keys():
|
||||
semitones = settings['semitones'][settings['voice_path']]
|
||||
else:
|
||||
@@ -612,17 +593,16 @@ class Coqui:
|
||||
return False
|
||||
else:
|
||||
tmp_out_wav = tmp_in_wav
|
||||
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
|
||||
if tts_vc:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
|
||||
if self.engine_zs:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
|
||||
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
|
||||
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
|
||||
audio_sentence = tts_vc.voice_conversion(
|
||||
audio_sentence = self.engine_zs.voice_conversion(
|
||||
source_wav=source_wav,
|
||||
target_wav=target_wav
|
||||
)
|
||||
else:
|
||||
error = f'Engine {self.tts_vc_key} is None'
|
||||
error = f'Engine {self.tts_zs_key} is None'
|
||||
print(error)
|
||||
return False
|
||||
if os.path.exists(tmp_in_wav):
|
||||
@@ -632,10 +612,11 @@ class Coqui:
|
||||
if os.path.exists(source_wav):
|
||||
os.remove(source_wav)
|
||||
else:
|
||||
audio_sentence = self.tts.tts(
|
||||
text=sentence,
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
audio_sentence = self.engine.tts(
|
||||
text=sentence,
|
||||
**speaker_argument
|
||||
)
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
|
||||
speaker_argument = {}
|
||||
not_supported_punc_pattern = re.compile(r"[.:—]")
|
||||
@@ -644,11 +625,12 @@ class Coqui:
|
||||
os.makedirs(proc_dir, exist_ok=True)
|
||||
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
self.tts.tts_to_file(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
self.engine.tts_to_file(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
if settings['voice_path'] in settings['semitones'].keys():
|
||||
semitones = settings['semitones'][settings['voice_path']]
|
||||
else:
|
||||
@@ -672,26 +654,27 @@ class Coqui:
|
||||
]
|
||||
subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Subprocess error: {e.stderr}")
|
||||
error = f'Subprocess error: {e.stderr}'
|
||||
print(error)
|
||||
DependencyError(e)
|
||||
return False
|
||||
except FileNotFoundError as e:
|
||||
print(f"File not found: {e}")
|
||||
error = f'File not found: {e}'
|
||||
print(error)
|
||||
DependencyError(e)
|
||||
return False
|
||||
else:
|
||||
tmp_out_wav = tmp_in_wav
|
||||
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
|
||||
if tts_vc:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
|
||||
if self.engine_zs:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
|
||||
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
|
||||
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
|
||||
audio_sentence = tts_vc.voice_conversion(
|
||||
audio_sentence = self.engine_zs.voice_conversion(
|
||||
source_wav=source_wav,
|
||||
target_wav=target_wav
|
||||
)
|
||||
else:
|
||||
error = f'Engine {self.tts_vc_key} is None'
|
||||
error = f'Engine {self.tts_zs_key} is None'
|
||||
print(error)
|
||||
return False
|
||||
if os.path.exists(tmp_in_wav):
|
||||
@@ -701,23 +684,28 @@ class Coqui:
|
||||
if os.path.exists(source_wav):
|
||||
os.remove(source_wav)
|
||||
else:
|
||||
audio_sentence = self.tts.tts(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
audio_sentence = self.engine.tts(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
**speaker_argument
|
||||
)
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
|
||||
speaker_argument = {}
|
||||
not_supported_punc_pattern = re.compile(r'["—…¡¿]')
|
||||
if self.session['language'] in ['zho', 'jpn', 'kor', 'tha', 'lao', 'mya', 'khm']:
|
||||
not_supported_punc_pattern = re.compile(r'\p{P}+')
|
||||
else:
|
||||
not_supported_punc_pattern = re.compile(r'["—…¡¿]')
|
||||
if settings['voice_path'] is not None:
|
||||
proc_dir = os.path.join(self.session['voice_dir'], 'proc')
|
||||
os.makedirs(proc_dir, exist_ok=True)
|
||||
tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
|
||||
self.tts.tts_to_file(
|
||||
text=re.sub(not_supported_punc_pattern, '', sentence),
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
self.engine.tts_to_file(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
file_path=tmp_in_wav,
|
||||
**speaker_argument
|
||||
)
|
||||
if settings['voice_path'] in settings['semitones'].keys():
|
||||
semitones = settings['semitones'][settings['voice_path']]
|
||||
else:
|
||||
@@ -752,17 +740,16 @@ class Coqui:
|
||||
return False
|
||||
else:
|
||||
tmp_out_wav = tmp_in_wav
|
||||
tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
|
||||
if tts_vc:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
|
||||
if self.engine_zs:
|
||||
settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_zs_key]['samplerate']
|
||||
source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
|
||||
target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
|
||||
audio_sentence = tts_vc.voice_conversion(
|
||||
audio_sentence = self.engine_zs.voice_conversion(
|
||||
source_wav=source_wav,
|
||||
target_wav=target_wav
|
||||
)
|
||||
else:
|
||||
error = f'Engine {self.tts_vc_key} is None'
|
||||
error = f'Engine {self.tts_zs_key} is None'
|
||||
print(error)
|
||||
return False
|
||||
if os.path.exists(tmp_in_wav):
|
||||
@@ -772,10 +759,11 @@ class Coqui:
|
||||
if os.path.exists(source_wav):
|
||||
os.remove(source_wav)
|
||||
else:
|
||||
audio_sentence = self.tts.tts(
|
||||
text=re.sub(not_supported_punc_pattern, '', sentence),
|
||||
**speaker_argument
|
||||
)
|
||||
with torch.no_grad():
|
||||
audio_sentence = self.engine.tts(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
**speaker_argument
|
||||
)
|
||||
elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
|
||||
trim_audio_buffer = 0.002
|
||||
speaker_argument = {}
|
||||
@@ -788,8 +776,8 @@ class Coqui:
|
||||
voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
|
||||
speaker_argument = {"speaker": voice_key}
|
||||
with torch.no_grad():
|
||||
audio_sentence = self.tts.tts(
|
||||
text=re.sub(not_supported_punc_pattern, '', sentence),
|
||||
audio_sentence = self.engine.tts(
|
||||
text=re.sub(not_supported_punc_pattern, ' ', sentence),
|
||||
language=language,
|
||||
**speaker_argument
|
||||
)
|
||||
@@ -820,16 +808,23 @@ class Coqui:
|
||||
if self.sentence_idx:
|
||||
torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
|
||||
del audio_tensor
|
||||
cleanup_garbage()
|
||||
self.audio_segments = []
|
||||
if os.path.exists(final_sentence_file):
|
||||
return True
|
||||
else:
|
||||
error = f"Cannot create {final_sentence_file}"
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"audio_sentence not valide"
|
||||
print(error)
|
||||
return False
|
||||
else:
|
||||
error = f"convert() error: {self.session['tts_engine']} is None"
|
||||
error = f"TTS engine {self.session['tts_engine']} could not be loaded!\nPossible reason can be not enough VRAM/RAM memory"
|
||||
print(error)
|
||||
return False
|
||||
except Exception as e:
|
||||
error = f'Coquit.convert(): {e}'
|
||||
raise ValueError(e)
|
||||
return False
|
||||
return False
|
||||
@@ -1,12 +1,12 @@
|
||||
import os
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from typing import Any
|
||||
from lib.models import TTS_ENGINES
|
||||
|
||||
class TTSManager:
|
||||
def __init__(self, session:Any):
|
||||
self.session = session
|
||||
self.engine = None
|
||||
self.engine = False
|
||||
self._build()
|
||||
|
||||
def _build(self)->None:
|
||||
@@ -17,9 +17,6 @@ class TTSManager:
|
||||
#elif self.session['tts_engine'] in [TTS_ENGINES['NEW_TTS']]:
|
||||
# from lib.classes.tts_engines.new_tts import NewTts
|
||||
# self.engine = NewTts(self.session)
|
||||
if not self.engine:
|
||||
error='TTS engine could not be created!'
|
||||
print(error)
|
||||
else:
|
||||
print('Other TTS engines coming soon!')
|
||||
|
||||
@@ -32,4 +29,3 @@ class TTSManager:
|
||||
except Exception as e:
|
||||
error=f'convert_sentence2audio(): {e}'
|
||||
raise ValueError(e)
|
||||
return False
|
||||
@@ -5,8 +5,9 @@ import scipy.fftpack
|
||||
import soundfile as sf
|
||||
import subprocess
|
||||
import shutil
|
||||
import json
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
from typing import Any
|
||||
from io import BytesIO
|
||||
from pydub import AudioSegment, silence
|
||||
from pydub.silence import detect_silence
|
||||
@@ -14,6 +15,7 @@ from pydub.silence import detect_silence
|
||||
from lib.conf import voice_formats, default_audio_proc_samplerate
|
||||
from lib.models import TTS_ENGINES, models
|
||||
from lib.classes.background_detector import BackgroundDetector
|
||||
from lib.classes.subprocess_pipe import SubprocessPipe
|
||||
|
||||
class VoiceExtractor:
|
||||
def __init__(self, session:Any, voice_file:str, voice_name:str):
|
||||
@@ -30,7 +32,7 @@ class VoiceExtractor:
|
||||
def _validate_format(self)->tuple[bool,str]:
|
||||
file_extension = os.path.splitext(self.voice_file)[1].lower()
|
||||
if file_extension in voice_formats:
|
||||
msg = 'Input file valid'
|
||||
msg = 'Input file is valid'
|
||||
return True,msg
|
||||
error = f'Unsupported file format: {file_extension}. Supported formats are: {", ".join(voice_formats)}'
|
||||
return False,error
|
||||
@@ -38,33 +40,21 @@ class VoiceExtractor:
|
||||
def _convert2wav(self)->tuple[bool, str]:
|
||||
try:
|
||||
self.wav_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
|
||||
ffmpeg_cmd = [
|
||||
cmd = [
|
||||
shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_file,
|
||||
'-ac', '1', '-y', self.wav_file
|
||||
]
|
||||
process = subprocess.Popen(
|
||||
ffmpeg_cmd,
|
||||
env={},
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=False # <── raw bytes mode (no implicit UTF-8 decoding)
|
||||
)
|
||||
# Decode safely line by line
|
||||
for raw_line in iter(process.stdout.readline, b''):
|
||||
try:
|
||||
line = raw_line.decode('utf-8', errors='replace') # <── replaces invalid bytes
|
||||
except Exception:
|
||||
line = raw_line.decode('latin-1', errors='replace')
|
||||
print(line, end='')
|
||||
|
||||
process.wait()
|
||||
if process.returncode != 0:
|
||||
error = f'_convert2wav(): process.returncode: {process.returncode}'
|
||||
elif not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
|
||||
error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
|
||||
]
|
||||
proc_pipe = SubprocessPipe(cmd, is_gui_process=self.session['is_gui_process'], total_duration=self._get_audio_duration(self.voice_file), msg='Convert')
|
||||
if proc_pipe:
|
||||
if not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
|
||||
error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
|
||||
return False, error
|
||||
else:
|
||||
msg = 'Conversion to .wav format for processing successful'
|
||||
return True, msg
|
||||
else:
|
||||
msg = 'Conversion to .wav format for processing successful'
|
||||
return True, msg
|
||||
error = f'_convert2wav() error:: {self.wav_file}'
|
||||
return False, error
|
||||
except subprocess.CalledProcessError as e:
|
||||
try:
|
||||
stderr_text = e.stderr.decode('utf-8', errors='replace')
|
||||
@@ -201,12 +191,35 @@ class VoiceExtractor:
|
||||
error = f'_trim_and_clean() error: {e}'
|
||||
raise ValueError(error)
|
||||
|
||||
def _get_audio_duration(self, filepath:str)->float:
|
||||
try:
|
||||
cmd = [
|
||||
shutil.which('ffprobe'),
|
||||
'-v', 'error',
|
||||
'-show_entries', 'format=duration',
|
||||
'-of', 'json',
|
||||
filepath
|
||||
]
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
try:
|
||||
duration = json.loads(result.stdout)['format']['duration']
|
||||
return float(duration)
|
||||
except Exception:
|
||||
return 0
|
||||
except subprocess.CalledProcessError as e:
|
||||
DependencyError(e)
|
||||
return 0
|
||||
except Exception as e:
|
||||
error = f"get_audio_duration() Error: Failed to process {filepath}: {e}"
|
||||
print(error)
|
||||
return 0
|
||||
|
||||
def _normalize_audio(self)->tuple[bool, str]:
|
||||
error = ''
|
||||
try:
|
||||
proc_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
|
||||
final_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
|
||||
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
|
||||
cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
|
||||
filter_complex = (
|
||||
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
|
||||
'afftdn=nf=-70,'
|
||||
@@ -219,33 +232,26 @@ class VoiceExtractor:
|
||||
'equalizer=f=9000:t=q:w=2:g=-2,'
|
||||
'highpass=f=63[audio]'
|
||||
)
|
||||
ffmpeg_cmd += [
|
||||
cmd += [
|
||||
'-filter_complex', filter_complex,
|
||||
'-map', '[audio]',
|
||||
'-ar', f'{default_audio_proc_samplerate}',
|
||||
'-y', proc_voice_file
|
||||
]
|
||||
try:
|
||||
process = subprocess.Popen(
|
||||
ffmpeg_cmd,
|
||||
env = {},
|
||||
stdout = subprocess.PIPE,
|
||||
stderr = subprocess.PIPE,
|
||||
encoding = 'utf-8',
|
||||
errors = 'ignore'
|
||||
)
|
||||
for line in process.stdout:
|
||||
print(line, end = '')
|
||||
process.wait()
|
||||
if process.returncode != 0:
|
||||
error = f'_normalize_audio(): process.returncode: {process.returncode}'
|
||||
elif not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
|
||||
error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
|
||||
proc_pipe = SubprocessPipe(cmd, is_gui_process=self.session['is_gui_process'], total_duration=self._get_audio_duration(self.voice_track), msg='Normalize')
|
||||
if proc_pipe:
|
||||
if not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
|
||||
error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
|
||||
return False, error
|
||||
else:
|
||||
os.replace(proc_voice_file, final_voice_file)
|
||||
shutil.rmtree(self.demucs_dir, ignore_errors = True)
|
||||
msg = 'Audio normalization successful!'
|
||||
return True, msg
|
||||
else:
|
||||
os.replace(proc_voice_file, final_voice_file)
|
||||
shutil.rmtree(self.demucs_dir, ignore_errors = True)
|
||||
msg = 'Audio normalization successful!'
|
||||
return True, msg
|
||||
error = f'normalize_audio() error: {final_voice_file}'
|
||||
return False, error
|
||||
except subprocess.CalledProcessError as e:
|
||||
error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
|
||||
except FileNotFoundError as e:
|
||||
|
||||
@@ -1,145 +1,110 @@
|
||||
import os, platform, subprocess, re, json, psutil, tempfile, time
|
||||
|
||||
from typing import Any, Optional, Union, Callable
|
||||
import os, platform, json, psutil, subprocess, re
|
||||
from typing import Any
|
||||
|
||||
class VRAMDetector:
|
||||
def __init__(self):
|
||||
self.system:str = platform.system().lower()
|
||||
self.system = platform.system().lower()
|
||||
|
||||
def _run(self, cmd:list[str], timeout:int = 3)->str:
|
||||
@staticmethod
|
||||
def _fmt(b:int)->str:
|
||||
if not b: return 'Unknown'
|
||||
if b >= 1024**3: return f'{b/1024**3:.2f} GB'
|
||||
if b >= 1024**2: return f'{b/1024**2:.2f} MB'
|
||||
if b >= 1024: return f'{b/1024:.2f} KB'
|
||||
return f'{b} B'
|
||||
|
||||
def detect_vram(self, device:str, as_json:bool=False)->Any:
|
||||
info = {}
|
||||
# ───────────────────────────── CUDA (NVIDIA)
|
||||
try:
|
||||
result = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, text = True, timeout = timeout)
|
||||
return result.stdout.strip()
|
||||
import torch
|
||||
if device == 'cuda':
|
||||
if torch.cuda.is_available():
|
||||
free, total = torch.cuda.mem_get_info()
|
||||
alloc = torch.cuda.memory_allocated()
|
||||
resv = torch.cuda.memory_reserved()
|
||||
info = {
|
||||
"os": self.system,
|
||||
"device_type": "cuda",
|
||||
"device_name": torch.cuda.get_device_name(0),
|
||||
"free_bytes": free,
|
||||
"total_bytes": total,
|
||||
"allocated_bytes": alloc,
|
||||
"reserved_bytes": resv,
|
||||
"free_human": self._fmt(free),
|
||||
"total_human": self._fmt(total),
|
||||
"allocated_human": self._fmt(alloc),
|
||||
"reserved_human": self._fmt(resv),
|
||||
}
|
||||
return json.dumps(info, indent=2) if as_json else info
|
||||
|
||||
# ─────────────────────────── ROCm (AMD)
|
||||
if hasattr(torch, 'hip') and torch.hip.is_available():
|
||||
free, total = torch.hip.mem_get_info()
|
||||
alloc = torch.hip.memory_allocated()
|
||||
resv = torch.hip.memory_reserved()
|
||||
info = {
|
||||
"os": self.system,
|
||||
"device_type": "rocm",
|
||||
"device_name": torch.hip.get_device_name(0),
|
||||
"free_bytes": free,
|
||||
"total_bytes": total,
|
||||
"allocated_bytes": alloc,
|
||||
"reserved_bytes": resv,
|
||||
"free_human": self._fmt(free),
|
||||
"total_human": self._fmt(total),
|
||||
"allocated_human": self._fmt(alloc),
|
||||
"reserved_human": self._fmt(resv),
|
||||
}
|
||||
return json.dumps(info, indent=2) if as_json else info
|
||||
|
||||
# ─────────────────────────── Intel XPU (oneAPI)
|
||||
if hasattr(torch, 'xpu') and torch.xpu.is_available():
|
||||
free, total = torch.xpu.mem_get_info()
|
||||
alloc = torch.xpu.memory_allocated()
|
||||
resv = torch.xpu.memory_reserved()
|
||||
info = {
|
||||
"os": self.system,
|
||||
"device_type": "xpu",
|
||||
"device_name": torch.xpu.get_device_name(0),
|
||||
"free_bytes": free,
|
||||
"total_bytes": total,
|
||||
"allocated_bytes": alloc,
|
||||
"reserved_bytes": resv,
|
||||
"free_human": self._fmt(free),
|
||||
"total_human": self._fmt(total),
|
||||
"allocated_human": self._fmt(alloc),
|
||||
"reserved_human": self._fmt(resv),
|
||||
}
|
||||
return json.dumps(info, indent=2) if as_json else info
|
||||
|
||||
# ─────────────────────────── Apple MPS (Metal)
|
||||
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
||||
info = {
|
||||
"os": self.system,
|
||||
"device_type": "mps",
|
||||
"device_name": "Apple GPU (Metal)",
|
||||
"note": "PyTorch MPS does not expose memory info; reporting system RAM",
|
||||
}
|
||||
mem = psutil.virtual_memory()
|
||||
info['free_bytes'] = mem.available
|
||||
info['total_bytes'] = mem.total
|
||||
info['free_human'] = self._fmt(mem.available)
|
||||
info['total_human'] = self._fmt(mem.total)
|
||||
return json.dumps(info, indent=2) if as_json else info
|
||||
|
||||
except Exception:
|
||||
return ""
|
||||
pass
|
||||
|
||||
def _parse_bytes(self, val:str)->int:
|
||||
if not val:
|
||||
return 0
|
||||
val = val.strip().upper()
|
||||
m = re.findall(r"([\d.]+)", val)
|
||||
if not m:
|
||||
return 0
|
||||
n = float(m[0])
|
||||
if "GB" in val: return int(n*1024**3)
|
||||
if "MB" in val: return int(n*1024**2)
|
||||
if "KB" in val: return int(n*1024)
|
||||
return int(n)
|
||||
|
||||
def _fmt(self, b:int)->str:
|
||||
if not b: return "Unknown"
|
||||
if b >= 1024**3: return f"{b/1024**3:.1f} GB"
|
||||
if b >= 1024**2: return f"{b/1024**2:.1f} MB"
|
||||
return f"{b} B"
|
||||
|
||||
# ---- Windows GPU detection ----
|
||||
def _get_windows_vram(self)->list[dict[str,Any]]:
|
||||
gpus = []
|
||||
out = self._run(["wmic","path","win32_VideoController","get","Name,AdapterRAM","/format:list"])
|
||||
for block in out.split("\n\n"):
|
||||
if "Name = " not in block: continue
|
||||
name = re.search(r"Name = (.*)", block)
|
||||
vram = re.search(r"AdapterRAM = (\d+)", block)
|
||||
if name:
|
||||
val = int(vram.group(1)) if vram else 0
|
||||
gpus.append({"name":name.group(1).strip(),"vram_bytes":val,"vram":self._fmt(val)})
|
||||
if any(g["vram_bytes"]>0 for g in gpus):
|
||||
return gpus
|
||||
with tempfile.NamedTemporaryFile(delete = False, suffix = ".txt") as tf:
|
||||
path = tf.name
|
||||
try:
|
||||
subprocess.Popen(["dxdiag","/t",path],stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
|
||||
for _ in range(30):
|
||||
if os.path.exists(path) and os.path.getsize(path)>0:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
with open(path,encoding = "utf-16",errors = "ignore") as f:
|
||||
data = f.read()
|
||||
except Exception:
|
||||
data = ""
|
||||
finally:
|
||||
try: os.remove(path)
|
||||
except: pass
|
||||
for m in re.finditer(r"Card name:\s*(.*?)\r?\n.*?(?:Dedicated Memory|Display Memory):\s*([^\r\n]+)", data, re.S):
|
||||
name,mem = m.groups()
|
||||
vb = self._parse_bytes(mem)
|
||||
if vb:
|
||||
gpus.append({"name":name.strip(),"vram_bytes":vb,"vram":self._fmt(vb)})
|
||||
return gpus
|
||||
|
||||
def _get_windows_shared(self)->int:
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(delete = False, suffix = ".txt") as tf:
|
||||
path = tf.name
|
||||
subprocess.Popen(["dxdiag","/t",path],stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
|
||||
for _ in range(30):
|
||||
if os.path.exists(path) and os.path.getsize(path)>0:
|
||||
break
|
||||
time.sleep(0.1)
|
||||
with open(path,encoding = "utf-16",errors = "ignore") as f:
|
||||
data = f.read()
|
||||
except Exception:
|
||||
data = ""
|
||||
finally:
|
||||
try: os.remove(path)
|
||||
except: pass
|
||||
m = re.search(r"Shared Memory:\s*([^\r\n]+)", data)
|
||||
return self._parse_bytes(m.group(1)) if m else 0
|
||||
|
||||
# ---- Linux/macOS simplified ----
|
||||
def _get_linux_vram(self)->list[dict[str,Any]]:
|
||||
out = self._run(["nvidia-smi","--query-gpu = name,memory.total","--format = csv,noheader,nounits"])
|
||||
gpus = []
|
||||
for line in out.splitlines():
|
||||
if "," not in line: continue
|
||||
name,mem = line.split(",",1)
|
||||
vb = int(mem.strip())*1024**2
|
||||
gpus.append({"name":name.strip(),"vram_bytes":vb,"vram":self._fmt(vb)})
|
||||
return gpus
|
||||
|
||||
def _get_linux_shared(self)->int:
|
||||
return psutil.virtual_memory().total//4 if hasattr(psutil,"virtual_memory") else 0
|
||||
|
||||
def _get_macos_vram(self)->list[dict[str,Any]]:
|
||||
out = self._run(["system_profiler","SPDisplaysDataType","-json"])
|
||||
try:data = json.loads(out)
|
||||
except: return []
|
||||
g = []
|
||||
for gpu in data.get("SPDisplaysDataType",[]):
|
||||
v = self._parse_bytes(gpu.get("spdisplays_vram",""))
|
||||
g.append({"name":gpu.get("_name","GPU"),"vram_bytes":v,"vram":self._fmt(v)})
|
||||
return g
|
||||
|
||||
def _get_macos_shared(self)->int:
|
||||
out = self._run(["system_profiler","SPDisplaysDataType","-json"])
|
||||
try:data = json.loads(out)
|
||||
except:return 0
|
||||
for gpu in data.get("SPDisplaysDataType",[]):
|
||||
for key in ("spdisplays_vram_shared","spdisplays_vram_dynamic"):
|
||||
if key in gpu:
|
||||
return self._parse_bytes(gpu[key])
|
||||
return 0
|
||||
|
||||
# ---- main API ----
|
||||
def detect_vram(self,as_json:bool = False)->Any:
|
||||
sys = self.system
|
||||
if sys == "windows":
|
||||
g = self._get_windows_vram(); s = self._get_windows_shared()
|
||||
elif sys == "linux":
|
||||
g = self._get_linux_vram(); s = self._get_linux_shared()
|
||||
elif sys == "darwin":
|
||||
g = self._get_macos_vram(); s = self._get_macos_shared()
|
||||
else:
|
||||
g = []; s = 0
|
||||
total = sum(x.get("vram_bytes",0) for x in g)
|
||||
res = {
|
||||
"os":sys,
|
||||
"gpu_count":len(g),
|
||||
"gpus":g,
|
||||
"total_vram_bytes":total,
|
||||
"total_vram_human":self._fmt(total),
|
||||
"shared_memory_bytes":s,
|
||||
"shared_memory_human":self._fmt(s),
|
||||
"total_combined_human":self._fmt(total+s)
|
||||
# ─────────────────────────── CPU fallback
|
||||
mem = psutil.virtual_memory()
|
||||
info = {
|
||||
"os": self.system,
|
||||
"device_type": "cpu",
|
||||
"device_name": "System RAM",
|
||||
"free_bytes": mem.available,
|
||||
"total_bytes": mem.total,
|
||||
"free_human": self._fmt(mem.available),
|
||||
"total_human": self._fmt(mem.total),
|
||||
}
|
||||
return json.dumps(res,indent = 2) if as_json else res
|
||||
return json.dumps(info, indent=2) if as_json else info
|
||||
33
lib/conf.py
@@ -1,7 +1,12 @@
|
||||
import os
|
||||
import platform
|
||||
import tempfile
|
||||
|
||||
min_python_version = (3,10)
|
||||
max_python_version = (3,13)
|
||||
|
||||
tmp_dir = os.path.abspath('tmp')
|
||||
tempfile.tempdir = tmp_dir
|
||||
tmp_expire = 7 # days
|
||||
|
||||
models_dir = os.path.abspath('models')
|
||||
@@ -14,10 +19,10 @@ os.environ['PYTHONIOENCODING'] = 'utf-8'
|
||||
os.environ['COQUI_TOS_AGREED'] = '1'
|
||||
os.environ['PYTHONIOENCODING'] = 'utf-8'
|
||||
os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
|
||||
os.environ['GRADIO_DEBUG'] = '1'
|
||||
os.environ['DO_NOT_TRACK'] = 'true'
|
||||
os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
|
||||
os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
|
||||
os.environ['GRADIO_DEBUG'] = '0'
|
||||
os.environ['DO_NOT_TRACK'] = 'True'
|
||||
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
|
||||
os.environ['HF_HOME'] = tts_dir
|
||||
os.environ['HF_DATASETS_CACHE'] = tts_dir
|
||||
@@ -30,25 +35,27 @@ os.environ['STANZA_RESOURCES_DIR'] = os.path.join(models_dir, 'stanza')
|
||||
os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
|
||||
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
|
||||
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
|
||||
os.environ['SUNO_OFFLOAD_CPU'] = 'False' # BARK option: False needs A GPU
|
||||
os.environ['SUNO_USE_SMALL_MODELS'] = 'False' # BARK option: False needs a GPU with VRAM > 4GB
|
||||
os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'
|
||||
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:32,garbage_collection_threshold:0.6,expandable_segments:True'
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
||||
os.environ["CUDA_CACHE_MAXSIZE"] = "2147483648"
|
||||
os.environ['SUNO_OFFLOAD_CPU'] = 'False'
|
||||
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'
|
||||
if platform.system() == 'Windows':
|
||||
os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")
|
||||
|
||||
prog_version = (lambda: open('VERSION.txt').read().strip())()
|
||||
|
||||
min_python_version = (3,10)
|
||||
max_python_version = (3,12)
|
||||
|
||||
NATIVE = 'native'
|
||||
FULL_DOCKER = 'full_docker'
|
||||
|
||||
debug_mode = True
|
||||
debug_mode = False
|
||||
|
||||
device_list = ['cpu', 'gpu', 'mps']
|
||||
default_device = 'cpu'
|
||||
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">howto wiki</a>'
|
||||
default_chapters_control = False
|
||||
devices = {"CPU": {"proc": "cpu", "found": True}, "CUDA": {"proc": "cuda", "found": False}, "MPS": {"proc": "mps", "found": False}, "ROCM": {"proc": "rocm", "found": False}, "XPU": {"proc": "xpu", "found": False}}
|
||||
default_device = devices['CPU']['proc']
|
||||
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">GPU howto wiki</a>'
|
||||
default_chapters_preview = False
|
||||
|
||||
python_env_dir = os.path.abspath(os.path.join('.','python_env'))
|
||||
requirements_file = os.path.abspath(os.path.join('.','requirements.txt'))
|
||||
@@ -56,7 +63,7 @@ requirements_file = os.path.abspath(os.path.join('.','requirements.txt'))
|
||||
interface_host = '0.0.0.0'
|
||||
interface_port = 7860
|
||||
interface_shared_tmp_expire = 3 # in days
|
||||
interface_concurrency_limit = 1 # or None for unlimited
|
||||
interface_concurrency_limit = 1 # or None for unlimited multiple parallele user conversion
|
||||
|
||||
interface_component_options = {
|
||||
"gr_tab_xtts_params": True,
|
||||
|
||||
5357
lib/functions.py
@@ -834,7 +834,6 @@ language_mapping = {
|
||||
"ben": {"name": "Bengali", "native_name": "বাংলা", "max_chars": 142},
|
||||
"zho": {"name": "Chinese", "native_name": "中文", "max_chars": 82},
|
||||
"eng": {"name": "English", "native_name": "English", "max_chars": 250},
|
||||
"fas": {"name": "Persian", "native_name": "فارسی", "max_chars": 182},
|
||||
"fra": {"name": "French", "native_name": "Français", "max_chars": 273},
|
||||
"deu": {"name": "German, Standard", "native_name": "Deutsch", "max_chars": 253},
|
||||
"hin": {"name": "Hindi", "native_name": "हिन्दी", "max_chars": 142},
|
||||
@@ -844,6 +843,7 @@ language_mapping = {
|
||||
"jav": {"name": "Javanese", "native_name": "Basa Jawa", "max_chars": 182},
|
||||
"jpn": {"name": "Japanese", "native_name": "日本語", "max_chars": 71},
|
||||
"kor": {"name": "Korean", "native_name": "한국어", "max_chars": 95},
|
||||
"fas": {"name": "Persian", "native_name": "فارسی", "max_chars": 182},
|
||||
"pol": {"name": "Polish", "native_name": "Polski", "max_chars": 224},
|
||||
"por": {"name": "Portuguese", "native_name": "Português", "max_chars": 203},
|
||||
"rus": {"name": "Russian", "native_name": "Русский", "max_chars": 182},
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import os
|
||||
|
||||
from lib.conf import tts_dir, voices_dir
|
||||
|
||||
loaded_tts = {}
|
||||
xtts_builtin_speakers_list = []
|
||||
|
||||
TTS_ENGINES = {
|
||||
"XTTSv2": "xtts",
|
||||
@@ -30,7 +31,6 @@ default_fine_tuned = 'internal'
|
||||
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
|
||||
default_voice_detection_model = 'drewThomasson/segmentation'
|
||||
|
||||
max_tts_in_memory = 2 # TTS engines to keep in memory (1 tts engine ~= 4GB to 8GB RAM).
|
||||
max_custom_model = 100
|
||||
max_custom_voices = 1000
|
||||
max_upload_size = '6GB'
|
||||
@@ -46,10 +46,6 @@ default_engine_settings = {
|
||||
"top_p": 0.85,
|
||||
"speed": 1.0,
|
||||
"enable_text_splitting": False,
|
||||
# to enable deepspeed, you must install it first:
|
||||
# conda activate ./python_env (linux/mac) or .\python_env (windows)
|
||||
# pip install deepspeed
|
||||
# conda deactivate
|
||||
"use_deepspeed": False,
|
||||
"files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav', 'speakers_xtts.pth'],
|
||||
"voices": {
|
||||
@@ -74,12 +70,12 @@ default_engine_settings = {
|
||||
"FerranSimen": "Ferran Simen", "XavierHayasaka": "Xavier Hayasaka", "LuisMoray": "Luis Moray",
|
||||
"MarcosRudaski": "Marcos Rudaski"
|
||||
},
|
||||
"rating": {"GPU VRAM": 4, "CPU": 3, "RAM": 8, "Realism": 5}
|
||||
"rating": {"VRAM": 2, "CPU": 2, "RAM": 4, "Realism": 5}
|
||||
},
|
||||
TTS_ENGINES['BARK']: {
|
||||
"samplerate": 24000,
|
||||
"text_temp": 0.50,
|
||||
"waveform_temp": 0.50,
|
||||
"text_temp": 0.4,
|
||||
"waveform_temp": 0.6,
|
||||
"files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
|
||||
"speakers_path": os.path.join(voices_dir, '__bark'),
|
||||
"voices": {
|
||||
@@ -128,31 +124,31 @@ default_engine_settings = {
|
||||
"zh_speaker_6": "Speaker 6", "zh_speaker_7": "Speaker 7", "zh_speaker_8": "Speaker 8",
|
||||
"zh_speaker_9": "Speaker 9"
|
||||
},
|
||||
"rating": {"GPU VRAM": 4, "CPU": 1, "RAM": 16, "Realism": 4}
|
||||
"rating": {"VRAM": 6, "CPU": 1, "RAM": 8, "Realism": 5}
|
||||
},
|
||||
TTS_ENGINES['VITS']: {
|
||||
"samplerate": 22050,
|
||||
"files": ['config.json', 'model_file.pth', 'language_ids.json'],
|
||||
"voices": {},
|
||||
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
|
||||
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
|
||||
},
|
||||
TTS_ENGINES['FAIRSEQ']: {
|
||||
"samplerate": 16000,
|
||||
"files": ['config.json', 'G_100000.pth', 'vocab.json'],
|
||||
"voices": {},
|
||||
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
|
||||
"rating": {"VRAM": 2, "CPU": 4, "RAM": 4, "Realism": 4}
|
||||
},
|
||||
TTS_ENGINES['TACOTRON2']: {
|
||||
"samplerate": 22050,
|
||||
"files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
|
||||
"voices": {},
|
||||
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 3}
|
||||
"rating": {"VRAM": 1, "CPU": 5, "RAM": 2, "Realism": 3}
|
||||
},
|
||||
TTS_ENGINES['YOURTTS']: {
|
||||
"samplerate": 16000,
|
||||
"files": ['config.json', 'model_file.pth'],
|
||||
"voices": {"Machinella-5": "female-en-5", "ElectroMale-2": "male-en-2", 'Machinella-4': 'female-pt-4\n', 'ElectroMale-3': 'male-pt-3\n'},
|
||||
"rating": {"GPU VRAM": 1, "CPU": 5, "RAM": 4, "Realism": 2}
|
||||
"rating": {"VRAM": 0, "CPU": 5, "RAM": 1, "Realism": 2}
|
||||
}
|
||||
}
|
||||
models = {
|
||||
@@ -333,6 +329,14 @@ models = {
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"PeterGriffinFamilyGuy": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
"sub": "xtts-v2/eng/PeterGriffinFamilyGuy/",
|
||||
"voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'PeterGriffinFamilyGuy.wav'),
|
||||
"files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
|
||||
"samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
|
||||
},
|
||||
"RafeBeckley": {
|
||||
"lang": "eng",
|
||||
"repo": "drewThomasson/fineTunedTTSModels",
|
||||
|
||||
136
pyproject.toml
@@ -1,69 +1,67 @@
|
||||
[build-system]
|
||||
name = "ebook2audiobook"
|
||||
requires = ["setuptools >= 64"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.poetry]
|
||||
name = "ebook2audiobook"
|
||||
version = "25.10.25"
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {file = "VERSION.txt"}
|
||||
|
||||
[project]
|
||||
name = "ebook2audiobook"
|
||||
description = "Convert eBooks to audiobooks with chapters and metadata"
|
||||
authors = [
|
||||
{ name = "Drew Thomasson" }
|
||||
]
|
||||
dependencies = [
|
||||
"regex",
|
||||
"tqdm",
|
||||
"cutlet",
|
||||
"deep_translator",
|
||||
"docker",
|
||||
"ebooklib",
|
||||
"fastapi",
|
||||
"num2words",
|
||||
"argostranslate",
|
||||
"beautifulsoup4",
|
||||
"fugashi",
|
||||
"sudachipy",
|
||||
"sudachidict_core",
|
||||
"ray",
|
||||
"unidic",
|
||||
"pymupdf4llm",
|
||||
"translate",
|
||||
"hangul-romanize",
|
||||
"indic-nlp-library",
|
||||
"iso639-lang",
|
||||
"jieba",
|
||||
"pycantonese",
|
||||
"soynlp",
|
||||
"pypinyin",
|
||||
"pythainlp",
|
||||
"mutagen",
|
||||
"PyOpenGL",
|
||||
"nvidia-ml-py",
|
||||
"phonemizer-fork",
|
||||
"pydub",
|
||||
"pyannote-audio==3.4.0",
|
||||
"demucs==4.0.1",
|
||||
"gradio>=5.49",
|
||||
"transformers==4.51.3",
|
||||
"coqui-tts[languages]==0.26.0",
|
||||
"torch>=2.8.0,<2.9",
|
||||
"torchaudio>=2.8.0,<2.9",
|
||||
"torchvggish"
|
||||
]
|
||||
readme = "README.md"
|
||||
requires-python = ">3.9,<3.13"
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
scripts = { "ebook2audiobook" = "app:main" }
|
||||
|
||||
[project.urls]
|
||||
"Homepage" = "https://github.com/DrewThomasson/ebook2audiobook"
|
||||
[build-system]
|
||||
name = "ebook2audiobook"
|
||||
requires = ["setuptools >= 64"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.poetry]
|
||||
name = "ebook2audiobook"
|
||||
version = "25.10.30"
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {file = "VERSION.txt"}
|
||||
|
||||
[project]
|
||||
name = "ebook2audiobook"
|
||||
description = "Convert eBooks to audiobooks with chapters and metadata"
|
||||
authors = [
|
||||
{ name = "Drew Thomasson" }
|
||||
]
|
||||
dependencies = [
|
||||
"torchvggish",
|
||||
"numpy<2",
|
||||
"num2words @ git+https://github.com/savoirfairelinux/num2words.git",
|
||||
"regex",
|
||||
"tqdm",
|
||||
"docker",
|
||||
"ebooklib",
|
||||
"fastapi",
|
||||
"beautifulsoup4",
|
||||
"fugashi",
|
||||
"sudachipy",
|
||||
"sudachidict_core",
|
||||
"PyMuPDF",
|
||||
"pytesseract",
|
||||
"unidic",
|
||||
"hangul-romanize",
|
||||
"indic-nlp-library",
|
||||
"iso639-lang",
|
||||
"jieba",
|
||||
"pycantonese",
|
||||
"soynlp",
|
||||
"pypinyin",
|
||||
"pythainlp",
|
||||
"mutagen",
|
||||
"PyOpenGL",
|
||||
"phonemizer-fork",
|
||||
"pydub",
|
||||
"demucs",
|
||||
"deepspeed",
|
||||
"pyannote-audio<=3.4.0",
|
||||
"stanza<=1.10.1",
|
||||
"argostranslate<=1.10.0",
|
||||
"gradio>=5.49.1",
|
||||
"torch<=2.7.1",
|
||||
"torchaudio<=2.7.1",
|
||||
"coqui-tts[languages]==0.27.2"
|
||||
]
|
||||
readme = "README.md"
|
||||
requires-python = ">3.9,<3.14"
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
scripts = { "ebook2audiobook" = "app:main" }
|
||||
|
||||
[project.urls]
|
||||
"Homepage" = "https://github.com/DrewThomasson/ebook2audiobook"
|
||||
@@ -1,20 +1,18 @@
|
||||
torchvggish
|
||||
numpy<2
|
||||
num2words @ git+https://github.com/savoirfairelinux/num2words.git
|
||||
regex
|
||||
tqdm
|
||||
cutlet
|
||||
deep_translator
|
||||
docker
|
||||
ebooklib
|
||||
fastapi
|
||||
num2words
|
||||
argostranslate
|
||||
beautifulsoup4
|
||||
fugashi
|
||||
sudachipy
|
||||
sudachidict_core
|
||||
ray
|
||||
PyMuPDF
|
||||
pytesseract
|
||||
unidic
|
||||
pymupdf4llm
|
||||
translate
|
||||
hangul-romanize
|
||||
indic-nlp-library
|
||||
iso639-lang
|
||||
@@ -25,14 +23,14 @@ pypinyin
|
||||
pythainlp
|
||||
mutagen
|
||||
PyOpenGL
|
||||
nvidia-ml-py
|
||||
phonemizer-fork
|
||||
pydub
|
||||
pyannote-audio==3.4.0
|
||||
demucs==4.0.1
|
||||
gradio>=5.49
|
||||
transformers==4.51.3
|
||||
coqui-tts[languages]==0.26.0
|
||||
torch>=2.8.0,<2.9
|
||||
torchaudio>=2.8.0,<2.9
|
||||
torchvggish
|
||||
demucs
|
||||
deepspeed
|
||||
pyannote-audio<=3.4.0
|
||||
stanza<=1.10.1
|
||||
argostranslate<=1.10.0
|
||||
gradio>=5.49.1
|
||||
torch<=2.7.1
|
||||
torchaudio<=2.7.1
|
||||
coqui-tts[languages]==0.27.2
|
||||
0
tmp/.gitkeep
Executable file → Normal file
BIN
tools/icons/appLogo.png
Normal file
|
After Width: | Height: | Size: 461 KiB |
173
tools/icons/generate-icons.py
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multi-platform icon generator
|
||||
Converts appLogo.png into platform-specific formats and sizes
|
||||
Requires: Pillow (PIL), cairosvg (optional for SVG)
|
||||
|
||||
Installation:
|
||||
pip install Pillow cairosvg
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from PIL import Image
|
||||
|
||||
# Icon sizes for each platform
|
||||
ICON_SIZES = {
|
||||
'windows': [16, 24, 32, 48, 256],
|
||||
'mac': [16, 32, 64, 128, 256, 512, 1024],
|
||||
'linux': [16, 24, 32, 48, 64, 128, 256]
|
||||
}
|
||||
|
||||
def create_directories():
|
||||
"""Create output directories for each platform"""
|
||||
for platform in ICON_SIZES.keys():
|
||||
os.makedirs(f'icons/{platform}', exist_ok=True)
|
||||
print("✓ Directories created")
|
||||
|
||||
def resize_image(source_path, output_dir, sizes):
|
||||
"""Resize image to multiple sizes"""
|
||||
try:
|
||||
img = Image.open(source_path)
|
||||
# Convert to RGBA to ensure transparency support
|
||||
img = img.convert('RGBA')
|
||||
|
||||
for size in sizes:
|
||||
resized = img.resize((size, size), Image.Resampling.LANCZOS)
|
||||
output_path = f'{output_dir}/icon-{size}.png'
|
||||
resized.save(output_path, 'PNG')
|
||||
print(f" ✓ Generated {size}x{size} icon")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Error resizing image: {e}")
|
||||
return False
|
||||
|
||||
def create_windows_ico(output_dir):
|
||||
"""Create Windows ICO file from PNGs"""
|
||||
try:
|
||||
sizes = ICON_SIZES['windows']
|
||||
images = []
|
||||
|
||||
for size in sizes:
|
||||
img_path = f'{output_dir}/icon-{size}.png'
|
||||
images.append(Image.open(img_path))
|
||||
|
||||
# Save as ICO with multiple sizes
|
||||
images[0].save(
|
||||
f'{output_dir}/appIcon.ico',
|
||||
format='ICO',
|
||||
sizes=[(size, size) for size in sizes]
|
||||
)
|
||||
print("✓ Windows ICO file created: icons/windows/appIcon.ico")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Error creating ICO: {e}")
|
||||
return False
|
||||
|
||||
def create_mac_icns(output_dir):
|
||||
"""Create macOS ICNS file from PNGs (requires imagemagick or online conversion)"""
|
||||
try:
|
||||
import subprocess
|
||||
sizes = ICON_SIZES['mac']
|
||||
|
||||
# Create iconset directory
|
||||
iconset_dir = f'{output_dir}/appIcon.iconset'
|
||||
os.makedirs(iconset_dir, exist_ok=True)
|
||||
|
||||
for size in sizes:
|
||||
img_path = f'{output_dir}/icon-{size}.png'
|
||||
# macOS uses specific naming conventions
|
||||
scale = 2 if size > 256 else 1
|
||||
icon_name = f'icon_{size // scale}x{size // scale}'
|
||||
if scale == 2:
|
||||
icon_name += '@2x'
|
||||
|
||||
output_path = f'{iconset_dir}/{icon_name}.png'
|
||||
os.system(f'cp {img_path} {output_path}')
|
||||
|
||||
# Try to create ICNS using iconutil (macOS only) or convert
|
||||
try:
|
||||
subprocess.run(['iconutil', '-c', 'icns', '-o',
|
||||
f'{output_dir}/appIcon.icns', iconset_dir],
|
||||
check=True, capture_output=True)
|
||||
print("✓ macOS ICNS file created: icons/mac/appIcon.icns")
|
||||
except (subprocess.CalledProcessError, FileNotFoundError):
|
||||
print("⚠ Note: iconutil not found. ICNS not created.")
|
||||
print(" On macOS, run: iconutil -c icns -o icons/mac/appIcon.icns icons/mac/appIcon.iconset")
|
||||
return False
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Error creating ICNS: {e}")
|
||||
return False
|
||||
|
||||
def create_svg_copy(source_path, output_dir):
|
||||
"""Create SVG copy for Linux (optional, requires vector source)"""
|
||||
try:
|
||||
import shutil
|
||||
svg_path = source_path.replace('.png', '.svg')
|
||||
|
||||
if os.path.exists(svg_path):
|
||||
shutil.copy(svg_path, f'{output_dir}/appIcon.svg')
|
||||
print(f"✓ SVG icon copied: icons/linux/appIcon.svg")
|
||||
return True
|
||||
else:
|
||||
print("⚠ No SVG source found (optional for Linux)")
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"✗ Error copying SVG: {e}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
"""Main execution"""
|
||||
print("🎨 Multi-Platform Icon Generator\n")
|
||||
|
||||
# Find source image
|
||||
source_image = 'appLogo.png'
|
||||
if not os.path.exists(source_image):
|
||||
print(f"✗ Error: {source_image} not found in current directory")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Source: {source_image}\n")
|
||||
|
||||
# Create directories
|
||||
create_directories()
|
||||
print()
|
||||
|
||||
# Generate icons for each platform
|
||||
for platform, sizes in ICON_SIZES.items():
|
||||
print(f"Generating {platform.upper()} icons...")
|
||||
output_dir = f'icons/{platform}'
|
||||
|
||||
if not resize_image(source_image, output_dir, sizes):
|
||||
sys.exit(1)
|
||||
print()
|
||||
|
||||
# Create platform-specific formats
|
||||
print("Creating platform-specific formats...\n")
|
||||
|
||||
if not create_windows_ico('icons/windows'):
|
||||
print("⚠ Continuing despite ICO creation issue\n")
|
||||
|
||||
if not create_mac_icns('icons/mac'):
|
||||
print("⚠ Continuing despite ICNS creation issue\n")
|
||||
|
||||
if not create_svg_copy(source_image, 'icons/linux'):
|
||||
print("⚠ Continuing despite SVG copy issue\n")
|
||||
|
||||
print("✅ Icon generation complete!")
|
||||
print("\nOutput structure:")
|
||||
print(" icons/")
|
||||
print(" ├── windows/")
|
||||
print(" │ ├── appIcon.ico")
|
||||
print(" │ └── icon-*.png")
|
||||
print(" ├── mac/")
|
||||
print(" │ ├── appIcon.icns (if created)")
|
||||
print(" │ └── icon-*.png")
|
||||
print(" └── linux/")
|
||||
print(" ├── appIcon.svg (if available)")
|
||||
print(" └── icon-*.png")
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
BIN
tools/icons/linux/icon-128.png
Normal file
|
After Width: | Height: | Size: 5.5 KiB |
BIN
tools/icons/linux/icon-16.png
Normal file
|
After Width: | Height: | Size: 345 B |
BIN
tools/icons/linux/icon-24.png
Normal file
|
After Width: | Height: | Size: 591 B |
BIN
tools/icons/linux/icon-256.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
tools/icons/linux/icon-32.png
Normal file
|
After Width: | Height: | Size: 876 B |
BIN
tools/icons/linux/icon-48.png
Normal file
|
After Width: | Height: | Size: 1.4 KiB |
BIN
tools/icons/linux/icon-64.png
Normal file
|
After Width: | Height: | Size: 2.3 KiB |
BIN
tools/icons/mac/appIcon.icns
Normal file
BIN
tools/icons/mac/appIcon.iconset/icon_128x128.png
Normal file
|
After Width: | Height: | Size: 5.5 KiB |
BIN
tools/icons/mac/appIcon.iconset/icon_16x16.png
Normal file
|
After Width: | Height: | Size: 345 B |
BIN
tools/icons/mac/appIcon.iconset/icon_256x256.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
tools/icons/mac/appIcon.iconset/icon_256x256@2x.png
Normal file
|
After Width: | Height: | Size: 46 KiB |
BIN
tools/icons/mac/appIcon.iconset/icon_32x32.png
Normal file
|
After Width: | Height: | Size: 876 B |
BIN
tools/icons/mac/appIcon.iconset/icon_512x512@2x.png
Normal file
|
After Width: | Height: | Size: 465 KiB |
BIN
tools/icons/mac/appIcon.iconset/icon_64x64.png
Normal file
|
After Width: | Height: | Size: 2.3 KiB |
BIN
tools/icons/mac/icon-1024.png
Normal file
|
After Width: | Height: | Size: 465 KiB |
BIN
tools/icons/mac/icon-128.png
Normal file
|
After Width: | Height: | Size: 5.5 KiB |
BIN
tools/icons/mac/icon-16.png
Normal file
|
After Width: | Height: | Size: 345 B |
BIN
tools/icons/mac/icon-256.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
tools/icons/mac/icon-32.png
Normal file
|
After Width: | Height: | Size: 876 B |
BIN
tools/icons/mac/icon-512.png
Normal file
|
After Width: | Height: | Size: 46 KiB |
BIN
tools/icons/mac/icon-64.png
Normal file
|
After Width: | Height: | Size: 2.3 KiB |
BIN
tools/icons/windows/appIcon.ico
Normal file
|
After Width: | Height: | Size: 367 B |
BIN
tools/icons/windows/icon-16.png
Normal file
|
After Width: | Height: | Size: 345 B |
BIN
tools/icons/windows/icon-24.png
Normal file
|
After Width: | Height: | Size: 591 B |
BIN
tools/icons/windows/icon-256.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
tools/icons/windows/icon-32.png
Normal file
|
After Width: | Height: | Size: 876 B |
BIN
tools/icons/windows/icon-48.png
Normal file
|
After Width: | Height: | Size: 1.4 KiB |