# NOTE!!NOTE!!!NOTE!!NOTE!!!NOTE!!NOTE!!!NOTE!!NOTE!!!
# THE WORD "CHAPTER" IN THE CODE DOES NOT NECESSARILY MEAN
# A REAL CHAPTER OF THE EBOOK, SINCE NO STANDARD DEFINES
# WHAT A CHAPTER IS IN THE .EPUB FORMAT. THE WORD "BLOCK"
# IS USED WHEN PRINTING TO THE TERMINAL, AND "CHAPTER" IS USED IN THE CODE,
# WHERE IT IS LESS GENERIC FOR THE DEVELOPERS
import argparse
import asyncio
import csv
import fnmatch
import hashlib
import io
import json
import math
import os
import platform
import random
import shutil
import socket
import subprocess
import sys
import tempfile
import threading
import time
import traceback
import unicodedata
import urllib.request
import uuid
import zipfile
import ebooklib
import gradio as gr
import psutil
import pymupdf4llm
import regex as re
import requests
import stanza
import torch
import uvicorn
from soynlp.tokenizer import LTokenizer
from pythainlp.tokenize import word_tokenize
from sudachipy import dictionary, tokenizer
from PIL import Image
from tqdm import tqdm
from bs4 import BeautifulSoup, NavigableString, Tag
from collections import Counter
from collections.abc import Mapping
from collections.abc import MutableMapping
from datetime import datetime
from ebooklib import epub
from glob import glob
from iso639 import languages
from markdown import markdown
from multiprocessing import Pool, cpu_count
from multiprocessing import Manager, Event
from multiprocessing.managers import DictProxy, ListProxy
from num2words import num2words
from pathlib import Path
from pydub import AudioSegment
from queue import Queue, Empty
from types import MappingProxyType
from urllib.parse import urlparse
from starlette.requests import ClientDisconnect
from lib import *
from lib.classes.voice_extractor import VoiceExtractor
from lib.classes.tts_manager import TTSManager
#from lib.classes.redirect_console import RedirectConsole
#from lib.classes.argos_translator import ArgosTranslator
#import logging
#logging.basicConfig(
# format="%(asctime)s %(name)s %(levelname)s %(message)s",
# level=logging.DEBUG
#)
#logging.getLogger("gradio").setLevel(logging.DEBUG)
#logging.getLogger("httpx").setLevel(logging.DEBUG)
# Module-level placeholder; not assigned anywhere in the visible part of this
# file (presumably set during startup — TODO confirm).
context = None
# Module-level lock; its users are outside the visible part of this file.
lock = threading.Lock()
# When False, DependencyError.handle_exception() ends the process with
# sys.exit(1); extract_custom_model() deletes the source archive only when True.
is_gui_process = False
class DependencyError(Exception):
    """Exception that reports itself as soon as it is instantiated.

    Building an instance prints the message, prints the traceback of the
    exception currently being handled and, unless the module-level
    ``is_gui_process`` flag is true, terminates the process with status 1.
    """

    def __init__(self, message=None):
        super().__init__(message)
        print(message)
        # Reporting is triggered from the constructor, so call sites only
        # need to instantiate the class.
        self.handle_exception()

    def handle_exception(self):
        """Print diagnostics, then exit unless running as the GUI process."""
        traceback.print_exc()
        print(f'Caught DependencyError: {self}')
        if is_gui_process:
            return
        sys.exit(1)
def recursive_proxy(data, manager=None):
    """Rebuild ``data`` out of containers created by ``manager``.

    Dicts become ``manager.dict()`` objects and lists become
    ``manager.list()`` objects, recursively. ``str``, ``int``, ``float``,
    ``bool`` and ``None`` values are returned unchanged. Any other type is
    reported on stdout and replaced by ``None``.

    A new ``Manager()`` is created when ``manager`` is not supplied.
    """
    if manager is None:
        manager = Manager()
    if isinstance(data, dict):
        shared_map = manager.dict()
        for name, entry in data.items():
            shared_map[name] = recursive_proxy(entry, manager)
        return shared_map
    if isinstance(data, list):
        shared_seq = manager.list()
        for element in data:
            shared_seq.append(recursive_proxy(element, manager))
        return shared_seq
    if isinstance(data, (str, int, float, bool, type(None))):
        return data
    error = f"Unsupported data type: {type(data)}"
    print(error)
    return None
class SessionContext:
    """Registry of per-session state shared through a multiprocessing Manager."""

    def __init__(self):
        self.manager = Manager()
        self.sessions = self.manager.dict()  # Store all session-specific contexts
        self.cancellation_events = {}  # Store multiprocessing.Event for each session

    def get_session(self, id):
        """Return the session stored under ``id``, creating it with defaults first if needed.

        The default structure is converted with ``recursive_proxy`` using this
        context's manager before being stored in ``self.sessions``.
        """
        if id not in self.sessions:
            xtts_defaults = default_engine_settings[TTS_ENGINES['XTTSv2']]
            bark_defaults = default_engine_settings[TTS_ENGINES['BARK']]
            self.sessions[id] = recursive_proxy({
                "script_mode": NATIVE,
                "id": id,
                "process_id": None,
                "device": default_device,
                "system": None,
                "client": None,
                "language": default_language_code,
                "language_iso1": None,
                "audiobook": None,
                "audiobooks_dir": None,
                "process_dir": None,
                "ebook": None,
                "ebook_list": None,
                "ebook_mode": "single",
                "chapters_dir": None,
                "chapters_dir_sentences": None,
                "epub_path": None,
                "filename_noext": None,
                "tts_engine": default_tts_engine,
                "fine_tuned": default_fine_tuned,
                "voice": None,
                "voice_dir": None,
                "custom_model": None,
                "custom_model_dir": None,
                "toc": None,
                "chapters": None,
                "cover": None,
                "status": None,
                "progress": 0,
                "time": None,
                "cancellation_requested": False,
                "temperature": xtts_defaults['temperature'],
                "length_penalty": xtts_defaults['length_penalty'],
                "num_beams": xtts_defaults['num_beams'],
                "repetition_penalty": xtts_defaults['repetition_penalty'],
                "top_k": xtts_defaults['top_k'],
                # Fixed: this entry used to be initialised from the 'top_k' setting.
                "top_p": xtts_defaults['top_p'],
                "speed": xtts_defaults['speed'],
                "enable_text_splitting": xtts_defaults['enable_text_splitting'],
                "text_temp": bark_defaults['text_temp'],
                "waveform_temp": bark_defaults['waveform_temp'],
                "event": None,
                "final_name": None,
                "output_format": default_output_format,
                "metadata": {
                    "title": None,
                    "creator": None,
                    "contributor": None,
                    "language": None,
                    "identifier": None,
                    "publisher": None,
                    "date": None,
                    "description": None,
                    "subject": None,
                    "rights": None,
                    "format": None,
                    "type": None,
                    "coverage": None,
                    "relation": None,
                    "Source": None,
                    "Modified": None,
                }
            }, manager=self.manager)
        return self.sessions[id]
def prepare_dirs(src, session):
    """Create the working directories of a session and stage the source ebook.

    The ebook is copied into ``session['process_dir']``. When a copy with the
    same hash already exists there, the chapters directory is kept; otherwise
    it is wiped before being (re)created.

    Returns True on success, False after reporting a failure through
    DependencyError.
    """
    try:
        os.makedirs(os.path.join(models_dir,'tts'), exist_ok=True)
        for dir_key in ('session_dir', 'process_dir', 'custom_model_dir', 'voice_dir', 'audiobooks_dir'):
            os.makedirs(session[dir_key], exist_ok=True)
        session['ebook'] = os.path.join(session['process_dir'], os.path.basename(src))
        # An identical staged copy means earlier chapter output can be reused.
        resume = bool(
            os.path.exists(session['ebook'])
            and compare_files_by_hash(session['ebook'], src)
        )
        if not resume:
            shutil.rmtree(session['chapters_dir'], ignore_errors=True)
        os.makedirs(session['chapters_dir'], exist_ok=True)
        os.makedirs(session['chapters_dir_sentences'], exist_ok=True)
        shutil.copy(src, session['ebook'])
        return True
    except Exception as e:
        DependencyError(e)
        return False
def check_programs(prog_name, command, options):
    """Run ``command options`` to check that an external program is usable.

    Returns ``(True, None)`` when the command exits successfully, and
    ``(False, None)`` after reporting through DependencyError when the
    executable is missing or exits with a non-zero status.
    """
    try:
        subprocess.run(
            [command, options],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
            encoding='utf-8'
        )
    except FileNotFoundError:
        e = f'''********** Error: {prog_name} is not installed! if your OS calibre package version
is not compatible you still can run ebook2audiobook.sh (linux/mac) or ebook2audiobook.cmd (windows) **********'''
        DependencyError(e)
        return False, None
    except subprocess.CalledProcessError:
        e = f'Error: There was an issue running {prog_name}.'
        DependencyError(e)
        return False, None
    else:
        return True, None
def analyze_uploaded_file(zip_path, required_files):
    """Check that a ZIP archive contains every required file, non-empty.

    File names are compared case-insensitively and by base name only, so the
    directory layout inside the archive is ignored.

    Returns True when all required files are present with a non-zero size,
    False otherwise (also when ``zip_path`` does not exist).

    Raises:
        ValueError: the file is not a valid ZIP archive.
        RuntimeError: any other error occurred while inspecting it.
    """
    try:
        if not os.path.exists(zip_path):
            error = f"The file does not exist: {os.path.basename(zip_path)}"
            print(error)
            return False
        sizes_by_name = {}
        zero_sized = set()
        with zipfile.ZipFile(zip_path, 'r') as archive:
            for entry in archive.infolist():
                if entry.is_dir():
                    continue
                key = os.path.basename(entry.filename).lower()
                sizes_by_name[key] = entry.file_size
                if entry.file_size == 0:
                    zero_sized.add(key)
        wanted = [name.lower() for name in required_files]
        missing_files = [name for name in wanted if name not in sizes_by_name]
        required_empty_files = [name for name in wanted if name in zero_sized]
        if missing_files:
            print(f"Missing required files: {missing_files}")
        if required_empty_files:
            print(f"Required files with 0 KB: {required_empty_files}")
        return not (missing_files or required_empty_files)
    except zipfile.BadZipFile:
        error = "The file is not a valid ZIP archive."
        raise ValueError(error)
    except Exception as e:
        error = f"An error occurred: {e}"
        raise RuntimeError(error)
def extract_custom_model(file_src, session, required_files=None):
    """Extract the required files of a zipped custom model into the session.

    The target directory is
    ``session['custom_model_dir']/<tts_engine>/<sanitized archive name>``.
    Only archive members whose lower-cased base name is listed in
    ``required_files`` are written, flattened into that directory. When the
    directory already exists, extraction is skipped and its path is returned.

    Args:
        file_src: path of the ZIP archive.
        session: session mapping; ``tts_engine`` and ``custom_model_dir`` are read.
        required_files: file names to extract; defaults to the file list of
            the default fine-tuned model of the session's TTS engine.

    Returns:
        The model directory path, or None after a failure was reported
        through DependencyError.
    """
    def _discard_upload():
        # In GUI mode the source archive is deleted once handled. The
        # existence check keeps the error handlers from raising a second
        # exception when the file is already gone.
        if is_gui_process and os.path.exists(file_src):
            os.remove(file_src)

    try:
        if required_files is None:
            required_files = models[session['tts_engine']][default_fine_tuned]['files']
        # Strip only a trailing ".zip" extension. The previous pattern '.zip'
        # matched any character followed by "zip", anywhere in the name.
        model_name = re.sub(r'\.zip$', '', os.path.basename(file_src), flags=re.IGNORECASE)
        model_name = get_sanitized(model_name)
        with zipfile.ZipFile(file_src, 'r') as zip_ref:
            files = zip_ref.namelist()
            model_path = os.path.join(session['custom_model_dir'], session['tts_engine'], model_name)
            if os.path.exists(model_path):
                print(f'{model_path} already exists, bypassing files extraction')
                return model_path
            os.makedirs(model_path, exist_ok=True)
            required_files_lc = {name.lower() for name in required_files}
            with tqdm(total=len(files), unit='files') as t:
                for member in files:
                    base_name = os.path.basename(member).lower()
                    if base_name in required_files_lc:
                        out_path = os.path.join(model_path, base_name)
                        with zip_ref.open(member) as src, open(out_path, 'wb') as dst:
                            shutil.copyfileobj(src, dst)
                    t.update(1)
        _discard_upload()
        msg = f'Extracted files to {model_path}'
        print(msg)
        return model_path
    except (asyncio.exceptions.CancelledError, Exception) as e:
        # CancelledError derives from BaseException, so it is listed explicitly.
        DependencyError(e)
        _discard_upload()
        return None
def hash_proxy_dict(proxy_dict):
    """Return the MD5 hex digest of the UTF-8 encoded ``str()`` of ``proxy_dict``."""
    serialized = str(proxy_dict).encode('utf-8')
    digest = hashlib.md5(serialized)
    return digest.hexdigest()
def calculate_hash(filepath, hash_algorithm='sha256'):
    """Return the hex digest of the file at ``filepath``.

    The file is read in 8192-byte chunks so large files are not loaded into
    memory at once. ``hash_algorithm`` is any name accepted by ``hashlib.new``.
    """
    digest = hashlib.new(hash_algorithm)
    with open(filepath, 'rb') as stream:
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()
def compare_files_by_hash(file1, file2, hash_algorithm='sha256'):
    """Return True when both files produce the same digest with ``hash_algorithm``."""
    first_digest = calculate_hash(file1, hash_algorithm)
    second_digest = calculate_hash(file2, hash_algorithm)
    return first_digest == second_digest
def compare_dict_keys(d1, d2):
    """Compare the key structure of two (possibly nested) mappings.

    Returns:
        * ``d1 == d2`` when either argument is not a mapping;
        * ``{"missing_in_d2": set, "missing_in_d1": set}`` when the key sets
          of the two mappings differ;
        * ``{key: <nested result>}`` for the first shared key (in ``d1``
          order) whose nested mappings differ in structure;
        * ``None`` when the key structures match.
    """
    if not isinstance(d1, Mapping) or not isinstance(d2, Mapping):
        return d1 == d2
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())
    missing_in_d2 = d1_keys - d2_keys
    missing_in_d1 = d2_keys - d1_keys
    if missing_in_d2 or missing_in_d1:
        return {
            "missing_in_d2": missing_in_d2,
            "missing_in_d1": missing_in_d1,
        }
    # Walk shared keys in d1's order so the reported key is deterministic.
    for key in d1:
        left, right = d1[key], d2[key]
        # Only nested mappings carry key structure. Recursing into leaf
        # values would make the scalar equality result look like a difference.
        if isinstance(left, Mapping) and isinstance(right, Mapping):
            # Fixed: this used to call the undefined name `compare_keys`.
            nested_result = compare_dict_keys(left, right)
            if nested_result:
                return {key: nested_result}
    return None
def proxy2dict(proxy_obj):
    """Return a plain-Python deep copy of ``proxy_obj``.

    Dicts, lists, ``DictProxy`` and ``ListProxy`` objects are copied
    recursively; sets become lists; ``int``/``float``/``str``/``bool``/``None``
    are returned unchanged; anything else is converted with ``str()``.
    A container that contains itself is replaced by ``None`` at the point
    where the cycle closes.
    """
    def recursive_copy(source, active):
        if isinstance(source, (int, float, str, bool, type(None))):
            # Scalars are never tracked: identical values share an id, and
            # tracking them used to turn repeated values into None.
            return source
        if isinstance(source, set):
            return list(source)
        if isinstance(source, DictProxy):
            snapshot = dict(source)  # Convert DictProxy to dict
        elif isinstance(source, ListProxy):
            snapshot = list(source)  # Convert ListProxy to list
        elif isinstance(source, (dict, list)):
            snapshot = source
        else:
            return str(source)  # Convert non-serializable types to strings
        marker = id(source)
        if marker in active:
            return None  # Stop processing circular references
        # Track only the containers on the current path, so an object that is
        # merely referenced twice (not cyclic) is still copied in full.
        active.add(marker)
        try:
            if isinstance(snapshot, dict):
                return {key: recursive_copy(value, active) for key, value in snapshot.items()}
            return [recursive_copy(item, active) for item in snapshot]
        finally:
            active.discard(marker)
    return recursive_copy(proxy_obj, set())
def convert2epub(session):
if session['cancellation_requested']:
print('Cancel requested')
return False
try:
title = False
author = False
util_app = shutil.which('ebook-convert')
if not util_app:
error = "The 'ebook-convert' utility is not installed or not found."
print(error)
return False
file_input = session['ebook']
if os.path.getsize(file_input) == 0:
error = f"Input file is empty: {file_input}"
print(error)
return False
file_ext = os.path.splitext(file_input)[1].lower()
if file_ext not in ebook_formats:
error = f'Unsupported file format: {file_ext}'
print(error)
return False
if file_ext == '.pdf':
import fitz
msg = 'File input is a PDF. flatten it in MarkDown...'
print(msg)
doc = fitz.open(session['ebook'])
pdf_metadata = doc.metadata
filename_no_ext = os.path.splitext(os.path.basename(session['ebook']))[0]
title = pdf_metadata.get('title') or filename_no_ext
author = pdf_metadata.get('author') or False
markdown_text = pymupdf4llm.to_markdown(session['ebook'])
# Remove single asterisks for italics (but not bold **)
markdown_text = re.sub(r'(? in the head of the first XHTML document
if all_docs:
html = all_docs[0].get_content().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
title_tag = soup.select_one("head > title")
if title_tag and title_tag.text.strip():
return title_tag.text.strip()
# 3. Try if no visible
img = soup.find("img", alt=True)
if img:
alt = img['alt'].strip()
if alt and "cover" not in alt.lower():
return alt
return None
def get_cover(epubBook, session):
    """Extract the cover image of an EPUB and save it as a JPEG.

    The first ITEM_COVER item is used; failing that, the first ITEM_IMAGE
    item whose file name or id contains "cover". The image is written to
    ``<process_dir>/<filename_noext>.jpg``.

    Returns:
        The cover path when a cover was saved, True when the book has no
        cover, False when cancellation was requested or an error was
        reported through DependencyError.
    """
    try:
        if session['cancellation_requested']:
            msg = 'Cancel requested'
            print(msg)
            return False
        cover_image = None
        cover_path = os.path.join(session['process_dir'], session['filename_noext'] + '.jpg')
        for item in epubBook.get_items_of_type(ebooklib.ITEM_COVER):
            cover_image = item.get_content()
            break
        if not cover_image:
            for item in epubBook.get_items_of_type(ebooklib.ITEM_IMAGE):
                if 'cover' in item.file_name.lower() or 'cover' in item.get_id().lower():
                    cover_image = item.get_content()
                    break
        if cover_image:
            # Open the image from bytes
            image = Image.open(io.BytesIO(cover_image))
            # The JPEG writer handles L, RGB and CMYK data only. Convert every
            # other mode (RGBA, P, LA, PA, I;16, ...) instead of just RGBA/P,
            # which used to make the save fail for e.g. grey+alpha covers.
            if image.mode not in ('RGB', 'L', 'CMYK'):
                image = image.convert('RGB')
            image.save(cover_path, format='JPEG')
            return cover_path
        return True
    except Exception as e:
        DependencyError(e)
        return False
def get_chapters(epubBook, session):
    """Extract the TOC and the sentence lists of every spine document.

    Each spine document is passed to ``filter_chapter``; documents for which
    it returns None are dropped.

    Returns:
        ``(toc, chapters)`` on success, ``([], [])`` when the book has no
        spine documents, False when cancellation was requested, and
        ``(None, None)`` after an error was reported through DependencyError.
    """
    try:
        msg = r'''
*******************************************************************************
NOTE:
The warning "Character xx not found in the vocabulary."
MEANS THE MODEL CANNOT INTERPRET THE CHARACTER AND WILL MAYBE GENERATE
(AS WELL AS WRONG PUNCTUATION POSITION) AN HALLUCINATION TO IMPROVE THIS MODEL,
IT NEEDS TO ADD THIS CHARACTER INTO A NEW TRAINING MODEL.
YOU CAN IMPROVE IT OR ASK TO A TRAINING MODEL EXPERT.
*******************************************************************************
'''
        print(msg)
        if session['cancellation_requested']:
            print('Cancel requested')
            return False
        # Step 1: Extract TOC (Table of Contents)
        # Pre-initialised so that a failure while reading the TOC (already
        # logged below) no longer leaves `toc` unbound at the final return.
        toc = []
        try:
            toc = epubBook.toc  # Extract TOC
            # Normalized TOC titles; the list is not used further in this function.
            toc_list = [
                nt for item in toc if hasattr(item, 'title')
                if (nt := normalize_text(
                    str(item.title),
                    session['language'],
                    session['language_iso1'],
                    session['tts_engine']
                )) is not None
            ]
        except Exception as toc_error:
            error = f"Error extracting TOC: {toc_error}"
            print(error)
        # Get spine item IDs (a set gives O(1) membership tests below)
        spine_ids = {item[0] for item in epubBook.spine}
        # Filter only spine documents (i.e., reading order)
        all_docs = [
            item for item in epubBook.get_items_of_type(ebooklib.ITEM_DOCUMENT)
            if item.id in spine_ids
        ]
        if not all_docs:
            return [], []
        # The title is computed but not used further in this function.
        title = get_ebook_title(epubBook, all_docs)
        chapters = []
        stanza_nlp = False
        # A stanza pipeline is only built for the languages listed in
        # year_to_decades_languages.
        if session['language'] in year_to_decades_languages:
            stanza.download(session['language_iso1'])
            stanza_nlp = stanza.Pipeline(session['language_iso1'], processors='tokenize,ner')
        is_num2words_compat = get_num2words_compat(session['language_iso1'])
        msg = 'Analyzing maths and dates to convert in words...'
        print(msg)
        for doc in all_docs:
            sentences_list = filter_chapter(doc, session['language'], session['language_iso1'], session['tts_engine'], stanza_nlp, is_num2words_compat)
            if sentences_list is not None:
                chapters.append(sentences_list)
        return toc, chapters
    except Exception as e:
        error = f'Error extracting main content pages: {e}'
        DependencyError(error)
        return None, None
def filter_chapter(doc, lang, lang_iso1, tts_engine, stanza_nlp, is_num2words_compat):
    """Turn one EPUB document into a list of sentences for the TTS engine.

    The document body is walked into (kind, payload) items, break/pause
    markers from TTS_SML are inserted, tables are flattened into lines, years
    inside detected dates and maths are converted to words, unwanted special
    characters are blanked, and the resulting text is handed to
    ``get_sentences``.

    Args:
        doc: EPUB document item; ``get_body_content()`` is read from it.
        lang, lang_iso1, tts_engine: forwarded to the text helpers.
        stanza_nlp: pipeline passed to ``get_date_entities``, or a falsy
            value to skip date handling.
        is_num2words_compat: forwarded to ``year2words`` / ``math2words``.

    Returns:
        The value of ``get_sentences(...)``, or None when the document has no
        text, is of an excluded ``epub:type``, yields no item, contains no
        letter/digit, or an error was reported through DependencyError.
    """
    def walk(node):
        # Generator over the children of `node`, yielding tuples:
        #   ("text", str), ("heading", str), ("table", Tag),
        #   ("break-request", None) and ("pause-request", None).
        try:
            for child in node.children:
                if isinstance(child, NavigableString):
                    text = child.strip()
                    if text:
                        yield ("text", text)
                elif isinstance(child, Tag):
                    name = child.name.lower()
                    if name in heading_tags:
                        # Headings are emitted whole; their children are not walked.
                        title = child.get_text(strip=True)
                        if title:
                            yield ("heading", title)
                    elif name == "table":
                        # Tables are passed through as tags and flattened later.
                        yield ("table", child)
                    else:
                        return_data = False
                        if name in proc_tags:
                            for inner in walk(child):
                                return_data = True
                                yield inner
                            # Only add pause if something textual/structural came out
                            if return_data:
                                if name in break_tags:
                                    yield ("break-request", None)
                                elif name in heading_tags or name in pause_tags:
                                    # Heading tags are already handled above, so in
                                    # practice only pause_tags reach this branch.
                                    yield ("pause-request", None)
                        else:
                            yield from walk(child)
        except Exception as e:
            error = f'filter_chapter() walk() error: {e}'
            DependencyError(error)
            return None
    try:
        heading_tags = [f'h{i}' for i in range(1, 5)]
        break_tags = ['br', 'p']
        pause_tags = ['div', 'span']
        proc_tags = heading_tags + break_tags + pause_tags
        raw_html = doc.get_body_content().decode("utf-8")
        soup = BeautifulSoup(raw_html, 'html.parser')
        body = soup.body
        if not body or not body.get_text(strip=True):
            return None
        # Skip known non-chapter types
        epub_type = body.get("epub:type", "").lower()
        if not epub_type:
            # Fall back to the first <section> when <body> carries no epub:type.
            section_tag = soup.find("section")
            if section_tag:
                epub_type = section_tag.get("epub:type", "").lower()
        excluded = {
            "frontmatter", "backmatter", "toc", "titlepage", "colophon",
            "acknowledgments", "dedication", "glossary", "index",
            "appendix", "bibliography", "copyright-page", "landmark"
        }
        # Substring match: any excluded word appearing inside epub_type skips the document.
        if any(part in epub_type for part in excluded):
            return None
        # remove scripts/styles
        for tag in soup(["script", "style"]):
            tag.decompose()
        items = list(walk(body))
        if not items:
            return None
        # Process items to insert breaks and pauses
        processed = []
        br_run = 0  # not used below
        for typ, payload in items:
            if typ == 'break-request':
                # Requests at the very start (empty list) are dropped as well.
                if processed and processed[-1][0] != "break": # avoid duplicates
                    processed.append(("break", TTS_SML['break']))
            elif typ == 'pause-request':
                if processed and processed[-1][0] != "pause": # avoid duplicates
                    processed.append(("pause", TTS_SML['pause']))
            else:
                processed.append((typ, payload))
        items = processed
        # Flatten the items into text lines.
        text_array = []
        handled_tables = set()
        for typ, payload in items:
            if typ == "heading":
                raw_text = replace_roman_numbers(payload, lang)
                # Always add pause after heading (but avoid duplicate if previous already pause)
                if text_array and text_array[-1] == TTS_SML['pause']:
                    text_array.append(raw_text.strip())
                    text_array.append(TTS_SML['pause'])
                else:
                    text_array.append(f"{raw_text.strip()} {TTS_SML['pause']}")
            elif typ == "break":
                text_array.append(TTS_SML['break'])
            elif typ == "pause":
                text_array.append(TTS_SML['pause'])
            elif typ == "table":
                table = payload
                # Skip a table already seen (set membership on the Tag object).
                if table in handled_tables:
                    continue
                handled_tables.add(table)
                rows = table.find_all("tr")
                if not rows:
                    continue
                # The first row supplies the headers; it is not emitted as a line itself.
                headers = [c.get_text(strip=True) for c in rows[0].find_all(["td", "th"])]
                for row in rows[1:]:
                    cells = [c.get_text(strip=True).replace('\xa0', ' ') for c in row.find_all("td")]
                    if not cells:
                        continue
                    # "header: cell" pairs when the counts match, bare cells otherwise.
                    if len(cells) == len(headers) and headers:
                        line = " — ".join(f"{h}: {c}" for h, c in zip(headers, cells))
                    else:
                        line = " — ".join(cells)
                    if line:
                        text_array.append(line.strip())
            else: # "text"
                text = payload.strip()
                if text:
                    text_array.append(text)
        text = "\n".join(text_array)
        # Nothing to read when the text holds no letter or digit.
        if not re.search(r"[^\W_]", text):
            return None
        if stanza_nlp:
            # Check if numbers exists in the text
            if bool(re.search(r'[-+]?\b\d+(\.\d+)?\b', text)):
                # Check if there are positive integers so possible date to convert
                if bool(re.search(r'\b\d+\b', text)):
                    date_spans = get_date_entities(text, stanza_nlp)
                    if date_spans:
                        # Rebuild the text, replacing 4-digit years inside each date span.
                        result = []
                        last_pos = 0
                        for start, end, date_text in date_spans:
                            # Append text before this date
                            result.append(text[last_pos:start])
                            processed = re.sub(r"\b\d{4}\b", lambda m: year2words(m.group(), lang, lang_iso1, is_num2words_compat), date_text)
                            if not processed:
                                break
                            result.append(processed)
                            last_pos = end
                        # Append remaining text
                        result.append(text[last_pos:])
                        text = ''.join(result)
        text = math2words(text, lang, lang_iso1, tts_engine, is_num2words_compat)
        # build a translation table mapping each bad char to a space
        specialchars_remove_table = str.maketrans({ch: ' ' for ch in specialchars_remove})
        text = text.translate(specialchars_remove_table)
        text = normalize_text(text, lang, lang_iso1, tts_engine)
        # Ensure space before and after punctuation_list
        pattern_space = re.escape(''.join(punctuation_list))
        punctuation_pattern_space = r'\s*([{}])\s*'.format(pattern_space)
        text = re.sub(punctuation_pattern_space, r' \1 ', text)
        return get_sentences(text, lang, tts_engine)
    except Exception as e:
        error = f'filter_chapter() error: {e}'
        DependencyError(error)
        return None
def get_sentences(text, lang, tts_engine):
    """Split ``text`` into chunks no longer than the language's ``max_chars``.

    The text is split first on the TTS_SML pause/break markers, then on hard
    punctuation, then (for over-long pieces) on soft punctuation. For
    zho/jpn/kor/tha/lao/mya/khm the pieces are tokenized and re-joined up to
    ``max_chars``; for other languages over-long pieces are wrapped on spaces.

    Returns:
        A list of strings (markers are kept as separate entries), or None
        when an exception occurred (the error is printed).
    """
    def regex_split_inclusive(text, pattern):
        # Split `text` after every match of `pattern`, keeping the matched
        # text at the end of each piece; a non-empty remainder is appended.
        result = []
        last_end = 0
        for match in pattern.finditer(text):
            result.append(text[last_end:match.end()].strip())
            last_end = match.end()
        if last_end < len(text):
            tail = text[last_end:].strip()
            if tail:
                result.append(tail)
        return result
    def segment_ideogramms(text):
        # Tokenize `text` with the tokenizer selected by `lang`; on failure the
        # error is reported and the text is returned as a single token.
        try:
            if lang == 'zho':
                import jieba
                return list(jieba.cut(text))
            elif lang == 'jpn':
                sudachi = dictionary.Dictionary().create()
                mode = tokenizer.Tokenizer.SplitMode.C
                return [m.surface() for m in sudachi.tokenize(text, mode)]
            elif lang == 'kor':
                ltokenizer = LTokenizer()
                return ltokenizer.tokenize(text)
            elif lang in ['tha', 'lao', 'mya', 'khm']:
                return word_tokenize(text, engine='newmm')
            else:
                return [text]
        except Exception as e:
            DependencyError(e)
            return [text]
    def join_ideogramms(idg_list):
        # Generator: concatenate tokens (no separator) into chunks that stay
        # within max_chars; TTS_SML markers are emitted on their own.
        try:
            buffer = ''
            for token in idg_list:
                # 1) On pause: flush & emit buffer, then the pause
                if token.strip() in TTS_SML.values():
                    if buffer:
                        yield buffer
                        buffer = ''
                    yield token
                    continue
                # 2) If adding this token would overflow, flush current buffer first
                if buffer and len(buffer) + len(token) > max_chars:
                    yield buffer
                    buffer = ''
                # 3) Append the token (word, punctuation, whatever)
                buffer += token
            # 4) Flush any trailing text
            if buffer:
                yield buffer
        except Exception as e:
            DependencyError(e)
            yield buffer
    try:
        max_chars = language_mapping[lang]['max_chars']
        # Minimum number of whitespace-separated words a soft-split buffer
        # must hold before it is flushed (see Step 4).
        min_tokens = 5
        # tacotron2 apparently does not like double quotes
        if tts_engine == TTS_ENGINES['TACOTRON2']:
            text = text.replace('"', '')
        # Step 1: Split first by ‡pause‡ and ‡break‡, keeping them as separate elements
        sml_list = re.split(
            rf'({re.escape(TTS_SML["pause"])}|{re.escape(TTS_SML["break"])})', text
        )
        # Drop empty pieces and strip the textual ones; markers are kept as-is.
        sml_list = [
            s if s in (TTS_SML['pause'], TTS_SML['break']) else s.strip()
            for s in sml_list if s.strip() or s in (TTS_SML['pause'], TTS_SML['break'])
        ]
        # Step 3: split with punctuation_split_hard_set
        pattern_split = '|'.join(map(re.escape, punctuation_split_hard_set))
        # Lazy match up to a hard punctuation mark that is followed by whitespace or end of text.
        pattern = re.compile(rf"(.*?(?:{pattern_split}))(?=\s|$)", re.DOTALL)
        hard_list = []
        for s in sml_list:
            # NOTE(review): only the pause marker is passed through explicitly
            # here; the break marker goes through the splitter like text.
            if s == TTS_SML['pause']:
                hard_list.append(s)
            else:
                parts = regex_split_inclusive(s, pattern)
                if parts:
                    for text_part in parts:
                        text_part = text_part.strip()
                        if text_part:
                            hard_list.append(text_part)
                else:
                    # no hard-split punctuation found -> keep whole sentence
                    hard_list.append(s)
        # Step 4: check if some hard_list entries exceed max_chars, so split on soft punctuation
        pattern_split = '|'.join(map(re.escape, punctuation_split_soft_set))
        pattern = re.compile(rf"(.*?(?:{pattern_split}))(?=\s|$)", re.DOTALL)
        soft_list = []
        for s in hard_list:
            if s == TTS_SML['pause']:
                soft_list.append(s)
            elif len(s) > max_chars:
                parts = [p.strip() for p in regex_split_inclusive(s, pattern) if p.strip()]
                if parts:
                    buffer = ''
                    for part in parts:
                        # always treat pauses as their own chunk
                        if part == TTS_SML['pause']:
                            if buffer:
                                soft_list.append(buffer)
                                buffer = ''
                            soft_list.append(part)
                            continue
                        if not buffer:
                            buffer = part
                            continue
                        # Keep merging while the buffer is shorter than min_tokens words.
                        if len(buffer.split()) < min_tokens:
                            buffer = buffer + ' ' + part
                        else:
                            # buffer is large enough: flush it, start new one
                            soft_list.append(buffer)
                            buffer = part
                    if buffer:
                        soft_list.append(buffer)
                else:
                    soft_list.append(s)
            else:
                soft_list.append(s)
        if lang in ['zho', 'jpn', 'kor', 'tha', 'lao', 'mya', 'khm']:
            # Tokenize each piece, then rebuild chunks bounded by max_chars.
            result = []
            for s in soft_list:
                if s == TTS_SML['pause']:
                    result.append(s)
                else:
                    tokens = segment_ideogramms(s)
                    if isinstance(tokens, list):
                        result.extend([t for t in tokens if t.strip()])
                    else:
                        if tokens.strip():
                            result.append(tokens)
            return list(join_ideogramms(result))
        else:
            # Step extra: split any remaining over-length sentences on spaces
            sentences = []
            for s in soft_list:
                if s == TTS_SML['pause'] or len(s) <= max_chars:
                    sentences.append(s)
                else:
                    # Greedy word wrap: grow text_part until the next word would overflow.
                    words = s.split(' ')
                    text_part = words[0]
                    for w in words[1:]:
                        if len(text_part) + 1 + len(w) <= max_chars:
                            text_part += ' ' + w
                        else:
                            sentences.append(text_part.strip())
                            text_part = w
                    if text_part:
                        # \p{L}/\p{N} need the `regex` module, which this file imports as `re`.
                        # The alphanumeric check is made on the whole piece `s`,
                        # not on the trailing text_part.
                        cleaned = re.sub(r'[^\p{L}\p{N} ]+', '', s)
                        if not any(ch.isalnum() for ch in cleaned):
                            continue
                        sentences.append(text_part.strip())
            return sentences
    except Exception as e:
        error = f'get_sentences() error: {e}'
        print(error)
        return None
def get_ram():
    """Return the total system memory reported by psutil, in whole GiB."""
    bytes_per_gib = 1024 ** 3
    return psutil.virtual_memory().total // bytes_per_gib
def get_vram():
    """Best-effort detection of the GPU memory size, in whole GiB.

    Probes are tried in a fixed order and the first one that produces a value
    wins: NVML (first GPU), ``wmic`` on Windows, ``lspci`` on Linux, two
    Intel sysfs/debugfs files, and OpenGL on macOS. Every probe swallows its
    own errors. Returns 0 when nothing could be detected.
    """
    system_name = platform.system()

    def probe_nvidia():
        # NVIDIA (Cross-Platform: Windows, Linux, macOS)
        try:
            from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
            nvmlInit()
            first_gpu = nvmlDeviceGetHandleByIndex(0)
            memory = nvmlDeviceGetMemoryInfo(first_gpu)
            return int(memory.total // (1024 ** 3))
        except ImportError:
            return None
        except Exception:
            return None

    def probe_windows():
        # AMD (Windows)
        try:
            cmd = 'wmic path Win32_VideoController get AdapterRAM'
            completed = subprocess.run(cmd, capture_output=True, text=True, shell=True)
            sizes = [
                int(row.strip())
                for row in completed.stdout.splitlines()
                if row.strip().isdigit()
            ]
            if sizes:
                return int(sizes[0] // (1024 ** 3))
        except Exception:
            pass
        return None

    def probe_linux():
        # AMD (Linux)
        try:
            cmd = "lspci -v | grep -i 'VGA' -A 12 | grep -i 'preallocated' | awk '{print $2}'"
            completed = subprocess.run(cmd, capture_output=True, text=True, shell=True)
            if completed.stdout.strip().isdigit():
                return int(completed.stdout.strip()) // 1024
        except Exception:
            pass
        return None

    def probe_intel():
        # Intel (Linux Only)
        candidates = [
            "/sys/kernel/debug/dri/0/i915_vram_total",  # Intel dedicated GPUs
            "/sys/class/drm/card0/device/resource0"  # Some integrated GPUs
        ]
        for candidate in candidates:
            if not os.path.exists(candidate):
                continue
            try:
                with open(candidate, "r") as f:
                    return int(f.read().strip()) // (1024 ** 3)
            except Exception:
                pass
        return None

    def probe_macos():
        # macOS (OpenGL Alternative)
        try:
            from OpenGL.GL import glGetIntegerv
            from OpenGL.GLX import GLX_RENDERER_VIDEO_MEMORY_MB_MESA
            return int(glGetIntegerv(GLX_RENDERER_VIDEO_MEMORY_MB_MESA) // 1024)
        except ImportError:
            return None
        except Exception:
            return None

    probes = [probe_nvidia]
    if system_name == "Windows":
        probes.append(probe_windows)
    if system_name == "Linux":
        probes.append(probe_linux)
    probes.append(probe_intel)
    if system_name == "Darwin":
        probes.append(probe_macos)
    for probe in probes:
        detected = probe()
        if detected is not None:
            return detected
    # The message is built but not emitted.
    msg = 'Could not detect GPU VRAM Capacity!'
    return 0
def get_sanitized(str, replacement="_"):
    """Return ``str`` made safe for use as a file or directory name.

    ``&`` becomes ``And``; runs of whitespace and every forbidden character
    (``<>:"/\\|?*``, control characters, space, parentheses) are replaced by
    ``replacement``; leading and trailing ``_`` and ``replacement`` characters
    are then stripped.
    """
    str = str.replace('&', 'And')
    forbidden_chars = r'[<>:"/\\|?*\x00-\x1F ()]'
    sanitized = re.sub(r'\s+', replacement, str)
    sanitized = re.sub(forbidden_chars, replacement, sanitized)
    # Strip the actual replacement as well as "_". The edge clean-up used to
    # be hard-coded to "_" and was a no-op for any other replacement.
    sanitized = sanitized.strip("_" + replacement)
    return sanitized
def get_date_entities(text, stanza_nlp):
    """Return the DATE entities that ``stanza_nlp`` finds in ``text``.

    Args:
        text: text to analyse.
        stanza_nlp: callable returning a document whose ``ents`` items expose
            ``type``, ``start_char``, ``end_char`` and ``text``.

    Returns:
        A list of ``(start_char, end_char, text)`` tuples, in document order,
        or False when the analysis raised (the error is printed).
    """
    try:
        doc = stanza_nlp(text)
        return [
            (ent.start_char, ent.end_char, ent.text)
            for ent in doc.ents
            if ent.type == 'DATE'
        ]
    except Exception as e:
        # Fixed: the message used to name a non-existent detect_date_entities().
        error = f'get_date_entities() error: {e}'
        print(error)
        return False
def get_num2words_compat(lang_iso1):
    """Return True when ``num2words`` can convert a number for ``lang_iso1``.

    ``zh`` is mapped to ``zh_CN`` before probing. Any exception raised by the
    probe, including ``NotImplementedError``, yields False.
    """
    try:
        num2words(1, lang=lang_iso1.replace('zh', 'zh_CN'))
    except Exception:
        return False
    return True
def set_formatted_number(text: str, lang, lang_iso1: str, is_num2words_compat: bool, max_single_value: int = 999_999_999_999_999):
    """Replace the numbers found in ``text`` by their spoken form.

    A number is up to 12 digits, optionally followed by comma-separated digit
    groups and by a decimal part of up to 12 digits. Numbers that cannot be
    parsed, are not finite, or exceed ``max_single_value`` in magnitude are
    left untouched. With ``is_num2words_compat`` the conversion uses
    ``num2words`` (``zh`` mapped to ``zh_CN``); otherwise each character of
    the number is looked up in the language's math phoneme map.
    """
    # match up to 12 digits, optional ",..." groups, optional decimal of up to 12 digits
    number_re = re.compile(r'\b\d{1,12}(?:,\d{1,12})*(?:\.\d{1,12})?\b')

    def clean_num(match):
        token = unicodedata.normalize('NFKC', match.group())
        # pass through infinities/nans
        if token.lower() in ('inf', 'infinity', 'nan'):
            return token
        # strip commas for numeric parsing
        digits = token.replace(',', '')
        try:
            value = float(digits) if '.' in digits else int(digits)
        except (ValueError, OverflowError):
            return token
        # skip out of range or non finite
        if not math.isfinite(value) or abs(value) > max_single_value:
            return token
        if not is_num2words_compat:
            phoneme_map = language_math_phonemes.get(lang, language_math_phonemes.get(default_language_code, language_math_phonemes['eng']))
            return ' '.join(phoneme_map.get(ch, ch) for ch in str(value))
        return num2words(value, lang=lang_iso1.replace('zh', 'zh_CN'))

    return number_re.sub(clean_num, text)
def year2words(year_str, lang, lang_iso1, is_num2words_compat):
    """Spell a year, read as two two-digit halves when possible.

    A 4-digit year whose last two digits are >= 10 is rendered as
    "<first two> <last two>"; any other value is rendered as a single number.
    With ``is_num2words_compat`` the words come from ``num2words``; otherwise
    each digit is looked up in the language's math phoneme map, with unknown
    characters kept as-is.

    Raises:
        Whatever the conversion raised, after printing the error.
    """
    try:
        year = int(year_str)
        first_half = year_str[:2]
        second_half = year_str[2:]
        first_two = int(first_half)
        last_two = int(second_half)
        lang_iso1 = lang_iso1 if lang in language_math_phonemes.keys() else default_language_code
        lang_iso1 = lang_iso1.replace('zh', 'zh_CN')
        # Fall back to the default language map instead of raising KeyError,
        # consistent with the unknown-language handling just above.
        phoneme_map = language_math_phonemes.get(lang, language_math_phonemes.get(default_language_code, {}))

        def spell(digits):
            # Digit-by-digit rendering for the non-num2words path.
            return ' '.join(phoneme_map.get(ch, ch) for ch in digits)

        if not year_str.isdigit() or len(year_str) != 4 or last_two < 10:
            if is_num2words_compat:
                return num2words(year, lang=lang_iso1)
            return spell(year_str)
        if is_num2words_compat:
            return f"{num2words(first_two, lang=lang_iso1)} {num2words(last_two, lang=lang_iso1)}"
        # Fixed: this path used to iterate over the ints first_two/last_two,
        # which raised TypeError; the digit strings are iterated instead.
        return spell(first_half) + ' ' + spell(second_half)
    except Exception as e:
        error = f'year2words() error: {e}'
        print(error)
        raise
def clock2words(time, lang, is_num2words_compat):
    """Render a clock time such as "7:45" or "7.45.10" as a spoken phrase.

    ``time`` is split on ``:`` / ``.`` into hour, optional minute and
    optional second. When ``language_clock`` has no entry for the lower-cased
    ``lang``, the parts are spelled with ``num2words`` and joined by spaces.
    Otherwise the language's templates are used (o'clock, quarter past, half
    past, quarter to, past, to), with a seconds phrase appended when the
    second is greater than zero. ``is_num2words_compat`` is not used here.
    """
    fields = [int(piece) for piece in str(time).replace('.', ':').split(':')]
    hour = fields[0]
    minute = fields[1] if len(fields) > 1 else 0
    second = fields[2] if len(fields) > 2 else None
    lang = lang.lower()

    def spoken(value):
        return num2words(value, lang=lang)

    has_seconds = second is not None and second > 0
    lc = language_clock.get(lang)
    if not lc:
        # No clock templates for this language: plain number words.
        hour_word = spoken(hour)
        minute_word = spoken(minute)
        words = [hour_word]
        if minute != 0:
            words.append(minute_word)
        if has_seconds:
            words.append(spoken(second))
        return " ".join(words)

    next_hour = (hour + 1) % 24
    special_hours = lc.get("special_hours", {})
    if minute == 0 and (second is None or second == 0):
        if hour in special_hours:
            phrase = special_hours[hour]
        else:
            phrase = lc["oclock"].format(hour=spoken(hour))
    elif minute == 15:
        phrase = lc["quarter_past"].format(hour=spoken(hour))
    elif minute == 30:
        # The "deu" template is phrased relative to the next hour.
        if lang == "deu":
            phrase = lc["half_past"].format(next_hour=spoken(next_hour))
        else:
            phrase = lc["half_past"].format(hour=spoken(hour))
    elif minute == 45:
        phrase = lc["quarter_to"].format(next_hour=spoken(next_hour))
    elif minute < 30:
        hour_word = spoken(hour)
        if minute == 0:
            # Full hour with a non-zero seconds part.
            phrase = lc["oclock"].format(hour=hour_word)
        else:
            phrase = lc["past"].format(hour=hour_word, minute=spoken(minute))
    else:
        next_hour_word = spoken(next_hour)
        phrase = lc["to"].format(next_hour=next_hour_word, minute=spoken(60 - minute))
    if has_seconds:
        second_phrase = lc["second"].format(second=spoken(second))
        phrase = lc["full"].format(phrase=phrase, second_phrase=second_phrase)
    return phrase
def math2words(text, lang, lang_iso1, tts_engine, is_num2words_compat):
def replace_ambiguous(match):
# handles "num SYMBOL num" and "SYMBOL num"
if match.group(2) and match.group(2) in ambiguous_replacements:
return f"{match.group(1)} {ambiguous_replacements[match.group(2)]} {match.group(3)}"
if match.group(3) and match.group(3) in ambiguous_replacements:
return f"{ambiguous_replacements[match.group(3)]} {match.group(4)}"
return match.group(0)
phonemes_list = language_math_phonemes.get(lang, language_math_phonemes[default_language_code])
# Regex to match all times (hh:mm, hh:mm:ss, h:m, h:m:s)
time_pattern = r'(\d{1,2})[:.](\d{1,2})(?:[:.](\d{1,2}))?'
text = re.sub(time_pattern, lambda m: clock2words(m.group(0), lang, is_num2words_compat), text)
text = re.sub(r'(\d)\)', r'\1 : ', text)
# Symbol phonemes
ambiguous_symbols = {"-", "/", "*", "x"}
replacements = {k: v for k, v in phonemes_list.items() if not k.isdigit() and k not in [',', '.']}
normal_replacements = {k: v for k, v in replacements.items() if k not in ambiguous_symbols}
ambiguous_replacements = {k: v for k, v in replacements.items() if k in ambiguous_symbols}
# Replace unambiguous symbols everywhere
if normal_replacements:
sym_pat = r'(' + '|'.join(map(re.escape, normal_replacements.keys())) + r')'
text = re.sub(sym_pat, lambda m: f" {normal_replacements[m.group(1)]} ", text)
# Replace ambiguous symbols only in valid equation contexts
if ambiguous_replacements:
ambiguous_pattern = (
r'(? str:
token = match.group(1)
for k, expansion in mapping.items():
if token.lower() == k.lower():
return expansion
return token # fallback
mapping = abbreviations_mapping[lang]
# Sort keys by descending length so longer ones match first
keys = sorted(mapping.keys(), key=len, reverse=True)
# Build a regex that only matches whole “words” (tokens) exactly
pattern = re.compile(
r'(? resume_sentence or (sentence_number == 0 and resume_sentence == 0):
if sentence_number <= resume_sentence and sentence_number > 0:
msg = f'**Recovering missing file sentence {sentence_number}'
print(msg)
success = tts_manager.convert_sentence2audio(sentence_number, sentence)
if success:
total_progress = (t.n + 1) / total_iterations
is_sentence = sentence.strip() not in TTS_SML.values()
if progress_bar is not None:
progress_bar(total_progress)
percentage = total_progress * 100
t.set_description(f'Converting {percentage:.2f}%')
msg = f"\nSentence: {sentence}" if is_sentence else f"SML: {sentence}"
print(msg)
else:
return False
if sentence.strip() not in TTS_SML.values():
sentence_number += 1
t.update(1) # advance for every iteration, including SML
end = sentence_number - 1 if sentence_number > 1 else sentence_number
msg = f"End of Block {chapter_num}"
print(msg)
if chapter_num in missing_chapters or sentence_number > resume_sentence:
if chapter_num <= resume_chapter:
msg = f'**Recovering missing file block {chapter_num}'
print(msg)
if combine_audio_sentences(chapter_audio_file, start, end, session):
msg = f'Combining block {chapter_num} to audio, sentence {start} to {end}'
print(msg)
else:
msg = 'combine_audio_sentences() failed!'
print(msg)
return False
return True
except Exception as e:
DependencyError(e)
return False
def assemble_chunks(txt_file, out_file):
    """Concatenate the audio files listed in an ffmpeg concat list.

    Runs ``ffmpeg -f concat`` on ``txt_file`` and writes the result to
    ``out_file``, echoing ffmpeg's combined stdout/stderr to the console.

    Args:
        txt_file: Path of the concat list (``file '...'`` lines).
        out_file: Path of the audio file to create (overwritten if present).

    Returns:
        True when ffmpeg exits with status 0, False on any failure. The
        function never raises.
    """
    try:
        ffmpeg_bin = shutil.which('ffmpeg')
        if ffmpeg_bin is None:
            # Without this check Popen fails with an opaque TypeError.
            print(f"assemble_chunks() Error: ffmpeg executable not found, cannot create {out_file}")
            return False
        ffmpeg_cmd = [
            ffmpeg_bin, '-hide_banner', '-nostats', '-y',
            '-safe', '0', '-f', 'concat', '-i', txt_file,
            '-c:a', default_audio_proc_format, '-map_metadata', '-1', '-threads', '1', out_file
        ]
        # The context manager closes the stdout pipe and reaps the process
        # even if printing a line fails.
        with subprocess.Popen(
            ffmpeg_cmd,
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding='utf-8',
            errors='ignore'
        ) as process:
            for line in process.stdout:
                print(line, end='')  # Print each line of stdout
            process.wait()
        if process.returncode == 0:
            return True
        error = process.returncode
        print(error, ffmpeg_cmd)
        return False
    except Exception as e:
        error = f"assemble_chunks() Error: Failed to process {txt_file} → {out_file}: {e}"
        print(error)
        return False
def combine_audio_sentences(chapter_audio_file, start, end, session):
    """Merge the sentence audio files numbered ``start``..``end`` into one block file.

    Sentence files are read from ``session['chapters_dir_sentences']`` and are
    expected to be named ``<number>.<default_audio_proc_format>``. They are
    concatenated in batches (in worker processes) and the batches are then
    merged into ``session['chapters_dir']/chapter_audio_file``.

    Args:
        chapter_audio_file: File name of the block audio file to create.
        start: First sentence number to include (inclusive).
        end: Last sentence number to include (inclusive).
        session: Session mapping providing ``chapters_dir`` and
            ``chapters_dir_sentences``.

    Returns:
        True on success, False otherwise.
    """
    def concat_entry(path):
        # Concat lists quote paths with single quotes, so an apostrophe in
        # the path has to be written as '\'' to stay inside the quoting.
        normalized = path.replace(os.sep, '/').replace("'", "'\\''")
        return f"file '{normalized}'\n"

    try:
        chapter_audio_file = os.path.join(session['chapters_dir'], chapter_audio_file)
        chapters_dir_sentences = session['chapters_dir_sentences']
        batch_size = 1024
        suffix = f'.{default_audio_proc_format}'
        numbered = []
        for name in os.listdir(chapters_dir_sentences):
            if not name.endswith(suffix):
                continue
            try:
                number = int(os.path.splitext(name)[0])
            except ValueError:
                # A stray, non-numbered file must not abort the whole merge.
                continue
            numbered.append((number, name))
        numbered.sort(key=lambda item: item[0])
        selected_files = [
            os.path.join(chapters_dir_sentences, name)
            for number, name in numbered
            if start <= number <= end
        ]
        if not selected_files:
            print('No audio files found in the specified range.')
            return False
        with tempfile.TemporaryDirectory() as tmpdir:
            chunk_list = []
            for i in range(0, len(selected_files), batch_size):
                batch = selected_files[i:i + batch_size]
                txt = os.path.join(tmpdir, f'chunk_{i:04d}.txt')
                out = os.path.join(tmpdir, f'chunk_{i:04d}.{default_audio_proc_format}')
                # UTF-8 so non-ASCII paths survive regardless of the locale.
                with open(txt, 'w', encoding='utf-8') as f:
                    f.writelines(concat_entry(file) for file in batch)
                chunk_list.append((txt, out))
            try:
                # Never start more workers than there are chunks to merge.
                with Pool(min(cpu_count(), len(chunk_list))) as pool:
                    results = pool.starmap(assemble_chunks, chunk_list)
            except Exception as e:
                error = f"combine_audio_sentences() multiprocessing error: {e}"
                print(error)
                return False
            if not all(results):
                error = "combine_audio_sentences() One or more chunks failed."
                print(error)
                return False
            # Final merge
            final_list = os.path.join(tmpdir, 'sentences_final.txt')
            with open(final_list, 'w', encoding='utf-8') as f:
                f.writelines(concat_entry(chunk_path) for _, chunk_path in chunk_list)
            if assemble_chunks(final_list, chapter_audio_file):
                msg = f'********* Combined block audio file saved in {chapter_audio_file}'
                print(msg)
                return True
            error = "combine_audio_sentences() Final merge failed."
            print(error)
            return False
    except Exception as e:
        DependencyError(e)
        return False
def combine_audio_chapters(session):
    """Merge all block audio files into the final audiobook file(s).

    The block files found in ``session['chapters_dir']`` are concatenated,
    chapter metadata is generated, and the result is exported to
    ``session['output_format']`` in ``session['audiobooks_dir']``. The book is
    split into several ``_partN`` files only when its total duration exceeds
    twice ``output_split_hours``; otherwise a single file is produced.

    Args:
        session: Session mapping (reads ``chapters_dir``, ``chapters``,
            ``process_dir``, ``audiobooks_dir``, ``metadata``, ``final_name``,
            ``output_format``, ``cover`` and ``cancellation_requested``).

    Returns:
        A list with the exported file paths, ``None`` when nothing could be
        exported, or ``False`` when an unexpected exception occurred.
    """
    def concat_entry(path):
        # Single quotes inside a quoted concat path are written as '\''.
        normalized = path.replace(os.sep, '/').replace("'", "'\\''")
        return f"file '{normalized}'\n"

    def escape_metadata_value(value):
        # FFMETADATA requires '=', ';', '#', '\' and newlines to be
        # backslash-escaped inside values.
        return re.sub(r'([=;#\\\n])', r'\\\1', str(value))

    def get_audio_duration(filepath):
        """Return the duration of ``filepath`` in seconds (0 when unknown)."""
        try:
            ffprobe_cmd = [
                shutil.which('ffprobe'),
                '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'json',
                filepath
            ]
            result = subprocess.run(ffprobe_cmd, capture_output=True, text=True)
            try:
                return float(json.loads(result.stdout)['format']['duration'])
            except Exception:
                return 0
        except Exception as e:
            error = f"get_audio_duration() Error: Failed to process {filepath}: {e}"
            print(error)
            return 0

    def generate_ffmpeg_metadata(part_chapters, session, output_metadata_path, default_audio_proc_format):
        """Write an FFMETADATA file (global tags + chapters); return its path or False."""
        try:
            out_fmt = session['output_format']
            is_mp4_like = out_fmt in ['mp4', 'm4a', 'm4b', 'mov']
            is_vorbis = out_fmt in ['ogg', 'webm']
            is_mp3 = out_fmt == 'mp3'

            def tag(key):
                # Tag names are written upper-case for ogg/webm outputs.
                return key.upper() if is_vorbis else key

            ffmpeg_metadata = ';FFMETADATA1\n'
            if session['metadata'].get('title'):
                ffmpeg_metadata += f"{tag('title')}={escape_metadata_value(session['metadata']['title'])}\n"
            if session['metadata'].get('creator'):
                ffmpeg_metadata += f"{tag('artist')}={escape_metadata_value(session['metadata']['creator'])}\n"
            if session['metadata'].get('language'):
                ffmpeg_metadata += f"{tag('language')}={escape_metadata_value(session['metadata']['language'])}\n"
            if session['metadata'].get('description'):
                ffmpeg_metadata += f"{tag('description')}={escape_metadata_value(session['metadata']['description'])}\n"
            if session['metadata'].get('publisher') and (is_mp4_like or is_mp3):
                ffmpeg_metadata += f"{tag('publisher')}={escape_metadata_value(session['metadata']['publisher'])}\n"
            if session['metadata'].get('published'):
                try:
                    if '.' in session['metadata']['published']:
                        year = datetime.strptime(session['metadata']['published'], '%Y-%m-%dT%H:%M:%S.%f%z').year
                    else:
                        year = datetime.strptime(session['metadata']['published'], '%Y-%m-%dT%H:%M:%S%z').year
                except Exception:
                    # Unparseable publication date: fall back to the current year.
                    year = datetime.now().year
            else:
                year = datetime.now().year
            if is_vorbis:
                ffmpeg_metadata += f"{tag('date')}={year}\n"
            else:
                ffmpeg_metadata += f"{tag('year')}={year}\n"
            if session['metadata'].get('identifiers') and isinstance(session['metadata']['identifiers'], dict):
                if is_mp3 or is_mp4_like:
                    isbn = session['metadata']['identifiers'].get('isbn')
                    if isbn:
                        ffmpeg_metadata += f"{tag('isbn')}={escape_metadata_value(isbn)}\n"
                    asin = session['metadata']['identifiers'].get('mobi-asin')
                    if asin:
                        ffmpeg_metadata += f"{tag('asin')}={escape_metadata_value(asin)}\n"
            start_time = 0
            for filename, chapter_title in part_chapters:
                filepath = os.path.join(session['chapters_dir'], filename)
                duration_ms = len(AudioSegment.from_file(filepath, format=default_audio_proc_format))
                clean_title = re.sub(r'(^#)|[=\\]|(-$)', lambda m: '\\' + (m.group(1) or m.group(0)), chapter_title.replace(TTS_SML['pause'], ''))
                ffmpeg_metadata += '[CHAPTER]\nTIMEBASE=1/1000\n'
                ffmpeg_metadata += f'START={start_time}\nEND={start_time + duration_ms}\n'
                ffmpeg_metadata += f"{tag('title')}={clean_title}\n"
                start_time += duration_ms
            with open(output_metadata_path, 'w', encoding='utf-8') as f:
                f.write(ffmpeg_metadata)
            return output_metadata_path
        except Exception as e:
            error = f"generate_ffmpeg_metadata() Error: Failed to write {output_metadata_path}: {e}"
            print(error)
            return False

    def export_audio(ffmpeg_combined_audio, ffmpeg_metadata_file, ffmpeg_final_file):
        """Encode the combined audio to the output format and embed the cover; return True on success."""
        try:
            if session['cancellation_requested']:
                print('Cancel requested')
                return False
            cover_path = None
            ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', ffmpeg_combined_audio]
            if session['output_format'] == 'wav':
                ffmpeg_cmd += ['-map', '0:a', '-ar', '44100', '-sample_fmt', 's16']
            elif session['output_format'] == 'aac':
                ffmpeg_cmd += ['-c:a', 'aac', '-b:a', '192k', '-ar', '44100']
            elif session['output_format'] == 'flac':
                ffmpeg_cmd += ['-c:a', 'flac', '-compression_level', '5', '-ar', '44100', '-sample_fmt', 's16']
            else:
                # Every other format gets the metadata file as a second input.
                ffmpeg_cmd += ['-f', 'ffmetadata', '-i', ffmpeg_metadata_file, '-map', '0:a']
                if session['output_format'] in ['m4a', 'm4b', 'mp4', 'mov']:
                    ffmpeg_cmd += ['-c:a', 'aac', '-b:a', '192k', '-ar', '44100', '-movflags', '+faststart+use_metadata_tags']
                elif session['output_format'] == 'mp3':
                    ffmpeg_cmd += ['-c:a', 'libmp3lame', '-b:a', '192k', '-ar', '44100']
                elif session['output_format'] == 'webm':
                    ffmpeg_cmd += ['-c:a', 'libopus', '-b:a', '192k', '-ar', '48000']
                elif session['output_format'] == 'ogg':
                    ffmpeg_cmd += ['-c:a', 'libopus', '-compression_level', '0', '-b:a', '192k', '-ar', '48000']
                ffmpeg_cmd += ['-map_metadata', '1']
            ffmpeg_cmd += ['-af', 'loudnorm=I=-16:LRA=11:TP=-1.5,afftdn=nf=-70', '-strict', 'experimental', '-threads', '1', '-y', ffmpeg_final_file]
            process = subprocess.Popen(
                ffmpeg_cmd,
                env={},
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                encoding='utf-8',
                errors='ignore'
            )
            for line in process.stdout:
                print(line, end='')
            process.wait()
            if process.returncode == 0:
                if session['output_format'] in ['mp3', 'm4a', 'm4b', 'mp4']:
                    if session['cover'] is not None:
                        cover_path = session['cover']
                        msg = f'Adding cover {cover_path} into the final audiobook file...'
                        print(msg)
                        if session['output_format'] == 'mp3':
                            from mutagen.mp3 import MP3
                            from mutagen.id3 import ID3, APIC, error
                            audio = MP3(ffmpeg_final_file, ID3=ID3)
                            try:
                                audio.add_tags()
                            except error:
                                # Tags already exist on the file.
                                pass
                            with open(cover_path, 'rb') as img:
                                audio.tags.add(
                                    APIC(
                                        encoding=3,
                                        mime='image/jpeg',
                                        type=3,
                                        desc='Cover',
                                        data=img.read()
                                    )
                                )
                        elif session['output_format'] in ['mp4', 'm4a', 'm4b']:
                            from mutagen.mp4 import MP4, MP4Cover
                            audio = MP4(ffmpeg_final_file)
                            with open(cover_path, 'rb') as f:
                                cover_data = f.read()
                            audio["covr"] = [MP4Cover(cover_data, imageformat=MP4Cover.FORMAT_JPEG)]
                        if audio:
                            audio.save()
                return True
            else:
                error = process.returncode
                print(error, ffmpeg_cmd)
                return False
        except Exception as e:
            DependencyError(e)
            return False

    try:
        chapter_files = [f for f in os.listdir(session['chapters_dir']) if f.endswith(f'.{default_audio_proc_format}')]
        chapter_files = sorted(chapter_files, key=lambda x: int(re.search(r'\d+', x).group()))
        chapter_titles = [c[0] for c in session['chapters']]
        if len(chapter_files) == 0:
            print('No block files exists!')
            return None
        # Calculate total duration
        durations = []
        for file in chapter_files:
            filepath = os.path.join(session['chapters_dir'], file)
            durations.append(get_audio_duration(filepath))
        total_duration = sum(durations)
        max_part_duration = output_split_hours * 3600
        needs_split = total_duration > (output_split_hours * 2) * 3600
        # --- Split into parts by duration ---
        part_files = []
        part_chapter_indices = []
        if needs_split:
            cur_part = []
            cur_indices = []
            cur_duration = 0
            for idx, (file, dur) in enumerate(zip(chapter_files, durations)):
                if cur_part and (cur_duration + dur > max_part_duration):
                    part_files.append(cur_part)
                    part_chapter_indices.append(cur_indices)
                    cur_part = []
                    cur_indices = []
                    cur_duration = 0
                cur_part.append(file)
                cur_indices.append(idx)
                cur_duration += dur
            if cur_part:
                part_files.append(cur_part)
                part_chapter_indices.append(cur_indices)
        else:
            # Output names carry a _partN suffix only when needs_split is set,
            # so an unsplit book must be exported as exactly one part;
            # otherwise later parts would overwrite the first one.
            part_files.append(list(chapter_files))
            part_chapter_indices.append(list(range(len(chapter_files))))
        exported_files = []
        for part_idx, (part_file_list, indices) in enumerate(zip(part_files, part_chapter_indices)):
            with tempfile.TemporaryDirectory() as tmpdir:
                # --- Batch merging logic ---
                batch_size = 1024
                chunk_list = []
                for i in range(0, len(part_file_list), batch_size):
                    batch = part_file_list[i:i + batch_size]
                    txt = os.path.join(tmpdir, f'chunk_{i:04d}.txt')
                    out = os.path.join(tmpdir, f'chunk_{i:04d}.{default_audio_proc_format}')
                    with open(txt, 'w', encoding='utf-8') as f:
                        for file in batch:
                            path = os.path.join(session['chapters_dir'], file).replace("\\", "/")
                            f.write(concat_entry(path))
                    chunk_list.append((txt, out))
                # Never start more workers than there are chunks to merge.
                with Pool(min(cpu_count(), len(chunk_list))) as pool:
                    results = pool.starmap(assemble_chunks, chunk_list)
                if not all(results):
                    print(f"assemble_segments() One or more chunks failed for part {part_idx+1}.")
                    return None
                # Final merge for this part
                combined_chapters_file = os.path.join(
                    session['process_dir'],
                    f"{get_sanitized(session['metadata']['title'])}_part{part_idx+1}.{default_audio_proc_format}" if needs_split else f"{get_sanitized(session['metadata']['title'])}.{default_audio_proc_format}"
                )
                final_list = os.path.join(tmpdir, f'part_{part_idx+1:02d}_final.txt')
                with open(final_list, 'w', encoding='utf-8') as f:
                    for _, chunk_path in chunk_list:
                        f.write(concat_entry(chunk_path))
                if not assemble_chunks(final_list, combined_chapters_file):
                    print(f"assemble_segments() Final merge failed for part {part_idx+1}.")
                    return None
                # --- Generate metadata for this part ---
                metadata_file = os.path.join(session['process_dir'], f'metadata_part{part_idx+1}.txt')
                part_chapters = [(chapter_files[i], chapter_titles[i]) for i in indices]
                generate_ffmpeg_metadata(part_chapters, session, metadata_file, default_audio_proc_format)
                # --- Export audio for this part ---
                final_file = os.path.join(
                    session['audiobooks_dir'],
                    f"{session['final_name'].rsplit('.', 1)[0]}_part{part_idx+1}.{session['output_format']}" if needs_split else session['final_name']
                )
                if export_audio(combined_chapters_file, metadata_file, final_file):
                    exported_files.append(final_file)
        return exported_files if exported_files else None
    except Exception as e:
        DependencyError(e)
        return False
def replace_roman_numbers(text, lang):
    """Convert roman numerals used as chapter numbers into arabic digits.

    Two forms are rewritten:

    * ``<chapter word> <ROMAN>`` becomes ``<Chapter word> <int>; `` where the
      chapter words come from ``chapter_word_mapping[lang]``;
    * a roman numeral followed by one or more periods at the very start of
      ``text`` becomes ``<int>. ``.

    Args:
        text: Text to process.
        lang: Language code used to look up the chapter words.

    Returns:
        The text with the matching roman numerals replaced.
    """
    roman_values = {
        'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000,
        'IV': 4, 'IX': 9, 'XL': 40, 'XC': 90, 'CD': 400, 'CM': 900
    }
    roman_core = r'((?:M{0,3})(?:CM|CD|D?C{0,3})?(?:XC|XL|L?X{0,3})?(?:IX|IV|V?I{0,3}))'

    def roman2int(s):
        # Consume subtractive pairs (IV, IX, ...) first, then single symbols.
        i = 0
        num = 0
        while i < len(s):
            pair = s[i:i + 2]
            if len(pair) == 2 and pair in roman_values:
                num += roman_values[pair]
                i += 2
            else:
                num += roman_values.get(s[i], 0)
                i += 1
        return num if num > 0 else s

    def replace_chapter_match(match):
        chapter_word = match.group(1)
        roman_numeral = match.group(2)
        if not roman_numeral:
            # The numeral group can match the empty string (e.g. "Chapter
            # Introduction"); leave such text untouched.
            return match.group(0)
        integer_value = roman2int(roman_numeral.upper())
        if isinstance(integer_value, int):
            return f'{chapter_word.capitalize()} {integer_value}; '
        return match.group(0)

    def replace_numeral_with_period(match):
        roman_numeral = match.group(1)
        integer_value = roman2int(roman_numeral.upper())
        if isinstance(integer_value, int):
            return f'{integer_value}. '
        return match.group(0)

    # Get language-specific chapter words
    chapter_words = [word for word in chapter_word_mapping.get(lang, []) if word]
    if chapter_words:
        # Without any chapter word the alternation would be empty and the
        # pattern would rewrite every standalone roman-looking word
        # ("Hello I am" -> "Hello 1;  am"), so it is only applied when the
        # language defines chapter words.
        word_pattern = "|".join(re.escape(word) for word in chapter_words)
        roman_chapter_pattern = re.compile(
            rf'\b({word_pattern})\s+'
            r'(?=[IVXLCDM])'
            + roman_core + r'\b',
            re.IGNORECASE
        )
        text = roman_chapter_pattern.sub(replace_chapter_match, text)
    # Roman numeral with trailing period
    roman_numerals_with_period = re.compile(
        r'^(?=[IVXLCDM])' + roman_core + r'\.+',
        re.IGNORECASE
    )
    text = roman_numerals_with_period.sub(replace_numeral_with_period, text)
    return text
def delete_unused_tmp_dirs(web_dir, days, session):
    """Remove expired session directories belonging to ``session``.

    Looks inside ``tmp_dir``, ``web_dir`` and the ``__sessions`` folders of
    ``models_dir`` and ``voices_dir`` for the directories named after the
    session id (``proc-``, ``web-``, ``voice-``, ``model-`` prefixes) and
    deletes those whose modification AND creation/change times are both older
    than ``days`` days.

    Args:
        web_dir: Extra root directory to scan.
        days: Age threshold in days.
        session: Session mapping; only ``session['id']`` is read.
    """
    # NOTE(review): only directories of the *given* session are considered,
    # which looks surprising for a function named "unused" - confirm intent.
    session_id = session['id']
    owned_names = {f"{prefix}-{session_id}" for prefix in ('proc', 'web', 'voice', 'model')}
    roots = [
        tmp_dir,
        web_dir,
        os.path.join(models_dir, '__sessions'),
        os.path.join(voices_dir, '__sessions')
    ]
    cutoff = time.time() - (days * 24 * 60 * 60)  # days expressed in seconds
    for root in roots:
        if not (os.path.exists(root) and os.path.isdir(root)):
            continue
        for entry in os.listdir(root):
            if entry not in owned_names:
                continue
            candidate = os.path.join(root, entry)
            if not os.path.isdir(candidate):
                continue
            try:
                modified = os.path.getmtime(candidate)
                changed = os.path.getctime(candidate)
                if modified < cutoff and changed < cutoff:
                    shutil.rmtree(candidate, ignore_errors=True)
                    print(f"Deleted expired session: {candidate}")
            except Exception as e:
                print(f"Error deleting {candidate}: {e}")
def compare_file_metadata(f1, f2):
    """Return True when both files have the same size and modification time.

    The modification times are only read when the sizes already match.
    """
    sizes_match = os.path.getsize(f1) == os.path.getsize(f2)
    return sizes_match and os.path.getmtime(f1) == os.path.getmtime(f2)
def get_compatible_tts_engines(language):
    """List the engines from ``models`` whose ``language_tts`` entry contains ``language``.

    Engines without an entry in ``language_tts`` are treated as supporting
    no language. The order follows the key order of ``models``.
    """
    engines = []
    for engine in models.keys():
        supported = language_tts.get(engine, {})
        if language in supported:
            engines.append(engine)
    return engines
def convert_ebook_batch(args, ctx):
    """Convert every supported eBook listed in ``args['ebook_list']``.

    Each file whose name ends with one of ``ebook_formats`` is converted with
    ``convert_ebook`` and removed from ``args['ebook_list']`` once done. The
    process exits with status 1 as soon as one conversion fails or when
    ``args['ebook_list']`` is not a list.

    Args:
        args: Conversion arguments; ``args['ebook']`` is overwritten for each
            file and ``args['session']`` is passed to ``reset_ebook_session``.
        ctx: Session context, stored in the module-level ``context``.

    Returns:
        ``(progress_status, passed)`` of the last converted file, or a
        failure tuple when no file in the list had a supported format.
    """
    global context
    context = ctx
    if not isinstance(args['ebook_list'], list):
        print(f'the ebooks source is not a list!')
        sys.exit(1)
    # Defaults used when the list is empty or holds no supported file;
    # previously this case ended in an UnboundLocalError.
    progress_status = 'No eBook file with a supported format was found in the list.'
    passed = False
    for file in args['ebook_list'][:]:  # iterate a shallow copy, the list is mutated below
        if any(file.endswith(ext) for ext in ebook_formats):
            args['ebook'] = file
            print(f'Processing eBook file: {os.path.basename(file)}')
            progress_status, passed = convert_ebook(args)
            if passed is False:
                print(f'Conversion failed: {progress_status}')
                sys.exit(1)
            args['ebook_list'].remove(file)
    reset_ebook_session(args['session'])
    return progress_status, passed
def convert_ebook(args, ctx=None):
    """Convert one eBook into an audiobook.

    Validates the input file and language, fills the session from ``args``,
    prepares voice / custom model files (CLI mode only), checks the external
    programs, converts the book to EPUB, synthesizes every block and finally
    combines and exports the audio.

    Args:
        args: Mapping with the conversion options (``ebook``, ``language``,
            ``session``, ``device``, ``tts_engine``, ``output_format``, ...).
            ``args['language']`` and ``args['language_iso1']`` are normalized
            in place.
        ctx: Optional session context; when given it replaces the
            module-level ``context``.

    Returns:
        ``(message, True)`` on success, ``(error, False)`` on failure. When an
        unexpected exception occurs, ``error`` is the exception object.
    """
    try:
        global is_gui_process, context
        error = None
        session_id = None
        session = None
        info_session = None
        if args['language'] is not None:
            if not os.path.splitext(args['ebook'])[1]:
                error = f"{args['ebook']} needs a format extension."
                print(error)
                return error, False
            if not os.path.exists(args['ebook']):
                error = 'File does not exist or Directory empty.'
                print(error)
                return error, False
            try:
                if len(args['language']) == 2:
                    lang_array = languages.get(part1=args['language'])
                    if lang_array:
                        args['language'] = lang_array.part3
                        args['language_iso1'] = lang_array.part1
                elif len(args['language']) == 3:
                    lang_array = languages.get(part3=args['language'])
                    if lang_array:
                        args['language'] = lang_array.part3
                        args['language_iso1'] = lang_array.part1
                else:
                    args['language_iso1'] = None
            except Exception:
                # Best effort: an unknown code is rejected by the check below.
                pass
            if args['language'] not in language_mapping.keys():
                error = 'The language you provided is not (yet) supported'
                print(error)
                return error, False
            if ctx is not None:
                context = ctx
            is_gui_process = args['is_gui_process']
            session_id = args['session'] if args['session'] is not None else str(uuid.uuid4())
            session = context.get_session(session_id)
            session['script_mode'] = args['script_mode'] if args['script_mode'] is not None else NATIVE
            session['ebook'] = args['ebook']
            session['ebook_list'] = args['ebook_list']
            session['device'] = args['device']
            session['language'] = args['language']
            session['language_iso1'] = args['language_iso1']
            session['tts_engine'] = args['tts_engine'] if args['tts_engine'] is not None else get_compatible_tts_engines(args['language'])[0]
            session['custom_model'] = args['custom_model'] if not is_gui_process or args['custom_model'] is None else os.path.join(session['custom_model_dir'], args['custom_model'])
            session['fine_tuned'] = args['fine_tuned']
            session['output_format'] = args['output_format']
            session['temperature'] = args['temperature']
            session['length_penalty'] = args['length_penalty']
            session['num_beams'] = args['num_beams']
            session['repetition_penalty'] = args['repetition_penalty']
            session['top_k'] = args['top_k']
            session['top_p'] = args['top_p']
            session['speed'] = args['speed']
            session['enable_text_splitting'] = args['enable_text_splitting']
            session['text_temp'] = args['text_temp']
            session['waveform_temp'] = args['waveform_temp']
            session['audiobooks_dir'] = args['audiobooks_dir']
            session['voice'] = args['voice']
            session['voice_model'] = args['voice_model']
            info_session = f"\n*********** Session: {session_id} **************\nStore it in case of interruption, crash, reuse of custom model or custom voice,\nyou can resume the conversion with --session option"
            if not is_gui_process:
                session['voice_dir'] = os.path.join(voices_dir, '__sessions', f"voice-{session['id']}", session['language'])
                os.makedirs(session['voice_dir'], exist_ok=True)
                # As now uploaded voice files are in their respective language folder so check if no wav and bark folder are on the voice_dir root from previous versions
                voice_root = os.path.dirname(session['voice_dir'])
                legacy_items = glob(os.path.join(voice_root, '*.wav'))
                legacy_bark = os.path.join(voice_root, 'bark')
                if os.path.isdir(legacy_bark) and not os.path.exists(os.path.join(session['voice_dir'], 'bark')):
                    legacy_items.append(legacy_bark)
                for src in legacy_items:
                    shutil.move(src, os.path.join(session['voice_dir'], os.path.basename(src)))
                session['custom_model_dir'] = os.path.join(models_dir, '__sessions', f"model-{session['id']}")
                if session['custom_model'] is not None:
                    if not os.path.exists(session['custom_model_dir']):
                        os.makedirs(session['custom_model_dir'], exist_ok=True)
                    src_path = Path(session['custom_model'])
                    src_name = src_path.stem
                    if not os.path.exists(os.path.join(session['custom_model_dir'], src_name)):
                        required_files = models[session['tts_engine']]['internal']['files']
                        if analyze_uploaded_file(session['custom_model'], required_files):
                            model = extract_custom_model(session['custom_model'], session)
                            if model is not None:
                                session['custom_model'] = model
                            else:
                                error = f"{session['custom_model']} could not be extracted or mandatory files are missing"
                        else:
                            error = f"{os.path.basename(session['custom_model'])} is not a valid model or some required files are missing"
                if session['voice'] is not None:
                    voice_name = get_sanitized(os.path.splitext(os.path.basename(session['voice']))[0])
                    final_voice_file = os.path.join(session['voice_dir'], f'{voice_name}_24000.wav')
                    if not os.path.exists(final_voice_file):
                        extractor = VoiceExtractor(session, session['voice'], voice_name)
                        status, msg = extractor.extract_voice()
                        if status:
                            session['voice'] = final_voice_file
                        else:
                            error = 'extractor.extract_voice()() failed! Check if you audio file is compatible.'
                            print(error)
            if error is None:
                if session['script_mode'] == NATIVE:
                    program_ok, e = check_programs('Calibre', 'ebook-convert', '--version')
                    if not program_ok:
                        error = f'check_programs() Calibre failed: {e}'
                    program_ok, e = check_programs('FFmpeg', 'ffmpeg', '-version')
                    if not program_ok:
                        error = f'check_programs() FFMPEG failed: {e}'
                if error is None:
                    old_session_dir = os.path.join(tmp_dir, f"ebook-{session['id']}")
                    session['session_dir'] = os.path.join(tmp_dir, f"proc-{session['id']}")
                    if os.path.isdir(old_session_dir):
                        os.rename(old_session_dir, session['session_dir'])
                    session['process_dir'] = os.path.join(session['session_dir'], f"{hashlib.md5(session['ebook'].encode()).hexdigest()}")
                    session['chapters_dir'] = os.path.join(session['process_dir'], "chapters")
                    session['chapters_dir_sentences'] = os.path.join(session['chapters_dir'], 'sentences')
                    if prepare_dirs(args['ebook'], session):
                        session['filename_noext'] = os.path.splitext(os.path.basename(session['ebook']))[0]
                        msg = ''
                        msg_extra = ''
                        vram_avail = get_vram()
                        if vram_avail <= 4:
                            msg_extra += 'VRAM capacity could not be detected. -' if vram_avail == 0 else 'VRAM under 4GB - '
                            if session['tts_engine'] == TTS_ENGINES['BARK']:
                                os.environ['SUNO_USE_SMALL_MODELS'] = 'True'
                                msg_extra += f"Switching BARK to SMALL models - "
                        else:
                            if session['tts_engine'] == TTS_ENGINES['BARK']:
                                os.environ['SUNO_USE_SMALL_MODELS'] = 'False'
                        if session['device'] == 'cuda':
                            session['device'] = session['device'] if torch.cuda.is_available() else 'cpu'
                            if session['device'] == 'cpu':
                                msg += f"GPU not recognized by torch! Read {default_gpu_wiki} - Switching to CPU - "
                        elif session['device'] == 'mps':
                            session['device'] = session['device'] if torch.backends.mps.is_available() else 'cpu'
                            if session['device'] == 'cpu':
                                msg += f"MPS not recognized by torch! Read {default_gpu_wiki} - Switching to CPU - "
                        if session['device'] == 'cpu':
                            if session['tts_engine'] == TTS_ENGINES['BARK']:
                                os.environ['SUNO_OFFLOAD_CPU'] = 'True'
                        if default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'] == True:
                            try:
                                import deepspeed
                            except Exception:
                                default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'] = False
                                msg_extra += 'deepseed not installed or package is broken. set to False - '
                            else:
                                msg_extra += 'deepspeed detected and ready!'
                        if msg == '':
                            msg = f"Using {session['device'].upper()} - "
                        msg += msg_extra
                        if is_gui_process:
                            show_alert({"type": "warning", "msg": msg})
                        print(msg)
                        session['epub_path'] = os.path.join(session['process_dir'], '__' + session['filename_noext'] + '.epub')
                        if convert2epub(session):
                            epubBook = epub.read_epub(session['epub_path'], {'ignore_ncx': True})
                            metadata = dict(session['metadata'])
                            for key in metadata:
                                data = epubBook.get_metadata('DC', key)
                                if data:
                                    # When several values exist the last one wins.
                                    for dc_value, _attributes in data:
                                        metadata[key] = dc_value
                            metadata['language'] = session['language']
                            metadata['title'] = metadata['title'] if metadata['title'] else os.path.splitext(os.path.basename(session['ebook']))[0].replace('_', ' ')
                            metadata['creator'] = False if not metadata['creator'] or metadata['creator'] == 'Unknown' else metadata['creator']
                            session['metadata'] = metadata
                            try:
                                if len(session['metadata']['language']) == 2:
                                    lang_array = languages.get(part1=session['language'])
                                    if lang_array:
                                        session['metadata']['language'] = lang_array.part3
                            except Exception:
                                # Best effort normalization of the metadata language.
                                pass
                            if session['metadata']['language'] != session['language']:
                                warning = f"WARNING!!! language selected {session['language']} differs from the EPUB file language {session['metadata']['language']}"
                                print(warning)
                            session['cover'] = get_cover(epubBook, session)
                            if session['cover']:
                                session['toc'], session['chapters'] = get_chapters(epubBook, session)
                                session['final_name'] = get_sanitized(session['metadata']['title'] + '.' + session['output_format'])
                                if session['chapters'] is not None:
                                    if convert_chapters2audio(session):
                                        msg = 'Conversion successful. Combining sentences and chapters...'
                                        show_alert({"type": "info", "msg": msg})
                                        exported_files = combine_audio_chapters(session)
                                        # combine_audio_chapters() returns None/False on
                                        # failure, so test truthiness instead of len().
                                        if exported_files:
                                            chapters_dirs = [
                                                dir_name for dir_name in os.listdir(session['process_dir'])
                                                if fnmatch.fnmatch(dir_name, "chapters_*") and os.path.isdir(os.path.join(session['process_dir'], dir_name))
                                            ]
                                            shutil.rmtree(os.path.join(session['voice_dir'], 'proc'), ignore_errors=True)
                                            if is_gui_process:
                                                if len(chapters_dirs) > 1:
                                                    if os.path.exists(session['chapters_dir']):
                                                        shutil.rmtree(session['chapters_dir'], ignore_errors=True)
                                                    if os.path.exists(session['epub_path']):
                                                        os.remove(session['epub_path'])
                                                    if os.path.exists(session['cover']):
                                                        os.remove(session['cover'])
                                                else:
                                                    if os.path.exists(session['process_dir']):
                                                        shutil.rmtree(session['process_dir'], ignore_errors=True)
                                            else:
                                                if os.path.exists(session['voice_dir']):
                                                    if not any(os.scandir(session['voice_dir'])):
                                                        shutil.rmtree(session['voice_dir'], ignore_errors=True)
                                                if os.path.exists(session['custom_model_dir']):
                                                    if not any(os.scandir(session['custom_model_dir'])):
                                                        shutil.rmtree(session['custom_model_dir'], ignore_errors=True)
                                                if os.path.exists(session['session_dir']):
                                                    shutil.rmtree(session['session_dir'], ignore_errors=True)
                                            progress_status = f'Audiobook(s) {", ".join(os.path.basename(f) for f in exported_files)} created!'
                                            session['audiobook'] = exported_files[-1]
                                            print(info_session)
                                            return progress_status, True
                                        else:
                                            error = 'combine_audio_chapters() error: exported_files not created!'
                                    else:
                                        error = 'convert_chapters2audio() failed!'
                                else:
                                    error = 'get_chapters() failed!'
                            else:
                                error = 'get_cover() failed!'
                        else:
                            error = 'convert2epub() failed!'
                    else:
                        error = f"Temporary directory {session['process_dir']} not removed due to failure."
        else:
            error = f"Language {args['language']} is not supported."
        # The session does not exist yet when the language check failed.
        if session is not None and session['cancellation_requested']:
            error = 'Cancelled'
        elif session_id is not None and not is_gui_process:
            error = (error or '') + info_session
        print(error)
        return error, False
    except Exception as e:
        print(f'convert_ebook() Exception: {e}')
        return e, False
def restore_session_from_data(data, session):
    """Copy the values of ``data`` into ``session`` for keys ``session`` already has.

    Keys missing from ``session`` are ignored. When both the new and the
    current value are dicts, they are merged recursively under the same rule;
    otherwise the current value is replaced. Any exception is passed to
    ``alert_exception``.
    """
    try:
        for key, value in data.items():
            if key not in session:
                continue
            if isinstance(value, dict):
                current = session[key]
                if isinstance(current, dict):
                    restore_session_from_data(value, current)
                    continue
            session[key] = value
    except Exception as e:
        alert_exception(e)
def reset_ebook_session(id):
    """Reset the per-eBook fields of the session ``id`` to their blank values.

    The session is fetched from ``context`` and updated through
    ``restore_session_from_data``, so only keys already present in the
    session are touched.
    """
    metadata_fields = (
        "title", "creator", "contributor", "language", "identifier",
        "publisher", "date", "description", "subject", "rights", "format",
        "type", "coverage", "relation", "Source", "Modified",
    )
    session = context.get_session(id)
    blank_state = {
        "ebook": None,
        "chapters_dir": None,
        "chapters_dir_sentences": None,
        "epub_path": None,
        "filename_noext": None,
        "chapters": None,
        "cover": None,
        "status": None,
        "progress": 0,
        "time": None,
        "cancellation_requested": False,
        "event": None,
        "metadata": dict.fromkeys(metadata_fields),
    }
    restore_session_from_data(blank_state, session)
def get_all_ip_addresses():
    """Return the IPv4 and IPv6 addresses reported by ``psutil.net_if_addrs()``.

    Addresses are listed interface by interface, in the order psutil
    reports them.
    """
    ip_families = (socket.AF_INET, socket.AF_INET6)
    return [
        entry.address
        for entries in psutil.net_if_addrs().values()
        for entry in entries
        if entry.family in ip_families
    ]
def show_alert(state):
    """Call the gradio notification matching ``state['type']`` with ``state['msg']``.

    ``state`` must be a dict with a ``type`` of ``'error'``, ``'warning'``,
    ``'info'`` or ``'success'``; anything else is ignored.
    """
    # NOTE(review): gr.Error is only constructed here, not raised - confirm
    # that this is enough to display the error in the UI.
    if not isinstance(state, dict):
        return
    kind = state['type']
    if kind is None:
        return
    notifier_names = {
        'error': 'Error',
        'warning': 'Warning',
        'info': 'Info',
        'success': 'Success',
    }
    notifier_name = notifier_names.get(kind)
    if notifier_name is None:
        return
    # Resolved lazily so only the notifier actually used is looked up on gr.
    getattr(gr, notifier_name)(state['msg'])
def web_interface(args, ctx):
global context
script_mode = args['script_mode']
is_gui_process = args['is_gui_process']
is_gui_shared = args['share']
glass_mask_msg = 'Initialization, please wait...'
ebook_src = None
language_options = [
(
f"{details['name']} - {details['native_name']}" if details['name'] != details['native_name'] else details['name'],
lang
)
for lang, details in language_mapping.items()
]
voice_options = []
tts_engine_options = []
custom_model_options = []
fine_tuned_options = []
audiobook_options = []
src_label_file = 'Select a File'
src_label_dir = 'Select a Directory'
visible_gr_tab_xtts_params = interface_component_options['gr_tab_xtts_params']
visible_gr_tab_bark_params = interface_component_options['gr_tab_bark_params']
visible_gr_group_custom_model = interface_component_options['gr_group_custom_model']
visible_gr_group_voice_file = interface_component_options['gr_group_voice_file']
# Buffer for real-time log streaming
log_buffer = Queue()
# Event to signal when the process should stop
thread = None
stop_event = threading.Event()
context = ctx
theme = gr.themes.Origin(
primary_hue='green',
secondary_hue='amber',
neutral_hue='gray',
radius_size='lg',
font_mono=['JetBrains Mono', 'monospace', 'Consolas', 'Menlo', 'Liberation Mono']
)
with gr.Blocks(theme=theme, delete_cache=(86400, 86400)) as app:
main_html = gr.HTML(
'''
'''
)
gr_logo_markdown = gr.Markdown(elem_id='gr_markdown_logo', value=f'''
')
with gr.Group(elem_id='gr_group_session'):
gr_session = gr.Textbox(label='Session', elem_id='gr_session', interactive=False)
gr_output_format_list = gr.Dropdown(label='Output format', elem_id='gr_output_format_list', choices=output_formats, type='value', value=default_output_format, interactive=True)
gr_tab_xtts_params = gr.TabItem('XTTSv2 Fine Tuned Parameters', elem_id='gr_tab_xtts_params', elem_classes='tab_item', visible=visible_gr_tab_xtts_params)
with gr_tab_xtts_params:
gr.Markdown(
elem_id='gr_markdown_tab_xtts_params',
value='''
### Customize XTTSv2 Parameters
Adjust the settings below to influence how the audio is generated. You can control the creativity, speed, repetition, and more.
'''
)
gr_xtts_temperature = gr.Slider(
label='Temperature',
minimum=0.1,
maximum=10.0,
step=0.1,
value=float(default_engine_settings[TTS_ENGINES['XTTSv2']]['temperature']),
elem_id='gr_xtts_temperature',
info='Higher values lead to more creative, unpredictable outputs. Lower values make it more monotone.'
)
gr_xtts_length_penalty = gr.Slider(
label='Length Penalty',
minimum=0.3,
maximum=5.0,
step=0.1,
value=float(default_engine_settings[TTS_ENGINES['XTTSv2']]['length_penalty']),
elem_id='gr_xtts_length_penalty',
info='Adjusts how much longer sequences are preferred. Higher values encourage the model to produce longer and more natural speech.',
visible=False
)
gr_xtts_num_beams = gr.Slider(
label='Number Beams',
minimum=1,
maximum=10,
step=1,
value=int(default_engine_settings[TTS_ENGINES['XTTSv2']]['num_beams']),
elem_id='gr_xtts_num_beams',
info='Controls how many alternative sequences the model explores. Higher values improve speech coherence and pronunciation but increase inference time.',
visible=False
)
gr_xtts_repetition_penalty = gr.Slider(
label='Repetition Penalty',
minimum=1.0,
maximum=10.0,
step=0.1,
value=float(default_engine_settings[TTS_ENGINES['XTTSv2']]['repetition_penalty']),
elem_id='gr_xtts_repetition_penalty',
info='Penalizes repeated phrases. Higher values reduce repetition.'
)
gr_xtts_top_k = gr.Slider(
label='Top-k Sampling',
minimum=10,
maximum=100,
step=1,
value=int(default_engine_settings[TTS_ENGINES['XTTSv2']]['top_k']),
elem_id='gr_xtts_top_k',
info='Lower values restrict outputs to more likely words and increase speed at which audio generates.'
)
gr_xtts_top_p = gr.Slider(
label='Top-p Sampling',
minimum=0.1,
maximum=1.0,
step=0.01,
value=float(default_engine_settings[TTS_ENGINES['XTTSv2']]['top_p']),
elem_id='gr_xtts_top_p',
info='Controls cumulative probability for word selection. Lower values make the output more predictable and increase speed at which audio generates.'
)
gr_xtts_speed = gr.Slider(
label='Speed',
minimum=0.5,
maximum=3.0,
step=0.1,
value=float(default_engine_settings[TTS_ENGINES['XTTSv2']]['speed']),
elem_id='gr_xtts_speed',
info='Adjusts how fast the narrator will speak.'
)
gr_xtts_enable_text_splitting = gr.Checkbox(
label='Enable Text Splitting',
value=default_engine_settings[TTS_ENGINES['XTTSv2']]['enable_text_splitting'],
elem_id='gr_xtts_enable_text_splitting',
info='Coqui-tts builtin text splitting. Can help against hallucinations bu can also be worse.',
visible=False
)
gr_tab_bark_params = gr.TabItem('BARK fine Tuned Parameters', elem_id='gr_tab_bark_params', elem_classes='tab_item', visible=visible_gr_tab_bark_params)
with gr_tab_bark_params:
gr.Markdown(
elem_id='gr_markdown_tab_bark_params',
value='''
### Customize BARK Parameters
Adjust the settings below to influence how the audio is generated, emotional and voice behavior random or more conservative
'''
)
gr_bark_text_temp = gr.Slider(
label='Text Temperature',
minimum=0.0,
maximum=1.0,
step=0.01,
value=float(default_engine_settings[TTS_ENGINES['BARK']]['text_temp']),
elem_id='gr_bark_text_temp',
info='Higher values lead to more creative, unpredictable outputs. Lower values make it more conservative.'
)
gr_bark_waveform_temp = gr.Slider(
label='Waveform Temperature',
minimum=0.0,
maximum=1.0,
step=0.01,
value=float(default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp']),
elem_id='gr_bark_waveform_temp',
info='Higher values lead to more creative, unpredictable outputs. Lower values make it more conservative.'
)
gr_state = gr.State(value={"hash": None})
gr_state_alert = gr.State(value={"type": None,"msg": None})
gr_read_data = gr.JSON(visible=False)
gr_write_data = gr.JSON(visible=False)
gr_conversion_progress = gr.Textbox(elem_id='gr_conversion_progress', label='Progress')
gr_group_audiobook_list = gr.Group(elem_id='gr_group_audiobook_list', visible=False)
with gr_group_audiobook_list:
gr_audiobook_text = gr.Textbox(elem_id='gr_audiobook_text', label='Audiobook', interactive=False, visible=True)
gr_audiobook_player = gr.Audio(elem_id='gr_audiobook_player', label='',type='filepath', waveform_options=gr.WaveformOptions(show_recording_waveform=False), show_download_button=False, show_share_button=False, container=True, interactive=False, visible=True)
with gr.Row():
gr_audiobook_download_btn = gr.DownloadButton(elem_id='gr_audiobook_download_btn', label='↧', elem_classes=['small-btn'], variant='secondary', interactive=True, visible=True, scale=0, min_width=60)
gr_audiobook_list = gr.Dropdown(elem_id='gr_audiobook_list', label='', choices=audiobook_options, type='value', interactive=True, visible=True, scale=2)
gr_audiobook_del_btn = gr.Button(elem_id='gr_audiobook_del_btn', value='🗑', elem_classes=['small-btn'], variant='secondary', interactive=True, visible=True, scale=0, min_width=60)
gr_convert_btn = gr.Button(elem_id='gr_convert_btn', value='📚', elem_classes='icon-btn', variant='primary', interactive=False)
gr_modal = gr.HTML(visible=False)
gr_glass_mask = gr.HTML(f'