v25.12.32

ROBERT MCDOWELL
2025-12-30 19:12:19 -08:00
committed by GitHub
15 changed files with 102 additions and 67 deletions

View File

@@ -3,8 +3,8 @@ FROM python:${PYTHON_VERSION}-slim-bookworm
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
ARG APP_VERSION=25.12.31
ARG DEVICE_TAG=cpu
ARG APP_VERSION=25.12.32
ARG DEVICE_TAG=cu128
ARG DOCKER_DEVICE_STR='{"name": "cu128", "os": "manylinux_2_28", "arch": "x86_64", "pyvenv": [3, 12], "tag": "cu128", "note": "default device"}'
ARG DOCKER_PROGRAMS_STR="curl ffmpeg nodejs npm espeak-ng sox tesseract-ocr"
ARG CALIBRE_INSTALLER_URL="https://download.calibre-ebook.com/linux-installer.sh"

View File

@@ -1 +1 @@
25.12.31
25.12.32

View File

@@ -5,7 +5,7 @@ services:
context: .
dockerfile: Dockerfile
args:
APP_VERSION: ${APP_VERSION:-25.25.25}
APP_VERSION: ${APP_VERSION:-25.12.32}
DEVICE_TAG: ${DEVICE_TAG:-cpu} # e.g. cu128, cu118, rocm, xpu, cpu
container_name: ebook2audiobook
working_dir: /app

View File

@@ -8,8 +8,7 @@ ebooklib
fastapi
hf_xet
beautifulsoup4
sudachipy
sudachidict-core
nagisa
pymupdf
pymupdf-layout
pytesseract

View File

@@ -428,7 +428,8 @@ class DeviceInstaller():
):
if os.path.exists(p):
with open(p, 'r', encoding='utf-8', errors='ignore') as f:
version = f.read()
v = f.read()
version = lib_version_parse(v)
break
elif os.name == 'nt':
for env in ('ROCM_PATH', 'HIP_PATH'):
@@ -447,10 +448,14 @@ class DeviceInstaller():
break
if version:
cmp = toolkit_version_compare(version, rocm_version_range)
min_version = rocm_version_range["min"]
max_version = rocm_version_range["max"]
min_version_str = ".".join(map(str, min_version)) if isinstance(min_version, (tuple, list)) else str(min_version)
max_version_str = ".".join(map(str, max_version)) if isinstance(max_version, (tuple, list)) else str(max_version)
if cmp == -1:
msg = f'ROCm {version} < min {rocm_version_range["min"]}. Please upgrade.'
msg = f'ROCm {version} < min {min_version_str}. Please upgrade.'
elif cmp == 1:
msg = f'ROCm {version} > max {rocm_version_range["max"]}. Falling back to CPU.'
msg = f'ROCm {version} > max {max_version_str}. Falling back to CPU.'
elif cmp == 0:
devices['ROCM']['found'] = True
parts = version.split(".")
@@ -531,10 +536,12 @@ class DeviceInstaller():
break
if version:
cmp = toolkit_version_compare(version, cuda_version_range)
min_ver = ".".join(str(part) for part in cuda_version_range["min"])
max_ver = ".".join(str(part) for part in cuda_version_range["max"])
if cmp == -1:
msg = f'CUDA {version} < min {cuda_version_range["min"]}. Please upgrade.'
msg = f'CUDA {version} < min {min_ver}. Please upgrade.'
elif cmp == 1:
msg = f'CUDA {version} > max {cuda_version_range["max"]}. Falling back to CPU.'
msg = f'CUDA {version} > max {max_ver}. Falling back to CPU.'
elif cmp == 0:
devices['CUDA']['found'] = True
parts = version.split(".")
@@ -580,7 +587,14 @@ class DeviceInstaller():
if version:
cmp = toolkit_version_compare(version, xpu_version_range)
if cmp == -1 or cmp == 1:
msg = f'XPU {version} out of supported range {xpu_version_range}. Falling back to CPU.'
range_display = (
f"{xpu_version_range.get('min')} to {xpu_version_range.get('max')}"
if isinstance(xpu_version_range, dict)
and 'min' in xpu_version_range
and 'max' in xpu_version_range
else str(xpu_version_range)
)
msg = f'XPU {version} out of supported range {range_display}. Falling back to CPU.'
elif cmp == 0:
devices['XPU']['found'] = True
name = 'xpu'
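Aside from parsing the version file through lib_version_parse(), the main change in these hunks is cosmetic: the min/max bounds of the ROCm, CUDA and XPU version ranges are now joined into dotted strings before being interpolated into the warning messages, instead of printing raw tuples or the whole range dict. Below is a minimal sketch of that formatting and of the -1/0/1 comparison convention the messages rely on; the range shape and both helper names are assumptions for illustration, not the repo's lib_version_parse / toolkit_version_compare.

# Illustrative sketch only; helper names and the range shape are assumed.
cuda_version_range = {"min": (11, 8), "max": (12, 8)}

def format_bound(bound):
    # (12, 8) -> "12.8"; anything that is not a tuple/list passes through unchanged
    return ".".join(map(str, bound)) if isinstance(bound, (tuple, list)) else str(bound)

def compare_version(version, version_range):
    # -1: below min, 1: above max, 0: inside the range (same cmp convention as above)
    parts = tuple(int(p) for p in version.split(".")[:2])
    if parts < tuple(version_range["min"]):
        return -1
    if parts > tuple(version_range["max"]):
        return 1
    return 0

version = "12.9"
cmp = compare_version(version, cuda_version_range)
if cmp == -1:
    print(f"CUDA {version} < min {format_bound(cuda_version_range['min'])}. Please upgrade.")
elif cmp == 1:
    print(f"CUDA {version} > max {format_bound(cuda_version_range['max'])}. Falling back to CPU.")
else:
    print(f"CUDA {version} is inside the supported range.")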

View File

@@ -122,6 +122,9 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
return False
if self.engine:
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
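The three added lines are a guard that every engine in this commit receives (bark, fairseq, tacotron, vits, xtts, yourtts): fragments shorter than three characters or containing no alphanumeric character are skipped and reported as handled (return True) instead of being sent to the synthesizer. A standalone sketch of the check, with the function name invented here for illustration:

# Illustrative only; in the commit the check is inlined in each engine's convert path.
def should_skip(sentence: str) -> bool:
    s = sentence.strip()
    return len(s) < 3 or not any(c.isalnum() for c in s)

print(should_skip('   .  '))   # True  -> punctuation-only fragment, nothing to synthesize
print(should_skip('Hi'))       # True  -> shorter than three characters
print(should_skip('Hello.'))   # False -> goes through TTS normally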

View File

@@ -78,6 +78,9 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
if self.engine:
device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds

View File

@@ -103,6 +103,9 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
if self.engine:
device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds

View File

@@ -86,6 +86,9 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
if self.engine:
device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds

View File

@@ -85,6 +85,9 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
return False
if self.engine:
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds

View File

@@ -74,6 +74,9 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
return False
if self.engine:
final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
s = sentence.strip()
if len(s) < 3 or not any(c.isalnum() for c in s):
return True
if sentence == TTS_SML['break']:
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds

View File

@@ -1039,10 +1039,15 @@ def get_sentences(text:str, id:str)->list|None:
jieba.dt.cache_file = os.path.join(models_dir, 'jieba.cache')
result.extend([t for t in jieba.cut(segment) if t.strip()])
elif lang == 'jpn':
"""
from sudachipy import dictionary, tokenizer
sudachi = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C
result.extend([m.surface() for m in sudachi.tokenize(segment, mode) if m.surface().strip()])
"""
import nagisa
tokens = nagisa.tagging(segment).words
result.extend(tokens)
elif lang == 'kor':
from soynlp.tokenizer import LTokenizer
ltokenizer = LTokenizer()
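Japanese segmentation switches from SudachiPy (kept above only as a commented-out block) to nagisa, matching the sudachipy / sudachidict-core to nagisa swap in the dependency lists elsewhere in this commit. A minimal usage sketch of the call used in get_sentences(); the sample text and output are illustrative:

import nagisa  # pip install nagisa

segment = 'これはテストです'
tokens = nagisa.tagging(segment).words   # word-level tokens, as in the hunk above
print(tokens)                            # e.g. ['これ', 'は', 'テスト', 'です']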
@@ -1679,55 +1684,56 @@ def convert_chapters2audio(id:str)->bool:
print(msg)
if session['is_gui_process']:
progress_bar = gr.Progress(track_tqdm=False)
ebook_name = Path(session['ebook']).name
with tqdm(total=total_iterations, desc='0.00%', bar_format='{desc}: {n_fmt}/{total_fmt} ', unit='step', initial=0) as t:
for x in range(0, total_chapters):
chapter_num = x + 1
chapter_audio_file = f'chapter_{chapter_num}.{default_audio_proc_format}'
sentences = session['chapters'][x]
sentences_count = sum(1 for row in sentences if row.strip() not in TTS_SML.values())
start = sentence_number
msg = f'Block {chapter_num} containing {sentences_count} sentences...'
print(msg)
for i, sentence in enumerate(sentences):
if session['cancellation_requested']:
msg = 'Cancel requested'
print(msg)
return False
if sentence_number in missing_sentences or sentence_number > resume_sentence or (sentence_number == 0 and resume_sentence == 0):
if sentence_number <= resume_sentence and sentence_number > 0:
msg = f'**Recovering missing file sentence {sentence_number}'
if session['ebook']:
ebook_name = Path(session['ebook']).name
with tqdm(total=total_iterations, desc='0.00%', bar_format='{desc}: {n_fmt}/{total_fmt} ', unit='step', initial=0) as t:
for x in range(0, total_chapters):
chapter_num = x + 1
chapter_audio_file = f'chapter_{chapter_num}.{default_audio_proc_format}'
sentences = session['chapters'][x]
sentences_count = sum(1 for row in sentences if row.strip() not in TTS_SML.values())
start = sentence_number
msg = f'Block {chapter_num} containing {sentences_count} sentences...'
print(msg)
for i, sentence in enumerate(sentences):
if session['cancellation_requested']:
msg = 'Cancel requested'
print(msg)
sentence = sentence.strip()
success = tts_manager.convert_sentence2audio(sentence_number, sentence) if sentence else True
if success:
total_progress = (t.n + 1) / total_iterations
if session['is_gui_process']:
progress_bar(progress=total_progress, desc=ebook_name)
is_sentence = sentence.strip() not in TTS_SML.values()
percentage = total_progress * 100
t.set_description(f"{percentage:.2f}%")
msg = f' : {sentence}' if is_sentence else f' : {sentence}'
return False
if sentence_number in missing_sentences or sentence_number > resume_sentence or (sentence_number == 0 and resume_sentence == 0):
if sentence_number <= resume_sentence and sentence_number > 0:
msg = f'**Recovering missing file sentence {sentence_number}'
print(msg)
sentence = sentence.strip()
success = tts_manager.convert_sentence2audio(sentence_number, sentence) if sentence else True
if success:
total_progress = (t.n + 1) / total_iterations
if session['is_gui_process']:
progress_bar(progress=total_progress, desc=ebook_name)
is_sentence = sentence.strip() not in TTS_SML.values()
percentage = total_progress * 100
t.set_description(f"{percentage:.2f}%")
msg = f' : {sentence}' if is_sentence else f' : {sentence}'
print(msg)
else:
return False
if sentence.strip() not in TTS_SML.values():
sentence_number += 1
t.update(1)
end = sentence_number - 1 if sentence_number > 1 else sentence_number
msg = f'End of Block {chapter_num}'
print(msg)
if chapter_num in missing_chapters or sentence_number > resume_sentence:
if chapter_num <= resume_chapter:
msg = f'**Recovering missing file block {chapter_num}'
print(msg)
if combine_audio_sentences(chapter_audio_file, int(start), int(end), id):
msg = f'Combining block {chapter_num} to audio, sentence {start} to {end}'
print(msg)
else:
msg = 'combine_audio_sentences() failed!'
print(msg)
return False
if sentence.strip() not in TTS_SML.values():
sentence_number += 1
t.update(1)
end = sentence_number - 1 if sentence_number > 1 else sentence_number
msg = f'End of Block {chapter_num}'
print(msg)
if chapter_num in missing_chapters or sentence_number > resume_sentence:
if chapter_num <= resume_chapter:
msg = f'**Recovering missing file block {chapter_num}'
print(msg)
if combine_audio_sentences(chapter_audio_file, int(start), int(end), id):
msg = f'Combining block {chapter_num} to audio, sentence {start} to {end}'
print(msg)
else:
msg = 'combine_audio_sentences() failed!'
print(msg)
return False
return True
except Exception as e:
DependencyError(e)
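Apart from the new if session['ebook']: guard that the conversion loop now sits under, the per-sentence progress accounting is unchanged: each iteration advances the tqdm bar and rewrites its description as a percentage of total_iterations. A minimal sketch of that pattern with a made-up total:

from tqdm import tqdm

total_iterations = 250   # assumed: total number of sentences across all chapters
with tqdm(total=total_iterations, desc='0.00%',
          bar_format='{desc}: {n_fmt}/{total_fmt} ', unit='step') as t:
    for _ in range(total_iterations):
        # ... one sentence would be converted here ...
        total_progress = (t.n + 1) / total_iterations
        t.set_description(f'{total_progress * 100:.2f}%')
        t.update(1)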
@@ -1941,11 +1947,10 @@ def combine_audio_chapters(id:str)->list[str]|None:
target_rate = '48000'
cmd += ['-c:a', 'libopus', '-compression_level', '0', '-b:a', '192k', '-ar', target_rate]
cmd += ['-map_metadata', '1']
if 'output_channel' in session:
if session['output_channel'] == 'mono':
cmd += ['-ac', '1']
elif session['output_channel'] == 'stereo':
cmd += ['-ac', '2']
if session['output_channel'] == 'stereo':
cmd += ['-ac', '2']
else:
cmd += ['-ac', '1']
if input_codec == target_codec and input_rate == target_rate:
cmd = [
shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', ffmpeg_combined_audio,
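The channel selection is simplified: rather than first checking that 'output_channel' is present and mapping 'mono' and 'stereo' separately, the new code emits '-ac 2' only for 'stereo' and falls back to '-ac 1' for any other value, relying on convert_ebook() now always setting session['output_channel'] (next hunk). A small sketch of how the flag lands in the ffmpeg argument list; the session dict and cmd prefix here are placeholders:

# Placeholder values; in combine_audio_chapters() cmd already holds the full ffmpeg command.
session = {'output_channel': 'stereo'}
cmd = ['-c:a', 'libopus', '-b:a', '192k']
cmd += ['-ac', '2'] if session['output_channel'] == 'stereo' else ['-ac', '1']
print(cmd)   # ends with '-ac', '2' for stereo, otherwise '-ac', '1'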
@@ -2317,6 +2322,7 @@ def convert_ebook(args:dict)->tuple:
session['bark_waveform_temp'] = float(args['bark_waveform_temp'])
session['audiobooks_dir'] = str(args['audiobooks_dir']) if args['audiobooks_dir'] else None
session['output_format'] = str(args['output_format'])
session['output_channel'] = str(args['output_channel'])
session['output_split'] = bool(args['output_split'])
session['output_split_hours'] = args['output_split_hours'] if args['output_split_hours'] is not None else default_output_split_hours
session['model_cache'] = f"{session['tts_engine']}-{session['fine_tuned']}"

View File

@@ -5,7 +5,7 @@ services:
context: .
dockerfile: Dockerfile
args:
APP_VERSION: ${APP_VERSION:-25.25.25}
APP_VERSION: ${APP_VERSION:-25.12.32}
DEVICE_TAG: ${DEVICE_TAG:-cpu} # e.g. cu124, cu128, rocm, xpu, cpu etc.
container_name: ebook2audiobook
working_dir: /app

View File

@@ -27,8 +27,7 @@ dependencies = [
"fastapi",
"hf_xet",
"beautifulsoup4",
"sudachipy",
"sudachidict-core",
"nagisa",
"pymupdf",
"pymupdf-layout",
"pytesseract",

View File

@@ -8,8 +8,7 @@ ebooklib
fastapi
hf_xet
beautifulsoup4
sudachipy
sudachidict-core
nagisa
pymupdf
pymupdf-layout
pytesseract