This commit is contained in:
unknown
2025-08-04 05:28:31 -07:00
parent 74820e9260
commit d285703fc8
4 changed files with 38 additions and 36 deletions

View File

@@ -7,8 +7,9 @@ from .models import (
from .conf import (
FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
audiobooks_host_dir, debug_mode, default_audio_proc_format, default_device,
default_gpu_wiki, default_output_format, device_list, ebook_formats,
audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
default_audio_proc_format, default_device, default_gpu_wiki,
default_output_format, device_list, ebook_formats,
ebooks_dir, interface_component_options, interface_concurrency_limit,
interface_host, interface_port, interface_shared_tmp_expire,
max_python_version, min_python_version, models_dir, os,
@@ -36,9 +37,9 @@ __all__ = [
# from conf
"FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
"audiobooks_host_dir", "debug_mode", "default_audio_proc_format",
"default_device", "default_gpu_wiki", "default_output_format",
"device_list", "ebook_formats", "ebooks_dir",
"audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
"default_audio_proc_format", "default_device", "default_gpu_wiki",
"default_output_format", "device_list", "ebook_formats", "ebooks_dir",
"interface_component_options", "interface_concurrency_limit",
"interface_host", "interface_port", "interface_shared_tmp_expire",
"max_python_version", "min_python_version", "models_dir", "os",

View File

@@ -306,7 +306,7 @@ class Coqui:
new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
if normalize_audio(proc_voice_path, new_voice_path, 24000):
if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate):
del audio_data, sourceTensor, audio_tensor
if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
del tts

View File

@@ -10,7 +10,7 @@ from io import BytesIO
from pydub import AudioSegment, silence
from pydub.silence import detect_silence
from lib.conf import voice_formats
from lib.conf import voice_formats, default_audio_proc_samplerate
from lib.models import TTS_ENGINES, models
from lib.classes.background_detector import BackgroundDetector
@@ -204,8 +204,9 @@ class VoiceExtractor:
raise ValueError(error)
def _normalize_audio(self):
try:
process_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
try:
rate = default_audio_proc_samplerate
process_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
filter_complex = (
'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
@@ -226,34 +227,33 @@ class VoiceExtractor:
'-y', process_file
]
error = None
for rate in ['16000', '24000']:
ffmpeg_cmd[-3] = rate
output_file = re.sub(r'\.wav$', f'_{rate}.wav', process_file)
ffmpeg_cmd[-1] = output_file
try:
process = subprocess.Popen(
ffmpeg_cmd,
env={},
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
universal_newlines=True,
encoding='utf-8'
)
for line in process.stdout:
print(line, end='') # Print each line of stdout
process.wait()
if process.returncode != 0:
error = f'_normalize_audio(): process.returncode: {process.returncode}'
break
elif not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
error = f'_normalize_audio() error: {output_file} was not created or is empty.'
break
else:
self.final_files.append(output_file)
except subprocess.CalledProcessError as e:
error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
ffmpeg_cmd[-3] = rate
output_file = re.sub(r'_proc\.wav$', f'.wav', process_file)
ffmpeg_cmd[-1] = output_file
try:
process = subprocess.Popen(
ffmpeg_cmd,
env={},
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
universal_newlines=True,
encoding='utf-8'
)
for line in process.stdout:
print(line, end='') # Print each line of stdout
process.wait()
if process.returncode != 0:
error = f'_normalize_audio(): process.returncode: {process.returncode}'
break
elif not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
error = f'_normalize_audio() error: {output_file} was not created or is empty.'
break
else:
self.final_files.append(output_file)
except subprocess.CalledProcessError as e:
error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
break
shutil.rmtree(self.demucs_dir, ignore_errors=True)
if os.path.exists(process_file):
os.remove(process_file)

View File

@@ -71,6 +71,7 @@ audiobooks_cli_dir = os.path.abspath(os.path.join('audiobooks','cli'))
# File extensions accepted as input, and container/codec names offered as output.
ebook_formats = ['.epub', '.mobi', '.azw3', '.fb2', '.lrf', '.rb', '.snb', '.tcr', '.pdf', '.txt', '.rtf', '.doc', '.docx', '.html', '.odt', '.azw'] # Add or remove the format you accept as input
voice_formats = ['.mp4', '.m4b', '.m4a', '.mp3', '.wav', '.aac', '.flac', '.alac', '.ogg', '.aiff', '.aif', '.wma', '.dsd', '.opus', '.pcmu', '.pcma', '.gsm'] # Add or remove the format you accept as input
output_formats = ['aac', 'flac', 'mp3', 'm4b', 'm4a', 'mp4', 'mov', 'ogg', 'wav', 'webm']
# Sample rate shared by the audio-processing steps. It is a string (not an int)
# because VoiceExtractor._normalize_audio() assigns it directly into the ffmpeg
# argument list.
# NOTE(review): the Coqui voice-conversion path also passes this value to
# normalize_audio(), which was previously given the integer 24000 - confirm
# that function accepts a str.
default_audio_proc_samplerate = '24000'
default_audio_proc_format = 'flac' # or 'mp3', 'aac', 'm4a', 'm4b', 'amr', '3gp', 'alac'. 'wav' format is ok but limited to process files < 4GB
default_output_format = 'm4b'