Mirror of https://github.com/DrewThomasson/ebook2audiobook.git
@@ -122,6 +122,9 @@ class Bark(TTSUtils, TTSRegistry, name='bark'):
                 return False
         if self.engine:
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
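The guard added here (and repeated in each engine below) bails out before synthesis when a sentence has no speakable content. A minimal standalone sketch of the check, with a hypothetical helper name (in the diff it is inlined):

# Sketch of the guard added above, pulled out as a standalone helper;
# the name is_skippable is hypothetical, the check itself is verbatim.
def is_skippable(sentence: str) -> bool:
    s = sentence.strip()
    # Skip fragments shorter than 3 chars or with no letters/digits;
    # returning True tells the caller the sentence was "handled"
    # without synthesizing any audio.
    return len(s) < 3 or not any(c.isalnum() for c in s)

assert is_skippable('...')       # punctuation only
assert is_skippable(' - ')       # strips to a single dash
assert is_skippable('A.')        # only 2 chars after strip
assert not is_skippable('Hi there.')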
@@ -78,6 +78,9 @@ class Fairseq(TTSUtils, TTSRegistry, name='fairseq'):
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
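The device line in this and the following hunks folds 'jetson' into the CUDA path. A sketch of the lookup; the shape of the devices table here is an assumption, only the conditional pattern is taken from the diff:

# Assumed shape of the devices table; the real one lives elsewhere in the repo.
devices = {'CUDA': {'proc': 'cuda:0'}}

def resolve_device(session_device: str) -> str:
    # 'cuda' and 'jetson' both resolve to the CUDA processing device;
    # anything else ('cpu', 'mps', ...) passes through unchanged.
    return devices['CUDA']['proc'] if session_device in ['cuda', 'jetson'] else session_device

assert resolve_device('jetson') == 'cuda:0'
assert resolve_device('cpu') == 'cpu'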
@@ -103,6 +103,9 @@ class Tacotron2(TTSUtils, TTSRegistry, name='tacotron'):
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
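The TTS_SML['break'] branch visible in every hunk's context renders a random-length stretch of silence as a zero tensor. A sketch under an assumed 24 kHz sample rate (the real value comes from self.params['samplerate']); note the sampled range is 0.3 to 0.6 s, while the inline comment still says 0.4 to 0.7:

import numpy as np
import torch

samplerate = 24000  # assumed; the engines read self.params['samplerate']

# Draw a silence duration and truncate to 2 decimals, as in the diff.
silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
break_tensor = torch.zeros(1, int(samplerate * silence_time))

# uniform(0.3, 0.6) truncated yields 0.30-0.59 s, so the "0.4 to 0.7
# seconds" comment in the context lines looks stale.
assert 0.30 <= silence_time < 0.60
assert break_tensor.shape == (1, int(samplerate * silence_time))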
@@ -86,6 +86,9 @@ class Vits(TTSUtils, TTSRegistry, name='vits'):
         if self.engine:
             device = devices['CUDA']['proc'] if self.session['device'] in ['cuda', 'jetson'] else self.session['device']
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
@@ -85,6 +85,9 @@ class XTTSv2(TTSUtils, TTSRegistry, name='xtts'):
                 return False
         if self.engine:
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
@@ -74,6 +74,9 @@ class YourTTS(TTSUtils, TTSRegistry, name='yourtts'):
                 return False
         if self.engine:
             final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_index}.{default_audio_proc_format}')
+            s = sentence.strip()
+            if len(s) < 3 or not any(c.isalnum() for c in s):
+                return True
             if sentence == TTS_SML['break']:
                 silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                 break_tensor = torch.zeros(1, int(self.params['samplerate'] * silence_time)) # 0.4 to 0.7 seconds
lib/core.py (16 lines changed)
@@ -1047,11 +1047,7 @@ def get_sentences(text:str, id:str)->list|None:
         """
         import nagisa
         tokens = nagisa.tagging(segment).words
-        result.extend([
-            f' {token}'
-            for token in tokens
-            if token.strip()
-        ])
+        result.extend(tokens)
     elif lang == 'kor':
         from soynlp.tokenizer import LTokenizer
         ltokenizer = LTokenizer()
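The Japanese branch now extends result with nagisa's raw word list instead of space-prefixing and filtering each token. A sketch of the before/after behavior, assuming nagisa is installed (the sample sentence and its segmentation are illustrative):

import nagisa  # Japanese tokenizer used in the branch above

segment = 'これはテストです'
tokens = nagisa.tagging(segment).words  # e.g. ['これ', 'は', 'テスト', 'です']

# Old behavior: prefix each non-empty token with a space.
old = [f' {t}' for t in tokens if t.strip()]
# New behavior: take the raw token list as-is, no prefixing or filtering.
new = list(tokens)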
@@ -1951,11 +1947,10 @@ def combine_audio_chapters(id:str)->list[str]|None:
             target_rate = '48000'
             cmd += ['-c:a', 'libopus', '-compression_level', '0', '-b:a', '192k', '-ar', target_rate]
         cmd += ['-map_metadata', '1']
-        if 'output_channel' in session:
-            if session['output_channel'] == 'mono':
-                cmd += ['-ac', '1']
-            elif session['output_channel'] == 'stereo':
-                cmd += ['-ac', '2']
+        if session['output_channel'] == 'stereo':
+            cmd += ['-ac', '2']
+        else:
+            cmd += ['-ac', '1']
         if input_codec == target_codec and input_rate == target_rate:
             cmd = [
                 shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', ffmpeg_combined_audio,
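With the rewrite, every ffmpeg invocation carries an explicit -ac flag: mono is the fallback for anything that is not 'stereo', whereas the old code emitted no flag at all when 'output_channel' was missing from the session. A sketch of the command assembly, with placeholder file names and a minimal session dict:

import shutil

# Placeholders; the real session and input path come from the pipeline.
session = {'output_channel': 'stereo'}
ffmpeg_combined_audio = 'combined.flac'

cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', ffmpeg_combined_audio]
cmd += ['-c:a', 'libopus', '-compression_level', '0', '-b:a', '192k', '-ar', '48000']
cmd += ['-map_metadata', '1']
# Simplified channel handling: stereo when asked, mono otherwise.
if session['output_channel'] == 'stereo':
    cmd += ['-ac', '2']
else:
    cmd += ['-ac', '1']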
@@ -2327,6 +2322,7 @@ def convert_ebook(args:dict)->tuple:
         session['bark_waveform_temp'] = float(args['bark_waveform_temp'])
         session['audiobooks_dir'] = str(args['audiobooks_dir']) if args['audiobooks_dir'] else None
         session['output_format'] = str(args['output_format'])
+        session['output_channel'] = str(args['output_channel'])
         session['output_split'] = bool(args['output_split'])
         session['output_split_hours'] = args['output_split_hours'] if args['output_split_hours'] is not None else default_output_split_hours
         session['model_cache'] = f"{session['tts_engine']}-{session['fine_tuned']}"
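The single added line threads output_channel from the CLI args into the session, which is what lets the simplified -ac logic in combine_audio_chapters drop its 'output_channel' in session guard. A minimal sketch of the handoff; the wrapper function and argument values are placeholders:

# Hypothetical wrapper around the assignments above, for illustration only.
def apply_output_args(session: dict, args: dict) -> None:
    session['output_format'] = str(args['output_format'])
    # Newly threaded through so combine_audio_chapters can always emit -ac:
    session['output_channel'] = str(args['output_channel'])
    session['output_split'] = bool(args['output_split'])

session, args = {}, {'output_format': 'm4b', 'output_channel': 'mono', 'output_split': False}
apply_output_args(session, args)
assert session['output_channel'] == 'mono'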