Filter out unusually short chapter blocks using statistical outlier detection.

> This commit adds statistical outlier filtering to the chapter splitting logic. After chapters are detected, any chapter block that is unusually short (based on a configurable standard deviation threshold) is automatically removed. This helps prevent processing of erroneous or junk blocks, resulting in cleaner audiobook output with more consistent chapter quality.

- The filter uses the mean and standard deviation of chapter lengths to determine outliers: a chapter is removed when its total length falls below `mean - stddev_factor * stdev`.
- The cutoff is adjustable with the `stddev_factor` parameter.
This commit is contained in:
Drew Thomasson
2025-07-25 11:21:58 -04:00
committed by GitHub
parent 733a0caced
commit d39bca35e9

View File

@@ -18,6 +18,7 @@ import platform
import random
import shutil
import socket
import statistics
import subprocess
import sys
import tempfile
@@ -1666,6 +1667,15 @@ def get_compatible_tts_engines(language):
if language in language_tts.get(tts, {})
]
return compatible_engines
def filter_short_chapter_outliers(chapter_blocks, stddev_factor=2.0):
    """Drop chapter blocks whose total length is an unusually short outlier.

    The length of a chapter is the sum of ``len()`` over its items. A chapter
    is kept when its length is at least ``mean - stddev_factor * stdev``,
    where ``mean`` and ``stdev`` (sample standard deviation) are computed
    over the lengths of all chapters.

    A higher ``stddev_factor`` removes fewer chapters (only extremely short
    ones), while a lower value removes more chapters (including mildly short
    ones).

    Args:
        chapter_blocks: Sequence of chapters, each an iterable of sized items
            (e.g. strings). May be ``None`` or empty.
        stddev_factor: Number of standard deviations below the mean at which
            the cutoff is placed.

    Returns:
        A new list with the short outliers removed, or ``chapter_blocks``
        itself (unchanged) when it is ``None``, empty, or holds fewer than
        two chapters.
    """
    # None or empty input: nothing to measure. Returning it untouched lets a
    # caller that checks for None after filtering keep working instead of
    # hitting a TypeError here.
    if not chapter_blocks:
        return chapter_blocks
    lengths = [sum(len(s) for s in chapter) for chapter in chapter_blocks]
    # statistics.stdev() requires at least two data points.
    if len(lengths) < 2:
        return chapter_blocks
    mean = statistics.mean(lengths)
    stdev = statistics.stdev(lengths)
    cutoff = mean - stddev_factor * stdev
    return [
        chapter
        for chapter, length in zip(chapter_blocks, lengths)
        if length >= cutoff
    ]
def convert_ebook_batch(args, ctx):
global context
@@ -1862,6 +1872,8 @@ def convert_ebook(args, ctx=None):
session['cover'] = get_cover(epubBook, session)
if session['cover']:
session['toc'], session['chapters'] = get_chapters(epubBook, session)
# Filter out unusually short chapter blocks
session['chapters'] = filter_short_chapter_outliers(session['chapters'], stddev_factor=2.0)
session['final_name'] = get_sanitized(session['metadata']['title'] + '.' + session['output_format'])
if session['chapters'] is not None:
if convert_chapters2audio(session):