mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-07 21:14:06 -05:00
Filter out unusually short chapter blocks using statistical outlier detection.
> This commit adds statistical outlier filtering to the chapter-splitting logic. After chapters are detected, any chapter block that is unusually short (its length falls more than a configurable number of standard deviations below the mean chapter length) is automatically removed. This helps prevent processing of erroneous or junk blocks, resulting in cleaner audiobook output with more consistent chapter quality.
> - The filter uses the mean and standard deviation of chapter lengths to determine outliers.
> - The cutoff is adjustable with the `stddev_factor` parameter.
This commit is contained in:
@@ -18,6 +18,7 @@ import platform
|
||||
import random
|
||||
import shutil
|
||||
import socket
|
||||
import statistics
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
@@ -1666,6 +1667,15 @@ def get_compatible_tts_engines(language):
|
||||
if language in language_tts.get(tts, {})
|
||||
]
|
||||
return compatible_engines
|
||||
# A higher stddev_factor removes fewer chapters (only extremely short ones), while a lower value removes more chapters (including mildly short ones).
|
||||
def filter_short_chapter_outliers(chapter_blocks, stddev_factor=2.0):
    """Drop chapter blocks whose length is a low statistical outlier.

    The length of a chapter is the sum of ``len()`` over its items
    (presumably sentence strings; confirm against ``get_chapters``). A
    chapter is kept when its length is at least
    ``mean - stddev_factor * stdev`` of all chapter lengths, where ``stdev``
    is the sample standard deviation.

    Args:
        chapter_blocks: Sequence of chapters, each an iterable of sized
            items, or ``None`` when no chapters are available.
        stddev_factor: Number of standard deviations below the mean at which
            the cutoff is placed. Higher values remove fewer chapters.

    Returns:
        ``None`` if ``chapter_blocks`` is ``None``; ``chapter_blocks`` itself
        when it holds fewer than two chapters; otherwise a new list with the
        chapters at or above the cutoff, in their original order.
    """
    # The caller tests the chapter list against None only after filtering,
    # so pass None through instead of raising TypeError while iterating.
    if chapter_blocks is None:
        return None

    lengths = [sum(len(item) for item in chapter) for chapter in chapter_blocks]

    # statistics.stdev() needs at least two data points; with zero or one
    # chapter there is nothing to compare against, so keep the input as is.
    if len(lengths) < 2:
        return chapter_blocks

    cutoff = statistics.mean(lengths) - stddev_factor * statistics.stdev(lengths)
    return [
        chapter
        for chapter, length in zip(chapter_blocks, lengths)
        if length >= cutoff
    ]
|
||||
|
||||
def convert_ebook_batch(args, ctx):
|
||||
global context
|
||||
@@ -1862,6 +1872,8 @@ def convert_ebook(args, ctx=None):
|
||||
session['cover'] = get_cover(epubBook, session)
|
||||
if session['cover']:
|
||||
session['toc'], session['chapters'] = get_chapters(epubBook, session)
|
||||
# Filter out unusually short chapter blocks
|
||||
session['chapters'] = filter_short_chapter_outliers(session['chapters'], stddev_factor=2.0)
|
||||
session['final_name'] = get_sanitized(session['metadata']['title'] + '.' + session['output_format'])
|
||||
if session['chapters'] is not None:
|
||||
if convert_chapters2audio(session):
|
||||
|
||||
Reference in New Issue
Block a user