mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-09 13:58:14 -05:00
...
This commit is contained in:
@@ -20,10 +20,10 @@ from .conf import (
|
||||
|
||||
from .lang import (
|
||||
abbreviations_mapping, chapter_word_mapping, default_language_code,
|
||||
emojis_array, install_info, language_mapping, language_math_phonemes,
|
||||
language_clock, language_tts, os, punctuation_list, punctuation_list_set,
|
||||
punctuation_split_hard, punctuation_split_hard_set, punctuation_split_soft,
|
||||
punctuation_split_soft_set, punctuation_switch,
|
||||
roman_numbers_tuples, emojis_list, install_info, language_mapping,
|
||||
language_math_phonemes, language_clock, language_tts, os, punctuation_list,
|
||||
punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
|
||||
punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
|
||||
specialchars_mapping, specialchars_remove, year_to_decades_languages
|
||||
)
|
||||
|
||||
@@ -49,9 +49,9 @@ __all__ = [
|
||||
|
||||
# from lang
|
||||
"abbreviations_mapping", "chapter_word_mapping", "default_language_code",
|
||||
"emojis_array", "install_info", "language_mapping",
|
||||
"language_math_phonemes", "language_clock", "language_tts", "os",
|
||||
"punctuation_list", "punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
|
||||
"roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
|
||||
"language_math_phonemes", "language_clock", "language_tts", "os", "punctuation_list",
|
||||
"punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
|
||||
"punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
|
||||
"specialchars_mapping", "specialchars_remove", "year_to_decades_languages"
|
||||
]
|
||||
|
||||
143
lib/functions.py
143
lib/functions.py
@@ -686,9 +686,9 @@ def filter_chapter(doc, lang, lang_iso1, tts_engine, stanza_nlp, is_num2words_co
|
||||
text = text.translate(specialchars_remove_table)
|
||||
text = normalize_text(text, lang, lang_iso1, tts_engine)
|
||||
# Ensure space before and after punctuation_list
|
||||
#pattern_space = re.escape(''.join(punctuation_list))
|
||||
#punctuation_pattern_space = r'(?<!\s)([{}])'.format(pattern_space)
|
||||
#text = re.sub(punctuation_pattern_space, r' \1', text)
|
||||
pattern_space = re.escape(''.join(punctuation_list))
|
||||
punctuation_pattern_space = r'(?<!\s)([{}])'.format(pattern_space)
|
||||
text = re.sub(punctuation_pattern_space, r' \1', text)
|
||||
if tts_engine in [TTS_ENGINES['YOURTTS']]:
|
||||
text = text.replace('—', ' - ')
|
||||
return get_sentences(text, lang, tts_engine)
|
||||
@@ -1113,123 +1113,40 @@ def math2words(text, lang, lang_iso1, tts_engine, is_num2words_compat):
|
||||
text = re.sub(ambiguous_pattern, replace_ambiguous, text)
|
||||
text = set_formatted_number(text, lang, lang_iso1, is_num2words_compat)
|
||||
return text
|
||||
|
||||
def roman2number(text, lang, re_non_ws, re_title_num, re_punct, re_insert):
|
||||
if re_non_ws.search(text):
|
||||
m = re_title_num.match(text)
|
||||
if m:
|
||||
num = m.group(1)
|
||||
if num.isdigit() or (set(num) <= set("IVXLCDM")):
|
||||
if not re_punct.match(text):
|
||||
text = re_insert.sub(r'\1 — ', text)
|
||||
# heck for a standalone Roman numeral + dot or dash
|
||||
stripped = text.strip()
|
||||
m = re.fullmatch(r'(?i)([IVXLCDM]+)([.-])', stripped)
|
||||
if m:
|
||||
roman, sep = m.group(1), m.group(2)
|
||||
# convert
|
||||
try:
|
||||
roman_map = {
|
||||
'I':1,'V':5,'X':10,'L':50,'C':100,
|
||||
'D':500,'M':1000,
|
||||
'IV':4,'IX':9,'XL':40,'XC':90,
|
||||
'CD':400,'CM':900
|
||||
}
|
||||
i = 0
|
||||
num = 0
|
||||
s = roman.upper()
|
||||
while i < len(s):
|
||||
if i+1 < len(s) and s[i:i+2] in roman_map:
|
||||
num += roman_map[s[i:i+2]]
|
||||
i += 2
|
||||
else:
|
||||
num += roman_map.get(s[i], 0)
|
||||
i += 1
|
||||
if num > 0:
|
||||
return f"{num}{sep}"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Helper: convert a pure Roman string to int or return original
|
||||
def to_num(s):
|
||||
try:
|
||||
roman_map = {
|
||||
'I':1,'V':5,'X':10,'L':50,'C':100,
|
||||
'D':500,'M':1000,
|
||||
'IV':4,'IX':9,'XL':40,'XC':90,
|
||||
'CD':400,'CM':900
|
||||
}
|
||||
i = 0
|
||||
num = 0
|
||||
s_up = s.upper()
|
||||
while i < len(s_up):
|
||||
if i+1 < len(s_up) and s_up[i:i+2] in roman_map:
|
||||
num += roman_map[s_up[i:i+2]]
|
||||
i += 2
|
||||
else:
|
||||
num += roman_map.get(s_up[i], 0)
|
||||
i += 1
|
||||
return num if num > 0 else s
|
||||
except Exception:
|
||||
return s
|
||||
def roman2number(text):
|
||||
|
||||
# Chapter‐word + Roman
|
||||
def to_match(m):
|
||||
cw, rn = m.group(1), m.group(2)
|
||||
val = to_num(rn.upper())
|
||||
return f"{cw.capitalize()} {val}; " if isinstance(val, int) else m.group(0)
|
||||
def to_int(s):
|
||||
s = s.upper()
|
||||
i, result = 0, 0
|
||||
while i < len(s):
|
||||
for roman, value in roman_numbers_tuples:
|
||||
if s[i:i+len(roman)] == roman:
|
||||
result += value
|
||||
i += len(roman)
|
||||
break
|
||||
else:
|
||||
return s # Not valid roman
|
||||
return result
|
||||
|
||||
# Trailing‐period at start‐of‐line
|
||||
def clean_numbers(m):
|
||||
raw = m.group(0) # e.g. "IV..."
|
||||
core = re.sub(r'[^IVXLCDM]', '', raw.upper())
|
||||
sep = raw[len(core):] # the dots
|
||||
val = to_num(core)
|
||||
return f"{val}{sep}" if isinstance(val, int) else raw
|
||||
def repl_heading(m):
|
||||
val = to_int(m.group(1))
|
||||
return f"{val}{m.group(2)}{m.group(3)}" if isinstance(val, int) else m.group(0)
|
||||
text = re.sub(r'^([IVXLCDM]+)([.-])(\s+)', repl_heading, text)
|
||||
|
||||
# Bare Roman at start‐of‐line + "." or "-"
|
||||
def clean_start(m):
|
||||
raw = m.group(0) # e.g. "VI - "
|
||||
core = re.sub(r'[^IVXLCDM]', '', raw.upper())
|
||||
sep = raw[len(core):] # the ". " or " - "
|
||||
val = to_num(core)
|
||||
return f"{val}{sep}" if isinstance(val, int) else raw
|
||||
def repl_standalone(m):
|
||||
val = to_int(m.group(1))
|
||||
return f"{val}{m.group(2)}" if isinstance(val, int) else m.group(0)
|
||||
text = re.sub(r'^([IVXLCDM]+)([.-])$', repl_standalone, text)
|
||||
|
||||
# Convert ALL ALL-UPPERCASE Roman numerals as whole words anywhere in text
|
||||
def bare_roman(m):
|
||||
roman = m.group(0)
|
||||
if roman.isupper():
|
||||
val = to_num(roman)
|
||||
return str(val) if isinstance(val, int) else roman
|
||||
return roman
|
||||
def repl_word(m):
|
||||
val = to_int(m.group(0))
|
||||
return str(val) if isinstance(val, int) else m.group(0)
|
||||
text = re.sub(r'\b[IVXLCDM]{2,}\b', repl_word, text)
|
||||
|
||||
words = chapter_word_mapping.get(lang, [])
|
||||
wp = "|".join(re.escape(w) for w in words) or r'(?!x)x' # if empty, use impossible pattern
|
||||
p1 = re.compile(
|
||||
rf'\b({wp})\s+(?=[IVXLCDM])'
|
||||
r'((?:M{0,3})(?:CM|CD|D?C{0,3})'
|
||||
r'(?:XC|XL|L?X{0,3})?(?:IX|IV|V?I{0,3}))\b',
|
||||
re.IGNORECASE
|
||||
)
|
||||
p2 = re.compile(
|
||||
r'(?m)^(?=[IVXLCDM])'
|
||||
r'(?:M{0,3})(?:CM|CD|D?C{0,3})'
|
||||
r'(?:XC|XL|L?X{0,3})?(?:IX|IV|V?I{0,3})\.+',
|
||||
re.IGNORECASE
|
||||
)
|
||||
p3 = re.compile(
|
||||
r'(?m)^(?=[IVXLCDM])'
|
||||
r'(?:M{0,3})(?:CM|CD|D?C{0,3})'
|
||||
r'(?:XC|XL|L?X{0,3})?(?:IX|IV|V?I{0,3})'
|
||||
r'(?P<sep>\.|\s*-\s*)',
|
||||
re.IGNORECASE
|
||||
)
|
||||
text = p1.sub(to_match, text)
|
||||
text = p2.sub(clean_numbers, text)
|
||||
text = p3.sub(clean_start, text)
|
||||
text = re.sub(r'\b[IVXLCDM]{2,}\b', bare_roman, text)
|
||||
return text
|
||||
|
||||
|
||||
def filter_sml(text):
|
||||
for key, value in TTS_SML.items():
|
||||
pattern = re.escape(key) if key == '###' else r'\[' + re.escape(key) + r'\]'
|
||||
@@ -1238,7 +1155,7 @@ def filter_sml(text):
|
||||
|
||||
def normalize_text(text, lang, lang_iso1, tts_engine):
|
||||
# Remove emojis
|
||||
emoji_pattern = re.compile(f"[{''.join(emojis_array)}]+", flags=re.UNICODE)
|
||||
emoji_pattern = re.compile(f"[{''.join(emojis_list)}]+", flags=re.UNICODE)
|
||||
emoji_pattern.sub('', text)
|
||||
if lang in abbreviations_mapping:
|
||||
def _replace_abbreviations(match: re.Match) -> str:
|
||||
|
||||
@@ -156,7 +156,13 @@ punctuation_split_soft = [
|
||||
]
|
||||
punctuation_split_soft_set = set(punctuation_split_soft)
|
||||
|
||||
emojis_array = [
|
||||
roman_numbers_tuples = [
|
||||
('M', 1000), ('CM', 900), ('D', 500), ('CD', 400),
|
||||
('C', 100), ('XC', 90), ('L', 50), ('XL', 40),
|
||||
('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1)
|
||||
]
|
||||
|
||||
emojis_list = [
|
||||
r"\U0001F600-\U0001F64F", # Emoticons
|
||||
r"\U0001F300-\U0001F5FF", # Symbols & pictographs
|
||||
r"\U0001F680-\U0001F6FF", # Transport & map symbols
|
||||
|
||||
Reference in New Issue
Block a user