mirror of https://github.com/DrewThomasson/ebook2audiobook.git
@@ -8,8 +8,7 @@ ebooklib
 fastapi
 hf_xet
 beautifulsoup4
-sudachipy
-sudachidict-core
+nagisa
 pymupdf
 pymupdf-layout
 pytesseract
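Not part of the commit: sudachipy's core dictionary ships as the separate sudachidict-core package, which is why the two lines leave together, while nagisa bundles its own pretrained model. A minimal Python sanity check that the swap resolved in a given environment:

# Hedged sanity check, not part of the commit: confirm the replacement
# package imports, and flag the retired ones if they linger.
import importlib.util

assert importlib.util.find_spec("nagisa") is not None, "nagisa missing"
for retired in ("sudachipy", "sudachidict_core"):
    if importlib.util.find_spec(retired) is not None:
        print(f"note: {retired} is still installed but no longer required")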
@@ -1039,10 +1039,19 @@ def get_sentences(text:str, id:str)->list|None:
             jieba.dt.cache_file = os.path.join(models_dir, 'jieba.cache')
             result.extend([t for t in jieba.cut(segment) if t.strip()])
         elif lang == 'jpn':
+            """
             from sudachipy import dictionary, tokenizer
             sudachi = dictionary.Dictionary().create()
             mode = tokenizer.Tokenizer.SplitMode.C
             result.extend([m.surface() for m in sudachi.tokenize(segment, mode) if m.surface().strip()])
+            """
+            import nagisa
+            tokens = nagisa.tagging(segment).words
+            result.extend([
+                token
+                for token in tokens
+                if token.strip()
+            ])
         elif lang == 'kor':
             from soynlp.tokenizer import LTokenizer
             ltokenizer = LTokenizer()
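For context (not part of the commit): a minimal, self-contained sketch of the two tokenizers the rewritten branch relies on, assuming nagisa and soynlp are installed; the sample sentences are hypothetical.

# Japanese: nagisa replaces the sudachipy path that the commit comments out.
# tagging() returns an object whose .words attribute is the token list.
import nagisa
from soynlp.tokenizer import LTokenizer

jpn_segment = "これはテストです"      # hypothetical sample input
kor_segment = "이것은 테스트입니다"   # hypothetical sample input

jpn_tokens = [t for t in nagisa.tagging(jpn_segment).words if t.strip()]

# Korean: the pre-existing branch builds an unsupervised L-part tokenizer;
# without a score table it effectively falls back to whitespace splits.
kor_tokens = [t for t in LTokenizer().tokenize(kor_segment) if t.strip()]

print(jpn_tokens)   # e.g. ['これ', 'は', 'テスト', 'です']
print(kor_tokens)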
@@ -27,8 +27,7 @@ dependencies = [
     "fastapi",
     "hf_xet",
     "beautifulsoup4",
-    "sudachipy",
-    "sudachidict-core",
+    "nagisa",
     "pymupdf",
     "pymupdf-layout",
     "pytesseract",