This commit is contained in:
unknown
2025-12-30 15:48:49 -08:00
parent 1f1b5308b5
commit c6453a227c
4 changed files with 12 additions and 6 deletions

View File

@@ -8,8 +8,7 @@ ebooklib
fastapi
hf_xet
beautifulsoup4
sudachipy
sudachidict-core
nagisa
pymupdf
pymupdf-layout
pytesseract

View File

@@ -1039,10 +1039,19 @@ def get_sentences(text:str, id:str)->list|None:
jieba.dt.cache_file = os.path.join(models_dir, 'jieba.cache')
result.extend([t for t in jieba.cut(segment) if t.strip()])
elif lang == 'jpn':
"""
from sudachipy import dictionary, tokenizer
sudachi = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C
result.extend([m.surface() for m in sudachi.tokenize(segment, mode) if m.surface().strip()])
"""
import nagisa
tokens = nagisa.tagging(segment).words
result.extend([
token
for token in tokens
if token.strip()
])
elif lang == 'kor':
from soynlp.tokenizer import LTokenizer
ltokenizer = LTokenizer()

View File

@@ -27,8 +27,7 @@ dependencies = [
"fastapi",
"hf_xet",
"beautifulsoup4",
"sudachipy",
"sudachidict-core",
"nagisa",
"pymupdf",
"pymupdf-layout",
"pytesseract",

View File

@@ -8,8 +8,7 @@ ebooklib
fastapi
hf_xet
beautifulsoup4
sudachipy
sudachidict-core
nagisa
pymupdf
pymupdf-layout
pytesseract