diff --git a/ebook2audiobook.egg-info/requires.txt b/ebook2audiobook.egg-info/requires.txt
index 68e5da5f..a7678966 100644
--- a/ebook2audiobook.egg-info/requires.txt
+++ b/ebook2audiobook.egg-info/requires.txt
@@ -8,8 +8,7 @@ ebooklib
 fastapi
 hf_xet
 beautifulsoup4
-sudachipy
-sudachidict-core
+nagisa
 pymupdf
 pymupdf-layout
 pytesseract
diff --git a/lib/core.py b/lib/core.py
index 92b0152a..58c78c6b 100644
--- a/lib/core.py
+++ b/lib/core.py
@@ -1039,10 +1039,19 @@ def get_sentences(text:str, id:str)->list|None:
             jieba.dt.cache_file = os.path.join(models_dir, 'jieba.cache')
             result.extend([t for t in jieba.cut(segment) if t.strip()])
         elif lang == 'jpn':
+            """
             from sudachipy import dictionary, tokenizer
             sudachi = dictionary.Dictionary().create()
             mode = tokenizer.Tokenizer.SplitMode.C
             result.extend([m.surface() for m in sudachi.tokenize(segment, mode) if m.surface().strip()])
+            """
+            import nagisa
+            tokens = nagisa.tagging(segment).words
+            result.extend([
+                token
+                for token in tokens
+                if token.strip()
+            ])
         elif lang == 'kor':
             from soynlp.tokenizer import LTokenizer
             ltokenizer = LTokenizer()
diff --git a/pyproject.toml b/pyproject.toml
index eb8dc745..7fea2c75 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,8 +27,7 @@ dependencies = [
     "fastapi",
     "hf_xet",
     "beautifulsoup4",
-    "sudachipy",
-    "sudachidict-core",
+    "nagisa",
     "pymupdf",
     "pymupdf-layout",
     "pytesseract",
diff --git a/requirements.txt b/requirements.txt
index 4dd22fe5..02a7e0cd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,8 +8,7 @@ ebooklib
 fastapi
 hf_xet
 beautifulsoup4
-sudachipy
-sudachidict-core
+nagisa
 pymupdf
 pymupdf-layout
 pytesseract
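
This change replaces the SudachiPy/sudachidict-core pair with nagisa as the Japanese word segmenter in `get_sentences`, and updates the dependency lists to match. Below is a minimal standalone sketch of the new tokenization path; the sample sentence is made up for illustration, and only the `nagisa.tagging(...).words` call mirrors the code added in `lib/core.py`.

```python
# Standalone sketch of the nagisa-based Japanese tokenization used above.
# Requires `pip install nagisa`; the sample segment is illustrative only.
import nagisa

segment = "これはテスト用の文章です。"  # hypothetical input segment
tokens = nagisa.tagging(segment).words                  # word segmentation
result = [token for token in tokens if token.strip()]   # drop whitespace-only tokens
print(result)
```

Because nagisa ships with its own bundled model rather than a separate dictionary package, `sudachidict-core` can be dropped from the requirements along with `sudachipy`.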