This commit is contained in:
unknown
2025-11-07 14:31:22 -08:00
parent f2ea8f4bee
commit 1a50a125e5
4 changed files with 57 additions and 20 deletions

View File

@@ -17,7 +17,7 @@ set "PYTHONUTF8=1"
set "PYTHONIOENCODING=utf-8"
set "CURRENT_ENV="
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox"
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox tesseract"
set "TMP=%SCRIPT_DIR%\tmp"
set "TEMP=%SCRIPT_DIR%\tmp"

View File

@@ -8,7 +8,9 @@ fi
unset SWITCHED_TO_ZSH
ARCH=$(uname -m)
PYTHON_VERSION="3.12"
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
MIN_PYTHON_VERSION="3.10"
MAX_PYTHON_VERSION="3.13"
export PYTHONUTF8="1"
export PYTHONIOENCODING="utf-8"
@@ -48,7 +50,7 @@ SCRIPT_MODE="$NATIVE"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WGET=$(which wget 2>/dev/null)
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox")
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox" "tesseract")
PYTHON_ENV="python_env"
CURRENT_ENV=""
@@ -60,9 +62,6 @@ fi
if [[ "$OSTYPE" = "darwin"* ]]; then
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-$(uname -m).sh"
CONFIG_FILE="$HOME/.zshrc"
if [[ "$ARCH" == "x86_64" ]]; then
PYTHON_VERSION="3.11"
fi
elif [[ "$OSTYPE" = "linux"* ]]; then
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
CONFIG_FILE="$HOME/.bashrc"
@@ -123,14 +122,20 @@ else
local programs=("$@")
programs_missing=()
for program in "${programs[@]}"; do
bin="$program"
if [ "$program" = "nodejs" ]; then
bin="node"
elif [ "$program" = "rust" ]; then
if command -v apt-get &> /dev/null; then
fi
if [ "$program" = "rust" ]; then
if command -v apt-get &>/dev/null; then
program="rustc"
bin="rustc"
fi
else
bin="$program"
fi
if [ "$program" = "tesseract" ]; then
if command -v apt-get &>/dev/null || command -v zypper &>/dev/null || command -v apk &>/dev/null; then
program="tesseract-ocr"
fi
fi
if ! command -v "$bin" >/dev/null 2>&1; then
echo -e "\e[33m$program is not installed.\e[0m"
@@ -219,12 +224,7 @@ else
echo "$program installation failed."
fi
fi
elif [ "$program" = "rust" ]; then
if command -v apt-get &> /dev/null; then
app="rustc"
else
app="$program"
fi
elif [[ "$program" = "rust" || "$program" = "rustc" ]]; then
# Both the generic name and the apt-style name are installed through rustup.
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
# Probe for rustc directly: this branch does not assign $app, so testing it
# would check an empty or stale value.
if command -v rustc &>/dev/null; then
@@ -280,6 +280,15 @@ else
fi
fi
if [[ ! -d "$SCRIPT_DIR/$PYTHON_ENV" ]]; then
if [[ "$OSTYPE" = "darwin"* && "$ARCH" == "x86_64" ]]; then
# Intel macOS is pinned to 3.11.
PYTHON_VERSION="3.11"
else
# Clamp the detected interpreter version into
# [MIN_PYTHON_VERSION, MAX_PYTHON_VERSION]. Versions are compared as
# integers (major*1000 + minor) because a decimal comparison reads 3.9 as
# larger than 3.10.
py_major="${PYTHON_VERSION%%.*}"
py_minor="${PYTHON_VERSION#*.}"
py_minor="${py_minor%%.*}"
if [[ ! "$py_major" =~ ^[0-9]+$ || ! "$py_minor" =~ ^[0-9]+$ ]]; then
# Detection produced nothing usable (e.g. python3 is not installed yet).
PYTHON_VERSION="$MAX_PYTHON_VERSION"
else
py_num=$(( py_major * 1000 + py_minor ))
min_num=$(( ${MIN_PYTHON_VERSION%%.*} * 1000 + ${MIN_PYTHON_VERSION#*.} ))
max_num=$(( ${MAX_PYTHON_VERSION%%.*} * 1000 + ${MAX_PYTHON_VERSION#*.} ))
if (( py_num < min_num )); then
PYTHON_VERSION="$MIN_PYTHON_VERSION"
elif (( py_num > max_num )); then
PYTHON_VERSION="$MAX_PYTHON_VERSION"
fi
fi
fi
# Use this condition to chmod writable folders once
chmod -R 777 ./audiobooks ./tmp ./models
conda create --prefix "$SCRIPT_DIR/$PYTHON_ENV" python=$PYTHON_VERSION -y

View File

@@ -7,8 +7,10 @@
from __future__ import annotations
import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, platform, random, shutil, subprocess, sys, tempfile, threading, time, traceback, socket
import warnings, unicodedata, urllib.request, uuid, zipfile, ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, uvicorn, gc
import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, pytesseract
import platform, random, shutil, subprocess, sys, tempfile, threading, time, uvicorn
import traceback, socket, warnings, unicodedata, urllib.request, uuid, zipfile, fitz
import ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, gc
from soynlp.tokenizer import LTokenizer
from pythainlp.tokenize import word_tokenize
@@ -32,6 +34,7 @@ from multiprocessing.managers import DictProxy, ListProxy
from stanza.pipeline.core import Pipeline
from num2words import num2words
from pathlib import Path
from PIL import Image
from pydub import AudioSegment
from pydub.utils import mediainfo
from queue import Queue, Empty
@@ -394,7 +397,6 @@ def convert2epub(id:str)->bool:
print(error)
return False
if file_ext == '.pdf':
import fitz
msg = 'File input is a PDF. flatten it in MarkDown...'
print(msg)
doc = fitz.open(session['ebook'])
@@ -410,6 +412,31 @@ def convert2epub(id:str)->bool:
file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
with open(file_input, "w", encoding="utf-8") as html_file:
html_file.write(markdown_text)
# Convert the PDF to markdown page by page; a page with no extractable text
# is rendered to an image and passed through Tesseract OCR instead.
msg = 'File input is a PDF. flatten it in MarkDown...'
print(msg)
# NOTE(review): no doc.close() is visible in this section - confirm the
# document handle is released elsewhere.
doc = fitz.open(session['ebook'])
pdf_metadata = doc.metadata
filename_no_ext = os.path.splitext(os.path.basename(session['ebook']))[0]
# Fall back to the file name when the PDF carries no title; author becomes
# False when absent.
title = pdf_metadata.get('title') or filename_no_ext
author = pdf_metadata.get('author') or False
markdown_pages = []
for i, page in enumerate(doc):
# NOTE(review): "markdown" does not appear among the output formats listed
# in PyMuPDF's Page.get_text documentation - confirm what this returns.
text = page.get_text("markdown").strip()
if not text:
# Empty text layer: rasterize the page at 300 dpi and OCR the PNG bytes.
pix = page.get_pixmap(dpi=300)
img = Image.open(io.BytesIO(pix.tobytes("png")))
# NOTE(review): OCR language is fixed to English here regardless of the
# book's language - confirm this is intended.
text = pytesseract.image_to_string(img, lang="eng").strip()
text = text.replace("\n", " \n")
# Each page becomes its own second-level heading, numbered from 1.
markdown_pages.append(f"## Page {i+1}\n{text}\n")
markdown_text = "\n".join(markdown_pages)
# Remove single asterisks for italics (but not bold **)
markdown_text = re.sub(r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)', r'\1', markdown_text)
# Remove single underscores for italics (but not bold __)
markdown_text = re.sub(r'(?<!_)_(?!_)(.*?)_(?!_)', r'\1', markdown_text)
# The markdown is written into the session's process directory as
# <pdf name>.md (the handle is named html_file but holds markdown).
file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
with open(file_input, "w", encoding="utf-8") as html_file:
html_file.write(markdown_text)
msg = f"Running command: {util_app} {file_input} {session['epub_path']}"
print(msg)
cmd = [

View File

@@ -10,8 +10,9 @@ beautifulsoup4
fugashi
sudachipy
sudachidict_core
PyMuPDF
pytesseract
unidic
pymupdf4llm
hangul-romanize
indic-nlp-library
iso639-lang