...

2026-01-10 14:28:15 -05:00 · 2025-11-07 14:31:22 -08:00
parent f2ea8f4bee
commit 1a50a125e5
4 changed files with 57 additions and 20 deletions
--- a/ebook2audiobook.cmd
+++ b/ebook2audiobook.cmd
@@ -17,7 +17,7 @@ set "PYTHONUTF8=1"
 set "PYTHONIOENCODING=utf-8"
 set "CURRENT_ENV="

-set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox"
+set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox tesseract"

 set "TMP=%SCRIPT_DIR%\tmp"
 set "TEMP=%SCRIPT_DIR%\tmp"
--- a/ebook2audiobook.sh
+++ b/ebook2audiobook.sh
@@ -8,7 +8,9 @@ fi
 unset SWITCHED_TO_ZSH

 ARCH=$(uname -m)
-PYTHON_VERSION="3.12"
+PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")' 2>/dev/null || echo "3.12") # host interpreter version; falls back to the former default when python3 is missing or fails
+MIN_PYTHON_VERSION="3.10" # lower bound applied before the conda env is created
+MAX_PYTHON_VERSION="3.13" # upper bound applied before the conda env is created

 export PYTHONUTF8="1"
 export PYTHONIOENCODING="utf-8"
@@ -48,7 +50,7 @@ SCRIPT_MODE="$NATIVE"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

 WGET=$(which wget 2>/dev/null)
-REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox")
+REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox" "tesseract") # tesseract: OCR engine used through pytesseract for PDF pages that yield no text
 PYTHON_ENV="python_env"
 CURRENT_ENV=""

@@ -60,9 +62,6 @@ fi
 if [[ "$OSTYPE" = "darwin"* ]]; then
 	CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-$(uname -m).sh"
 	CONFIG_FILE="$HOME/.zshrc"
-	if [[ "$ARCH" == "x86_64" ]]; then
-		PYTHON_VERSION="3.11"
-	fi
 elif [[ "$OSTYPE" = "linux"* ]]; then
 	CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 	CONFIG_FILE="$HOME/.bashrc"
@@ -123,14 +122,20 @@ else
 		local programs=("$@")
 		programs_missing=()
 		for program in "${programs[@]}"; do
+			bin="$program" # default: probe for an executable named exactly like the program
 			if [ "$program" = "nodejs" ]; then
 				bin="node"
-			elif [ "$program" = "rust" ]; then
-				if command -v apt-get &> /dev/null; then
+			fi
+			if [ "$program" = "rust" ]; then # NOTE(review): without apt-get, bin stays "rust" — confirm an executable of that name exists (the compiler is normally "rustc")
+				if command -v apt-get &>/dev/null; then
+					program="rustc" # from here on the program is reported as "rustc" (see the "not installed" message below)
 					bin="rustc"
 				fi
-			else
-				bin="$program"
+			fi
+			if [ "$program" = "tesseract" ]; then
+				if command -v apt-get &>/dev/null || command -v zypper &>/dev/null || command -v apk &>/dev/null; then
+					program="tesseract-ocr" # only the reported name changes; bin was already set to "tesseract" for the probe below
+				fi
 			fi
 			if ! command -v "$bin" >/dev/null 2>&1; then
 				echo -e "\e[33m$program is not installed.\e[0m"
@@ -219,12 +224,8 @@ else
 						echo "$program installation failed."
 					fi
 				fi	
-			elif [ "$program" = "rust" ]; then
-				if command -v apt-get &> /dev/null; then
-					app="rustc"
-				else
-					app="$program"
-				fi
+			elif [[ "$program" = "rust" || "$program" = "rustc" ]]; then
+				app="rustc" # the post-install check below probes $app, which the removed lines used to set
 				curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 				source $HOME/.cargo/env
 				if command -v $app &>/dev/null; then
@@ -280,6 +281,14 @@ else
 			fi
 		fi
 		if [[ ! -d "$SCRIPT_DIR/$PYTHON_ENV" ]]; then
+			py_version_key() { local v="${1:-0.0}"; echo $(( ${v%%.*} * 100 + ${v#*.} )); } # major*100+minor as an integer: a float compare reads "3.10" as 3.1 and ranks 3.9 above it
+			if [[ "$OSTYPE" = "darwin"* && "$ARCH" == "x86_64" ]]; then
+				PYTHON_VERSION="3.11"
+			elif (( $(py_version_key "$PYTHON_VERSION") < $(py_version_key "$MIN_PYTHON_VERSION") )); then
+				PYTHON_VERSION="$MIN_PYTHON_VERSION" # also reached when PYTHON_VERSION is empty (key 0)
+			elif (( $(py_version_key "$PYTHON_VERSION") > $(py_version_key "$MAX_PYTHON_VERSION") )); then
+				PYTHON_VERSION="$MAX_PYTHON_VERSION"
+			fi
 			# Use this condition to chmod writable folders once
 			chmod -R 777 ./audiobooks ./tmp ./models
 			conda create --prefix "$SCRIPT_DIR/$PYTHON_ENV" python=$PYTHON_VERSION -y
--- a/lib/functions.py
+++ b/lib/functions.py
@@ -7,8 +7,10 @@

 from __future__ import annotations

-import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, platform, random, shutil, subprocess, sys, tempfile, threading, time, traceback, socket
-import warnings, unicodedata, urllib.request, uuid, zipfile, ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, uvicorn, gc
+import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, pytesseract
+import platform, random, shutil, subprocess, sys, tempfile, threading, time, uvicorn
+import traceback, socket, warnings, unicodedata, urllib.request, uuid, zipfile, fitz
+import ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, gc

 from soynlp.tokenizer import LTokenizer
 from pythainlp.tokenize import word_tokenize
@@ -32,6 +34,7 @@ from multiprocessing.managers import DictProxy, ListProxy
 from stanza.pipeline.core import Pipeline
 from num2words import num2words
 from pathlib import Path
+from PIL import Image
 from pydub import AudioSegment
 from pydub.utils import mediainfo
 from queue import Queue, Empty
@@ -394,7 +397,6 @@ def convert2epub(id:str)->bool:
            print(error)
            return False
        if file_ext == '.pdf':
-            import fitz
            msg = 'File input is a PDF. flatten it in MarkDown...'
            print(msg)
            doc = fitz.open(session['ebook'])
@@ -410,6 +412,31 @@ def convert2epub(id:str)->bool:
             file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
             with open(file_input, "w", encoding="utf-8") as html_file:
                 html_file.write(markdown_text)
+            # NOTE(review): this block re-opens the PDF, rebuilds markdown_text page by page (with an OCR fallback) and overwrites the .md file written just above — confirm the earlier conversion is still needed.
+            msg = 'File input is a PDF. flatten it in MarkDown...'
+            print(msg)
+            doc = fitz.open(session['ebook'])
+            pdf_metadata = doc.metadata
+            filename_no_ext = os.path.splitext(os.path.basename(session['ebook']))[0]
+            title = pdf_metadata.get('title') or filename_no_ext
+            author = pdf_metadata.get('author') or False
+            markdown_pages = []
+            for i, page in enumerate(doc):
+                text = page.get_text("text").strip()  # PyMuPDF defines no "markdown" option for get_text; request plain text explicitly
+                if not text:  # nothing extractable on this page: render it and run OCR instead
+                    pix = page.get_pixmap(dpi=300)
+                    img = Image.open(io.BytesIO(pix.tobytes("png")))
+                    text = pytesseract.image_to_string(img, lang="eng").strip()  # NOTE(review): OCR language is fixed to English — confirm for non-English books
+                    text = text.replace("\n", "  \n")  # two trailing spaces = Markdown hard line break
+                markdown_pages.append(f"## Page {i+1}\n{text}\n")
+            markdown_text = "\n".join(markdown_pages)
+            # Remove single asterisks for italics (but not bold **)
+            markdown_text = re.sub(r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)', r'\1', markdown_text)
+            # Remove single underscores for italics (but not bold __)
+            markdown_text = re.sub(r'(?<!_)_(?!_)(.*?)_(?!_)', r'\1', markdown_text)
+            file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
+            with open(file_input, "w", encoding="utf-8") as html_file:
+                html_file.write(markdown_text)
         msg = f"Running command: {util_app} {file_input} {session['epub_path']}"
         print(msg)
         cmd = [
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,8 +10,10 @@ beautifulsoup4
 fugashi
 sudachipy
 sudachidict_core
+PyMuPDF  # provides the "fitz" module; the PyPI project named "fitz" is a different package
+pytesseract
 unidic
 pymupdf4llm
 hangul-romanize
 indic-nlp-library
 iso639-lang