mirror of
https://github.com/DrewThomasson/ebook2audiobook.git
synced 2026-01-10 14:28:15 -05:00
...
This commit is contained in:
@@ -17,7 +17,7 @@ set "PYTHONUTF8=1"
|
||||
set "PYTHONIOENCODING=utf-8"
|
||||
set "CURRENT_ENV="
|
||||
|
||||
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox"
|
||||
set "PROGRAMS_LIST=calibre-normal ffmpeg nodejs espeak-ng sox tesseract"
|
||||
|
||||
set "TMP=%SCRIPT_DIR%\tmp"
|
||||
set "TEMP=%SCRIPT_DIR%\tmp"
|
||||
|
||||
@@ -8,7 +8,9 @@ fi
|
||||
unset SWITCHED_TO_ZSH
|
||||
|
||||
ARCH=$(uname -m)
|
||||
PYTHON_VERSION="3.12"
|
||||
PYTHON_VERSION=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
|
||||
MIN_PYTHON_VERSION="3.10"
|
||||
MAX_PYTHON_VERSION="3.13"
|
||||
|
||||
export PYTHONUTF8="1"
|
||||
export PYTHONIOENCODING="utf-8"
|
||||
@@ -48,7 +50,7 @@ SCRIPT_MODE="$NATIVE"
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
WGET=$(which wget 2>/dev/null)
|
||||
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox")
|
||||
REQUIRED_PROGRAMS=("curl" "calibre" "ffmpeg" "nodejs" "espeak-ng" "rust" "sox" "tesseract")
|
||||
PYTHON_ENV="python_env"
|
||||
CURRENT_ENV=""
|
||||
|
||||
@@ -60,9 +62,6 @@ fi
|
||||
if [[ "$OSTYPE" = "darwin"* ]]; then
|
||||
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-$(uname -m).sh"
|
||||
CONFIG_FILE="$HOME/.zshrc"
|
||||
if [[ "$ARCH" == "x86_64" ]]; then
|
||||
PYTHON_VERSION="3.11"
|
||||
fi
|
||||
elif [[ "$OSTYPE" = "linux"* ]]; then
|
||||
CONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
|
||||
CONFIG_FILE="$HOME/.bashrc"
|
||||
@@ -123,14 +122,20 @@ else
|
||||
local programs=("$@")
|
||||
programs_missing=()
|
||||
for program in "${programs[@]}"; do
|
||||
bin="$program"
|
||||
if [ "$program" = "nodejs" ]; then
|
||||
bin="node"
|
||||
elif [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &> /dev/null; then
|
||||
fi
|
||||
if [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &>/dev/null; then
|
||||
program="rustc"
|
||||
bin="rustc"
|
||||
fi
|
||||
else
|
||||
bin="$program"
|
||||
fi
|
||||
if [ "$program" = "tesseract" ]; then
|
||||
if command -v apt-get &>/dev/null || command -v zypper &>/dev/null || command -v apk &>/dev/null; then
|
||||
program="tesseract-ocr"
|
||||
fi
|
||||
fi
|
||||
if ! command -v "$bin" >/dev/null 2>&1; then
|
||||
echo -e "\e[33m$program is not installed.\e[0m"
|
||||
@@ -219,12 +224,7 @@ else
|
||||
echo "$program installation failed."
|
||||
fi
|
||||
fi
|
||||
elif [ "$program" = "rust" ]; then
|
||||
if command -v apt-get &> /dev/null; then
|
||||
app="rustc"
|
||||
else
|
||||
app="$program"
|
||||
fi
|
||||
elif [[ "$program" = "rust" ] || [ "$program" = "rustc" ]]; then
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
source $HOME/.cargo/env
|
||||
if command -v $app &>/dev/null; then
|
||||
@@ -280,6 +280,15 @@ else
|
||||
fi
|
||||
fi
|
||||
if [[ ! -d "$SCRIPT_DIR/$PYTHON_ENV" ]]; then
|
||||
# Choose the Python version used for the conda environment created below.
#   - darwin + x86_64 is pinned to 3.11.
#   - Otherwise the detected "major.minor" version is clamped into
#     [MIN_PYTHON_VERSION, MAX_PYTHON_VERSION].

# Print an integer sort key (major * 1000 + minor) for a version string of
# the form "major.minor" (any further ".patch" suffix is ignored).
# Arguments: $1 - version string
# Returns:   0 and prints the key; 1 with no output if $1 is not numeric
#            "major.minor".
_python_version_key() {
	local major minor
	[[ "$1" == *.* ]] || return 1
	major="${1%%.*}"
	minor="${1#*.}"
	minor="${minor%%.*}"
	[[ -n "$major" && -n "$minor" ]] || return 1
	case "$major$minor" in
		*[!0-9]*) return 1 ;;
	esac
	# 10# forces base 10 so a leading zero is never parsed as octal.
	printf '%d\n' "$(( 10#$major * 1000 + 10#$minor ))"
}

# Clamp a Python version into an inclusive [min, max] range.
# Versions are compared component-wise as integers, NOT as decimal numbers:
# as decimals 3.9 > 3.10, which would mis-handle every 3.x release below 3.10.
# Arguments: $1 - detected version, $2 - minimum, $3 - maximum
# Outputs:   the clamped version on stdout; $1 is echoed back unchanged when
#            any of the three values cannot be parsed.
clamp_python_version() {
	local version="$1" min="$2" max="$3"
	local v lo hi
	if v=$(_python_version_key "$version") \
		&& lo=$(_python_version_key "$min") \
		&& hi=$(_python_version_key "$max"); then
		if (( v < lo )); then
			version="$min"
		elif (( v > hi )); then
			version="$max"
		fi
	fi
	printf '%s\n' "$version"
}

if [[ "$OSTYPE" == darwin* && "$ARCH" == "x86_64" ]]; then
	PYTHON_VERSION="3.11"
else
	PYTHON_VERSION="$(clamp_python_version "$PYTHON_VERSION" "$MIN_PYTHON_VERSION" "$MAX_PYTHON_VERSION")"
fi
|
||||
# Use this condition to chmod writable folders once
|
||||
chmod -R 777 ./audiobooks ./tmp ./models
|
||||
conda create --prefix "$SCRIPT_DIR/$PYTHON_ENV" python=$PYTHON_VERSION -y
|
||||
|
||||
@@ -7,8 +7,10 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, platform, random, shutil, subprocess, sys, tempfile, threading, time, traceback, socket
|
||||
import warnings, unicodedata, urllib.request, uuid, zipfile, ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, uvicorn, gc
|
||||
import argparse, asyncio, csv, fnmatch, hashlib, io, json, math, os, pytesseract
|
||||
import platform, random, shutil, subprocess, sys, tempfile, threading, time, uvicorn
|
||||
import traceback, socket, warnings, unicodedata, urllib.request, uuid, zipfile, fitz
|
||||
import ebooklib, gradio as gr, psutil, pymupdf4llm, regex as re, requests, stanza, gc
|
||||
|
||||
from soynlp.tokenizer import LTokenizer
|
||||
from pythainlp.tokenize import word_tokenize
|
||||
@@ -32,6 +34,7 @@ from multiprocessing.managers import DictProxy, ListProxy
|
||||
from stanza.pipeline.core import Pipeline
|
||||
from num2words import num2words
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from pydub import AudioSegment
|
||||
from pydub.utils import mediainfo
|
||||
from queue import Queue, Empty
|
||||
@@ -394,7 +397,6 @@ def convert2epub(id:str)->bool:
|
||||
print(error)
|
||||
return False
|
||||
if file_ext == '.pdf':
|
||||
import fitz
|
||||
msg = 'File input is a PDF. flatten it in MarkDown...'
|
||||
print(msg)
|
||||
doc = fitz.open(session['ebook'])
|
||||
@@ -410,6 +412,31 @@ def convert2epub(id:str)->bool:
|
||||
file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
|
||||
with open(file_input, "w", encoding="utf-8") as html_file:
|
||||
html_file.write(markdown_text)
|
||||
|
||||
msg = 'File input is a PDF. flatten it in MarkDown...'
|
||||
print(msg)
|
||||
doc = fitz.open(session['ebook'])
|
||||
pdf_metadata = doc.metadata
|
||||
filename_no_ext = os.path.splitext(os.path.basename(session['ebook']))[0]
|
||||
title = pdf_metadata.get('title') or filename_no_ext
|
||||
author = pdf_metadata.get('author') or False
|
||||
markdown_pages = []
|
||||
for i, page in enumerate(doc):
|
||||
text = page.get_text("markdown").strip()
|
||||
if not text:
|
||||
pix = page.get_pixmap(dpi=300)
|
||||
img = Image.open(io.BytesIO(pix.tobytes("png")))
|
||||
text = pytesseract.image_to_string(img, lang="eng").strip()
|
||||
text = text.replace("\n", " \n")
|
||||
markdown_pages.append(f"## Page {i+1}\n{text}\n")
|
||||
markdown_text = "\n".join(markdown_pages)
|
||||
# Remove single asterisks for italics (but not bold **)
|
||||
markdown_text = re.sub(r'(?<!\*)\*(?!\*)(.*?)\*(?!\*)', r'\1', markdown_text)
|
||||
# Remove single underscores for italics (but not bold __)
|
||||
markdown_text = re.sub(r'(?<!_)_(?!_)(.*?)_(?!_)', r'\1', markdown_text)
|
||||
file_input = os.path.join(session['process_dir'], f'{filename_no_ext}.md')
|
||||
with open(file_input, "w", encoding="utf-8") as html_file:
|
||||
html_file.write(markdown_text)
|
||||
msg = f"Running command: {util_app} {file_input} {session['epub_path']}"
|
||||
print(msg)
|
||||
cmd = [
|
||||
|
||||
@@ -10,8 +10,9 @@ beautifulsoup4
|
||||
fugashi
|
||||
sudachipy
|
||||
sudachidict_core
|
||||
fitz
|
||||
pytesseract
|
||||
unidic
|
||||
pymupdf4llm
|
||||
hangul-romanize
|
||||
indic-nlp-library
|
||||
iso639-lang
|
||||
|
||||
Reference in New Issue
Block a user