Mirror of https://github.com/simstudioai/sim.git, synced 2026-01-09 15:07:55 -05:00
fix(build): consolidate pdf parsing dependencies, remove extraneous html deps (#1212)
* fix(build): consolidate pdf parsing dependencies, remove extraneous html deps
* add types
@@ -553,22 +553,11 @@ function handleGenericBuffer(
  */
 async function parseBufferAsPdf(buffer: Buffer) {
   try {
-    try {
-      const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
-      const parser = new PdfParser()
-      logger.info('Using main PDF parser for buffer')
+    const { PdfParser } = await import('@/lib/file-parsers/pdf-parser')
+    const parser = new PdfParser()
+    logger.info('Using main PDF parser for buffer')
 
-      if (parser.parseBuffer) {
-        return await parser.parseBuffer(buffer)
-      }
-      throw new Error('PDF parser does not support buffer parsing')
-    } catch (error) {
-      logger.warn('Main PDF parser failed, using raw parser for buffer:', error)
-      const { RawPdfParser } = await import('@/lib/file-parsers/raw-pdf-parser')
-      const rawParser = new RawPdfParser()
-
-      return await rawParser.parseBuffer(buffer)
-    }
+    return await parser.parseBuffer(buffer)
   } catch (error) {
     throw new Error(`PDF parsing failed: ${(error as Error).message}`)
   }
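A quick sketch of what callers see after this consolidation (the caller shape below is assumed, not part of the diff): any parse failure now surfaces as a single wrapped Error instead of silently degrading to the raw parser.

// Hypothetical caller of the consolidated helper above
async function handlePdfUpload(fileBuffer: Buffer) {
  try {
    const result = await parseBufferAsPdf(fileBuffer)
    console.log(result.content.slice(0, 200))
  } catch (error) {
    // Message reads `PDF parsing failed: <underlying cause>`
    console.error((error as Error).message)
  }
}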
@@ -141,17 +141,6 @@ describe('File Parsers', () => {
     })),
   }))
 
-  vi.doMock('@/lib/file-parsers/raw-pdf-parser', () => ({
-    RawPdfParser: vi.fn().mockImplementation(() => ({
-      parseFile: vi.fn().mockResolvedValue({
-        content: 'Raw parsed PDF content',
-        metadata: {
-          pageCount: 3,
-        },
-      }),
-    })),
-  }))
-
   vi.doMock('@/lib/file-parsers/txt-parser', () => ({
     TxtParser: vi.fn().mockImplementation(() => ({
       parseFile: mockTxtParseFile,
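With the raw-parser mock gone, only the main parser still needs stubbing. A sketch of the surviving vi.doMock shape (the mocked return values are illustrative, mirroring the deleted block above):

import { vi } from 'vitest'

vi.doMock('@/lib/file-parsers/pdf-parser', () => ({
  PdfParser: vi.fn().mockImplementation(() => ({
    // Resolves a canned result so tests never touch a real PDF
    parseFile: vi.fn().mockResolvedValue({
      content: 'Parsed PDF content', // illustrative value
      metadata: { pageCount: 3 },
    }),
  })),
}))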
@@ -1,7 +1,5 @@
 import { existsSync } from 'fs'
-import { readFile } from 'fs/promises'
 import path from 'path'
-import { RawPdfParser } from '@/lib/file-parsers/raw-pdf-parser'
 import type { FileParseResult, FileParser, SupportedFileType } from '@/lib/file-parsers/types'
 import { createLogger } from '@/lib/logs/console/logger'
 
@@ -18,42 +16,12 @@ function getParserInstances(): Record<string, FileParser> {
 
   try {
-    try {
-      logger.info('Attempting to load PDF parser...')
-      try {
-        const { PdfParser } = require('@/lib/file-parsers/pdf-parser')
-        parserInstances.pdf = new PdfParser()
-        logger.info('PDF parser loaded successfully')
-      } catch (pdfLibError) {
-        logger.error('Failed to load primary PDF parser:', pdfLibError)
-        logger.info('Falling back to raw PDF parser')
-        parserInstances.pdf = new RawPdfParser()
-        logger.info('Raw PDF parser loaded successfully')
-      }
+    logger.info('Loading PDF parser...')
+    const { PdfParser } = require('@/lib/file-parsers/pdf-parser')
+    parserInstances.pdf = new PdfParser()
+    logger.info('PDF parser loaded successfully')
   } catch (error) {
-    logger.error('Failed to load any PDF parser:', error)
-    parserInstances.pdf = {
-      async parseFile(filePath: string): Promise<FileParseResult> {
-        const buffer = await readFile(filePath)
-        return {
-          content: `PDF parsing is not available. File size: ${buffer.length} bytes`,
-          metadata: {
-            info: { Error: 'PDF parsing unavailable' },
-            pageCount: 0,
-            version: 'unknown',
-          },
-        }
-      },
-      async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
-        return {
-          content: `PDF parsing is not available. File size: ${buffer.length} bytes`,
-          metadata: {
-            info: { Error: 'PDF parsing unavailable' },
-            pageCount: 0,
-            version: 'unknown',
-          },
-        }
-      },
-    }
+    logger.error('Failed to load PDF parser:', error)
   }
 
   try {
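For context, the pattern this hunk simplifies is a lazily populated parser registry: a synchronous require wrapped in try/catch so a missing or broken optional dependency cannot crash module load. A minimal sketch under that assumption (the memoization details are guessed, not shown in the diff):

const parserInstances: Record<string, FileParser> = {}

function getParserInstances(): Record<string, FileParser> {
  if (!parserInstances.pdf) {
    try {
      // Lazy require keeps the heavy parser dependency off the cold-start path
      const { PdfParser } = require('@/lib/file-parsers/pdf-parser')
      parserInstances.pdf = new PdfParser()
    } catch (error) {
      logger.error('Failed to load PDF parser:', error)
    }
  }
  return parserInstances
}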
@@ -1,11 +1,9 @@
 import { readFile } from 'fs/promises'
-import { PDFDocument } from 'pdf-lib'
+import pdfParse from 'pdf-parse'
 import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
 import { createLogger } from '@/lib/logs/console/logger'
-import { RawPdfParser } from './raw-pdf-parser'
 
 const logger = createLogger('PdfParser')
-const rawPdfParser = new RawPdfParser()
 
 export class PdfParser implements FileParser {
   async parseFile(filePath: string): Promise<FileParseResult> {
@@ -31,68 +29,23 @@ export class PdfParser implements FileParser {
     try {
       logger.info('Starting to parse buffer, size:', dataBuffer.length)
 
-      try {
-        logger.info('Attempting to parse with pdf-lib library...')
+      const pdfData = await pdfParse(dataBuffer)
 
-        logger.info('Starting PDF parsing...')
-        const pdfDoc = await PDFDocument.load(dataBuffer)
-        const pages = pdfDoc.getPages()
-        const pageCount = pages.length
+      logger.info(
+        'PDF parsed successfully, pages:',
+        pdfData.numpages,
+        'text length:',
+        pdfData.text.length
+      )
 
-        logger.info('PDF parsed successfully with pdf-lib, pages:', pageCount)
-
-        const metadata: Record<string, any> = {
-          pageCount,
-        }
-
-        try {
-          const title = pdfDoc.getTitle()
-          const author = pdfDoc.getAuthor()
-          const subject = pdfDoc.getSubject()
-          const creator = pdfDoc.getCreator()
-          const producer = pdfDoc.getProducer()
-          const creationDate = pdfDoc.getCreationDate()
-          const modificationDate = pdfDoc.getModificationDate()
-
-          if (title) metadata.title = title
-          if (author) metadata.author = author
-          if (subject) metadata.subject = subject
-          if (creator) metadata.creator = creator
-          if (producer) metadata.producer = producer
-          if (creationDate) metadata.creationDate = creationDate.toISOString()
-          if (modificationDate) metadata.modificationDate = modificationDate.toISOString()
-        } catch (metadataError) {
-          logger.warn('Could not extract PDF metadata:', metadataError)
-        }
-
-        logger.info(
-          'pdf-lib loaded successfully, but text extraction requires fallback to raw parser'
-        )
-        const rawResult = await rawPdfParser.parseBuffer(dataBuffer)
-
-        return {
-          content: rawResult.content,
-          metadata: {
-            ...rawResult.metadata,
-            ...metadata,
-            source: 'pdf-lib + raw-parser',
-          },
-        }
-      } catch (pdfLibError: unknown) {
-        logger.error('PDF-lib library failed:', pdfLibError)
-
-        logger.info('Falling back to raw PDF parser...')
-        const rawResult = await rawPdfParser.parseBuffer(dataBuffer)
-
-        return {
-          ...rawResult,
-          metadata: {
-            ...rawResult.metadata,
-            fallback: true,
-            source: 'raw-parser-only',
-            error: (pdfLibError as Error).message || 'Unknown error',
-          },
-        }
-      }
+      return {
+        content: pdfData.text,
+        metadata: {
+          pageCount: pdfData.numpages,
+          info: pdfData.info,
+          version: pdfData.version,
+          source: 'pdf-parse',
+        },
+      }
     } catch (error) {
       logger.error('Error parsing buffer:', error)
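pdf-parse exposes exactly the fields the rewritten parser reads (numpages, text, info, version). A self-contained sketch of the same call outside the class (the sample path is hypothetical):

import { readFile } from 'fs/promises'
import pdfParse from 'pdf-parse'

async function main() {
  const buffer = await readFile('./sample.pdf') // hypothetical input file
  const data = await pdfParse(buffer)
  console.log('pages:', data.numpages, 'version:', data.version)
  console.log(data.text.slice(0, 500))
}

main().catch(console.error)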
@@ -1,467 +0,0 @@
-import { readFile } from 'fs/promises'
-import { promisify } from 'util'
-import zlib from 'zlib'
-import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
-import { createLogger } from '@/lib/logs/console/logger'
-
-const logger = createLogger('RawPdfParser')
-
-const inflateAsync = promisify(zlib.inflate)
-const unzipAsync = promisify(zlib.unzip)
-
-export class RawPdfParser implements FileParser {
-  async parseFile(filePath: string): Promise<FileParseResult> {
-    try {
-      logger.info('Starting to parse file:', filePath)
-
-      if (!filePath) {
-        throw new Error('No file path provided')
-      }
-
-      logger.info('Reading file...')
-      const dataBuffer = await readFile(filePath)
-      logger.info('File read successfully, size:', dataBuffer.length)
-
-      return this.parseBuffer(dataBuffer)
-    } catch (error) {
-      logger.error('Error parsing PDF:', error)
-      return {
-        content: `Error parsing PDF: ${(error as Error).message}`,
-        metadata: {
-          error: (error as Error).message,
-          pageCount: 0,
-          version: 'unknown',
-        },
-      }
-    }
-  }
-
-  async parseBuffer(dataBuffer: Buffer): Promise<FileParseResult> {
-    try {
-      logger.info('Starting to parse buffer, size:', dataBuffer.length)
-
-      const rawContent = dataBuffer.toString('utf-8')
-
-      let version = 'Unknown'
-      let pageCount = 0
-
-      const versionMatch = rawContent.match(/%PDF-(\d+\.\d+)/)
-      if (versionMatch?.[1]) {
-        version = versionMatch[1]
-      }
-
-      const typePageMatches = rawContent.match(/\/Type\s*\/Page\b/gi)
-      if (typePageMatches) {
-        pageCount = typePageMatches.length
-        logger.info('Found page count using /Type /Page:', pageCount)
-      }
-
-      if (pageCount === 0) {
-        const pageMatches = rawContent.match(/\/Page\s*\//gi)
-        if (pageMatches) {
-          pageCount = pageMatches.length
-          logger.info('Found page count using /Page/ pattern:', pageCount)
-        }
-      }
-
-      if (pageCount === 0) {
-        const pagesObjMatches = rawContent.match(/\/Pages\s+\d+\s+\d+\s+R/gi)
-        if (pagesObjMatches && pagesObjMatches.length > 0) {
-          const pagesObjRef = pagesObjMatches[0].match(/\/Pages\s+(\d+)\s+\d+\s+R/i)
-          if (pagesObjRef?.[1]) {
-            const objNum = pagesObjRef[1]
-            const objRegex = new RegExp(`${objNum}\\s+0\\s+obj[\\s\\S]*?endobj`, 'i')
-            const objMatch = rawContent.match(objRegex)
-            if (objMatch) {
-              const countMatch = objMatch[0].match(/\/Count\s+(\d+)/i)
-              if (countMatch?.[1]) {
-                pageCount = Number.parseInt(countMatch[1], 10)
-                logger.info('Found page count using /Count in Pages object:', pageCount)
-              }
-            }
-          }
-        }
-      }
-
-      if (pageCount === 0) {
-        const trailerMatches = rawContent.match(/trailer/gi)
-        if (trailerMatches) {
-          pageCount = Math.max(1, Math.ceil(trailerMatches.length / 2))
-          logger.info('Estimated page count using trailer references:', pageCount)
-        }
-      }
-
-      if (pageCount === 0) {
-        pageCount = 1
-        logger.info('Defaulting to 1 page as no count was found')
-      }
-
-      let extractedText = ''
-
-      const textMatches = rawContent.match(/BT[\s\S]*?ET/g)
-      if (textMatches && textMatches.length > 0) {
-        logger.info('Found', textMatches.length, 'text blocks')
-
-        extractedText = textMatches
-          .map((textBlock) => {
-            const textObjects = textBlock.match(/(\([^)]*\)|\[[^\]]*\])\s*(Tj|TJ)/g)
-            if (textObjects && textObjects.length > 0) {
-              return textObjects
-                .map((obj) => {
-                  let text = ''
-                  if (obj.includes('Tj')) {
-                    const match = obj.match(/\(([^)]*)\)\s*Tj/)
-                    if (match?.[1]) {
-                      text = match[1]
-                    }
-                  } else if (obj.includes('TJ')) {
-                    const match = obj.match(/\[(.*)\]\s*TJ/)
-                    if (match?.[1]) {
-                      const parts = match[1].match(/\([^)]*\)/g)
-                      if (parts) {
-                        text = parts.map((p) => p.slice(1, -1)).join(' ')
-                      }
-                    }
-                  }
-
-                  return text
-                    .replace(/\\(\d{3})/g, (_, octal) =>
-                      String.fromCharCode(Number.parseInt(octal, 8))
-                    )
-                    .replace(/\\\\/g, '\\')
-                    .replace(/\\\(/g, '(')
-                    .replace(/\\\)/g, ')')
-                })
-                .join(' ')
-            }
-            return ''
-          })
-          .join('\n')
-          .trim()
-      }
-
-      let metadataText = ''
-      const xmlMatch = rawContent.match(/<x:xmpmeta[\s\S]*?<\/x:xmpmeta>/)
-      if (xmlMatch) {
-        const xmlContent = xmlMatch[0]
-        logger.info('Found XML metadata')
-
-        const titleMatch = xmlContent.match(/<dc:title>[\s\S]*?<rdf:li[^>]*>(.*?)<\/rdf:li>/i)
-        if (titleMatch?.[1]) {
-          const title = titleMatch[1].replace(/<[^>]+>/g, '').trim()
-          metadataText += `Document Title: ${title}\n\n`
-        }
-
-        const creatorMatch = xmlContent.match(/<dc:creator>[\s\S]*?<rdf:li[^>]*>(.*?)<\/rdf:li>/i)
-        if (creatorMatch?.[1]) {
-          const creator = creatorMatch[1].replace(/<[^>]+>/g, '').trim()
-          metadataText += `Author: ${creator}\n`
-        }
-
-        const dateMatch = xmlContent.match(/<xmp:CreateDate>(.*?)<\/xmp:CreateDate>/i)
-        if (dateMatch?.[1]) {
-          metadataText += `Created: ${dateMatch[1].trim()}\n`
-        }
-
-        const producerMatch = xmlContent.match(/<pdf:Producer>(.*?)<\/pdf:Producer>/i)
-        if (producerMatch?.[1]) {
-          metadataText += `Producer: ${producerMatch[1].trim()}\n`
-        }
-      }
-
-      if (!extractedText || extractedText.length < 100 || extractedText.includes('/Type /Page')) {
-        logger.info('Trying advanced text extraction from content streams')
-
-        const contentRefs = rawContent.match(/\/Contents\s+\[?\s*(\d+)\s+\d+\s+R\s*\]?/g)
-        if (contentRefs && contentRefs.length > 0) {
-          logger.info('Found', contentRefs.length, 'content stream references')
-
-          const objNumbers = contentRefs
-            .map((ref) => {
-              const match = ref.match(/\/Contents\s+\[?\s*(\d+)\s+\d+\s+R\s*\]?/)
-              return match ? match[1] : null
-            })
-            .filter(Boolean)
-
-          logger.info('Content stream object numbers:', objNumbers)
-
-          if (objNumbers.length > 0) {
-            let textFromStreams = ''
-
-            for (const objNum of objNumbers) {
-              const objRegex = new RegExp(`${objNum}\\s+0\\s+obj[\\s\\S]*?endobj`, 'i')
-              const objMatch = rawContent.match(objRegex)
-
-              if (objMatch) {
-                const streamMatch = objMatch[0].match(/stream\r?\n([\s\S]*?)\r?\nendstream/)
-                if (streamMatch?.[1]) {
-                  const streamContent = streamMatch[1]
-
-                  const textFragments = streamContent.match(/\([^)]+\)\s*Tj|\[[^\]]*\]\s*TJ/g)
-                  if (textFragments && textFragments.length > 0) {
-                    const extractedFragments = textFragments
-                      .map((fragment) => {
-                        if (fragment.includes('Tj')) {
-                          return fragment
-                            .replace(/\(([^)]*)\)\s*Tj/, '$1')
-                            .replace(/\\(\d{3})/g, (_, octal) =>
-                              String.fromCharCode(Number.parseInt(octal, 8))
-                            )
-                            .replace(/\\\\/g, '\\')
-                            .replace(/\\\(/g, '(')
-                            .replace(/\\\)/g, ')')
-                        }
-                        if (fragment.includes('TJ')) {
-                          const parts = fragment.match(/\([^)]*\)/g)
-                          if (parts) {
-                            return parts
-                              .map((p) =>
-                                p
-                                  .slice(1, -1)
-                                  .replace(/\\(\d{3})/g, (_, octal) =>
-                                    String.fromCharCode(Number.parseInt(octal, 8))
-                                  )
-                                  .replace(/\\\\/g, '\\')
-                                  .replace(/\\\(/g, '(')
-                                  .replace(/\\\)/g, ')')
-                              )
-                              .join(' ')
-                          }
-                        }
-                        return ''
-                      })
-                      .filter(Boolean)
-                      .join(' ')
-
-                    if (extractedFragments.trim().length > 0) {
-                      textFromStreams += `${extractedFragments.trim()}\n`
-                    }
-                  }
-                }
-              }
-            }
-
-            if (textFromStreams.trim().length > 0) {
-              logger.info('Successfully extracted text from content streams')
-              extractedText = textFromStreams.trim()
-            }
-          }
-        }
-      }
-
-      if (!extractedText || extractedText.length < 100) {
-        logger.info('Trying to decompress PDF streams')
-
-        const compressedStreams = rawContent.match(
-          /\/Filter\s*\/FlateDecode[\s\S]*?stream[\s\S]*?endstream/g
-        )
-        if (compressedStreams && compressedStreams.length > 0) {
-          logger.info('Found', compressedStreams.length, 'compressed streams')
-
-          const decompressedContents = await Promise.all(
-            compressedStreams.map(async (stream) => {
-              try {
-                const streamMatch = stream.match(/stream\r?\n([\s\S]*?)\r?\nendstream/)
-                if (!streamMatch || !streamMatch[1]) return ''
-
-                const compressedData = Buffer.from(streamMatch[1], 'binary')
-
-                try {
-                  const decompressed = await inflateAsync(compressedData)
-                  const content = decompressed.toString('utf-8')
-
-                  const readable = content.replace(/[^\x20-\x7E\r\n]/g, ' ').trim()
-                  if (
-                    readable.length > 50 &&
-                    readable.includes(' ') &&
-                    (readable.includes('.') || readable.includes(',')) &&
-                    !/[\x00-\x1F\x7F]/.test(readable)
-                  ) {
-                    return readable
-                  }
-                } catch (_inflateErr) {
-                  try {
-                    const decompressed = await unzipAsync(compressedData)
-                    const content = decompressed.toString('utf-8')
-
-                    const readable = content.replace(/[^\x20-\x7E\r\n]/g, ' ').trim()
-                    if (
-                      readable.length > 50 &&
-                      readable.includes(' ') &&
-                      (readable.includes('.') || readable.includes(',')) &&
-                      !/[\x00-\x1F\x7F]/.test(readable)
-                    ) {
-                      return readable
-                    }
-                  } catch (_unzipErr) {
-                    return ''
-                  }
-                }
-              } catch (_error) {
-                return ''
-              }
-
-              return ''
-            })
-          )
-
-          const decompressedText = decompressedContents
-            .filter((text) => text && text.length > 0)
-            .join('\n\n')
-
-          if (decompressedText && decompressedText.length > 0) {
-            logger.info('Successfully decompressed text content, length:', decompressedText.length)
-            extractedText = decompressedText
-          }
-        }
-      }
-
-      if (!extractedText || extractedText.length < 50) {
-        logger.info('Trying alternative text extraction method with streams')
-
-        const streamMatches = rawContent.match(/stream[\s\S]*?endstream/g)
-        if (streamMatches && streamMatches.length > 0) {
-          logger.info('Found', streamMatches.length, 'streams')
-
-          const textContent = streamMatches
-            .map((stream) => {
-              const content = stream.replace(/^stream\r?\n|\r?\nendstream$/g, '')
-
-              const readable = content.replace(/[^\x20-\x7E\r\n]/g, ' ').trim()
-
-              if (
-                readable.length > 20 &&
-                readable.includes(' ') &&
-                (readable.includes('.') || readable.includes(',')) &&
-                !/[\x00-\x1F\x7F]/.test(readable)
-              ) {
-                return readable
-              }
-              return ''
-            })
-            .filter((text) => text.length > 0 && text.split(' ').length > 5)
-            .join('\n\n')
-
-          if (textContent.length > 0) {
-            extractedText = textContent
-          }
-        }
-      }
-
-      if (!extractedText || extractedText.length < 50) {
-        logger.info('Trying object streams for text')
-
-        const objMatches = rawContent.match(/\d+\s+\d+\s+obj[\s\S]*?endobj/g)
-        if (objMatches && objMatches.length > 0) {
-          logger.info('Found', objMatches.length, 'objects')
-
-          const textContent = objMatches
-            .map((obj) => {
-              const readable = obj.replace(/[^\x20-\x7E\r\n]/g, ' ').trim()
-
-              if (
-                readable.length > 50 &&
-                readable.includes(' ') &&
-                !readable.includes('/Filter') &&
-                readable.split(' ').length > 10 &&
-                (readable.includes('.') || readable.includes(','))
-              ) {
-                return readable
-              }
-              return ''
-            })
-            .filter((text) => text.length > 0)
-            .join('\n\n')
-
-          if (textContent.length > 0) {
-            extractedText += (extractedText ? '\n\n' : '') + textContent
-          }
-        }
-      }
-
-      if (
-        extractedText &&
-        (extractedText.includes('endobj') ||
-          extractedText.includes('/Type /Page') ||
-          extractedText.match(/\d+\s+\d+\s+obj/g)) &&
-        metadataText
-      ) {
-        logger.info(
-          'Extracted content appears to be PDF structure information, using metadata instead'
-        )
-        extractedText = metadataText
-      } else if (metadataText && !extractedText.includes('Document Title:')) {
-        extractedText = metadataText + (extractedText ? `\n\n${extractedText}` : '')
-      }
-
-      const validCharCount = (extractedText || '').replace(/[^\x20-\x7E\r\n]/g, '').length
-      const totalCharCount = (extractedText || '').length
-      const validRatio = validCharCount / (totalCharCount || 1)
-
-      const hasBinaryArtifacts =
-        extractedText &&
-        (extractedText.includes('\\u') ||
-          extractedText.includes('\\x') ||
-          extractedText.includes('\0') ||
-          /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\xFF]{10,}/g.test(extractedText) ||
-          validRatio < 0.7)
-
-      const looksLikeGibberish =
-        extractedText &&
-        (extractedText.replace(/[a-zA-Z0-9\s.,:'"()[\]{}]/g, '').length / extractedText.length >
-          0.3 ||
-          extractedText.split(' ').length < extractedText.length / 20)
-
-      if (!extractedText || extractedText.length < 50 || hasBinaryArtifacts || looksLikeGibberish) {
-        logger.info('Could not extract meaningful text, providing fallback message')
-        logger.info('Valid character ratio:', validRatio)
-        logger.info('Has binary artifacts:', hasBinaryArtifacts)
-        logger.info('Looks like gibberish:', looksLikeGibberish)
-
-        if (metadataText) {
-          extractedText = `${metadataText}\n`
-        } else {
-          extractedText = ''
-        }
-
-        extractedText += `This is a PDF document with ${pageCount} page(s) and version ${version}.\n\n`
-
-        const titleInStructure =
-          rawContent.match(/title\s*:\s*([^\n]+)/i) ||
-          rawContent.match(/Microsoft Word -\s*([^\n]+)/i)
-
-        if (titleInStructure?.[1] && !extractedText.includes('Document Title:')) {
-          const title = titleInStructure[1].trim()
-          extractedText = `Document Title: ${title}\n\n${extractedText}`
-        }
-
-        extractedText += `The text content could not be properly extracted due to encoding or compression issues.\nFile size: ${dataBuffer.length} bytes.\n\nTo view this PDF properly, please download the file and open it with a PDF reader.`
-      }
-
-      logger.info('PDF parsed with basic extraction, found text length:', extractedText.length)
-
-      return {
-        content: extractedText,
-        metadata: {
-          pageCount,
-          info: {
-            RawExtraction: true,
-            Version: version,
-            Size: dataBuffer.length,
-          },
-          version,
-        },
-      }
-    } catch (error) {
-      logger.error('Error parsing buffer:', error)
-      return {
-        content: `Error parsing PDF buffer: ${(error as Error).message}`,
-        metadata: {
-          error: (error as Error).message,
-          pageCount: 0,
-          version: 'unknown',
-        },
-      }
-    }
-  }
-}
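The deleted parser's most involved step, distilled: locate stream bodies with a regex and attempt zlib inflation, keeping only output that looks like prose. A standalone sketch of that one step (heuristics simplified from the original; not a drop-in replacement):

import { promisify } from 'util'
import zlib from 'zlib'

const inflateAsync = promisify(zlib.inflate)

async function inflateStreams(raw: string): Promise<string[]> {
  // Grab every `stream ... endstream` body in the raw PDF text
  const blocks = raw.match(/stream\r?\n[\s\S]*?\r?\nendstream/g) ?? []
  const readable: string[] = []
  for (const block of blocks) {
    const body = block.replace(/^stream\r?\n|\r?\nendstream$/g, '')
    try {
      const inflated = await inflateAsync(Buffer.from(body, 'binary'))
      // Keep only output that looks like text rather than more binary data
      const text = inflated
        .toString('utf-8')
        .replace(/[^\x20-\x7E\r\n]/g, ' ')
        .trim()
      if (text.length > 50 && text.includes(' ')) readable.push(text)
    } catch {
      // Not zlib-compressed, or a filter this sketch ignores; skip it
    }
  }
  return readable
}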
@@ -385,7 +385,6 @@ async function processOutlookEmails(
             if (type === 'text' || type === 'text/plain') {
               return content
             }
-            // Default to converting HTML or unknown types
             return convertHtmlToPlainText(content)
           })(),
           bodyHtml: email.body?.content || '',
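convertHtmlToPlainText itself is not shown in this diff. If it were hand-rolled rather than backed by a library, a minimal version built on the entities package (which this commit appears to pin as a direct dependency) could look like the following. This is purely a stand-in sketch, not the repo's actual implementation:

import { decode } from 'entities'

function convertHtmlToPlainText(html: string): string {
  const text = html
    .replace(/<(style|script)[\s\S]*?<\/\1>/gi, '') // drop style/script bodies
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<\/p>/gi, '\n\n')
    .replace(/<[^>]+>/g, '') // strip remaining tags
  // decode() resolves HTML entities such as &amp; and &#8217;
  return decode(text).replace(/\n{3,}/g, '\n\n').trim()
}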
@@ -69,6 +69,7 @@
     "@react-email/components": "^0.0.34",
     "@sentry/nextjs": "^9.15.0",
     "@trigger.dev/sdk": "4.0.1",
+    "@types/pdf-parse": "1.1.5",
     "@types/three": "0.177.0",
     "@vercel/og": "^0.6.5",
     "@vercel/speed-insights": "^1.2.0",
@@ -84,6 +85,7 @@
     "dat.gui": "0.7.9",
     "date-fns": "4.1.0",
     "drizzle-orm": "^0.41.0",
+    "entities": "6.0.1",
     "framer-motion": "^12.5.0",
     "fuse.js": "7.1.0",
     "geist": "1.4.2",
@@ -105,7 +107,6 @@
     "officeparser": "^5.2.0",
     "openai": "^4.91.1",
     "papaparse": "5.5.3",
-    "pdf-lib": "^1.17.1",
     "postgres": "^3.4.5",
     "prismjs": "^1.30.0",
     "react": "19.1.0",
@@ -122,6 +123,7 @@
     "resend": "^4.1.2",
     "rtf-parser": "1.3.3",
     "rtf-stream-parser": "3.8.0",
+    "sharp": "0.34.3",
     "socket.io": "^4.8.1",
     "stripe": "^17.7.0",
     "tailwind-merge": "^2.6.0",
@@ -137,7 +139,7 @@
     "@testing-library/react": "^16.3.0",
     "@testing-library/user-event": "^14.6.1",
     "@trigger.dev/build": "4.0.1",
-    "@types/html-to-text": "^9.0.4",
+    "@types/html-to-text": "9.0.4",
     "@types/iconv-lite": "0.0.1",
     "@types/js-yaml": "4.0.9",
     "@types/jsdom": "21.1.7",
bun.lock (20 changed lines)
@@ -97,6 +97,7 @@
     "@react-email/components": "^0.0.34",
     "@sentry/nextjs": "^9.15.0",
     "@trigger.dev/sdk": "4.0.1",
+    "@types/pdf-parse": "1.1.5",
     "@types/three": "0.177.0",
     "@vercel/og": "^0.6.5",
     "@vercel/speed-insights": "^1.2.0",
@@ -112,6 +113,7 @@
     "dat.gui": "0.7.9",
     "date-fns": "4.1.0",
     "drizzle-orm": "^0.41.0",
+    "entities": "6.0.1",
     "framer-motion": "^12.5.0",
     "fuse.js": "7.1.0",
     "geist": "1.4.2",
@@ -133,7 +135,6 @@
     "officeparser": "^5.2.0",
     "openai": "^4.91.1",
     "papaparse": "5.5.3",
-    "pdf-lib": "^1.17.1",
     "postgres": "^3.4.5",
     "prismjs": "^1.30.0",
     "react": "19.1.0",
@@ -150,6 +151,7 @@
     "resend": "^4.1.2",
     "rtf-parser": "1.3.3",
     "rtf-stream-parser": "3.8.0",
+    "sharp": "0.34.3",
     "socket.io": "^4.8.1",
     "stripe": "^17.7.0",
     "tailwind-merge": "^2.6.0",
@@ -165,7 +167,7 @@
     "@testing-library/react": "^16.3.0",
     "@testing-library/user-event": "^14.6.1",
     "@trigger.dev/build": "4.0.1",
-    "@types/html-to-text": "^9.0.4",
+    "@types/html-to-text": "9.0.4",
     "@types/iconv-lite": "0.0.1",
     "@types/js-yaml": "4.0.9",
     "@types/jsdom": "21.1.7",
@@ -830,10 +832,6 @@
 
     "@orama/orama": ["@orama/orama@3.1.12", "", {}, "sha512-U7PY8FwXHuJ6bNBpbsqe0KLzb91IcJuORDggqHHkFy1waokY5SpWLN9tzB3AOW776awp6s1bjwts9I9Davy3lw=="],
 
-    "@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="],
-
-    "@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="],
-
     "@peculiar/asn1-android": ["@peculiar/asn1-android@2.4.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.4.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-YFueREq97CLslZZBI8dKzis7jMfEHSLxM+nr0Zdx1POiXFLjqqwoY5s0F1UimdBiEw/iKlHey2m56MRDv7Jtyg=="],
 
     "@peculiar/asn1-ecc": ["@peculiar/asn1-ecc@2.4.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.4.0", "@peculiar/asn1-x509": "^2.4.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-fJiYUBCJBDkjh347zZe5H81BdJ0+OGIg0X9z06v8xXUoql3MFeENUX0JsjCaVaU9A0L85PefLPGYkIoGpTnXLQ=="],
@@ -1440,6 +1438,8 @@
 
     "@types/papaparse": ["@types/papaparse@5.3.16", "", { "dependencies": { "@types/node": "*" } }, "sha512-T3VuKMC2H0lgsjI9buTB3uuKj3EMD2eap1MOuEQuBQ44EnDx/IkGhU6EwiTf9zG3za4SKlmwKAImdDKdNnCsXg=="],
 
+    "@types/pdf-parse": ["@types/pdf-parse@1.1.5", "", { "dependencies": { "@types/node": "*" } }, "sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA=="],
+
     "@types/pg": ["@types/pg@8.6.1", "", { "dependencies": { "@types/node": "*", "pg-protocol": "*", "pg-types": "^2.2.0" } }, "sha512-1Kc4oAGzAl7uqUStZCDvaLFqZrW9qWSjXOmBfdgyBP5La7Us6Mg4GBvRlSoaZMhQF/zSj1C8CtKMBkoiT8eL8w=="],
 
     "@types/pg-pool": ["@types/pg-pool@2.0.6", "", { "dependencies": { "@types/pg": "*" } }, "sha512-TaAUE5rq2VQYxab5Ts7WZhKNmuN78Q6PiFonTDdpbx8a1H0M1vhy3rhiMjl+e2iHmogyMw7jZF4FrE6eJUy5HQ=="],
@@ -2644,8 +2644,6 @@
 
     "pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="],
 
-    "pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="],
-
     "pdfjs-dist": ["pdfjs-dist@5.4.54", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.74" } }, "sha512-TBAiTfQw89gU/Z4LW98Vahzd2/LoCFprVGvGbTgFt+QCB1F+woyOPmNNVgLa6djX9Z9GGTnj7qE1UzpOVJiINw=="],
 
     "peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="],
@@ -3680,6 +3678,8 @@
 
     "@types/papaparse/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
 
+    "@types/pdf-parse/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
+
     "@types/pg/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
 
     "@types/tedious/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="],
@@ -3816,8 +3816,6 @@
 
     "parse-entities/@types/unist": ["@types/unist@2.0.11", "", {}, "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA=="],
 
-    "pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],
-
     "playwright/fsevents": ["fsevents@2.3.2", "", { "os": "darwin" }, "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA=="],
 
     "postcss-nested/postcss-selector-parser": ["postcss-selector-parser@6.1.2", "", { "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" } }, "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg=="],
@@ -4262,6 +4260,8 @@
 
     "@types/papaparse/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
 
+    "@types/pdf-parse/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
+
     "@types/pg/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],
 
     "@types/tedious/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="],