Add PDF to Markdown conversion functionality to the web Svelte chat interface

This commit is contained in:
jmd1010
2025-02-24 17:24:02 -05:00
parent cbc82ec045
commit a0e1f7204d
5 changed files with 901 additions and 2 deletions

View File

@@ -0,0 +1,81 @@
import { createPipeline, transformers } from 'pdf-to-markdown-core/lib/src';
import { PARSE_SCHEMA } from 'pdf-to-markdown-core/lib/src/PdfParser';
import * as pdfjs from 'pdfjs-dist';
/**
 * Converts PDF files to text by running them through a
 * `pdf-to-markdown-core` pipeline that is driven by PDF.js.
 */
export class PdfConversionService {
  /**
   * Configures the PDF.js worker script location.
   *
   * When `window` is undefined the constructor returns immediately and leaves
   * the PDF.js global worker options untouched.
   */
  constructor() {
    if (typeof window === 'undefined') {
      return;
    }

    console.log('PDF.js version:', pdfjs.version);

    // The path is kept as a string literal directly inside
    // `new URL(..., import.meta.url)`.
    // NOTE(review): bundlers such as Vite presumably need this literal form to
    // resolve and emit the worker asset — confirm before refactoring it.
    const workerHref = new URL('pdfjs-dist/build/pdf.worker.min.js', import.meta.url).href;
    console.log('Worker URL:', workerHref);

    pdfjs.GlobalWorkerOptions.workerSrc = workerHref;
    console.log('Worker configuration complete');
  }

  /**
   * Parses a PDF file and returns its text content.
   *
   * The file is read fully into memory, parsed and transformed by the
   * pipeline, and the resulting items are reduced to the truthy values of
   * their `str` column, joined with newlines. Every step is traced with
   * `console.log`.
   *
   * @param file - The PDF file to convert.
   * @returns The value produced by the transformed result's `convert` call;
   *   presumably the newline-joined text returned by the converter below.
   * @throws Nothing is caught here, so a failure while reading, parsing,
   *   transforming or converting rejects the returned promise.
   */
  async convertToMarkdown(file: File): Promise<string> {
    const { name: fileName, size: fileSize } = file;
    console.log('Starting PDF conversion:', { fileName, fileSize });

    const buffer = await file.arrayBuffer();
    console.log('Buffer created:', buffer.byteLength);

    const transformConfig = { transformers };
    const pipeline = createPipeline(pdfjs, { transformConfig });
    console.log('Pipeline created');

    const result = await pipeline.parse(buffer, ({ stages, stageDetails, stageProgress }) => {
      console.log('Processing:', {
        stage: stages,
        details: stageDetails,
        progress: stageProgress
      });
    });
    console.log('Parse complete, validating result');

    const transformed = result.transform();
    console.log('Transform applied:', transformed);

    return transformed.convert({
      convert(items) {
        console.log('PDF Structure:', {
          itemCount: items.length,
          firstItem: items[0],
          schema: PARSE_SCHEMA // ['transform', 'width', 'height', 'str', 'fontName', 'dir']
        });

        // The column is named 'str' (not 'text'), as listed in PARSE_SCHEMA.
        const rawStrings = items.map((item) => item.value('str'));
        // Drop falsy entries (empty strings, undefined) before joining.
        const keptStrings = rawStrings.filter(Boolean);
        const text = keptStrings.join('\n');

        console.log('Converted text:', {
          length: text.length,
          preview: text.substring(0, 100)
        });
        return text;
      }
    });
  }
}