mirror of
https://github.com/danielmiessler/Fabric.git
synced 2026-01-09 14:28:01 -05:00
Add PDF to Markdown conversion functionality to the web Svelte chat interface
This commit is contained in:
81
web/src/lib/services/PdfConversionService.ts
Normal file
81
web/src/lib/services/PdfConversionService.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
import { createPipeline, transformers } from 'pdf-to-markdown-core/lib/src';
|
||||
import { PARSE_SCHEMA } from 'pdf-to-markdown-core/lib/src/PdfParser';
|
||||
import * as pdfjs from 'pdfjs-dist';
|
||||
|
||||
/**
 * Browser-side service that turns an uploaded PDF `File` into a text string
 * using `pdf-to-markdown-core` on top of `pdfjs-dist`.
 *
 * Every step is traced with `console.log` so the conversion can be followed
 * from the browser console.
 */
export class PdfConversionService {
  /**
   * Points PDF.js at its worker script. Nothing is configured when `window`
   * is undefined (i.e. outside a browser context).
   */
  constructor() {
    if (typeof window === 'undefined') {
      return;
    }

    console.log('PDF.js version:', pdfjs.version);

    // Worker script location, resolved relative to this module's URL.
    const workerScript = new URL('pdfjs-dist/build/pdf.worker.min.js', import.meta.url);
    console.log('Worker URL:', workerScript.href);

    pdfjs.GlobalWorkerOptions.workerSrc = workerScript.href;
    console.log('Worker configuration complete');
  }

  /**
   * Parses the given PDF and returns its text content.
   *
   * The custom converter passed to the pipeline reads the `str` field of each
   * parsed item, discards falsy values, and joins the rest with newlines.
   *
   * @param file - The PDF file to convert.
   * @returns The value produced by the pipeline's `convert` step, i.e. the
   *   newline-joined `str` values of the parsed items.
   */
  async convertToMarkdown(file: File): Promise<string> {
    const fileInfo = { fileName: file.name, fileSize: file.size };
    console.log('Starting PDF conversion:', fileInfo);

    const pdfBytes = await file.arrayBuffer();
    console.log('Buffer created:', pdfBytes.byteLength);

    const pdfPipeline = createPipeline(pdfjs, { transformConfig: { transformers } });
    console.log('Pipeline created');

    const parsed = await pdfPipeline.parse(
      pdfBytes,
      ({ stages, stageDetails, stageProgress }) => {
        console.log('Processing:', {
          stage: stages,
          details: stageDetails,
          progress: stageProgress
        });
      }
    );
    console.log('Parse complete, validating result');

    const transformedResult = parsed.transform();
    console.log('Transform applied:', transformedResult);

    return transformedResult.convert({
      convert: (items) => {
        const structureInfo = {
          itemCount: items.length,
          firstItem: items[0],
          schema: PARSE_SCHEMA // ['transform', 'width', 'height', 'str', 'fontName', 'dir']
        };
        console.log('PDF Structure:', structureInfo);

        // The text lives under 'str' (not 'text') according to PARSE_SCHEMA.
        const rawStrings = items.map((entry) => entry.value('str'));
        const joined = rawStrings.filter(Boolean).join('\n');

        console.log('Converted text:', {
          length: joined.length,
          preview: joined.slice(0, 100)
        });

        return joined;
      }
    });
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user