From d0767507b22b32cb459aac953d55ea56a1c2c8ca Mon Sep 17 00:00:00 2001 From: Waleed Date: Fri, 14 Nov 2025 22:39:28 -0800 Subject: [PATCH] fix(pdfs): use unpdf instead of pdf-parse (#2004) --- apps/sim/lib/file-parsers/pdf-parser.ts | 26 ++++++++++--------------- apps/sim/next.config.ts | 2 +- apps/sim/package.json | 7 +++---- apps/sim/trigger.config.ts | 3 +-- bun.lock | 15 +++----------- 5 files changed, 18 insertions(+), 35 deletions(-) diff --git a/apps/sim/lib/file-parsers/pdf-parser.ts b/apps/sim/lib/file-parsers/pdf-parser.ts index 473d034be..4e6b998e1 100644 --- a/apps/sim/lib/file-parsers/pdf-parser.ts +++ b/apps/sim/lib/file-parsers/pdf-parser.ts @@ -28,29 +28,23 @@ export class PdfParser implements FileParser { try { logger.info('Starting to parse buffer, size:', dataBuffer.length) - const { PDFParse } = await import('pdf-parse') + const { extractText, getDocumentProxy } = await import('unpdf') - const parser = new PDFParse({ data: dataBuffer }) - const textResult = await parser.getText() - const infoResult = await parser.getInfo() - await parser.destroy() + const uint8Array = new Uint8Array(dataBuffer) - logger.info( - 'PDF parsed successfully, pages:', - textResult.total, - 'text length:', - textResult.text.length - ) + const pdf = await getDocumentProxy(uint8Array) - const cleanContent = textResult.text.replace(/\u0000/g, '') + const { totalPages, text } = await extractText(pdf, { mergePages: true }) + + logger.info('PDF parsed successfully, pages:', totalPages, 'text length:', text.length) + + const cleanContent = text.replace(/\u0000/g, '') return { content: cleanContent, metadata: { - pageCount: textResult.total, - info: infoResult.info, - version: infoResult.metadata?.get('pdf:PDFVersion'), - source: 'pdf-parse', + pageCount: totalPages, + source: 'unpdf', }, } } catch (error) { diff --git a/apps/sim/next.config.ts b/apps/sim/next.config.ts index 7bad6b97e..10b80bc4e 100644 --- a/apps/sim/next.config.ts +++ b/apps/sim/next.config.ts @@ -75,7 +75,7 @@ const nextConfig: NextConfig = { turbopack: { resolveExtensions: ['.tsx', '.ts', '.jsx', '.js', '.mjs', '.json'], }, - serverExternalPackages: ['pdf-parse'], + serverExternalPackages: ['unpdf'], experimental: { optimizeCss: true, turbopackSourceMaps: false, diff --git a/apps/sim/package.json b/apps/sim/package.json index 5ca214320..73cf01c72 100644 --- a/apps/sim/package.json +++ b/apps/sim/package.json @@ -61,7 +61,6 @@ "@radix-ui/react-tooltip": "1.2.8", "@react-email/components": "^0.0.34", "@trigger.dev/sdk": "4.0.4", - "@types/pdf-parse": "1.1.5", "@types/three": "0.177.0", "better-auth": "1.3.12", "browser-image-compression": "^2.0.2", @@ -76,6 +75,7 @@ "entities": "6.0.1", "framer-motion": "^12.5.0", "fuse.js": "7.1.0", + "gray-matter": "^4.0.3", "groq-sdk": "^0.15.0", "html-to-text": "^9.0.5", "input-otp": "^1.4.2", @@ -96,8 +96,6 @@ "officeparser": "^5.2.0", "openai": "^4.91.1", "papaparse": "5.5.3", - "pdf-parse": "2.4.5", - "gray-matter": "^4.0.3", "posthog-js": "1.268.9", "posthog-node": "5.9.2", "prismjs": "^1.30.0", @@ -109,9 +107,9 @@ "react-markdown": "^10.1.0", "react-simple-code-editor": "^0.14.1", "reactflow": "^11.11.4", - "remark-gfm": "4.0.1", "rehype-autolink-headings": "^7.1.0", "rehype-slug": "^6.0.0", + "remark-gfm": "4.0.1", "resend": "^4.1.2", "sharp": "0.34.3", "socket.io": "^4.8.1", @@ -119,6 +117,7 @@ "tailwind-merge": "^2.6.0", "tailwindcss-animate": "^1.0.7", "three": "0.177.0", + "unpdf": "1.4.0", "uuid": "^11.1.0", "xlsx": "0.18.5", "zod": "^3.24.2" diff --git a/apps/sim/trigger.config.ts b/apps/sim/trigger.config.ts index 88e6b2b98..4d82d04e2 100644 --- a/apps/sim/trigger.config.ts +++ b/apps/sim/trigger.config.ts @@ -16,9 +16,8 @@ export default defineConfig({ dirs: ['./background'], build: { extensions: [ - // pdf-parse has native bindings, keep as external package additionalPackages({ - packages: ['pdf-parse'], + packages: ['unpdf'], }), ], }, diff --git a/bun.lock b/bun.lock index 4ed9df46e..c265bd733 100644 --- a/bun.lock +++ b/bun.lock @@ -99,7 +99,6 @@ "@radix-ui/react-tooltip": "1.2.8", "@react-email/components": "^0.0.34", "@trigger.dev/sdk": "4.0.4", - "@types/pdf-parse": "1.1.5", "@types/three": "0.177.0", "better-auth": "1.3.12", "browser-image-compression": "^2.0.2", @@ -135,7 +134,6 @@ "officeparser": "^5.2.0", "openai": "^4.91.1", "papaparse": "5.5.3", - "pdf-parse": "2.4.5", "posthog-js": "1.268.9", "posthog-node": "5.9.2", "prismjs": "^1.30.0", @@ -157,6 +155,7 @@ "tailwind-merge": "^2.6.0", "tailwindcss-animate": "^1.0.7", "three": "0.177.0", + "unpdf": "1.4.0", "uuid": "^11.1.0", "xlsx": "0.18.5", "zod": "^3.24.2", @@ -1358,8 +1357,6 @@ "@types/papaparse": ["@types/papaparse@5.3.16", "", { "dependencies": { "@types/node": "*" } }, "sha512-T3VuKMC2H0lgsjI9buTB3uuKj3EMD2eap1MOuEQuBQ44EnDx/IkGhU6EwiTf9zG3za4SKlmwKAImdDKdNnCsXg=="], - "@types/pdf-parse": ["@types/pdf-parse@1.1.5", "", { "dependencies": { "@types/node": "*" } }, "sha512-kBfrSXsloMnUJOKi25s3+hRmkycHfLK6A09eRGqF/N8BkQoPUmaCr+q8Cli5FnfohEz/rsv82zAiPz/LXtOGhA=="], - "@types/prismjs": ["@types/prismjs@1.26.5", "", {}, "sha512-AUZTa7hQ2KY5L7AmtSiqxlhWxb4ina0yd8hNbl4TWuqnv/pFP0nDMb3YrfSBf4hJVGLh2YEIBfKaBW/9UEl6IQ=="], "@types/react": ["@types/react@19.1.15", "", { "dependencies": { "csstype": "^3.0.2" } }, "sha512-+kLxJpaJzXybyDyFXYADyP1cznTO8HSuBpenGlnKOAkH4hyNINiywvXS/tGJhsrGGP/gM185RA3xpjY0Yg4erA=="], @@ -2558,8 +2555,6 @@ "pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="], - "pdf-parse": ["pdf-parse@2.4.5", "", { "dependencies": { "@napi-rs/canvas": "0.1.80", "pdfjs-dist": "5.4.296" }, "bin": { "pdf-parse": "bin/cli.mjs" } }, "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg=="], - "pdfjs-dist": ["pdfjs-dist@5.4.149", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.77" } }, "sha512-Xe8/1FMJEQPUVSti25AlDpwpUm2QAVmNOpFP0SIahaPIOKBKICaefbzogLdwey3XGGoaP4Lb9wqiw2e9Jqp0LA=="], "peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="], @@ -3074,6 +3069,8 @@ "universal-user-agent": ["universal-user-agent@7.0.3", "", {}, "sha512-TmnEAEAsBJVZM/AADELsK76llnwcf9vMKuPz8JflO1frO8Lchitr0fNaN9d+Ap0BjKtqWqd/J17qeDnXh8CL2A=="], + "unpdf": ["unpdf@1.4.0", "", { "peerDependencies": { "@napi-rs/canvas": "^0.1.69" }, "optionalPeers": ["@napi-rs/canvas"] }, "sha512-TahIk0xdH/4jh/MxfclzU79g40OyxtP00VnEUZdEkJoYtXAHWLiir6t3FC6z3vDqQTzc2ZHcla6uEiVTNjejuA=="], + "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="], "update-browserslist-db": ["update-browserslist-db@1.1.3", "", { "dependencies": { "escalade": "^3.2.0", "picocolors": "^1.1.1" }, "peerDependencies": { "browserslist": ">= 4.21.0" }, "bin": { "update-browserslist-db": "cli.js" } }, "sha512-UxhIZQ+QInVdunkDAaiazvvT/+fXL5Osr0JZlJulepYu6Jd7qJtDZjlur0emRlT71EN3ScPoE7gvsuIKKNavKw=="], @@ -3414,8 +3411,6 @@ "@types/papaparse/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="], - "@types/pdf-parse/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="], - "@types/through/@types/node": ["@types/node@24.2.1", "", { "dependencies": { "undici-types": "~7.10.0" } }, "sha512-DRh5K+ka5eJic8CjH7td8QpYEV6Zo10gfRkjHCO3weqZHWDtAaSTFtl4+VMqOJ4N5jcuhZ9/l+yy8rVgw7BQeQ=="], "@typespec/ts-http-runtime/https-proxy-agent": ["https-proxy-agent@7.0.6", "", { "dependencies": { "agent-base": "^7.1.2", "debug": "4" } }, "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw=="], @@ -3568,8 +3563,6 @@ "parse-entities/@types/unist": ["@types/unist@2.0.11", "", {}, "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA=="], - "pdf-parse/pdfjs-dist": ["pdfjs-dist@5.4.296", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.80" } }, "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q=="], - "playwright/fsevents": ["fsevents@2.3.2", "", { "os": "darwin" }, "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA=="], "postcss-nested/postcss-selector-parser": ["postcss-selector-parser@6.1.2", "", { "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" } }, "sha512-Q8qQfPiZ+THO/3ZrOrO0cJJKfpYCagtMUkXbnEfmgUjwXg6z/WBeOyS9APBBPCTSiDV+s4SwQGu8yFsiMRIudg=="], @@ -3758,8 +3751,6 @@ "@types/papaparse/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="], - "@types/pdf-parse/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="], - "@types/through/@types/node/undici-types": ["undici-types@7.10.0", "", {}, "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag=="], "@typespec/ts-http-runtime/https-proxy-agent/agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="],