improvement(kb): optimize processes, add more robust fallbacks for large file ops (#2684)

* improvement(kb): optimize processes, add more robust fallbacks for large file ops

* stronger typing

* comments cleanup

* ack PR comments

* upgraded turborepo

* ack more PR comments

* fix failing test

* moved doc update inside tx for embeddings chunks upload

* ack more PR comments
Waleed
2026-01-05 20:26:16 -08:00
committed by GitHub
parent d25084e05d
commit 75aca00b6e
22 changed files with 592 additions and 261 deletions

View File

@@ -136,16 +136,29 @@ vi.mock('@sim/db', () => {
},
}),
}),
-  transaction: vi.fn(async (fn: any) => {
-    await fn({
-      insert: (table: any) => ({
-        values: (records: any) => {
-          dbOps.order.push('insert')
-          dbOps.insertRecords.push(records)
-          return Promise.resolve()
-        },
-      }),
-      update: (table: any) => ({
+  delete: () => ({
+    where: () => Promise.resolve(),
+  }),
+  insert: () => ({
+    values: (records: any) => {
+      dbOps.order.push('insert')
+      dbOps.insertRecords.push(records)
+      return Promise.resolve()
+    },
+  }),
+  transaction: vi.fn(async (fn: any) => {
+    await fn({
+      delete: () => ({
+        where: () => Promise.resolve(),
+      }),
+      insert: () => ({
+        values: (records: any) => {
+          dbOps.order.push('insert')
+          dbOps.insertRecords.push(records)
+          return Promise.resolve()
+        },
+      }),
+      update: () => ({
        set: (payload: any) => ({
          where: () => {
            dbOps.updatePayloads.push(payload)

View File

@@ -453,6 +453,8 @@ export function KnowledgeBase({
error: knowledgeBaseError,
refresh: refreshKnowledgeBase,
} = useKnowledgeBase(id)
const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false)
const {
documents,
pagination,
@@ -468,6 +470,7 @@ export function KnowledgeBase({
offset: (currentPage - 1) * DOCUMENTS_PER_PAGE,
sortBy,
sortOrder,
refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false,
})
const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id)
@@ -534,25 +537,15 @@ export function KnowledgeBase({
)
useEffect(() => {
-    const hasProcessingDocuments = documents.some(
+    const processing = documents.some(
      (doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing'
    )
-    if (!hasProcessingDocuments) return
-    const refreshInterval = setInterval(async () => {
-      try {
-        if (!isDeleting) {
-          await checkForDeadProcesses()
-          await refreshDocuments()
-        }
-      } catch (error) {
-        logger.error('Error refreshing documents:', error)
-      }
-    }, 3000)
-    return () => clearInterval(refreshInterval)
-  }, [documents, refreshDocuments, isDeleting])
+    setHasProcessingDocuments(processing)
+    if (processing) {
+      checkForDeadProcesses()
+    }
+  }, [documents])
/**
* Checks for documents with stale processing states and marks them as failed
@@ -672,25 +665,6 @@ export function KnowledgeBase({
await refreshDocuments()
-      let refreshAttempts = 0
-      const maxRefreshAttempts = 3
-      const refreshInterval = setInterval(async () => {
-        try {
-          refreshAttempts++
-          await refreshDocuments()
-          if (refreshAttempts >= maxRefreshAttempts) {
-            clearInterval(refreshInterval)
-          }
-        } catch (error) {
-          logger.error('Error refreshing documents after retry:', error)
-          clearInterval(refreshInterval)
-        }
-      }, 1000)
-      setTimeout(() => {
-        clearInterval(refreshInterval)
-      }, 4000)
logger.info(`Document retry initiated successfully for: ${docId}`)
} catch (err) {
logger.error('Error retrying document:', err)
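The polling rework above replaces a hand-rolled setInterval with react-query's refetchInterval, driven by a hasProcessingDocuments flag. A minimal sketch of the same gating pattern, with a hypothetical hook name and endpoint:

import { useQuery } from '@tanstack/react-query'

// Poll every 3s only while something is still processing; stop otherwise.
function useDocumentsWithPolling(knowledgeBaseId: string, hasProcessing: boolean) {
  return useQuery({
    queryKey: ['documents', knowledgeBaseId],
    queryFn: () =>
      fetch(`/api/kb/${knowledgeBaseId}/documents`).then((r) => r.json()), // hypothetical route
    // react-query: a number enables interval refetching (ms); false disables it
    refetchInterval: hasProcessing ? 3000 : false,
  })
}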

View File

@@ -27,6 +27,7 @@ export type DocumentProcessingPayload = {
export const processDocument = task({
id: 'knowledge-process-document',
maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
machine: 'large-1x', // 2 vCPU, 2GB RAM - needed for large PDF processing
retry: {
maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
factor: env.KB_CONFIG_RETRY_FACTOR || 2,

View File

@@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery(
params: KnowledgeDocumentsParams,
options?: {
enabled?: boolean
refetchInterval?: number | false
}
) {
const paramsKey = serializeDocumentParams(params)
@@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery(
enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId),
staleTime: 60 * 1000,
placeholderData: keepPreviousData,
refetchInterval: options?.refetchInterval ?? false,
})
}

View File

@@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments(
sortBy?: string
sortOrder?: string
enabled?: boolean
refetchInterval?: number | false
}
) {
const queryClient = useQueryClient()
@@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments(
},
{
enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId),
refetchInterval: options?.refetchInterval,
}
)

View File

@@ -16,7 +16,7 @@ interface HeaderInfo {
interface Frontmatter {
title?: string
description?: string
-  [key: string]: any
+  [key: string]: unknown
}
const logger = createLogger('DocsChunker')

View File

@@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators'
const logger = createLogger('JsonYamlChunker')
type JsonPrimitive = string | number | boolean | null
type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]
function getTokenCount(text: string): number {
try {
return getAccurateTokenCount(text, 'text-embedding-3-small')
@@ -59,11 +64,11 @@ export class JsonYamlChunker {
*/
async chunk(content: string): Promise<Chunk[]> {
try {
-      let data: any
+      let data: JsonValue
      try {
-        data = JSON.parse(content)
+        data = JSON.parse(content) as JsonValue
      } catch {
-        data = yaml.load(content)
+        data = yaml.load(content) as JsonValue
}
const chunks = this.chunkStructuredData(data)
@@ -86,7 +91,7 @@ export class JsonYamlChunker {
/**
* Chunk structured data based on its structure
*/
-  private chunkStructuredData(data: any, path: string[] = []): Chunk[] {
+  private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] {
const chunks: Chunk[] = []
if (Array.isArray(data)) {
@@ -94,7 +99,7 @@ export class JsonYamlChunker {
}
if (typeof data === 'object' && data !== null) {
-      return this.chunkObject(data, path)
+      return this.chunkObject(data as JsonObject, path)
}
const content = JSON.stringify(data, null, 2)
@@ -118,9 +123,9 @@ export class JsonYamlChunker {
/**
* Chunk an array intelligently
*/
-  private chunkArray(arr: any[], path: string[]): Chunk[] {
+  private chunkArray(arr: JsonArray, path: string[]): Chunk[] {
    const chunks: Chunk[] = []
-    let currentBatch: any[] = []
+    let currentBatch: JsonValue[] = []
let currentTokens = 0
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
@@ -194,7 +199,7 @@ export class JsonYamlChunker {
/**
* Chunk an object intelligently
*/
-  private chunkObject(obj: Record<string, any>, path: string[]): Chunk[] {
+  private chunkObject(obj: JsonObject, path: string[]): Chunk[] {
const chunks: Chunk[] = []
const entries = Object.entries(obj)
@@ -213,7 +218,7 @@ export class JsonYamlChunker {
return chunks
}
-    let currentObj: Record<string, any> = {}
+    let currentObj: JsonObject = {}
let currentTokens = 0
let currentKeys: string[] = []

View File

@@ -110,10 +110,12 @@ export class TextChunker {
chunks.push(currentChunk.trim())
}
-      // Start new chunk with current part
      // If part itself is too large, split it further
      if (this.estimateTokens(part) > this.chunkSize) {
-        chunks.push(...(await this.splitRecursively(part, separatorIndex + 1)))
+        const subChunks = await this.splitRecursively(part, separatorIndex + 1)
+        for (const subChunk of subChunks) {
+          chunks.push(subChunk)
+        }
currentChunk = ''
} else {
currentChunk = part

View File

@@ -178,6 +178,7 @@ export const env = createEnv({
KB_CONFIG_BATCH_SIZE: z.number().optional().default(2000), // Chunks to process per embedding batch
KB_CONFIG_DELAY_BETWEEN_BATCHES: z.number().optional().default(0), // Delay between batches in ms (0 for max speed)
KB_CONFIG_DELAY_BETWEEN_DOCUMENTS: z.number().optional().default(50), // Delay between documents in ms
KB_CONFIG_CHUNK_CONCURRENCY: z.number().optional().default(10), // Concurrent PDF chunk OCR processing
// Real-time Communication
SOCKET_SERVER_URL: z.string().url().optional(), // WebSocket server URL for real-time features

View File

@@ -17,8 +17,6 @@ export class DocParser implements FileParser {
throw new Error(`File not found: ${filePath}`)
}
logger.info(`Parsing DOC file: ${filePath}`)
const buffer = await readFile(filePath)
return this.parseBuffer(buffer)
} catch (error) {
@@ -29,53 +27,80 @@ export class DocParser implements FileParser {
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing DOC buffer, size:', buffer.length)
+      if (!buffer || buffer.length === 0) {
+        throw new Error('Empty buffer provided')
+      }
-      let parseOfficeAsync
-      try {
-        const officeParser = await import('officeparser')
-        parseOfficeAsync = officeParser.parseOfficeAsync
-      } catch (importError) {
-        logger.warn('officeparser not available, using fallback extraction')
-        return this.fallbackExtraction(buffer)
-      }
-      try {
-        const result = await parseOfficeAsync(buffer)
-        if (!result) {
-          throw new Error('officeparser returned no result')
-        }
-        const resultString = typeof result === 'string' ? result : String(result)
-        const content = sanitizeTextForUTF8(resultString.trim())
-        logger.info('DOC parsing completed successfully with officeparser')
-        return {
-          content: content,
-          metadata: {
-            characterCount: content.length,
-            extractionMethod: 'officeparser',
-          },
-        }
-      } catch (extractError) {
-        logger.warn('officeparser failed, using fallback:', extractError)
-        return this.fallbackExtraction(buffer)
-      }
+      try {
+        const officeParser = await import('officeparser')
+        const result = await officeParser.parseOfficeAsync(buffer)
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                characterCount: content.length,
+                extractionMethod: 'officeparser',
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed, trying mammoth:', officeError)
+      }
+      try {
+        const mammoth = await import('mammoth')
+        const result = await mammoth.extractRawText({ buffer })
+        if (result.value && result.value.trim().length > 0) {
+          const content = sanitizeTextForUTF8(result.value.trim())
+          return {
+            content,
+            metadata: {
+              characterCount: content.length,
+              extractionMethod: 'mammoth',
+              messages: result.messages,
+            },
+          }
+        }
+      } catch (mammothError) {
+        logger.warn('mammoth failed:', mammothError)
+      }
+      return this.fallbackExtraction(buffer)
    } catch (error) {
-      logger.error('DOC buffer parsing error:', error)
+      logger.error('DOC parsing error:', error)
      throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`)
    }
  }
private fallbackExtraction(buffer: Buffer): FileParseResult {
logger.info('Using fallback text extraction for DOC file')
const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf
if (!isBinaryDoc) {
const textContent = buffer.toString('utf8').trim()
if (textContent.length > 0) {
const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0
const isProbablyText = printableChars / textContent.length > 0.9
if (isProbablyText) {
return {
content: sanitizeTextForUTF8(textContent),
metadata: {
extractionMethod: 'plaintext-fallback',
characterCount: textContent.length,
warning: 'File is not a valid DOC format, extracted as plain text',
},
}
}
}
}
const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))

View File

@@ -2,13 +2,18 @@ import { readFile } from 'fs/promises'
import { createLogger } from '@sim/logger'
import mammoth from 'mammoth'
import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils'
const logger = createLogger('DocxParser')
// Define interface for mammoth result
interface MammothMessage {
type: 'warning' | 'error'
message: string
}
interface MammothResult {
value: string
messages: any[]
messages: MammothMessage[]
}
export class DocxParser implements FileParser {
@@ -19,7 +24,6 @@ export class DocxParser implements FileParser {
}
const buffer = await readFile(filePath)
return this.parseBuffer(buffer)
} catch (error) {
logger.error('DOCX file error:', error)
@@ -29,26 +33,74 @@ export class DocxParser implements FileParser {
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing buffer, size:', buffer.length)
+      if (!buffer || buffer.length === 0) {
+        throw new Error('Empty buffer provided')
+      }
-      const result = await mammoth.extractRawText({ buffer })
-      let htmlResult: MammothResult = { value: '', messages: [] }
-      try {
-        htmlResult = await mammoth.convertToHtml({ buffer })
-      } catch (htmlError) {
-        logger.warn('HTML conversion warning:', htmlError)
-      }
-      return {
-        content: result.value,
-        metadata: {
-          extractionMethod: 'mammoth',
-          messages: [...result.messages, ...htmlResult.messages],
-          html: htmlResult.value,
-        },
-      }
+      try {
+        const result = await mammoth.extractRawText({ buffer })
+        if (result.value && result.value.trim().length > 0) {
+          let htmlResult: MammothResult = { value: '', messages: [] }
+          try {
+            htmlResult = await mammoth.convertToHtml({ buffer })
+          } catch {
+            // HTML conversion is optional
+          }
+          return {
+            content: sanitizeTextForUTF8(result.value),
+            metadata: {
+              extractionMethod: 'mammoth',
+              messages: [...result.messages, ...htmlResult.messages],
+              html: htmlResult.value,
+            },
+          }
+        }
+      } catch (mammothError) {
+        logger.warn('mammoth failed, trying officeparser:', mammothError)
+      }
+      try {
+        const officeParser = await import('officeparser')
+        const result = await officeParser.parseOfficeAsync(buffer)
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                extractionMethod: 'officeparser',
+                characterCount: content.length,
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed:', officeError)
+      }
+      const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b
+      if (!isZipFile) {
+        const textContent = buffer.toString('utf8').trim()
+        if (textContent.length > 0) {
+          return {
+            content: sanitizeTextForUTF8(textContent),
+            metadata: {
+              extractionMethod: 'plaintext-fallback',
+              characterCount: textContent.length,
+              warning: 'File is not a valid DOCX format, extracted as plain text',
+            },
+          }
+        }
+      }
+      throw new Error('Failed to extract text from DOCX file')
    } catch (error) {
-      logger.error('DOCX buffer parsing error:', error)
+      logger.error('DOCX parsing error:', error)
throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`)
}
}

View File

@@ -1,6 +1,22 @@
export interface FileParseMetadata {
characterCount?: number
pageCount?: number
extractionMethod?: string
warning?: string
messages?: unknown[]
html?: string
type?: string
headers?: string[]
totalRows?: number
rowCount?: number
sheetNames?: string[]
source?: string
[key: string]: unknown
}
export interface FileParseResult {
content: string
-  metadata?: Record<string, any>
+  metadata?: FileParseMetadata
}
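For illustration, a parse result under the typed metadata might look like this (values are made up):

const result: FileParseResult = {
  content: 'Quarterly revenue by region...',
  metadata: { extractionMethod: 'mammoth', characterCount: 30 },
}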
export interface FileParser {

View File

@@ -1,8 +1,10 @@
import { createLogger } from '@sim/logger'
import { PDFDocument } from 'pdf-lib'
import { getBYOKKey } from '@/lib/api-key/byok'
import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers'
import { env } from '@/lib/core/config/env'
import { parseBuffer, parseFile } from '@/lib/file-parsers'
import type { FileParseMetadata } from '@/lib/file-parsers/types'
import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
import { StorageService } from '@/lib/uploads'
import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
@@ -15,6 +17,8 @@ const TIMEOUTS = {
MISTRAL_OCR_API: 120000,
} as const
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY
type OCRResult = {
success: boolean
error?: string
@@ -36,6 +40,61 @@ type OCRRequestBody = {
include_image_base64: boolean
}
const MISTRAL_MAX_PAGES = 1000
/**
* Get page count from a PDF buffer using unpdf
*/
async function getPdfPageCount(buffer: Buffer): Promise<number> {
try {
const { getDocumentProxy } = await import('unpdf')
const uint8Array = new Uint8Array(buffer)
const pdf = await getDocumentProxy(uint8Array)
return pdf.numPages
} catch (error) {
logger.warn('Failed to get PDF page count:', error)
return 0
}
}
/**
* Split a PDF buffer into multiple smaller PDFs
* Returns an array of PDF buffers, each with at most maxPages pages
*/
async function splitPdfIntoChunks(
pdfBuffer: Buffer,
maxPages: number
): Promise<{ buffer: Buffer; startPage: number; endPage: number }[]> {
const sourcePdf = await PDFDocument.load(pdfBuffer)
const totalPages = sourcePdf.getPageCount()
if (totalPages <= maxPages) {
return [{ buffer: pdfBuffer, startPage: 0, endPage: totalPages - 1 }]
}
const chunks: { buffer: Buffer; startPage: number; endPage: number }[] = []
for (let startPage = 0; startPage < totalPages; startPage += maxPages) {
const endPage = Math.min(startPage + maxPages - 1, totalPages - 1)
const pageCount = endPage - startPage + 1
const newPdf = await PDFDocument.create()
const pageIndices = Array.from({ length: pageCount }, (_, i) => startPage + i)
const copiedPages = await newPdf.copyPages(sourcePdf, pageIndices)
copiedPages.forEach((page) => newPdf.addPage(page))
const pdfBytes = await newPdf.save()
chunks.push({
buffer: Buffer.from(pdfBytes),
startPage,
endPage,
})
}
return chunks
}
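Calling the splitter is straightforward; a hedged usage sketch with a hypothetical input file, writing each part back to disk:

import { readFile, writeFile } from 'fs/promises'

// Split ./big.pdf into parts of at most 1000 pages each (file path is hypothetical).
const source = await readFile('./big.pdf')
const parts = await splitPdfIntoChunks(source, 1000)
for (const part of parts) {
  await writeFile(`./part-${part.startPage + 1}-${part.endPage + 1}.pdf`, part.buffer)
}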
type AzureOCRResponse = {
pages?: OCRPage[]
[key: string]: unknown
@@ -81,7 +140,7 @@ export async function processDocument(
const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined
let chunks: Chunk[]
-  const metadata = 'metadata' in parseResult ? parseResult.metadata : {}
+  const metadata: FileParseMetadata = parseResult.metadata ?? {}
const isJsonYaml =
metadata.type === 'json' ||
@@ -97,10 +156,11 @@ export async function processDocument(
})
} else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
logger.info('Using structured data chunker for spreadsheet/CSV content')
+    const rowCount = metadata.totalRows ?? metadata.rowCount
    chunks = await StructuredDataChunker.chunkStructuredData(content, {
      chunkSize,
      headers: metadata.headers,
-      totalRows: metadata.totalRows || metadata.rowCount,
+      totalRows: typeof rowCount === 'number' ? rowCount : undefined,
sheetName: metadata.sheetNames?.[0],
})
} else {
@@ -153,7 +213,7 @@ async function parseDocument(
content: string
processingMethod: 'file-parser' | 'mistral-ocr'
cloudUrl?: string
-  metadata?: any
+  metadata?: FileParseMetadata
}> {
const isPDF = mimeType === 'application/pdf'
const hasAzureMistralOCR =
@@ -165,7 +225,7 @@ async function parseDocument(
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
if (hasAzureMistralOCR) {
logger.info(`Using Azure Mistral OCR: ${filename}`)
-      return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId)
+      return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
}
if (hasMistralOCR) {
@@ -188,13 +248,32 @@ async function handleFileForOCR(
const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')
if (isExternalHttps) {
-    return { httpsUrl: fileUrl }
+    if (mimeType === 'application/pdf') {
+      logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
+      try {
+        const buffer = await downloadFileWithTimeout(fileUrl)
+        logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`)
+        return { httpsUrl: fileUrl, buffer }
+      } catch (error) {
+        logger.warn(
+          `handleFileForOCR: Failed to download external PDF for page count check, proceeding without batching`,
+          {
+            error: error instanceof Error ? error.message : String(error),
+          }
+        )
+        return { httpsUrl: fileUrl, buffer: undefined }
+      }
+    }
+    logger.info(`handleFileForOCR: Using external URL directly`)
+    return { httpsUrl: fileUrl, buffer: undefined }
  }
logger.info(`Uploading "${filename}" to cloud storage for OCR`)
const buffer = await downloadFileWithTimeout(fileUrl)
logger.info(`Downloaded ${filename}: ${buffer.length} bytes`)
try {
const metadata: Record<string, string> = {
originalName: filename,
@@ -224,8 +303,7 @@ async function handleFileForOCR(
900 // 15 minutes
)
logger.info(`Successfully uploaded for OCR: ${cloudResult.key}`)
-    return { httpsUrl, cloudUrl: httpsUrl }
+    return { httpsUrl, cloudUrl: httpsUrl, buffer }
} catch (uploadError) {
const message = uploadError instanceof Error ? uploadError.message : 'Unknown error'
throw new Error(`Cloud upload failed: ${message}. Cloud upload is required for OCR.`)
@@ -321,13 +399,7 @@ async function makeOCRRequest(
}
}
-async function parseWithAzureMistralOCR(
-  fileUrl: string,
-  filename: string,
-  mimeType: string,
-  userId?: string,
-  workspaceId?: string | null
-) {
+async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
validateOCRConfig(
env.OCR_AZURE_API_KEY,
env.OCR_AZURE_ENDPOINT,
@@ -336,6 +408,19 @@ async function parseWithAzureMistralOCR(
)
const fileBuffer = await downloadFileForBase64(fileUrl)
if (mimeType === 'application/pdf') {
const pageCount = await getPdfPageCount(fileBuffer)
if (pageCount > MISTRAL_MAX_PAGES) {
logger.info(
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
`Falling back to file parser.`
)
return parseWithFileParser(fileUrl, filename, mimeType)
}
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
}
const base64Data = fileBuffer.toString('base64')
const dataUri = `data:${mimeType};base64,${base64Data}`
@@ -374,17 +459,7 @@ async function parseWithAzureMistralOCR(
message: error instanceof Error ? error.message : String(error),
})
-    const fallbackMistralKey = await getMistralApiKey(workspaceId)
-    if (fallbackMistralKey) {
-      return parseWithMistralOCR(
-        fileUrl,
-        filename,
-        mimeType,
-        userId,
-        workspaceId,
-        fallbackMistralKey
-      )
-    }
+    logger.info(`Falling back to file parser: ${filename}`)
    return parseWithFileParser(fileUrl, filename, mimeType)
}
}
@@ -406,17 +481,54 @@ async function parseWithMistralOCR(
throw new Error('Mistral parser tool not configured')
}
-  const { httpsUrl, cloudUrl } = await handleFileForOCR(
+  const { httpsUrl, cloudUrl, buffer } = await handleFileForOCR(
    fileUrl,
    filename,
    mimeType,
    userId,
    workspaceId
  )
logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`)
+  let pageCount = 0
+  if (mimeType === 'application/pdf' && buffer) {
+    pageCount = await getPdfPageCount(buffer)
+    logger.info(`PDF page count for ${filename}: ${pageCount}`)
+  }
+  const needsBatching = pageCount > MISTRAL_MAX_PAGES
+  if (needsBatching && buffer) {
+    logger.info(
+      `PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.`
+    )
+    return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl)
+  }
  const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const }
  try {
-    const response = await retryWithExponentialBackoff(
+    const response = await executeMistralOCRRequest(params, userId)
+    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
+    const content = processOCRContent(result, filename)
+    return { content, processingMethod: 'mistral-ocr' as const, cloudUrl }
+  } catch (error) {
+    logger.error(`Mistral OCR failed for ${filename}:`, {
+      message: error instanceof Error ? error.message : String(error),
+    })
+    logger.info(`Falling back to file parser: ${filename}`)
+    return parseWithFileParser(fileUrl, filename, mimeType)
+  }
+}
+
+async function executeMistralOCRRequest(
+  params: { filePath: string; apiKey: string; resultType: 'text' },
+  userId?: string
+): Promise<Response> {
+  return retryWithExponentialBackoff(
async () => {
let url =
typeof mistralParserTool.request!.url === 'function'
@@ -449,25 +561,167 @@ async function parseWithMistralOCR(
    },
    { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 }
  )
-    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
-    const content = processOCRContent(result, filename)
-    return { content, processingMethod: 'mistral-ocr' as const, cloudUrl }
-  } catch (error) {
-    logger.error(`Mistral OCR failed for ${filename}:`, {
-      message: error instanceof Error ? error.message : String(error),
-    })
-    logger.info(`Falling back to file parser: ${filename}`)
-    return parseWithFileParser(fileUrl, filename, mimeType)
-  }
+}
+
+/**
+ * Process a single PDF chunk: upload to S3, OCR, cleanup
+ */
+async function processChunk(
+  chunk: { buffer: Buffer; startPage: number; endPage: number },
+  chunkIndex: number,
+  totalChunks: number,
+  filename: string,
+  apiKey: string,
+  userId?: string
+): Promise<{ index: number; content: string | null }> {
+  const chunkPageCount = chunk.endPage - chunk.startPage + 1
+  logger.info(
+    `Processing chunk ${chunkIndex + 1}/${totalChunks} (pages ${chunk.startPage + 1}-${chunk.endPage + 1}, ${chunkPageCount} pages)`
+  )
+  let uploadedKey: string | null = null
+  try {
+    // Upload the chunk to S3
+    const timestamp = Date.now()
+    const uniqueId = Math.random().toString(36).substring(2, 9)
+    const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_')
+    const chunkKey = `kb/${timestamp}-${uniqueId}-chunk${chunkIndex + 1}-${safeFileName}`
+    const metadata: Record<string, string> = {
+      originalName: `${filename}_chunk${chunkIndex + 1}`,
+      uploadedAt: new Date().toISOString(),
+      purpose: 'knowledge-base',
+      ...(userId && { userId }),
+    }
+    const uploadResult = await StorageService.uploadFile({
+      file: chunk.buffer,
+      fileName: `${filename}_chunk${chunkIndex + 1}`,
+      contentType: 'application/pdf',
+      context: 'knowledge-base',
+      customKey: chunkKey,
+      metadata,
+    })
+    uploadedKey = uploadResult.key
+    const chunkUrl = await StorageService.generatePresignedDownloadUrl(
+      uploadResult.key,
+      'knowledge-base',
+      900 // 15 minutes
+    )
+    logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`)
+    // Process the chunk with Mistral OCR
+    const params = {
+      filePath: chunkUrl,
+      apiKey,
+      resultType: 'text' as const,
+    }
+    const response = await executeMistralOCRRequest(params, userId)
+    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
+    if (result.success && result.output?.content) {
+      logger.info(`Chunk ${chunkIndex + 1}/${totalChunks} completed successfully`)
+      return { index: chunkIndex, content: result.output.content }
+    }
+    logger.warn(`Chunk ${chunkIndex + 1}/${totalChunks} returned no content`)
+    return { index: chunkIndex, content: null }
+  } catch (error) {
+    logger.error(`Chunk ${chunkIndex + 1}/${totalChunks} failed:`, {
+      message: error instanceof Error ? error.message : String(error),
+    })
+    return { index: chunkIndex, content: null }
+  } finally {
+    // Clean up the chunk file from S3 after processing
+    if (uploadedKey) {
+      try {
+        await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' })
+        logger.info(`Cleaned up chunk ${chunkIndex + 1} from S3`)
+      } catch (deleteError) {
+        logger.warn(`Failed to clean up chunk ${chunkIndex + 1} from S3:`, {
+          message: deleteError instanceof Error ? deleteError.message : String(deleteError),
+        })
+      }
+    }
+  }
+}
async function processMistralOCRInBatches(
filename: string,
apiKey: string,
pdfBuffer: Buffer,
userId?: string,
cloudUrl?: string
): Promise<{
content: string
processingMethod: 'mistral-ocr'
cloudUrl?: string
}> {
const totalPages = await getPdfPageCount(pdfBuffer)
logger.info(
`Splitting ${filename} (${totalPages} pages) into chunks of ${MISTRAL_MAX_PAGES} pages`
)
const pdfChunks = await splitPdfIntoChunks(pdfBuffer, MISTRAL_MAX_PAGES)
logger.info(
`Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}`
)
// Process chunks concurrently with limited concurrency
const results: { index: number; content: string | null }[] = []
for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) {
const batch = pdfChunks.slice(i, i + MAX_CONCURRENT_CHUNKS)
const batchPromises = batch.map((chunk, batchIndex) =>
processChunk(chunk, i + batchIndex, pdfChunks.length, filename, apiKey, userId)
)
const batchResults = await Promise.all(batchPromises)
for (const result of batchResults) {
results.push(result)
}
logger.info(
`Completed batch ${Math.floor(i / MAX_CONCURRENT_CHUNKS) + 1}/${Math.ceil(pdfChunks.length / MAX_CONCURRENT_CHUNKS)}`
)
}
// Sort by index to maintain page order and filter out nulls
const sortedResults = results
.sort((a, b) => a.index - b.index)
.filter((r) => r.content !== null)
.map((r) => r.content as string)
if (sortedResults.length === 0) {
// Don't fall back to file parser for large PDFs - it produces poor results
// Better to fail clearly than return low-quality extraction
throw new Error(
`OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
`Large PDFs require OCR - file parser fallback would produce poor results.`
)
}
const combinedContent = sortedResults.join('\n\n')
logger.info(
`Successfully processed ${sortedResults.length}/${pdfChunks.length} chunks for ${filename}`
)
return {
content: combinedContent,
processingMethod: 'mistral-ocr',
cloudUrl,
}
}
async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
try {
let content: string
-    let metadata: any = {}
+    let metadata: FileParseMetadata = {}
if (fileUrl.startsWith('data:')) {
content = await parseDataURI(fileUrl, filename, mimeType)
@@ -513,7 +767,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
async function parseHttpFile(
fileUrl: string,
filename: string
-): Promise<{ content: string; metadata?: any }> {
+): Promise<{ content: string; metadata?: FileParseMetadata }> {
const buffer = await downloadFileWithTimeout(fileUrl)
const extension = filename.split('.').pop()?.toLowerCase()

View File

@@ -212,7 +212,6 @@ export async function processDocumentTags(
return result
}
-  // Fetch existing tag definitions
const existingDefinitions = await db
.select()
.from(knowledgeBaseTagDefinitions)
@@ -220,18 +219,15 @@ export async function processDocumentTags(
const existingByName = new Map(existingDefinitions.map((def) => [def.displayName, def]))
-  // First pass: collect all validation errors
const undefinedTags: string[] = []
const typeErrors: string[] = []
for (const tag of tagData) {
-    // Skip if no tag name
if (!tag.tagName?.trim()) continue
const tagName = tag.tagName.trim()
const fieldType = tag.fieldType || 'text'
-    // For boolean, check if value is defined; for others, check if value is non-empty
const hasValue =
fieldType === 'boolean'
? tag.value !== undefined && tag.value !== null && tag.value !== ''
@@ -239,14 +235,12 @@ export async function processDocumentTags(
if (!hasValue) continue
-    // Check if tag exists
const existingDef = existingByName.get(tagName)
if (!existingDef) {
undefinedTags.push(tagName)
continue
}
-    // Validate value type using shared validation
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const actualFieldType = existingDef.fieldType || fieldType
const validationError = validateTagValue(tagName, String(rawValue), actualFieldType)
@@ -255,7 +249,6 @@ export async function processDocumentTags(
}
}
-  // Throw combined error if there are any validation issues
if (undefinedTags.length > 0 || typeErrors.length > 0) {
const errorParts: string[] = []
@@ -270,7 +263,6 @@ export async function processDocumentTags(
throw new Error(errorParts.join('\n'))
}
-  // Second pass: process valid tags
for (const tag of tagData) {
if (!tag.tagName?.trim()) continue
@@ -285,14 +277,13 @@ export async function processDocumentTags(
if (!hasValue) continue
const existingDef = existingByName.get(tagName)
-      if (!existingDef) continue // Already validated above
+      if (!existingDef) continue
const targetSlot = existingDef.tagSlot
const actualFieldType = existingDef.fieldType || fieldType
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const stringValue = String(rawValue).trim()
-    // Assign value to the slot with proper type conversion (values already validated)
if (actualFieldType === 'boolean') {
setTagValue(result, targetSlot, parseBooleanValue(stringValue) ?? false)
} else if (actualFieldType === 'number') {
@@ -440,7 +431,6 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Status updated to 'processing', starting document processor`)
-  // Use KB's chunkingConfig as fallback if processingOptions not provided
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
await withTimeout(
@@ -469,7 +459,6 @@ export async function processDocumentAsync(
`[${documentId}] Document parsed successfully, generating embeddings for ${processed.chunks.length} chunks`
)
-  // Generate embeddings in batches for large documents
const chunkTexts = processed.chunks.map((chunk) => chunk.text)
const embeddings: number[][] = []
@@ -485,7 +474,9 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Processing embedding batch ${batchNum}/${totalBatches}`)
const batchEmbeddings = await generateEmbeddings(batch, undefined, kb[0].workspaceId)
-      embeddings.push(...batchEmbeddings)
+      for (const emb of batchEmbeddings) {
+        embeddings.push(emb)
+      }
}
}
@@ -562,23 +553,18 @@ export async function processDocumentAsync(
}))
await db.transaction(async (tx) => {
-      // Insert embeddings in batches for large documents
      if (embeddingRecords.length > 0) {
-        const batchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
-        const totalBatches = Math.ceil(embeddingRecords.length / batchSize)
        await tx.delete(embedding).where(eq(embedding.documentId, documentId))
-        logger.info(
-          `[${documentId}] Inserting ${embeddingRecords.length} embeddings in ${totalBatches} batches`
-        )
-        for (let i = 0; i < embeddingRecords.length; i += batchSize) {
-          const batch = embeddingRecords.slice(i, i + batchSize)
-          const batchNum = Math.floor(i / batchSize) + 1
+        const insertBatchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
+        const batches: (typeof embeddingRecords)[] = []
+        for (let i = 0; i < embeddingRecords.length; i += insertBatchSize) {
+          batches.push(embeddingRecords.slice(i, i + insertBatchSize))
+        }
+        logger.info(`[${documentId}] Inserting ${embeddingRecords.length} embeddings`)
+        for (const batch of batches) {
          await tx.insert(embedding).values(batch)
-          logger.info(
-            `[${documentId}] Inserted batch ${batchNum}/${totalBatches} (${batch.length} records)`
-          )
        }
      }
@@ -689,11 +675,9 @@ export async function createDocumentRecords(
requestId: string,
userId?: string
): Promise<DocumentData[]> {
-  // Check storage limits before creating documents
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)
-    // Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -713,7 +697,7 @@ export async function createDocumentRecords(
for (const docData of documents) {
const documentId = randomUUID()
-    let processedTags: Record<string, any> = {}
+    let processedTags: Partial<ProcessedDocumentTags> = {}
if (docData.documentTagsData) {
try {
@@ -722,7 +706,6 @@ export async function createDocumentRecords(
processedTags = await processDocumentTags(knowledgeBaseId, tagData, requestId)
}
} catch (error) {
-      // Re-throw validation errors, only catch JSON parse errors
if (error instanceof SyntaxError) {
logger.warn(`[${requestId}] Failed to parse documentTagsData for bulk document:`, error)
} else {
@@ -791,7 +774,6 @@ export async function createDocumentRecords(
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)
-    // Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -1079,7 +1061,7 @@ export async function createSingleDocument(
const now = new Date()
// Process structured tag data if provided
-  let processedTags: Record<string, any> = {
+  let processedTags: ProcessedDocumentTags = {
// Text tags (7 slots)
tag1: documentData.tag1 ?? null,
tag2: documentData.tag2 ?? null,
@@ -1555,23 +1537,30 @@ export async function updateDocument(
return value || null
}
+    // Type-safe access to tag slots in updateData
+    type UpdateDataWithTags = typeof updateData & Record<TagSlot, string | undefined>
+    const typedUpdateData = updateData as UpdateDataWithTags
    ALL_TAG_SLOTS.forEach((slot: TagSlot) => {
-      const updateValue = (updateData as any)[slot]
+      const updateValue = typedUpdateData[slot]
      if (updateValue !== undefined) {
-        ;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue)
+        ;(dbUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[slot] =
+          convertTagValue(slot, updateValue)
      }
    })
    await db.transaction(async (tx) => {
      await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId))
-      const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined)
+      const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined)
      if (hasTagUpdates) {
-        const embeddingUpdateData: Record<string, any> = {}
+        const embeddingUpdateData: Partial<ProcessedDocumentTags> = {}
        ALL_TAG_SLOTS.forEach((field) => {
-          if ((updateData as any)[field] !== undefined) {
-            embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field])
+          if (typedUpdateData[field] !== undefined) {
+            ;(embeddingUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[
+              field
+            ] = convertTagValue(field, typedUpdateData[field])
          }
        })

View File

@@ -14,7 +14,7 @@ export interface RetryOptions {
initialDelayMs?: number
maxDelayMs?: number
backoffMultiplier?: number
-  retryCondition?: (error: RetryableError) => boolean
+  retryCondition?: (error: unknown) => boolean
}
export interface RetryResult<T> {
@@ -30,11 +30,18 @@ function hasStatus(
return typeof error === 'object' && error !== null && 'status' in error
}
function isRetryableErrorType(error: unknown): error is RetryableError {
if (!error) return false
if (error instanceof Error) return true
if (typeof error === 'object' && ('status' in error || 'message' in error)) return true
return false
}
/**
* Default retry condition for rate limiting errors
*/
-export function isRetryableError(error: RetryableError): boolean {
-  if (!error) return false
+export function isRetryableError(error: unknown): boolean {
+  if (!isRetryableErrorType(error)) return false
// Check for rate limiting status codes
if (
@@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean {
}
// Check for rate limiting in error messages
-  const errorMessage = error.message || error.toString()
+  const errorMessage = error instanceof Error ? error.message : String(error)
const rateLimitKeywords = [
'rate limit',
'rate_limit',
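With retryCondition widened to unknown, callers can hand any thrown value to the predicate. A minimal usage sketch of retryWithExponentialBackoff with these options (the endpoint is a placeholder):

const data = await retryWithExponentialBackoff(
  async () => {
    const res = await fetch('https://example.com/api') // placeholder URL
    if (!res.ok) throw Object.assign(new Error('request failed'), { status: res.status })
    return res.json()
  },
  {
    maxRetries: 3,
    initialDelayMs: 1000,
    maxDelayMs: 10000,
    retryCondition: isRetryableError, // retry only on rate-limit style failures
  }
)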

View File

@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
import { getBYOKKey } from '@/lib/api-key/byok'
import { env } from '@/lib/core/config/env'
import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
-import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization'
+import { batchByTokenLimit } from '@/lib/tokenization'
const logger = createLogger('EmbeddingUtils')
@@ -26,6 +26,20 @@ interface EmbeddingConfig {
modelName: string
}
interface EmbeddingResponseItem {
embedding: number[]
index: number
}
interface EmbeddingAPIResponse {
data: EmbeddingResponseItem[]
model: string
usage: {
prompt_tokens: number
total_tokens: number
}
}
async function getEmbeddingConfig(
embeddingModel = 'text-embedding-3-small',
workspaceId?: string | null
@@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom
)
}
-    const data = await response.json()
-    return data.data.map((item: any) => item.embedding)
+    const data: EmbeddingAPIResponse = await response.json()
+    return data.data.map((item) => item.embedding)
},
{
maxRetries: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
-      retryCondition: (error: any) => {
+      retryCondition: (error: unknown) => {
if (error instanceof EmbeddingAPIError) {
return error.status === 429 || error.status >= 500
}
@@ -153,44 +167,27 @@ export async function generateEmbeddings(
): Promise<number[][]> {
const config = await getEmbeddingConfig(embeddingModel, workspaceId)
-  logger.info(
-    `Using ${config.useAzure ? 'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)`
-  )
  const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel)
-  logger.info(
-    `Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch, ${MAX_CONCURRENT_BATCHES} concurrent)`
-  )
  const batchResults = await processWithConcurrency(
    batches,
    MAX_CONCURRENT_BATCHES,
    async (batch, i) => {
-      const batchTokenCount = getTotalTokenCount(batch, embeddingModel)
-      logger.info(
-        `Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens`
-      )
      try {
-        const batchEmbeddings = await callEmbeddingAPI(batch, config)
-        logger.info(
-          `Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}`
-        )
-        return batchEmbeddings
+        return await callEmbeddingAPI(batch, config)
      } catch (error) {
-        logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error)
+        logger.error(`Failed to generate embeddings for batch ${i + 1}/${batches.length}:`, error)
        throw error
      }
    }
  )
-  const allEmbeddings = batchResults.flat()
-  logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`)
+  const allEmbeddings: number[][] = []
+  for (const batch of batchResults) {
+    for (const emb of batch) {
+      allEmbeddings.push(emb)
+    }
+  }
return allEmbeddings
}
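processWithConcurrency is not shown in this diff; a plausible shape for such a helper, consistent with how it is called above (an assumption, not the repo's actual implementation):

// Run worker over items with at most `limit` in flight, preserving result order.
async function processWithConcurrency<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(items.length)
  let next = 0
  const run = async (): Promise<void> => {
    while (next < items.length) {
      const i = next++ // claim the next index synchronously
      results[i] = await worker(items[i], i)
    }
  }
  await Promise.all(Array.from({ length: Math.min(limit, items.length) }, run))
  return results
}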

View File

@@ -127,24 +127,6 @@ export function truncateToTokenLimit(
}
}
-/**
- * Get token count for multiple texts (for batching decisions)
- * Returns array of token counts in same order as input
- */
-export function getTokenCountsForBatch(
-  texts: string[],
-  modelName = 'text-embedding-3-small'
-): number[] {
-  return texts.map((text) => getAccurateTokenCount(text, modelName))
-}
-/**
- * Calculate total tokens across multiple texts
- */
-export function getTotalTokenCount(texts: string[], modelName = 'text-embedding-3-small'): number {
-  return texts.reduce((total, text) => total + getAccurateTokenCount(text, modelName), 0)
-}
/**
* Batch texts by token count to stay within API limits
* Returns array of batches where each batch's total tokens <= maxTokensPerBatch
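Going by that doc comment, a greedy implementation of batchByTokenLimit could look like this sketch (the real one lives in @/lib/tokenization):

function batchByTokenLimit(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
): string[][] {
  const batches: string[][] = []
  let current: string[] = []
  let currentTokens = 0
  for (const text of texts) {
    const tokens = getAccurateTokenCount(text, modelName)
    // Close the current batch when this text would push it over the limit
    if (current.length > 0 && currentTokens + tokens > maxTokensPerBatch) {
      batches.push(current)
      current = []
      currentTokens = 0
    }
    current.push(text)
    currentTokens += tokens
  }
  if (current.length > 0) batches.push(current)
  return batches
}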

View File

@@ -12,8 +12,6 @@ export {
estimateOutputTokens,
estimateTokenCount,
getAccurateTokenCount,
-  getTokenCountsForBatch,
-  getTotalTokenCount,
truncateToTokenLimit,
} from '@/lib/tokenization/estimators'
export { processStreamingBlockLog, processStreamingBlockLogs } from '@/lib/tokenization/streaming'

View File

@@ -127,6 +127,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",

View File

@@ -17,7 +17,7 @@ export default defineConfig({
build: {
extensions: [
additionalPackages({
-      packages: ['unpdf'],
+      packages: ['unpdf', 'pdf-lib'],
}),
],
},

View File

@@ -1,5 +1,6 @@
{
"lockfileVersion": 1,
"configVersion": 0,
"workspaces": {
"": {
"name": "simstudio",
@@ -11,7 +12,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2",
"turbo": "2.7.3",
},
},
"apps/docs": {
@@ -156,6 +157,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",
@@ -912,6 +914,10 @@
"@orama/orama": ["@orama/orama@3.1.18", "", {}, "sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA=="],
"@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="],
"@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="],
"@peculiar/asn1-android": ["@peculiar/asn1-android@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-cBRCKtYPF7vJGN76/yG8VbxRcHLPF3HnkoHhKOZeHpoVtbMYfY9ROKtH3DtYUY9m8uI1Mh47PRhHf2hSK3xcSQ=="],
"@peculiar/asn1-cms": ["@peculiar/asn1-cms@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "@peculiar/asn1-x509": "^2.6.0", "@peculiar/asn1-x509-attr": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA=="],
@@ -2864,6 +2870,8 @@
"pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="],
"pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="],
"pdfjs-dist": ["pdfjs-dist@5.4.449", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.81" } }, "sha512-CegnUaT0QwAyQMS+7o2POr4wWUNNe8VaKKlcuoRHeYo98cVnqPpwOXNSx6Trl6szH02JrRcsPgletV6GmF3LtQ=="],
"peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="],
@@ -3362,19 +3370,19 @@
"tunnel-agent": ["tunnel-agent@0.6.0", "", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w=="],
"turbo": ["turbo@2.7.2", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.2", "turbo-darwin-arm64": "2.7.2", "turbo-linux-64": "2.7.2", "turbo-linux-arm64": "2.7.2", "turbo-windows-64": "2.7.2", "turbo-windows-arm64": "2.7.2" }, "bin": { "turbo": "bin/turbo" } }, "sha512-5JIA5aYBAJSAhrhbyag1ZuMSgUZnHtI+Sq3H8D3an4fL8PeF+L1yYvbEJg47akP1PFfATMf5ehkqFnxfkmuwZQ=="],
"turbo": ["turbo@2.7.3", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.3", "turbo-darwin-arm64": "2.7.3", "turbo-linux-64": "2.7.3", "turbo-linux-arm64": "2.7.3", "turbo-windows-64": "2.7.3", "turbo-windows-arm64": "2.7.3" }, "bin": { "turbo": "bin/turbo" } }, "sha512-+HjKlP4OfYk+qzvWNETA3cUO5UuK6b5MSc2UJOKyvBceKucQoQGb2g7HlC2H1GHdkfKrk4YF1VPvROkhVZDDLQ=="],
"turbo-darwin-64": ["turbo-darwin-64@2.7.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-dxY3X6ezcT5vm3coK6VGixbrhplbQMwgNsCsvZamS/+/6JiebqW9DKt4NwpgYXhDY2HdH00I7FWs3wkVuan4rA=="],
"turbo-darwin-64": ["turbo-darwin-64@2.7.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-aZHhvRiRHXbJw1EcEAq4aws1hsVVUZ9DPuSFaq9VVFAKCup7niIEwc22glxb7240yYEr1vLafdQ2U294Vcwz+w=="],
"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1bXmuwPLqNFt3mzrtYcVx1sdJ8UYb124Bf48nIgcpMCGZy3kDhgxNv1503kmuK/37OGOZbsWSQFU4I08feIuSg=="],
"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-CkVrHSq+Bnhl9sX2LQgqQYVfLTWC2gvI74C4758OmU0djfrssDKU9d4YQF0AYXXhIIRZipSXfxClQziIMD+EAg=="],
"turbo-linux-64": ["turbo-linux-64@2.7.2", "", { "os": "linux", "cpu": "x64" }, "sha512-kP+TiiMaiPugbRlv57VGLfcjFNsFbo8H64wMBCPV2270Or2TpDCBULMzZrvEsvWFjT3pBFvToYbdp8/Kw0jAQg=="],
"turbo-linux-64": ["turbo-linux-64@2.7.3", "", { "os": "linux", "cpu": "x64" }, "sha512-GqDsCNnzzr89kMaLGpRALyigUklzgxIrSy2pHZVXyifgczvYPnLglex78Aj3T2gu+T3trPPH2iJ+pWucVOCC2Q=="],
"turbo-linux-arm64": ["turbo-linux-arm64@2.7.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-VDJwQ0+8zjAfbyY6boNaWfP6RIez4ypKHxwkuB6SrWbOSk+vxTyW5/hEjytTwK8w/TsbKVcMDyvpora8tEsRFw=="],
"turbo-linux-arm64": ["turbo-linux-arm64@2.7.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-NdCDTfIcIo3dWjsiaAHlxu5gW61Ed/8maah1IAF/9E3EtX0aAHNiBMbuYLZaR4vRJ7BeVkYB6xKWRtdFLZ0y3g=="],
"turbo-windows-64": ["turbo-windows-64@2.7.2", "", { "os": "win32", "cpu": "x64" }, "sha512-rPjqQXVnI6A6oxgzNEE8DNb6Vdj2Wwyhfv3oDc+YM3U9P7CAcBIlKv/868mKl4vsBtz4ouWpTQNXG8vljgJO+w=="],
"turbo-windows-64": ["turbo-windows-64@2.7.3", "", { "os": "win32", "cpu": "x64" }, "sha512-7bVvO987daXGSJVYBoG8R4Q+csT1pKIgLJYZevXRQ0Hqw0Vv4mKme/TOjYXs9Qb1xMKh51Tb3bXKDbd8/4G08g=="],
"turbo-windows-arm64": ["turbo-windows-arm64@2.7.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-tcnHvBhO515OheIFWdxA+qUvZzNqqcHbLVFc1+n+TJ1rrp8prYicQtbtmsiKgMvr/54jb9jOabU62URAobnB7g=="],
"turbo-windows-arm64": ["turbo-windows-arm64@2.7.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-nTodweTbPmkvwMu/a55XvjMsPtuyUSC+sV7f/SR57K36rB2I0YG21qNETN+00LOTUW9B3omd8XkiXJkt4kx/cw=="],
"tweetnacl": ["tweetnacl@0.14.5", "", {}, "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA=="],
@@ -4046,6 +4054,8 @@
"path-scurry/lru-cache": ["lru-cache@11.2.4", "", {}, "sha512-B5Y16Jr9LB9dHVkh6ZevG+vAbOsNOYCX+sXvFWFu7B3Iz5mijW3zdbMyhsh8ANd2mSWBYdJgnqi+mL7/LrOPYg=="],
"pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],
"pino/thread-stream": ["thread-stream@3.1.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-OqyPZ9u96VohAyMfJykzmivOrY2wfMSf3C5TtFJVgN+Hm6aj+voFhlK+kZEIv2FBh1X6Xp3DlnCOfEQ3B2J86A=="],
"pino-pretty/pino-abstract-transport": ["pino-abstract-transport@3.0.0", "", { "dependencies": { "split2": "^4.0.0" } }, "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg=="],

View File

@@ -41,7 +41,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2"
"turbo": "2.7.3"
},
"lint-staged": {
"*.{js,jsx,ts,tsx,json,css,scss}": [