diff --git a/apps/sim/app/api/knowledge/utils.test.ts b/apps/sim/app/api/knowledge/utils.test.ts index eb4b92eb3..9ae1372c0 100644 --- a/apps/sim/app/api/knowledge/utils.test.ts +++ b/apps/sim/app/api/knowledge/utils.test.ts @@ -136,16 +136,29 @@ vi.mock('@sim/db', () => { }, }), }), + delete: () => ({ + where: () => Promise.resolve(), + }), + insert: () => ({ + values: (records: any) => { + dbOps.order.push('insert') + dbOps.insertRecords.push(records) + return Promise.resolve() + }, + }), transaction: vi.fn(async (fn: any) => { await fn({ - insert: (table: any) => ({ + delete: () => ({ + where: () => Promise.resolve(), + }), + insert: () => ({ values: (records: any) => { dbOps.order.push('insert') dbOps.insertRecords.push(records) return Promise.resolve() }, }), - update: (table: any) => ({ + update: () => ({ set: (payload: any) => ({ where: () => { dbOps.updatePayloads.push(payload) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx index 21c0f4d09..b56fc4ec4 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx @@ -453,6 +453,8 @@ export function KnowledgeBase({ error: knowledgeBaseError, refresh: refreshKnowledgeBase, } = useKnowledgeBase(id) + const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false) + const { documents, pagination, @@ -468,6 +470,7 @@ export function KnowledgeBase({ offset: (currentPage - 1) * DOCUMENTS_PER_PAGE, sortBy, sortOrder, + refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false, }) const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id) @@ -534,25 +537,15 @@ export function KnowledgeBase({ ) useEffect(() => { - const hasProcessingDocuments = documents.some( + const processing = documents.some( (doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing' ) + setHasProcessingDocuments(processing) - if (!hasProcessingDocuments) return - - const refreshInterval = setInterval(async () => { - try { - if (!isDeleting) { - await checkForDeadProcesses() - await refreshDocuments() - } - } catch (error) { - logger.error('Error refreshing documents:', error) - } - }, 3000) - - return () => clearInterval(refreshInterval) - }, [documents, refreshDocuments, isDeleting]) + if (processing) { + checkForDeadProcesses() + } + }, [documents]) /** * Checks for documents with stale processing states and marks them as failed @@ -672,25 +665,6 @@ export function KnowledgeBase({ await refreshDocuments() - let refreshAttempts = 0 - const maxRefreshAttempts = 3 - const refreshInterval = setInterval(async () => { - try { - refreshAttempts++ - await refreshDocuments() - if (refreshAttempts >= maxRefreshAttempts) { - clearInterval(refreshInterval) - } - } catch (error) { - logger.error('Error refreshing documents after retry:', error) - clearInterval(refreshInterval) - } - }, 1000) - - setTimeout(() => { - clearInterval(refreshInterval) - }, 4000) - logger.info(`Document retry initiated successfully for: ${docId}`) } catch (err) { logger.error('Error retrying document:', err) diff --git a/apps/sim/background/knowledge-processing.ts b/apps/sim/background/knowledge-processing.ts index 920c129af..8f7d75c42 100644 --- a/apps/sim/background/knowledge-processing.ts +++ b/apps/sim/background/knowledge-processing.ts @@ -27,6 +27,7 @@ export type DocumentProcessingPayload = { export const processDocument = task({ id: 'knowledge-process-document', maxDuration: 
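The base.tsx hunk above drops the hand-rolled setInterval refresh loop and instead tracks whether any document is still pending/processing, passing a 3-second refetchInterval (or false) down to the documents query. A minimal sketch of the same idea, assuming TanStack Query v5's function form of refetchInterval and a hypothetical fetchDocuments response shape (the PR itself threads a plain number/false through useKnowledgeBaseDocuments instead):

import { useQuery } from '@tanstack/react-query'

interface Doc {
  id: string
  processingStatus: 'pending' | 'processing' | 'completed' | 'failed'
}

// Hypothetical fetcher standing in for the real documents API call.
async function fetchDocuments(knowledgeBaseId: string): Promise<Doc[]> {
  const res = await fetch(`/api/knowledge/${knowledgeBaseId}/documents`)
  if (!res.ok) throw new Error(`Failed to load documents: ${res.status}`)
  return (await res.json()).documents as Doc[]
}

export function useDocumentsWithPolling(knowledgeBaseId: string, isDeleting: boolean) {
  return useQuery({
    queryKey: ['documents', knowledgeBaseId],
    queryFn: () => fetchDocuments(knowledgeBaseId),
    // Poll every 3s only while something is still processing and no delete is
    // in flight; returning false disables polling, as in the diff above.
    refetchInterval: (query) => {
      const docs = query.state.data ?? []
      const processing = docs.some(
        (d) => d.processingStatus === 'pending' || d.processingStatus === 'processing'
      )
      return processing && !isDeleting ? 3000 : false
    },
  })
}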
env.KB_CONFIG_MAX_DURATION || 600, + machine: 'large-1x', // 2 vCPU, 2GB RAM - needed for large PDF processing retry: { maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3, factor: env.KB_CONFIG_RETRY_FACTOR || 2, diff --git a/apps/sim/hooks/queries/knowledge.ts b/apps/sim/hooks/queries/knowledge.ts index 977d0e8c9..304c3d216 100644 --- a/apps/sim/hooks/queries/knowledge.ts +++ b/apps/sim/hooks/queries/knowledge.ts @@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery( params: KnowledgeDocumentsParams, options?: { enabled?: boolean + refetchInterval?: number | false } ) { const paramsKey = serializeDocumentParams(params) @@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery( enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId), staleTime: 60 * 1000, placeholderData: keepPreviousData, + refetchInterval: options?.refetchInterval ?? false, }) } diff --git a/apps/sim/hooks/use-knowledge.ts b/apps/sim/hooks/use-knowledge.ts index 14b84393e..be70cf9f7 100644 --- a/apps/sim/hooks/use-knowledge.ts +++ b/apps/sim/hooks/use-knowledge.ts @@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments( sortBy?: string sortOrder?: string enabled?: boolean + refetchInterval?: number | false } ) { const queryClient = useQueryClient() @@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments( }, { enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId), + refetchInterval: options?.refetchInterval, } ) diff --git a/apps/sim/lib/chunkers/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts index cb64e3867..a75c94e23 100644 --- a/apps/sim/lib/chunkers/docs-chunker.ts +++ b/apps/sim/lib/chunkers/docs-chunker.ts @@ -16,7 +16,7 @@ interface HeaderInfo { interface Frontmatter { title?: string description?: string - [key: string]: any + [key: string]: unknown } const logger = createLogger('DocsChunker') diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts index 5d81b8262..458f8d3e8 100644 --- a/apps/sim/lib/chunkers/json-yaml-chunker.ts +++ b/apps/sim/lib/chunkers/json-yaml-chunker.ts @@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators' const logger = createLogger('JsonYamlChunker') +type JsonPrimitive = string | number | boolean | null +type JsonValue = JsonPrimitive | JsonObject | JsonArray +type JsonObject = { [key: string]: JsonValue } +type JsonArray = JsonValue[] + function getTokenCount(text: string): number { try { return getAccurateTokenCount(text, 'text-embedding-3-small') @@ -59,11 +64,11 @@ export class JsonYamlChunker { */ async chunk(content: string): Promise { try { - let data: any + let data: JsonValue try { - data = JSON.parse(content) + data = JSON.parse(content) as JsonValue } catch { - data = yaml.load(content) + data = yaml.load(content) as JsonValue } const chunks = this.chunkStructuredData(data) @@ -86,7 +91,7 @@ export class JsonYamlChunker { /** * Chunk structured data based on its structure */ - private chunkStructuredData(data: any, path: string[] = []): Chunk[] { + private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] { const chunks: Chunk[] = [] if (Array.isArray(data)) { @@ -94,7 +99,7 @@ export class JsonYamlChunker { } if (typeof data === 'object' && data !== null) { - return this.chunkObject(data, path) + return this.chunkObject(data as JsonObject, path) } const content = JSON.stringify(data, null, 2) @@ -118,9 +123,9 @@ export class JsonYamlChunker { /** * Chunk an array intelligently */ - private chunkArray(arr: any[], path: string[]): Chunk[] 
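The JsonYamlChunker hunk above replaces `any` with a small recursive JSON value type. A sketch of that typing approach on its own, with an illustrative parse helper that tries JSON first and falls back to YAML the way the chunker's chunk() method does (js-yaml's load returns unknown in current typings, hence the cast):

import * as yaml from 'js-yaml'

type JsonPrimitive = string | number | boolean | null
type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]

// Illustrative helper: parse content as JSON, falling back to YAML.
function parseStructured(content: string): JsonValue {
  try {
    return JSON.parse(content) as JsonValue
  } catch {
    return yaml.load(content) as JsonValue
  }
}

// The recursive type narrows naturally with the same checks the chunker uses.
function countLeaves(value: JsonValue): number {
  if (Array.isArray(value)) return value.reduce((n, v) => n + countLeaves(v), 0)
  if (typeof value === 'object' && value !== null) {
    return Object.values(value).reduce((n, v) => n + countLeaves(v), 0)
  }
  return 1
}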
{ + private chunkArray(arr: JsonArray, path: string[]): Chunk[] { const chunks: Chunk[] = [] - let currentBatch: any[] = [] + let currentBatch: JsonValue[] = [] let currentTokens = 0 const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : '' @@ -194,7 +199,7 @@ export class JsonYamlChunker { /** * Chunk an object intelligently */ - private chunkObject(obj: Record, path: string[]): Chunk[] { + private chunkObject(obj: JsonObject, path: string[]): Chunk[] { const chunks: Chunk[] = [] const entries = Object.entries(obj) @@ -213,7 +218,7 @@ export class JsonYamlChunker { return chunks } - let currentObj: Record = {} + let currentObj: JsonObject = {} let currentTokens = 0 let currentKeys: string[] = [] diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts index c28f86197..7dbbde0cf 100644 --- a/apps/sim/lib/chunkers/text-chunker.ts +++ b/apps/sim/lib/chunkers/text-chunker.ts @@ -110,10 +110,12 @@ export class TextChunker { chunks.push(currentChunk.trim()) } - // Start new chunk with current part // If part itself is too large, split it further if (this.estimateTokens(part) > this.chunkSize) { - chunks.push(...(await this.splitRecursively(part, separatorIndex + 1))) + const subChunks = await this.splitRecursively(part, separatorIndex + 1) + for (const subChunk of subChunks) { + chunks.push(subChunk) + } currentChunk = '' } else { currentChunk = part diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 5cca8759b..50ea62d6d 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -178,6 +178,7 @@ export const env = createEnv({ KB_CONFIG_BATCH_SIZE: z.number().optional().default(2000), // Chunks to process per embedding batch KB_CONFIG_DELAY_BETWEEN_BATCHES: z.number().optional().default(0), // Delay between batches in ms (0 for max speed) KB_CONFIG_DELAY_BETWEEN_DOCUMENTS: z.number().optional().default(50), // Delay between documents in ms + KB_CONFIG_CHUNK_CONCURRENCY: z.number().optional().default(10), // Concurrent PDF chunk OCR processing // Real-time Communication SOCKET_SERVER_URL: z.string().url().optional(), // WebSocket server URL for real-time features diff --git a/apps/sim/lib/file-parsers/doc-parser.ts b/apps/sim/lib/file-parsers/doc-parser.ts index a0e0c1bc3..0d7379721 100644 --- a/apps/sim/lib/file-parsers/doc-parser.ts +++ b/apps/sim/lib/file-parsers/doc-parser.ts @@ -17,8 +17,6 @@ export class DocParser implements FileParser { throw new Error(`File not found: ${filePath}`) } - logger.info(`Parsing DOC file: ${filePath}`) - const buffer = await readFile(filePath) return this.parseBuffer(buffer) } catch (error) { @@ -29,53 +27,80 @@ export class DocParser implements FileParser { async parseBuffer(buffer: Buffer): Promise { try { - logger.info('Parsing DOC buffer, size:', buffer.length) - if (!buffer || buffer.length === 0) { throw new Error('Empty buffer provided') } - let parseOfficeAsync try { const officeParser = await import('officeparser') - parseOfficeAsync = officeParser.parseOfficeAsync - } catch (importError) { - logger.warn('officeparser not available, using fallback extraction') - return this.fallbackExtraction(buffer) + const result = await officeParser.parseOfficeAsync(buffer) + + if (result) { + const resultString = typeof result === 'string' ? 
result : String(result) + const content = sanitizeTextForUTF8(resultString.trim()) + + if (content.length > 0) { + return { + content, + metadata: { + characterCount: content.length, + extractionMethod: 'officeparser', + }, + } + } + } + } catch (officeError) { + logger.warn('officeparser failed, trying mammoth:', officeError) } try { - const result = await parseOfficeAsync(buffer) + const mammoth = await import('mammoth') + const result = await mammoth.extractRawText({ buffer }) - if (!result) { - throw new Error('officeparser returned no result') + if (result.value && result.value.trim().length > 0) { + const content = sanitizeTextForUTF8(result.value.trim()) + return { + content, + metadata: { + characterCount: content.length, + extractionMethod: 'mammoth', + messages: result.messages, + }, + } } - - const resultString = typeof result === 'string' ? result : String(result) - - const content = sanitizeTextForUTF8(resultString.trim()) - - logger.info('DOC parsing completed successfully with officeparser') - - return { - content: content, - metadata: { - characterCount: content.length, - extractionMethod: 'officeparser', - }, - } - } catch (extractError) { - logger.warn('officeparser failed, using fallback:', extractError) - return this.fallbackExtraction(buffer) + } catch (mammothError) { + logger.warn('mammoth failed:', mammothError) } + + return this.fallbackExtraction(buffer) } catch (error) { - logger.error('DOC buffer parsing error:', error) + logger.error('DOC parsing error:', error) throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`) } } private fallbackExtraction(buffer: Buffer): FileParseResult { - logger.info('Using fallback text extraction for DOC file') + const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf + + if (!isBinaryDoc) { + const textContent = buffer.toString('utf8').trim() + + if (textContent.length > 0) { + const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0 + const isProbablyText = printableChars / textContent.length > 0.9 + + if (isProbablyText) { + return { + content: sanitizeTextForUTF8(textContent), + metadata: { + extractionMethod: 'plaintext-fallback', + characterCount: textContent.length, + warning: 'File is not a valid DOC format, extracted as plain text', + }, + } + } + } + } const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000)) diff --git a/apps/sim/lib/file-parsers/docx-parser.ts b/apps/sim/lib/file-parsers/docx-parser.ts index 5663a50b6..ab49f86af 100644 --- a/apps/sim/lib/file-parsers/docx-parser.ts +++ b/apps/sim/lib/file-parsers/docx-parser.ts @@ -2,13 +2,18 @@ import { readFile } from 'fs/promises' import { createLogger } from '@sim/logger' import mammoth from 'mammoth' import type { FileParseResult, FileParser } from '@/lib/file-parsers/types' +import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils' const logger = createLogger('DocxParser') -// Define interface for mammoth result +interface MammothMessage { + type: 'warning' | 'error' + message: string +} + interface MammothResult { value: string - messages: any[] + messages: MammothMessage[] } export class DocxParser implements FileParser { @@ -19,7 +24,6 @@ export class DocxParser implements FileParser { } const buffer = await readFile(filePath) - return this.parseBuffer(buffer) } catch (error) { logger.error('DOCX file error:', error) @@ -29,26 +33,74 @@ export class DocxParser implements FileParser { async parseBuffer(buffer: Buffer): Promise { try { - logger.info('Parsing buffer, size:', 
buffer.length) + if (!buffer || buffer.length === 0) { + throw new Error('Empty buffer provided') + } - const result = await mammoth.extractRawText({ buffer }) - - let htmlResult: MammothResult = { value: '', messages: [] } try { - htmlResult = await mammoth.convertToHtml({ buffer }) - } catch (htmlError) { - logger.warn('HTML conversion warning:', htmlError) + const result = await mammoth.extractRawText({ buffer }) + + if (result.value && result.value.trim().length > 0) { + let htmlResult: MammothResult = { value: '', messages: [] } + try { + htmlResult = await mammoth.convertToHtml({ buffer }) + } catch { + // HTML conversion is optional + } + + return { + content: sanitizeTextForUTF8(result.value), + metadata: { + extractionMethod: 'mammoth', + messages: [...result.messages, ...htmlResult.messages], + html: htmlResult.value, + }, + } + } + } catch (mammothError) { + logger.warn('mammoth failed, trying officeparser:', mammothError) } - return { - content: result.value, - metadata: { - messages: [...result.messages, ...htmlResult.messages], - html: htmlResult.value, - }, + try { + const officeParser = await import('officeparser') + const result = await officeParser.parseOfficeAsync(buffer) + + if (result) { + const resultString = typeof result === 'string' ? result : String(result) + const content = sanitizeTextForUTF8(resultString.trim()) + + if (content.length > 0) { + return { + content, + metadata: { + extractionMethod: 'officeparser', + characterCount: content.length, + }, + } + } + } + } catch (officeError) { + logger.warn('officeparser failed:', officeError) } + + const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b + if (!isZipFile) { + const textContent = buffer.toString('utf8').trim() + if (textContent.length > 0) { + return { + content: sanitizeTextForUTF8(textContent), + metadata: { + extractionMethod: 'plaintext-fallback', + characterCount: textContent.length, + warning: 'File is not a valid DOCX format, extracted as plain text', + }, + } + } + } + + throw new Error('Failed to extract text from DOCX file') } catch (error) { - logger.error('DOCX buffer parsing error:', error) + logger.error('DOCX parsing error:', error) throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`) } } diff --git a/apps/sim/lib/file-parsers/types.ts b/apps/sim/lib/file-parsers/types.ts index 4d02cc547..90baa0543 100644 --- a/apps/sim/lib/file-parsers/types.ts +++ b/apps/sim/lib/file-parsers/types.ts @@ -1,6 +1,22 @@ +export interface FileParseMetadata { + characterCount?: number + pageCount?: number + extractionMethod?: string + warning?: string + messages?: unknown[] + html?: string + type?: string + headers?: string[] + totalRows?: number + rowCount?: number + sheetNames?: string[] + source?: string + [key: string]: unknown +} + export interface FileParseResult { content: string - metadata?: Record + metadata?: FileParseMetadata } export interface FileParser { diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index e10935cbd..632e91fa8 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -1,8 +1,10 @@ import { createLogger } from '@sim/logger' +import { PDFDocument } from 'pdf-lib' import { getBYOKKey } from '@/lib/api-key/byok' import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers' import { env } from '@/lib/core/config/env' import { parseBuffer, parseFile } 
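Both parser hunks above gate their plain-text fallback on file signatures: legacy .doc files are OLE compound documents starting with 0xD0 0xCF, and .docx files are ZIP archives starting with 0x50 0x4B ("PK"). A small standalone sketch of that sniffing logic (helper names are illustrative, not from the PR):

type OfficeKind = 'doc-ole' | 'docx-zip' | 'unknown'

// Classify a buffer by its leading magic bytes, mirroring the checks used by
// the DOC/DOCX fallbacks above.
function sniffOfficeKind(buffer: Buffer): OfficeKind {
  if (buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf) return 'doc-ole'
  if (buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b) return 'docx-zip'
  return 'unknown'
}

// Heuristic from the DOC fallback: only treat the buffer as plain text when
// the vast majority of characters are printable ASCII or whitespace.
function looksLikePlainText(buffer: Buffer): boolean {
  const text = buffer.toString('utf8').trim()
  if (text.length === 0) return false
  const printable = text.match(/[\x20-\x7E\n\r\t]/g)?.length ?? 0
  return printable / text.length > 0.9
}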
from '@/lib/file-parsers' +import type { FileParseMetadata } from '@/lib/file-parsers/types' import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils' import { StorageService } from '@/lib/uploads' import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server' @@ -15,6 +17,8 @@ const TIMEOUTS = { MISTRAL_OCR_API: 120000, } as const +const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY + type OCRResult = { success: boolean error?: string @@ -36,6 +40,61 @@ type OCRRequestBody = { include_image_base64: boolean } +const MISTRAL_MAX_PAGES = 1000 + +/** + * Get page count from a PDF buffer using unpdf + */ +async function getPdfPageCount(buffer: Buffer): Promise { + try { + const { getDocumentProxy } = await import('unpdf') + const uint8Array = new Uint8Array(buffer) + const pdf = await getDocumentProxy(uint8Array) + return pdf.numPages + } catch (error) { + logger.warn('Failed to get PDF page count:', error) + return 0 + } +} + +/** + * Split a PDF buffer into multiple smaller PDFs + * Returns an array of PDF buffers, each with at most maxPages pages + */ +async function splitPdfIntoChunks( + pdfBuffer: Buffer, + maxPages: number +): Promise<{ buffer: Buffer; startPage: number; endPage: number }[]> { + const sourcePdf = await PDFDocument.load(pdfBuffer) + const totalPages = sourcePdf.getPageCount() + + if (totalPages <= maxPages) { + return [{ buffer: pdfBuffer, startPage: 0, endPage: totalPages - 1 }] + } + + const chunks: { buffer: Buffer; startPage: number; endPage: number }[] = [] + + for (let startPage = 0; startPage < totalPages; startPage += maxPages) { + const endPage = Math.min(startPage + maxPages - 1, totalPages - 1) + const pageCount = endPage - startPage + 1 + + const newPdf = await PDFDocument.create() + const pageIndices = Array.from({ length: pageCount }, (_, i) => startPage + i) + const copiedPages = await newPdf.copyPages(sourcePdf, pageIndices) + + copiedPages.forEach((page) => newPdf.addPage(page)) + + const pdfBytes = await newPdf.save() + chunks.push({ + buffer: Buffer.from(pdfBytes), + startPage, + endPage, + }) + } + + return chunks +} + type AzureOCRResponse = { pages?: OCRPage[] [key: string]: unknown @@ -81,7 +140,7 @@ export async function processDocument( const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined let chunks: Chunk[] - const metadata = 'metadata' in parseResult ? parseResult.metadata : {} + const metadata: FileParseMetadata = parseResult.metadata ?? {} const isJsonYaml = metadata.type === 'json' || @@ -97,10 +156,11 @@ export async function processDocument( }) } else if (StructuredDataChunker.isStructuredData(content, mimeType)) { logger.info('Using structured data chunker for spreadsheet/CSV content') + const rowCount = metadata.totalRows ?? metadata.rowCount chunks = await StructuredDataChunker.chunkStructuredData(content, { chunkSize, headers: metadata.headers, - totalRows: metadata.totalRows || metadata.rowCount, + totalRows: typeof rowCount === 'number' ? 
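getPdfPageCount and splitPdfIntoChunks above use unpdf for a cheap page count and pdf-lib to copy page ranges into standalone documents. A compact sketch of the pdf-lib side of that pattern, reduced to extracting a single inclusive page range (wrapper name is illustrative):

import { PDFDocument } from 'pdf-lib'

// Copy pages [start, end] of a source PDF into a new standalone PDF buffer,
// the same pdf-lib calls splitPdfIntoChunks uses above.
async function extractPageRange(pdfBuffer: Buffer, start: number, end: number): Promise<Buffer> {
  const source = await PDFDocument.load(pdfBuffer)
  const target = await PDFDocument.create()
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i)
  const pages = await target.copyPages(source, indices)
  for (const page of pages) target.addPage(page)
  return Buffer.from(await target.save())
}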
rowCount : undefined, sheetName: metadata.sheetNames?.[0], }) } else { @@ -153,7 +213,7 @@ async function parseDocument( content: string processingMethod: 'file-parser' | 'mistral-ocr' cloudUrl?: string - metadata?: any + metadata?: FileParseMetadata }> { const isPDF = mimeType === 'application/pdf' const hasAzureMistralOCR = @@ -165,7 +225,7 @@ async function parseDocument( if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) { if (hasAzureMistralOCR) { logger.info(`Using Azure Mistral OCR: ${filename}`) - return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId) + return parseWithAzureMistralOCR(fileUrl, filename, mimeType) } if (hasMistralOCR) { @@ -188,13 +248,32 @@ async function handleFileForOCR( const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/') if (isExternalHttps) { - return { httpsUrl: fileUrl } + if (mimeType === 'application/pdf') { + logger.info(`handleFileForOCR: Downloading external PDF to check page count`) + try { + const buffer = await downloadFileWithTimeout(fileUrl) + logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`) + return { httpsUrl: fileUrl, buffer } + } catch (error) { + logger.warn( + `handleFileForOCR: Failed to download external PDF for page count check, proceeding without batching`, + { + error: error instanceof Error ? error.message : String(error), + } + ) + return { httpsUrl: fileUrl, buffer: undefined } + } + } + logger.info(`handleFileForOCR: Using external URL directly`) + return { httpsUrl: fileUrl, buffer: undefined } } logger.info(`Uploading "${filename}" to cloud storage for OCR`) const buffer = await downloadFileWithTimeout(fileUrl) + logger.info(`Downloaded ${filename}: ${buffer.length} bytes`) + try { const metadata: Record = { originalName: filename, @@ -224,8 +303,7 @@ async function handleFileForOCR( 900 // 15 minutes ) - logger.info(`Successfully uploaded for OCR: ${cloudResult.key}`) - return { httpsUrl, cloudUrl: httpsUrl } + return { httpsUrl, cloudUrl: httpsUrl, buffer } } catch (uploadError) { const message = uploadError instanceof Error ? uploadError.message : 'Unknown error' throw new Error(`Cloud upload failed: ${message}. Cloud upload is required for OCR.`) @@ -321,13 +399,7 @@ async function makeOCRRequest( } } -async function parseWithAzureMistralOCR( - fileUrl: string, - filename: string, - mimeType: string, - userId?: string, - workspaceId?: string | null -) { +async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) { validateOCRConfig( env.OCR_AZURE_API_KEY, env.OCR_AZURE_ENDPOINT, @@ -336,6 +408,19 @@ async function parseWithAzureMistralOCR( ) const fileBuffer = await downloadFileForBase64(fileUrl) + + if (mimeType === 'application/pdf') { + const pageCount = await getPdfPageCount(fileBuffer) + if (pageCount > MISTRAL_MAX_PAGES) { + logger.info( + `PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` + + `Falling back to file parser.` + ) + return parseWithFileParser(fileUrl, filename, mimeType) + } + logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`) + } + const base64Data = fileBuffer.toString('base64') const dataUri = `data:${mimeType};base64,${base64Data}` @@ -374,17 +459,7 @@ async function parseWithAzureMistralOCR( message: error instanceof Error ? 
error.message : String(error), }) - const fallbackMistralKey = await getMistralApiKey(workspaceId) - if (fallbackMistralKey) { - return parseWithMistralOCR( - fileUrl, - filename, - mimeType, - userId, - workspaceId, - fallbackMistralKey - ) - } + logger.info(`Falling back to file parser: ${filename}`) return parseWithFileParser(fileUrl, filename, mimeType) } } @@ -406,50 +481,35 @@ async function parseWithMistralOCR( throw new Error('Mistral parser tool not configured') } - const { httpsUrl, cloudUrl } = await handleFileForOCR( + const { httpsUrl, cloudUrl, buffer } = await handleFileForOCR( fileUrl, filename, mimeType, userId, workspaceId ) + + logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`) + + let pageCount = 0 + if (mimeType === 'application/pdf' && buffer) { + pageCount = await getPdfPageCount(buffer) + logger.info(`PDF page count for ${filename}: ${pageCount}`) + } + + const needsBatching = pageCount > MISTRAL_MAX_PAGES + + if (needsBatching && buffer) { + logger.info( + `PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.` + ) + return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl) + } + const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const } try { - const response = await retryWithExponentialBackoff( - async () => { - let url = - typeof mistralParserTool.request!.url === 'function' - ? mistralParserTool.request!.url(params) - : mistralParserTool.request!.url - - const isInternalRoute = url.startsWith('/') - - if (isInternalRoute) { - const { getBaseUrl } = await import('@/lib/core/utils/urls') - url = `${getBaseUrl()}${url}` - } - - let headers = - typeof mistralParserTool.request!.headers === 'function' - ? mistralParserTool.request!.headers(params) - : mistralParserTool.request!.headers - - if (isInternalRoute) { - const { generateInternalToken } = await import('@/lib/auth/internal') - const internalToken = await generateInternalToken(userId) - headers = { - ...headers, - Authorization: `Bearer ${internalToken}`, - } - } - - const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody - return makeOCRRequest(url, headers as Record, requestBody) - }, - { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 } - ) - + const response = await executeMistralOCRRequest(params, userId) const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult const content = processOCRContent(result, filename) @@ -464,10 +524,204 @@ async function parseWithMistralOCR( } } +async function executeMistralOCRRequest( + params: { filePath: string; apiKey: string; resultType: 'text' }, + userId?: string +): Promise { + return retryWithExponentialBackoff( + async () => { + let url = + typeof mistralParserTool.request!.url === 'function' + ? mistralParserTool.request!.url(params) + : mistralParserTool.request!.url + + const isInternalRoute = url.startsWith('/') + + if (isInternalRoute) { + const { getBaseUrl } = await import('@/lib/core/utils/urls') + url = `${getBaseUrl()}${url}` + } + + let headers = + typeof mistralParserTool.request!.headers === 'function' + ? 
mistralParserTool.request!.headers(params) + : mistralParserTool.request!.headers + + if (isInternalRoute) { + const { generateInternalToken } = await import('@/lib/auth/internal') + const internalToken = await generateInternalToken(userId) + headers = { + ...headers, + Authorization: `Bearer ${internalToken}`, + } + } + + const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody + return makeOCRRequest(url, headers as Record, requestBody) + }, + { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 } + ) +} + +/** + * Process a single PDF chunk: upload to S3, OCR, cleanup + */ +async function processChunk( + chunk: { buffer: Buffer; startPage: number; endPage: number }, + chunkIndex: number, + totalChunks: number, + filename: string, + apiKey: string, + userId?: string +): Promise<{ index: number; content: string | null }> { + const chunkPageCount = chunk.endPage - chunk.startPage + 1 + + logger.info( + `Processing chunk ${chunkIndex + 1}/${totalChunks} (pages ${chunk.startPage + 1}-${chunk.endPage + 1}, ${chunkPageCount} pages)` + ) + + let uploadedKey: string | null = null + + try { + // Upload the chunk to S3 + const timestamp = Date.now() + const uniqueId = Math.random().toString(36).substring(2, 9) + const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_') + const chunkKey = `kb/${timestamp}-${uniqueId}-chunk${chunkIndex + 1}-${safeFileName}` + + const metadata: Record = { + originalName: `${filename}_chunk${chunkIndex + 1}`, + uploadedAt: new Date().toISOString(), + purpose: 'knowledge-base', + ...(userId && { userId }), + } + + const uploadResult = await StorageService.uploadFile({ + file: chunk.buffer, + fileName: `${filename}_chunk${chunkIndex + 1}`, + contentType: 'application/pdf', + context: 'knowledge-base', + customKey: chunkKey, + metadata, + }) + + uploadedKey = uploadResult.key + + const chunkUrl = await StorageService.generatePresignedDownloadUrl( + uploadResult.key, + 'knowledge-base', + 900 // 15 minutes + ) + + logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`) + + // Process the chunk with Mistral OCR + const params = { + filePath: chunkUrl, + apiKey, + resultType: 'text' as const, + } + + const response = await executeMistralOCRRequest(params, userId) + const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult + + if (result.success && result.output?.content) { + logger.info(`Chunk ${chunkIndex + 1}/${totalChunks} completed successfully`) + return { index: chunkIndex, content: result.output.content } + } + logger.warn(`Chunk ${chunkIndex + 1}/${totalChunks} returned no content`) + return { index: chunkIndex, content: null } + } catch (error) { + logger.error(`Chunk ${chunkIndex + 1}/${totalChunks} failed:`, { + message: error instanceof Error ? error.message : String(error), + }) + return { index: chunkIndex, content: null } + } finally { + // Clean up the chunk file from S3 after processing + if (uploadedKey) { + try { + await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' }) + logger.info(`Cleaned up chunk ${chunkIndex + 1} from S3`) + } catch (deleteError) { + logger.warn(`Failed to clean up chunk ${chunkIndex + 1} from S3:`, { + message: deleteError instanceof Error ? 
deleteError.message : String(deleteError), + }) + } + } + } +} + +async function processMistralOCRInBatches( + filename: string, + apiKey: string, + pdfBuffer: Buffer, + userId?: string, + cloudUrl?: string +): Promise<{ + content: string + processingMethod: 'mistral-ocr' + cloudUrl?: string +}> { + const totalPages = await getPdfPageCount(pdfBuffer) + logger.info( + `Splitting ${filename} (${totalPages} pages) into chunks of ${MISTRAL_MAX_PAGES} pages` + ) + + const pdfChunks = await splitPdfIntoChunks(pdfBuffer, MISTRAL_MAX_PAGES) + logger.info( + `Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}` + ) + + // Process chunks concurrently with limited concurrency + const results: { index: number; content: string | null }[] = [] + + for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) { + const batch = pdfChunks.slice(i, i + MAX_CONCURRENT_CHUNKS) + const batchPromises = batch.map((chunk, batchIndex) => + processChunk(chunk, i + batchIndex, pdfChunks.length, filename, apiKey, userId) + ) + + const batchResults = await Promise.all(batchPromises) + for (const result of batchResults) { + results.push(result) + } + + logger.info( + `Completed batch ${Math.floor(i / MAX_CONCURRENT_CHUNKS) + 1}/${Math.ceil(pdfChunks.length / MAX_CONCURRENT_CHUNKS)}` + ) + } + + // Sort by index to maintain page order and filter out nulls + const sortedResults = results + .sort((a, b) => a.index - b.index) + .filter((r) => r.content !== null) + .map((r) => r.content as string) + + if (sortedResults.length === 0) { + // Don't fall back to file parser for large PDFs - it produces poor results + // Better to fail clearly than return low-quality extraction + throw new Error( + `OCR failed for all ${pdfChunks.length} chunks of ${filename}. 
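processMistralOCRInBatches above caps parallelism by slicing the chunk list into waves of MAX_CONCURRENT_CHUNKS and awaiting Promise.all per wave. The same pattern as a small generic helper, shown as a sketch rather than code from the PR:

// Run `worker` over `items`, at most `limit` promises in flight per wave,
// preserving input order in the results.
async function mapInBatches<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> {
  const results: R[] = []
  for (let i = 0; i < items.length; i += limit) {
    const wave = items.slice(i, i + limit)
    const waveResults = await Promise.all(wave.map((item, j) => worker(item, i + j)))
    // Plain loop rather than results.push(...waveResults), mirroring the
    // diff's preference for explicit pushes when arrays can get very large.
    for (const r of waveResults) results.push(r)
  }
  return results
}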
` + + `Large PDFs require OCR - file parser fallback would produce poor results.` + ) + } + + const combinedContent = sortedResults.join('\n\n') + logger.info( + `Successfully processed ${sortedResults.length}/${pdfChunks.length} chunks for ${filename}` + ) + + return { + content: combinedContent, + processingMethod: 'mistral-ocr', + cloudUrl, + } +} + async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) { try { let content: string - let metadata: any = {} + let metadata: FileParseMetadata = {} if (fileUrl.startsWith('data:')) { content = await parseDataURI(fileUrl, filename, mimeType) @@ -513,7 +767,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) async function parseHttpFile( fileUrl: string, filename: string -): Promise<{ content: string; metadata?: any }> { +): Promise<{ content: string; metadata?: FileParseMetadata }> { const buffer = await downloadFileWithTimeout(fileUrl) const extension = filename.split('.').pop()?.toLowerCase() diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index 313ea8d39..d61cbdfd7 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -212,7 +212,6 @@ export async function processDocumentTags( return result } - // Fetch existing tag definitions const existingDefinitions = await db .select() .from(knowledgeBaseTagDefinitions) @@ -220,18 +219,15 @@ export async function processDocumentTags( const existingByName = new Map(existingDefinitions.map((def) => [def.displayName, def])) - // First pass: collect all validation errors const undefinedTags: string[] = [] const typeErrors: string[] = [] for (const tag of tagData) { - // Skip if no tag name if (!tag.tagName?.trim()) continue const tagName = tag.tagName.trim() const fieldType = tag.fieldType || 'text' - // For boolean, check if value is defined; for others, check if value is non-empty const hasValue = fieldType === 'boolean' ? tag.value !== undefined && tag.value !== null && tag.value !== '' @@ -239,14 +235,12 @@ export async function processDocumentTags( if (!hasValue) continue - // Check if tag exists const existingDef = existingByName.get(tagName) if (!existingDef) { undefinedTags.push(tagName) continue } - // Validate value type using shared validation const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value const actualFieldType = existingDef.fieldType || fieldType const validationError = validateTagValue(tagName, String(rawValue), actualFieldType) @@ -255,7 +249,6 @@ export async function processDocumentTags( } } - // Throw combined error if there are any validation issues if (undefinedTags.length > 0 || typeErrors.length > 0) { const errorParts: string[] = [] @@ -270,7 +263,6 @@ export async function processDocumentTags( throw new Error(errorParts.join('\n')) } - // Second pass: process valid tags for (const tag of tagData) { if (!tag.tagName?.trim()) continue @@ -285,14 +277,13 @@ export async function processDocumentTags( if (!hasValue) continue const existingDef = existingByName.get(tagName) - if (!existingDef) continue // Already validated above + if (!existingDef) continue const targetSlot = existingDef.tagSlot const actualFieldType = existingDef.fieldType || fieldType const rawValue = typeof tag.value === 'string' ? 
tag.value.trim() : tag.value const stringValue = String(rawValue).trim() - // Assign value to the slot with proper type conversion (values already validated) if (actualFieldType === 'boolean') { setTagValue(result, targetSlot, parseBooleanValue(stringValue) ?? false) } else if (actualFieldType === 'number') { @@ -440,7 +431,6 @@ export async function processDocumentAsync( logger.info(`[${documentId}] Status updated to 'processing', starting document processor`) - // Use KB's chunkingConfig as fallback if processingOptions not provided const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number } await withTimeout( @@ -469,7 +459,6 @@ export async function processDocumentAsync( `[${documentId}] Document parsed successfully, generating embeddings for ${processed.chunks.length} chunks` ) - // Generate embeddings in batches for large documents const chunkTexts = processed.chunks.map((chunk) => chunk.text) const embeddings: number[][] = [] @@ -485,7 +474,9 @@ export async function processDocumentAsync( logger.info(`[${documentId}] Processing embedding batch ${batchNum}/${totalBatches}`) const batchEmbeddings = await generateEmbeddings(batch, undefined, kb[0].workspaceId) - embeddings.push(...batchEmbeddings) + for (const emb of batchEmbeddings) { + embeddings.push(emb) + } } } @@ -562,23 +553,18 @@ export async function processDocumentAsync( })) await db.transaction(async (tx) => { - // Insert embeddings in batches for large documents if (embeddingRecords.length > 0) { - const batchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH - const totalBatches = Math.ceil(embeddingRecords.length / batchSize) + await tx.delete(embedding).where(eq(embedding.documentId, documentId)) - logger.info( - `[${documentId}] Inserting ${embeddingRecords.length} embeddings in ${totalBatches} batches` - ) - - for (let i = 0; i < embeddingRecords.length; i += batchSize) { - const batch = embeddingRecords.slice(i, i + batchSize) - const batchNum = Math.floor(i / batchSize) + 1 + const insertBatchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH + const batches: (typeof embeddingRecords)[] = [] + for (let i = 0; i < embeddingRecords.length; i += insertBatchSize) { + batches.push(embeddingRecords.slice(i, i + insertBatchSize)) + } + logger.info(`[${documentId}] Inserting ${embeddingRecords.length} embeddings`) + for (const batch of batches) { await tx.insert(embedding).values(batch) - logger.info( - `[${documentId}] Inserted batch ${batchNum}/${totalBatches} (${batch.length} records)` - ) } } @@ -689,11 +675,9 @@ export async function createDocumentRecords( requestId: string, userId?: string ): Promise { - // Check storage limits before creating documents if (userId) { const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0) - // Get knowledge base owner const kb = await db .select({ userId: knowledgeBase.userId }) .from(knowledgeBase) @@ -713,7 +697,7 @@ export async function createDocumentRecords( for (const docData of documents) { const documentId = randomUUID() - let processedTags: Record = {} + let processedTags: Partial = {} if (docData.documentTagsData) { try { @@ -722,7 +706,6 @@ export async function createDocumentRecords( processedTags = await processDocumentTags(knowledgeBaseId, tagData, requestId) } } catch (error) { - // Re-throw validation errors, only catch JSON parse errors if (error instanceof SyntaxError) { logger.warn(`[${requestId}] Failed to parse documentTagsData for bulk document:`, error) } else { @@ -791,7 +774,6 @@ export async function 
createDocumentRecords( if (userId) { const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0) - // Get knowledge base owner const kb = await db .select({ userId: knowledgeBase.userId }) .from(knowledgeBase) @@ -1079,7 +1061,7 @@ export async function createSingleDocument( const now = new Date() // Process structured tag data if provided - let processedTags: Record = { + let processedTags: ProcessedDocumentTags = { // Text tags (7 slots) tag1: documentData.tag1 ?? null, tag2: documentData.tag2 ?? null, @@ -1555,23 +1537,30 @@ export async function updateDocument( return value || null } + // Type-safe access to tag slots in updateData + type UpdateDataWithTags = typeof updateData & Record + const typedUpdateData = updateData as UpdateDataWithTags + ALL_TAG_SLOTS.forEach((slot: TagSlot) => { - const updateValue = (updateData as any)[slot] + const updateValue = typedUpdateData[slot] if (updateValue !== undefined) { - ;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue) + ;(dbUpdateData as Record)[slot] = + convertTagValue(slot, updateValue) } }) await db.transaction(async (tx) => { await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId)) - const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined) + const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined) if (hasTagUpdates) { - const embeddingUpdateData: Record = {} + const embeddingUpdateData: Partial = {} ALL_TAG_SLOTS.forEach((field) => { - if ((updateData as any)[field] !== undefined) { - embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field]) + if (typedUpdateData[field] !== undefined) { + ;(embeddingUpdateData as Record)[ + field + ] = convertTagValue(field, typedUpdateData[field]) } }) diff --git a/apps/sim/lib/knowledge/documents/utils.ts b/apps/sim/lib/knowledge/documents/utils.ts index a872c1ede..7aae3187a 100644 --- a/apps/sim/lib/knowledge/documents/utils.ts +++ b/apps/sim/lib/knowledge/documents/utils.ts @@ -14,7 +14,7 @@ export interface RetryOptions { initialDelayMs?: number maxDelayMs?: number backoffMultiplier?: number - retryCondition?: (error: RetryableError) => boolean + retryCondition?: (error: unknown) => boolean } export interface RetryResult { @@ -30,11 +30,18 @@ function hasStatus( return typeof error === 'object' && error !== null && 'status' in error } +function isRetryableErrorType(error: unknown): error is RetryableError { + if (!error) return false + if (error instanceof Error) return true + if (typeof error === 'object' && ('status' in error || 'message' in error)) return true + return false +} + /** * Default retry condition for rate limiting errors */ -export function isRetryableError(error: RetryableError): boolean { - if (!error) return false +export function isRetryableError(error: unknown): boolean { + if (!isRetryableErrorType(error)) return false // Check for rate limiting status codes if ( @@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean { } // Check for rate limiting in error messages - const errorMessage = error.message || error.toString() + const errorMessage = error instanceof Error ? 
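The documents/utils.ts hunk above widens retryCondition to accept unknown, so callers narrow the error themselves before deciding to retry. A usage sketch of retryWithExponentialBackoff under that signature, retrying only on 429/5xx-style failures (the fetch wrapper and error shape are illustrative):

import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'

async function fetchWithRetry(url: string): Promise<string> {
  return retryWithExponentialBackoff(
    async () => {
      const res = await fetch(url)
      if (!res.ok) {
        // Attach the status so the retry condition can inspect it.
        throw Object.assign(new Error(`Request failed: ${res.status}`), { status: res.status })
      }
      return res.text()
    },
    {
      maxRetries: 3,
      initialDelayMs: 1000,
      maxDelayMs: 10000,
      // Receives unknown and narrows it, as the updated RetryOptions requires.
      retryCondition: (error: unknown) => {
        const status =
          typeof error === 'object' && error !== null && 'status' in error
            ? Number((error as { status: unknown }).status)
            : undefined
        return status === 429 || (status !== undefined && status >= 500)
      },
    }
  )
}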
error.message : String(error) const rateLimitKeywords = [ 'rate limit', 'rate_limit', diff --git a/apps/sim/lib/knowledge/embeddings.ts b/apps/sim/lib/knowledge/embeddings.ts index 7a736688b..2b57b34c5 100644 --- a/apps/sim/lib/knowledge/embeddings.ts +++ b/apps/sim/lib/knowledge/embeddings.ts @@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger' import { getBYOKKey } from '@/lib/api-key/byok' import { env } from '@/lib/core/config/env' import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils' -import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization' +import { batchByTokenLimit } from '@/lib/tokenization' const logger = createLogger('EmbeddingUtils') @@ -26,6 +26,20 @@ interface EmbeddingConfig { modelName: string } +interface EmbeddingResponseItem { + embedding: number[] + index: number +} + +interface EmbeddingAPIResponse { + data: EmbeddingResponseItem[] + model: string + usage: { + prompt_tokens: number + total_tokens: number + } +} + async function getEmbeddingConfig( embeddingModel = 'text-embedding-3-small', workspaceId?: string | null @@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom ) } - const data = await response.json() - return data.data.map((item: any) => item.embedding) + const data: EmbeddingAPIResponse = await response.json() + return data.data.map((item) => item.embedding) }, { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000, - retryCondition: (error: any) => { + retryCondition: (error: unknown) => { if (error instanceof EmbeddingAPIError) { return error.status === 429 || error.status >= 500 } @@ -153,44 +167,27 @@ export async function generateEmbeddings( ): Promise { const config = await getEmbeddingConfig(embeddingModel, workspaceId) - logger.info( - `Using ${config.useAzure ? 
'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)` - ) - const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel) - logger.info( - `Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch, ${MAX_CONCURRENT_BATCHES} concurrent)` - ) - const batchResults = await processWithConcurrency( batches, MAX_CONCURRENT_BATCHES, async (batch, i) => { - const batchTokenCount = getTotalTokenCount(batch, embeddingModel) - - logger.info( - `Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens` - ) - try { - const batchEmbeddings = await callEmbeddingAPI(batch, config) - - logger.info( - `Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}` - ) - - return batchEmbeddings + return await callEmbeddingAPI(batch, config) } catch (error) { - logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error) + logger.error(`Failed to generate embeddings for batch ${i + 1}/${batches.length}:`, error) throw error } } ) - const allEmbeddings = batchResults.flat() - - logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`) + const allEmbeddings: number[][] = [] + for (const batch of batchResults) { + for (const emb of batch) { + allEmbeddings.push(emb) + } + } return allEmbeddings } diff --git a/apps/sim/lib/tokenization/estimators.ts b/apps/sim/lib/tokenization/estimators.ts index c0d96fb0f..53ce71965 100644 --- a/apps/sim/lib/tokenization/estimators.ts +++ b/apps/sim/lib/tokenization/estimators.ts @@ -127,24 +127,6 @@ export function truncateToTokenLimit( } } -/** - * Get token count for multiple texts (for batching decisions) - * Returns array of token counts in same order as input - */ -export function getTokenCountsForBatch( - texts: string[], - modelName = 'text-embedding-3-small' -): number[] { - return texts.map((text) => getAccurateTokenCount(text, modelName)) -} - -/** - * Calculate total tokens across multiple texts - */ -export function getTotalTokenCount(texts: string[], modelName = 'text-embedding-3-small'): number { - return texts.reduce((total, text) => total + getAccurateTokenCount(text, modelName), 0) -} - /** * Batch texts by token count to stay within API limits * Returns array of batches where each batch's total tokens <= maxTokensPerBatch diff --git a/apps/sim/lib/tokenization/index.ts b/apps/sim/lib/tokenization/index.ts index 4a82a60e0..b6c559f2b 100644 --- a/apps/sim/lib/tokenization/index.ts +++ b/apps/sim/lib/tokenization/index.ts @@ -12,8 +12,6 @@ export { estimateOutputTokens, estimateTokenCount, getAccurateTokenCount, - getTokenCountsForBatch, - getTotalTokenCount, truncateToTokenLimit, } from '@/lib/tokenization/estimators' export { processStreamingBlockLog, processStreamingBlockLogs } from '@/lib/tokenization/streaming' diff --git a/apps/sim/package.json b/apps/sim/package.json index f968fe0f0..c6daca4e9 100644 --- a/apps/sim/package.json +++ b/apps/sim/package.json @@ -127,6 +127,7 @@ "onedollarstats": "0.0.10", "openai": "^4.91.1", "papaparse": "5.5.3", + "pdf-lib": "1.17.1", "postgres": "^3.4.5", "posthog-js": "1.268.9", "posthog-node": "5.9.2", diff --git a/apps/sim/trigger.config.ts b/apps/sim/trigger.config.ts index 543d112f0..f6915e95f 100644 --- a/apps/sim/trigger.config.ts +++ b/apps/sim/trigger.config.ts @@ -17,7 +17,7 @@ export default defineConfig({ build: { extensions: [ additionalPackages({ - packages: ['unpdf'], + packages: ['unpdf', 'pdf-lib'], }), ], }, diff --git 
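The generateEmbeddings rework above keeps the token-bounded batching but strips the per-batch logging and flattens results with a plain loop. A simplified, sequential sketch of that flow (the real code also runs batches with limited concurrency; embedBatch stands in for callEmbeddingAPI, and maxTokensPerBatch for MAX_TOKENS_PER_REQUEST, whose value is not shown in this diff):

import { batchByTokenLimit } from '@/lib/tokenization'

async function embedAll(
  texts: string[],
  maxTokensPerBatch: number,
  embedBatch: (batch: string[]) => Promise<number[][]>
): Promise<number[][]> {
  // Each batch's total token count stays under maxTokensPerBatch.
  const batches = batchByTokenLimit(texts, maxTokensPerBatch, 'text-embedding-3-small')
  const all: number[][] = []
  for (const batch of batches) {
    const vectors = await embedBatch(batch)
    // Plain loop instead of all.push(...vectors), matching the diff —
    // presumably to avoid spreading very large arrays into a single call.
    for (const v of vectors) all.push(v)
  }
  return all
}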
a/bun.lock b/bun.lock index d7add8660..d62ed31d0 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "simstudio", @@ -11,7 +12,7 @@ "drizzle-kit": "^0.31.4", "husky": "9.1.7", "lint-staged": "16.0.0", - "turbo": "2.7.2", + "turbo": "2.7.3", }, }, "apps/docs": { @@ -156,6 +157,7 @@ "onedollarstats": "0.0.10", "openai": "^4.91.1", "papaparse": "5.5.3", + "pdf-lib": "1.17.1", "postgres": "^3.4.5", "posthog-js": "1.268.9", "posthog-node": "5.9.2", @@ -912,6 +914,10 @@ "@orama/orama": ["@orama/orama@3.1.18", "", {}, "sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA=="], + "@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="], + + "@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="], + "@peculiar/asn1-android": ["@peculiar/asn1-android@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-cBRCKtYPF7vJGN76/yG8VbxRcHLPF3HnkoHhKOZeHpoVtbMYfY9ROKtH3DtYUY9m8uI1Mh47PRhHf2hSK3xcSQ=="], "@peculiar/asn1-cms": ["@peculiar/asn1-cms@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "@peculiar/asn1-x509": "^2.6.0", "@peculiar/asn1-x509-attr": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA=="], @@ -2864,6 +2870,8 @@ "pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="], + "pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="], + "pdfjs-dist": ["pdfjs-dist@5.4.449", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.81" } }, "sha512-CegnUaT0QwAyQMS+7o2POr4wWUNNe8VaKKlcuoRHeYo98cVnqPpwOXNSx6Trl6szH02JrRcsPgletV6GmF3LtQ=="], "peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="], @@ -3362,19 +3370,19 @@ "tunnel-agent": ["tunnel-agent@0.6.0", "", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w=="], - "turbo": ["turbo@2.7.2", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.2", "turbo-darwin-arm64": "2.7.2", "turbo-linux-64": "2.7.2", "turbo-linux-arm64": "2.7.2", "turbo-windows-64": "2.7.2", "turbo-windows-arm64": "2.7.2" }, "bin": { "turbo": "bin/turbo" } }, "sha512-5JIA5aYBAJSAhrhbyag1ZuMSgUZnHtI+Sq3H8D3an4fL8PeF+L1yYvbEJg47akP1PFfATMf5ehkqFnxfkmuwZQ=="], + "turbo": ["turbo@2.7.3", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.3", "turbo-darwin-arm64": "2.7.3", "turbo-linux-64": "2.7.3", "turbo-linux-arm64": "2.7.3", "turbo-windows-64": "2.7.3", "turbo-windows-arm64": "2.7.3" }, "bin": { "turbo": "bin/turbo" } }, "sha512-+HjKlP4OfYk+qzvWNETA3cUO5UuK6b5MSc2UJOKyvBceKucQoQGb2g7HlC2H1GHdkfKrk4YF1VPvROkhVZDDLQ=="], - "turbo-darwin-64": ["turbo-darwin-64@2.7.2", "", { "os": "darwin", "cpu": "x64" }, 
"sha512-dxY3X6ezcT5vm3coK6VGixbrhplbQMwgNsCsvZamS/+/6JiebqW9DKt4NwpgYXhDY2HdH00I7FWs3wkVuan4rA=="], + "turbo-darwin-64": ["turbo-darwin-64@2.7.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-aZHhvRiRHXbJw1EcEAq4aws1hsVVUZ9DPuSFaq9VVFAKCup7niIEwc22glxb7240yYEr1vLafdQ2U294Vcwz+w=="], - "turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1bXmuwPLqNFt3mzrtYcVx1sdJ8UYb124Bf48nIgcpMCGZy3kDhgxNv1503kmuK/37OGOZbsWSQFU4I08feIuSg=="], + "turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-CkVrHSq+Bnhl9sX2LQgqQYVfLTWC2gvI74C4758OmU0djfrssDKU9d4YQF0AYXXhIIRZipSXfxClQziIMD+EAg=="], - "turbo-linux-64": ["turbo-linux-64@2.7.2", "", { "os": "linux", "cpu": "x64" }, "sha512-kP+TiiMaiPugbRlv57VGLfcjFNsFbo8H64wMBCPV2270Or2TpDCBULMzZrvEsvWFjT3pBFvToYbdp8/Kw0jAQg=="], + "turbo-linux-64": ["turbo-linux-64@2.7.3", "", { "os": "linux", "cpu": "x64" }, "sha512-GqDsCNnzzr89kMaLGpRALyigUklzgxIrSy2pHZVXyifgczvYPnLglex78Aj3T2gu+T3trPPH2iJ+pWucVOCC2Q=="], - "turbo-linux-arm64": ["turbo-linux-arm64@2.7.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-VDJwQ0+8zjAfbyY6boNaWfP6RIez4ypKHxwkuB6SrWbOSk+vxTyW5/hEjytTwK8w/TsbKVcMDyvpora8tEsRFw=="], + "turbo-linux-arm64": ["turbo-linux-arm64@2.7.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-NdCDTfIcIo3dWjsiaAHlxu5gW61Ed/8maah1IAF/9E3EtX0aAHNiBMbuYLZaR4vRJ7BeVkYB6xKWRtdFLZ0y3g=="], - "turbo-windows-64": ["turbo-windows-64@2.7.2", "", { "os": "win32", "cpu": "x64" }, "sha512-rPjqQXVnI6A6oxgzNEE8DNb6Vdj2Wwyhfv3oDc+YM3U9P7CAcBIlKv/868mKl4vsBtz4ouWpTQNXG8vljgJO+w=="], + "turbo-windows-64": ["turbo-windows-64@2.7.3", "", { "os": "win32", "cpu": "x64" }, "sha512-7bVvO987daXGSJVYBoG8R4Q+csT1pKIgLJYZevXRQ0Hqw0Vv4mKme/TOjYXs9Qb1xMKh51Tb3bXKDbd8/4G08g=="], - "turbo-windows-arm64": ["turbo-windows-arm64@2.7.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-tcnHvBhO515OheIFWdxA+qUvZzNqqcHbLVFc1+n+TJ1rrp8prYicQtbtmsiKgMvr/54jb9jOabU62URAobnB7g=="], + "turbo-windows-arm64": ["turbo-windows-arm64@2.7.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-nTodweTbPmkvwMu/a55XvjMsPtuyUSC+sV7f/SR57K36rB2I0YG21qNETN+00LOTUW9B3omd8XkiXJkt4kx/cw=="], "tweetnacl": ["tweetnacl@0.14.5", "", {}, "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA=="], @@ -4046,6 +4054,8 @@ "path-scurry/lru-cache": ["lru-cache@11.2.4", "", {}, "sha512-B5Y16Jr9LB9dHVkh6ZevG+vAbOsNOYCX+sXvFWFu7B3Iz5mijW3zdbMyhsh8ANd2mSWBYdJgnqi+mL7/LrOPYg=="], + "pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="], + "pino/thread-stream": ["thread-stream@3.1.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-OqyPZ9u96VohAyMfJykzmivOrY2wfMSf3C5TtFJVgN+Hm6aj+voFhlK+kZEIv2FBh1X6Xp3DlnCOfEQ3B2J86A=="], "pino-pretty/pino-abstract-transport": ["pino-abstract-transport@3.0.0", "", { "dependencies": { "split2": "^4.0.0" } }, "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg=="], diff --git a/package.json b/package.json index 1b9ddf95f..4fa9176e0 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "drizzle-kit": "^0.31.4", "husky": "9.1.7", "lint-staged": "16.0.0", - "turbo": "2.7.2" + "turbo": "2.7.3" }, "lint-staged": { "*.{js,jsx,ts,tsx,json,css,scss}": [