diff --git a/apps/sim/app/api/knowledge/utils.test.ts b/apps/sim/app/api/knowledge/utils.test.ts index eb4b92eb3..9ae1372c0 100644 --- a/apps/sim/app/api/knowledge/utils.test.ts +++ b/apps/sim/app/api/knowledge/utils.test.ts @@ -136,16 +136,29 @@ vi.mock('@sim/db', () => { }, }), }), + delete: () => ({ + where: () => Promise.resolve(), + }), + insert: () => ({ + values: (records: any) => { + dbOps.order.push('insert') + dbOps.insertRecords.push(records) + return Promise.resolve() + }, + }), transaction: vi.fn(async (fn: any) => { await fn({ - insert: (table: any) => ({ + delete: () => ({ + where: () => Promise.resolve(), + }), + insert: () => ({ values: (records: any) => { dbOps.order.push('insert') dbOps.insertRecords.push(records) return Promise.resolve() }, }), - update: (table: any) => ({ + update: () => ({ set: (payload: any) => ({ where: () => { dbOps.updatePayloads.push(payload) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx index 21c0f4d09..b56fc4ec4 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/base.tsx @@ -453,6 +453,8 @@ export function KnowledgeBase({ error: knowledgeBaseError, refresh: refreshKnowledgeBase, } = useKnowledgeBase(id) + const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false) + const { documents, pagination, @@ -468,6 +470,7 @@ export function KnowledgeBase({ offset: (currentPage - 1) * DOCUMENTS_PER_PAGE, sortBy, sortOrder, + refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false, }) const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id) @@ -534,25 +537,15 @@ export function KnowledgeBase({ ) useEffect(() => { - const hasProcessingDocuments = documents.some( + const processing = documents.some( (doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing' ) + setHasProcessingDocuments(processing) - if (!hasProcessingDocuments) return - - const refreshInterval = setInterval(async () => { - try { - if (!isDeleting) { - await checkForDeadProcesses() - await refreshDocuments() - } - } catch (error) { - logger.error('Error refreshing documents:', error) - } - }, 3000) - - return () => clearInterval(refreshInterval) - }, [documents, refreshDocuments, isDeleting]) + if (processing) { + checkForDeadProcesses() + } + }, [documents]) /** * Checks for documents with stale processing states and marks them as failed @@ -672,25 +665,6 @@ export function KnowledgeBase({ await refreshDocuments() - let refreshAttempts = 0 - const maxRefreshAttempts = 3 - const refreshInterval = setInterval(async () => { - try { - refreshAttempts++ - await refreshDocuments() - if (refreshAttempts >= maxRefreshAttempts) { - clearInterval(refreshInterval) - } - } catch (error) { - logger.error('Error refreshing documents after retry:', error) - clearInterval(refreshInterval) - } - }, 1000) - - setTimeout(() => { - clearInterval(refreshInterval) - }, 4000) - logger.info(`Document retry initiated successfully for: ${docId}`) } catch (err) { logger.error('Error retrying document:', err) diff --git a/apps/sim/background/knowledge-processing.ts b/apps/sim/background/knowledge-processing.ts index 920c129af..8f7d75c42 100644 --- a/apps/sim/background/knowledge-processing.ts +++ b/apps/sim/background/knowledge-processing.ts @@ -27,6 +27,7 @@ export type DocumentProcessingPayload = { export const processDocument = task({ id: 'knowledge-process-document', maxDuration: 
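The base.tsx hunk above drops the hand-rolled setInterval refresh loop and instead tracks whether any document is still pending/processing, passing a 3-second refetchInterval (or false) down to the documents query. A minimal sketch of the same idea, assuming TanStack Query v5's function form of refetchInterval and a hypothetical fetchDocuments response shape (the PR itself threads a plain number/false through useKnowledgeBaseDocuments instead):

import { useQuery } from '@tanstack/react-query'

interface Doc {
  id: string
  processingStatus: 'pending' | 'processing' | 'completed' | 'failed'
}

// Hypothetical fetcher standing in for the real documents API call.
async function fetchDocuments(knowledgeBaseId: string): Promise<Doc[]> {
  const res = await fetch(`/api/knowledge/${knowledgeBaseId}/documents`)
  if (!res.ok) throw new Error(`Failed to load documents: ${res.status}`)
  return (await res.json()).documents as Doc[]
}

export function useDocumentsWithPolling(knowledgeBaseId: string, isDeleting: boolean) {
  return useQuery({
    queryKey: ['documents', knowledgeBaseId],
    queryFn: () => fetchDocuments(knowledgeBaseId),
    // Poll every 3s only while something is still processing and no delete is
    // in flight; returning false disables polling, as in the diff above.
    refetchInterval: (query) => {
      const docs = query.state.data ?? []
      const processing = docs.some(
        (d) => d.processingStatus === 'pending' || d.processingStatus === 'processing'
      )
      return processing && !isDeleting ? 3000 : false
    },
  })
}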
env.KB_CONFIG_MAX_DURATION || 600, + machine: 'large-1x', // 2 vCPU, 2GB RAM - needed for large PDF processing retry: { maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3, factor: env.KB_CONFIG_RETRY_FACTOR || 2, diff --git a/apps/sim/hooks/queries/knowledge.ts b/apps/sim/hooks/queries/knowledge.ts index 977d0e8c9..304c3d216 100644 --- a/apps/sim/hooks/queries/knowledge.ts +++ b/apps/sim/hooks/queries/knowledge.ts @@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery( params: KnowledgeDocumentsParams, options?: { enabled?: boolean + refetchInterval?: number | false } ) { const paramsKey = serializeDocumentParams(params) @@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery( enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId), staleTime: 60 * 1000, placeholderData: keepPreviousData, + refetchInterval: options?.refetchInterval ?? false, }) } diff --git a/apps/sim/hooks/use-knowledge.ts b/apps/sim/hooks/use-knowledge.ts index 14b84393e..be70cf9f7 100644 --- a/apps/sim/hooks/use-knowledge.ts +++ b/apps/sim/hooks/use-knowledge.ts @@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments( sortBy?: string sortOrder?: string enabled?: boolean + refetchInterval?: number | false } ) { const queryClient = useQueryClient() @@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments( }, { enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId), + refetchInterval: options?.refetchInterval, } ) diff --git a/apps/sim/lib/chunkers/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts index cb64e3867..a75c94e23 100644 --- a/apps/sim/lib/chunkers/docs-chunker.ts +++ b/apps/sim/lib/chunkers/docs-chunker.ts @@ -16,7 +16,7 @@ interface HeaderInfo { interface Frontmatter { title?: string description?: string - [key: string]: any + [key: string]: unknown } const logger = createLogger('DocsChunker') diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts index 5d81b8262..458f8d3e8 100644 --- a/apps/sim/lib/chunkers/json-yaml-chunker.ts +++ b/apps/sim/lib/chunkers/json-yaml-chunker.ts @@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators' const logger = createLogger('JsonYamlChunker') +type JsonPrimitive = string | number | boolean | null +type JsonValue = JsonPrimitive | JsonObject | JsonArray +type JsonObject = { [key: string]: JsonValue } +type JsonArray = JsonValue[] + function getTokenCount(text: string): number { try { return getAccurateTokenCount(text, 'text-embedding-3-small') @@ -59,11 +64,11 @@ export class JsonYamlChunker { */ async chunk(content: string): Promise { try { - let data: any + let data: JsonValue try { - data = JSON.parse(content) + data = JSON.parse(content) as JsonValue } catch { - data = yaml.load(content) + data = yaml.load(content) as JsonValue } const chunks = this.chunkStructuredData(data) @@ -86,7 +91,7 @@ export class JsonYamlChunker { /** * Chunk structured data based on its structure */ - private chunkStructuredData(data: any, path: string[] = []): Chunk[] { + private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] { const chunks: Chunk[] = [] if (Array.isArray(data)) { @@ -94,7 +99,7 @@ export class JsonYamlChunker { } if (typeof data === 'object' && data !== null) { - return this.chunkObject(data, path) + return this.chunkObject(data as JsonObject, path) } const content = JSON.stringify(data, null, 2) @@ -118,9 +123,9 @@ export class JsonYamlChunker { /** * Chunk an array intelligently */ - private chunkArray(arr: any[], path: string[]): Chunk[] 
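The JsonYamlChunker hunk above replaces `any` with a small recursive JSON value type. A sketch of that typing approach on its own, with an illustrative parse helper that tries JSON first and falls back to YAML the way the chunker's chunk() method does (js-yaml's load returns unknown in current typings, hence the cast):

import * as yaml from 'js-yaml'

type JsonPrimitive = string | number | boolean | null
type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]

// Illustrative helper: parse content as JSON, falling back to YAML.
function parseStructured(content: string): JsonValue {
  try {
    return JSON.parse(content) as JsonValue
  } catch {
    return yaml.load(content) as JsonValue
  }
}

// The recursive type narrows naturally with the same checks the chunker uses.
function countLeaves(value: JsonValue): number {
  if (Array.isArray(value)) return value.reduce((n, v) => n + countLeaves(v), 0)
  if (typeof value === 'object' && value !== null) {
    return Object.values(value).reduce((n, v) => n + countLeaves(v), 0)
  }
  return 1
}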
{ + private chunkArray(arr: JsonArray, path: string[]): Chunk[] { const chunks: Chunk[] = [] - let currentBatch: any[] = [] + let currentBatch: JsonValue[] = [] let currentTokens = 0 const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : '' @@ -194,7 +199,7 @@ export class JsonYamlChunker { /** * Chunk an object intelligently */ - private chunkObject(obj: Record, path: string[]): Chunk[] { + private chunkObject(obj: JsonObject, path: string[]): Chunk[] { const chunks: Chunk[] = [] const entries = Object.entries(obj) @@ -213,7 +218,7 @@ export class JsonYamlChunker { return chunks } - let currentObj: Record = {} + let currentObj: JsonObject = {} let currentTokens = 0 let currentKeys: string[] = [] diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts index c28f86197..7dbbde0cf 100644 --- a/apps/sim/lib/chunkers/text-chunker.ts +++ b/apps/sim/lib/chunkers/text-chunker.ts @@ -110,10 +110,12 @@ export class TextChunker { chunks.push(currentChunk.trim()) } - // Start new chunk with current part // If part itself is too large, split it further if (this.estimateTokens(part) > this.chunkSize) { - chunks.push(...(await this.splitRecursively(part, separatorIndex + 1))) + const subChunks = await this.splitRecursively(part, separatorIndex + 1) + for (const subChunk of subChunks) { + chunks.push(subChunk) + } currentChunk = '' } else { currentChunk = part diff --git a/apps/sim/lib/core/config/env.ts b/apps/sim/lib/core/config/env.ts index 5cca8759b..50ea62d6d 100644 --- a/apps/sim/lib/core/config/env.ts +++ b/apps/sim/lib/core/config/env.ts @@ -178,6 +178,7 @@ export const env = createEnv({ KB_CONFIG_BATCH_SIZE: z.number().optional().default(2000), // Chunks to process per embedding batch KB_CONFIG_DELAY_BETWEEN_BATCHES: z.number().optional().default(0), // Delay between batches in ms (0 for max speed) KB_CONFIG_DELAY_BETWEEN_DOCUMENTS: z.number().optional().default(50), // Delay between documents in ms + KB_CONFIG_CHUNK_CONCURRENCY: z.number().optional().default(10), // Concurrent PDF chunk OCR processing // Real-time Communication SOCKET_SERVER_URL: z.string().url().optional(), // WebSocket server URL for real-time features diff --git a/apps/sim/lib/file-parsers/doc-parser.ts b/apps/sim/lib/file-parsers/doc-parser.ts index a0e0c1bc3..0d7379721 100644 --- a/apps/sim/lib/file-parsers/doc-parser.ts +++ b/apps/sim/lib/file-parsers/doc-parser.ts @@ -17,8 +17,6 @@ export class DocParser implements FileParser { throw new Error(`File not found: ${filePath}`) } - logger.info(`Parsing DOC file: ${filePath}`) - const buffer = await readFile(filePath) return this.parseBuffer(buffer) } catch (error) { @@ -29,53 +27,80 @@ export class DocParser implements FileParser { async parseBuffer(buffer: Buffer): Promise { try { - logger.info('Parsing DOC buffer, size:', buffer.length) - if (!buffer || buffer.length === 0) { throw new Error('Empty buffer provided') } - let parseOfficeAsync try { const officeParser = await import('officeparser') - parseOfficeAsync = officeParser.parseOfficeAsync - } catch (importError) { - logger.warn('officeparser not available, using fallback extraction') - return this.fallbackExtraction(buffer) + const result = await officeParser.parseOfficeAsync(buffer) + + if (result) { + const resultString = typeof result === 'string' ? 
result : String(result) + const content = sanitizeTextForUTF8(resultString.trim()) + + if (content.length > 0) { + return { + content, + metadata: { + characterCount: content.length, + extractionMethod: 'officeparser', + }, + } + } + } + } catch (officeError) { + logger.warn('officeparser failed, trying mammoth:', officeError) } try { - const result = await parseOfficeAsync(buffer) + const mammoth = await import('mammoth') + const result = await mammoth.extractRawText({ buffer }) - if (!result) { - throw new Error('officeparser returned no result') + if (result.value && result.value.trim().length > 0) { + const content = sanitizeTextForUTF8(result.value.trim()) + return { + content, + metadata: { + characterCount: content.length, + extractionMethod: 'mammoth', + messages: result.messages, + }, + } } - - const resultString = typeof result === 'string' ? result : String(result) - - const content = sanitizeTextForUTF8(resultString.trim()) - - logger.info('DOC parsing completed successfully with officeparser') - - return { - content: content, - metadata: { - characterCount: content.length, - extractionMethod: 'officeparser', - }, - } - } catch (extractError) { - logger.warn('officeparser failed, using fallback:', extractError) - return this.fallbackExtraction(buffer) + } catch (mammothError) { + logger.warn('mammoth failed:', mammothError) } + + return this.fallbackExtraction(buffer) } catch (error) { - logger.error('DOC buffer parsing error:', error) + logger.error('DOC parsing error:', error) throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`) } } private fallbackExtraction(buffer: Buffer): FileParseResult { - logger.info('Using fallback text extraction for DOC file') + const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf + + if (!isBinaryDoc) { + const textContent = buffer.toString('utf8').trim() + + if (textContent.length > 0) { + const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0 + const isProbablyText = printableChars / textContent.length > 0.9 + + if (isProbablyText) { + return { + content: sanitizeTextForUTF8(textContent), + metadata: { + extractionMethod: 'plaintext-fallback', + characterCount: textContent.length, + warning: 'File is not a valid DOC format, extracted as plain text', + }, + } + } + } + } const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000)) diff --git a/apps/sim/lib/file-parsers/docx-parser.ts b/apps/sim/lib/file-parsers/docx-parser.ts index 5663a50b6..ab49f86af 100644 --- a/apps/sim/lib/file-parsers/docx-parser.ts +++ b/apps/sim/lib/file-parsers/docx-parser.ts @@ -2,13 +2,18 @@ import { readFile } from 'fs/promises' import { createLogger } from '@sim/logger' import mammoth from 'mammoth' import type { FileParseResult, FileParser } from '@/lib/file-parsers/types' +import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils' const logger = createLogger('DocxParser') -// Define interface for mammoth result +interface MammothMessage { + type: 'warning' | 'error' + message: string +} + interface MammothResult { value: string - messages: any[] + messages: MammothMessage[] } export class DocxParser implements FileParser { @@ -19,7 +24,6 @@ export class DocxParser implements FileParser { } const buffer = await readFile(filePath) - return this.parseBuffer(buffer) } catch (error) { logger.error('DOCX file error:', error) @@ -29,26 +33,74 @@ export class DocxParser implements FileParser { async parseBuffer(buffer: Buffer): Promise { try { - logger.info('Parsing buffer, size:', 
buffer.length) + if (!buffer || buffer.length === 0) { + throw new Error('Empty buffer provided') + } - const result = await mammoth.extractRawText({ buffer }) - - let htmlResult: MammothResult = { value: '', messages: [] } try { - htmlResult = await mammoth.convertToHtml({ buffer }) - } catch (htmlError) { - logger.warn('HTML conversion warning:', htmlError) + const result = await mammoth.extractRawText({ buffer }) + + if (result.value && result.value.trim().length > 0) { + let htmlResult: MammothResult = { value: '', messages: [] } + try { + htmlResult = await mammoth.convertToHtml({ buffer }) + } catch { + // HTML conversion is optional + } + + return { + content: sanitizeTextForUTF8(result.value), + metadata: { + extractionMethod: 'mammoth', + messages: [...result.messages, ...htmlResult.messages], + html: htmlResult.value, + }, + } + } + } catch (mammothError) { + logger.warn('mammoth failed, trying officeparser:', mammothError) } - return { - content: result.value, - metadata: { - messages: [...result.messages, ...htmlResult.messages], - html: htmlResult.value, - }, + try { + const officeParser = await import('officeparser') + const result = await officeParser.parseOfficeAsync(buffer) + + if (result) { + const resultString = typeof result === 'string' ? result : String(result) + const content = sanitizeTextForUTF8(resultString.trim()) + + if (content.length > 0) { + return { + content, + metadata: { + extractionMethod: 'officeparser', + characterCount: content.length, + }, + } + } + } + } catch (officeError) { + logger.warn('officeparser failed:', officeError) } + + const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b + if (!isZipFile) { + const textContent = buffer.toString('utf8').trim() + if (textContent.length > 0) { + return { + content: sanitizeTextForUTF8(textContent), + metadata: { + extractionMethod: 'plaintext-fallback', + characterCount: textContent.length, + warning: 'File is not a valid DOCX format, extracted as plain text', + }, + } + } + } + + throw new Error('Failed to extract text from DOCX file') } catch (error) { - logger.error('DOCX buffer parsing error:', error) + logger.error('DOCX parsing error:', error) throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`) } } diff --git a/apps/sim/lib/file-parsers/types.ts b/apps/sim/lib/file-parsers/types.ts index 4d02cc547..90baa0543 100644 --- a/apps/sim/lib/file-parsers/types.ts +++ b/apps/sim/lib/file-parsers/types.ts @@ -1,6 +1,22 @@ +export interface FileParseMetadata { + characterCount?: number + pageCount?: number + extractionMethod?: string + warning?: string + messages?: unknown[] + html?: string + type?: string + headers?: string[] + totalRows?: number + rowCount?: number + sheetNames?: string[] + source?: string + [key: string]: unknown +} + export interface FileParseResult { content: string - metadata?: Record + metadata?: FileParseMetadata } export interface FileParser { diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index e10935cbd..632e91fa8 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -1,8 +1,10 @@ import { createLogger } from '@sim/logger' +import { PDFDocument } from 'pdf-lib' import { getBYOKKey } from '@/lib/api-key/byok' import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers' import { env } from '@/lib/core/config/env' import { parseBuffer, parseFile } 
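Both parser hunks above gate their plain-text fallback on file signatures: legacy .doc files are OLE compound documents starting with 0xD0 0xCF, and .docx files are ZIP archives starting with 0x50 0x4B ("PK"). A small standalone sketch of that sniffing logic (helper names are illustrative, not from the PR):

type OfficeKind = 'doc-ole' | 'docx-zip' | 'unknown'

// Classify a buffer by its leading magic bytes, mirroring the checks used by
// the DOC/DOCX fallbacks above.
function sniffOfficeKind(buffer: Buffer): OfficeKind {
  if (buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf) return 'doc-ole'
  if (buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b) return 'docx-zip'
  return 'unknown'
}

// Heuristic from the DOC fallback: only treat the buffer as plain text when
// the vast majority of characters are printable ASCII or whitespace.
function looksLikePlainText(buffer: Buffer): boolean {
  const text = buffer.toString('utf8').trim()
  if (text.length === 0) return false
  const printable = text.match(/[\x20-\x7E\n\r\t]/g)?.length ?? 0
  return printable / text.length > 0.9
}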
from '@/lib/file-parsers' +import type { FileParseMetadata } from '@/lib/file-parsers/types' import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils' import { StorageService } from '@/lib/uploads' import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server' @@ -15,6 +17,8 @@ const TIMEOUTS = { MISTRAL_OCR_API: 120000, } as const +const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY + type OCRResult = { success: boolean error?: string @@ -36,6 +40,61 @@ type OCRRequestBody = { include_image_base64: boolean } +const MISTRAL_MAX_PAGES = 1000 + +/** + * Get page count from a PDF buffer using unpdf + */ +async function getPdfPageCount(buffer: Buffer): Promise { + try { + const { getDocumentProxy } = await import('unpdf') + const uint8Array = new Uint8Array(buffer) + const pdf = await getDocumentProxy(uint8Array) + return pdf.numPages + } catch (error) { + logger.warn('Failed to get PDF page count:', error) + return 0 + } +} + +/** + * Split a PDF buffer into multiple smaller PDFs + * Returns an array of PDF buffers, each with at most maxPages pages + */ +async function splitPdfIntoChunks( + pdfBuffer: Buffer, + maxPages: number +): Promise<{ buffer: Buffer; startPage: number; endPage: number }[]> { + const sourcePdf = await PDFDocument.load(pdfBuffer) + const totalPages = sourcePdf.getPageCount() + + if (totalPages <= maxPages) { + return [{ buffer: pdfBuffer, startPage: 0, endPage: totalPages - 1 }] + } + + const chunks: { buffer: Buffer; startPage: number; endPage: number }[] = [] + + for (let startPage = 0; startPage < totalPages; startPage += maxPages) { + const endPage = Math.min(startPage + maxPages - 1, totalPages - 1) + const pageCount = endPage - startPage + 1 + + const newPdf = await PDFDocument.create() + const pageIndices = Array.from({ length: pageCount }, (_, i) => startPage + i) + const copiedPages = await newPdf.copyPages(sourcePdf, pageIndices) + + copiedPages.forEach((page) => newPdf.addPage(page)) + + const pdfBytes = await newPdf.save() + chunks.push({ + buffer: Buffer.from(pdfBytes), + startPage, + endPage, + }) + } + + return chunks +} + type AzureOCRResponse = { pages?: OCRPage[] [key: string]: unknown @@ -81,7 +140,7 @@ export async function processDocument( const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined let chunks: Chunk[] - const metadata = 'metadata' in parseResult ? parseResult.metadata : {} + const metadata: FileParseMetadata = parseResult.metadata ?? {} const isJsonYaml = metadata.type === 'json' || @@ -97,10 +156,11 @@ export async function processDocument( }) } else if (StructuredDataChunker.isStructuredData(content, mimeType)) { logger.info('Using structured data chunker for spreadsheet/CSV content') + const rowCount = metadata.totalRows ?? metadata.rowCount chunks = await StructuredDataChunker.chunkStructuredData(content, { chunkSize, headers: metadata.headers, - totalRows: metadata.totalRows || metadata.rowCount, + totalRows: typeof rowCount === 'number' ? 
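getPdfPageCount and splitPdfIntoChunks above use unpdf for a cheap page count and pdf-lib to copy page ranges into standalone documents. A compact sketch of the pdf-lib side of that pattern, reduced to extracting a single inclusive page range (wrapper name is illustrative):

import { PDFDocument } from 'pdf-lib'

// Copy pages [start, end] of a source PDF into a new standalone PDF buffer,
// the same pdf-lib calls splitPdfIntoChunks uses above.
async function extractPageRange(pdfBuffer: Buffer, start: number, end: number): Promise<Buffer> {
  const source = await PDFDocument.load(pdfBuffer)
  const target = await PDFDocument.create()
  const indices = Array.from({ length: end - start + 1 }, (_, i) => start + i)
  const pages = await target.copyPages(source, indices)
  for (const page of pages) target.addPage(page)
  return Buffer.from(await target.save())
}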
rowCount : undefined, sheetName: metadata.sheetNames?.[0], }) } else { @@ -153,7 +213,7 @@ async function parseDocument( content: string processingMethod: 'file-parser' | 'mistral-ocr' cloudUrl?: string - metadata?: any + metadata?: FileParseMetadata }> { const isPDF = mimeType === 'application/pdf' const hasAzureMistralOCR = @@ -165,7 +225,7 @@ async function parseDocument( if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) { if (hasAzureMistralOCR) { logger.info(`Using Azure Mistral OCR: ${filename}`) - return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId) + return parseWithAzureMistralOCR(fileUrl, filename, mimeType) } if (hasMistralOCR) { @@ -188,13 +248,32 @@ async function handleFileForOCR( const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/') if (isExternalHttps) { - return { httpsUrl: fileUrl } + if (mimeType === 'application/pdf') { + logger.info(`handleFileForOCR: Downloading external PDF to check page count`) + try { + const buffer = await downloadFileWithTimeout(fileUrl) + logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`) + return { httpsUrl: fileUrl, buffer } + } catch (error) { + logger.warn( + `handleFileForOCR: Failed to download external PDF for page count check, proceeding without batching`, + { + error: error instanceof Error ? error.message : String(error), + } + ) + return { httpsUrl: fileUrl, buffer: undefined } + } + } + logger.info(`handleFileForOCR: Using external URL directly`) + return { httpsUrl: fileUrl, buffer: undefined } } logger.info(`Uploading "${filename}" to cloud storage for OCR`) const buffer = await downloadFileWithTimeout(fileUrl) + logger.info(`Downloaded ${filename}: ${buffer.length} bytes`) + try { const metadata: Record = { originalName: filename, @@ -224,8 +303,7 @@ async function handleFileForOCR( 900 // 15 minutes ) - logger.info(`Successfully uploaded for OCR: ${cloudResult.key}`) - return { httpsUrl, cloudUrl: httpsUrl } + return { httpsUrl, cloudUrl: httpsUrl, buffer } } catch (uploadError) { const message = uploadError instanceof Error ? uploadError.message : 'Unknown error' throw new Error(`Cloud upload failed: ${message}. Cloud upload is required for OCR.`) @@ -321,13 +399,7 @@ async function makeOCRRequest( } } -async function parseWithAzureMistralOCR( - fileUrl: string, - filename: string, - mimeType: string, - userId?: string, - workspaceId?: string | null -) { +async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) { validateOCRConfig( env.OCR_AZURE_API_KEY, env.OCR_AZURE_ENDPOINT, @@ -336,6 +408,19 @@ async function parseWithAzureMistralOCR( ) const fileBuffer = await downloadFileForBase64(fileUrl) + + if (mimeType === 'application/pdf') { + const pageCount = await getPdfPageCount(fileBuffer) + if (pageCount > MISTRAL_MAX_PAGES) { + logger.info( + `PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` + + `Falling back to file parser.` + ) + return parseWithFileParser(fileUrl, filename, mimeType) + } + logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`) + } + const base64Data = fileBuffer.toString('base64') const dataUri = `data:${mimeType};base64,${base64Data}` @@ -374,17 +459,7 @@ async function parseWithAzureMistralOCR( message: error instanceof Error ? 
error.message : String(error), }) - const fallbackMistralKey = await getMistralApiKey(workspaceId) - if (fallbackMistralKey) { - return parseWithMistralOCR( - fileUrl, - filename, - mimeType, - userId, - workspaceId, - fallbackMistralKey - ) - } + logger.info(`Falling back to file parser: ${filename}`) return parseWithFileParser(fileUrl, filename, mimeType) } } @@ -406,50 +481,35 @@ async function parseWithMistralOCR( throw new Error('Mistral parser tool not configured') } - const { httpsUrl, cloudUrl } = await handleFileForOCR( + const { httpsUrl, cloudUrl, buffer } = await handleFileForOCR( fileUrl, filename, mimeType, userId, workspaceId ) + + logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`) + + let pageCount = 0 + if (mimeType === 'application/pdf' && buffer) { + pageCount = await getPdfPageCount(buffer) + logger.info(`PDF page count for ${filename}: ${pageCount}`) + } + + const needsBatching = pageCount > MISTRAL_MAX_PAGES + + if (needsBatching && buffer) { + logger.info( + `PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.` + ) + return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl) + } + const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const } try { - const response = await retryWithExponentialBackoff( - async () => { - let url = - typeof mistralParserTool.request!.url === 'function' - ? mistralParserTool.request!.url(params) - : mistralParserTool.request!.url - - const isInternalRoute = url.startsWith('/') - - if (isInternalRoute) { - const { getBaseUrl } = await import('@/lib/core/utils/urls') - url = `${getBaseUrl()}${url}` - } - - let headers = - typeof mistralParserTool.request!.headers === 'function' - ? mistralParserTool.request!.headers(params) - : mistralParserTool.request!.headers - - if (isInternalRoute) { - const { generateInternalToken } = await import('@/lib/auth/internal') - const internalToken = await generateInternalToken(userId) - headers = { - ...headers, - Authorization: `Bearer ${internalToken}`, - } - } - - const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody - return makeOCRRequest(url, headers as Record, requestBody) - }, - { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 } - ) - + const response = await executeMistralOCRRequest(params, userId) const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult const content = processOCRContent(result, filename) @@ -464,10 +524,204 @@ async function parseWithMistralOCR( } } +async function executeMistralOCRRequest( + params: { filePath: string; apiKey: string; resultType: 'text' }, + userId?: string +): Promise { + return retryWithExponentialBackoff( + async () => { + let url = + typeof mistralParserTool.request!.url === 'function' + ? mistralParserTool.request!.url(params) + : mistralParserTool.request!.url + + const isInternalRoute = url.startsWith('/') + + if (isInternalRoute) { + const { getBaseUrl } = await import('@/lib/core/utils/urls') + url = `${getBaseUrl()}${url}` + } + + let headers = + typeof mistralParserTool.request!.headers === 'function' + ? 
mistralParserTool.request!.headers(params) + : mistralParserTool.request!.headers + + if (isInternalRoute) { + const { generateInternalToken } = await import('@/lib/auth/internal') + const internalToken = await generateInternalToken(userId) + headers = { + ...headers, + Authorization: `Bearer ${internalToken}`, + } + } + + const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody + return makeOCRRequest(url, headers as Record, requestBody) + }, + { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 } + ) +} + +/** + * Process a single PDF chunk: upload to S3, OCR, cleanup + */ +async function processChunk( + chunk: { buffer: Buffer; startPage: number; endPage: number }, + chunkIndex: number, + totalChunks: number, + filename: string, + apiKey: string, + userId?: string +): Promise<{ index: number; content: string | null }> { + const chunkPageCount = chunk.endPage - chunk.startPage + 1 + + logger.info( + `Processing chunk ${chunkIndex + 1}/${totalChunks} (pages ${chunk.startPage + 1}-${chunk.endPage + 1}, ${chunkPageCount} pages)` + ) + + let uploadedKey: string | null = null + + try { + // Upload the chunk to S3 + const timestamp = Date.now() + const uniqueId = Math.random().toString(36).substring(2, 9) + const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_') + const chunkKey = `kb/${timestamp}-${uniqueId}-chunk${chunkIndex + 1}-${safeFileName}` + + const metadata: Record = { + originalName: `${filename}_chunk${chunkIndex + 1}`, + uploadedAt: new Date().toISOString(), + purpose: 'knowledge-base', + ...(userId && { userId }), + } + + const uploadResult = await StorageService.uploadFile({ + file: chunk.buffer, + fileName: `${filename}_chunk${chunkIndex + 1}`, + contentType: 'application/pdf', + context: 'knowledge-base', + customKey: chunkKey, + metadata, + }) + + uploadedKey = uploadResult.key + + const chunkUrl = await StorageService.generatePresignedDownloadUrl( + uploadResult.key, + 'knowledge-base', + 900 // 15 minutes + ) + + logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`) + + // Process the chunk with Mistral OCR + const params = { + filePath: chunkUrl, + apiKey, + resultType: 'text' as const, + } + + const response = await executeMistralOCRRequest(params, userId) + const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult + + if (result.success && result.output?.content) { + logger.info(`Chunk ${chunkIndex + 1}/${totalChunks} completed successfully`) + return { index: chunkIndex, content: result.output.content } + } + logger.warn(`Chunk ${chunkIndex + 1}/${totalChunks} returned no content`) + return { index: chunkIndex, content: null } + } catch (error) { + logger.error(`Chunk ${chunkIndex + 1}/${totalChunks} failed:`, { + message: error instanceof Error ? error.message : String(error), + }) + return { index: chunkIndex, content: null } + } finally { + // Clean up the chunk file from S3 after processing + if (uploadedKey) { + try { + await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' }) + logger.info(`Cleaned up chunk ${chunkIndex + 1} from S3`) + } catch (deleteError) { + logger.warn(`Failed to clean up chunk ${chunkIndex + 1} from S3:`, { + message: deleteError instanceof Error ? 
deleteError.message : String(deleteError), + }) + } + } + } +} + +async function processMistralOCRInBatches( + filename: string, + apiKey: string, + pdfBuffer: Buffer, + userId?: string, + cloudUrl?: string +): Promise<{ + content: string + processingMethod: 'mistral-ocr' + cloudUrl?: string +}> { + const totalPages = await getPdfPageCount(pdfBuffer) + logger.info( + `Splitting ${filename} (${totalPages} pages) into chunks of ${MISTRAL_MAX_PAGES} pages` + ) + + const pdfChunks = await splitPdfIntoChunks(pdfBuffer, MISTRAL_MAX_PAGES) + logger.info( + `Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}` + ) + + // Process chunks concurrently with limited concurrency + const results: { index: number; content: string | null }[] = [] + + for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) { + const batch = pdfChunks.slice(i, i + MAX_CONCURRENT_CHUNKS) + const batchPromises = batch.map((chunk, batchIndex) => + processChunk(chunk, i + batchIndex, pdfChunks.length, filename, apiKey, userId) + ) + + const batchResults = await Promise.all(batchPromises) + for (const result of batchResults) { + results.push(result) + } + + logger.info( + `Completed batch ${Math.floor(i / MAX_CONCURRENT_CHUNKS) + 1}/${Math.ceil(pdfChunks.length / MAX_CONCURRENT_CHUNKS)}` + ) + } + + // Sort by index to maintain page order and filter out nulls + const sortedResults = results + .sort((a, b) => a.index - b.index) + .filter((r) => r.content !== null) + .map((r) => r.content as string) + + if (sortedResults.length === 0) { + // Don't fall back to file parser for large PDFs - it produces poor results + // Better to fail clearly than return low-quality extraction + throw new Error( + `OCR failed for all ${pdfChunks.length} chunks of ${filename}. 
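processMistralOCRInBatches above caps parallelism by slicing the chunk list into waves of MAX_CONCURRENT_CHUNKS and awaiting Promise.all per wave. The same pattern as a small generic helper, shown as a sketch rather than code from the PR:

// Run `worker` over `items`, at most `limit` promises in flight per wave,
// preserving input order in the results.
async function mapInBatches<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> {
  const results: R[] = []
  for (let i = 0; i < items.length; i += limit) {
    const wave = items.slice(i, i + limit)
    const waveResults = await Promise.all(wave.map((item, j) => worker(item, i + j)))
    // Plain loop rather than results.push(...waveResults), mirroring the
    // diff's preference for explicit pushes when arrays can get very large.
    for (const r of waveResults) results.push(r)
  }
  return results
}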
` + + `Large PDFs require OCR - file parser fallback would produce poor results.` + ) + } + + const combinedContent = sortedResults.join('\n\n') + logger.info( + `Successfully processed ${sortedResults.length}/${pdfChunks.length} chunks for ${filename}` + ) + + return { + content: combinedContent, + processingMethod: 'mistral-ocr', + cloudUrl, + } +} + async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) { try { let content: string - let metadata: any = {} + let metadata: FileParseMetadata = {} if (fileUrl.startsWith('data:')) { content = await parseDataURI(fileUrl, filename, mimeType) @@ -513,7 +767,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string) async function parseHttpFile( fileUrl: string, filename: string -): Promise<{ content: string; metadata?: any }> { +): Promise<{ content: string; metadata?: FileParseMetadata }> { const buffer = await downloadFileWithTimeout(fileUrl) const extension = filename.split('.').pop()?.toLowerCase() diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index 313ea8d39..d61cbdfd7 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -212,7 +212,6 @@ export async function processDocumentTags( return result } - // Fetch existing tag definitions const existingDefinitions = await db .select() .from(knowledgeBaseTagDefinitions) @@ -220,18 +219,15 @@ export async function processDocumentTags( const existingByName = new Map(existingDefinitions.map((def) => [def.displayName, def])) - // First pass: collect all validation errors const undefinedTags: string[] = [] const typeErrors: string[] = [] for (const tag of tagData) { - // Skip if no tag name if (!tag.tagName?.trim()) continue const tagName = tag.tagName.trim() const fieldType = tag.fieldType || 'text' - // For boolean, check if value is defined; for others, check if value is non-empty const hasValue = fieldType === 'boolean' ? tag.value !== undefined && tag.value !== null && tag.value !== '' @@ -239,14 +235,12 @@ export async function processDocumentTags( if (!hasValue) continue - // Check if tag exists const existingDef = existingByName.get(tagName) if (!existingDef) { undefinedTags.push(tagName) continue } - // Validate value type using shared validation const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value const actualFieldType = existingDef.fieldType || fieldType const validationError = validateTagValue(tagName, String(rawValue), actualFieldType) @@ -255,7 +249,6 @@ export async function processDocumentTags( } } - // Throw combined error if there are any validation issues if (undefinedTags.length > 0 || typeErrors.length > 0) { const errorParts: string[] = [] @@ -270,7 +263,6 @@ export async function processDocumentTags( throw new Error(errorParts.join('\n')) } - // Second pass: process valid tags for (const tag of tagData) { if (!tag.tagName?.trim()) continue @@ -285,14 +277,13 @@ export async function processDocumentTags( if (!hasValue) continue const existingDef = existingByName.get(tagName) - if (!existingDef) continue // Already validated above + if (!existingDef) continue const targetSlot = existingDef.tagSlot const actualFieldType = existingDef.fieldType || fieldType const rawValue = typeof tag.value === 'string' ? 
tag.value.trim() : tag.value const stringValue = String(rawValue).trim() - // Assign value to the slot with proper type conversion (values already validated) if (actualFieldType === 'boolean') { setTagValue(result, targetSlot, parseBooleanValue(stringValue) ?? false) } else if (actualFieldType === 'number') { @@ -440,7 +431,6 @@ export async function processDocumentAsync( logger.info(`[${documentId}] Status updated to 'processing', starting document processor`) - // Use KB's chunkingConfig as fallback if processingOptions not provided const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number } await withTimeout( @@ -469,7 +459,6 @@ export async function processDocumentAsync( `[${documentId}] Document parsed successfully, generating embeddings for ${processed.chunks.length} chunks` ) - // Generate embeddings in batches for large documents const chunkTexts = processed.chunks.map((chunk) => chunk.text) const embeddings: number[][] = [] @@ -485,7 +474,9 @@ export async function processDocumentAsync( logger.info(`[${documentId}] Processing embedding batch ${batchNum}/${totalBatches}`) const batchEmbeddings = await generateEmbeddings(batch, undefined, kb[0].workspaceId) - embeddings.push(...batchEmbeddings) + for (const emb of batchEmbeddings) { + embeddings.push(emb) + } } } @@ -562,23 +553,18 @@ export async function processDocumentAsync( })) await db.transaction(async (tx) => { - // Insert embeddings in batches for large documents if (embeddingRecords.length > 0) { - const batchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH - const totalBatches = Math.ceil(embeddingRecords.length / batchSize) + await tx.delete(embedding).where(eq(embedding.documentId, documentId)) - logger.info( - `[${documentId}] Inserting ${embeddingRecords.length} embeddings in ${totalBatches} batches` - ) - - for (let i = 0; i < embeddingRecords.length; i += batchSize) { - const batch = embeddingRecords.slice(i, i + batchSize) - const batchNum = Math.floor(i / batchSize) + 1 + const insertBatchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH + const batches: (typeof embeddingRecords)[] = [] + for (let i = 0; i < embeddingRecords.length; i += insertBatchSize) { + batches.push(embeddingRecords.slice(i, i + insertBatchSize)) + } + logger.info(`[${documentId}] Inserting ${embeddingRecords.length} embeddings`) + for (const batch of batches) { await tx.insert(embedding).values(batch) - logger.info( - `[${documentId}] Inserted batch ${batchNum}/${totalBatches} (${batch.length} records)` - ) } } @@ -689,11 +675,9 @@ export async function createDocumentRecords( requestId: string, userId?: string ): Promise { - // Check storage limits before creating documents if (userId) { const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0) - // Get knowledge base owner const kb = await db .select({ userId: knowledgeBase.userId }) .from(knowledgeBase) @@ -713,7 +697,7 @@ export async function createDocumentRecords( for (const docData of documents) { const documentId = randomUUID() - let processedTags: Record = {} + let processedTags: Partial = {} if (docData.documentTagsData) { try { @@ -722,7 +706,6 @@ export async function createDocumentRecords( processedTags = await processDocumentTags(knowledgeBaseId, tagData, requestId) } } catch (error) { - // Re-throw validation errors, only catch JSON parse errors if (error instanceof SyntaxError) { logger.warn(`[${requestId}] Failed to parse documentTagsData for bulk document:`, error) } else { @@ -791,7 +774,6 @@ export async function 
createDocumentRecords( if (userId) { const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0) - // Get knowledge base owner const kb = await db .select({ userId: knowledgeBase.userId }) .from(knowledgeBase) @@ -1079,7 +1061,7 @@ export async function createSingleDocument( const now = new Date() // Process structured tag data if provided - let processedTags: Record = { + let processedTags: ProcessedDocumentTags = { // Text tags (7 slots) tag1: documentData.tag1 ?? null, tag2: documentData.tag2 ?? null, @@ -1555,23 +1537,30 @@ export async function updateDocument( return value || null } + // Type-safe access to tag slots in updateData + type UpdateDataWithTags = typeof updateData & Record + const typedUpdateData = updateData as UpdateDataWithTags + ALL_TAG_SLOTS.forEach((slot: TagSlot) => { - const updateValue = (updateData as any)[slot] + const updateValue = typedUpdateData[slot] if (updateValue !== undefined) { - ;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue) + ;(dbUpdateData as Record)[slot] = + convertTagValue(slot, updateValue) } }) await db.transaction(async (tx) => { await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId)) - const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined) + const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined) if (hasTagUpdates) { - const embeddingUpdateData: Record = {} + const embeddingUpdateData: Partial = {} ALL_TAG_SLOTS.forEach((field) => { - if ((updateData as any)[field] !== undefined) { - embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field]) + if (typedUpdateData[field] !== undefined) { + ;(embeddingUpdateData as Record)[ + field + ] = convertTagValue(field, typedUpdateData[field]) } }) diff --git a/apps/sim/lib/knowledge/documents/utils.ts b/apps/sim/lib/knowledge/documents/utils.ts index a872c1ede..7aae3187a 100644 --- a/apps/sim/lib/knowledge/documents/utils.ts +++ b/apps/sim/lib/knowledge/documents/utils.ts @@ -14,7 +14,7 @@ export interface RetryOptions { initialDelayMs?: number maxDelayMs?: number backoffMultiplier?: number - retryCondition?: (error: RetryableError) => boolean + retryCondition?: (error: unknown) => boolean } export interface RetryResult { @@ -30,11 +30,18 @@ function hasStatus( return typeof error === 'object' && error !== null && 'status' in error } +function isRetryableErrorType(error: unknown): error is RetryableError { + if (!error) return false + if (error instanceof Error) return true + if (typeof error === 'object' && ('status' in error || 'message' in error)) return true + return false +} + /** * Default retry condition for rate limiting errors */ -export function isRetryableError(error: RetryableError): boolean { - if (!error) return false +export function isRetryableError(error: unknown): boolean { + if (!isRetryableErrorType(error)) return false // Check for rate limiting status codes if ( @@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean { } // Check for rate limiting in error messages - const errorMessage = error.message || error.toString() + const errorMessage = error instanceof Error ? 
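The documents/utils.ts hunk above widens retryCondition to accept unknown, so callers narrow the error themselves before deciding to retry. A usage sketch of retryWithExponentialBackoff under that signature, retrying only on 429/5xx-style failures (the fetch wrapper and error shape are illustrative):

import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'

async function fetchWithRetry(url: string): Promise<string> {
  return retryWithExponentialBackoff(
    async () => {
      const res = await fetch(url)
      if (!res.ok) {
        // Attach the status so the retry condition can inspect it.
        throw Object.assign(new Error(`Request failed: ${res.status}`), { status: res.status })
      }
      return res.text()
    },
    {
      maxRetries: 3,
      initialDelayMs: 1000,
      maxDelayMs: 10000,
      // Receives unknown and narrows it, as the updated RetryOptions requires.
      retryCondition: (error: unknown) => {
        const status =
          typeof error === 'object' && error !== null && 'status' in error
            ? Number((error as { status: unknown }).status)
            : undefined
        return status === 429 || (status !== undefined && status >= 500)
      },
    }
  )
}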
error.message : String(error) const rateLimitKeywords = [ 'rate limit', 'rate_limit', diff --git a/apps/sim/lib/knowledge/embeddings.ts b/apps/sim/lib/knowledge/embeddings.ts index 7a736688b..2b57b34c5 100644 --- a/apps/sim/lib/knowledge/embeddings.ts +++ b/apps/sim/lib/knowledge/embeddings.ts @@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger' import { getBYOKKey } from '@/lib/api-key/byok' import { env } from '@/lib/core/config/env' import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils' -import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization' +import { batchByTokenLimit } from '@/lib/tokenization' const logger = createLogger('EmbeddingUtils') @@ -26,6 +26,20 @@ interface EmbeddingConfig { modelName: string } +interface EmbeddingResponseItem { + embedding: number[] + index: number +} + +interface EmbeddingAPIResponse { + data: EmbeddingResponseItem[] + model: string + usage: { + prompt_tokens: number + total_tokens: number + } +} + async function getEmbeddingConfig( embeddingModel = 'text-embedding-3-small', workspaceId?: string | null @@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom ) } - const data = await response.json() - return data.data.map((item: any) => item.embedding) + const data: EmbeddingAPIResponse = await response.json() + return data.data.map((item) => item.embedding) }, { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000, - retryCondition: (error: any) => { + retryCondition: (error: unknown) => { if (error instanceof EmbeddingAPIError) { return error.status === 429 || error.status >= 500 } @@ -153,44 +167,27 @@ export async function generateEmbeddings( ): Promise { const config = await getEmbeddingConfig(embeddingModel, workspaceId) - logger.info( - `Using ${config.useAzure ? 
'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)` - ) - const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel) - logger.info( - `Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch, ${MAX_CONCURRENT_BATCHES} concurrent)` - ) - const batchResults = await processWithConcurrency( batches, MAX_CONCURRENT_BATCHES, async (batch, i) => { - const batchTokenCount = getTotalTokenCount(batch, embeddingModel) - - logger.info( - `Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens` - ) - try { - const batchEmbeddings = await callEmbeddingAPI(batch, config) - - logger.info( - `Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}` - ) - - return batchEmbeddings + return await callEmbeddingAPI(batch, config) } catch (error) { - logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error) + logger.error(`Failed to generate embeddings for batch ${i + 1}/${batches.length}:`, error) throw error } } ) - const allEmbeddings = batchResults.flat() - - logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`) + const allEmbeddings: number[][] = [] + for (const batch of batchResults) { + for (const emb of batch) { + allEmbeddings.push(emb) + } + } return allEmbeddings } diff --git a/apps/sim/lib/tokenization/estimators.ts b/apps/sim/lib/tokenization/estimators.ts index c0d96fb0f..53ce71965 100644 --- a/apps/sim/lib/tokenization/estimators.ts +++ b/apps/sim/lib/tokenization/estimators.ts @@ -127,24 +127,6 @@ export function truncateToTokenLimit( } } -/** - * Get token count for multiple texts (for batching decisions) - * Returns array of token counts in same order as input - */ -export function getTokenCountsForBatch( - texts: string[], - modelName = 'text-embedding-3-small' -): number[] { - return texts.map((text) => getAccurateTokenCount(text, modelName)) -} - -/** - * Calculate total tokens across multiple texts - */ -export function getTotalTokenCount(texts: string[], modelName = 'text-embedding-3-small'): number { - return texts.reduce((total, text) => total + getAccurateTokenCount(text, modelName), 0) -} - /** * Batch texts by token count to stay within API limits * Returns array of batches where each batch's total tokens <= maxTokensPerBatch diff --git a/apps/sim/lib/tokenization/index.ts b/apps/sim/lib/tokenization/index.ts index 4a82a60e0..b6c559f2b 100644 --- a/apps/sim/lib/tokenization/index.ts +++ b/apps/sim/lib/tokenization/index.ts @@ -12,8 +12,6 @@ export { estimateOutputTokens, estimateTokenCount, getAccurateTokenCount, - getTokenCountsForBatch, - getTotalTokenCount, truncateToTokenLimit, } from '@/lib/tokenization/estimators' export { processStreamingBlockLog, processStreamingBlockLogs } from '@/lib/tokenization/streaming' diff --git a/apps/sim/package.json b/apps/sim/package.json index f968fe0f0..c6daca4e9 100644 --- a/apps/sim/package.json +++ b/apps/sim/package.json @@ -127,6 +127,7 @@ "onedollarstats": "0.0.10", "openai": "^4.91.1", "papaparse": "5.5.3", + "pdf-lib": "1.17.1", "postgres": "^3.4.5", "posthog-js": "1.268.9", "posthog-node": "5.9.2", diff --git a/apps/sim/trigger.config.ts b/apps/sim/trigger.config.ts index 543d112f0..f6915e95f 100644 --- a/apps/sim/trigger.config.ts +++ b/apps/sim/trigger.config.ts @@ -17,7 +17,7 @@ export default defineConfig({ build: { extensions: [ additionalPackages({ - packages: ['unpdf'], + packages: ['unpdf', 'pdf-lib'], }), ], }, diff --git 
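The generateEmbeddings rework above keeps the token-bounded batching but strips the per-batch logging and flattens results with a plain loop. A simplified, sequential sketch of that flow (the real code also runs batches with limited concurrency; embedBatch stands in for callEmbeddingAPI, and maxTokensPerBatch for MAX_TOKENS_PER_REQUEST, whose value is not shown in this diff):

import { batchByTokenLimit } from '@/lib/tokenization'

async function embedAll(
  texts: string[],
  maxTokensPerBatch: number,
  embedBatch: (batch: string[]) => Promise<number[][]>
): Promise<number[][]> {
  // Each batch's total token count stays under maxTokensPerBatch.
  const batches = batchByTokenLimit(texts, maxTokensPerBatch, 'text-embedding-3-small')
  const all: number[][] = []
  for (const batch of batches) {
    const vectors = await embedBatch(batch)
    // Plain loop instead of all.push(...vectors), matching the diff —
    // presumably to avoid spreading very large arrays into a single call.
    for (const v of vectors) all.push(v)
  }
  return all
}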
a/bun.lock b/bun.lock index d7add8660..d62ed31d0 100644 --- a/bun.lock +++ b/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "simstudio", @@ -11,7 +12,7 @@ "drizzle-kit": "^0.31.4", "husky": "9.1.7", "lint-staged": "16.0.0", - "turbo": "2.7.2", + "turbo": "2.7.3", }, }, "apps/docs": { @@ -156,6 +157,7 @@ "onedollarstats": "0.0.10", "openai": "^4.91.1", "papaparse": "5.5.3", + "pdf-lib": "1.17.1", "postgres": "^3.4.5", "posthog-js": "1.268.9", "posthog-node": "5.9.2", @@ -912,6 +914,10 @@ "@orama/orama": ["@orama/orama@3.1.18", "", {}, "sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA=="], + "@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="], + + "@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="], + "@peculiar/asn1-android": ["@peculiar/asn1-android@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-cBRCKtYPF7vJGN76/yG8VbxRcHLPF3HnkoHhKOZeHpoVtbMYfY9ROKtH3DtYUY9m8uI1Mh47PRhHf2hSK3xcSQ=="], "@peculiar/asn1-cms": ["@peculiar/asn1-cms@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "@peculiar/asn1-x509": "^2.6.0", "@peculiar/asn1-x509-attr": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA=="], @@ -2864,6 +2870,8 @@ "pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="], + "pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="], + "pdfjs-dist": ["pdfjs-dist@5.4.449", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.81" } }, "sha512-CegnUaT0QwAyQMS+7o2POr4wWUNNe8VaKKlcuoRHeYo98cVnqPpwOXNSx6Trl6szH02JrRcsPgletV6GmF3LtQ=="], "peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="], @@ -3362,19 +3370,19 @@ "tunnel-agent": ["tunnel-agent@0.6.0", "", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w=="], - "turbo": ["turbo@2.7.2", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.2", "turbo-darwin-arm64": "2.7.2", "turbo-linux-64": "2.7.2", "turbo-linux-arm64": "2.7.2", "turbo-windows-64": "2.7.2", "turbo-windows-arm64": "2.7.2" }, "bin": { "turbo": "bin/turbo" } }, "sha512-5JIA5aYBAJSAhrhbyag1ZuMSgUZnHtI+Sq3H8D3an4fL8PeF+L1yYvbEJg47akP1PFfATMf5ehkqFnxfkmuwZQ=="], + "turbo": ["turbo@2.7.3", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.3", "turbo-darwin-arm64": "2.7.3", "turbo-linux-64": "2.7.3", "turbo-linux-arm64": "2.7.3", "turbo-windows-64": "2.7.3", "turbo-windows-arm64": "2.7.3" }, "bin": { "turbo": "bin/turbo" } }, "sha512-+HjKlP4OfYk+qzvWNETA3cUO5UuK6b5MSc2UJOKyvBceKucQoQGb2g7HlC2H1GHdkfKrk4YF1VPvROkhVZDDLQ=="], - "turbo-darwin-64": ["turbo-darwin-64@2.7.2", "", { "os": "darwin", "cpu": "x64" }, 
"sha512-dxY3X6ezcT5vm3coK6VGixbrhplbQMwgNsCsvZamS/+/6JiebqW9DKt4NwpgYXhDY2HdH00I7FWs3wkVuan4rA=="], + "turbo-darwin-64": ["turbo-darwin-64@2.7.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-aZHhvRiRHXbJw1EcEAq4aws1hsVVUZ9DPuSFaq9VVFAKCup7niIEwc22glxb7240yYEr1vLafdQ2U294Vcwz+w=="], - "turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1bXmuwPLqNFt3mzrtYcVx1sdJ8UYb124Bf48nIgcpMCGZy3kDhgxNv1503kmuK/37OGOZbsWSQFU4I08feIuSg=="], + "turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-CkVrHSq+Bnhl9sX2LQgqQYVfLTWC2gvI74C4758OmU0djfrssDKU9d4YQF0AYXXhIIRZipSXfxClQziIMD+EAg=="], - "turbo-linux-64": ["turbo-linux-64@2.7.2", "", { "os": "linux", "cpu": "x64" }, "sha512-kP+TiiMaiPugbRlv57VGLfcjFNsFbo8H64wMBCPV2270Or2TpDCBULMzZrvEsvWFjT3pBFvToYbdp8/Kw0jAQg=="], + "turbo-linux-64": ["turbo-linux-64@2.7.3", "", { "os": "linux", "cpu": "x64" }, "sha512-GqDsCNnzzr89kMaLGpRALyigUklzgxIrSy2pHZVXyifgczvYPnLglex78Aj3T2gu+T3trPPH2iJ+pWucVOCC2Q=="], - "turbo-linux-arm64": ["turbo-linux-arm64@2.7.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-VDJwQ0+8zjAfbyY6boNaWfP6RIez4ypKHxwkuB6SrWbOSk+vxTyW5/hEjytTwK8w/TsbKVcMDyvpora8tEsRFw=="], + "turbo-linux-arm64": ["turbo-linux-arm64@2.7.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-NdCDTfIcIo3dWjsiaAHlxu5gW61Ed/8maah1IAF/9E3EtX0aAHNiBMbuYLZaR4vRJ7BeVkYB6xKWRtdFLZ0y3g=="], - "turbo-windows-64": ["turbo-windows-64@2.7.2", "", { "os": "win32", "cpu": "x64" }, "sha512-rPjqQXVnI6A6oxgzNEE8DNb6Vdj2Wwyhfv3oDc+YM3U9P7CAcBIlKv/868mKl4vsBtz4ouWpTQNXG8vljgJO+w=="], + "turbo-windows-64": ["turbo-windows-64@2.7.3", "", { "os": "win32", "cpu": "x64" }, "sha512-7bVvO987daXGSJVYBoG8R4Q+csT1pKIgLJYZevXRQ0Hqw0Vv4mKme/TOjYXs9Qb1xMKh51Tb3bXKDbd8/4G08g=="], - "turbo-windows-arm64": ["turbo-windows-arm64@2.7.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-tcnHvBhO515OheIFWdxA+qUvZzNqqcHbLVFc1+n+TJ1rrp8prYicQtbtmsiKgMvr/54jb9jOabU62URAobnB7g=="], + "turbo-windows-arm64": ["turbo-windows-arm64@2.7.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-nTodweTbPmkvwMu/a55XvjMsPtuyUSC+sV7f/SR57K36rB2I0YG21qNETN+00LOTUW9B3omd8XkiXJkt4kx/cw=="], "tweetnacl": ["tweetnacl@0.14.5", "", {}, "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA=="], @@ -4046,6 +4054,8 @@ "path-scurry/lru-cache": ["lru-cache@11.2.4", "", {}, "sha512-B5Y16Jr9LB9dHVkh6ZevG+vAbOsNOYCX+sXvFWFu7B3Iz5mijW3zdbMyhsh8ANd2mSWBYdJgnqi+mL7/LrOPYg=="], + "pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="], + "pino/thread-stream": ["thread-stream@3.1.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-OqyPZ9u96VohAyMfJykzmivOrY2wfMSf3C5TtFJVgN+Hm6aj+voFhlK+kZEIv2FBh1X6Xp3DlnCOfEQ3B2J86A=="], "pino-pretty/pino-abstract-transport": ["pino-abstract-transport@3.0.0", "", { "dependencies": { "split2": "^4.0.0" } }, "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg=="], diff --git a/package.json b/package.json index 1b9ddf95f..4fa9176e0 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "drizzle-kit": "^0.31.4", "husky": "9.1.7", "lint-staged": "16.0.0", - "turbo": "2.7.2" + "turbo": "2.7.3" }, "lint-staged": { "*.{js,jsx,ts,tsx,json,css,scss}": [