improvement(kb): optimize processes, add more robust fallbacks for large file ops (#2684)

* improvement(kb): optimize processes, add more robust fallbacks for large file ops

* stronger typing

* comments cleanup

* ack PR comments

* upgraded turborepo

* ack more PR comments

* fix failing test

* moved doc update inside tx for embeddings chunks upload

* ack more PR comments
Waleed
2026-01-05 20:26:16 -08:00
committed by GitHub
parent d25084e05d
commit 75aca00b6e
22 changed files with 592 additions and 261 deletions

View File

@@ -136,16 +136,29 @@ vi.mock('@sim/db', () => {
},
}),
}),
-  transaction: vi.fn(async (fn: any) => {
-    await fn({
-      insert: (table: any) => ({
-        values: (records: any) => {
-          dbOps.order.push('insert')
-          dbOps.insertRecords.push(records)
-          return Promise.resolve()
-        },
-      }),
-      update: (table: any) => ({
+  delete: () => ({
+    where: () => Promise.resolve(),
+  }),
+  insert: () => ({
+    values: (records: any) => {
+      dbOps.order.push('insert')
+      dbOps.insertRecords.push(records)
+      return Promise.resolve()
+    },
+  }),
+  transaction: vi.fn(async (fn: any) => {
+    await fn({
+      delete: () => ({
+        where: () => Promise.resolve(),
+      }),
+      insert: () => ({
+        values: (records: any) => {
+          dbOps.order.push('insert')
+          dbOps.insertRecords.push(records)
+          return Promise.resolve()
+        },
+      }),
+      update: () => ({
        set: (payload: any) => ({
          where: () => {
            dbOps.updatePayloads.push(payload)

View File

@@ -453,6 +453,8 @@ export function KnowledgeBase({
error: knowledgeBaseError,
refresh: refreshKnowledgeBase,
} = useKnowledgeBase(id)
const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false)
const {
documents,
pagination,
@@ -468,6 +470,7 @@ export function KnowledgeBase({
offset: (currentPage - 1) * DOCUMENTS_PER_PAGE,
sortBy,
sortOrder,
refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false,
})
const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id)
@@ -534,25 +537,15 @@ export function KnowledgeBase({
)
useEffect(() => {
-    const hasProcessingDocuments = documents.some(
+    const processing = documents.some(
      (doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing'
    )
-    if (!hasProcessingDocuments) return
-    const refreshInterval = setInterval(async () => {
-      try {
-        if (!isDeleting) {
-          await checkForDeadProcesses()
-          await refreshDocuments()
-        }
-      } catch (error) {
-        logger.error('Error refreshing documents:', error)
-      }
-    }, 3000)
-    return () => clearInterval(refreshInterval)
-  }, [documents, refreshDocuments, isDeleting])
+    setHasProcessingDocuments(processing)
+    if (processing) {
+      checkForDeadProcesses()
+    }
+  }, [documents])
/**
* Checks for documents with stale processing states and marks them as failed
@@ -672,25 +665,6 @@ export function KnowledgeBase({
await refreshDocuments()
-      let refreshAttempts = 0
-      const maxRefreshAttempts = 3
-      const refreshInterval = setInterval(async () => {
-        try {
-          refreshAttempts++
-          await refreshDocuments()
-          if (refreshAttempts >= maxRefreshAttempts) {
-            clearInterval(refreshInterval)
-          }
-        } catch (error) {
-          logger.error('Error refreshing documents after retry:', error)
-          clearInterval(refreshInterval)
-        }
-      }, 1000)
-      setTimeout(() => {
-        clearInterval(refreshInterval)
-      }, 4000)
logger.info(`Document retry initiated successfully for: ${docId}`)
} catch (err) {
logger.error('Error retrying document:', err)
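The polling rework above replaces a hand-rolled setInterval with react-query's refetchInterval, driven by a hasProcessingDocuments flag. A minimal sketch of the same gating pattern, with a hypothetical hook name and endpoint:

import { useQuery } from '@tanstack/react-query'

// Poll every 3s only while something is still processing; stop otherwise.
function useDocumentsWithPolling(knowledgeBaseId: string, hasProcessing: boolean) {
  return useQuery({
    queryKey: ['documents', knowledgeBaseId],
    queryFn: () =>
      fetch(`/api/kb/${knowledgeBaseId}/documents`).then((r) => r.json()), // hypothetical route
    // react-query: a number enables interval refetching (ms); false disables it
    refetchInterval: hasProcessing ? 3000 : false,
  })
}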

View File

@@ -27,6 +27,7 @@ export type DocumentProcessingPayload = {
export const processDocument = task({
id: 'knowledge-process-document',
maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
machine: 'large-1x', // 2 vCPU, 2GB RAM - needed for large PDF processing
retry: {
maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
factor: env.KB_CONFIG_RETRY_FACTOR || 2,

View File

@@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery(
params: KnowledgeDocumentsParams,
options?: {
enabled?: boolean
refetchInterval?: number | false
}
) {
const paramsKey = serializeDocumentParams(params)
@@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery(
enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId),
staleTime: 60 * 1000,
placeholderData: keepPreviousData,
refetchInterval: options?.refetchInterval ?? false,
})
}

View File

@@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments(
sortBy?: string
sortOrder?: string
enabled?: boolean
refetchInterval?: number | false
}
) {
const queryClient = useQueryClient()
@@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments(
},
{
enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId),
refetchInterval: options?.refetchInterval,
}
)

View File

@@ -16,7 +16,7 @@ interface HeaderInfo {
interface Frontmatter {
title?: string
description?: string
-  [key: string]: any
+  [key: string]: unknown
}
const logger = createLogger('DocsChunker')

View File

@@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators'
const logger = createLogger('JsonYamlChunker')
type JsonPrimitive = string | number | boolean | null
type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]
function getTokenCount(text: string): number {
try {
return getAccurateTokenCount(text, 'text-embedding-3-small')
@@ -59,11 +64,11 @@ export class JsonYamlChunker {
*/
async chunk(content: string): Promise<Chunk[]> {
try {
-      let data: any
+      let data: JsonValue
      try {
-        data = JSON.parse(content)
+        data = JSON.parse(content) as JsonValue
      } catch {
-        data = yaml.load(content)
+        data = yaml.load(content) as JsonValue
}
const chunks = this.chunkStructuredData(data)
@@ -86,7 +91,7 @@ export class JsonYamlChunker {
/**
* Chunk structured data based on its structure
*/
-  private chunkStructuredData(data: any, path: string[] = []): Chunk[] {
+  private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] {
const chunks: Chunk[] = []
if (Array.isArray(data)) {
@@ -94,7 +99,7 @@ export class JsonYamlChunker {
}
if (typeof data === 'object' && data !== null) {
-      return this.chunkObject(data, path)
+      return this.chunkObject(data as JsonObject, path)
}
const content = JSON.stringify(data, null, 2)
@@ -118,9 +123,9 @@ export class JsonYamlChunker {
/**
* Chunk an array intelligently
*/
-  private chunkArray(arr: any[], path: string[]): Chunk[] {
+  private chunkArray(arr: JsonArray, path: string[]): Chunk[] {
    const chunks: Chunk[] = []
-    let currentBatch: any[] = []
+    let currentBatch: JsonValue[] = []
let currentTokens = 0
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
@@ -194,7 +199,7 @@ export class JsonYamlChunker {
/**
* Chunk an object intelligently
*/
-  private chunkObject(obj: Record<string, any>, path: string[]): Chunk[] {
+  private chunkObject(obj: JsonObject, path: string[]): Chunk[] {
const chunks: Chunk[] = []
const entries = Object.entries(obj)
@@ -213,7 +218,7 @@ export class JsonYamlChunker {
return chunks
}
-    let currentObj: Record<string, any> = {}
+    let currentObj: JsonObject = {}
let currentTokens = 0
let currentKeys: string[] = []

View File

@@ -110,10 +110,12 @@ export class TextChunker {
chunks.push(currentChunk.trim())
}
-      // Start new chunk with current part
      // If part itself is too large, split it further
      if (this.estimateTokens(part) > this.chunkSize) {
-        chunks.push(...(await this.splitRecursively(part, separatorIndex + 1)))
+        const subChunks = await this.splitRecursively(part, separatorIndex + 1)
+        for (const subChunk of subChunks) {
+          chunks.push(subChunk)
+        }
currentChunk = ''
} else {
currentChunk = part

View File

@@ -178,6 +178,7 @@ export const env = createEnv({
KB_CONFIG_BATCH_SIZE: z.number().optional().default(2000), // Chunks to process per embedding batch
KB_CONFIG_DELAY_BETWEEN_BATCHES: z.number().optional().default(0), // Delay between batches in ms (0 for max speed)
KB_CONFIG_DELAY_BETWEEN_DOCUMENTS: z.number().optional().default(50), // Delay between documents in ms
KB_CONFIG_CHUNK_CONCURRENCY: z.number().optional().default(10), // Concurrent PDF chunk OCR processing
// Real-time Communication
SOCKET_SERVER_URL: z.string().url().optional(), // WebSocket server URL for real-time features

View File

@@ -17,8 +17,6 @@ export class DocParser implements FileParser {
throw new Error(`File not found: ${filePath}`)
}
logger.info(`Parsing DOC file: ${filePath}`)
const buffer = await readFile(filePath)
return this.parseBuffer(buffer)
} catch (error) {
@@ -29,53 +27,80 @@ export class DocParser implements FileParser {
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing DOC buffer, size:', buffer.length)
+      if (!buffer || buffer.length === 0) {
+        throw new Error('Empty buffer provided')
+      }
-      let parseOfficeAsync
-      try {
-        const officeParser = await import('officeparser')
-        parseOfficeAsync = officeParser.parseOfficeAsync
-      } catch (importError) {
-        logger.warn('officeparser not available, using fallback extraction')
-        return this.fallbackExtraction(buffer)
-      }
-      try {
-        const result = await parseOfficeAsync(buffer)
-        if (!result) {
-          throw new Error('officeparser returned no result')
-        }
-        const resultString = typeof result === 'string' ? result : String(result)
-        const content = sanitizeTextForUTF8(resultString.trim())
-        logger.info('DOC parsing completed successfully with officeparser')
-        return {
-          content: content,
-          metadata: {
-            characterCount: content.length,
-            extractionMethod: 'officeparser',
-          },
-        }
-      } catch (extractError) {
-        logger.warn('officeparser failed, using fallback:', extractError)
-        return this.fallbackExtraction(buffer)
-      }
+      try {
+        const officeParser = await import('officeparser')
+        const result = await officeParser.parseOfficeAsync(buffer)
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                characterCount: content.length,
+                extractionMethod: 'officeparser',
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed, trying mammoth:', officeError)
+      }
+      try {
+        const mammoth = await import('mammoth')
+        const result = await mammoth.extractRawText({ buffer })
+        if (result.value && result.value.trim().length > 0) {
+          const content = sanitizeTextForUTF8(result.value.trim())
+          return {
+            content,
+            metadata: {
+              characterCount: content.length,
+              extractionMethod: 'mammoth',
+              messages: result.messages,
+            },
+          }
+        }
+      } catch (mammothError) {
+        logger.warn('mammoth failed:', mammothError)
+      }
+      return this.fallbackExtraction(buffer)
    } catch (error) {
-      logger.error('DOC buffer parsing error:', error)
+      logger.error('DOC parsing error:', error)
      throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`)
    }
  }
private fallbackExtraction(buffer: Buffer): FileParseResult {
logger.info('Using fallback text extraction for DOC file')
const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf
if (!isBinaryDoc) {
const textContent = buffer.toString('utf8').trim()
if (textContent.length > 0) {
const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0
const isProbablyText = printableChars / textContent.length > 0.9
if (isProbablyText) {
return {
content: sanitizeTextForUTF8(textContent),
metadata: {
extractionMethod: 'plaintext-fallback',
characterCount: textContent.length,
warning: 'File is not a valid DOC format, extracted as plain text',
},
}
}
}
}
const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))

View File

@@ -2,13 +2,18 @@ import { readFile } from 'fs/promises'
import { createLogger } from '@sim/logger'
import mammoth from 'mammoth'
import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils'
const logger = createLogger('DocxParser')
// Define interface for mammoth result
interface MammothMessage {
type: 'warning' | 'error'
message: string
}
interface MammothResult {
value: string
messages: any[]
messages: MammothMessage[]
}
export class DocxParser implements FileParser {
@@ -19,7 +24,6 @@ export class DocxParser implements FileParser {
}
const buffer = await readFile(filePath)
return this.parseBuffer(buffer)
} catch (error) {
logger.error('DOCX file error:', error)
@@ -29,26 +33,74 @@ export class DocxParser implements FileParser {
  async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
    try {
      logger.info('Parsing buffer, size:', buffer.length)
+      if (!buffer || buffer.length === 0) {
+        throw new Error('Empty buffer provided')
+      }
-      const result = await mammoth.extractRawText({ buffer })
-      let htmlResult: MammothResult = { value: '', messages: [] }
-      try {
-        htmlResult = await mammoth.convertToHtml({ buffer })
-      } catch (htmlError) {
-        logger.warn('HTML conversion warning:', htmlError)
-      }
-      return {
-        content: result.value,
-        metadata: {
-          extractionMethod: 'mammoth',
-          messages: [...result.messages, ...htmlResult.messages],
-          html: htmlResult.value,
-        },
-      }
+      try {
+        const result = await mammoth.extractRawText({ buffer })
+        if (result.value && result.value.trim().length > 0) {
+          let htmlResult: MammothResult = { value: '', messages: [] }
+          try {
+            htmlResult = await mammoth.convertToHtml({ buffer })
+          } catch {
+            // HTML conversion is optional
+          }
+          return {
+            content: sanitizeTextForUTF8(result.value),
+            metadata: {
+              extractionMethod: 'mammoth',
+              messages: [...result.messages, ...htmlResult.messages],
+              html: htmlResult.value,
+            },
+          }
+        }
+      } catch (mammothError) {
+        logger.warn('mammoth failed, trying officeparser:', mammothError)
+      }
+      try {
+        const officeParser = await import('officeparser')
+        const result = await officeParser.parseOfficeAsync(buffer)
+        if (result) {
+          const resultString = typeof result === 'string' ? result : String(result)
+          const content = sanitizeTextForUTF8(resultString.trim())
+          if (content.length > 0) {
+            return {
+              content,
+              metadata: {
+                extractionMethod: 'officeparser',
+                characterCount: content.length,
+              },
+            }
+          }
+        }
+      } catch (officeError) {
+        logger.warn('officeparser failed:', officeError)
+      }
+      const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b
+      if (!isZipFile) {
+        const textContent = buffer.toString('utf8').trim()
+        if (textContent.length > 0) {
+          return {
+            content: sanitizeTextForUTF8(textContent),
+            metadata: {
+              extractionMethod: 'plaintext-fallback',
+              characterCount: textContent.length,
+              warning: 'File is not a valid DOCX format, extracted as plain text',
+            },
+          }
+        }
+      }
+      throw new Error('Failed to extract text from DOCX file')
    } catch (error) {
-      logger.error('DOCX buffer parsing error:', error)
+      logger.error('DOCX parsing error:', error)
throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`)
}
}

View File

@@ -1,6 +1,22 @@
export interface FileParseMetadata {
characterCount?: number
pageCount?: number
extractionMethod?: string
warning?: string
messages?: unknown[]
html?: string
type?: string
headers?: string[]
totalRows?: number
rowCount?: number
sheetNames?: string[]
source?: string
[key: string]: unknown
}
export interface FileParseResult {
content: string
-  metadata?: Record<string, any>
+  metadata?: FileParseMetadata
}
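For illustration, a parse result under the typed metadata might look like this (values are made up):

const result: FileParseResult = {
  content: 'Quarterly revenue by region...',
  metadata: { extractionMethod: 'mammoth', characterCount: 30 },
}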
export interface FileParser {

View File

@@ -1,8 +1,10 @@
import { createLogger } from '@sim/logger'
import { PDFDocument } from 'pdf-lib'
import { getBYOKKey } from '@/lib/api-key/byok'
import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers'
import { env } from '@/lib/core/config/env'
import { parseBuffer, parseFile } from '@/lib/file-parsers'
import type { FileParseMetadata } from '@/lib/file-parsers/types'
import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
import { StorageService } from '@/lib/uploads'
import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
@@ -15,6 +17,8 @@ const TIMEOUTS = {
MISTRAL_OCR_API: 120000,
} as const
const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY
type OCRResult = {
success: boolean
error?: string
@@ -36,6 +40,61 @@ type OCRRequestBody = {
include_image_base64: boolean
}
const MISTRAL_MAX_PAGES = 1000
/**
* Get page count from a PDF buffer using unpdf
*/
async function getPdfPageCount(buffer: Buffer): Promise<number> {
try {
const { getDocumentProxy } = await import('unpdf')
const uint8Array = new Uint8Array(buffer)
const pdf = await getDocumentProxy(uint8Array)
return pdf.numPages
} catch (error) {
logger.warn('Failed to get PDF page count:', error)
return 0
}
}
/**
* Split a PDF buffer into multiple smaller PDFs
* Returns an array of PDF buffers, each with at most maxPages pages
*/
async function splitPdfIntoChunks(
pdfBuffer: Buffer,
maxPages: number
): Promise<{ buffer: Buffer; startPage: number; endPage: number }[]> {
const sourcePdf = await PDFDocument.load(pdfBuffer)
const totalPages = sourcePdf.getPageCount()
if (totalPages <= maxPages) {
return [{ buffer: pdfBuffer, startPage: 0, endPage: totalPages - 1 }]
}
const chunks: { buffer: Buffer; startPage: number; endPage: number }[] = []
for (let startPage = 0; startPage < totalPages; startPage += maxPages) {
const endPage = Math.min(startPage + maxPages - 1, totalPages - 1)
const pageCount = endPage - startPage + 1
const newPdf = await PDFDocument.create()
const pageIndices = Array.from({ length: pageCount }, (_, i) => startPage + i)
const copiedPages = await newPdf.copyPages(sourcePdf, pageIndices)
copiedPages.forEach((page) => newPdf.addPage(page))
const pdfBytes = await newPdf.save()
chunks.push({
buffer: Buffer.from(pdfBytes),
startPage,
endPage,
})
}
return chunks
}
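Calling the splitter is straightforward; a hedged usage sketch with a hypothetical input file, writing each part back to disk:

import { readFile, writeFile } from 'fs/promises'

// Split ./big.pdf into parts of at most 1000 pages each (file path is hypothetical).
const source = await readFile('./big.pdf')
const parts = await splitPdfIntoChunks(source, 1000)
for (const part of parts) {
  await writeFile(`./part-${part.startPage + 1}-${part.endPage + 1}.pdf`, part.buffer)
}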
type AzureOCRResponse = {
pages?: OCRPage[]
[key: string]: unknown
@@ -81,7 +140,7 @@ export async function processDocument(
const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined
let chunks: Chunk[]
-  const metadata = 'metadata' in parseResult ? parseResult.metadata : {}
+  const metadata: FileParseMetadata = parseResult.metadata ?? {}
const isJsonYaml =
metadata.type === 'json' ||
@@ -97,10 +156,11 @@ export async function processDocument(
})
} else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
logger.info('Using structured data chunker for spreadsheet/CSV content')
+    const rowCount = metadata.totalRows ?? metadata.rowCount
    chunks = await StructuredDataChunker.chunkStructuredData(content, {
      chunkSize,
      headers: metadata.headers,
-      totalRows: metadata.totalRows || metadata.rowCount,
+      totalRows: typeof rowCount === 'number' ? rowCount : undefined,
sheetName: metadata.sheetNames?.[0],
})
} else {
@@ -153,7 +213,7 @@ async function parseDocument(
content: string
processingMethod: 'file-parser' | 'mistral-ocr'
cloudUrl?: string
-  metadata?: any
+  metadata?: FileParseMetadata
}> {
const isPDF = mimeType === 'application/pdf'
const hasAzureMistralOCR =
@@ -165,7 +225,7 @@ async function parseDocument(
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
if (hasAzureMistralOCR) {
logger.info(`Using Azure Mistral OCR: ${filename}`)
-      return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId)
+      return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
}
if (hasMistralOCR) {
@@ -188,13 +248,32 @@ async function handleFileForOCR(
const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')
if (isExternalHttps) {
-    return { httpsUrl: fileUrl }
+    if (mimeType === 'application/pdf') {
+      logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
+      try {
+        const buffer = await downloadFileWithTimeout(fileUrl)
+        logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`)
+        return { httpsUrl: fileUrl, buffer }
+      } catch (error) {
+        logger.warn(
+          `handleFileForOCR: Failed to download external PDF for page count check, proceeding without batching`,
+          {
+            error: error instanceof Error ? error.message : String(error),
+          }
+        )
+        return { httpsUrl: fileUrl, buffer: undefined }
+      }
+    }
+    logger.info(`handleFileForOCR: Using external URL directly`)
+    return { httpsUrl: fileUrl, buffer: undefined }
  }
logger.info(`Uploading "${filename}" to cloud storage for OCR`)
const buffer = await downloadFileWithTimeout(fileUrl)
logger.info(`Downloaded ${filename}: ${buffer.length} bytes`)
try {
const metadata: Record<string, string> = {
originalName: filename,
@@ -224,8 +303,7 @@ async function handleFileForOCR(
900 // 15 minutes
)
logger.info(`Successfully uploaded for OCR: ${cloudResult.key}`)
-    return { httpsUrl, cloudUrl: httpsUrl }
+    return { httpsUrl, cloudUrl: httpsUrl, buffer }
} catch (uploadError) {
const message = uploadError instanceof Error ? uploadError.message : 'Unknown error'
throw new Error(`Cloud upload failed: ${message}. Cloud upload is required for OCR.`)
@@ -321,13 +399,7 @@ async function makeOCRRequest(
}
}
-async function parseWithAzureMistralOCR(
-  fileUrl: string,
-  filename: string,
-  mimeType: string,
-  userId?: string,
-  workspaceId?: string | null
-) {
+async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
validateOCRConfig(
env.OCR_AZURE_API_KEY,
env.OCR_AZURE_ENDPOINT,
@@ -336,6 +408,19 @@ async function parseWithAzureMistralOCR(
)
const fileBuffer = await downloadFileForBase64(fileUrl)
if (mimeType === 'application/pdf') {
const pageCount = await getPdfPageCount(fileBuffer)
if (pageCount > MISTRAL_MAX_PAGES) {
logger.info(
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
`Falling back to file parser.`
)
return parseWithFileParser(fileUrl, filename, mimeType)
}
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
}
const base64Data = fileBuffer.toString('base64')
const dataUri = `data:${mimeType};base64,${base64Data}`
@@ -374,17 +459,7 @@ async function parseWithAzureMistralOCR(
message: error instanceof Error ? error.message : String(error),
})
-    const fallbackMistralKey = await getMistralApiKey(workspaceId)
-    if (fallbackMistralKey) {
-      return parseWithMistralOCR(
-        fileUrl,
-        filename,
-        mimeType,
-        userId,
-        workspaceId,
-        fallbackMistralKey
-      )
-    }
+    logger.info(`Falling back to file parser: ${filename}`)
    return parseWithFileParser(fileUrl, filename, mimeType)
}
}
@@ -406,17 +481,54 @@ async function parseWithMistralOCR(
throw new Error('Mistral parser tool not configured')
}
-  const { httpsUrl, cloudUrl } = await handleFileForOCR(
+  const { httpsUrl, cloudUrl, buffer } = await handleFileForOCR(
    fileUrl,
    filename,
    mimeType,
    userId,
    workspaceId
  )
logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`)
+  let pageCount = 0
+  if (mimeType === 'application/pdf' && buffer) {
+    pageCount = await getPdfPageCount(buffer)
+    logger.info(`PDF page count for ${filename}: ${pageCount}`)
+  }
+  const needsBatching = pageCount > MISTRAL_MAX_PAGES
+  if (needsBatching && buffer) {
+    logger.info(
+      `PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.`
+    )
+    return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl)
+  }
  const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const }
  try {
-    const response = await retryWithExponentialBackoff(
+    const response = await executeMistralOCRRequest(params, userId)
+    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
+    const content = processOCRContent(result, filename)
+    return { content, processingMethod: 'mistral-ocr' as const, cloudUrl }
+  } catch (error) {
+    logger.error(`Mistral OCR failed for ${filename}:`, {
+      message: error instanceof Error ? error.message : String(error),
+    })
+    logger.info(`Falling back to file parser: ${filename}`)
+    return parseWithFileParser(fileUrl, filename, mimeType)
+  }
+}
+
+async function executeMistralOCRRequest(
+  params: { filePath: string; apiKey: string; resultType: 'text' },
+  userId?: string
+): Promise<Response> {
+  return retryWithExponentialBackoff(
async () => {
let url =
typeof mistralParserTool.request!.url === 'function'
@@ -449,25 +561,167 @@ async function parseWithMistralOCR(
    },
    { maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 }
  )
-    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
-    const content = processOCRContent(result, filename)
-    return { content, processingMethod: 'mistral-ocr' as const, cloudUrl }
-  } catch (error) {
-    logger.error(`Mistral OCR failed for ${filename}:`, {
-      message: error instanceof Error ? error.message : String(error),
-    })
-    logger.info(`Falling back to file parser: ${filename}`)
-    return parseWithFileParser(fileUrl, filename, mimeType)
-  }
+}
+
+/**
+ * Process a single PDF chunk: upload to S3, OCR, cleanup
+ */
+async function processChunk(
+  chunk: { buffer: Buffer; startPage: number; endPage: number },
+  chunkIndex: number,
+  totalChunks: number,
+  filename: string,
+  apiKey: string,
+  userId?: string
+): Promise<{ index: number; content: string | null }> {
+  const chunkPageCount = chunk.endPage - chunk.startPage + 1
+  logger.info(
+    `Processing chunk ${chunkIndex + 1}/${totalChunks} (pages ${chunk.startPage + 1}-${chunk.endPage + 1}, ${chunkPageCount} pages)`
+  )
+  let uploadedKey: string | null = null
+  try {
+    // Upload the chunk to S3
+    const timestamp = Date.now()
+    const uniqueId = Math.random().toString(36).substring(2, 9)
+    const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_')
+    const chunkKey = `kb/${timestamp}-${uniqueId}-chunk${chunkIndex + 1}-${safeFileName}`
+    const metadata: Record<string, string> = {
+      originalName: `${filename}_chunk${chunkIndex + 1}`,
+      uploadedAt: new Date().toISOString(),
+      purpose: 'knowledge-base',
+      ...(userId && { userId }),
+    }
+    const uploadResult = await StorageService.uploadFile({
+      file: chunk.buffer,
+      fileName: `${filename}_chunk${chunkIndex + 1}`,
+      contentType: 'application/pdf',
+      context: 'knowledge-base',
+      customKey: chunkKey,
+      metadata,
+    })
+    uploadedKey = uploadResult.key
+    const chunkUrl = await StorageService.generatePresignedDownloadUrl(
+      uploadResult.key,
+      'knowledge-base',
+      900 // 15 minutes
+    )
+    logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`)
+    // Process the chunk with Mistral OCR
+    const params = {
+      filePath: chunkUrl,
+      apiKey,
+      resultType: 'text' as const,
+    }
+    const response = await executeMistralOCRRequest(params, userId)
+    const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
+    if (result.success && result.output?.content) {
+      logger.info(`Chunk ${chunkIndex + 1}/${totalChunks} completed successfully`)
+      return { index: chunkIndex, content: result.output.content }
+    }
+    logger.warn(`Chunk ${chunkIndex + 1}/${totalChunks} returned no content`)
+    return { index: chunkIndex, content: null }
+  } catch (error) {
+    logger.error(`Chunk ${chunkIndex + 1}/${totalChunks} failed:`, {
+      message: error instanceof Error ? error.message : String(error),
+    })
+    return { index: chunkIndex, content: null }
+  } finally {
+    // Clean up the chunk file from S3 after processing
+    if (uploadedKey) {
+      try {
+        await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' })
+        logger.info(`Cleaned up chunk ${chunkIndex + 1} from S3`)
+      } catch (deleteError) {
+        logger.warn(`Failed to clean up chunk ${chunkIndex + 1} from S3:`, {
+          message: deleteError instanceof Error ? deleteError.message : String(deleteError),
+        })
+      }
+    }
+  }
+}
async function processMistralOCRInBatches(
filename: string,
apiKey: string,
pdfBuffer: Buffer,
userId?: string,
cloudUrl?: string
): Promise<{
content: string
processingMethod: 'mistral-ocr'
cloudUrl?: string
}> {
const totalPages = await getPdfPageCount(pdfBuffer)
logger.info(
`Splitting ${filename} (${totalPages} pages) into chunks of ${MISTRAL_MAX_PAGES} pages`
)
const pdfChunks = await splitPdfIntoChunks(pdfBuffer, MISTRAL_MAX_PAGES)
logger.info(
`Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}`
)
// Process chunks concurrently with limited concurrency
const results: { index: number; content: string | null }[] = []
for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) {
const batch = pdfChunks.slice(i, i + MAX_CONCURRENT_CHUNKS)
const batchPromises = batch.map((chunk, batchIndex) =>
processChunk(chunk, i + batchIndex, pdfChunks.length, filename, apiKey, userId)
)
const batchResults = await Promise.all(batchPromises)
for (const result of batchResults) {
results.push(result)
}
logger.info(
`Completed batch ${Math.floor(i / MAX_CONCURRENT_CHUNKS) + 1}/${Math.ceil(pdfChunks.length / MAX_CONCURRENT_CHUNKS)}`
)
}
// Sort by index to maintain page order and filter out nulls
const sortedResults = results
.sort((a, b) => a.index - b.index)
.filter((r) => r.content !== null)
.map((r) => r.content as string)
if (sortedResults.length === 0) {
// Don't fall back to file parser for large PDFs - it produces poor results
// Better to fail clearly than return low-quality extraction
throw new Error(
`OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
`Large PDFs require OCR - file parser fallback would produce poor results.`
)
}
const combinedContent = sortedResults.join('\n\n')
logger.info(
`Successfully processed ${sortedResults.length}/${pdfChunks.length} chunks for ${filename}`
)
return {
content: combinedContent,
processingMethod: 'mistral-ocr',
cloudUrl,
}
}
async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
try {
let content: string
-    let metadata: any = {}
+    let metadata: FileParseMetadata = {}
if (fileUrl.startsWith('data:')) {
content = await parseDataURI(fileUrl, filename, mimeType)
@@ -513,7 +767,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
async function parseHttpFile(
fileUrl: string,
filename: string
-): Promise<{ content: string; metadata?: any }> {
+): Promise<{ content: string; metadata?: FileParseMetadata }> {
const buffer = await downloadFileWithTimeout(fileUrl)
const extension = filename.split('.').pop()?.toLowerCase()

View File

@@ -212,7 +212,6 @@ export async function processDocumentTags(
return result
}
-  // Fetch existing tag definitions
const existingDefinitions = await db
.select()
.from(knowledgeBaseTagDefinitions)
@@ -220,18 +219,15 @@ export async function processDocumentTags(
const existingByName = new Map(existingDefinitions.map((def) => [def.displayName, def]))
-  // First pass: collect all validation errors
const undefinedTags: string[] = []
const typeErrors: string[] = []
for (const tag of tagData) {
-    // Skip if no tag name
if (!tag.tagName?.trim()) continue
const tagName = tag.tagName.trim()
const fieldType = tag.fieldType || 'text'
-    // For boolean, check if value is defined; for others, check if value is non-empty
const hasValue =
fieldType === 'boolean'
? tag.value !== undefined && tag.value !== null && tag.value !== ''
@@ -239,14 +235,12 @@ export async function processDocumentTags(
if (!hasValue) continue
-    // Check if tag exists
const existingDef = existingByName.get(tagName)
if (!existingDef) {
undefinedTags.push(tagName)
continue
}
-    // Validate value type using shared validation
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const actualFieldType = existingDef.fieldType || fieldType
const validationError = validateTagValue(tagName, String(rawValue), actualFieldType)
@@ -255,7 +249,6 @@ export async function processDocumentTags(
}
}
-  // Throw combined error if there are any validation issues
if (undefinedTags.length > 0 || typeErrors.length > 0) {
const errorParts: string[] = []
@@ -270,7 +263,6 @@ export async function processDocumentTags(
throw new Error(errorParts.join('\n'))
}
-  // Second pass: process valid tags
for (const tag of tagData) {
if (!tag.tagName?.trim()) continue
@@ -285,14 +277,13 @@ export async function processDocumentTags(
if (!hasValue) continue
const existingDef = existingByName.get(tagName)
-      if (!existingDef) continue // Already validated above
+      if (!existingDef) continue
const targetSlot = existingDef.tagSlot
const actualFieldType = existingDef.fieldType || fieldType
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const stringValue = String(rawValue).trim()
-    // Assign value to the slot with proper type conversion (values already validated)
if (actualFieldType === 'boolean') {
setTagValue(result, targetSlot, parseBooleanValue(stringValue) ?? false)
} else if (actualFieldType === 'number') {
@@ -440,7 +431,6 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Status updated to 'processing', starting document processor`)
-  // Use KB's chunkingConfig as fallback if processingOptions not provided
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
await withTimeout(
@@ -469,7 +459,6 @@ export async function processDocumentAsync(
`[${documentId}] Document parsed successfully, generating embeddings for ${processed.chunks.length} chunks`
)
-  // Generate embeddings in batches for large documents
const chunkTexts = processed.chunks.map((chunk) => chunk.text)
const embeddings: number[][] = []
@@ -485,7 +474,9 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Processing embedding batch ${batchNum}/${totalBatches}`)
const batchEmbeddings = await generateEmbeddings(batch, undefined, kb[0].workspaceId)
-      embeddings.push(...batchEmbeddings)
+      for (const emb of batchEmbeddings) {
+        embeddings.push(emb)
+      }
}
}
@@ -562,23 +553,18 @@ export async function processDocumentAsync(
}))
await db.transaction(async (tx) => {
-      // Insert embeddings in batches for large documents
      if (embeddingRecords.length > 0) {
-        const batchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
-        const totalBatches = Math.ceil(embeddingRecords.length / batchSize)
        await tx.delete(embedding).where(eq(embedding.documentId, documentId))
-        logger.info(
-          `[${documentId}] Inserting ${embeddingRecords.length} embeddings in ${totalBatches} batches`
-        )
-        for (let i = 0; i < embeddingRecords.length; i += batchSize) {
-          const batch = embeddingRecords.slice(i, i + batchSize)
-          const batchNum = Math.floor(i / batchSize) + 1
+        const insertBatchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
+        const batches: (typeof embeddingRecords)[] = []
+        for (let i = 0; i < embeddingRecords.length; i += insertBatchSize) {
+          batches.push(embeddingRecords.slice(i, i + insertBatchSize))
+        }
+        logger.info(`[${documentId}] Inserting ${embeddingRecords.length} embeddings`)
+        for (const batch of batches) {
          await tx.insert(embedding).values(batch)
-          logger.info(
-            `[${documentId}] Inserted batch ${batchNum}/${totalBatches} (${batch.length} records)`
-          )
        }
      }
@@ -689,11 +675,9 @@ export async function createDocumentRecords(
requestId: string,
userId?: string
): Promise<DocumentData[]> {
-  // Check storage limits before creating documents
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)
-    // Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -713,7 +697,7 @@ export async function createDocumentRecords(
for (const docData of documents) {
const documentId = randomUUID()
-    let processedTags: Record<string, any> = {}
+    let processedTags: Partial<ProcessedDocumentTags> = {}
if (docData.documentTagsData) {
try {
@@ -722,7 +706,6 @@ export async function createDocumentRecords(
processedTags = await processDocumentTags(knowledgeBaseId, tagData, requestId)
}
} catch (error) {
-      // Re-throw validation errors, only catch JSON parse errors
if (error instanceof SyntaxError) {
logger.warn(`[${requestId}] Failed to parse documentTagsData for bulk document:`, error)
} else {
@@ -791,7 +774,6 @@ export async function createDocumentRecords(
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)
-    // Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -1079,7 +1061,7 @@ export async function createSingleDocument(
const now = new Date()
// Process structured tag data if provided
-  let processedTags: Record<string, any> = {
+  let processedTags: ProcessedDocumentTags = {
// Text tags (7 slots)
tag1: documentData.tag1 ?? null,
tag2: documentData.tag2 ?? null,
@@ -1555,23 +1537,30 @@ export async function updateDocument(
return value || null
}
+    // Type-safe access to tag slots in updateData
+    type UpdateDataWithTags = typeof updateData & Record<TagSlot, string | undefined>
+    const typedUpdateData = updateData as UpdateDataWithTags
    ALL_TAG_SLOTS.forEach((slot: TagSlot) => {
-      const updateValue = (updateData as any)[slot]
+      const updateValue = typedUpdateData[slot]
      if (updateValue !== undefined) {
-        ;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue)
+        ;(dbUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[slot] =
+          convertTagValue(slot, updateValue)
      }
    })
    await db.transaction(async (tx) => {
      await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId))
-      const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined)
+      const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined)
      if (hasTagUpdates) {
-        const embeddingUpdateData: Record<string, any> = {}
+        const embeddingUpdateData: Partial<ProcessedDocumentTags> = {}
        ALL_TAG_SLOTS.forEach((field) => {
-          if ((updateData as any)[field] !== undefined) {
-            embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field])
+          if (typedUpdateData[field] !== undefined) {
+            ;(embeddingUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[
+              field
+            ] = convertTagValue(field, typedUpdateData[field])
          }
        })

View File

@@ -14,7 +14,7 @@ export interface RetryOptions {
initialDelayMs?: number
maxDelayMs?: number
backoffMultiplier?: number
-  retryCondition?: (error: RetryableError) => boolean
+  retryCondition?: (error: unknown) => boolean
}
export interface RetryResult<T> {
@@ -30,11 +30,18 @@ function hasStatus(
return typeof error === 'object' && error !== null && 'status' in error
}
function isRetryableErrorType(error: unknown): error is RetryableError {
if (!error) return false
if (error instanceof Error) return true
if (typeof error === 'object' && ('status' in error || 'message' in error)) return true
return false
}
/**
* Default retry condition for rate limiting errors
*/
-export function isRetryableError(error: RetryableError): boolean {
-  if (!error) return false
+export function isRetryableError(error: unknown): boolean {
+  if (!isRetryableErrorType(error)) return false
// Check for rate limiting status codes
if (
@@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean {
}
// Check for rate limiting in error messages
-  const errorMessage = error.message || error.toString()
+  const errorMessage = error instanceof Error ? error.message : String(error)
const rateLimitKeywords = [
'rate limit',
'rate_limit',
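With retryCondition widened to unknown, callers can hand any thrown value to the predicate. A minimal usage sketch of retryWithExponentialBackoff with these options (the endpoint is a placeholder):

const data = await retryWithExponentialBackoff(
  async () => {
    const res = await fetch('https://example.com/api') // placeholder URL
    if (!res.ok) throw Object.assign(new Error('request failed'), { status: res.status })
    return res.json()
  },
  {
    maxRetries: 3,
    initialDelayMs: 1000,
    maxDelayMs: 10000,
    retryCondition: isRetryableError, // retry only on rate-limit style failures
  }
)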

View File

@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
import { getBYOKKey } from '@/lib/api-key/byok'
import { env } from '@/lib/core/config/env'
import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
-import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization'
+import { batchByTokenLimit } from '@/lib/tokenization'
const logger = createLogger('EmbeddingUtils')
@@ -26,6 +26,20 @@ interface EmbeddingConfig {
modelName: string
}
interface EmbeddingResponseItem {
embedding: number[]
index: number
}
interface EmbeddingAPIResponse {
data: EmbeddingResponseItem[]
model: string
usage: {
prompt_tokens: number
total_tokens: number
}
}
async function getEmbeddingConfig(
embeddingModel = 'text-embedding-3-small',
workspaceId?: string | null
@@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom
)
}
-    const data = await response.json()
-    return data.data.map((item: any) => item.embedding)
+    const data: EmbeddingAPIResponse = await response.json()
+    return data.data.map((item) => item.embedding)
},
{
maxRetries: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
-      retryCondition: (error: any) => {
+      retryCondition: (error: unknown) => {
if (error instanceof EmbeddingAPIError) {
return error.status === 429 || error.status >= 500
}
@@ -153,44 +167,27 @@ export async function generateEmbeddings(
): Promise<number[][]> {
const config = await getEmbeddingConfig(embeddingModel, workspaceId)
-  logger.info(
-    `Using ${config.useAzure ? 'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)`
-  )
  const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel)
-  logger.info(
-    `Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch, ${MAX_CONCURRENT_BATCHES} concurrent)`
-  )
  const batchResults = await processWithConcurrency(
    batches,
    MAX_CONCURRENT_BATCHES,
    async (batch, i) => {
-      const batchTokenCount = getTotalTokenCount(batch, embeddingModel)
-      logger.info(
-        `Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens`
-      )
      try {
-        const batchEmbeddings = await callEmbeddingAPI(batch, config)
-        logger.info(
-          `Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}`
-        )
-        return batchEmbeddings
+        return await callEmbeddingAPI(batch, config)
      } catch (error) {
-        logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error)
+        logger.error(`Failed to generate embeddings for batch ${i + 1}/${batches.length}:`, error)
        throw error
      }
    }
  )
-  const allEmbeddings = batchResults.flat()
-  logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`)
+  const allEmbeddings: number[][] = []
+  for (const batch of batchResults) {
+    for (const emb of batch) {
+      allEmbeddings.push(emb)
+    }
+  }
return allEmbeddings
}
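processWithConcurrency is not shown in this diff; a plausible shape for such a helper, consistent with how it is called above (an assumption, not the repo's actual implementation):

// Run worker over items with at most `limit` in flight, preserving result order.
async function processWithConcurrency<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> {
  const results: R[] = new Array(items.length)
  let next = 0
  const run = async (): Promise<void> => {
    while (next < items.length) {
      const i = next++ // claim the next index synchronously
      results[i] = await worker(items[i], i)
    }
  }
  await Promise.all(Array.from({ length: Math.min(limit, items.length) }, run))
  return results
}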

View File

@@ -127,24 +127,6 @@ export function truncateToTokenLimit(
}
}
-/**
- * Get token count for multiple texts (for batching decisions)
- * Returns array of token counts in same order as input
- */
-export function getTokenCountsForBatch(
-  texts: string[],
-  modelName = 'text-embedding-3-small'
-): number[] {
-  return texts.map((text) => getAccurateTokenCount(text, modelName))
-}
-/**
- * Calculate total tokens across multiple texts
- */
-export function getTotalTokenCount(texts: string[], modelName = 'text-embedding-3-small'): number {
-  return texts.reduce((total, text) => total + getAccurateTokenCount(text, modelName), 0)
-}
/**
* Batch texts by token count to stay within API limits
* Returns array of batches where each batch's total tokens <= maxTokensPerBatch
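Going by that doc comment, a greedy implementation of batchByTokenLimit could look like this sketch (the real one lives in @/lib/tokenization):

function batchByTokenLimit(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
): string[][] {
  const batches: string[][] = []
  let current: string[] = []
  let currentTokens = 0
  for (const text of texts) {
    const tokens = getAccurateTokenCount(text, modelName)
    // Close the current batch when this text would push it over the limit
    if (current.length > 0 && currentTokens + tokens > maxTokensPerBatch) {
      batches.push(current)
      current = []
      currentTokens = 0
    }
    current.push(text)
    currentTokens += tokens
  }
  if (current.length > 0) batches.push(current)
  return batches
}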

View File

@@ -12,8 +12,6 @@ export {
estimateOutputTokens,
estimateTokenCount,
getAccurateTokenCount,
-  getTokenCountsForBatch,
-  getTotalTokenCount,
truncateToTokenLimit,
} from '@/lib/tokenization/estimators'
export { processStreamingBlockLog, processStreamingBlockLogs } from '@/lib/tokenization/streaming'

View File

@@ -127,6 +127,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",

View File

@@ -17,7 +17,7 @@ export default defineConfig({
build: {
extensions: [
additionalPackages({
-      packages: ['unpdf'],
+      packages: ['unpdf', 'pdf-lib'],
}),
],
},

View File

@@ -1,5 +1,6 @@
{
"lockfileVersion": 1,
"configVersion": 0,
"workspaces": {
"": {
"name": "simstudio",
@@ -11,7 +12,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2",
"turbo": "2.7.3",
},
},
"apps/docs": {
@@ -156,6 +157,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",
@@ -912,6 +914,10 @@
"@orama/orama": ["@orama/orama@3.1.18", "", {}, "sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA=="],
"@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="],
"@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="],
"@peculiar/asn1-android": ["@peculiar/asn1-android@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-cBRCKtYPF7vJGN76/yG8VbxRcHLPF3HnkoHhKOZeHpoVtbMYfY9ROKtH3DtYUY9m8uI1Mh47PRhHf2hSK3xcSQ=="],
"@peculiar/asn1-cms": ["@peculiar/asn1-cms@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "@peculiar/asn1-x509": "^2.6.0", "@peculiar/asn1-x509-attr": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA=="],
@@ -2864,6 +2870,8 @@
"pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="],
"pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="],
"pdfjs-dist": ["pdfjs-dist@5.4.449", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.81" } }, "sha512-CegnUaT0QwAyQMS+7o2POr4wWUNNe8VaKKlcuoRHeYo98cVnqPpwOXNSx6Trl6szH02JrRcsPgletV6GmF3LtQ=="],
"peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="],
@@ -3362,19 +3370,19 @@
"tunnel-agent": ["tunnel-agent@0.6.0", "", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w=="],
"turbo": ["turbo@2.7.2", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.2", "turbo-darwin-arm64": "2.7.2", "turbo-linux-64": "2.7.2", "turbo-linux-arm64": "2.7.2", "turbo-windows-64": "2.7.2", "turbo-windows-arm64": "2.7.2" }, "bin": { "turbo": "bin/turbo" } }, "sha512-5JIA5aYBAJSAhrhbyag1ZuMSgUZnHtI+Sq3H8D3an4fL8PeF+L1yYvbEJg47akP1PFfATMf5ehkqFnxfkmuwZQ=="],
"turbo": ["turbo@2.7.3", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.3", "turbo-darwin-arm64": "2.7.3", "turbo-linux-64": "2.7.3", "turbo-linux-arm64": "2.7.3", "turbo-windows-64": "2.7.3", "turbo-windows-arm64": "2.7.3" }, "bin": { "turbo": "bin/turbo" } }, "sha512-+HjKlP4OfYk+qzvWNETA3cUO5UuK6b5MSc2UJOKyvBceKucQoQGb2g7HlC2H1GHdkfKrk4YF1VPvROkhVZDDLQ=="],
"turbo-darwin-64": ["turbo-darwin-64@2.7.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-dxY3X6ezcT5vm3coK6VGixbrhplbQMwgNsCsvZamS/+/6JiebqW9DKt4NwpgYXhDY2HdH00I7FWs3wkVuan4rA=="],
"turbo-darwin-64": ["turbo-darwin-64@2.7.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-aZHhvRiRHXbJw1EcEAq4aws1hsVVUZ9DPuSFaq9VVFAKCup7niIEwc22glxb7240yYEr1vLafdQ2U294Vcwz+w=="],
"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1bXmuwPLqNFt3mzrtYcVx1sdJ8UYb124Bf48nIgcpMCGZy3kDhgxNv1503kmuK/37OGOZbsWSQFU4I08feIuSg=="],
"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-CkVrHSq+Bnhl9sX2LQgqQYVfLTWC2gvI74C4758OmU0djfrssDKU9d4YQF0AYXXhIIRZipSXfxClQziIMD+EAg=="],
"turbo-linux-64": ["turbo-linux-64@2.7.2", "", { "os": "linux", "cpu": "x64" }, "sha512-kP+TiiMaiPugbRlv57VGLfcjFNsFbo8H64wMBCPV2270Or2TpDCBULMzZrvEsvWFjT3pBFvToYbdp8/Kw0jAQg=="],
"turbo-linux-64": ["turbo-linux-64@2.7.3", "", { "os": "linux", "cpu": "x64" }, "sha512-GqDsCNnzzr89kMaLGpRALyigUklzgxIrSy2pHZVXyifgczvYPnLglex78Aj3T2gu+T3trPPH2iJ+pWucVOCC2Q=="],
"turbo-linux-arm64": ["turbo-linux-arm64@2.7.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-VDJwQ0+8zjAfbyY6boNaWfP6RIez4ypKHxwkuB6SrWbOSk+vxTyW5/hEjytTwK8w/TsbKVcMDyvpora8tEsRFw=="],
"turbo-linux-arm64": ["turbo-linux-arm64@2.7.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-NdCDTfIcIo3dWjsiaAHlxu5gW61Ed/8maah1IAF/9E3EtX0aAHNiBMbuYLZaR4vRJ7BeVkYB6xKWRtdFLZ0y3g=="],
"turbo-windows-64": ["turbo-windows-64@2.7.2", "", { "os": "win32", "cpu": "x64" }, "sha512-rPjqQXVnI6A6oxgzNEE8DNb6Vdj2Wwyhfv3oDc+YM3U9P7CAcBIlKv/868mKl4vsBtz4ouWpTQNXG8vljgJO+w=="],
"turbo-windows-64": ["turbo-windows-64@2.7.3", "", { "os": "win32", "cpu": "x64" }, "sha512-7bVvO987daXGSJVYBoG8R4Q+csT1pKIgLJYZevXRQ0Hqw0Vv4mKme/TOjYXs9Qb1xMKh51Tb3bXKDbd8/4G08g=="],
"turbo-windows-arm64": ["turbo-windows-arm64@2.7.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-tcnHvBhO515OheIFWdxA+qUvZzNqqcHbLVFc1+n+TJ1rrp8prYicQtbtmsiKgMvr/54jb9jOabU62URAobnB7g=="],
"turbo-windows-arm64": ["turbo-windows-arm64@2.7.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-nTodweTbPmkvwMu/a55XvjMsPtuyUSC+sV7f/SR57K36rB2I0YG21qNETN+00LOTUW9B3omd8XkiXJkt4kx/cw=="],
"tweetnacl": ["tweetnacl@0.14.5", "", {}, "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA=="],
@@ -4046,6 +4054,8 @@
"path-scurry/lru-cache": ["lru-cache@11.2.4", "", {}, "sha512-B5Y16Jr9LB9dHVkh6ZevG+vAbOsNOYCX+sXvFWFu7B3Iz5mijW3zdbMyhsh8ANd2mSWBYdJgnqi+mL7/LrOPYg=="],
"pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],
"pino/thread-stream": ["thread-stream@3.1.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-OqyPZ9u96VohAyMfJykzmivOrY2wfMSf3C5TtFJVgN+Hm6aj+voFhlK+kZEIv2FBh1X6Xp3DlnCOfEQ3B2J86A=="],
"pino-pretty/pino-abstract-transport": ["pino-abstract-transport@3.0.0", "", { "dependencies": { "split2": "^4.0.0" } }, "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg=="],

View File

@@ -41,7 +41,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2"
"turbo": "2.7.3"
},
"lint-staged": {
"*.{js,jsx,ts,tsx,json,css,scss}": [