Mirror of https://github.com/simstudioai/sim.git, synced 2026-01-08 14:43:54 -05:00
improvement(kb): optimize processes, add more robust fallbacks for large file ops (#2684)
* improvement(kb): optimize processes, add more robust fallbacks for large file ops
* stronger typing
* comments cleanup
* ack PR comments
* upgraded turborepo
* ack more PR comments
* fix failing test
* moved doc update inside tx for embeddings chunks upload
* ack more PR comments
@@ -136,16 +136,29 @@ vi.mock('@sim/db', () => {
},
}),
}),
delete: () => ({
where: () => Promise.resolve(),
}),
insert: () => ({
values: (records: any) => {
dbOps.order.push('insert')
dbOps.insertRecords.push(records)
return Promise.resolve()
},
}),
transaction: vi.fn(async (fn: any) => {
await fn({
insert: (table: any) => ({
delete: () => ({
where: () => Promise.resolve(),
}),
insert: () => ({
values: (records: any) => {
dbOps.order.push('insert')
dbOps.insertRecords.push(records)
return Promise.resolve()
},
}),
update: (table: any) => ({
update: () => ({
set: (payload: any) => ({
where: () => {
dbOps.updatePayloads.push(payload)
@@ -453,6 +453,8 @@ export function KnowledgeBase({
error: knowledgeBaseError,
refresh: refreshKnowledgeBase,
} = useKnowledgeBase(id)
const [hasProcessingDocuments, setHasProcessingDocuments] = useState(false)

const {
documents,
pagination,
@@ -468,6 +470,7 @@ export function KnowledgeBase({
offset: (currentPage - 1) * DOCUMENTS_PER_PAGE,
sortBy,
sortOrder,
refetchInterval: hasProcessingDocuments && !isDeleting ? 3000 : false,
})

const { tagDefinitions } = useKnowledgeBaseTagDefinitions(id)
@@ -534,25 +537,15 @@ export function KnowledgeBase({
)

useEffect(() => {
const hasProcessingDocuments = documents.some(
const processing = documents.some(
(doc) => doc.processingStatus === 'pending' || doc.processingStatus === 'processing'
)
setHasProcessingDocuments(processing)

if (!hasProcessingDocuments) return

const refreshInterval = setInterval(async () => {
try {
if (!isDeleting) {
await checkForDeadProcesses()
await refreshDocuments()
}
} catch (error) {
logger.error('Error refreshing documents:', error)
}
}, 3000)

return () => clearInterval(refreshInterval)
}, [documents, refreshDocuments, isDeleting])
if (processing) {
checkForDeadProcesses()
}
}, [documents])

/**
* Checks for documents with stale processing states and marks them as failed
@@ -672,25 +665,6 @@ export function KnowledgeBase({
await refreshDocuments()

let refreshAttempts = 0
const maxRefreshAttempts = 3
const refreshInterval = setInterval(async () => {
try {
refreshAttempts++
await refreshDocuments()
if (refreshAttempts >= maxRefreshAttempts) {
clearInterval(refreshInterval)
}
} catch (error) {
logger.error('Error refreshing documents after retry:', error)
clearInterval(refreshInterval)
}
}, 1000)

setTimeout(() => {
clearInterval(refreshInterval)
}, 4000)

logger.info(`Document retry initiated successfully for: ${docId}`)
} catch (err) {
logger.error('Error retrying document:', err)
@@ -27,6 +27,7 @@ export type DocumentProcessingPayload = {
export const processDocument = task({
id: 'knowledge-process-document',
maxDuration: env.KB_CONFIG_MAX_DURATION || 600,
machine: 'large-1x', // 2 vCPU, 2GB RAM - needed for large PDF processing
retry: {
maxAttempts: env.KB_CONFIG_MAX_ATTEMPTS || 3,
factor: env.KB_CONFIG_RETRY_FACTOR || 2,
@@ -228,6 +228,7 @@ export function useKnowledgeDocumentsQuery(
params: KnowledgeDocumentsParams,
options?: {
enabled?: boolean
refetchInterval?: number | false
}
) {
const paramsKey = serializeDocumentParams(params)
@@ -237,6 +238,7 @@ export function useKnowledgeDocumentsQuery(
enabled: (options?.enabled ?? true) && Boolean(params.knowledgeBaseId),
staleTime: 60 * 1000,
placeholderData: keepPreviousData,
refetchInterval: options?.refetchInterval ?? false,
})
}
@@ -67,6 +67,7 @@ export function useKnowledgeBaseDocuments(
sortBy?: string
sortOrder?: string
enabled?: boolean
refetchInterval?: number | false
}
) {
const queryClient = useQueryClient()
@@ -92,6 +93,7 @@ export function useKnowledgeBaseDocuments(
},
{
enabled: (options?.enabled ?? true) && Boolean(knowledgeBaseId),
refetchInterval: options?.refetchInterval,
}
)
@@ -16,7 +16,7 @@ interface HeaderInfo {
interface Frontmatter {
title?: string
description?: string
[key: string]: any
[key: string]: unknown
}

const logger = createLogger('DocsChunker')
@@ -6,6 +6,11 @@ import { estimateTokenCount } from '@/lib/tokenization/estimators'
const logger = createLogger('JsonYamlChunker')

type JsonPrimitive = string | number | boolean | null
type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]

function getTokenCount(text: string): number {
try {
return getAccurateTokenCount(text, 'text-embedding-3-small')
@@ -59,11 +64,11 @@ export class JsonYamlChunker {
*/
async chunk(content: string): Promise<Chunk[]> {
try {
let data: any
let data: JsonValue
try {
data = JSON.parse(content)
data = JSON.parse(content) as JsonValue
} catch {
data = yaml.load(content)
data = yaml.load(content) as JsonValue
}
const chunks = this.chunkStructuredData(data)
@@ -86,7 +91,7 @@ export class JsonYamlChunker {
/**
* Chunk structured data based on its structure
*/
private chunkStructuredData(data: any, path: string[] = []): Chunk[] {
private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] {
const chunks: Chunk[] = []

if (Array.isArray(data)) {
@@ -94,7 +99,7 @@ export class JsonYamlChunker {
}

if (typeof data === 'object' && data !== null) {
return this.chunkObject(data, path)
return this.chunkObject(data as JsonObject, path)
}

const content = JSON.stringify(data, null, 2)
@@ -118,9 +123,9 @@ export class JsonYamlChunker {
/**
* Chunk an array intelligently
*/
private chunkArray(arr: any[], path: string[]): Chunk[] {
private chunkArray(arr: JsonArray, path: string[]): Chunk[] {
const chunks: Chunk[] = []
let currentBatch: any[] = []
let currentBatch: JsonValue[] = []
let currentTokens = 0

const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
@@ -194,7 +199,7 @@ export class JsonYamlChunker {
/**
* Chunk an object intelligently
*/
private chunkObject(obj: Record<string, any>, path: string[]): Chunk[] {
private chunkObject(obj: JsonObject, path: string[]): Chunk[] {
const chunks: Chunk[] = []
const entries = Object.entries(obj)
@@ -213,7 +218,7 @@ export class JsonYamlChunker {
return chunks
}

let currentObj: Record<string, any> = {}
let currentObj: JsonObject = {}
let currentTokens = 0
let currentKeys: string[] = []
@@ -110,10 +110,12 @@ export class TextChunker {
chunks.push(currentChunk.trim())
}

// Start new chunk with current part
// If part itself is too large, split it further
if (this.estimateTokens(part) > this.chunkSize) {
chunks.push(...(await this.splitRecursively(part, separatorIndex + 1)))
const subChunks = await this.splitRecursively(part, separatorIndex + 1)
for (const subChunk of subChunks) {
chunks.push(subChunk)
}
currentChunk = ''
} else {
currentChunk = part
@@ -178,6 +178,7 @@ export const env = createEnv({
KB_CONFIG_BATCH_SIZE: z.number().optional().default(2000), // Chunks to process per embedding batch
KB_CONFIG_DELAY_BETWEEN_BATCHES: z.number().optional().default(0), // Delay between batches in ms (0 for max speed)
KB_CONFIG_DELAY_BETWEEN_DOCUMENTS: z.number().optional().default(50), // Delay between documents in ms
KB_CONFIG_CHUNK_CONCURRENCY: z.number().optional().default(10), // Concurrent PDF chunk OCR processing

// Real-time Communication
SOCKET_SERVER_URL: z.string().url().optional(), // WebSocket server URL for real-time features
@@ -17,8 +17,6 @@ export class DocParser implements FileParser {
throw new Error(`File not found: ${filePath}`)
}

logger.info(`Parsing DOC file: ${filePath}`)

const buffer = await readFile(filePath)
return this.parseBuffer(buffer)
} catch (error) {
@@ -29,53 +27,80 @@ export class DocParser implements FileParser {
async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
try {
logger.info('Parsing DOC buffer, size:', buffer.length)

if (!buffer || buffer.length === 0) {
throw new Error('Empty buffer provided')
}

let parseOfficeAsync
try {
const officeParser = await import('officeparser')
parseOfficeAsync = officeParser.parseOfficeAsync
} catch (importError) {
logger.warn('officeparser not available, using fallback extraction')
return this.fallbackExtraction(buffer)
const result = await officeParser.parseOfficeAsync(buffer)

if (result) {
const resultString = typeof result === 'string' ? result : String(result)
const content = sanitizeTextForUTF8(resultString.trim())

if (content.length > 0) {
return {
content,
metadata: {
characterCount: content.length,
extractionMethod: 'officeparser',
},
}
}
}
} catch (officeError) {
logger.warn('officeparser failed, trying mammoth:', officeError)
}

try {
const result = await parseOfficeAsync(buffer)
const mammoth = await import('mammoth')
const result = await mammoth.extractRawText({ buffer })

if (!result) {
throw new Error('officeparser returned no result')
if (result.value && result.value.trim().length > 0) {
const content = sanitizeTextForUTF8(result.value.trim())
return {
content,
metadata: {
characterCount: content.length,
extractionMethod: 'mammoth',
messages: result.messages,
},
}
}

const resultString = typeof result === 'string' ? result : String(result)

const content = sanitizeTextForUTF8(resultString.trim())

logger.info('DOC parsing completed successfully with officeparser')

return {
content: content,
metadata: {
characterCount: content.length,
extractionMethod: 'officeparser',
},
}
} catch (extractError) {
logger.warn('officeparser failed, using fallback:', extractError)
return this.fallbackExtraction(buffer)
} catch (mammothError) {
logger.warn('mammoth failed:', mammothError)
}

return this.fallbackExtraction(buffer)
} catch (error) {
logger.error('DOC buffer parsing error:', error)
logger.error('DOC parsing error:', error)
throw new Error(`Failed to parse DOC buffer: ${(error as Error).message}`)
}
}

private fallbackExtraction(buffer: Buffer): FileParseResult {
logger.info('Using fallback text extraction for DOC file')
const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf

if (!isBinaryDoc) {
const textContent = buffer.toString('utf8').trim()

if (textContent.length > 0) {
const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length || 0
const isProbablyText = printableChars / textContent.length > 0.9

if (isProbablyText) {
return {
content: sanitizeTextForUTF8(textContent),
metadata: {
extractionMethod: 'plaintext-fallback',
characterCount: textContent.length,
warning: 'File is not a valid DOC format, extracted as plain text',
},
}
}
}
}

const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))
@@ -2,13 +2,18 @@ import { readFile } from 'fs/promises'
import { createLogger } from '@sim/logger'
import mammoth from 'mammoth'
import type { FileParseResult, FileParser } from '@/lib/file-parsers/types'
import { sanitizeTextForUTF8 } from '@/lib/file-parsers/utils'

const logger = createLogger('DocxParser')

// Define interface for mammoth result
interface MammothMessage {
type: 'warning' | 'error'
message: string
}

interface MammothResult {
value: string
messages: any[]
messages: MammothMessage[]
}

export class DocxParser implements FileParser {
@@ -19,7 +24,6 @@ export class DocxParser implements FileParser {
}

const buffer = await readFile(filePath)

return this.parseBuffer(buffer)
} catch (error) {
logger.error('DOCX file error:', error)
@@ -29,26 +33,74 @@ export class DocxParser implements FileParser {
async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
try {
logger.info('Parsing buffer, size:', buffer.length)
if (!buffer || buffer.length === 0) {
throw new Error('Empty buffer provided')
}

const result = await mammoth.extractRawText({ buffer })

let htmlResult: MammothResult = { value: '', messages: [] }
try {
htmlResult = await mammoth.convertToHtml({ buffer })
} catch (htmlError) {
logger.warn('HTML conversion warning:', htmlError)
const result = await mammoth.extractRawText({ buffer })

if (result.value && result.value.trim().length > 0) {
let htmlResult: MammothResult = { value: '', messages: [] }
try {
htmlResult = await mammoth.convertToHtml({ buffer })
} catch {
// HTML conversion is optional
}

return {
content: sanitizeTextForUTF8(result.value),
metadata: {
extractionMethod: 'mammoth',
messages: [...result.messages, ...htmlResult.messages],
html: htmlResult.value,
},
}
}
} catch (mammothError) {
logger.warn('mammoth failed, trying officeparser:', mammothError)
}

return {
content: result.value,
metadata: {
messages: [...result.messages, ...htmlResult.messages],
html: htmlResult.value,
},
try {
const officeParser = await import('officeparser')
const result = await officeParser.parseOfficeAsync(buffer)

if (result) {
const resultString = typeof result === 'string' ? result : String(result)
const content = sanitizeTextForUTF8(resultString.trim())

if (content.length > 0) {
return {
content,
metadata: {
extractionMethod: 'officeparser',
characterCount: content.length,
},
}
}
}
} catch (officeError) {
logger.warn('officeparser failed:', officeError)
}

const isZipFile = buffer.length >= 2 && buffer[0] === 0x50 && buffer[1] === 0x4b
if (!isZipFile) {
const textContent = buffer.toString('utf8').trim()
if (textContent.length > 0) {
return {
content: sanitizeTextForUTF8(textContent),
metadata: {
extractionMethod: 'plaintext-fallback',
characterCount: textContent.length,
warning: 'File is not a valid DOCX format, extracted as plain text',
},
}
}
}

throw new Error('Failed to extract text from DOCX file')
} catch (error) {
logger.error('DOCX buffer parsing error:', error)
logger.error('DOCX parsing error:', error)
throw new Error(`Failed to parse DOCX buffer: ${(error as Error).message}`)
}
}
@@ -1,6 +1,22 @@
export interface FileParseMetadata {
characterCount?: number
pageCount?: number
extractionMethod?: string
warning?: string
messages?: unknown[]
html?: string
type?: string
headers?: string[]
totalRows?: number
rowCount?: number
sheetNames?: string[]
source?: string
[key: string]: unknown
}

export interface FileParseResult {
content: string
metadata?: Record<string, any>
metadata?: FileParseMetadata
}

export interface FileParser {
@@ -1,8 +1,10 @@
import { createLogger } from '@sim/logger'
import { PDFDocument } from 'pdf-lib'
import { getBYOKKey } from '@/lib/api-key/byok'
import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers'
import { env } from '@/lib/core/config/env'
import { parseBuffer, parseFile } from '@/lib/file-parsers'
import type { FileParseMetadata } from '@/lib/file-parsers/types'
import { retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
import { StorageService } from '@/lib/uploads'
import { downloadFileFromUrl } from '@/lib/uploads/utils/file-utils.server'
@@ -15,6 +17,8 @@ const TIMEOUTS = {
MISTRAL_OCR_API: 120000,
} as const

const MAX_CONCURRENT_CHUNKS = env.KB_CONFIG_CHUNK_CONCURRENCY

type OCRResult = {
success: boolean
error?: string
@@ -36,6 +40,61 @@ type OCRRequestBody = {
include_image_base64: boolean
}

const MISTRAL_MAX_PAGES = 1000

/**
* Get page count from a PDF buffer using unpdf
*/
async function getPdfPageCount(buffer: Buffer): Promise<number> {
try {
const { getDocumentProxy } = await import('unpdf')
const uint8Array = new Uint8Array(buffer)
const pdf = await getDocumentProxy(uint8Array)
return pdf.numPages
} catch (error) {
logger.warn('Failed to get PDF page count:', error)
return 0
}
}

/**
* Split a PDF buffer into multiple smaller PDFs
* Returns an array of PDF buffers, each with at most maxPages pages
*/
async function splitPdfIntoChunks(
pdfBuffer: Buffer,
maxPages: number
): Promise<{ buffer: Buffer; startPage: number; endPage: number }[]> {
const sourcePdf = await PDFDocument.load(pdfBuffer)
const totalPages = sourcePdf.getPageCount()

if (totalPages <= maxPages) {
return [{ buffer: pdfBuffer, startPage: 0, endPage: totalPages - 1 }]
}

const chunks: { buffer: Buffer; startPage: number; endPage: number }[] = []

for (let startPage = 0; startPage < totalPages; startPage += maxPages) {
const endPage = Math.min(startPage + maxPages - 1, totalPages - 1)
const pageCount = endPage - startPage + 1

const newPdf = await PDFDocument.create()
const pageIndices = Array.from({ length: pageCount }, (_, i) => startPage + i)
const copiedPages = await newPdf.copyPages(sourcePdf, pageIndices)

copiedPages.forEach((page) => newPdf.addPage(page))

const pdfBytes = await newPdf.save()
chunks.push({
buffer: Buffer.from(pdfBytes),
startPage,
endPage,
})
}

return chunks
}

type AzureOCRResponse = {
pages?: OCRPage[]
[key: string]: unknown
@@ -81,7 +140,7 @@ export async function processDocument(
const cloudUrl = 'cloudUrl' in parseResult ? parseResult.cloudUrl : undefined

let chunks: Chunk[]
const metadata = 'metadata' in parseResult ? parseResult.metadata : {}
const metadata: FileParseMetadata = parseResult.metadata ?? {}

const isJsonYaml =
metadata.type === 'json' ||
@@ -97,10 +156,11 @@ export async function processDocument(
})
} else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
logger.info('Using structured data chunker for spreadsheet/CSV content')
const rowCount = metadata.totalRows ?? metadata.rowCount
chunks = await StructuredDataChunker.chunkStructuredData(content, {
chunkSize,
headers: metadata.headers,
totalRows: metadata.totalRows || metadata.rowCount,
totalRows: typeof rowCount === 'number' ? rowCount : undefined,
sheetName: metadata.sheetNames?.[0],
})
} else {
@@ -153,7 +213,7 @@ async function parseDocument(
content: string
processingMethod: 'file-parser' | 'mistral-ocr'
cloudUrl?: string
metadata?: any
metadata?: FileParseMetadata
}> {
const isPDF = mimeType === 'application/pdf'
const hasAzureMistralOCR =
@@ -165,7 +225,7 @@ async function parseDocument(
if (isPDF && (hasAzureMistralOCR || hasMistralOCR)) {
if (hasAzureMistralOCR) {
logger.info(`Using Azure Mistral OCR: ${filename}`)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType, userId, workspaceId)
return parseWithAzureMistralOCR(fileUrl, filename, mimeType)
}

if (hasMistralOCR) {
@@ -188,13 +248,32 @@ async function handleFileForOCR(
const isExternalHttps = fileUrl.startsWith('https://') && !fileUrl.includes('/api/files/serve/')

if (isExternalHttps) {
return { httpsUrl: fileUrl }
if (mimeType === 'application/pdf') {
logger.info(`handleFileForOCR: Downloading external PDF to check page count`)
try {
const buffer = await downloadFileWithTimeout(fileUrl)
logger.info(`handleFileForOCR: Downloaded external PDF: ${buffer.length} bytes`)
return { httpsUrl: fileUrl, buffer }
} catch (error) {
logger.warn(
`handleFileForOCR: Failed to download external PDF for page count check, proceeding without batching`,
{
error: error instanceof Error ? error.message : String(error),
}
)
return { httpsUrl: fileUrl, buffer: undefined }
}
}
logger.info(`handleFileForOCR: Using external URL directly`)
return { httpsUrl: fileUrl, buffer: undefined }
}

logger.info(`Uploading "${filename}" to cloud storage for OCR`)

const buffer = await downloadFileWithTimeout(fileUrl)

logger.info(`Downloaded ${filename}: ${buffer.length} bytes`)

try {
const metadata: Record<string, string> = {
originalName: filename,
@@ -224,8 +303,7 @@ async function handleFileForOCR(
900 // 15 minutes
)

logger.info(`Successfully uploaded for OCR: ${cloudResult.key}`)
return { httpsUrl, cloudUrl: httpsUrl }
return { httpsUrl, cloudUrl: httpsUrl, buffer }
} catch (uploadError) {
const message = uploadError instanceof Error ? uploadError.message : 'Unknown error'
throw new Error(`Cloud upload failed: ${message}. Cloud upload is required for OCR.`)
@@ -321,13 +399,7 @@ async function makeOCRRequest(
}
}

async function parseWithAzureMistralOCR(
fileUrl: string,
filename: string,
mimeType: string,
userId?: string,
workspaceId?: string | null
) {
async function parseWithAzureMistralOCR(fileUrl: string, filename: string, mimeType: string) {
validateOCRConfig(
env.OCR_AZURE_API_KEY,
env.OCR_AZURE_ENDPOINT,
@@ -336,6 +408,19 @@ async function parseWithAzureMistralOCR(
)

const fileBuffer = await downloadFileForBase64(fileUrl)

if (mimeType === 'application/pdf') {
const pageCount = await getPdfPageCount(fileBuffer)
if (pageCount > MISTRAL_MAX_PAGES) {
logger.info(
`PDF has ${pageCount} pages, exceeds Azure OCR limit of ${MISTRAL_MAX_PAGES}. ` +
`Falling back to file parser.`
)
return parseWithFileParser(fileUrl, filename, mimeType)
}
logger.info(`Azure Mistral OCR: PDF page count for ${filename}: ${pageCount}`)
}

const base64Data = fileBuffer.toString('base64')
const dataUri = `data:${mimeType};base64,${base64Data}`
@@ -374,17 +459,7 @@ async function parseWithAzureMistralOCR(
message: error instanceof Error ? error.message : String(error),
})

const fallbackMistralKey = await getMistralApiKey(workspaceId)
if (fallbackMistralKey) {
return parseWithMistralOCR(
fileUrl,
filename,
mimeType,
userId,
workspaceId,
fallbackMistralKey
)
}
logger.info(`Falling back to file parser: ${filename}`)
return parseWithFileParser(fileUrl, filename, mimeType)
}
}
@@ -406,50 +481,35 @@ async function parseWithMistralOCR(
throw new Error('Mistral parser tool not configured')
}

const { httpsUrl, cloudUrl } = await handleFileForOCR(
const { httpsUrl, cloudUrl, buffer } = await handleFileForOCR(
fileUrl,
filename,
mimeType,
userId,
workspaceId
)

logger.info(`Mistral OCR: Using presigned URL for ${filename}: ${httpsUrl.substring(0, 120)}...`)

let pageCount = 0
if (mimeType === 'application/pdf' && buffer) {
pageCount = await getPdfPageCount(buffer)
logger.info(`PDF page count for ${filename}: ${pageCount}`)
}

const needsBatching = pageCount > MISTRAL_MAX_PAGES

if (needsBatching && buffer) {
logger.info(
`PDF has ${pageCount} pages, exceeds limit of ${MISTRAL_MAX_PAGES}. Splitting and processing in chunks.`
)
return processMistralOCRInBatches(filename, apiKey, buffer, userId, cloudUrl)
}

const params = { filePath: httpsUrl, apiKey, resultType: 'text' as const }

try {
const response = await retryWithExponentialBackoff(
async () => {
let url =
typeof mistralParserTool.request!.url === 'function'
? mistralParserTool.request!.url(params)
: mistralParserTool.request!.url

const isInternalRoute = url.startsWith('/')

if (isInternalRoute) {
const { getBaseUrl } = await import('@/lib/core/utils/urls')
url = `${getBaseUrl()}${url}`
}

let headers =
typeof mistralParserTool.request!.headers === 'function'
? mistralParserTool.request!.headers(params)
: mistralParserTool.request!.headers

if (isInternalRoute) {
const { generateInternalToken } = await import('@/lib/auth/internal')
const internalToken = await generateInternalToken(userId)
headers = {
...headers,
Authorization: `Bearer ${internalToken}`,
}
}

const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody
return makeOCRRequest(url, headers as Record<string, string>, requestBody)
},
{ maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 }
)

const response = await executeMistralOCRRequest(params, userId)
const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult
const content = processOCRContent(result, filename)
@@ -464,10 +524,204 @@ async function parseWithMistralOCR(
}
}

async function executeMistralOCRRequest(
params: { filePath: string; apiKey: string; resultType: 'text' },
userId?: string
): Promise<Response> {
return retryWithExponentialBackoff(
async () => {
let url =
typeof mistralParserTool.request!.url === 'function'
? mistralParserTool.request!.url(params)
: mistralParserTool.request!.url

const isInternalRoute = url.startsWith('/')

if (isInternalRoute) {
const { getBaseUrl } = await import('@/lib/core/utils/urls')
url = `${getBaseUrl()}${url}`
}

let headers =
typeof mistralParserTool.request!.headers === 'function'
? mistralParserTool.request!.headers(params)
: mistralParserTool.request!.headers

if (isInternalRoute) {
const { generateInternalToken } = await import('@/lib/auth/internal')
const internalToken = await generateInternalToken(userId)
headers = {
...headers,
Authorization: `Bearer ${internalToken}`,
}
}

const requestBody = mistralParserTool.request!.body!(params) as OCRRequestBody
return makeOCRRequest(url, headers as Record<string, string>, requestBody)
},
{ maxRetries: 3, initialDelayMs: 1000, maxDelayMs: 10000 }
)
}

/**
* Process a single PDF chunk: upload to S3, OCR, cleanup
*/
async function processChunk(
chunk: { buffer: Buffer; startPage: number; endPage: number },
chunkIndex: number,
totalChunks: number,
filename: string,
apiKey: string,
userId?: string
): Promise<{ index: number; content: string | null }> {
const chunkPageCount = chunk.endPage - chunk.startPage + 1

logger.info(
`Processing chunk ${chunkIndex + 1}/${totalChunks} (pages ${chunk.startPage + 1}-${chunk.endPage + 1}, ${chunkPageCount} pages)`
)

let uploadedKey: string | null = null

try {
// Upload the chunk to S3
const timestamp = Date.now()
const uniqueId = Math.random().toString(36).substring(2, 9)
const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_')
const chunkKey = `kb/${timestamp}-${uniqueId}-chunk${chunkIndex + 1}-${safeFileName}`

const metadata: Record<string, string> = {
originalName: `${filename}_chunk${chunkIndex + 1}`,
uploadedAt: new Date().toISOString(),
purpose: 'knowledge-base',
...(userId && { userId }),
}

const uploadResult = await StorageService.uploadFile({
file: chunk.buffer,
fileName: `${filename}_chunk${chunkIndex + 1}`,
contentType: 'application/pdf',
context: 'knowledge-base',
customKey: chunkKey,
metadata,
})

uploadedKey = uploadResult.key

const chunkUrl = await StorageService.generatePresignedDownloadUrl(
uploadResult.key,
'knowledge-base',
900 // 15 minutes
)

logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`)

// Process the chunk with Mistral OCR
const params = {
filePath: chunkUrl,
apiKey,
resultType: 'text' as const,
}

const response = await executeMistralOCRRequest(params, userId)
const result = (await mistralParserTool.transformResponse!(response, params)) as OCRResult

if (result.success && result.output?.content) {
logger.info(`Chunk ${chunkIndex + 1}/${totalChunks} completed successfully`)
return { index: chunkIndex, content: result.output.content }
}
logger.warn(`Chunk ${chunkIndex + 1}/${totalChunks} returned no content`)
return { index: chunkIndex, content: null }
} catch (error) {
logger.error(`Chunk ${chunkIndex + 1}/${totalChunks} failed:`, {
message: error instanceof Error ? error.message : String(error),
})
return { index: chunkIndex, content: null }
} finally {
// Clean up the chunk file from S3 after processing
if (uploadedKey) {
try {
await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' })
logger.info(`Cleaned up chunk ${chunkIndex + 1} from S3`)
} catch (deleteError) {
logger.warn(`Failed to clean up chunk ${chunkIndex + 1} from S3:`, {
message: deleteError instanceof Error ? deleteError.message : String(deleteError),
})
}
}
}
}

async function processMistralOCRInBatches(
filename: string,
apiKey: string,
pdfBuffer: Buffer,
userId?: string,
cloudUrl?: string
): Promise<{
content: string
processingMethod: 'mistral-ocr'
cloudUrl?: string
}> {
const totalPages = await getPdfPageCount(pdfBuffer)
logger.info(
`Splitting ${filename} (${totalPages} pages) into chunks of ${MISTRAL_MAX_PAGES} pages`
)

const pdfChunks = await splitPdfIntoChunks(pdfBuffer, MISTRAL_MAX_PAGES)
logger.info(
`Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}`
)

// Process chunks concurrently with limited concurrency
const results: { index: number; content: string | null }[] = []

for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) {
const batch = pdfChunks.slice(i, i + MAX_CONCURRENT_CHUNKS)
const batchPromises = batch.map((chunk, batchIndex) =>
processChunk(chunk, i + batchIndex, pdfChunks.length, filename, apiKey, userId)
)

const batchResults = await Promise.all(batchPromises)
for (const result of batchResults) {
results.push(result)
}

logger.info(
`Completed batch ${Math.floor(i / MAX_CONCURRENT_CHUNKS) + 1}/${Math.ceil(pdfChunks.length / MAX_CONCURRENT_CHUNKS)}`
)
}

// Sort by index to maintain page order and filter out nulls
const sortedResults = results
.sort((a, b) => a.index - b.index)
.filter((r) => r.content !== null)
.map((r) => r.content as string)

if (sortedResults.length === 0) {
// Don't fall back to file parser for large PDFs - it produces poor results
// Better to fail clearly than return low-quality extraction
throw new Error(
`OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
`Large PDFs require OCR - file parser fallback would produce poor results.`
)
}

const combinedContent = sortedResults.join('\n\n')
logger.info(
`Successfully processed ${sortedResults.length}/${pdfChunks.length} chunks for ${filename}`
)

return {
content: combinedContent,
processingMethod: 'mistral-ocr',
cloudUrl,
}
}

async function parseWithFileParser(fileUrl: string, filename: string, mimeType: string) {
try {
let content: string
let metadata: any = {}
let metadata: FileParseMetadata = {}

if (fileUrl.startsWith('data:')) {
content = await parseDataURI(fileUrl, filename, mimeType)
@@ -513,7 +767,7 @@ async function parseDataURI(fileUrl: string, filename: string, mimeType: string)
async function parseHttpFile(
fileUrl: string,
filename: string
): Promise<{ content: string; metadata?: any }> {
): Promise<{ content: string; metadata?: FileParseMetadata }> {
const buffer = await downloadFileWithTimeout(fileUrl)

const extension = filename.split('.').pop()?.toLowerCase()
@@ -212,7 +212,6 @@ export async function processDocumentTags(
return result
}

// Fetch existing tag definitions
const existingDefinitions = await db
.select()
.from(knowledgeBaseTagDefinitions)
@@ -220,18 +219,15 @@ export async function processDocumentTags(
const existingByName = new Map(existingDefinitions.map((def) => [def.displayName, def]))

// First pass: collect all validation errors
const undefinedTags: string[] = []
const typeErrors: string[] = []

for (const tag of tagData) {
// Skip if no tag name
if (!tag.tagName?.trim()) continue

const tagName = tag.tagName.trim()
const fieldType = tag.fieldType || 'text'

// For boolean, check if value is defined; for others, check if value is non-empty
const hasValue =
fieldType === 'boolean'
? tag.value !== undefined && tag.value !== null && tag.value !== ''
@@ -239,14 +235,12 @@ export async function processDocumentTags(
if (!hasValue) continue

// Check if tag exists
const existingDef = existingByName.get(tagName)
if (!existingDef) {
undefinedTags.push(tagName)
continue
}

// Validate value type using shared validation
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const actualFieldType = existingDef.fieldType || fieldType
const validationError = validateTagValue(tagName, String(rawValue), actualFieldType)
@@ -255,7 +249,6 @@ export async function processDocumentTags(
}
}

// Throw combined error if there are any validation issues
if (undefinedTags.length > 0 || typeErrors.length > 0) {
const errorParts: string[] = []
@@ -270,7 +263,6 @@ export async function processDocumentTags(
throw new Error(errorParts.join('\n'))
}

// Second pass: process valid tags
for (const tag of tagData) {
if (!tag.tagName?.trim()) continue
@@ -285,14 +277,13 @@ export async function processDocumentTags(
if (!hasValue) continue

const existingDef = existingByName.get(tagName)
if (!existingDef) continue // Already validated above
if (!existingDef) continue

const targetSlot = existingDef.tagSlot
const actualFieldType = existingDef.fieldType || fieldType
const rawValue = typeof tag.value === 'string' ? tag.value.trim() : tag.value
const stringValue = String(rawValue).trim()

// Assign value to the slot with proper type conversion (values already validated)
if (actualFieldType === 'boolean') {
setTagValue(result, targetSlot, parseBooleanValue(stringValue) ?? false)
} else if (actualFieldType === 'number') {
@@ -440,7 +431,6 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Status updated to 'processing', starting document processor`)

// Use KB's chunkingConfig as fallback if processingOptions not provided
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }

await withTimeout(
@@ -469,7 +459,6 @@ export async function processDocumentAsync(
`[${documentId}] Document parsed successfully, generating embeddings for ${processed.chunks.length} chunks`
)

// Generate embeddings in batches for large documents
const chunkTexts = processed.chunks.map((chunk) => chunk.text)
const embeddings: number[][] = []
@@ -485,7 +474,9 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Processing embedding batch ${batchNum}/${totalBatches}`)
const batchEmbeddings = await generateEmbeddings(batch, undefined, kb[0].workspaceId)
embeddings.push(...batchEmbeddings)
for (const emb of batchEmbeddings) {
embeddings.push(emb)
}
}
}
@@ -562,23 +553,18 @@ export async function processDocumentAsync(
}))

await db.transaction(async (tx) => {
// Insert embeddings in batches for large documents
if (embeddingRecords.length > 0) {
const batchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
const totalBatches = Math.ceil(embeddingRecords.length / batchSize)
await tx.delete(embedding).where(eq(embedding.documentId, documentId))

logger.info(
`[${documentId}] Inserting ${embeddingRecords.length} embeddings in ${totalBatches} batches`
)

for (let i = 0; i < embeddingRecords.length; i += batchSize) {
const batch = embeddingRecords.slice(i, i + batchSize)
const batchNum = Math.floor(i / batchSize) + 1
const insertBatchSize = LARGE_DOC_CONFIG.MAX_CHUNKS_PER_BATCH
const batches: (typeof embeddingRecords)[] = []
for (let i = 0; i < embeddingRecords.length; i += insertBatchSize) {
batches.push(embeddingRecords.slice(i, i + insertBatchSize))
}

logger.info(`[${documentId}] Inserting ${embeddingRecords.length} embeddings`)
for (const batch of batches) {
await tx.insert(embedding).values(batch)
logger.info(
`[${documentId}] Inserted batch ${batchNum}/${totalBatches} (${batch.length} records)`
)
}
}
@@ -689,11 +675,9 @@ export async function createDocumentRecords(
requestId: string,
userId?: string
): Promise<DocumentData[]> {
// Check storage limits before creating documents
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)

// Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -713,7 +697,7 @@ export async function createDocumentRecords(
for (const docData of documents) {
const documentId = randomUUID()

let processedTags: Record<string, any> = {}
let processedTags: Partial<ProcessedDocumentTags> = {}

if (docData.documentTagsData) {
try {
@@ -722,7 +706,6 @@ export async function createDocumentRecords(
processedTags = await processDocumentTags(knowledgeBaseId, tagData, requestId)
}
} catch (error) {
// Re-throw validation errors, only catch JSON parse errors
if (error instanceof SyntaxError) {
logger.warn(`[${requestId}] Failed to parse documentTagsData for bulk document:`, error)
} else {
@@ -791,7 +774,6 @@ export async function createDocumentRecords(
if (userId) {
const totalSize = documents.reduce((sum, doc) => sum + doc.fileSize, 0)

// Get knowledge base owner
const kb = await db
.select({ userId: knowledgeBase.userId })
.from(knowledgeBase)
@@ -1079,7 +1061,7 @@ export async function createSingleDocument(
const now = new Date()

// Process structured tag data if provided
let processedTags: Record<string, any> = {
let processedTags: ProcessedDocumentTags = {
// Text tags (7 slots)
tag1: documentData.tag1 ?? null,
tag2: documentData.tag2 ?? null,
@@ -1555,23 +1537,30 @@ export async function updateDocument(
return value || null
}

// Type-safe access to tag slots in updateData
type UpdateDataWithTags = typeof updateData & Record<TagSlot, string | undefined>
const typedUpdateData = updateData as UpdateDataWithTags

ALL_TAG_SLOTS.forEach((slot: TagSlot) => {
const updateValue = (updateData as any)[slot]
const updateValue = typedUpdateData[slot]
if (updateValue !== undefined) {
;(dbUpdateData as any)[slot] = convertTagValue(slot, updateValue)
;(dbUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[slot] =
convertTagValue(slot, updateValue)
}
})

await db.transaction(async (tx) => {
await tx.update(document).set(dbUpdateData).where(eq(document.id, documentId))

const hasTagUpdates = ALL_TAG_SLOTS.some((field) => (updateData as any)[field] !== undefined)
const hasTagUpdates = ALL_TAG_SLOTS.some((field) => typedUpdateData[field] !== undefined)

if (hasTagUpdates) {
const embeddingUpdateData: Record<string, any> = {}
const embeddingUpdateData: Partial<ProcessedDocumentTags> = {}
ALL_TAG_SLOTS.forEach((field) => {
if ((updateData as any)[field] !== undefined) {
embeddingUpdateData[field] = convertTagValue(field, (updateData as any)[field])
if (typedUpdateData[field] !== undefined) {
;(embeddingUpdateData as Record<TagSlot, string | number | Date | boolean | null>)[
field
] = convertTagValue(field, typedUpdateData[field])
}
})
@@ -14,7 +14,7 @@ export interface RetryOptions {
initialDelayMs?: number
maxDelayMs?: number
backoffMultiplier?: number
retryCondition?: (error: RetryableError) => boolean
retryCondition?: (error: unknown) => boolean
}

export interface RetryResult<T> {
@@ -30,11 +30,18 @@ function hasStatus(
return typeof error === 'object' && error !== null && 'status' in error
}

function isRetryableErrorType(error: unknown): error is RetryableError {
if (!error) return false
if (error instanceof Error) return true
if (typeof error === 'object' && ('status' in error || 'message' in error)) return true
return false
}

/**
* Default retry condition for rate limiting errors
*/
export function isRetryableError(error: RetryableError): boolean {
if (!error) return false
export function isRetryableError(error: unknown): boolean {
if (!isRetryableErrorType(error)) return false

// Check for rate limiting status codes
if (
@@ -45,7 +52,7 @@ export function isRetryableError(error: RetryableError): boolean {
}

// Check for rate limiting in error messages
const errorMessage = error.message || error.toString()
const errorMessage = error instanceof Error ? error.message : String(error)
const rateLimitKeywords = [
'rate limit',
'rate_limit',
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
import { getBYOKKey } from '@/lib/api-key/byok'
import { env } from '@/lib/core/config/env'
import { isRetryableError, retryWithExponentialBackoff } from '@/lib/knowledge/documents/utils'
import { batchByTokenLimit, getTotalTokenCount } from '@/lib/tokenization'
import { batchByTokenLimit } from '@/lib/tokenization'

const logger = createLogger('EmbeddingUtils')
@@ -26,6 +26,20 @@ interface EmbeddingConfig {
modelName: string
}

interface EmbeddingResponseItem {
embedding: number[]
index: number
}

interface EmbeddingAPIResponse {
data: EmbeddingResponseItem[]
model: string
usage: {
prompt_tokens: number
total_tokens: number
}
}

async function getEmbeddingConfig(
embeddingModel = 'text-embedding-3-small',
workspaceId?: string | null
@@ -104,14 +118,14 @@ async function callEmbeddingAPI(inputs: string[], config: EmbeddingConfig): Prom
)
}

const data = await response.json()
return data.data.map((item: any) => item.embedding)
const data: EmbeddingAPIResponse = await response.json()
return data.data.map((item) => item.embedding)
},
{
maxRetries: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
retryCondition: (error: any) => {
retryCondition: (error: unknown) => {
if (error instanceof EmbeddingAPIError) {
return error.status === 429 || error.status >= 500
}
@@ -153,44 +167,27 @@ export async function generateEmbeddings(
): Promise<number[][]> {
const config = await getEmbeddingConfig(embeddingModel, workspaceId)

logger.info(
`Using ${config.useAzure ? 'Azure OpenAI' : 'OpenAI'} for embeddings generation (${texts.length} texts)`
)

const batches = batchByTokenLimit(texts, MAX_TOKENS_PER_REQUEST, embeddingModel)

logger.info(
`Split ${texts.length} texts into ${batches.length} batches (max ${MAX_TOKENS_PER_REQUEST} tokens per batch, ${MAX_CONCURRENT_BATCHES} concurrent)`
)

const batchResults = await processWithConcurrency(
batches,
MAX_CONCURRENT_BATCHES,
async (batch, i) => {
const batchTokenCount = getTotalTokenCount(batch, embeddingModel)

logger.info(
`Processing batch ${i + 1}/${batches.length}: ${batch.length} texts, ${batchTokenCount} tokens`
)

try {
const batchEmbeddings = await callEmbeddingAPI(batch, config)

logger.info(
`Generated ${batchEmbeddings.length} embeddings for batch ${i + 1}/${batches.length}`
)

return batchEmbeddings
return await callEmbeddingAPI(batch, config)
} catch (error) {
logger.error(`Failed to generate embeddings for batch ${i + 1}:`, error)
logger.error(`Failed to generate embeddings for batch ${i + 1}/${batches.length}:`, error)
throw error
}
}
)

const allEmbeddings = batchResults.flat()

logger.info(`Successfully generated ${allEmbeddings.length} embeddings total`)
const allEmbeddings: number[][] = []
for (const batch of batchResults) {
for (const emb of batch) {
allEmbeddings.push(emb)
}
}

return allEmbeddings
}
@@ -127,24 +127,6 @@ export function truncateToTokenLimit(
}
}

/**
* Get token count for multiple texts (for batching decisions)
* Returns array of token counts in same order as input
*/
export function getTokenCountsForBatch(
texts: string[],
modelName = 'text-embedding-3-small'
): number[] {
return texts.map((text) => getAccurateTokenCount(text, modelName))
}

/**
* Calculate total tokens across multiple texts
*/
export function getTotalTokenCount(texts: string[], modelName = 'text-embedding-3-small'): number {
return texts.reduce((total, text) => total + getAccurateTokenCount(text, modelName), 0)
}

/**
* Batch texts by token count to stay within API limits
* Returns array of batches where each batch's total tokens <= maxTokensPerBatch
@@ -12,8 +12,6 @@ export {
estimateOutputTokens,
estimateTokenCount,
getAccurateTokenCount,
getTokenCountsForBatch,
getTotalTokenCount,
truncateToTokenLimit,
} from '@/lib/tokenization/estimators'
export { processStreamingBlockLog, processStreamingBlockLogs } from '@/lib/tokenization/streaming'
@@ -127,6 +127,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",
@@ -17,7 +17,7 @@ export default defineConfig({
build: {
extensions: [
additionalPackages({
packages: ['unpdf'],
packages: ['unpdf', 'pdf-lib'],
}),
],
},
bun.lock (26 lines changed)
@@ -1,5 +1,6 @@
{
"lockfileVersion": 1,
"configVersion": 0,
"workspaces": {
"": {
"name": "simstudio",
@@ -11,7 +12,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2",
"turbo": "2.7.3",
},
},
"apps/docs": {
@@ -156,6 +157,7 @@
"onedollarstats": "0.0.10",
"openai": "^4.91.1",
"papaparse": "5.5.3",
"pdf-lib": "1.17.1",
"postgres": "^3.4.5",
"posthog-js": "1.268.9",
"posthog-node": "5.9.2",
@@ -912,6 +914,10 @@
"@orama/orama": ["@orama/orama@3.1.18", "", {}, "sha512-a61ljmRVVyG5MC/698C8/FfFDw5a8LOIvyOLW5fztgUXqUpc1jOfQzOitSCbge657OgXXThmY3Tk8fpiDb4UcA=="],

"@pdf-lib/standard-fonts": ["@pdf-lib/standard-fonts@1.0.0", "", { "dependencies": { "pako": "^1.0.6" } }, "sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA=="],

"@pdf-lib/upng": ["@pdf-lib/upng@1.0.1", "", { "dependencies": { "pako": "^1.0.10" } }, "sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ=="],

"@peculiar/asn1-android": ["@peculiar/asn1-android@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-cBRCKtYPF7vJGN76/yG8VbxRcHLPF3HnkoHhKOZeHpoVtbMYfY9ROKtH3DtYUY9m8uI1Mh47PRhHf2hSK3xcSQ=="],

"@peculiar/asn1-cms": ["@peculiar/asn1-cms@2.6.0", "", { "dependencies": { "@peculiar/asn1-schema": "^2.6.0", "@peculiar/asn1-x509": "^2.6.0", "@peculiar/asn1-x509-attr": "^2.6.0", "asn1js": "^3.0.6", "tslib": "^2.8.1" } }, "sha512-2uZqP+ggSncESeUF/9Su8rWqGclEfEiz1SyU02WX5fUONFfkjzS2Z/F1Li0ofSmf4JqYXIOdCAZqIXAIBAT1OA=="],
@@ -2864,6 +2870,8 @@
"pathval": ["pathval@2.0.1", "", {}, "sha512-//nshmD55c46FuFw26xV/xFAaB5HF9Xdap7HJBBnrKdAd6/GxDBaNA1870O79+9ueg61cZLSVc+OaFlfmObYVQ=="],

"pdf-lib": ["pdf-lib@1.17.1", "", { "dependencies": { "@pdf-lib/standard-fonts": "^1.0.0", "@pdf-lib/upng": "^1.0.1", "pako": "^1.0.11", "tslib": "^1.11.1" } }, "sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw=="],

"pdfjs-dist": ["pdfjs-dist@5.4.449", "", { "optionalDependencies": { "@napi-rs/canvas": "^0.1.81" } }, "sha512-CegnUaT0QwAyQMS+7o2POr4wWUNNe8VaKKlcuoRHeYo98cVnqPpwOXNSx6Trl6szH02JrRcsPgletV6GmF3LtQ=="],

"peberminta": ["peberminta@0.9.0", "", {}, "sha512-XIxfHpEuSJbITd1H3EeQwpcZbTLHc+VVr8ANI9t5sit565tsI4/xK3KWTUFE2e6QiangUkh3B0jihzmGnNrRsQ=="],
@@ -3362,19 +3370,19 @@
"tunnel-agent": ["tunnel-agent@0.6.0", "", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w=="],

"turbo": ["turbo@2.7.2", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.2", "turbo-darwin-arm64": "2.7.2", "turbo-linux-64": "2.7.2", "turbo-linux-arm64": "2.7.2", "turbo-windows-64": "2.7.2", "turbo-windows-arm64": "2.7.2" }, "bin": { "turbo": "bin/turbo" } }, "sha512-5JIA5aYBAJSAhrhbyag1ZuMSgUZnHtI+Sq3H8D3an4fL8PeF+L1yYvbEJg47akP1PFfATMf5ehkqFnxfkmuwZQ=="],
"turbo": ["turbo@2.7.3", "", { "optionalDependencies": { "turbo-darwin-64": "2.7.3", "turbo-darwin-arm64": "2.7.3", "turbo-linux-64": "2.7.3", "turbo-linux-arm64": "2.7.3", "turbo-windows-64": "2.7.3", "turbo-windows-arm64": "2.7.3" }, "bin": { "turbo": "bin/turbo" } }, "sha512-+HjKlP4OfYk+qzvWNETA3cUO5UuK6b5MSc2UJOKyvBceKucQoQGb2g7HlC2H1GHdkfKrk4YF1VPvROkhVZDDLQ=="],

"turbo-darwin-64": ["turbo-darwin-64@2.7.2", "", { "os": "darwin", "cpu": "x64" }, "sha512-dxY3X6ezcT5vm3coK6VGixbrhplbQMwgNsCsvZamS/+/6JiebqW9DKt4NwpgYXhDY2HdH00I7FWs3wkVuan4rA=="],
"turbo-darwin-64": ["turbo-darwin-64@2.7.3", "", { "os": "darwin", "cpu": "x64" }, "sha512-aZHhvRiRHXbJw1EcEAq4aws1hsVVUZ9DPuSFaq9VVFAKCup7niIEwc22glxb7240yYEr1vLafdQ2U294Vcwz+w=="],

"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.2", "", { "os": "darwin", "cpu": "arm64" }, "sha512-1bXmuwPLqNFt3mzrtYcVx1sdJ8UYb124Bf48nIgcpMCGZy3kDhgxNv1503kmuK/37OGOZbsWSQFU4I08feIuSg=="],
"turbo-darwin-arm64": ["turbo-darwin-arm64@2.7.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-CkVrHSq+Bnhl9sX2LQgqQYVfLTWC2gvI74C4758OmU0djfrssDKU9d4YQF0AYXXhIIRZipSXfxClQziIMD+EAg=="],

"turbo-linux-64": ["turbo-linux-64@2.7.2", "", { "os": "linux", "cpu": "x64" }, "sha512-kP+TiiMaiPugbRlv57VGLfcjFNsFbo8H64wMBCPV2270Or2TpDCBULMzZrvEsvWFjT3pBFvToYbdp8/Kw0jAQg=="],
"turbo-linux-64": ["turbo-linux-64@2.7.3", "", { "os": "linux", "cpu": "x64" }, "sha512-GqDsCNnzzr89kMaLGpRALyigUklzgxIrSy2pHZVXyifgczvYPnLglex78Aj3T2gu+T3trPPH2iJ+pWucVOCC2Q=="],

"turbo-linux-arm64": ["turbo-linux-arm64@2.7.2", "", { "os": "linux", "cpu": "arm64" }, "sha512-VDJwQ0+8zjAfbyY6boNaWfP6RIez4ypKHxwkuB6SrWbOSk+vxTyW5/hEjytTwK8w/TsbKVcMDyvpora8tEsRFw=="],
"turbo-linux-arm64": ["turbo-linux-arm64@2.7.3", "", { "os": "linux", "cpu": "arm64" }, "sha512-NdCDTfIcIo3dWjsiaAHlxu5gW61Ed/8maah1IAF/9E3EtX0aAHNiBMbuYLZaR4vRJ7BeVkYB6xKWRtdFLZ0y3g=="],

"turbo-windows-64": ["turbo-windows-64@2.7.2", "", { "os": "win32", "cpu": "x64" }, "sha512-rPjqQXVnI6A6oxgzNEE8DNb6Vdj2Wwyhfv3oDc+YM3U9P7CAcBIlKv/868mKl4vsBtz4ouWpTQNXG8vljgJO+w=="],
"turbo-windows-64": ["turbo-windows-64@2.7.3", "", { "os": "win32", "cpu": "x64" }, "sha512-7bVvO987daXGSJVYBoG8R4Q+csT1pKIgLJYZevXRQ0Hqw0Vv4mKme/TOjYXs9Qb1xMKh51Tb3bXKDbd8/4G08g=="],

"turbo-windows-arm64": ["turbo-windows-arm64@2.7.2", "", { "os": "win32", "cpu": "arm64" }, "sha512-tcnHvBhO515OheIFWdxA+qUvZzNqqcHbLVFc1+n+TJ1rrp8prYicQtbtmsiKgMvr/54jb9jOabU62URAobnB7g=="],
"turbo-windows-arm64": ["turbo-windows-arm64@2.7.3", "", { "os": "win32", "cpu": "arm64" }, "sha512-nTodweTbPmkvwMu/a55XvjMsPtuyUSC+sV7f/SR57K36rB2I0YG21qNETN+00LOTUW9B3omd8XkiXJkt4kx/cw=="],

"tweetnacl": ["tweetnacl@0.14.5", "", {}, "sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA=="],
@@ -4046,6 +4054,8 @@
"path-scurry/lru-cache": ["lru-cache@11.2.4", "", {}, "sha512-B5Y16Jr9LB9dHVkh6ZevG+vAbOsNOYCX+sXvFWFu7B3Iz5mijW3zdbMyhsh8ANd2mSWBYdJgnqi+mL7/LrOPYg=="],

"pdf-lib/tslib": ["tslib@1.14.1", "", {}, "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg=="],

"pino/thread-stream": ["thread-stream@3.1.0", "", { "dependencies": { "real-require": "^0.2.0" } }, "sha512-OqyPZ9u96VohAyMfJykzmivOrY2wfMSf3C5TtFJVgN+Hm6aj+voFhlK+kZEIv2FBh1X6Xp3DlnCOfEQ3B2J86A=="],

"pino-pretty/pino-abstract-transport": ["pino-abstract-transport@3.0.0", "", { "dependencies": { "split2": "^4.0.0" } }, "sha512-wlfUczU+n7Hy/Ha5j9a/gZNy7We5+cXp8YL+X+PG8S0KXxw7n/JXA3c46Y0zQznIJ83URJiwy7Lh56WLokNuxg=="],
@@ -41,7 +41,7 @@
"drizzle-kit": "^0.31.4",
"husky": "9.1.7",
"lint-staged": "16.0.0",
"turbo": "2.7.2"
"turbo": "2.7.3"
},
"lint-staged": {
"*.{js,jsx,ts,tsx,json,css,scss}": [