mirror of
https://github.com/simstudioai/sim.git
synced 2026-04-06 03:00:16 -04:00
fix(kb): chunking config persistence (#3877)
* fix(kb): persist chunking config correctly
* fix kb config as sot
* remove dead code
* fix doc req bodies
* add defaults for async for legacy docs
This commit is contained in:
committed by
GitHub
parent
a79c8a75ce
commit
b95a0491a0
@@ -457,11 +457,8 @@ describe('Knowledge Base Documents API Route', () => {
|
||||
},
|
||||
],
|
||||
processingOptions: {
|
||||
chunkSize: 1024,
|
||||
minCharactersPerChunk: 100,
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
chunkOverlap: 200,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -533,11 +530,8 @@ describe('Knowledge Base Documents API Route', () => {
|
||||
},
|
||||
],
|
||||
processingOptions: {
|
||||
chunkSize: 50, // Invalid: too small
|
||||
minCharactersPerChunk: 0, // Invalid: too small
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
chunkOverlap: 1000, // Invalid: too large
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -38,26 +38,14 @@ const CreateDocumentSchema = z.object({
|
||||
documentTagsData: z.string().optional(),
|
||||
})
|
||||
|
||||
/**
|
||||
* Schema for bulk document creation with processing options
|
||||
*
|
||||
* Processing options units:
|
||||
* - chunkSize: tokens (1 token ≈ 4 characters)
|
||||
* - minCharactersPerChunk: characters
|
||||
* - chunkOverlap: characters
|
||||
*/
|
||||
const BulkCreateDocumentsSchema = z.object({
|
||||
documents: z.array(CreateDocumentSchema),
|
||||
processingOptions: z.object({
|
||||
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
|
||||
chunkSize: z.number().min(100).max(4000),
|
||||
/** Minimum chunk size in characters */
|
||||
minCharactersPerChunk: z.number().min(1).max(2000),
|
||||
recipe: z.string(),
|
||||
lang: z.string(),
|
||||
/** Overlap between chunks in characters */
|
||||
chunkOverlap: z.number().min(0).max(500),
|
||||
}),
|
||||
processingOptions: z
|
||||
.object({
|
||||
recipe: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
bulk: z.literal(true),
|
||||
})
|
||||
|
||||
@@ -246,8 +234,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
|
||||
knowledgeBaseId,
|
||||
documentsCount: createdDocuments.length,
|
||||
uploadType: 'bulk',
|
||||
chunkSize: validatedData.processingOptions.chunkSize,
|
||||
recipe: validatedData.processingOptions.recipe,
|
||||
recipe: validatedData.processingOptions?.recipe,
|
||||
})
|
||||
} catch (_e) {
|
||||
// Silently fail
|
||||
@@ -256,7 +243,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
|
||||
processDocumentsWithQueue(
|
||||
createdDocuments,
|
||||
knowledgeBaseId,
|
||||
validatedData.processingOptions,
|
||||
validatedData.processingOptions ?? {},
|
||||
requestId
|
||||
).catch((error: unknown) => {
|
||||
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
|
||||
|
||||
@@ -25,13 +25,12 @@ const UpsertDocumentSchema = z.object({
|
||||
fileSize: z.number().min(1, 'File size must be greater than 0'),
|
||||
mimeType: z.string().min(1, 'MIME type is required'),
|
||||
documentTagsData: z.string().optional(),
|
||||
processingOptions: z.object({
|
||||
chunkSize: z.number().min(100).max(4000),
|
||||
minCharactersPerChunk: z.number().min(1).max(2000),
|
||||
recipe: z.string(),
|
||||
lang: z.string(),
|
||||
chunkOverlap: z.number().min(0).max(500),
|
||||
}),
|
||||
processingOptions: z
|
||||
.object({
|
||||
recipe: z.string().optional(),
|
||||
lang: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
workflowId: z.string().optional(),
|
||||
})
|
||||
|
||||
@@ -166,7 +165,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
|
||||
processDocumentsWithQueue(
|
||||
createdDocuments,
|
||||
knowledgeBaseId,
|
||||
validatedData.processingOptions,
|
||||
validatedData.processingOptions ?? {},
|
||||
requestId
|
||||
).catch((error: unknown) => {
|
||||
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
|
||||
@@ -178,8 +177,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
|
||||
knowledgeBaseId,
|
||||
documentsCount: 1,
|
||||
uploadType: 'single',
|
||||
chunkSize: validatedData.processingOptions.chunkSize,
|
||||
recipe: validatedData.processingOptions.recipe,
|
||||
recipe: validatedData.processingOptions?.recipe,
|
||||
})
|
||||
} catch (_e) {
|
||||
// Silently fail
|
||||
|
||||
@@ -187,8 +187,6 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
|
||||
requestId
|
||||
)
|
||||
|
||||
const chunkingConfig = result.kb.chunkingConfig ?? { maxSize: 1024, minSize: 100, overlap: 200 }
|
||||
|
||||
const documentData: DocumentData = {
|
||||
documentId: newDocument.id,
|
||||
filename: file.name,
|
||||
@@ -197,18 +195,7 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
|
||||
mimeType: contentType,
|
||||
}
|
||||
|
||||
processDocumentsWithQueue(
|
||||
[documentData],
|
||||
knowledgeBaseId,
|
||||
{
|
||||
chunkSize: chunkingConfig.maxSize,
|
||||
minCharactersPerChunk: chunkingConfig.minSize,
|
||||
chunkOverlap: chunkingConfig.overlap,
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
},
|
||||
requestId
|
||||
).catch(() => {
|
||||
processDocumentsWithQueue([documentData], knowledgeBaseId, {}, requestId).catch(() => {
|
||||
// Processing errors are logged internally
|
||||
})
|
||||
|
||||
|
||||
@@ -195,9 +195,6 @@ export function AddDocumentsModal({
|
||||
|
||||
try {
|
||||
await uploadFiles([fileToRetry], knowledgeBaseId, {
|
||||
chunkSize: chunkingConfig?.maxSize || 1024,
|
||||
minCharactersPerChunk: chunkingConfig?.minSize || 1,
|
||||
chunkOverlap: chunkingConfig?.overlap || 200,
|
||||
recipe: 'default',
|
||||
})
|
||||
removeFile(index)
|
||||
@@ -217,9 +214,6 @@ export function AddDocumentsModal({
|
||||
|
||||
try {
|
||||
await uploadFiles(files, knowledgeBaseId, {
|
||||
chunkSize: chunkingConfig?.maxSize || 1024,
|
||||
minCharactersPerChunk: chunkingConfig?.minSize || 1,
|
||||
chunkOverlap: chunkingConfig?.overlap || 200,
|
||||
recipe: 'default',
|
||||
})
|
||||
logger.info(`Successfully uploaded ${files.length} files`)
|
||||
|
||||
@@ -20,6 +20,7 @@ interface BaseCardProps {
|
||||
createdAt?: string
|
||||
updatedAt?: string
|
||||
connectorTypes?: string[]
|
||||
chunkingConfig?: { maxSize: number; minSize: number; overlap: number }
|
||||
onUpdate?: (id: string, name: string, description: string) => Promise<void>
|
||||
onDelete?: (id: string) => Promise<void>
|
||||
}
|
||||
@@ -78,6 +79,7 @@ export function BaseCard({
|
||||
description,
|
||||
updatedAt,
|
||||
connectorTypes = [],
|
||||
chunkingConfig,
|
||||
onUpdate,
|
||||
onDelete,
|
||||
}: BaseCardProps) {
|
||||
@@ -256,6 +258,7 @@ export function BaseCard({
|
||||
knowledgeBaseId={id}
|
||||
initialName={title}
|
||||
initialDescription={description === 'No description provided' ? '' : description}
|
||||
chunkingConfig={chunkingConfig}
|
||||
onSave={handleSave}
|
||||
/>
|
||||
)}
|
||||
|
||||
@@ -269,9 +269,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
|
||||
if (files.length > 0) {
|
||||
try {
|
||||
const uploadedFiles = await uploadFiles(files, newKnowledgeBase.id, {
|
||||
chunkSize: data.maxChunkSize,
|
||||
minCharactersPerChunk: data.minChunkSize,
|
||||
chunkOverlap: data.overlapSize,
|
||||
recipe: 'default',
|
||||
})
|
||||
|
||||
@@ -358,12 +355,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
|
||||
<Label htmlFor='minChunkSize'>Min Chunk Size (characters)</Label>
|
||||
<Input
|
||||
id='minChunkSize'
|
||||
type='number'
|
||||
min={1}
|
||||
max={2000}
|
||||
step={1}
|
||||
placeholder='100'
|
||||
{...register('minChunkSize', { valueAsNumber: true })}
|
||||
className={cn(errors.minChunkSize && 'border-[var(--text-error)]')}
|
||||
autoComplete='off'
|
||||
data-form-type='other'
|
||||
name='min-chunk-size'
|
||||
/>
|
||||
</div>
|
||||
|
||||
@@ -371,12 +371,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
|
||||
<Label htmlFor='maxChunkSize'>Max Chunk Size (tokens)</Label>
|
||||
<Input
|
||||
id='maxChunkSize'
|
||||
type='number'
|
||||
min={100}
|
||||
max={4000}
|
||||
step={1}
|
||||
placeholder='1024'
|
||||
{...register('maxChunkSize', { valueAsNumber: true })}
|
||||
className={cn(errors.maxChunkSize && 'border-[var(--text-error)]')}
|
||||
autoComplete='off'
|
||||
data-form-type='other'
|
||||
name='max-chunk-size'
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
@@ -385,12 +388,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
|
||||
<Label htmlFor='overlapSize'>Overlap (tokens)</Label>
|
||||
<Input
|
||||
id='overlapSize'
|
||||
type='number'
|
||||
min={0}
|
||||
max={500}
|
||||
step={1}
|
||||
placeholder='200'
|
||||
{...register('overlapSize', { valueAsNumber: true })}
|
||||
className={cn(errors.overlapSize && 'border-[var(--text-error)]')}
|
||||
autoComplete='off'
|
||||
data-form-type='other'
|
||||
name='overlap-size'
|
||||
/>
|
||||
<p className='text-[var(--text-muted)] text-xs'>
|
||||
1 token ≈ 4 characters. Max chunk size and overlap are in tokens.
|
||||
|
||||
@@ -17,6 +17,7 @@ import {
|
||||
Textarea,
|
||||
} from '@/components/emcn'
|
||||
import { cn } from '@/lib/core/utils/cn'
|
||||
import type { ChunkingConfig } from '@/lib/knowledge/types'
|
||||
|
||||
const logger = createLogger('EditKnowledgeBaseModal')
|
||||
|
||||
@@ -26,6 +27,7 @@ interface EditKnowledgeBaseModalProps {
|
||||
knowledgeBaseId: string
|
||||
initialName: string
|
||||
initialDescription: string
|
||||
chunkingConfig?: ChunkingConfig
|
||||
onSave: (id: string, name: string, description: string) => Promise<void>
|
||||
}
|
||||
|
||||
@@ -49,6 +51,7 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
|
||||
knowledgeBaseId,
|
||||
initialName,
|
||||
initialDescription,
|
||||
chunkingConfig,
|
||||
onSave,
|
||||
}: EditKnowledgeBaseModalProps) {
|
||||
const [isSubmitting, setIsSubmitting] = useState(false)
|
||||
@@ -137,6 +140,47 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{chunkingConfig && (
|
||||
<div className='flex flex-col gap-2'>
|
||||
<Label>Chunking Configuration</Label>
|
||||
<div className='grid grid-cols-3 gap-2'>
|
||||
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
|
||||
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
|
||||
Max Size
|
||||
</p>
|
||||
<p className='font-medium text-[var(--text-primary)] text-sm'>
|
||||
{chunkingConfig.maxSize.toLocaleString()}
|
||||
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
|
||||
tokens
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
|
||||
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
|
||||
Min Size
|
||||
</p>
|
||||
<p className='font-medium text-[var(--text-primary)] text-sm'>
|
||||
{chunkingConfig.minSize.toLocaleString()}
|
||||
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
|
||||
chars
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
|
||||
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
|
||||
Overlap
|
||||
</p>
|
||||
<p className='font-medium text-[var(--text-primary)] text-sm'>
|
||||
{chunkingConfig.overlap.toLocaleString()}
|
||||
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
|
||||
tokens
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</ModalBody>
|
||||
|
||||
|
||||
@@ -46,9 +46,6 @@ export interface UploadError {
|
||||
}
|
||||
|
||||
export interface ProcessingOptions {
|
||||
chunkSize?: number
|
||||
minCharactersPerChunk?: number
|
||||
chunkOverlap?: number
|
||||
recipe?: string
|
||||
}
|
||||
|
||||
@@ -1011,10 +1008,7 @@ export function useKnowledgeUpload(options: UseKnowledgeUploadOptions = {}) {
|
||||
...file,
|
||||
})),
|
||||
processingOptions: {
|
||||
chunkSize: processingOptions.chunkSize || 1024,
|
||||
minCharactersPerChunk: processingOptions.minCharactersPerChunk || 1,
|
||||
chunkOverlap: processingOptions.chunkOverlap || 200,
|
||||
recipe: processingOptions.recipe || 'default',
|
||||
recipe: processingOptions.recipe ?? 'default',
|
||||
lang: 'en',
|
||||
},
|
||||
bulk: true,
|
||||
|
||||
@@ -602,6 +602,7 @@ export function Knowledge() {
|
||||
knowledgeBaseId={activeKnowledgeBase.id}
|
||||
initialName={activeKnowledgeBase.name}
|
||||
initialDescription={activeKnowledgeBase.description || ''}
|
||||
chunkingConfig={activeKnowledgeBase.chunkingConfig}
|
||||
onSave={handleUpdateKnowledgeBase}
|
||||
/>
|
||||
)}
|
||||
|
||||
@@ -15,11 +15,8 @@ export type DocumentProcessingPayload = {
|
||||
mimeType: string
|
||||
}
|
||||
processingOptions: {
|
||||
chunkSize?: number
|
||||
minCharactersPerChunk?: number
|
||||
recipe?: string
|
||||
lang?: string
|
||||
chunkOverlap?: number
|
||||
}
|
||||
requestId: string
|
||||
}
|
||||
|
||||
@@ -101,11 +101,8 @@ export interface DocumentData {
|
||||
}
|
||||
|
||||
export interface ProcessingOptions {
|
||||
chunkSize?: number
|
||||
minCharactersPerChunk?: number
|
||||
recipe?: string
|
||||
lang?: string
|
||||
chunkOverlap?: number
|
||||
}
|
||||
|
||||
export interface DocumentJobData {
|
||||
@@ -416,13 +413,7 @@ export async function processDocumentAsync(
|
||||
fileSize: number
|
||||
mimeType: string
|
||||
},
|
||||
processingOptions: {
|
||||
chunkSize?: number
|
||||
minCharactersPerChunk?: number
|
||||
recipe?: string
|
||||
lang?: string
|
||||
chunkOverlap?: number
|
||||
}
|
||||
processingOptions: ProcessingOptions = {}
|
||||
): Promise<void> {
|
||||
const startTime = Date.now()
|
||||
try {
|
||||
@@ -456,7 +447,16 @@ export async function processDocumentAsync(
|
||||
|
||||
logger.info(`[${documentId}] Status updated to 'processing', starting document processor`)
|
||||
|
||||
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
|
||||
const rawConfig = kb[0].chunkingConfig as {
|
||||
maxSize?: number
|
||||
minSize?: number
|
||||
overlap?: number
|
||||
} | null
|
||||
const kbConfig = {
|
||||
maxSize: rawConfig?.maxSize ?? 1024,
|
||||
minSize: rawConfig?.minSize ?? 100,
|
||||
overlap: rawConfig?.overlap ?? 200,
|
||||
}
|
||||
|
||||
await withTimeout(
|
||||
(async () => {
|
||||
@@ -464,9 +464,9 @@ export async function processDocumentAsync(
|
||||
docData.fileUrl,
|
||||
docData.filename,
|
||||
docData.mimeType,
|
||||
processingOptions.chunkSize ?? kbConfig.maxSize,
|
||||
processingOptions.chunkOverlap ?? kbConfig.overlap,
|
||||
processingOptions.minCharactersPerChunk ?? kbConfig.minSize,
|
||||
kbConfig.maxSize,
|
||||
kbConfig.overlap,
|
||||
kbConfig.minSize,
|
||||
kb[0].userId,
|
||||
kb[0].workspaceId
|
||||
)
|
||||
@@ -1573,16 +1573,6 @@ export async function retryDocumentProcessing(
|
||||
},
|
||||
requestId: string
|
||||
): Promise<{ success: boolean; status: string; message: string }> {
|
||||
const kb = await db
|
||||
.select({
|
||||
chunkingConfig: knowledgeBase.chunkingConfig,
|
||||
})
|
||||
.from(knowledgeBase)
|
||||
.where(eq(knowledgeBase.id, knowledgeBaseId))
|
||||
.limit(1)
|
||||
|
||||
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
|
||||
|
||||
await db.transaction(async (tx) => {
|
||||
await tx.delete(embedding).where(eq(embedding.documentId, documentId))
|
||||
|
||||
@@ -1600,14 +1590,6 @@ export async function retryDocumentProcessing(
|
||||
.where(eq(document.id, documentId))
|
||||
})
|
||||
|
||||
const processingOptions = {
|
||||
chunkSize: kbConfig.maxSize,
|
||||
minCharactersPerChunk: kbConfig.minSize,
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
chunkOverlap: kbConfig.overlap,
|
||||
}
|
||||
|
||||
await processDocumentsWithQueue(
|
||||
[
|
||||
{
|
||||
@@ -1619,7 +1601,7 @@ export async function retryDocumentProcessing(
|
||||
},
|
||||
],
|
||||
knowledgeBaseId,
|
||||
processingOptions,
|
||||
{},
|
||||
requestId
|
||||
)
|
||||
|
||||
|
||||
@@ -103,9 +103,6 @@ export const knowledgeCreateDocumentTool: ToolConfig<any, KnowledgeCreateDocumen
|
||||
const requestBody = {
|
||||
documents: documents,
|
||||
processingOptions: {
|
||||
chunkSize: 1024,
|
||||
minCharactersPerChunk: 1,
|
||||
chunkOverlap: 200,
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
},
|
||||
|
||||
@@ -108,9 +108,6 @@ export const knowledgeUpsertDocumentTool: ToolConfig<
|
||||
mimeType,
|
||||
...tagData,
|
||||
processingOptions: {
|
||||
chunkSize: 1024,
|
||||
minCharactersPerChunk: 1,
|
||||
chunkOverlap: 200,
|
||||
recipe: 'default',
|
||||
lang: 'en',
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user