fix(kb): chunking config persistence (#3877)

* fix(kb): persist chunking config correctly

* fix kb config as source of truth (SoT)

* remove dead code

* fix doc req bodies

* add chunking-config defaults in async processing for legacy docs
This commit is contained in:
Vikhyath Mondreti
2026-03-31 19:16:23 -07:00
committed by GitHub
parent a79c8a75ce
commit b95a0491a0
14 changed files with 93 additions and 112 deletions

View File

@@ -457,11 +457,8 @@ describe('Knowledge Base Documents API Route', () => {
},
],
processingOptions: {
chunkSize: 1024,
minCharactersPerChunk: 100,
recipe: 'default',
lang: 'en',
chunkOverlap: 200,
},
}
@@ -533,11 +530,8 @@ describe('Knowledge Base Documents API Route', () => {
},
],
processingOptions: {
chunkSize: 50, // Invalid: too small
minCharactersPerChunk: 0, // Invalid: too small
recipe: 'default',
lang: 'en',
chunkOverlap: 1000, // Invalid: too large
},
}

View File

@@ -38,26 +38,14 @@ const CreateDocumentSchema = z.object({
documentTagsData: z.string().optional(),
})
/**
* Schema for bulk document creation with processing options
*
* Processing options units:
* - chunkSize: tokens (1 token ≈ 4 characters)
* - minCharactersPerChunk: characters
* - chunkOverlap: characters
*/
const BulkCreateDocumentsSchema = z.object({
documents: z.array(CreateDocumentSchema),
processingOptions: z.object({
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
chunkSize: z.number().min(100).max(4000),
/** Minimum chunk size in characters */
minCharactersPerChunk: z.number().min(1).max(2000),
recipe: z.string(),
lang: z.string(),
/** Overlap between chunks in characters */
chunkOverlap: z.number().min(0).max(500),
}),
processingOptions: z
.object({
recipe: z.string().optional(),
lang: z.string().optional(),
})
.optional(),
bulk: z.literal(true),
})
@@ -246,8 +234,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
knowledgeBaseId,
documentsCount: createdDocuments.length,
uploadType: 'bulk',
chunkSize: validatedData.processingOptions.chunkSize,
recipe: validatedData.processingOptions.recipe,
recipe: validatedData.processingOptions?.recipe,
})
} catch (_e) {
// Silently fail
@@ -256,7 +243,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
processDocumentsWithQueue(
createdDocuments,
knowledgeBaseId,
validatedData.processingOptions,
validatedData.processingOptions ?? {},
requestId
).catch((error: unknown) => {
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)

View File

@@ -25,13 +25,12 @@ const UpsertDocumentSchema = z.object({
fileSize: z.number().min(1, 'File size must be greater than 0'),
mimeType: z.string().min(1, 'MIME type is required'),
documentTagsData: z.string().optional(),
processingOptions: z.object({
chunkSize: z.number().min(100).max(4000),
minCharactersPerChunk: z.number().min(1).max(2000),
recipe: z.string(),
lang: z.string(),
chunkOverlap: z.number().min(0).max(500),
}),
processingOptions: z
.object({
recipe: z.string().optional(),
lang: z.string().optional(),
})
.optional(),
workflowId: z.string().optional(),
})
@@ -166,7 +165,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
processDocumentsWithQueue(
createdDocuments,
knowledgeBaseId,
validatedData.processingOptions,
validatedData.processingOptions ?? {},
requestId
).catch((error: unknown) => {
logger.error(`[${requestId}] Critical error in document processing pipeline:`, error)
@@ -178,8 +177,7 @@ export async function POST(req: NextRequest, { params }: { params: Promise<{ id:
knowledgeBaseId,
documentsCount: 1,
uploadType: 'single',
chunkSize: validatedData.processingOptions.chunkSize,
recipe: validatedData.processingOptions.recipe,
recipe: validatedData.processingOptions?.recipe,
})
} catch (_e) {
// Silently fail

View File

@@ -187,8 +187,6 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
requestId
)
const chunkingConfig = result.kb.chunkingConfig ?? { maxSize: 1024, minSize: 100, overlap: 200 }
const documentData: DocumentData = {
documentId: newDocument.id,
filename: file.name,
@@ -197,18 +195,7 @@ export async function POST(request: NextRequest, { params }: DocumentsRouteParam
mimeType: contentType,
}
processDocumentsWithQueue(
[documentData],
knowledgeBaseId,
{
chunkSize: chunkingConfig.maxSize,
minCharactersPerChunk: chunkingConfig.minSize,
chunkOverlap: chunkingConfig.overlap,
recipe: 'default',
lang: 'en',
},
requestId
).catch(() => {
processDocumentsWithQueue([documentData], knowledgeBaseId, {}, requestId).catch(() => {
// Processing errors are logged internally
})

View File

@@ -195,9 +195,6 @@ export function AddDocumentsModal({
try {
await uploadFiles([fileToRetry], knowledgeBaseId, {
chunkSize: chunkingConfig?.maxSize || 1024,
minCharactersPerChunk: chunkingConfig?.minSize || 1,
chunkOverlap: chunkingConfig?.overlap || 200,
recipe: 'default',
})
removeFile(index)
@@ -217,9 +214,6 @@ export function AddDocumentsModal({
try {
await uploadFiles(files, knowledgeBaseId, {
chunkSize: chunkingConfig?.maxSize || 1024,
minCharactersPerChunk: chunkingConfig?.minSize || 1,
chunkOverlap: chunkingConfig?.overlap || 200,
recipe: 'default',
})
logger.info(`Successfully uploaded ${files.length} files`)

View File

@@ -20,6 +20,7 @@ interface BaseCardProps {
createdAt?: string
updatedAt?: string
connectorTypes?: string[]
chunkingConfig?: { maxSize: number; minSize: number; overlap: number }
onUpdate?: (id: string, name: string, description: string) => Promise<void>
onDelete?: (id: string) => Promise<void>
}
@@ -78,6 +79,7 @@ export function BaseCard({
description,
updatedAt,
connectorTypes = [],
chunkingConfig,
onUpdate,
onDelete,
}: BaseCardProps) {
@@ -256,6 +258,7 @@ export function BaseCard({
knowledgeBaseId={id}
initialName={title}
initialDescription={description === 'No description provided' ? '' : description}
chunkingConfig={chunkingConfig}
onSave={handleSave}
/>
)}

View File

@@ -269,9 +269,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
if (files.length > 0) {
try {
const uploadedFiles = await uploadFiles(files, newKnowledgeBase.id, {
chunkSize: data.maxChunkSize,
minCharactersPerChunk: data.minChunkSize,
chunkOverlap: data.overlapSize,
recipe: 'default',
})
@@ -358,12 +355,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
<Label htmlFor='minChunkSize'>Min Chunk Size (characters)</Label>
<Input
id='minChunkSize'
type='number'
min={1}
max={2000}
step={1}
placeholder='100'
{...register('minChunkSize', { valueAsNumber: true })}
className={cn(errors.minChunkSize && 'border-[var(--text-error)]')}
autoComplete='off'
data-form-type='other'
name='min-chunk-size'
/>
</div>
@@ -371,12 +371,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
<Label htmlFor='maxChunkSize'>Max Chunk Size (tokens)</Label>
<Input
id='maxChunkSize'
type='number'
min={100}
max={4000}
step={1}
placeholder='1024'
{...register('maxChunkSize', { valueAsNumber: true })}
className={cn(errors.maxChunkSize && 'border-[var(--text-error)]')}
autoComplete='off'
data-form-type='other'
name='max-chunk-size'
/>
</div>
</div>
@@ -385,12 +388,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({
<Label htmlFor='overlapSize'>Overlap (tokens)</Label>
<Input
id='overlapSize'
type='number'
min={0}
max={500}
step={1}
placeholder='200'
{...register('overlapSize', { valueAsNumber: true })}
className={cn(errors.overlapSize && 'border-[var(--text-error)]')}
autoComplete='off'
data-form-type='other'
name='overlap-size'
/>
<p className='text-[var(--text-muted)] text-xs'>
1 token ≈ 4 characters. Max chunk size and overlap are in tokens.

View File

@@ -17,6 +17,7 @@ import {
Textarea,
} from '@/components/emcn'
import { cn } from '@/lib/core/utils/cn'
import type { ChunkingConfig } from '@/lib/knowledge/types'
const logger = createLogger('EditKnowledgeBaseModal')
@@ -26,6 +27,7 @@ interface EditKnowledgeBaseModalProps {
knowledgeBaseId: string
initialName: string
initialDescription: string
chunkingConfig?: ChunkingConfig
onSave: (id: string, name: string, description: string) => Promise<void>
}
@@ -49,6 +51,7 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
knowledgeBaseId,
initialName,
initialDescription,
chunkingConfig,
onSave,
}: EditKnowledgeBaseModalProps) {
const [isSubmitting, setIsSubmitting] = useState(false)
@@ -137,6 +140,47 @@ export const EditKnowledgeBaseModal = memo(function EditKnowledgeBaseModal({
</p>
)}
</div>
{chunkingConfig && (
<div className='flex flex-col gap-2'>
<Label>Chunking Configuration</Label>
<div className='grid grid-cols-3 gap-2'>
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
Max Size
</p>
<p className='font-medium text-[var(--text-primary)] text-sm'>
{chunkingConfig.maxSize.toLocaleString()}
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
tokens
</span>
</p>
</div>
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
Min Size
</p>
<p className='font-medium text-[var(--text-primary)] text-sm'>
{chunkingConfig.minSize.toLocaleString()}
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
chars
</span>
</p>
</div>
<div className='rounded-sm border border-[var(--border-1)] bg-[var(--surface-2)] px-2.5 py-2'>
<p className='text-[var(--text-tertiary)] text-[11px] leading-tight'>
Overlap
</p>
<p className='font-medium text-[var(--text-primary)] text-sm'>
{chunkingConfig.overlap.toLocaleString()}
<span className='ml-0.5 font-normal text-[var(--text-tertiary)] text-[11px]'>
tokens
</span>
</p>
</div>
</div>
</div>
)}
</div>
</ModalBody>

View File

@@ -46,9 +46,6 @@ export interface UploadError {
}
export interface ProcessingOptions {
chunkSize?: number
minCharactersPerChunk?: number
chunkOverlap?: number
recipe?: string
}
@@ -1011,10 +1008,7 @@ export function useKnowledgeUpload(options: UseKnowledgeUploadOptions = {}) {
...file,
})),
processingOptions: {
chunkSize: processingOptions.chunkSize || 1024,
minCharactersPerChunk: processingOptions.minCharactersPerChunk || 1,
chunkOverlap: processingOptions.chunkOverlap || 200,
recipe: processingOptions.recipe || 'default',
recipe: processingOptions.recipe ?? 'default',
lang: 'en',
},
bulk: true,

View File

@@ -602,6 +602,7 @@ export function Knowledge() {
knowledgeBaseId={activeKnowledgeBase.id}
initialName={activeKnowledgeBase.name}
initialDescription={activeKnowledgeBase.description || ''}
chunkingConfig={activeKnowledgeBase.chunkingConfig}
onSave={handleUpdateKnowledgeBase}
/>
)}

View File

@@ -15,11 +15,8 @@ export type DocumentProcessingPayload = {
mimeType: string
}
processingOptions: {
chunkSize?: number
minCharactersPerChunk?: number
recipe?: string
lang?: string
chunkOverlap?: number
}
requestId: string
}

View File

@@ -101,11 +101,8 @@ export interface DocumentData {
}
export interface ProcessingOptions {
chunkSize?: number
minCharactersPerChunk?: number
recipe?: string
lang?: string
chunkOverlap?: number
}
export interface DocumentJobData {
@@ -416,13 +413,7 @@ export async function processDocumentAsync(
fileSize: number
mimeType: string
},
processingOptions: {
chunkSize?: number
minCharactersPerChunk?: number
recipe?: string
lang?: string
chunkOverlap?: number
}
processingOptions: ProcessingOptions = {}
): Promise<void> {
const startTime = Date.now()
try {
@@ -456,7 +447,16 @@ export async function processDocumentAsync(
logger.info(`[${documentId}] Status updated to 'processing', starting document processor`)
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
const rawConfig = kb[0].chunkingConfig as {
maxSize?: number
minSize?: number
overlap?: number
} | null
const kbConfig = {
maxSize: rawConfig?.maxSize ?? 1024,
minSize: rawConfig?.minSize ?? 100,
overlap: rawConfig?.overlap ?? 200,
}
await withTimeout(
(async () => {
@@ -464,9 +464,9 @@ export async function processDocumentAsync(
docData.fileUrl,
docData.filename,
docData.mimeType,
processingOptions.chunkSize ?? kbConfig.maxSize,
processingOptions.chunkOverlap ?? kbConfig.overlap,
processingOptions.minCharactersPerChunk ?? kbConfig.minSize,
kbConfig.maxSize,
kbConfig.overlap,
kbConfig.minSize,
kb[0].userId,
kb[0].workspaceId
)
@@ -1573,16 +1573,6 @@ export async function retryDocumentProcessing(
},
requestId: string
): Promise<{ success: boolean; status: string; message: string }> {
const kb = await db
.select({
chunkingConfig: knowledgeBase.chunkingConfig,
})
.from(knowledgeBase)
.where(eq(knowledgeBase.id, knowledgeBaseId))
.limit(1)
const kbConfig = kb[0].chunkingConfig as { maxSize: number; minSize: number; overlap: number }
await db.transaction(async (tx) => {
await tx.delete(embedding).where(eq(embedding.documentId, documentId))
@@ -1600,14 +1590,6 @@ export async function retryDocumentProcessing(
.where(eq(document.id, documentId))
})
const processingOptions = {
chunkSize: kbConfig.maxSize,
minCharactersPerChunk: kbConfig.minSize,
recipe: 'default',
lang: 'en',
chunkOverlap: kbConfig.overlap,
}
await processDocumentsWithQueue(
[
{
@@ -1619,7 +1601,7 @@ export async function retryDocumentProcessing(
},
],
knowledgeBaseId,
processingOptions,
{},
requestId
)

View File

@@ -103,9 +103,6 @@ export const knowledgeCreateDocumentTool: ToolConfig<any, KnowledgeCreateDocumen
const requestBody = {
documents: documents,
processingOptions: {
chunkSize: 1024,
minCharactersPerChunk: 1,
chunkOverlap: 200,
recipe: 'default',
lang: 'en',
},

View File

@@ -108,9 +108,6 @@ export const knowledgeUpsertDocumentTool: ToolConfig<
mimeType,
...tagData,
processingOptions: {
chunkSize: 1024,
minCharactersPerChunk: 1,
chunkOverlap: 200,
recipe: 'default',
lang: 'en',
},