improvement(kb): improve chunkers, respect user-specified chunk configurations, added tests (#2539)

* improvement(kb): improve chunkers, respect user-specified chunk configurations, added tests

* ack PR comments

* updated docs

* cleanup
This commit is contained in:
Waleed
2025-12-22 20:47:29 -08:00
committed by GitHub
parent e0d96e2126
commit 37443a7b77
20 changed files with 583 additions and 219 deletions

View File

@@ -19,10 +19,10 @@ interface ProcessingOptions {
baseUrl?: string
/** Chunk size in tokens */
chunkSize?: number
/** Minimum chunk size */
minChunkSize?: number
/** Overlap between chunks */
overlap?: number
/** Minimum chunk size in characters */
minCharactersPerChunk?: number
/** Overlap between chunks in tokens */
chunkOverlap?: number
/** Dry run - only display results, don't save to DB */
dryRun?: boolean
/** Verbose output */
@@ -37,8 +37,8 @@ async function processDocs(options: ProcessingOptions = {}) {
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs/en'),
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
chunkSize: options.chunkSize || 1024,
minChunkSize: options.minChunkSize || 100,
overlap: options.overlap || 200,
minCharactersPerChunk: options.minCharactersPerChunk || 100,
chunkOverlap: options.chunkOverlap || 200,
clearExisting: options.clearExisting ?? false,
dryRun: options.dryRun ?? false,
verbose: options.verbose ?? false,
@@ -59,8 +59,8 @@ async function processDocs(options: ProcessingOptions = {}) {
// Initialize the chunker
const chunker = new DocsChunker({
chunkSize: config.chunkSize,
minChunkSize: config.minChunkSize,
overlap: config.overlap,
minCharactersPerChunk: config.minCharactersPerChunk,
chunkOverlap: config.chunkOverlap,
baseUrl: config.baseUrl,
})