feat(kb): added json/yaml parser+chunker, added dedicated csv chunker (#1539)

* feat(kb): added json/yaml parser+chunker, added dedicated csv chunker

* ack PR comments

* improved kb upload
This commit is contained in:
Waleed
2025-10-04 14:59:21 -07:00
committed by GitHub
parent 0e838940f1
commit 86ed32ea10
27 changed files with 1794 additions and 857 deletions

View File

@@ -1,98 +0,0 @@
#!/usr/bin/env bun
import path from 'path'
import { DocsChunker } from '@/lib/knowledge/documents/docs-chunker'
import type { DocChunk } from '@/lib/knowledge/documents/types'
import { createLogger } from '@/lib/logs/console/logger'
const logger = createLogger('ChunkDocsScript')
/**
* Script to chunk all .mdx files in the docs directory
*/
async function main() {
try {
// Initialize the docs chunker
const chunker = new DocsChunker({
chunkSize: 1024,
minChunkSize: 100,
overlap: 200,
baseUrl: 'https://docs.sim.ai',
})
// Path to the docs content directory
const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
logger.info(`Processing docs from: ${docsPath}`)
// Process all .mdx files
const chunks = await chunker.chunkAllDocs(docsPath)
logger.info(`\n=== CHUNKING RESULTS ===`)
logger.info(`Total chunks: ${chunks.length}`)
// Group chunks by document
const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = []
}
acc[chunk.sourceDocument].push(chunk)
return acc
}, {})
// Display summary
logger.info(`\n=== DOCUMENT SUMMARY ===`)
for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
logger.info(`${doc}: ${docChunks.length} chunks`)
}
// Display a few sample chunks
logger.info(`\n=== SAMPLE CHUNKS ===`)
chunks.slice(0, 3).forEach((chunk, index) => {
logger.info(`\nChunk ${index + 1}:`)
logger.info(` Source: ${chunk.sourceDocument}`)
logger.info(` Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
logger.info(` Link: ${chunk.headerLink}`)
logger.info(` Tokens: ${chunk.tokenCount}`)
logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
logger.info(
` Embedding Preview: [${chunk.embedding
.slice(0, 5)
.map((n) => n.toFixed(4))
.join(', ')}...]`
)
logger.info(` Text Preview: ${chunk.text.slice(0, 100)}...`)
})
// Calculate total token count
const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
const chunksWithEmbeddings = chunks.filter((chunk) => chunk.embedding.length > 0).length
logger.info(`\n=== STATISTICS ===`)
logger.info(`Total tokens: ${totalTokens}`)
logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
logger.info(`Chunks with embeddings: ${chunksWithEmbeddings}/${chunks.length}`)
if (chunks.length > 0 && chunks[0].embedding.length > 0) {
logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
}
const headerLevels = chunks.reduce<Record<number, number>>((acc, chunk) => {
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
return acc
}, {})
logger.info(`Header level distribution:`)
Object.entries(headerLevels)
.sort(([a], [b]) => Number(a) - Number(b))
.forEach(([level, count]) => {
logger.info(` H${level}: ${count} chunks`)
})
} catch (error) {
logger.error('Error processing docs:', error)
process.exit(1)
}
}
// Run the script
main().catch(console.error)

View File

@@ -1,215 +0,0 @@
#!/usr/bin/env bun
import path from 'path'
import { db } from '@sim/db'
import { docsEmbeddings } from '@sim/db/schema'
import { sql } from 'drizzle-orm'
import { isDev } from '@/lib/environment'
import { DocsChunker } from '@/lib/knowledge/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console/logger'
const logger = createLogger('ProcessDocsEmbeddings')
interface ProcessingOptions {
/** Clear existing docs embeddings before processing */
clearExisting?: boolean
/** Path to docs directory */
docsPath?: string
/** Base URL for generating links */
baseUrl?: string
/** Chunk size in tokens */
chunkSize?: number
/** Minimum chunk size in tokens */
minChunkSize?: number
/** Overlap between chunks in tokens */
overlap?: number
}
/**
* Production script to process documentation and save embeddings to database
*/
async function processDocsEmbeddings(options: ProcessingOptions = {}) {
const startTime = Date.now()
let processedChunks = 0
let failedChunks = 0
try {
// Configuration
const config = {
clearExisting: options.clearExisting ?? false,
docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs/en'),
baseUrl: options.baseUrl ?? (isDev ? 'http://localhost:3001' : 'https://docs.sim.ai'),
chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
minChunkSize: options.minChunkSize ?? 100,
overlap: options.overlap ?? 50,
}
logger.info('🚀 Starting docs embedding processing...')
logger.info(`Configuration:`, {
docsPath: config.docsPath,
baseUrl: config.baseUrl,
chunkSize: config.chunkSize,
clearExisting: config.clearExisting,
})
const chunker = new DocsChunker({
chunkSize: config.chunkSize,
minChunkSize: config.minChunkSize,
overlap: config.overlap,
baseUrl: config.baseUrl,
})
logger.info(`📚 Processing docs from: ${config.docsPath}`)
const chunks = await chunker.chunkAllDocs(config.docsPath)
if (chunks.length === 0) {
logger.warn('⚠️ No chunks generated from docs')
return { success: false, processedChunks: 0, failedChunks: 0 }
}
logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)
if (config.clearExisting) {
logger.info('🗑️ Clearing existing docs embeddings...')
try {
const deleteResult = await db.delete(docsEmbeddings)
logger.info(`✅ Successfully deleted existing embeddings`)
} catch (error) {
logger.error('❌ Failed to delete existing embeddings:', error)
throw new Error('Failed to clear existing embeddings')
}
}
const batchSize = 10
logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize)
try {
const batchData = batch.map((chunk) => ({
chunkText: chunk.text,
sourceDocument: chunk.sourceDocument,
sourceLink: chunk.headerLink,
headerText: chunk.headerText,
headerLevel: chunk.headerLevel,
tokenCount: chunk.tokenCount,
embedding: chunk.embedding,
embeddingModel: chunk.embeddingModel,
metadata: chunk.metadata,
}))
await db.insert(docsEmbeddings).values(batchData)
processedChunks += batch.length
if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
logger.info(
` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
)
}
} catch (error) {
logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
failedChunks += batch.length
}
}
const savedCount = await db
.select({ count: sql<number>`count(*)` })
.from(docsEmbeddings)
.then((result) => result[0]?.count || 0)
const duration = Date.now() - startTime
logger.info(`✅ Processing complete!`)
logger.info(`📊 Results:`)
logger.info(` • Total chunks processed: ${chunks.length}`)
logger.info(` • Successfully saved: ${processedChunks}`)
logger.info(` • Failed: ${failedChunks}`)
logger.info(` • Database total: ${savedCount}`)
logger.info(` • Duration: ${Math.round(duration / 1000)}s`)
const documentStats = chunks.reduce(
(acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
}
acc[chunk.sourceDocument].chunks++
acc[chunk.sourceDocument].tokens += chunk.tokenCount
return acc
},
{} as Record<string, { chunks: number; tokens: number }>
)
logger.info(`📋 Document breakdown:`)
Object.entries(documentStats)
.sort(([, a], [, b]) => b.chunks - a.chunks)
.slice(0, 10)
.forEach(([doc, stats]) => {
logger.info(`${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
})
if (Object.keys(documentStats).length > 10) {
logger.info(` • ... and ${Object.keys(documentStats).length - 10} more documents`)
}
return {
success: failedChunks === 0,
processedChunks,
failedChunks,
totalChunks: chunks.length,
databaseCount: savedCount,
duration,
}
} catch (error) {
logger.error('💥 Fatal error during processing:', error)
return {
success: false,
processedChunks,
failedChunks,
error: error instanceof Error ? error.message : 'Unknown error',
}
}
}
/**
* Main function - handle command line arguments
*/
async function main() {
const args = process.argv.slice(2)
const options: ProcessingOptions = {}
if (args.includes('--clear')) {
options.clearExisting = true
}
if (args.includes('--help') || args.includes('-h')) {
console.log(`
Usage: bun run scripts/process-docs-embeddings.ts [options]
Options:
--clear Clear existing docs embeddings before processing
--help, -h Show this help message
Examples:
bun run scripts/process-docs-embeddings.ts
bun run scripts/process-docs-embeddings.ts --clear
`)
process.exit(0)
}
const result = await processDocsEmbeddings(options)
if (!result.success) {
process.exit(1)
}
}
if (import.meta.url.includes('process-docs-embeddings.ts')) {
main().catch((error) => {
logger.error('Script failed:', error)
process.exit(1)
})
}
export { processDocsEmbeddings }

View File

@@ -0,0 +1,256 @@
#!/usr/bin/env bun
import path from 'path'
import { db } from '@sim/db'
import { docsEmbeddings } from '@sim/db/schema'
import { sql } from 'drizzle-orm'
import { type DocChunk, DocsChunker } from '@/lib/chunkers'
import { isDev } from '@/lib/environment'
import { createLogger } from '@/lib/logs/console/logger'
const logger = createLogger('ProcessDocs')
interface ProcessingOptions {
/** Clear existing docs embeddings before processing */
clearExisting?: boolean
/** Path to docs directory */
docsPath?: string
/** Base URL for generating links */
baseUrl?: string
/** Chunk size in tokens */
chunkSize?: number
/** Minimum chunk size */
minChunkSize?: number
/** Overlap between chunks */
overlap?: number
/** Dry run - only display results, don't save to DB */
dryRun?: boolean
/** Verbose output */
verbose?: boolean
}
/**
* Process documentation files and optionally save embeddings to database
*/
async function processDocs(options: ProcessingOptions = {}) {
const config = {
docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
chunkSize: options.chunkSize || 1024,
minChunkSize: options.minChunkSize || 100,
overlap: options.overlap || 200,
clearExisting: options.clearExisting ?? false,
dryRun: options.dryRun ?? false,
verbose: options.verbose ?? false,
}
let processedChunks = 0
let failedChunks = 0
try {
logger.info('🚀 Starting docs processing with config:', {
docsPath: config.docsPath,
baseUrl: config.baseUrl,
chunkSize: config.chunkSize,
clearExisting: config.clearExisting,
dryRun: config.dryRun,
})
// Initialize the chunker
const chunker = new DocsChunker({
chunkSize: config.chunkSize,
minChunkSize: config.minChunkSize,
overlap: config.overlap,
baseUrl: config.baseUrl,
})
// Process all .mdx files
logger.info(`📚 Processing docs from: ${config.docsPath}`)
const chunks = await chunker.chunkAllDocs(config.docsPath)
if (chunks.length === 0) {
logger.warn('⚠️ No chunks generated from docs')
return { success: false, processedChunks: 0, failedChunks: 0 }
}
logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)
// Group chunks by document for summary
const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = []
}
acc[chunk.sourceDocument].push(chunk)
return acc
}, {})
// Display summary
logger.info(`\n=== DOCUMENT SUMMARY ===`)
for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
logger.info(`${doc}: ${docChunks.length} chunks`)
}
// Display sample chunks in verbose or dry-run mode
if (config.verbose || config.dryRun) {
logger.info(`\n=== SAMPLE CHUNKS ===`)
chunks.slice(0, 3).forEach((chunk, index) => {
logger.info(`\nChunk ${index + 1}:`)
logger.info(` Source: ${chunk.sourceDocument}`)
logger.info(` Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
logger.info(` Link: ${chunk.headerLink}`)
logger.info(` Tokens: ${chunk.tokenCount}`)
logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
if (config.verbose) {
logger.info(` Text Preview: ${chunk.text.substring(0, 200)}...`)
}
})
}
// If dry run, stop here
if (config.dryRun) {
logger.info('\n✅ Dry run complete - no data saved to database')
return { success: true, processedChunks: chunks.length, failedChunks: 0 }
}
// Clear existing embeddings if requested
if (config.clearExisting) {
logger.info('🗑️ Clearing existing docs embeddings...')
try {
await db.delete(docsEmbeddings)
logger.info(`✅ Successfully deleted existing embeddings`)
} catch (error) {
logger.error('❌ Failed to delete existing embeddings:', error)
throw new Error('Failed to clear existing embeddings')
}
}
// Save chunks to database in batches
const batchSize = 10
logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize)
try {
const batchData = batch.map((chunk) => ({
chunkText: chunk.text,
sourceDocument: chunk.sourceDocument,
sourceLink: chunk.headerLink,
headerText: chunk.headerText,
headerLevel: chunk.headerLevel,
tokenCount: chunk.tokenCount,
embedding: chunk.embedding,
embeddingModel: chunk.embeddingModel,
metadata: chunk.metadata,
}))
await db.insert(docsEmbeddings).values(batchData)
processedChunks += batch.length
if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
logger.info(
` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
)
}
} catch (error) {
logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
failedChunks += batch.length
}
}
// Verify final count
const savedCount = await db
.select({ count: sql<number>`count(*)` })
.from(docsEmbeddings)
.then((res) => res[0]?.count || 0)
logger.info(
`\n✅ Processing complete!\n` +
` 📊 Total chunks: ${chunks.length}\n` +
` ✅ Processed: ${processedChunks}\n` +
` ❌ Failed: ${failedChunks}\n` +
` 💾 Total in DB: ${savedCount}`
)
return { success: failedChunks === 0, processedChunks, failedChunks }
} catch (error) {
logger.error('❌ Fatal error during processing:', error)
return { success: false, processedChunks, failedChunks }
}
}
/**
* Main entry point with CLI argument parsing
*/
async function main() {
const args = process.argv.slice(2)
const options: ProcessingOptions = {
clearExisting: args.includes('--clear'),
dryRun: args.includes('--dry-run'),
verbose: args.includes('--verbose'),
}
// Parse custom path if provided
const pathIndex = args.indexOf('--path')
if (pathIndex !== -1 && args[pathIndex + 1]) {
options.docsPath = args[pathIndex + 1]
}
// Parse custom base URL if provided
const urlIndex = args.indexOf('--url')
if (urlIndex !== -1 && args[urlIndex + 1]) {
options.baseUrl = args[urlIndex + 1]
}
// Parse chunk size if provided
const chunkSizeIndex = args.indexOf('--chunk-size')
if (chunkSizeIndex !== -1 && args[chunkSizeIndex + 1]) {
options.chunkSize = Number.parseInt(args[chunkSizeIndex + 1], 10)
}
// Show help if requested
if (args.includes('--help') || args.includes('-h')) {
console.log(`
📚 Process Documentation Script
Usage: bun run process-docs.ts [options]
Options:
--clear Clear existing embeddings before processing
--dry-run Process and display results without saving to DB
--verbose Show detailed output including text previews
--path <path> Custom path to docs directory
--url <url> Custom base URL for links
--chunk-size <n> Custom chunk size in tokens (default: 1024)
--help, -h Show this help message
Examples:
# Dry run to test chunking
bun run process-docs.ts --dry-run
# Process and save to database
bun run process-docs.ts
# Clear existing and reprocess
bun run process-docs.ts --clear
# Custom path with verbose output
bun run process-docs.ts --path ./my-docs --verbose
`)
process.exit(0)
}
const result = await processDocs(options)
process.exit(result.success ? 0 : 1)
}
// Run if executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
main().catch((error) => {
logger.error('Fatal error:', error)
process.exit(1)
})
}
export { processDocs }