sim/apps/sim/scripts/process-docs.ts

#!/usr/bin/env bun
import path from 'path'
import { db } from '@sim/db'
import { docsEmbeddings } from '@sim/db/schema'
import { sql } from 'drizzle-orm'
import { type DocChunk, DocsChunker } from '@/lib/chunkers'
import { isDev } from '@/lib/environment'
import { createLogger } from '@/lib/logs/console/logger'

const logger = createLogger('ProcessDocs')

interface ProcessingOptions {
  /** Clear existing docs embeddings before processing */
  clearExisting?: boolean
  /** Path to docs directory */
  docsPath?: string
  /** Base URL for generating links */
  baseUrl?: string
  /** Chunk size in tokens */
  chunkSize?: number
  /** Minimum chunk size */
  minChunkSize?: number
  /** Overlap between chunks */
  overlap?: number
  /** Dry run - only display results, don't save to DB */
  dryRun?: boolean
  /** Verbose output */
  verbose?: boolean
}
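
// Illustrative only (not part of the original script): a typical options
// object for a local dry run. The values below are assumptions chosen for
// demonstration, not project defaults.
//
//   const options: ProcessingOptions = {
//     dryRun: true,
//     verbose: true,
//     chunkSize: 512,
//   }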

/**
 * Process documentation files and optionally save embeddings to the database
 */
async function processDocs(options: ProcessingOptions = {}) {
  const config = {
    docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
    baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
    chunkSize: options.chunkSize || 1024,
    minChunkSize: options.minChunkSize || 100,
    overlap: options.overlap || 200,
    clearExisting: options.clearExisting ?? false,
    dryRun: options.dryRun ?? false,
    verbose: options.verbose ?? false,
  }
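
  // Note: the numeric options above use `||`, so an explicit 0 would fall back
  // to the default, while the boolean flags use `??` and preserve an explicit false.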
  let processedChunks = 0
  let failedChunks = 0

  try {
    logger.info('🚀 Starting docs processing with config:', {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
      dryRun: config.dryRun,
    })

    // Initialize the chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all .mdx files
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Group chunks by document for summary
    const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
      if (!acc[chunk.sourceDocument]) {
        acc[chunk.sourceDocument] = []
      }
      acc[chunk.sourceDocument].push(chunk)
      return acc
    }, {})

    // Display summary
    logger.info(`\n=== DOCUMENT SUMMARY ===`)
    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
      logger.info(`${doc}: ${docChunks.length} chunks`)
    }

    // Display sample chunks in verbose or dry-run mode
    if (config.verbose || config.dryRun) {
      logger.info(`\n=== SAMPLE CHUNKS ===`)
      chunks.slice(0, 3).forEach((chunk, index) => {
        logger.info(`\nChunk ${index + 1}:`)
        logger.info(` Source: ${chunk.sourceDocument}`)
        logger.info(` Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
        logger.info(` Link: ${chunk.headerLink}`)
        logger.info(` Tokens: ${chunk.tokenCount}`)
        logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
        if (config.verbose) {
          logger.info(` Text Preview: ${chunk.text.substring(0, 200)}...`)
        }
      })
    }

    // If dry run, stop here
    if (config.dryRun) {
      logger.info('\n✅ Dry run complete - no data saved to database')
      return { success: true, processedChunks: chunks.length, failedChunks: 0 }
    }

    // Clear existing embeddings if requested
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      try {
        await db.delete(docsEmbeddings)
        logger.info(`✅ Successfully deleted existing embeddings`)
      } catch (error) {
        logger.error('❌ Failed to delete existing embeddings:', error)
        throw new Error('Failed to clear existing embeddings')
      }
    }

    // Save chunks to database in batches
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)
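
    // A small batch keeps each INSERT payload modest: every row carries a full
    // embedding vector, so writing all chunks in a single statement could
    // produce a very large query.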
    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)

      try {
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(
            ` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
          )
        }
      } catch (error) {
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }
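
    // Failed batches are counted but not retried; the summary below and the
    // process exit code surface any failures.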

    // Verify final count
    const savedCount = await db
      .select({ count: sql<number>`count(*)` })
      .from(docsEmbeddings)
      .then((res) => res[0]?.count || 0)

    logger.info(
      `\n✅ Processing complete!\n` +
        ` 📊 Total chunks: ${chunks.length}\n` +
        ` ✅ Processed: ${processedChunks}\n` +
        ` ❌ Failed: ${failedChunks}\n` +
        ` 💾 Total in DB: ${savedCount}`
    )

    return { success: failedChunks === 0, processedChunks, failedChunks }
  } catch (error) {
    logger.error('❌ Fatal error during processing:', error)
    return { success: false, processedChunks, failedChunks }
  }
}

/**
 * Main entry point with CLI argument parsing
 */
async function main() {
  const args = process.argv.slice(2)

  const options: ProcessingOptions = {
    clearExisting: args.includes('--clear'),
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose'),
  }

  // Parse custom path if provided
  const pathIndex = args.indexOf('--path')
  if (pathIndex !== -1 && args[pathIndex + 1]) {
    options.docsPath = args[pathIndex + 1]
  }

  // Parse custom base URL if provided
  const urlIndex = args.indexOf('--url')
  if (urlIndex !== -1 && args[urlIndex + 1]) {
    options.baseUrl = args[urlIndex + 1]
  }

  // Parse chunk size if provided
  const chunkSizeIndex = args.indexOf('--chunk-size')
  if (chunkSizeIndex !== -1 && args[chunkSizeIndex + 1]) {
    options.chunkSize = Number.parseInt(args[chunkSizeIndex + 1], 10)
  }
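
  // Note: a non-numeric --chunk-size value parses to NaN, which is falsy, so
  // processDocs falls back to its 1024-token default instead of failing.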

  // Show help if requested
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
📚 Process Documentation Script

Usage: bun run process-docs.ts [options]

Options:
  --clear            Clear existing embeddings before processing
  --dry-run          Process and display results without saving to DB
  --verbose          Show detailed output including text previews
  --path <path>      Custom path to docs directory
  --url <url>        Custom base URL for links
  --chunk-size <n>   Custom chunk size in tokens (default: 1024)
  --help, -h         Show this help message

Examples:
  # Dry run to test chunking
  bun run process-docs.ts --dry-run

  # Process and save to database
  bun run process-docs.ts

  # Clear existing and reprocess
  bun run process-docs.ts --clear

  # Custom path with verbose output
  bun run process-docs.ts --path ./my-docs --verbose
`)
    process.exit(0)
  }

  const result = await processDocs(options)
  process.exit(result.success ? 0 : 1)
}

// Run if executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
  main().catch((error) => {
    logger.error('Fatal error:', error)
    process.exit(1)
  })
}

export { processDocs }
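
// Sketch of programmatic use (an assumption, not part of the original script):
// since processDocs is exported, another Bun/TypeScript script, e.g. a CI step
// that refreshes the docs embeddings, could call it directly:
//
//   import { processDocs } from './process-docs'
//
//   const result = await processDocs({ clearExisting: true, dryRun: false })
//   if (!result.success) {
//     throw new Error(`${result.failedChunks} chunks failed to save`)
//   }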