mirror of
https://github.com/simstudioai/sim.git
synced 2026-01-30 09:18:01 -05:00
* feat(kb): added json/yaml parser+chunker, added dedicated csv chunker * ack PR comments * improved kb upload
257 lines
7.9 KiB
TypeScript
257 lines
7.9 KiB
TypeScript
#!/usr/bin/env bun
|
|
|
|
import path from 'path'
|
|
import { db } from '@sim/db'
|
|
import { docsEmbeddings } from '@sim/db/schema'
|
|
import { sql } from 'drizzle-orm'
|
|
import { type DocChunk, DocsChunker } from '@/lib/chunkers'
|
|
import { isDev } from '@/lib/environment'
|
|
import { createLogger } from '@/lib/logs/console/logger'
|
|
|
|
const logger = createLogger('ProcessDocs')
|
|
|
|
/**
 * Settings accepted by `processDocs`. Every field is optional; fields that are
 * left out are filled in with defaults by `processDocs`.
 */
interface ProcessingOptions {
  // --- Input location and link generation ---

  /** Directory containing the documentation files to process */
  docsPath?: string
  /** Base URL passed to the chunker for building links */
  baseUrl?: string

  // --- Chunking parameters ---

  /** Target size of each chunk, in tokens */
  chunkSize?: number
  /** Smallest chunk size passed to the chunker */
  minChunkSize?: number
  /** Amount of overlap between neighbouring chunks */
  overlap?: number

  // --- Run behaviour ---

  /** Delete the existing docs embeddings before inserting new ones */
  clearExisting?: boolean
  /** Only display results; nothing is written to the database */
  dryRun?: boolean
  /** Emit extra output, including text previews of sample chunks */
  verbose?: boolean
}
|
|
|
|
/**
 * Return `value` when it is a finite number that is at least `min`; otherwise
 * return `fallback`. Unlike `value || fallback`, this keeps an explicit `0`
 * when `min` allows it, and it rejects `NaN` and out-of-range values.
 */
function numberOrDefault(value: number | undefined, fallback: number, min: number): number {
  return value !== undefined && Number.isFinite(value) && value >= min ? value : fallback
}

/**
 * Process documentation files and optionally save embeddings to database.
 *
 * Steps, in order:
 *  1. Resolve `options` against defaults and build a `DocsChunker`.
 *  2. Chunk everything under `docsPath` via `chunker.chunkAllDocs`.
 *  3. Log a per-document chunk count and, in verbose or dry-run mode, the
 *     first three chunks.
 *  4. In dry-run mode, stop without touching the database.
 *  5. If `clearExisting` is set, delete every row of `docsEmbeddings`.
 *  6. Insert the chunks in batches of 10. A failing batch is logged and counted
 *     as failed; the remaining batches are still attempted.
 *  7. Log the total row count of `docsEmbeddings`.
 *
 * @param options - Overrides for the defaults. `overlap` and `minChunkSize`
 *   accept an explicit `0`; a missing, non-finite or non-positive `chunkSize`
 *   falls back to 1024.
 * @returns `success` is true only if no batch failed (and, for a dry run, if
 *   chunks were produced). It is false when no chunks were generated or when
 *   any step outside the per-batch insert throws; this function logs errors and
 *   does not rethrow them.
 */
async function processDocs(
  options: ProcessingOptions = {}
): Promise<{ success: boolean; processedChunks: number; failedChunks: number }> {
  const config = {
    // `||` is intentional for the two strings: an empty path/URL is not usable.
    docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
    baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
    chunkSize: numberOrDefault(options.chunkSize, 1024, 1),
    // 0 is a legitimate value for these two, so it must not trigger the default.
    minChunkSize: numberOrDefault(options.minChunkSize, 100, 0),
    overlap: numberOrDefault(options.overlap, 200, 0),
    clearExisting: options.clearExisting ?? false,
    dryRun: options.dryRun ?? false,
    verbose: options.verbose ?? false,
  }

  let processedChunks = 0
  let failedChunks = 0

  try {
    logger.info('🚀 Starting docs processing with config:', {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
      dryRun: config.dryRun,
    })

    // Initialize the chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Chunk everything found under the docs directory
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Group chunks by source document for the summary
    const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
      if (!acc[chunk.sourceDocument]) {
        acc[chunk.sourceDocument] = []
      }
      acc[chunk.sourceDocument].push(chunk)
      return acc
    }, {})

    // Display summary
    logger.info(`\n=== DOCUMENT SUMMARY ===`)
    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
      logger.info(`${doc}: ${docChunks.length} chunks`)
    }

    // Display sample chunks in verbose or dry-run mode
    if (config.verbose || config.dryRun) {
      logger.info(`\n=== SAMPLE CHUNKS ===`)
      chunks.slice(0, 3).forEach((chunk, index) => {
        logger.info(`\nChunk ${index + 1}:`)
        logger.info(` Source: ${chunk.sourceDocument}`)
        logger.info(` Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
        logger.info(` Link: ${chunk.headerLink}`)
        logger.info(` Tokens: ${chunk.tokenCount}`)
        logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
        if (config.verbose) {
          logger.info(` Text Preview: ${chunk.text.substring(0, 200)}...`)
        }
      })
    }

    // If dry run, stop here: nothing below this point may run without DB writes
    if (config.dryRun) {
      logger.info('\n✅ Dry run complete - no data saved to database')
      return { success: true, processedChunks: chunks.length, failedChunks: 0 }
    }

    // Clear existing embeddings if requested (no WHERE clause: deletes all rows)
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      try {
        await db.delete(docsEmbeddings)
        logger.info(`✅ Successfully deleted existing embeddings`)
      } catch (error) {
        logger.error('❌ Failed to delete existing embeddings:', error)
        // Abort the run; the outer catch reports the failure to the caller.
        throw new Error('Failed to clear existing embeddings')
      }
    }

    // Save chunks to database in batches
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)

      try {
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        // Report progress every fifth batch and always after the last one.
        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(
            ` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
          )
        }
      } catch (error) {
        // A failed batch is counted and skipped so later batches still run.
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify final count. `sql<number>` is only a compile-time annotation and
    // some drivers return count(*) as a string, so coerce it with mapWith.
    const [countRow] = await db
      .select({ count: sql<number>`count(*)`.mapWith(Number) })
      .from(docsEmbeddings)
    const savedCount = countRow?.count ?? 0

    logger.info(
      [
        `\n✅ Processing complete!`,
        ` 📊 Total chunks: ${chunks.length}`,
        ` ✅ Processed: ${processedChunks}`,
        ` ❌ Failed: ${failedChunks}`,
        ` 💾 Total in DB: ${savedCount}`,
      ].join('\n')
    )

    return { success: failedChunks === 0, processedChunks, failedChunks }
  } catch (error) {
    logger.error('❌ Fatal error during processing:', error)
    return { success: false, processedChunks, failedChunks }
  }
}
|
|
|
|
/** Usage text printed for `--help` / `-h`. */
const HELP_TEXT = `
📚 Process Documentation Script

Usage: bun run process-docs.ts [options]

Options:
--clear Clear existing embeddings before processing
--dry-run Process and display results without saving to DB
--verbose Show detailed output including text previews
--path <path> Custom path to docs directory
--url <url> Custom base URL for links
--chunk-size <n> Custom chunk size in tokens (default: 1024)
--help, -h Show this help message

Examples:
# Dry run to test chunking
bun run process-docs.ts --dry-run

# Process and save to database
bun run process-docs.ts

# Clear existing and reprocess
bun run process-docs.ts --clear

# Custom path with verbose output
bun run process-docs.ts --path ./my-docs --verbose
`

/**
 * Return the argument that follows `flag`, or `undefined` when the flag is
 * absent or has no value. A following token that starts with `--` is treated
 * as the next option, so `--path --verbose` does not use `--verbose` as a path.
 */
function readFlagValue(args: readonly string[], flag: string): string | undefined {
  const flagIndex = args.indexOf(flag)
  if (flagIndex === -1) return undefined

  const value = args[flagIndex + 1]
  if (!value || value.startsWith('--')) return undefined
  return value
}

/**
 * Translate CLI arguments into `ProcessingOptions`. Options that are not given
 * (or whose value is unusable) are left unset so `processDocs` applies its
 * defaults.
 */
function parseCliOptions(args: readonly string[]): ProcessingOptions {
  const options: ProcessingOptions = {
    clearExisting: args.includes('--clear'),
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose'),
  }

  const docsPath = readFlagValue(args, '--path')
  if (docsPath !== undefined) {
    options.docsPath = docsPath
  }

  const baseUrl = readFlagValue(args, '--url')
  if (baseUrl !== undefined) {
    options.baseUrl = baseUrl
  }

  const rawChunkSize = readFlagValue(args, '--chunk-size')
  if (rawChunkSize !== undefined) {
    const chunkSize = Number.parseInt(rawChunkSize, 10)
    // parseInt yields NaN for non-numeric input; never forward NaN or a
    // non-positive size to the chunker.
    if (Number.isInteger(chunkSize) && chunkSize > 0) {
      options.chunkSize = chunkSize
    } else {
      logger.warn(`⚠️ Ignoring invalid --chunk-size value: ${rawChunkSize}`)
    }
  }

  return options
}

/**
 * Main entry point with CLI argument parsing.
 *
 * Prints the usage text and exits with code 0 when `--help` or `-h` is present.
 * Otherwise runs `processDocs` with the parsed options and exits with code 0 if
 * it reports success, 1 if not.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2)

  // Handle help first so it is shown regardless of what else was passed.
  if (args.includes('--help') || args.includes('-h')) {
    console.log(HELP_TEXT)
    process.exit(0)
  }

  const result = await processDocs(parseCliOptions(args))
  process.exit(result.success ? 0 : 1)
}
|
|
|
|
/**
 * Report whether this module is the script named on the command line, as
 * opposed to being imported for `processDocs`.
 *
 * `import.meta.url` is a URL (percent-encoded) while `process.argv[1]` is a raw
 * filesystem path, so the plain `file://` + path comparison fails for paths
 * containing spaces or other characters that are escaped in URLs. The plain
 * comparison is tried first, then the decoded URL pathname is compared.
 */
function isExecutedDirectly(): boolean {
  const scriptPath = process.argv[1]
  if (!scriptPath) return false
  if (import.meta.url === `file://${scriptPath}`) return true

  try {
    return decodeURIComponent(new URL(import.meta.url).pathname) === scriptPath
  } catch {
    // Unparseable URL or malformed escape sequence: treat as "not the entry script".
    return false
  }
}

// Run if executed directly
if (isExecutedDirectly()) {
  main().catch((error) => {
    logger.error('Fatal error:', error)
    process.exit(1)
  })
}

export { processDocs }
|