#!/usr/bin/env bun
import path from 'path'
import { db } from '@sim/db'
import { docsEmbeddings } from '@sim/db/schema'
import { sql } from 'drizzle-orm'
import { type DocChunk, DocsChunker } from '@/lib/chunkers'
import { isDev } from '@/lib/environment'
import { createLogger } from '@/lib/logs/console/logger'

const logger = createLogger('ProcessDocs')

interface ProcessingOptions {
  /** Clear existing docs embeddings before processing */
  clearExisting?: boolean
  /** Path to docs directory */
  docsPath?: string
  /** Base URL for generating links */
  baseUrl?: string
  /** Chunk size in tokens */
  chunkSize?: number
  /** Minimum chunk size */
  minChunkSize?: number
  /** Overlap between chunks */
  overlap?: number
  /** Dry run - only display results, don't save to DB */
  dryRun?: boolean
  /** Verbose output */
  verbose?: boolean
}

/**
 * Process documentation files and optionally save embeddings to database
 */
async function processDocs(options: ProcessingOptions = {}) {
  const config = {
    docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
    baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
    chunkSize: options.chunkSize || 1024,
    minChunkSize: options.minChunkSize || 100,
    overlap: options.overlap || 200,
    clearExisting: options.clearExisting ?? false,
    dryRun: options.dryRun ?? false,
    verbose: options.verbose ?? false,
  }

  let processedChunks = 0
  let failedChunks = 0

  try {
    logger.info('šŸš€ Starting docs processing with config:', {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
      dryRun: config.dryRun,
    })

    // Initialize the chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all .mdx files
    logger.info(`šŸ“š Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('āš ļø No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`šŸ“Š Generated ${chunks.length} chunks with embeddings`)

    // Group chunks by document for summary
    const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
      if (!acc[chunk.sourceDocument]) {
        acc[chunk.sourceDocument] = []
      }
      acc[chunk.sourceDocument].push(chunk)
      return acc
    }, {})

    // Display summary
    logger.info(`\n=== DOCUMENT SUMMARY ===`)
    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
      logger.info(`${doc}: ${docChunks.length} chunks`)
    }

    // Display sample chunks in verbose or dry-run mode
    if (config.verbose || config.dryRun) {
      logger.info(`\n=== SAMPLE CHUNKS ===`)
      chunks.slice(0, 3).forEach((chunk, index) => {
        logger.info(`\nChunk ${index + 1}:`)
        logger.info(`  Source: ${chunk.sourceDocument}`)
        logger.info(`  Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
        logger.info(`  Link: ${chunk.headerLink}`)
        logger.info(`  Tokens: ${chunk.tokenCount}`)
        logger.info(`  Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
        if (config.verbose) {
          logger.info(`  Text Preview: ${chunk.text.substring(0, 200)}...`)
        }
      })
    }

    // If dry run, stop here
    if (config.dryRun) {
      logger.info('\nāœ… Dry run complete - no data saved to database')
      return { success: true, processedChunks: chunks.length, failedChunks: 0 }
    }

    // Clear existing embeddings if requested
    if (config.clearExisting) {
      logger.info('šŸ—‘ļø Clearing existing docs embeddings...')
      try {
        await db.delete(docsEmbeddings)
        logger.info(`āœ… Successfully deleted existing embeddings`)
      } catch (error) {
        logger.error('āŒ Failed to delete existing embeddings:', error)
        throw new Error('Failed to clear existing embeddings')
      }
    }

    // Save chunks to database in batches
    const batchSize = 10
    logger.info(`šŸ’¾ Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)

      try {
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(
            `  šŸ’¾ Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
          )
        }
      } catch (error) {
        logger.error(`āŒ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify final count
    const savedCount = await db
      .select({ count: sql<number>`count(*)` })
      .from(docsEmbeddings)
      .then((res) => res[0]?.count || 0)

    logger.info(
      `\nāœ… Processing complete!\n` +
        `  šŸ“Š Total chunks: ${chunks.length}\n` +
        `  āœ… Processed: ${processedChunks}\n` +
        `  āŒ Failed: ${failedChunks}\n` +
        `  šŸ’¾ Total in DB: ${savedCount}`
    )

    return { success: failedChunks === 0, processedChunks, failedChunks }
  } catch (error) {
    logger.error('āŒ Fatal error during processing:', error)
    return { success: false, processedChunks, failedChunks }
  }
}

/**
 * Main entry point with CLI argument parsing
 */
async function main() {
  const args = process.argv.slice(2)

  const options: ProcessingOptions = {
    clearExisting: args.includes('--clear'),
    dryRun: args.includes('--dry-run'),
    verbose: args.includes('--verbose'),
  }

  // Parse custom path if provided
  const pathIndex = args.indexOf('--path')
  if (pathIndex !== -1 && args[pathIndex + 1]) {
    options.docsPath = args[pathIndex + 1]
  }

  // Parse custom base URL if provided
  const urlIndex = args.indexOf('--url')
  if (urlIndex !== -1 && args[urlIndex + 1]) {
    options.baseUrl = args[urlIndex + 1]
  }

  // Parse chunk size if provided
  const chunkSizeIndex = args.indexOf('--chunk-size')
  if (chunkSizeIndex !== -1 && args[chunkSizeIndex + 1]) {
    options.chunkSize = Number.parseInt(args[chunkSizeIndex + 1], 10)
  }

  // Show help if requested
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
šŸ“š Process Documentation Script

Usage: bun run process-docs.ts [options]

Options:
  --clear             Clear existing embeddings before processing
  --dry-run           Process and display results without saving to DB
  --verbose           Show detailed output including text previews
  --path <path>       Custom path to docs directory
  --url <url>         Custom base URL for links
  --chunk-size <n>    Custom chunk size in tokens (default: 1024)
  --help, -h          Show this help message

Examples:
  # Dry run to test chunking
  bun run process-docs.ts --dry-run

  # Process and save to database
  bun run process-docs.ts

  # Clear existing and reprocess
  bun run process-docs.ts --clear

  # Custom path with verbose output
  bun run process-docs.ts --path ./my-docs --verbose
`)
    process.exit(0)
  }

  const result = await processDocs(options)
  process.exit(result.success ? 0 : 1)
}

// Run if executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
  main().catch((error) => {
    logger.error('Fatal error:', error)
    process.exit(1)
  })
}

export { processDocs }
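
/*
 * Example programmatic usage: a minimal sketch, assuming this script is
 * imported from another Bun script in the same workspace and that the
 * `@sim/db` connection is already configured. The options and return
 * shape below match the `ProcessingOptions` interface and the object
 * returned by `processDocs` above.
 *
 *   import { processDocs } from './process-docs'
 *
 *   // Dry run: chunk and embed the docs, print a summary, skip the DB write
 *   const result = await processDocs({ dryRun: true, verbose: true })
 *   if (!result.success) {
 *     console.error(`${result.failedChunks} chunks failed to process`)
 *   }
 */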