feat(kb): added json/yaml parser+chunker, added dedicated csv chunker (#1539)

* feat(kb): added json/yaml parser+chunker, added dedicated csv chunker * ack PR comments * improved kb upload
2026-02-01 10:14:56 -05:00 · 2025-10-04 14:59:21 -07:00
parent 0e838940f1
commit 86ed32ea10
27 changed files with 1794 additions and 857 deletions
--- a/apps/sim/scripts/chunk-docs.ts
+++ b/apps/sim/scripts/chunk-docs.ts
@@ -1,98 +0,0 @@
-#!/usr/bin/env bun
-
-import path from 'path'
-import { DocsChunker } from '@/lib/knowledge/documents/docs-chunker'
-import type { DocChunk } from '@/lib/knowledge/documents/types'
-import { createLogger } from '@/lib/logs/console/logger'
-
-const logger = createLogger('ChunkDocsScript')
-
-/**
- * Script to chunk all .mdx files in the docs directory
- */
-async function main() {
-  try {
-    // Initialize the docs chunker
-    const chunker = new DocsChunker({
-      chunkSize: 1024,
-      minChunkSize: 100,
-      overlap: 200,
-      baseUrl: 'https://docs.sim.ai',
-    })
-
-    // Path to the docs content directory
-    const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
-
-    logger.info(`Processing docs from: ${docsPath}`)
-
-    // Process all .mdx files
-    const chunks = await chunker.chunkAllDocs(docsPath)
-
-    logger.info(`\n=== CHUNKING RESULTS ===`)
-    logger.info(`Total chunks: ${chunks.length}`)
-
-    // Group chunks by document
-    const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
-      if (!acc[chunk.sourceDocument]) {
-        acc[chunk.sourceDocument] = []
-      }
-      acc[chunk.sourceDocument].push(chunk)
-      return acc
-    }, {})
-
-    // Display summary
-    logger.info(`\n=== DOCUMENT SUMMARY ===`)
-    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
-      logger.info(`${doc}: ${docChunks.length} chunks`)
-    }
-
-    // Display a few sample chunks
-    logger.info(`\n=== SAMPLE CHUNKS ===`)
-    chunks.slice(0, 3).forEach((chunk, index) => {
-      logger.info(`\nChunk ${index + 1}:`)
-      logger.info(`  Source: ${chunk.sourceDocument}`)
-      logger.info(`  Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
-      logger.info(`  Link: ${chunk.headerLink}`)
-      logger.info(`  Tokens: ${chunk.tokenCount}`)
-      logger.info(`  Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
-      logger.info(
-        `  Embedding Preview: [${chunk.embedding
-          .slice(0, 5)
-          .map((n) => n.toFixed(4))
-          .join(', ')}...]`
-      )
-      logger.info(`  Text Preview: ${chunk.text.slice(0, 100)}...`)
-    })
-
-    // Calculate total token count
-    const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
-    const chunksWithEmbeddings = chunks.filter((chunk) => chunk.embedding.length > 0).length
-
-    logger.info(`\n=== STATISTICS ===`)
-    logger.info(`Total tokens: ${totalTokens}`)
-    logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
-    logger.info(`Chunks with embeddings: ${chunksWithEmbeddings}/${chunks.length}`)
-    if (chunks.length > 0 && chunks[0].embedding.length > 0) {
-      logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
-      logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
-    }
-
-    const headerLevels = chunks.reduce<Record<number, number>>((acc, chunk) => {
-      acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
-      return acc
-    }, {})
-
-    logger.info(`Header level distribution:`)
-    Object.entries(headerLevels)
-      .sort(([a], [b]) => Number(a) - Number(b))
-      .forEach(([level, count]) => {
-        logger.info(`  H${level}: ${count} chunks`)
-      })
-  } catch (error) {
-    logger.error('Error processing docs:', error)
-    process.exit(1)
-  }
-}
-
-// Run the script
-main().catch(console.error)
--- a/apps/sim/scripts/process-docs-embeddings.ts
+++ b/apps/sim/scripts/process-docs-embeddings.ts
@@ -1,215 +0,0 @@
-#!/usr/bin/env bun
-
-import path from 'path'
-import { db } from '@sim/db'
-import { docsEmbeddings } from '@sim/db/schema'
-import { sql } from 'drizzle-orm'
-import { isDev } from '@/lib/environment'
-import { DocsChunker } from '@/lib/knowledge/documents/docs-chunker'
-import { createLogger } from '@/lib/logs/console/logger'
-
-const logger = createLogger('ProcessDocsEmbeddings')
-
-interface ProcessingOptions {
-  /** Clear existing docs embeddings before processing */
-  clearExisting?: boolean
-  /** Path to docs directory */
-  docsPath?: string
-  /** Base URL for generating links */
-  baseUrl?: string
-  /** Chunk size in tokens */
-  chunkSize?: number
-  /** Minimum chunk size in tokens */
-  minChunkSize?: number
-  /** Overlap between chunks in tokens */
-  overlap?: number
-}
-
-/**
- * Production script to process documentation and save embeddings to database
- */
-async function processDocsEmbeddings(options: ProcessingOptions = {}) {
-  const startTime = Date.now()
-  let processedChunks = 0
-  let failedChunks = 0
-
-  try {
-    // Configuration
-    const config = {
-      clearExisting: options.clearExisting ?? false,
-      docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs/en'),
-      baseUrl: options.baseUrl ?? (isDev ? 'http://localhost:3001' : 'https://docs.sim.ai'),
-      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
-      minChunkSize: options.minChunkSize ?? 100,
-      overlap: options.overlap ?? 50,
-    }
-
-    logger.info('🚀 Starting docs embedding processing...')
-    logger.info(`Configuration:`, {
-      docsPath: config.docsPath,
-      baseUrl: config.baseUrl,
-      chunkSize: config.chunkSize,
-      clearExisting: config.clearExisting,
-    })
-
-    const chunker = new DocsChunker({
-      chunkSize: config.chunkSize,
-      minChunkSize: config.minChunkSize,
-      overlap: config.overlap,
-      baseUrl: config.baseUrl,
-    })
-
-    logger.info(`📚 Processing docs from: ${config.docsPath}`)
-    const chunks = await chunker.chunkAllDocs(config.docsPath)
-
-    if (chunks.length === 0) {
-      logger.warn('⚠️ No chunks generated from docs')
-      return { success: false, processedChunks: 0, failedChunks: 0 }
-    }
-
-    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)
-
-    if (config.clearExisting) {
-      logger.info('🗑️ Clearing existing docs embeddings...')
-      try {
-        const deleteResult = await db.delete(docsEmbeddings)
-        logger.info(`✅ Successfully deleted existing embeddings`)
-      } catch (error) {
-        logger.error('❌ Failed to delete existing embeddings:', error)
-        throw new Error('Failed to clear existing embeddings')
-      }
-    }
-
-    const batchSize = 10
-    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)
-
-    for (let i = 0; i < chunks.length; i += batchSize) {
-      const batch = chunks.slice(i, i + batchSize)
-
-      try {
-        const batchData = batch.map((chunk) => ({
-          chunkText: chunk.text,
-          sourceDocument: chunk.sourceDocument,
-          sourceLink: chunk.headerLink,
-          headerText: chunk.headerText,
-          headerLevel: chunk.headerLevel,
-          tokenCount: chunk.tokenCount,
-          embedding: chunk.embedding,
-          embeddingModel: chunk.embeddingModel,
-          metadata: chunk.metadata,
-        }))
-
-        await db.insert(docsEmbeddings).values(batchData)
-
-        processedChunks += batch.length
-
-        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
-          logger.info(
-            `  💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
-          )
-        }
-      } catch (error) {
-        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
-        failedChunks += batch.length
-      }
-    }
-
-    const savedCount = await db
-      .select({ count: sql<number>`count(*)` })
-      .from(docsEmbeddings)
-      .then((result) => result[0]?.count || 0)
-
-    const duration = Date.now() - startTime
-
-    logger.info(`✅ Processing complete!`)
-    logger.info(`📊 Results:`)
-    logger.info(`  • Total chunks processed: ${chunks.length}`)
-    logger.info(`  • Successfully saved: ${processedChunks}`)
-    logger.info(`  • Failed: ${failedChunks}`)
-    logger.info(`  • Database total: ${savedCount}`)
-    logger.info(`  • Duration: ${Math.round(duration / 1000)}s`)
-
-    const documentStats = chunks.reduce(
-      (acc, chunk) => {
-        if (!acc[chunk.sourceDocument]) {
-          acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
-        }
-        acc[chunk.sourceDocument].chunks++
-        acc[chunk.sourceDocument].tokens += chunk.tokenCount
-        return acc
-      },
-      {} as Record<string, { chunks: number; tokens: number }>
-    )
-
-    logger.info(`📋 Document breakdown:`)
-    Object.entries(documentStats)
-      .sort(([, a], [, b]) => b.chunks - a.chunks)
-      .slice(0, 10)
-      .forEach(([doc, stats]) => {
-        logger.info(`  • ${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
-      })
-
-    if (Object.keys(documentStats).length > 10) {
-      logger.info(`  • ... and ${Object.keys(documentStats).length - 10} more documents`)
-    }
-
-    return {
-      success: failedChunks === 0,
-      processedChunks,
-      failedChunks,
-      totalChunks: chunks.length,
-      databaseCount: savedCount,
-      duration,
-    }
-  } catch (error) {
-    logger.error('💥 Fatal error during processing:', error)
-    return {
-      success: false,
-      processedChunks,
-      failedChunks,
-      error: error instanceof Error ? error.message : 'Unknown error',
-    }
-  }
-}
-
-/**
- * Main function - handle command line arguments
- */
-async function main() {
-  const args = process.argv.slice(2)
-  const options: ProcessingOptions = {}
-
-  if (args.includes('--clear')) {
-    options.clearExisting = true
-  }
-
-  if (args.includes('--help') || args.includes('-h')) {
-    console.log(`
-Usage: bun run scripts/process-docs-embeddings.ts [options]
-
-Options:
-  --clear     Clear existing docs embeddings before processing
-  --help, -h  Show this help message
-
-Examples:
-  bun run scripts/process-docs-embeddings.ts
-  bun run scripts/process-docs-embeddings.ts --clear
-`)
-    process.exit(0)
-  }
-
-  const result = await processDocsEmbeddings(options)
-
-  if (!result.success) {
-    process.exit(1)
-  }
-}
-
-if (import.meta.url.includes('process-docs-embeddings.ts')) {
-  main().catch((error) => {
-    logger.error('Script failed:', error)
-    process.exit(1)
-  })
-}
-
-export { processDocsEmbeddings }
--- a/apps/sim/scripts/process-docs.ts
+++ b/apps/sim/scripts/process-docs.ts
@@ -0,0 +1,256 @@
+#!/usr/bin/env bun
+
+import path from 'path'
+import { pathToFileURL } from 'url'
+import { db } from '@sim/db'
+import { docsEmbeddings } from '@sim/db/schema'
+import { sql } from 'drizzle-orm'
+import { type DocChunk, DocsChunker } from '@/lib/chunkers'
+import { isDev } from '@/lib/environment'
+import { createLogger } from '@/lib/logs/console/logger'
+
+const logger = createLogger('ProcessDocs')
+
+/**
+ * Options accepted by `processDocs`. Every field is optional; `processDocs`
+ * substitutes a default for each one that is not supplied.
+ */
+interface ProcessingOptions {
+  /** Clear existing docs embeddings before processing (default: false) */
+  clearExisting?: boolean
+  /** Path to docs directory (default: `../../apps/docs/content/docs` relative to the cwd) */
+  docsPath?: string
+  /** Base URL for generating links (default depends on `isDev`) */
+  baseUrl?: string
+  /** Chunk size in tokens (default: 1024) */
+  chunkSize?: number
+  /** Minimum chunk size (default: 100) */
+  minChunkSize?: number
+  /** Overlap between chunks (default: 200) */
+  overlap?: number
+  /** Dry run - only display results, don't save to DB (default: false) */
+  dryRun?: boolean
+  /** Verbose output - also logs a text preview for each sample chunk (default: false) */
+  verbose?: boolean
+}
+
+/**
+ * Resolve a numeric chunking option.
+ *
+ * Returns `fallback` when the option was not supplied. A supplied value that is
+ * not a finite number or is below `min` is ignored with a warning instead of
+ * being handed to the chunker.
+ */
+function resolveNumericOption(
+  name: string,
+  value: number | undefined,
+  fallback: number,
+  min: number
+): number {
+  if (value === undefined) {
+    return fallback
+  }
+  if (!Number.isFinite(value) || value < min) {
+    logger.warn(`⚠️ Ignoring invalid ${name} (${value}); using default ${fallback}`)
+    return fallback
+  }
+  return value
+}
+
+/**
+ * Process documentation files and optionally save embeddings to database.
+ *
+ * Chunks the docs under `docsPath` with `DocsChunker.chunkAllDocs`, logs a
+ * per-document summary and then, unless `dryRun` is set, optionally clears the
+ * `docsEmbeddings` table and inserts the chunks in batches of 10. A batch that
+ * fails to insert is logged and counted in `failedChunks`; it does not abort
+ * the remaining batches.
+ *
+ * @param options - Overrides for the built-in defaults.
+ * @returns `success` is false when no chunks were generated, when any batch
+ *   failed to save, or when a fatal error was caught. In a dry run
+ *   `processedChunks` is the number of generated chunks; otherwise it is the
+ *   number of chunks inserted.
+ */
+async function processDocs(
+  options: ProcessingOptions = {}
+): Promise<{ success: boolean; processedChunks: number; failedChunks: number }> {
+  const config = {
+    // Empty strings are treated as "not provided" for the two string options
+    docsPath: options.docsPath || path.join(process.cwd(), '../../apps/docs/content/docs'),
+    baseUrl: options.baseUrl || (isDev ? 'http://localhost:4000' : 'https://docs.sim.ai'),
+    // Sizes must be at least 1; an overlap of 0 is a legitimate request and must
+    // not be replaced by the default, which `||` used to do
+    chunkSize: resolveNumericOption('chunkSize', options.chunkSize, 1024, 1),
+    minChunkSize: resolveNumericOption('minChunkSize', options.minChunkSize, 100, 1),
+    overlap: resolveNumericOption('overlap', options.overlap, 200, 0),
+    clearExisting: options.clearExisting ?? false,
+    dryRun: options.dryRun ?? false,
+    verbose: options.verbose ?? false,
+  }
+
+  let processedChunks = 0
+  let failedChunks = 0
+
+  try {
+    logger.info('🚀 Starting docs processing with config:', {
+      docsPath: config.docsPath,
+      baseUrl: config.baseUrl,
+      chunkSize: config.chunkSize,
+      clearExisting: config.clearExisting,
+      dryRun: config.dryRun,
+    })
+
+    // Initialize the chunker
+    const chunker = new DocsChunker({
+      chunkSize: config.chunkSize,
+      minChunkSize: config.minChunkSize,
+      overlap: config.overlap,
+      baseUrl: config.baseUrl,
+    })
+
+    // Chunk everything under the docs directory
+    logger.info(`📚 Processing docs from: ${config.docsPath}`)
+    const chunks = await chunker.chunkAllDocs(config.docsPath)
+
+    if (chunks.length === 0) {
+      logger.warn('⚠️ No chunks generated from docs')
+      return { success: false, processedChunks: 0, failedChunks: 0 }
+    }
+
+    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)
+
+    // Group chunks by document for summary
+    const chunksByDoc = chunks.reduce<Record<string, DocChunk[]>>((acc, chunk) => {
+      if (!acc[chunk.sourceDocument]) {
+        acc[chunk.sourceDocument] = []
+      }
+      acc[chunk.sourceDocument].push(chunk)
+      return acc
+    }, {})
+
+    // Display summary
+    logger.info(`\n=== DOCUMENT SUMMARY ===`)
+    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
+      logger.info(`${doc}: ${docChunks.length} chunks`)
+    }
+
+    // Display sample chunks in verbose or dry-run mode
+    if (config.verbose || config.dryRun) {
+      logger.info(`\n=== SAMPLE CHUNKS ===`)
+      chunks.slice(0, 3).forEach((chunk, index) => {
+        logger.info(`\nChunk ${index + 1}:`)
+        logger.info(`  Source: ${chunk.sourceDocument}`)
+        logger.info(`  Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
+        logger.info(`  Link: ${chunk.headerLink}`)
+        logger.info(`  Tokens: ${chunk.tokenCount}`)
+        logger.info(`  Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
+        if (config.verbose) {
+          logger.info(`  Text Preview: ${chunk.text.substring(0, 200)}...`)
+        }
+      })
+    }
+
+    // If dry run, stop here - nothing below this point may touch the database
+    if (config.dryRun) {
+      logger.info('\n✅ Dry run complete - no data saved to database')
+      return { success: true, processedChunks: chunks.length, failedChunks: 0 }
+    }
+
+    // Clear existing embeddings if requested
+    if (config.clearExisting) {
+      logger.info('🗑️ Clearing existing docs embeddings...')
+      try {
+        await db.delete(docsEmbeddings)
+        logger.info(`✅ Successfully deleted existing embeddings`)
+      } catch (error) {
+        logger.error('❌ Failed to delete existing embeddings:', error)
+        // Abort: inserting on top of a table that could not be cleared would mix old and new rows
+        throw new Error('Failed to clear existing embeddings')
+      }
+    }
+
+    // Save chunks to database in batches
+    const batchSize = 10
+    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)
+
+    for (let i = 0; i < chunks.length; i += batchSize) {
+      const batch = chunks.slice(i, i + batchSize)
+
+      try {
+        const batchData = batch.map((chunk) => ({
+          chunkText: chunk.text,
+          sourceDocument: chunk.sourceDocument,
+          sourceLink: chunk.headerLink,
+          headerText: chunk.headerText,
+          headerLevel: chunk.headerLevel,
+          tokenCount: chunk.tokenCount,
+          embedding: chunk.embedding,
+          embeddingModel: chunk.embeddingModel,
+          metadata: chunk.metadata,
+        }))
+
+        await db.insert(docsEmbeddings).values(batchData)
+        processedChunks += batch.length
+
+        // Log progress every fifth batch and after the final batch
+        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
+          logger.info(
+            `  💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
+          )
+        }
+      } catch (error) {
+        // Keep going: one bad batch should not discard the remaining chunks
+        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
+        failedChunks += batch.length
+      }
+    }
+
+    // Verify final count. `sql<number>` is only a compile-time annotation, so the
+    // value is mapped through Number explicitly in case the driver returns a string.
+    const [countRow] = await db
+      .select({ count: sql<number>`count(*)`.mapWith(Number) })
+      .from(docsEmbeddings)
+    const savedCount = countRow?.count ?? 0
+
+    logger.info(
+      `\n✅ Processing complete!\n` +
+        `   📊 Total chunks: ${chunks.length}\n` +
+        `   ✅ Processed: ${processedChunks}\n` +
+        `   ❌ Failed: ${failedChunks}\n` +
+        `   💾 Total in DB: ${savedCount}`
+    )
+
+    return { success: failedChunks === 0, processedChunks, failedChunks }
+  } catch (error) {
+    logger.error('❌ Fatal error during processing:', error)
+    return { success: false, processedChunks, failedChunks }
+  }
+}
+
+/**
+ * Main entry point with CLI argument parsing.
+ *
+ * Prints the usage text and exits 0 when `--help` / `-h` is present. Otherwise
+ * it builds a `ProcessingOptions` object from the flags, runs `processDocs` and
+ * exits 0 on success or 1 on failure. Invalid arguments (a value flag with no
+ * value, or a `--chunk-size` that is not a positive integer) exit 1 before any
+ * processing starts.
+ */
+async function main() {
+  const args = process.argv.slice(2)
+
+  // Show help first so that `--help` works even alongside otherwise invalid arguments
+  if (args.includes('--help') || args.includes('-h')) {
+    console.log(`
+📚 Process Documentation Script
+
+Usage: bun run process-docs.ts [options]
+
+Options:
+  --clear          Clear existing embeddings before processing
+  --dry-run        Process and display results without saving to DB
+  --verbose        Show detailed output including text previews
+  --path <path>    Custom path to docs directory
+  --url <url>      Custom base URL for links
+  --chunk-size <n> Custom chunk size in tokens (default: 1024)
+  --help, -h       Show this help message
+
+Examples:
+  # Dry run to test chunking
+  bun run process-docs.ts --dry-run
+
+  # Process and save to database
+  bun run process-docs.ts
+
+  # Clear existing and reprocess
+  bun run process-docs.ts --clear
+
+  # Custom path with verbose output
+  bun run process-docs.ts --path ./my-docs --verbose
+    `)
+    process.exit(0)
+  }
+
+  // Returns the argument following `flag`, or undefined when the flag is absent.
+  // A flag that is present without a value (end of args, or followed by another
+  // `--flag`) is an error rather than being silently ignored.
+  const readFlagValue = (flag: string): string | undefined => {
+    const index = args.indexOf(flag)
+    if (index === -1) {
+      return undefined
+    }
+    const value = args[index + 1]
+    if (value === undefined || value.startsWith('--')) {
+      logger.error(`Missing value for ${flag}`)
+      process.exit(1)
+    }
+    return value
+  }
+
+  const options: ProcessingOptions = {
+    clearExisting: args.includes('--clear'),
+    dryRun: args.includes('--dry-run'),
+    verbose: args.includes('--verbose'),
+  }
+
+  // Parse custom path if provided
+  const docsPath = readFlagValue('--path')
+  if (docsPath) {
+    options.docsPath = docsPath
+  }
+
+  // Parse custom base URL if provided
+  const baseUrl = readFlagValue('--url')
+  if (baseUrl) {
+    options.baseUrl = baseUrl
+  }
+
+  // Parse chunk size if provided. Reject anything that is not a positive integer:
+  // continuing with the default after a typo is dangerous together with --clear,
+  // because the existing embeddings would be replaced using the wrong settings.
+  const chunkSizeArg = readFlagValue('--chunk-size')
+  if (chunkSizeArg !== undefined) {
+    const chunkSize = Number(chunkSizeArg)
+    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
+      logger.error(`Invalid --chunk-size "${chunkSizeArg}": expected a positive integer`)
+      process.exit(1)
+    }
+    options.chunkSize = chunkSize
+  }
+
+  const result = await processDocs(options)
+  process.exit(result.success ? 0 : 1)
+}
+
+// Run if executed directly.
+// `import.meta.url` is a percent-encoded file URL, so the script path has to be
+// converted with pathToFileURL before comparing. A hand-built `file://${path}`
+// string never matches when the path contains characters that file URLs escape
+// (for example spaces) or on Windows, and the script would then exit silently.
+if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
+  main().catch((error) => {
+    logger.error('Fatal error:', error)
+    process.exit(1)
+  })
+}
+
+export { processDocs }