Add db migration

This commit is contained in:
Siddharth Ganesan
2025-07-08 16:11:44 -07:00
parent 76c0c56689
commit b9fa50b4de
6 changed files with 5593 additions and 12 deletions

View File

@@ -0,0 +1,26 @@
-- Documentation chunk embeddings for hybrid (vector + full-text) docs search.
-- NOTE(review): generated by drizzle-kit from the docsEmbeddings definition in
-- schema.ts — keep the two in sync; avoid hand-editing once this migration has
-- been applied (drizzle tracks migration file contents).
CREATE TABLE "docs_embeddings" (
"chunk_id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
"chunk_text" text NOT NULL,
"source_document" text NOT NULL,
"source_link" text NOT NULL,
"header_text" text NOT NULL,
"header_level" integer NOT NULL,
"token_count" integer NOT NULL,
-- 1536 dimensions matches OpenAI text-embedding-3-small output.
"embedding" vector(1536) NOT NULL,
"embedding_model" text DEFAULT 'text-embedding-3-small' NOT NULL,
"metadata" jsonb DEFAULT '{}' NOT NULL,
-- Generated column so full-text search stays consistent with chunk_text.
"chunk_text_tsv" "tsvector" GENERATED ALWAYS AS (to_tsvector('english', "docs_embeddings"."chunk_text")) STORED,
"created_at" timestamp DEFAULT now() NOT NULL,
"updated_at" timestamp DEFAULT now() NOT NULL,
-- NOTE(review): this CHECK duplicates the column's NOT NULL constraint;
-- harmless but redundant.
CONSTRAINT "docs_embedding_not_null_check" CHECK ("embedding" IS NOT NULL),
CONSTRAINT "docs_header_level_check" CHECK ("header_level" >= 1 AND "header_level" <= 6)
);
--> statement-breakpoint
-- B-tree indexes for common filters (document, header level, model, recency).
CREATE INDEX "docs_emb_source_document_idx" ON "docs_embeddings" USING btree ("source_document");--> statement-breakpoint
CREATE INDEX "docs_emb_header_level_idx" ON "docs_embeddings" USING btree ("header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_source_header_idx" ON "docs_embeddings" USING btree ("source_document","header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_model_idx" ON "docs_embeddings" USING btree ("embedding_model");--> statement-breakpoint
CREATE INDEX "docs_emb_created_at_idx" ON "docs_embeddings" USING btree ("created_at");--> statement-breakpoint
-- Approximate nearest-neighbor search over embeddings (pgvector HNSW, cosine).
CREATE INDEX "docs_embedding_vector_hnsw_idx" ON "docs_embeddings" USING hnsw ("embedding" vector_cosine_ops) WITH (m=16,ef_construction=64);--> statement-breakpoint
-- GIN indexes for JSONB metadata filtering and full-text search.
CREATE INDEX "docs_emb_metadata_gin_idx" ON "docs_embeddings" USING gin ("metadata");--> statement-breakpoint
CREATE INDEX "docs_emb_chunk_text_fts_idx" ON "docs_embeddings" USING gin ("chunk_text_tsv");

File diff suppressed because it is too large Load Diff

View File

@@ -351,6 +351,13 @@
"when": 1751659528896,
"tag": "0050_big_mattie_franklin",
"breakpoints": true
},
{
"idx": 51,
"version": "7",
"when": 1752014976338,
"tag": "0051_typical_expediter",
"breakpoints": true
}
]
}
}

View File

@@ -13,6 +13,7 @@ import {
text,
timestamp,
uniqueIndex,
uuid,
vector,
} from 'drizzle-orm/pg-core'
@@ -909,3 +910,66 @@ export const embedding = pgTable(
embeddingNotNullCheck: check('embedding_not_null_check', sql`"embedding" IS NOT NULL`),
})
)
/**
 * Documentation chunk embeddings powering hybrid (vector + full-text) docs
 * search. Mirrors migration 0051 (`docs_embeddings`); keep the two in sync.
 */
export const docsEmbeddings = pgTable(
  'docs_embeddings',
  {
    // Surrogate primary key, generated in the database (gen_random_uuid()).
    chunkId: uuid('chunk_id').primaryKey().defaultRandom(),
    // Raw text of the documentation chunk that was embedded.
    chunkText: text('chunk_text').notNull(),
    // Source document identifier and a deep link to the chunk's section.
    sourceDocument: text('source_document').notNull(),
    sourceLink: text('source_link').notNull(),
    // Nearest enclosing markdown header and its level (1-6, see check below).
    headerText: text('header_text').notNull(),
    headerLevel: integer('header_level').notNull(),
    // Token count reported by the chunker for this chunk's text.
    tokenCount: integer('token_count').notNull(),
    // Vector embedding - optimized for text-embedding-3-small with HNSW support
    embedding: vector('embedding', { dimensions: 1536 }).notNull(),
    embeddingModel: text('embedding_model').notNull().default('text-embedding-3-small'),
    // Metadata for flexible filtering
    metadata: jsonb('metadata').notNull().default('{}'),
    // Full-text search support - generated tsvector column kept in sync with
    // chunk_text by the database. NOTE(review): `tsvector` is a custom column
    // type presumably declared elsewhere in this file — confirm.
    chunkTextTsv: tsvector('chunk_text_tsv').generatedAlwaysAs(
      (): SQL => sql`to_tsvector('english', ${docsEmbeddings.chunkText})`
    ),
    // Timestamps
    createdAt: timestamp('created_at').notNull().defaultNow(),
    updatedAt: timestamp('updated_at').notNull().defaultNow(),
  },
  (table) => ({
    // Source document queries
    sourceDocumentIdx: index('docs_emb_source_document_idx').on(table.sourceDocument),
    // Header level filtering
    headerLevelIdx: index('docs_emb_header_level_idx').on(table.headerLevel),
    // Combined source and header queries
    sourceHeaderIdx: index('docs_emb_source_header_idx').on(table.sourceDocument, table.headerLevel),
    // Model-specific queries
    modelIdx: index('docs_emb_model_idx').on(table.embeddingModel),
    // Timestamp queries
    createdAtIdx: index('docs_emb_created_at_idx').on(table.createdAt),
    // Vector similarity search index (HNSW, cosine distance) - approximate
    // nearest-neighbor search over the 1536-dim embeddings
    embeddingVectorHnswIdx: index('docs_embedding_vector_hnsw_idx')
      .using('hnsw', table.embedding.op('vector_cosine_ops'))
      .with({
        m: 16,
        ef_construction: 64,
      }),
    // GIN index for JSONB metadata queries
    metadataGinIdx: index('docs_emb_metadata_gin_idx').using('gin', table.metadata),
    // Full-text search index
    chunkTextFtsIdx: index('docs_emb_chunk_text_fts_idx').using('gin', table.chunkTextTsv),
    // Constraints (the NOT NULL check duplicates .notNull() but matches the
    // generated migration)
    embeddingNotNullCheck: check('docs_embedding_not_null_check', sql`"embedding" IS NOT NULL`),
    headerLevelCheck: check('docs_header_level_check', sql`"header_level" >= 1 AND "header_level" <= 6`),
  })
)

View File

@@ -23,9 +23,9 @@ export class DocsChunker {
/**
 * Configure the chunker. Defaults target small, embedding-friendly chunks
 * (~300 tokens, 50-token overlap) for docs.simstudio.ai content.
 *
 * @param options - Optional overrides for chunk sizing and the base URL.
 */
constructor(options: DocsChunkerOptions = {}) {
  // Use the existing TextChunker for chunking logic.
  // BUG FIX: the object literal previously contained duplicate `chunkSize`
  // and `overlap` keys (stale lines left over from an edit); only the
  // intended values remain.
  this.textChunker = new TextChunker({
    chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
    minChunkSize: options.minChunkSize ?? 100,
    overlap: options.overlap ?? 50,
  })
  this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
}
@@ -209,16 +209,25 @@ export class DocsChunker {
}
/**
 * Split content into chunks using the existing TextChunker with table
 * awareness.
 *
 * Pipeline: clean → locate table boundaries → generic TextChunker pass →
 * merge chunks that would split a table → enforce the ~300-token cap.
 *
 * @param content - Raw section content.
 * @returns Chunk texts ready for embedding.
 */
private async splitContent(content: string): Promise<string[]> {
  // Clean the content first
  const cleanedContent = this.cleanContent(content)

  // Detect table boundaries to avoid splitting them
  const tableBoundaries = this.detectTableBoundaries(cleanedContent)

  // Use the existing TextChunker for the initial pass
  const chunks = await this.textChunker.chunk(cleanedContent)

  // BUG FIX: a stray early `return chunks.map((chunk) => chunk.text)` made
  // the table-merge and size-limit steps below unreachable; it was removed.
  const processedChunks = this.mergeTableChunks(
    chunks.map((chunk) => chunk.text),
    tableBoundaries,
    cleanedContent
  )

  // Ensure no chunk exceeds ~300 estimated tokens
  return this.enforceSizeLimit(processedChunks)
}
/**
@@ -239,20 +248,20 @@ export class DocsChunker {
)
}
/**
* Parse frontmatter from MDX content
*/
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
if (!match) {
return { data: {}, content }
}
const [, frontmatterText, markdownContent] = match
const data: Frontmatter = {}
// Simple YAML parsing for title and description
const lines = frontmatterText.split('\n')
for (const line of lines) {
@@ -266,7 +275,357 @@ export class DocsChunker {
data[key] = value
}
}
return { data, content: markdownContent }
}
/**
 * Split content into sections at H1-H3 markdown headers.
 *
 * Deeper headers (H4-H6) deliberately stay inside their parent section so
 * small subsections are not fragmented into tiny chunks.
 *
 * @param content - Markdown/MDX body text.
 * @returns Ordered sections; `header` is null (and `level` 0) for any text
 *          before the first header. Whitespace-only sections are dropped.
 */
private splitByHeaders(content: string): Array<{ header: string | null; content: string; level: number }> {
  const sections: Array<{ header: string | null; content: string; level: number }> = []
  let pendingHeader: string | null = null
  let pendingLevel = 0
  let buffer: string[] = []

  // Flush buffered lines as one section (only when something was buffered,
  // matching the original guard).
  const flush = () => {
    if (buffer.length > 0) {
      sections.push({
        header: pendingHeader,
        content: buffer.join('\n').trim(),
        level: pendingLevel,
      })
    }
  }

  for (const rawLine of content.split('\n')) {
    // Only H1-H3 start a new section; H4-H6 remain in the current one.
    const match = rawLine.match(/^(#{1,3})\s+(.+)$/)
    if (match) {
      flush()
      pendingHeader = rawLine
      pendingLevel = match[1].length
      buffer = []
    } else {
      buffer.push(rawLine)
    }
  }
  flush()

  // Drop sections whose body is only whitespace.
  return sections.filter((section) => section.content.trim().length > 0)
}
/**
 * Rough token estimate for a piece of text.
 *
 * @param text - Text to measure.
 * @returns Estimated token count (heuristic: ~4 characters per token).
 */
private estimateTokens(text: string): number {
  // Heuristic: English prose averages roughly four characters per token.
  const charsPerToken = 4
  return Math.ceil(text.length / charsPerToken)
}
/**
 * Greedily merge adjacent chunks (joined with a blank line) until a merged
 * chunk would exceed ~500 estimated tokens, reducing tiny-chunk churn.
 *
 * @param chunks - Ordered chunk texts.
 * @returns Merged, trimmed chunks; empty results are dropped.
 */
private mergeSmallChunks(chunks: string[]): string[] {
  const merged: string[] = []
  let pending = ''

  // Emit the accumulator when it holds non-whitespace content.
  const flushPending = () => {
    const trimmed = pending.trim()
    if (trimmed) {
      merged.push(trimmed)
    }
  }

  for (const piece of chunks) {
    const pendingTokens = this.estimateTokens(pending)
    const wouldOverflow =
      pendingTokens > 0 && pendingTokens + this.estimateTokens(piece) > 500

    if (wouldOverflow) {
      // Adding this piece would push past the target size: emit and restart.
      flushPending()
      pending = piece
    } else {
      pending = pending ? `${pending}\n\n${piece}` : piece
    }
  }

  flushPending()
  return merged
}
/**
* Chunk a section while preserving tables and structure
*/
private async chunkSection(section: { header: string | null; content: string; level: number }): Promise<string[]> {
const content = section.content
const header = section.header
// Check if content contains tables
const hasTable = this.containsTable(content)
if (hasTable) {
// Split by tables and handle each part
return this.splitContentWithTables(content, header)
} else {
// Regular chunking for text-only content
const chunks = await this.textChunker.chunk(content)
return chunks.map((chunk, index) => {
// Add header to first chunk only
if (index === 0 && header) {
return `${header}\n\n${chunk.text}`.trim()
}
return chunk.text
})
}
}
/**
 * Check whether content contains a markdown table: a pipe-delimited row
 * immediately followed by a separator row containing pipes and dashes.
 *
 * @param content - Text to inspect.
 * @returns true when at least one table header/separator pair is found.
 */
private containsTable(content: string): boolean {
  const rows = content.split('\n')
  for (let i = 0; i < rows.length - 1; i++) {
    const row = rows[i]
    const separator = rows[i + 1]
    const looksLikeTableRow = row.includes('|') && row.split('|').length >= 3
    if (looksLikeTableRow && separator.includes('|') && separator.includes('-')) {
      return true
    }
  }
  return false
}
/**
 * Split content that contains markdown tables into chunks, emitting each
 * detected table as a chunk of its own so a table is never cut in half.
 *
 * @param content - Section body known (or suspected) to contain tables.
 * @param header - Section header; prepended (with a blank line) to the very
 *                 first chunk produced, if present.
 * @returns Chunks whose trimmed length exceeds 50 characters.
 */
private splitContentWithTables(content: string, header: string | null): string[] {
  const lines = content.split('\n')
  const chunks: string[] = []
  let currentChunk: string[] = []
  let inTable = false
  let tableLines: string[] = []

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]

    // Detect table start: a pipe-delimited row followed by a separator row.
    if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
      const nextLine = lines[i + 1]
      if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
        inTable = true
        // Save current prose chunk if it has meaningful content (>50 chars).
        if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
          const chunkText = currentChunk.join('\n').trim()
          // Only the first chunk of the section carries the header.
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
          chunks.push(withHeader)
          currentChunk = []
        }
        tableLines = [line]
        continue
      }
    }

    if (inTable) {
      tableLines.push(line)
      // Detect table end: first blank or non-pipe line after the table body.
      // NOTE(review): a non-blank terminating line is pushed into tableLines
      // above AND used to start the next prose chunk below, so it appears in
      // both chunks — confirm this duplication is intended.
      if (!line.includes('|') || line.trim() === '') {
        inTable = false
        // Save table as its own chunk
        const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
        if (tableText.length > 0) {
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
          chunks.push(withHeader)
        }
        tableLines = []
        // Start new chunk if current line has content
        if (line.trim() !== '') {
          currentChunk = [line]
        }
      }
    } else {
      currentChunk.push(line)
      // If chunk is getting large (~250 estimated tokens), save it.
      if (this.estimateTokens(currentChunk.join('\n')) > 250) {
        const chunkText = currentChunk.join('\n').trim()
        if (chunkText.length > 50) {
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
          chunks.push(withHeader)
        }
        currentChunk = []
      }
    }
  }

  // Handle remaining content: either an unterminated table or trailing prose.
  if (inTable && tableLines.length > 0) {
    const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
    if (tableText.length > 0) {
      const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
      chunks.push(withHeader)
    }
  } else if (currentChunk.length > 0) {
    const chunkText = currentChunk.join('\n').trim()
    if (chunkText.length > 50) {
      const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
      chunks.push(withHeader)
    }
  }

  return chunks.filter(chunk => chunk.trim().length > 50)
}
/**
 * Locate markdown tables in `content` as character ranges so later chunk
 * merging can avoid splitting a table across chunks.
 *
 * A table starts at a pipe-delimited row whose next line is a separator row
 * (pipes + dashes) and ends at the first blank line, header line, or
 * non-pipe line.
 *
 * @param content - Cleaned section content.
 * @returns Character-offset ranges of each detected table.
 */
private detectTableBoundaries(content: string): { start: number; end: number }[] {
  const tables: { start: number; end: number }[] = []
  const lines = content.split('\n')
  let inTable = false
  let tableStart = -1

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()

    // Detect table start (markdown table row with pipes), confirmed by the
    // separator row on the following line.
    if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
      const nextLine = lines[i + 1]?.trim()
      if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
        inTable = true
        tableStart = i
      }
    }
    // Detect table end (empty line, header, or non-table content).
    else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
      // BUG FIX: the original wrote `pos + lines[i-1]?.length || 0`, which
      // parses as `(pos + len) || 0` — if len were undefined the sum becomes
      // NaN and the end collapses to 0. Parenthesize the fallback instead.
      tables.push({
        start: this.getCharacterPosition(lines, tableStart),
        end: this.getCharacterPosition(lines, i - 1) + (lines[i - 1]?.length ?? 0),
      })
      inTable = false
    }
  }

  // A table running to the end of the content has no terminating line.
  if (inTable && tableStart >= 0) {
    tables.push({
      start: this.getCharacterPosition(lines, tableStart),
      end: content.length,
    })
  }

  return tables
}
/**
 * Character offset of the start of line `lineIndex` within the joined
 * content: each preceding line contributes its length plus one for '\n'.
 *
 * @param lines - Content split on newlines.
 * @param lineIndex - Zero-based line number.
 * @returns Character offset of that line's first character.
 */
private getCharacterPosition(lines: string[], lineIndex: number): number {
  let offset = 0
  for (let i = 0; i < lineIndex; i++) {
    offset += lines[i].length + 1
  }
  return offset
}
/**
 * Re-assemble chunks that would otherwise split a markdown table: any chunk
 * overlapping a table range is expanded to cover the complete table(s).
 *
 * @param chunks - Chunk texts produced by the generic chunker.
 * @param tableBoundaries - Character ranges of tables in `originalContent`.
 * @param originalContent - The content the chunks were derived from.
 * @returns Chunks with tables intact; fragments of 50 chars or less dropped.
 */
private mergeTableChunks(chunks: string[], tableBoundaries: { start: number; end: number }[], originalContent: string): string[] {
  if (tableBoundaries.length === 0) {
    return chunks
  }

  // True when [chunkStart, chunkEnd] and the table's range overlap in any way
  // (start inside, end inside, or chunk fully containing the table).
  const overlaps = (table: { start: number; end: number }, chunkStart: number, chunkEnd: number): boolean =>
    (chunkStart >= table.start && chunkStart <= table.end) ||
    (chunkEnd >= table.start && chunkEnd <= table.end) ||
    (chunkStart <= table.start && chunkEnd >= table.end)

  const mergedChunks: string[] = []
  let currentPosition = 0

  for (const chunk of chunks) {
    const chunkStart = originalContent.indexOf(chunk, currentPosition)

    // BUG FIX: indexOf returns -1 when the chunk text was normalized and no
    // longer appears verbatim in the original content; the original code then
    // ran the overlap tests with bogus coordinates. Pass such chunks through
    // unchanged and leave the search position where it was.
    if (chunkStart === -1) {
      mergedChunks.push(chunk)
      continue
    }

    const chunkEnd = chunkStart + chunk.length
    const affectedTables = tableBoundaries.filter((table) => overlaps(table, chunkStart, chunkEnd))

    if (affectedTables.length > 0) {
      // Expand the chunk so every table it touches is included in full.
      const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
      const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
      const completeChunk = originalContent.slice(minStart, maxEnd).trim()

      // Skip content already covered by an earlier merged chunk.
      if (!mergedChunks.some((existing) => existing.includes(completeChunk))) {
        mergedChunks.push(completeChunk)
      }
    } else {
      mergedChunks.push(chunk)
    }

    currentPosition = chunkEnd
  }

  // Filter out tiny chunks
  return mergedChunks.filter((chunk) => chunk.length > 50)
}
/**
 * Enforce the ~300 estimated-token ceiling on every chunk.
 *
 * Oversized chunks are re-split line-by-line, greedily packing lines while
 * the estimate stays within the limit. Chunks whose trimmed text is 100
 * characters or fewer are dropped from the final result.
 *
 * @param chunks - Candidate chunk texts.
 * @returns Chunks each within the token limit.
 */
private enforceSizeLimit(chunks: string[]): string[] {
  const limited: string[] = []

  for (const chunk of chunks) {
    if (this.estimateTokens(chunk) <= 300) {
      // Within the limit: keep as-is.
      limited.push(chunk)
      continue
    }

    // Too large: rebuild from individual lines without crossing the limit.
    let accumulator = ''
    for (const line of chunk.split('\n')) {
      const candidate = accumulator ? `${accumulator}\n${line}` : line
      if (this.estimateTokens(candidate) <= 300) {
        accumulator = candidate
      } else {
        // Adding this line would exceed the limit; emit and restart.
        if (accumulator.trim()) {
          limited.push(accumulator.trim())
        }
        accumulator = line
      }
    }
    // Emit whatever remains after the last line.
    if (accumulator.trim()) {
      limited.push(accumulator.trim())
    }
  }

  return limited.filter((chunk) => chunk.trim().length > 100)
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env bun
import path from 'path'
import { DocsChunker } from '@/lib/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console-logger'
import { db } from '@/db'
import { docsEmbeddings } from '@/db/schema'
import { sql } from 'drizzle-orm'
// Scoped logger for this script's console output.
const logger = createLogger('ProcessDocsEmbeddings')

/** Options accepted by processDocsEmbeddings; every field has a default. */
interface ProcessingOptions {
  /** Clear existing docs embeddings before processing (default: false) */
  clearExisting?: boolean
  /** Path to docs directory (default: ../../apps/docs/content/docs relative to cwd) */
  docsPath?: string
  /** Base URL for generating links (default: https://docs.simstudio.ai) */
  baseUrl?: string
  /** Chunk size in tokens (default: 300) */
  chunkSize?: number
  /** Minimum chunk size in tokens (default: 100) */
  minChunkSize?: number
  /** Overlap between chunks in tokens (default: 50) */
  overlap?: number
}
/**
 * Production script to process documentation and save embeddings to database.
 *
 * Pipeline: chunk all .mdx docs via DocsChunker → insert the chunks (with
 * embeddings) into docs_embeddings in batches → verify the row count and log
 * a per-document breakdown.
 *
 * @param options - Overrides for paths, chunk sizing, and clearing behavior.
 * @returns Summary with success flag, processed/failed counts, and timing.
 *          Never throws: fatal errors are captured in the returned object.
 */
async function processDocsEmbeddings(options: ProcessingOptions = {}) {
  const startTime = Date.now()
  let processedChunks = 0
  let failedChunks = 0

  try {
    // Configuration with production defaults.
    const config = {
      clearExisting: options.clearExisting ?? false,
      docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs'),
      baseUrl: options.baseUrl ?? 'https://docs.simstudio.ai',
      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
      minChunkSize: options.minChunkSize ?? 100,
      overlap: options.overlap ?? 50,
    }

    logger.info('🚀 Starting docs embedding processing...')
    logger.info(`Configuration:`, {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
    })

    // Clear existing embeddings if requested.
    // (The unused `deleteResult` binding was removed — delete() resolves
    // without a usable row count here.)
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      await db.delete(docsEmbeddings)
      logger.info(`Deleted existing embeddings`)
    }

    // Initialize the docs chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all .mdx files
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Save chunks to database in batches for better performance
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)
      try {
        // Map chunker output onto the docs_embeddings columns.
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        // Insert batch
        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        // Log progress every 5 batches and on the final batch.
        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`)
        }
      } catch (error) {
        // A failed batch is recorded but does not abort the remaining batches.
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify results (await directly instead of mixing await with .then()).
    const countRows = await db
      .select({ count: sql<number>`count(*)` })
      .from(docsEmbeddings)
    const savedCount = countRows[0]?.count ?? 0

    const duration = Date.now() - startTime
    logger.info(`✅ Processing complete!`)
    logger.info(`📊 Results:`)
    logger.info(` • Total chunks processed: ${chunks.length}`)
    logger.info(` • Successfully saved: ${processedChunks}`)
    logger.info(` • Failed: ${failedChunks}`)
    logger.info(` • Database total: ${savedCount}`)
    logger.info(` • Duration: ${Math.round(duration / 1000)}s`)

    // Summary by document (typed reduce seed instead of `{} as Record<...>`).
    const documentStats = chunks.reduce<Record<string, { chunks: number; tokens: number }>>(
      (acc, chunk) => {
        if (!acc[chunk.sourceDocument]) {
          acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
        }
        acc[chunk.sourceDocument].chunks++
        acc[chunk.sourceDocument].tokens += chunk.tokenCount
        return acc
      },
      {}
    )

    logger.info(`📋 Document breakdown:`)
    Object.entries(documentStats)
      .sort(([, a], [, b]) => b.chunks - a.chunks)
      .slice(0, 10) // Top 10 documents
      .forEach(([doc, stats]) => {
        logger.info(`${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
      })

    if (Object.keys(documentStats).length > 10) {
      logger.info(` • ... and ${Object.keys(documentStats).length - 10} more documents`)
    }

    return {
      success: failedChunks === 0,
      processedChunks,
      failedChunks,
      totalChunks: chunks.length,
      databaseCount: savedCount,
      duration,
    }
  } catch (error) {
    logger.error('💥 Fatal error during processing:', error)
    return {
      success: false,
      processedChunks,
      failedChunks,
      error: error instanceof Error ? error.message : 'Unknown error',
    }
  }
}
/**
 * CLI entry point. Recognized flags:
 *   --clear     wipe the docs_embeddings table before re-processing
 *   --help, -h  print usage and exit
 * Exits with code 1 when processing reports failure.
 */
async function main() {
  const args = process.argv.slice(2)

  // Help short-circuits everything else (the original behaved the same:
  // it exited on --help before any processing ran).
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Usage: bun run scripts/process-docs-embeddings.ts [options]
Options:
--clear Clear existing docs embeddings before processing
--help, -h Show this help message
Examples:
bun run scripts/process-docs-embeddings.ts
bun run scripts/process-docs-embeddings.ts --clear
`)
    process.exit(0)
  }

  const options: ProcessingOptions = {}
  if (args.includes('--clear')) {
    options.clearExisting = true
  }

  const result = await processDocsEmbeddings(options)
  if (!result.success) {
    process.exit(1)
  }
}
// Run the script only when executed directly (not when imported).
// NOTE(review): matching on the script filename substring in argv[1] is the
// detection used here — confirm it behaves as intended under renamed or
// symlinked paths.
if (process.argv[1]?.includes('process-docs-embeddings')) {
  main().catch((error) => {
    console.error('Script failed:', error)
    process.exit(1)
  })
}

export { processDocsEmbeddings }