Initial commit

This commit is contained in:
Siddharth Ganesan
2025-07-08 15:36:25 -07:00
parent 0f21fbf705
commit 850447a604
3 changed files with 418 additions and 0 deletions

View File

@@ -0,0 +1,272 @@
import fs from 'fs/promises'
import path from 'path'
import { createLogger } from '@/lib/logs/console-logger'
import { TextChunker } from './chunker'
import { generateEmbeddings } from '@/app/api/knowledge/utils'
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'
/**
 * Shape of the YAML frontmatter parsed out of an .mdx file.
 * Only `title` and `description` are read directly; the index signature
 * admits any other key (parseFrontmatter stores those values as plain
 * strings, but the type is left open here).
 */
interface Frontmatter {
  title?: string
  description?: string
  [key: string]: any
}

// Module-scoped logger tagged with this component's name.
const logger = createLogger('DocsChunker')
/**
 * Docs-specific chunker that processes .mdx files and tracks header context
 *
 * For each file it: strips frontmatter, records header positions, cleans
 * MDX-specific syntax, splits the body via TextChunker, batch-embeds the
 * chunks, and attaches per-chunk header links and metadata.
 */
export class DocsChunker {
  // Generic text splitter that performs the actual chunking.
  private readonly textChunker: TextChunker
  // Base URL prepended when building links back to the published docs site.
  private readonly baseUrl: string

  /**
   * @param options Chunking configuration; each omitted field falls back to
   *   a default (chunkSize 1024, minChunkSize 100, overlap 200, baseUrl
   *   'https://docs.simstudio.ai').
   */
  constructor(options: DocsChunkerOptions = {}) {
    // Use the existing TextChunker for chunking logic
    this.textChunker = new TextChunker({
      chunkSize: options.chunkSize ?? 1024,
      minChunkSize: options.minChunkSize ?? 100,
      overlap: options.overlap ?? 200,
    })
    this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
  }

  /**
   * Process all .mdx files in the docs directory
   *
   * Walks `docsPath` recursively and chunks every .mdx file found.
   * A failure on an individual file is logged and skipped; a failure while
   * discovering files is logged and rethrown.
   *
   * @param docsPath Root directory to scan
   * @returns Chunks from every successfully processed file, in discovery order
   */
  async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
    const allChunks: DocChunk[] = []
    try {
      const mdxFiles = await this.findMdxFiles(docsPath)
      logger.info(`Found ${mdxFiles.length} .mdx files to process`)
      for (const filePath of mdxFiles) {
        try {
          const chunks = await this.chunkMdxFile(filePath, docsPath)
          allChunks.push(...chunks)
          logger.info(`Processed ${filePath}: ${chunks.length} chunks`)
        } catch (error) {
          // Per-file errors are non-fatal: log and continue with the rest.
          logger.error(`Error processing ${filePath}:`, error)
        }
      }
      logger.info(`Total chunks generated: ${allChunks.length}`)
      return allChunks
    } catch (error) {
      logger.error('Error processing docs:', error)
      throw error
    }
  }

  /**
   * Process a single .mdx file
   *
   * Reads the file, separates frontmatter from the markdown body, extracts
   * headers, splits the body into chunks, generates embeddings for all
   * chunks in one batch, and builds a DocChunk for each.
   *
   * @param filePath Absolute path of the .mdx file
   * @param basePath Docs root used to compute the relative source path
   * @returns One DocChunk per text chunk (empty array if nothing to chunk)
   */
  async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
    const content = await fs.readFile(filePath, 'utf-8')
    const relativePath = path.relative(basePath, filePath)
    // Parse frontmatter and content
    const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
    // Extract headers from the content
    const headers = this.extractHeaders(markdownContent)
    // Generate document URL
    const documentUrl = this.generateDocumentUrl(relativePath)
    // Split content into chunks
    const textChunks = await this.splitContent(markdownContent)
    // Generate embeddings for all chunks at once (batch processing)
    logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
    const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
    const embeddingModel = 'text-embedding-3-small'
    // Convert to DocChunk objects with header context and embeddings
    const chunks: DocChunk[] = []
    // NOTE(review): header positions are offsets into the raw markdown body,
    // while chunk offsets below are accumulated over the *cleaned* chunk
    // text (cleanContent strips imports/JSX and collapses whitespace, and
    // overlapping chunks double-count the overlapped region). The positions
    // handed to findRelevantHeader are therefore approximate — confirm the
    // drift is acceptable for header attribution.
    let currentPosition = 0
    for (let i = 0; i < textChunks.length; i++) {
      const chunkText = textChunks[i]
      const chunkStart = currentPosition
      const chunkEnd = currentPosition + chunkText.length
      // Find the most relevant header for this chunk
      const relevantHeader = this.findRelevantHeader(headers, chunkStart)
      const chunk: DocChunk = {
        text: chunkText,
        tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
        sourceDocument: relativePath,
        headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
        headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
        headerLevel: relevantHeader?.level || 1,
        // Falls back to an empty vector if the batch returned fewer
        // embeddings than chunks.
        embedding: embeddings[i] || [],
        embeddingModel,
        metadata: {
          startIndex: chunkStart,
          endIndex: chunkEnd,
          // Only the first chunk can carry frontmatter; checked against the
          // raw file content, even though frontmatter was stripped above.
          hasFrontmatter: i === 0 && content.startsWith('---'),
          documentTitle: frontmatter.title,
          documentDescription: frontmatter.description,
        },
      }
      chunks.push(chunk)
      currentPosition = chunkEnd
    }
    return chunks
  }

  /**
   * Find all .mdx files recursively
   *
   * @param dirPath Directory to walk
   * @returns Absolute paths of every .mdx file under dirPath
   */
  private async findMdxFiles(dirPath: string): Promise<string[]> {
    const files: string[] = []
    const entries = await fs.readdir(dirPath, { withFileTypes: true })
    for (const entry of entries) {
      const fullPath = path.join(dirPath, entry.name)
      if (entry.isDirectory()) {
        const subFiles = await this.findMdxFiles(fullPath)
        files.push(...subFiles)
      } else if (entry.isFile() && entry.name.endsWith('.mdx')) {
        files.push(fullPath)
      }
    }
    return files
  }

  /**
   * Extract headers and their positions from markdown content
   *
   * Matches ATX-style headers (`#` through `######`) at the start of a line.
   * `position` is the character offset of the header within `content`.
   * NOTE(review): the regex will also match `#`-prefixed lines inside fenced
   * code blocks — verify docs content doesn't contain such lines.
   *
   * @param content Markdown body (frontmatter already removed)
   * @returns Headers in document order
   */
  private extractHeaders(content: string): HeaderInfo[] {
    const headers: HeaderInfo[] = []
    const headerRegex = /^(#{1,6})\s+(.+)$/gm
    let match
    while ((match = headerRegex.exec(content)) !== null) {
      const level = match[1].length
      const text = match[2].trim()
      const anchor = this.generateAnchor(text)
      headers.push({
        text,
        level,
        anchor,
        position: match.index,
      })
    }
    return headers
  }

  /**
   * Generate URL-safe anchor from header text
   *
   * Lowercases, drops special characters, and hyphenates whitespace.
   * NOTE(review): assumes the docs site slugifies headers the same way —
   * confirm against the site's anchor generation.
   */
  private generateAnchor(headerText: string): string {
    return headerText
      .toLowerCase()
      .replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
      .replace(/\s+/g, '-') // Replace spaces with hyphens
      .replace(/-+/g, '-') // Replace multiple hyphens with single
      .replace(/^-|-$/g, '') // Remove leading/trailing hyphens
  }

  /**
   * Generate document URL from relative path
   *
   * @param relativePath File path relative to the docs root
   * @returns Fully-qualified URL under `baseUrl`
   */
  private generateDocumentUrl(relativePath: string): string {
    // Convert file path to URL path
    // e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
    const urlPath = relativePath
      .replace(/\.mdx$/, '')
      .replace(/\\/g, '/') // Handle Windows paths
    return `${this.baseUrl}/${urlPath}`
  }

  /**
   * Find the most relevant header for a given position
   *
   * Headers are in document order, so this returns the last header at or
   * before `position`, or null if none precedes it.
   */
  private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
    if (headers.length === 0) return null
    // Find the last header that comes before this position
    let relevantHeader: HeaderInfo | null = null
    for (const header of headers) {
      if (header.position <= position) {
        relevantHeader = header
      } else {
        break
      }
    }
    return relevantHeader
  }

  /**
   * Split content into chunks using the existing TextChunker
   *
   * @param content Raw markdown body
   * @returns Chunk texts only (positions/metadata from TextChunker dropped)
   */
  private async splitContent(content: string): Promise<string[]> {
    // Clean the content first
    const cleanedContent = this.cleanContent(content)
    // Use the existing TextChunker
    const chunks = await this.textChunker.chunk(cleanedContent)
    return chunks.map(chunk => chunk.text)
  }

  /**
   * Clean content by removing MDX-specific elements and excessive whitespace
   *
   * Strips import statements, anything that looks like a JSX tag, JSX
   * comments, and collapses runs of blank lines and inline whitespace.
   */
  private cleanContent(content: string): string {
    return content
      // Remove import statements
      .replace(/^import\s+.*$/gm, '')
      // Remove JSX components and React-style comments
      .replace(/<[^>]+>/g, ' ')
      .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
      // Remove excessive whitespace
      .replace(/\n{3,}/g, '\n\n')
      .replace(/[ \t]{2,}/g, ' ')
      .trim()
  }

  /**
   * Parse frontmatter from MDX content
   *
   * Expects a leading `---` fence pair (CRLF tolerated). Values are parsed
   * as plain `key: value` strings with surrounding quotes stripped — nested
   * YAML structures are not supported. Returns an empty frontmatter object
   * and the original content when no fence is present.
   */
  private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
    const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
    const match = content.match(frontmatterRegex)
    if (!match) {
      return { data: {}, content }
    }
    const [, frontmatterText, markdownContent] = match
    const data: Frontmatter = {}
    // Simple YAML parsing for title and description
    const lines = frontmatterText.split('\n')
    for (const line of lines) {
      const colonIndex = line.indexOf(':')
      if (colonIndex > 0) {
        const key = line.slice(0, colonIndex).trim()
        const value = line.slice(colonIndex + 1).trim().replace(/^['"]|['"]$/g, '')
        data[key] = value
      }
    }
    return { data, content: markdownContent }
  }
}

View File

@@ -0,0 +1,53 @@
/**
 * One embedded chunk of documentation text, together with the header section
 * it was attributed to and positional metadata from the source document.
 */
export interface DocChunk {
  /** The chunk text content */
  text: string
  /** Token count estimate for the chunk */
  tokenCount: number
  /** Source document path relative to docs/ */
  sourceDocument: string
  /** Link to the most relevant header section */
  headerLink: string
  /** The header text that this chunk belongs to */
  headerText: string
  /** Header level (1-6) */
  headerLevel: number
  /** OpenAI text embedding vector (1536 dimensions for text-embedding-3-small) */
  embedding: number[]
  /** Model used to generate the embedding */
  embeddingModel: string
  /** Metadata about the chunk */
  metadata: {
    /** Start position in the original document */
    startIndex: number
    /** End position in the original document */
    endIndex: number
    /** Whether this chunk contains the document frontmatter */
    hasFrontmatter?: boolean
    /** Document title from frontmatter */
    documentTitle?: string
    /** Document description from frontmatter */
    documentDescription?: string
  }
}
/**
 * Configuration accepted by DocsChunker. Every field is optional; the
 * chunker supplies defaults for any omitted value.
 */
export interface DocsChunkerOptions {
  /** Target chunk size in tokens */
  chunkSize?: number
  /** Minimum chunk size in tokens */
  minChunkSize?: number
  /** Overlap between chunks in tokens */
  overlap?: number
  /** Base URL for generating links */
  baseUrl?: string
}
/**
 * A markdown header discovered in a document, with the anchor derived from
 * its text and its character offset in the document body.
 */
export interface HeaderInfo {
  /** Header text */
  text: string
  /** Header level (1-6) */
  level: number
  /** Anchor link */
  anchor: string
  /** Position in document */
  position: number
}

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env bun
import path from 'path'
import { DocsChunker } from '@/lib/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console-logger'
const logger = createLogger('ChunkDocsScript')

/**
 * Script to chunk all .mdx files in the docs directory
 *
 * Logs a per-document summary, a few sample chunks, and aggregate
 * statistics (token counts, embedding coverage, header-level distribution).
 * Exits with code 1 on failure.
 */
async function main(): Promise<void> {
  try {
    // Initialize the docs chunker
    const chunker = new DocsChunker({
      chunkSize: 1024,
      minChunkSize: 100,
      overlap: 200,
      baseUrl: 'https://docs.simstudio.ai',
    })

    // Path to the docs content directory (relative to this package's cwd).
    const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
    logger.info(`Processing docs from: ${docsPath}`)

    // Process all .mdx files
    const chunks = await chunker.chunkAllDocs(docsPath)
    logger.info(`\n=== CHUNKING RESULTS ===`)
    logger.info(`Total chunks: ${chunks.length}`)

    // Group chunks by source document for the summary.
    const chunksByDoc = chunks.reduce((acc, chunk) => {
      if (!acc[chunk.sourceDocument]) {
        acc[chunk.sourceDocument] = []
      }
      acc[chunk.sourceDocument].push(chunk)
      return acc
    }, {} as Record<string, typeof chunks>)

    // Display summary
    logger.info(`\n=== DOCUMENT SUMMARY ===`)
    for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
      logger.info(`${doc}: ${docChunks.length} chunks`)
    }

    // Display a few sample chunks
    logger.info(`\n=== SAMPLE CHUNKS ===`)
    chunks.slice(0, 3).forEach((chunk, index) => {
      logger.info(`\nChunk ${index + 1}:`)
      logger.info(`  Source: ${chunk.sourceDocument}`)
      logger.info(`  Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
      logger.info(`  Link: ${chunk.headerLink}`)
      logger.info(`  Tokens: ${chunk.tokenCount}`)
      logger.info(`  Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
      logger.info(`  Embedding Preview: [${chunk.embedding.slice(0, 5).map(n => n.toFixed(4)).join(', ')}...]`)
      logger.info(`  Text Preview: ${chunk.text.slice(0, 100)}...`)
    })

    // Calculate total token count
    const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
    const chunksWithEmbeddings = chunks.filter(chunk => chunk.embedding.length > 0).length
    logger.info(`\n=== STATISTICS ===`)
    logger.info(`Total tokens: ${totalTokens}`)
    // Guard the average: with zero chunks the division produced NaN.
    const averageTokens = chunks.length > 0 ? Math.round(totalTokens / chunks.length) : 0
    logger.info(`Average tokens per chunk: ${averageTokens}`)
    logger.info(`Chunks with embeddings: ${chunksWithEmbeddings}/${chunks.length}`)
    if (chunks.length > 0 && chunks[0].embedding.length > 0) {
      logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
      logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
    }

    // Count chunks per header level (H1..H6).
    const headerLevels = chunks.reduce((acc, chunk) => {
      acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
      return acc
    }, {} as Record<number, number>)
    logger.info(`Header level distribution:`)
    Object.entries(headerLevels)
      .sort(([a], [b]) => Number(a) - Number(b))
      .forEach(([level, count]) => {
        logger.info(`  H${level}: ${count} chunks`)
      })
  } catch (error) {
    logger.error('Error processing docs:', error)
    process.exit(1)
  }
}

// Run the script. Previously an error escaping main() was only logged via
// .catch(console.error) and the process still exited 0; now it exits 1,
// matching main()'s own error handling.
main().catch(error => {
  console.error(error)
  process.exit(1)
})