Initial lint

2026-04-06 03:00:16 -04:00 · 2025-07-08 15:36:33 -07:00
parent 850447a604
commit 76c0c56689
3 changed files with 80 additions and 70 deletions
--- a/apps/sim/lib/documents/docs-chunker.ts
+++ b/apps/sim/lib/documents/docs-chunker.ts
@@ -1,8 +1,8 @@
 import fs from 'fs/promises'
 import path from 'path'
 import { createLogger } from '@/lib/logs/console-logger'
-import { TextChunker } from './chunker'
 import { generateEmbeddings } from '@/app/api/knowledge/utils'
+import { TextChunker } from './chunker'
 import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'

 interface Frontmatter {
@@ -35,7 +35,7 @@ export class DocsChunker {
   */
  async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
    const allChunks: DocChunk[] = []
-    
+
    try {
      const mdxFiles = await this.findMdxFiles(docsPath)
      logger.info(`Found ${mdxFiles.length} .mdx files to process`)
@@ -64,36 +64,36 @@ export class DocsChunker {
  async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
    const content = await fs.readFile(filePath, 'utf-8')
    const relativePath = path.relative(basePath, filePath)
-    
+
    // Parse frontmatter and content
    const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
-    
+
    // Extract headers from the content
    const headers = this.extractHeaders(markdownContent)
-    
+
    // Generate document URL
    const documentUrl = this.generateDocumentUrl(relativePath)
-    
+
    // Split content into chunks
    const textChunks = await this.splitContent(markdownContent)
-    
+
    // Generate embeddings for all chunks at once (batch processing)
    logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
    const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
    const embeddingModel = 'text-embedding-3-small'
-    
+
    // Convert to DocChunk objects with header context and embeddings
    const chunks: DocChunk[] = []
    let currentPosition = 0
-    
+
    for (let i = 0; i < textChunks.length; i++) {
      const chunkText = textChunks[i]
      const chunkStart = currentPosition
      const chunkEnd = currentPosition + chunkText.length
-      
+
      // Find the most relevant header for this chunk
      const relevantHeader = this.findRelevantHeader(headers, chunkStart)
-      
+
      const chunk: DocChunk = {
        text: chunkText,
        tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
@@ -111,11 +111,11 @@ export class DocsChunker {
          documentDescription: frontmatter.description,
        },
      }
-      
+
      chunks.push(chunk)
      currentPosition = chunkEnd
    }
-    
+
    return chunks
  }

@@ -124,12 +124,12 @@ export class DocsChunker {
   */
  private async findMdxFiles(dirPath: string): Promise<string[]> {
    const files: string[] = []
-    
+
    const entries = await fs.readdir(dirPath, { withFileTypes: true })
-    
+
    for (const entry of entries) {
      const fullPath = path.join(dirPath, entry.name)
-      
+
      if (entry.isDirectory()) {
        const subFiles = await this.findMdxFiles(fullPath)
        files.push(...subFiles)
@@ -137,7 +137,7 @@ export class DocsChunker {
        files.push(fullPath)
      }
    }
-    
+
    return files
  }

@@ -148,12 +148,12 @@ export class DocsChunker {
    const headers: HeaderInfo[] = []
    const headerRegex = /^(#{1,6})\s+(.+)$/gm
    let match
-    
+
    while ((match = headerRegex.exec(content)) !== null) {
      const level = match[1].length
      const text = match[2].trim()
      const anchor = this.generateAnchor(text)
-      
+
      headers.push({
        text,
        level,
@@ -161,7 +161,7 @@ export class DocsChunker {
        position: match.index,
      })
    }
-    
+
    return headers
  }

@@ -183,10 +183,8 @@ export class DocsChunker {
  private generateDocumentUrl(relativePath: string): string {
    // Convert file path to URL path
    // e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
-    const urlPath = relativePath
-      .replace(/\.mdx$/, '')
-      .replace(/\\/g, '/') // Handle Windows paths
-    
+    const urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths
+
    return `${this.baseUrl}/${urlPath}`
  }

@@ -195,10 +193,10 @@ export class DocsChunker {
   */
  private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
    if (headers.length === 0) return null
-    
+
    // Find the last header that comes before this position
    let relevantHeader: HeaderInfo | null = null
-    
+
    for (const header of headers) {
      if (header.position <= position) {
        relevantHeader = header
@@ -206,7 +204,7 @@ export class DocsChunker {
        break
      }
    }
-    
+
    return relevantHeader
  }

@@ -216,57 +214,59 @@ export class DocsChunker {
  private async splitContent(content: string): Promise<string[]> {
    // Clean the content first
    const cleanedContent = this.cleanContent(content)
-    
+
    // Use the existing TextChunker
    const chunks = await this.textChunker.chunk(cleanedContent)
-    
-    return chunks.map(chunk => chunk.text)
+
+    return chunks.map((chunk) => chunk.text)
  }

  /**
   * Clean content by removing MDX-specific elements and excessive whitespace
   */
  private cleanContent(content: string): string {
-    return content
-      // Remove import statements
-      .replace(/^import\s+.*$/gm, '')
-      // Remove JSX components and React-style comments
-      .replace(/<[^>]+>/g, ' ')
-      .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
-      // Remove excessive whitespace
-      .replace(/\n{3,}/g, '\n\n')
-      .replace(/[ \t]{2,}/g, ' ')
-      .trim()
+    return (
+      content
+        // Remove import statements
+        .replace(/^import\s+.*$/gm, '')
+        // Remove JSX components and React-style comments
+        .replace(/<[^>]+>/g, ' ')
+        .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
+        // Remove excessive whitespace
+        .replace(/\n{3,}/g, '\n\n')
+        .replace(/[ \t]{2,}/g, ' ')
+        .trim()
+    )
  }

-
-
  /**
   * Parse frontmatter from MDX content
   */
  private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
    const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
    const match = content.match(frontmatterRegex)
-    
+
    if (!match) {
      return { data: {}, content }
    }
-    
+
    const [, frontmatterText, markdownContent] = match
    const data: Frontmatter = {}
-    
+
    // Simple YAML parsing for title and description
    const lines = frontmatterText.split('\n')
    for (const line of lines) {
      const colonIndex = line.indexOf(':')
      if (colonIndex > 0) {
        const key = line.slice(0, colonIndex).trim()
-        const value = line.slice(colonIndex + 1).trim().replace(/^['"]|['"]$/g, '')
+        const value = line
+          .slice(colonIndex + 1)
+          .trim()
+          .replace(/^['"]|['"]$/g, '')
        data[key] = value
      }
    }
-    
+
    return { data, content: markdownContent }
  }
-
-} 
+}
--- a/apps/sim/lib/documents/types.ts
+++ b/apps/sim/lib/documents/types.ts
@@ -50,4 +50,4 @@ export interface HeaderInfo {
  anchor: string
  /** Position in document */
  position: number
-} 
+}
--- a/apps/sim/scripts/chunk-docs.ts
+++ b/apps/sim/scripts/chunk-docs.ts
@@ -21,7 +21,7 @@ async function main() {

    // Path to the docs content directory
    const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
-    
+
    logger.info(`Processing docs from: ${docsPath}`)

    // Process all .mdx files
@@ -29,15 +29,18 @@ async function main() {

    logger.info(`\n=== CHUNKING RESULTS ===`)
    logger.info(`Total chunks: ${chunks.length}`)
-    
+
    // Group chunks by document
-    const chunksByDoc = chunks.reduce((acc, chunk) => {
-      if (!acc[chunk.sourceDocument]) {
-        acc[chunk.sourceDocument] = []
-      }
-      acc[chunk.sourceDocument].push(chunk)
-      return acc
-    }, {} as Record<string, typeof chunks>)
+    const chunksByDoc = chunks.reduce(
+      (acc, chunk) => {
+        if (!acc[chunk.sourceDocument]) {
+          acc[chunk.sourceDocument] = []
+        }
+        acc[chunk.sourceDocument].push(chunk)
+        return acc
+      },
+      {} as Record<string, typeof chunks>
+    )

    // Display summary
    logger.info(`\n=== DOCUMENT SUMMARY ===`)
@@ -54,14 +57,19 @@ async function main() {
      logger.info(`  Link: ${chunk.headerLink}`)
      logger.info(`  Tokens: ${chunk.tokenCount}`)
      logger.info(`  Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
-      logger.info(`  Embedding Preview: [${chunk.embedding.slice(0, 5).map(n => n.toFixed(4)).join(', ')}...]`)
+      logger.info(
+        `  Embedding Preview: [${chunk.embedding
+          .slice(0, 5)
+          .map((n) => n.toFixed(4))
+          .join(', ')}...]`
+      )
      logger.info(`  Text Preview: ${chunk.text.slice(0, 100)}...`)
    })

    // Calculate total token count
    const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
-    const chunksWithEmbeddings = chunks.filter(chunk => chunk.embedding.length > 0).length
-    
+    const chunksWithEmbeddings = chunks.filter((chunk) => chunk.embedding.length > 0).length
+
    logger.info(`\n=== STATISTICS ===`)
    logger.info(`Total tokens: ${totalTokens}`)
    logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
@@ -70,19 +78,21 @@ async function main() {
      logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
      logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
    }
-    
-    const headerLevels = chunks.reduce((acc, chunk) => {
-      acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
-      return acc
-    }, {} as Record<number, number>)
-    
+
+    const headerLevels = chunks.reduce(
+      (acc, chunk) => {
+        acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
+        return acc
+      },
+      {} as Record<number, number>
+    )
+
    logger.info(`Header level distribution:`)
    Object.entries(headerLevels)
      .sort(([a], [b]) => Number(a) - Number(b))
      .forEach(([level, count]) => {
        logger.info(`  H${level}: ${count} chunks`)
      })
-
  } catch (error) {
    logger.error('Error processing docs:', error)
    process.exit(1)
@@ -90,4 +100,4 @@ async function main() {
 }

 // Run the script
-main().catch(console.error) 
+main().catch(console.error)