mirror of
https://github.com/simstudioai/sim.git
synced 2026-04-06 03:00:16 -04:00
Initial commit
This commit is contained in:
272
apps/sim/lib/documents/docs-chunker.ts
Normal file
272
apps/sim/lib/documents/docs-chunker.ts
Normal file
@@ -0,0 +1,272 @@
|
||||
import fs from 'fs/promises'
|
||||
import path from 'path'
|
||||
import { createLogger } from '@/lib/logs/console-logger'
|
||||
import { TextChunker } from './chunker'
|
||||
import { generateEmbeddings } from '@/app/api/knowledge/utils'
|
||||
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'
|
||||
|
||||
interface Frontmatter {
|
||||
title?: string
|
||||
description?: string
|
||||
[key: string]: any
|
||||
}
|
||||
|
||||
// Module-level logger scoped to this chunker (project console-logger helper).
const logger = createLogger('DocsChunker')
|
||||
|
||||
/**
|
||||
* Docs-specific chunker that processes .mdx files and tracks header context
|
||||
*/
|
||||
export class DocsChunker {
|
||||
private readonly textChunker: TextChunker
|
||||
private readonly baseUrl: string
|
||||
|
||||
constructor(options: DocsChunkerOptions = {}) {
|
||||
// Use the existing TextChunker for chunking logic
|
||||
this.textChunker = new TextChunker({
|
||||
chunkSize: options.chunkSize ?? 1024,
|
||||
minChunkSize: options.minChunkSize ?? 100,
|
||||
overlap: options.overlap ?? 200,
|
||||
})
|
||||
this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
|
||||
}
|
||||
|
||||
/**
|
||||
* Process all .mdx files in the docs directory
|
||||
*/
|
||||
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
|
||||
const allChunks: DocChunk[] = []
|
||||
|
||||
try {
|
||||
const mdxFiles = await this.findMdxFiles(docsPath)
|
||||
logger.info(`Found ${mdxFiles.length} .mdx files to process`)
|
||||
|
||||
for (const filePath of mdxFiles) {
|
||||
try {
|
||||
const chunks = await this.chunkMdxFile(filePath, docsPath)
|
||||
allChunks.push(...chunks)
|
||||
logger.info(`Processed ${filePath}: ${chunks.length} chunks`)
|
||||
} catch (error) {
|
||||
logger.error(`Error processing ${filePath}:`, error)
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(`Total chunks generated: ${allChunks.length}`)
|
||||
return allChunks
|
||||
} catch (error) {
|
||||
logger.error('Error processing docs:', error)
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a single .mdx file
|
||||
*/
|
||||
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
|
||||
const content = await fs.readFile(filePath, 'utf-8')
|
||||
const relativePath = path.relative(basePath, filePath)
|
||||
|
||||
// Parse frontmatter and content
|
||||
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
|
||||
|
||||
// Extract headers from the content
|
||||
const headers = this.extractHeaders(markdownContent)
|
||||
|
||||
// Generate document URL
|
||||
const documentUrl = this.generateDocumentUrl(relativePath)
|
||||
|
||||
// Split content into chunks
|
||||
const textChunks = await this.splitContent(markdownContent)
|
||||
|
||||
// Generate embeddings for all chunks at once (batch processing)
|
||||
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
|
||||
const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
|
||||
const embeddingModel = 'text-embedding-3-small'
|
||||
|
||||
// Convert to DocChunk objects with header context and embeddings
|
||||
const chunks: DocChunk[] = []
|
||||
let currentPosition = 0
|
||||
|
||||
for (let i = 0; i < textChunks.length; i++) {
|
||||
const chunkText = textChunks[i]
|
||||
const chunkStart = currentPosition
|
||||
const chunkEnd = currentPosition + chunkText.length
|
||||
|
||||
// Find the most relevant header for this chunk
|
||||
const relevantHeader = this.findRelevantHeader(headers, chunkStart)
|
||||
|
||||
const chunk: DocChunk = {
|
||||
text: chunkText,
|
||||
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
|
||||
sourceDocument: relativePath,
|
||||
headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
|
||||
headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
|
||||
headerLevel: relevantHeader?.level || 1,
|
||||
embedding: embeddings[i] || [],
|
||||
embeddingModel,
|
||||
metadata: {
|
||||
startIndex: chunkStart,
|
||||
endIndex: chunkEnd,
|
||||
hasFrontmatter: i === 0 && content.startsWith('---'),
|
||||
documentTitle: frontmatter.title,
|
||||
documentDescription: frontmatter.description,
|
||||
},
|
||||
}
|
||||
|
||||
chunks.push(chunk)
|
||||
currentPosition = chunkEnd
|
||||
}
|
||||
|
||||
return chunks
|
||||
}
|
||||
|
||||
/**
|
||||
* Find all .mdx files recursively
|
||||
*/
|
||||
private async findMdxFiles(dirPath: string): Promise<string[]> {
|
||||
const files: string[] = []
|
||||
|
||||
const entries = await fs.readdir(dirPath, { withFileTypes: true })
|
||||
|
||||
for (const entry of entries) {
|
||||
const fullPath = path.join(dirPath, entry.name)
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const subFiles = await this.findMdxFiles(fullPath)
|
||||
files.push(...subFiles)
|
||||
} else if (entry.isFile() && entry.name.endsWith('.mdx')) {
|
||||
files.push(fullPath)
|
||||
}
|
||||
}
|
||||
|
||||
return files
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract headers and their positions from markdown content
|
||||
*/
|
||||
private extractHeaders(content: string): HeaderInfo[] {
|
||||
const headers: HeaderInfo[] = []
|
||||
const headerRegex = /^(#{1,6})\s+(.+)$/gm
|
||||
let match
|
||||
|
||||
while ((match = headerRegex.exec(content)) !== null) {
|
||||
const level = match[1].length
|
||||
const text = match[2].trim()
|
||||
const anchor = this.generateAnchor(text)
|
||||
|
||||
headers.push({
|
||||
text,
|
||||
level,
|
||||
anchor,
|
||||
position: match.index,
|
||||
})
|
||||
}
|
||||
|
||||
return headers
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate URL-safe anchor from header text
|
||||
*/
|
||||
private generateAnchor(headerText: string): string {
|
||||
return headerText
|
||||
.toLowerCase()
|
||||
.replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
|
||||
.replace(/\s+/g, '-') // Replace spaces with hyphens
|
||||
.replace(/-+/g, '-') // Replace multiple hyphens with single
|
||||
.replace(/^-|-$/g, '') // Remove leading/trailing hyphens
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate document URL from relative path
|
||||
*/
|
||||
private generateDocumentUrl(relativePath: string): string {
|
||||
// Convert file path to URL path
|
||||
// e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
|
||||
const urlPath = relativePath
|
||||
.replace(/\.mdx$/, '')
|
||||
.replace(/\\/g, '/') // Handle Windows paths
|
||||
|
||||
return `${this.baseUrl}/${urlPath}`
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the most relevant header for a given position
|
||||
*/
|
||||
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
|
||||
if (headers.length === 0) return null
|
||||
|
||||
// Find the last header that comes before this position
|
||||
let relevantHeader: HeaderInfo | null = null
|
||||
|
||||
for (const header of headers) {
|
||||
if (header.position <= position) {
|
||||
relevantHeader = header
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return relevantHeader
|
||||
}
|
||||
|
||||
/**
|
||||
* Split content into chunks using the existing TextChunker
|
||||
*/
|
||||
private async splitContent(content: string): Promise<string[]> {
|
||||
// Clean the content first
|
||||
const cleanedContent = this.cleanContent(content)
|
||||
|
||||
// Use the existing TextChunker
|
||||
const chunks = await this.textChunker.chunk(cleanedContent)
|
||||
|
||||
return chunks.map(chunk => chunk.text)
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean content by removing MDX-specific elements and excessive whitespace
|
||||
*/
|
||||
private cleanContent(content: string): string {
|
||||
return content
|
||||
// Remove import statements
|
||||
.replace(/^import\s+.*$/gm, '')
|
||||
// Remove JSX components and React-style comments
|
||||
.replace(/<[^>]+>/g, ' ')
|
||||
.replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
|
||||
// Remove excessive whitespace
|
||||
.replace(/\n{3,}/g, '\n\n')
|
||||
.replace(/[ \t]{2,}/g, ' ')
|
||||
.trim()
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Parse frontmatter from MDX content
|
||||
*/
|
||||
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
|
||||
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
|
||||
const match = content.match(frontmatterRegex)
|
||||
|
||||
if (!match) {
|
||||
return { data: {}, content }
|
||||
}
|
||||
|
||||
const [, frontmatterText, markdownContent] = match
|
||||
const data: Frontmatter = {}
|
||||
|
||||
// Simple YAML parsing for title and description
|
||||
const lines = frontmatterText.split('\n')
|
||||
for (const line of lines) {
|
||||
const colonIndex = line.indexOf(':')
|
||||
if (colonIndex > 0) {
|
||||
const key = line.slice(0, colonIndex).trim()
|
||||
const value = line.slice(colonIndex + 1).trim().replace(/^['"]|['"]$/g, '')
|
||||
data[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
return { data, content: markdownContent }
|
||||
}
|
||||
|
||||
}
|
||||
53
apps/sim/lib/documents/types.ts
Normal file
53
apps/sim/lib/documents/types.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
export interface DocChunk {
|
||||
/** The chunk text content */
|
||||
text: string
|
||||
/** Token count estimate for the chunk */
|
||||
tokenCount: number
|
||||
/** Source document path relative to docs/ */
|
||||
sourceDocument: string
|
||||
/** Link to the most relevant header section */
|
||||
headerLink: string
|
||||
/** The header text that this chunk belongs to */
|
||||
headerText: string
|
||||
/** Header level (1-6) */
|
||||
headerLevel: number
|
||||
/** OpenAI text embedding vector (1536 dimensions for text-embedding-3-small) */
|
||||
embedding: number[]
|
||||
/** Model used to generate the embedding */
|
||||
embeddingModel: string
|
||||
/** Metadata about the chunk */
|
||||
metadata: {
|
||||
/** Start position in the original document */
|
||||
startIndex: number
|
||||
/** End position in the original document */
|
||||
endIndex: number
|
||||
/** Whether this chunk contains the document frontmatter */
|
||||
hasFrontmatter?: boolean
|
||||
/** Document title from frontmatter */
|
||||
documentTitle?: string
|
||||
/** Document description from frontmatter */
|
||||
documentDescription?: string
|
||||
}
|
||||
}
|
||||
|
||||
export interface DocsChunkerOptions {
|
||||
/** Target chunk size in tokens */
|
||||
chunkSize?: number
|
||||
/** Minimum chunk size in tokens */
|
||||
minChunkSize?: number
|
||||
/** Overlap between chunks in tokens */
|
||||
overlap?: number
|
||||
/** Base URL for generating links */
|
||||
baseUrl?: string
|
||||
}
|
||||
|
||||
export interface HeaderInfo {
|
||||
/** Header text */
|
||||
text: string
|
||||
/** Header level (1-6) */
|
||||
level: number
|
||||
/** Anchor link */
|
||||
anchor: string
|
||||
/** Position in document */
|
||||
position: number
|
||||
}
|
||||
93
apps/sim/scripts/chunk-docs.ts
Normal file
93
apps/sim/scripts/chunk-docs.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import path from 'path'
|
||||
import { DocsChunker } from '@/lib/documents/docs-chunker'
|
||||
import { createLogger } from '@/lib/logs/console-logger'
|
||||
|
||||
// Logger for this script's console output (project console-logger helper).
const logger = createLogger('ChunkDocsScript')
|
||||
|
||||
/**
|
||||
* Script to chunk all .mdx files in the docs directory
|
||||
*/
|
||||
async function main() {
|
||||
try {
|
||||
// Initialize the docs chunker
|
||||
const chunker = new DocsChunker({
|
||||
chunkSize: 1024,
|
||||
minChunkSize: 100,
|
||||
overlap: 200,
|
||||
baseUrl: 'https://docs.simstudio.ai',
|
||||
})
|
||||
|
||||
// Path to the docs content directory
|
||||
const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
|
||||
|
||||
logger.info(`Processing docs from: ${docsPath}`)
|
||||
|
||||
// Process all .mdx files
|
||||
const chunks = await chunker.chunkAllDocs(docsPath)
|
||||
|
||||
logger.info(`\n=== CHUNKING RESULTS ===`)
|
||||
logger.info(`Total chunks: ${chunks.length}`)
|
||||
|
||||
// Group chunks by document
|
||||
const chunksByDoc = chunks.reduce((acc, chunk) => {
|
||||
if (!acc[chunk.sourceDocument]) {
|
||||
acc[chunk.sourceDocument] = []
|
||||
}
|
||||
acc[chunk.sourceDocument].push(chunk)
|
||||
return acc
|
||||
}, {} as Record<string, typeof chunks>)
|
||||
|
||||
// Display summary
|
||||
logger.info(`\n=== DOCUMENT SUMMARY ===`)
|
||||
for (const [doc, docChunks] of Object.entries(chunksByDoc)) {
|
||||
logger.info(`${doc}: ${docChunks.length} chunks`)
|
||||
}
|
||||
|
||||
// Display a few sample chunks
|
||||
logger.info(`\n=== SAMPLE CHUNKS ===`)
|
||||
chunks.slice(0, 3).forEach((chunk, index) => {
|
||||
logger.info(`\nChunk ${index + 1}:`)
|
||||
logger.info(` Source: ${chunk.sourceDocument}`)
|
||||
logger.info(` Header: ${chunk.headerText} (Level ${chunk.headerLevel})`)
|
||||
logger.info(` Link: ${chunk.headerLink}`)
|
||||
logger.info(` Tokens: ${chunk.tokenCount}`)
|
||||
logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
|
||||
logger.info(` Embedding Preview: [${chunk.embedding.slice(0, 5).map(n => n.toFixed(4)).join(', ')}...]`)
|
||||
logger.info(` Text Preview: ${chunk.text.slice(0, 100)}...`)
|
||||
})
|
||||
|
||||
// Calculate total token count
|
||||
const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
|
||||
const chunksWithEmbeddings = chunks.filter(chunk => chunk.embedding.length > 0).length
|
||||
|
||||
logger.info(`\n=== STATISTICS ===`)
|
||||
logger.info(`Total tokens: ${totalTokens}`)
|
||||
logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
|
||||
logger.info(`Chunks with embeddings: ${chunksWithEmbeddings}/${chunks.length}`)
|
||||
if (chunks.length > 0 && chunks[0].embedding.length > 0) {
|
||||
logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
|
||||
logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
|
||||
}
|
||||
|
||||
const headerLevels = chunks.reduce((acc, chunk) => {
|
||||
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
|
||||
return acc
|
||||
}, {} as Record<number, number>)
|
||||
|
||||
logger.info(`Header level distribution:`)
|
||||
Object.entries(headerLevels)
|
||||
.sort(([a], [b]) => Number(a) - Number(b))
|
||||
.forEach(([level, count]) => {
|
||||
logger.info(` H${level}: ${count} chunks`)
|
||||
})
|
||||
|
||||
} catch (error) {
|
||||
logger.error('Error processing docs:', error)
|
||||
process.exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// Run the script; any rejection that escapes main's own try/catch
// (e.g. thrown synchronously before the try) is logged to the console here.
main().catch(console.error)
|
||||
Reference in New Issue
Block a user