Initial lint

This commit is contained in:
Siddharth Ganesan
2025-07-08 15:36:33 -07:00
parent 850447a604
commit 76c0c56689
3 changed files with 80 additions and 70 deletions

View File

@@ -1,8 +1,8 @@
import fs from 'fs/promises'
import path from 'path'
import { createLogger } from '@/lib/logs/console-logger'
import { TextChunker } from './chunker'
import { generateEmbeddings } from '@/app/api/knowledge/utils'
import { TextChunker } from './chunker'
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from './types'
interface Frontmatter {
@@ -35,7 +35,7 @@ export class DocsChunker {
*/
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
const allChunks: DocChunk[] = []
try {
const mdxFiles = await this.findMdxFiles(docsPath)
logger.info(`Found ${mdxFiles.length} .mdx files to process`)
@@ -64,36 +64,36 @@ export class DocsChunker {
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
const content = await fs.readFile(filePath, 'utf-8')
const relativePath = path.relative(basePath, filePath)
// Parse frontmatter and content
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
// Extract headers from the content
const headers = this.extractHeaders(markdownContent)
// Generate document URL
const documentUrl = this.generateDocumentUrl(relativePath)
// Split content into chunks
const textChunks = await this.splitContent(markdownContent)
// Generate embeddings for all chunks at once (batch processing)
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
const embeddings = textChunks.length > 0 ? await generateEmbeddings(textChunks) : []
const embeddingModel = 'text-embedding-3-small'
// Convert to DocChunk objects with header context and embeddings
const chunks: DocChunk[] = []
let currentPosition = 0
for (let i = 0; i < textChunks.length; i++) {
const chunkText = textChunks[i]
const chunkStart = currentPosition
const chunkEnd = currentPosition + chunkText.length
// Find the most relevant header for this chunk
const relevantHeader = this.findRelevantHeader(headers, chunkStart)
const chunk: DocChunk = {
text: chunkText,
tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
@@ -111,11 +111,11 @@ export class DocsChunker {
documentDescription: frontmatter.description,
},
}
chunks.push(chunk)
currentPosition = chunkEnd
}
return chunks
}
@@ -124,12 +124,12 @@ export class DocsChunker {
*/
private async findMdxFiles(dirPath: string): Promise<string[]> {
const files: string[] = []
const entries = await fs.readdir(dirPath, { withFileTypes: true })
for (const entry of entries) {
const fullPath = path.join(dirPath, entry.name)
if (entry.isDirectory()) {
const subFiles = await this.findMdxFiles(fullPath)
files.push(...subFiles)
@@ -137,7 +137,7 @@ export class DocsChunker {
files.push(fullPath)
}
}
return files
}
@@ -148,12 +148,12 @@ export class DocsChunker {
const headers: HeaderInfo[] = []
const headerRegex = /^(#{1,6})\s+(.+)$/gm
let match
while ((match = headerRegex.exec(content)) !== null) {
const level = match[1].length
const text = match[2].trim()
const anchor = this.generateAnchor(text)
headers.push({
text,
level,
@@ -161,7 +161,7 @@ export class DocsChunker {
position: match.index,
})
}
return headers
}
@@ -183,10 +183,8 @@ export class DocsChunker {
/**
 * Build the documentation URL that corresponds to an MDX file.
 *
 * @param relativePath - File path relative to the docs root,
 *   e.g. "tools/knowledge.mdx".
 * @returns `this.baseUrl`, a slash, and the path with its ".mdx" suffix
 *   removed and every backslash turned into a forward slash.
 */
private generateDocumentUrl(relativePath: string): string {
  // Strip the extension first, then normalise Windows separators so the
  // result is a valid URL path regardless of the host OS.
  const urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/')
  return `${this.baseUrl}/${urlPath}`
}
@@ -195,10 +193,10 @@ export class DocsChunker {
*/
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
if (headers.length === 0) return null
// Find the last header that comes before this position
let relevantHeader: HeaderInfo | null = null
for (const header of headers) {
if (header.position <= position) {
relevantHeader = header
@@ -206,7 +204,7 @@ export class DocsChunker {
break
}
}
return relevantHeader
}
@@ -216,57 +214,59 @@ export class DocsChunker {
/**
 * Clean raw MDX text and split it into chunk strings.
 *
 * @param content - Markdown/MDX body text.
 * @returns The `text` of every chunk produced by `this.textChunker`, in the
 *   order the chunker returned them.
 */
private async splitContent(content: string): Promise<string[]> {
  // Remove MDX-only syntax before chunking.
  const cleanedContent = this.cleanContent(content)

  // Delegate the actual splitting to the shared TextChunker instance.
  const chunks = await this.textChunker.chunk(cleanedContent)
  return chunks.map((chunk) => chunk.text)
}
/**
 * Clean content by removing MDX-specific elements and excessive whitespace.
 *
 * The replacements run in a fixed order: import lines, angle-bracket tags,
 * brace-wrapped block comments, then whitespace normalisation.
 *
 * @param content - MDX body text.
 * @returns The cleaned text with leading and trailing whitespace trimmed.
 */
private cleanContent(content: string): string {
  return (
    content
      // Remove import statements (any line that starts with "import ").
      .replace(/^import\s+.*$/gm, '')
      // Remove JSX components. The pattern matches every "<...>" span, so
      // non-JSX text written in angle brackets is replaced by a space too.
      .replace(/<[^>]+>/g, ' ')
      // Remove React-style comments, including multi-line ones.
      .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
      // Collapse runs of three or more newlines to a single blank line.
      .replace(/\n{3,}/g, '\n\n')
      // Collapse runs of spaces/tabs to one space.
      .replace(/[ \t]{2,}/g, ' ')
      .trim()
  )
}
/**
 * Parse frontmatter from MDX content.
 *
 * Recognises a block delimited by `---` lines at the very start of the file.
 * Only flat `key: value` lines are understood; this is not a YAML parser.
 * Lines with no colon, or with the colon as their first character, are
 * ignored.
 *
 * @param content - Full file contents.
 * @returns `data` with every parsed key/value pair and `content` with the
 *   text after the closing delimiter. When no frontmatter block matches,
 *   `data` is empty and `content` is the input unchanged.
 */
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
  // The closing "---" must be followed by a line break for the block to match.
  const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
  const match = content.match(frontmatterRegex)

  if (!match) {
    return { data: {}, content }
  }

  const [, frontmatterText, markdownContent] = match
  const data: Frontmatter = {}

  // Simple line-based parsing: everything before the first colon is the key,
  // everything after it is the value.
  const lines = frontmatterText.split('\n')
  for (const line of lines) {
    const colonIndex = line.indexOf(':')
    if (colonIndex > 0) {
      const key = line.slice(0, colonIndex).trim()
      // trim() also removes the "\r" left behind on CRLF files; afterwards one
      // quote character is stripped from each end of the value.
      const value = line
        .slice(colonIndex + 1)
        .trim()
        .replace(/^['"]|['"]$/g, '')
      data[key] = value
    }
  }

  return { data, content: markdownContent }
}
}
}

View File

@@ -50,4 +50,4 @@ export interface HeaderInfo {
anchor: string
/** Position in document */
position: number
}
}

View File

@@ -21,7 +21,7 @@ async function main() {
// Path to the docs content directory
const docsPath = path.join(process.cwd(), '../../apps/docs/content/docs')
logger.info(`Processing docs from: ${docsPath}`)
// Process all .mdx files
@@ -29,15 +29,18 @@ async function main() {
logger.info(`\n=== CHUNKING RESULTS ===`)
logger.info(`Total chunks: ${chunks.length}`)
// Group chunks by document
const chunksByDoc = chunks.reduce((acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = []
}
acc[chunk.sourceDocument].push(chunk)
return acc
}, {} as Record<string, typeof chunks>)
const chunksByDoc = chunks.reduce(
(acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = []
}
acc[chunk.sourceDocument].push(chunk)
return acc
},
{} as Record<string, typeof chunks>
)
// Display summary
logger.info(`\n=== DOCUMENT SUMMARY ===`)
@@ -54,14 +57,19 @@ async function main() {
logger.info(` Link: ${chunk.headerLink}`)
logger.info(` Tokens: ${chunk.tokenCount}`)
logger.info(` Embedding: ${chunk.embedding.length} dimensions (${chunk.embeddingModel})`)
logger.info(` Embedding Preview: [${chunk.embedding.slice(0, 5).map(n => n.toFixed(4)).join(', ')}...]`)
logger.info(
` Embedding Preview: [${chunk.embedding
.slice(0, 5)
.map((n) => n.toFixed(4))
.join(', ')}...]`
)
logger.info(` Text Preview: ${chunk.text.slice(0, 100)}...`)
})
// Calculate total token count
const totalTokens = chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0)
const chunksWithEmbeddings = chunks.filter(chunk => chunk.embedding.length > 0).length
const chunksWithEmbeddings = chunks.filter((chunk) => chunk.embedding.length > 0).length
logger.info(`\n=== STATISTICS ===`)
logger.info(`Total tokens: ${totalTokens}`)
logger.info(`Average tokens per chunk: ${Math.round(totalTokens / chunks.length)}`)
@@ -70,19 +78,21 @@ async function main() {
logger.info(`Embedding model: ${chunks[0].embeddingModel}`)
logger.info(`Embedding dimensions: ${chunks[0].embedding.length}`)
}
const headerLevels = chunks.reduce((acc, chunk) => {
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
return acc
}, {} as Record<number, number>)
const headerLevels = chunks.reduce(
(acc, chunk) => {
acc[chunk.headerLevel] = (acc[chunk.headerLevel] || 0) + 1
return acc
},
{} as Record<number, number>
)
logger.info(`Header level distribution:`)
Object.entries(headerLevels)
.sort(([a], [b]) => Number(a) - Number(b))
.forEach(([level, count]) => {
logger.info(` H${level}: ${count} chunks`)
})
} catch (error) {
logger.error('Error processing docs:', error)
process.exit(1)
@@ -90,4 +100,4 @@ async function main() {
}
// Run the script once; any rejection that escapes main() is passed to console.error.
main().catch(console.error)