mirror of
https://github.com/simstudioai/sim.git
synced 2026-02-01 10:14:56 -05:00
Add db migration
This commit is contained in:
215
apps/sim/scripts/process-docs-embeddings.ts
Normal file
215
apps/sim/scripts/process-docs-embeddings.ts
Normal file
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env bun
|
||||
|
||||
import path from 'path'
|
||||
import { DocsChunker } from '@/lib/documents/docs-chunker'
|
||||
import { createLogger } from '@/lib/logs/console-logger'
|
||||
import { db } from '@/db'
|
||||
import { docsEmbeddings } from '@/db/schema'
|
||||
import { sql } from 'drizzle-orm'
|
||||
|
||||
// Logger shared by every function in this script, created under the name 'ProcessDocsEmbeddings'.
const logger = createLogger('ProcessDocsEmbeddings')
|
||||
|
||||
/**
 * Optional overrides accepted by `processDocsEmbeddings`.
 * Any field left undefined falls back to the default chosen inside that function.
 */
interface ProcessingOptions {
  // --- Source / destination -------------------------------------------------

  /** Directory that holds the documentation files to process. */
  docsPath?: string

  /** Base URL used when generating links for chunks. */
  baseUrl?: string

  /** When true, existing docs embeddings are removed before processing. */
  clearExisting?: boolean

  // --- Chunking parameters (all measured in tokens) -------------------------

  /** Chunk size, in tokens. */
  chunkSize?: number

  /** Minimum chunk size, in tokens. */
  minChunkSize?: number

  /** Overlap between chunks, in tokens. */
  overlap?: number
}
|
||||
|
||||
/**
 * Production script to process documentation and save embeddings to database.
 *
 * Steps, in order:
 *  1. Resolve the effective configuration from `options` plus defaults.
 *  2. Optionally delete every row of `docsEmbeddings` (`clearExisting`).
 *  3. Ask `DocsChunker.chunkAllDocs` for the chunks of `docsPath`.
 *  4. Insert the chunks in batches of 10; a failing batch is logged and counted
 *     in `failedChunks`, and processing continues with the next batch.
 *  5. Count the rows now in the table and log a summary, including the ten
 *     documents with the most chunks.
 *
 * @param options - Optional overrides; see `ProcessingOptions`.
 * @returns A result object. `success` is true only when chunks were produced and
 *   every batch was inserted. On the normal path the object also carries
 *   `totalChunks`, `databaseCount` and `duration` (milliseconds); when an
 *   unexpected error aborts the run it carries `error` (the message) instead.
 *   This function does not rethrow: failures are reported through the result.
 */
async function processDocsEmbeddings(options: ProcessingOptions = {}) {
  const startTime = Date.now()
  let processedChunks = 0
  let failedChunks = 0

  try {
    // Configuration
    const config = {
      clearExisting: options.clearExisting ?? false,
      docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs'),
      baseUrl: options.baseUrl ?? 'https://docs.simstudio.ai',
      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
      minChunkSize: options.minChunkSize ?? 100,
      overlap: options.overlap ?? 50,
    }

    logger.info('🚀 Starting docs embedding processing...')
    logger.info(`Configuration:`, {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
    })

    // Clear existing embeddings if requested.
    // NOTE(review): the delete runs before chunk generation, so a failure in
    // chunking leaves the table empty — confirm this ordering is intended.
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      await db.delete(docsEmbeddings)
      logger.info(`Deleted existing embeddings`)
    }

    // Initialize the docs chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all docs under the configured path
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Save chunks to database in batches for better performance
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)

      try {
        // Map chunker output onto the column names of the docsEmbeddings table
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        // Report progress on every fifth batch and on the final batch
        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`)
        }
      } catch (error) {
        // A failed batch is recorded but does not abort the remaining batches
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify results. The driver may hand back count(*) as a string (bigint),
    // so coerce explicitly instead of trusting the `sql<number>` annotation.
    const countRows = await db.select({ count: sql<number>`count(*)` }).from(docsEmbeddings)
    const savedCount = Number(countRows[0]?.count ?? 0)

    const duration = Date.now() - startTime

    logger.info(`✅ Processing complete!`)
    logger.info(`📊 Results:`)
    logger.info(` • Total chunks processed: ${chunks.length}`)
    logger.info(` • Successfully saved: ${processedChunks}`)
    logger.info(` • Failed: ${failedChunks}`)
    logger.info(` • Database total: ${savedCount}`)
    logger.info(` • Duration: ${Math.round(duration / 1000)}s`)

    // Summary by document. A Map keeps arbitrary document names safe as keys.
    const documentStats = new Map<string, { chunks: number; tokens: number }>()
    for (const chunk of chunks) {
      let stats = documentStats.get(chunk.sourceDocument)
      if (stats === undefined) {
        stats = { chunks: 0, tokens: 0 }
        documentStats.set(chunk.sourceDocument, stats)
      }
      stats.chunks++
      stats.tokens += chunk.tokenCount
    }

    logger.info(`📋 Document breakdown:`)
    const topDocuments = [...documentStats.entries()]
      .sort(([, a], [, b]) => b.chunks - a.chunks)
      .slice(0, 10) // Top 10 documents
    for (const [doc, stats] of topDocuments) {
      logger.info(` • ${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
    }

    if (documentStats.size > 10) {
      logger.info(` • ... and ${documentStats.size - 10} more documents`)
    }

    return {
      success: failedChunks === 0,
      processedChunks,
      failedChunks,
      totalChunks: chunks.length,
      databaseCount: savedCount,
      duration,
    }
  } catch (error) {
    // Anything not handled per-batch (chunking, delete, count query) ends up here
    logger.error('💥 Fatal error during processing:', error)
    return {
      success: false,
      processedChunks,
      failedChunks,
      error: error instanceof Error ? error.message : 'Unknown error',
    }
  }
}
|
||||
|
||||
/**
 * Command-line entry point.
 *
 * Recognised flags:
 *  - `--help` / `-h`: print usage and exit with status 0 without processing.
 *  - `--clear`: request that existing docs embeddings be cleared first.
 *
 * Exits with status 1 when `processDocsEmbeddings` reports `success: false`.
 */
async function main() {
  const cliArgs = process.argv.slice(2)

  // Help short-circuits everything else
  const wantsHelp = cliArgs.includes('--help') || cliArgs.includes('-h')
  if (wantsHelp) {
    const helpText = [
      '',
      'Usage: bun run scripts/process-docs-embeddings.ts [options]',
      '',
      'Options:',
      '--clear Clear existing docs embeddings before processing',
      '--help, -h Show this help message',
      '',
      'Examples:',
      'bun run scripts/process-docs-embeddings.ts',
      'bun run scripts/process-docs-embeddings.ts --clear',
      '',
    ].join('\n')
    console.log(helpText)
    process.exit(0)
  }

  // Translate flags into processing options
  const options: ProcessingOptions = {}
  if (cliArgs.includes('--clear')) {
    options.clearExisting = true
  }

  const { success } = await processDocsEmbeddings(options)
  if (!success) {
    process.exit(1)
  }
}
|
||||
|
||||
// Only invoke main() when the executed script path names this file, so that
// importing the module for `processDocsEmbeddings` does not start a run.
const invokedScriptPath = process.argv[1]
if (invokedScriptPath?.includes('process-docs-embeddings')) {
  const reportFailure = (error: unknown) => {
    console.error('Script failed:', error)
    process.exit(1)
  }
  main().catch(reportFailure)
}

export { processDocsEmbeddings }
|
||||
Reference in New Issue
Block a user