Add db migration

This commit is contained in:
Siddharth Ganesan
2025-07-08 16:11:44 -07:00
parent 76c0c56689
commit b9fa50b4de
6 changed files with 5593 additions and 12 deletions

View File

@@ -0,0 +1,26 @@
-- Documentation chunk embeddings for hybrid (vector + full-text) docs search.
-- NOTE(review): generated by drizzle-kit from the docsEmbeddings definition in
-- schema.ts — keep the two in sync; avoid hand-editing once this migration has
-- been applied (drizzle tracks migration file contents).
CREATE TABLE "docs_embeddings" (
"chunk_id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
"chunk_text" text NOT NULL,
"source_document" text NOT NULL,
"source_link" text NOT NULL,
"header_text" text NOT NULL,
"header_level" integer NOT NULL,
"token_count" integer NOT NULL,
-- 1536 dimensions matches OpenAI text-embedding-3-small output.
"embedding" vector(1536) NOT NULL,
"embedding_model" text DEFAULT 'text-embedding-3-small' NOT NULL,
"metadata" jsonb DEFAULT '{}' NOT NULL,
-- Generated column so full-text search stays consistent with chunk_text.
"chunk_text_tsv" "tsvector" GENERATED ALWAYS AS (to_tsvector('english', "docs_embeddings"."chunk_text")) STORED,
"created_at" timestamp DEFAULT now() NOT NULL,
"updated_at" timestamp DEFAULT now() NOT NULL,
-- NOTE(review): this CHECK duplicates the column's NOT NULL constraint;
-- harmless but redundant.
CONSTRAINT "docs_embedding_not_null_check" CHECK ("embedding" IS NOT NULL),
CONSTRAINT "docs_header_level_check" CHECK ("header_level" >= 1 AND "header_level" <= 6)
);
--> statement-breakpoint
-- B-tree indexes for common filters (document, header level, model, recency).
CREATE INDEX "docs_emb_source_document_idx" ON "docs_embeddings" USING btree ("source_document");--> statement-breakpoint
CREATE INDEX "docs_emb_header_level_idx" ON "docs_embeddings" USING btree ("header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_source_header_idx" ON "docs_embeddings" USING btree ("source_document","header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_model_idx" ON "docs_embeddings" USING btree ("embedding_model");--> statement-breakpoint
CREATE INDEX "docs_emb_created_at_idx" ON "docs_embeddings" USING btree ("created_at");--> statement-breakpoint
-- Approximate nearest-neighbor search over embeddings (pgvector HNSW, cosine).
CREATE INDEX "docs_embedding_vector_hnsw_idx" ON "docs_embeddings" USING hnsw ("embedding" vector_cosine_ops) WITH (m=16,ef_construction=64);--> statement-breakpoint
-- GIN indexes for JSONB metadata filtering and full-text search.
CREATE INDEX "docs_emb_metadata_gin_idx" ON "docs_embeddings" USING gin ("metadata");--> statement-breakpoint
CREATE INDEX "docs_emb_chunk_text_fts_idx" ON "docs_embeddings" USING gin ("chunk_text_tsv");

File diff suppressed because it is too large Load Diff

View File

@@ -351,6 +351,13 @@
"when": 1751659528896,
"tag": "0050_big_mattie_franklin",
"breakpoints": true
},
{
"idx": 51,
"version": "7",
"when": 1752014976338,
"tag": "0051_typical_expediter",
"breakpoints": true
}
]
}
}

View File

@@ -13,6 +13,7 @@ import {
text,
timestamp,
uniqueIndex,
uuid,
vector,
} from 'drizzle-orm/pg-core'
@@ -909,3 +910,66 @@ export const embedding = pgTable(
embeddingNotNullCheck: check('embedding_not_null_check', sql`"embedding" IS NOT NULL`),
})
)
/**
 * Documentation chunk embeddings powering hybrid (vector + full-text) docs
 * search. Mirrors migration 0051 (`docs_embeddings`); keep the two in sync.
 */
export const docsEmbeddings = pgTable(
  'docs_embeddings',
  {
    // Surrogate primary key, generated in the database (gen_random_uuid()).
    chunkId: uuid('chunk_id').primaryKey().defaultRandom(),
    // Raw text of the documentation chunk that was embedded.
    chunkText: text('chunk_text').notNull(),
    // Source document identifier and a deep link to the chunk's section.
    sourceDocument: text('source_document').notNull(),
    sourceLink: text('source_link').notNull(),
    // Nearest enclosing markdown header and its level (1-6, see check below).
    headerText: text('header_text').notNull(),
    headerLevel: integer('header_level').notNull(),
    // Token count reported by the chunker for this chunk's text.
    tokenCount: integer('token_count').notNull(),
    // Vector embedding - optimized for text-embedding-3-small with HNSW support
    embedding: vector('embedding', { dimensions: 1536 }).notNull(),
    embeddingModel: text('embedding_model').notNull().default('text-embedding-3-small'),
    // Metadata for flexible filtering
    metadata: jsonb('metadata').notNull().default('{}'),
    // Full-text search support - generated tsvector column kept in sync with
    // chunk_text by the database. NOTE(review): `tsvector` is a custom column
    // type presumably declared elsewhere in this file — confirm.
    chunkTextTsv: tsvector('chunk_text_tsv').generatedAlwaysAs(
      (): SQL => sql`to_tsvector('english', ${docsEmbeddings.chunkText})`
    ),
    // Timestamps
    createdAt: timestamp('created_at').notNull().defaultNow(),
    updatedAt: timestamp('updated_at').notNull().defaultNow(),
  },
  (table) => ({
    // Source document queries
    sourceDocumentIdx: index('docs_emb_source_document_idx').on(table.sourceDocument),
    // Header level filtering
    headerLevelIdx: index('docs_emb_header_level_idx').on(table.headerLevel),
    // Combined source and header queries
    sourceHeaderIdx: index('docs_emb_source_header_idx').on(table.sourceDocument, table.headerLevel),
    // Model-specific queries
    modelIdx: index('docs_emb_model_idx').on(table.embeddingModel),
    // Timestamp queries
    createdAtIdx: index('docs_emb_created_at_idx').on(table.createdAt),
    // Vector similarity search index (HNSW, cosine distance) - approximate
    // nearest-neighbor search over the 1536-dim embeddings
    embeddingVectorHnswIdx: index('docs_embedding_vector_hnsw_idx')
      .using('hnsw', table.embedding.op('vector_cosine_ops'))
      .with({
        m: 16,
        ef_construction: 64,
      }),
    // GIN index for JSONB metadata queries
    metadataGinIdx: index('docs_emb_metadata_gin_idx').using('gin', table.metadata),
    // Full-text search index
    chunkTextFtsIdx: index('docs_emb_chunk_text_fts_idx').using('gin', table.chunkTextTsv),
    // Constraints (the NOT NULL check duplicates .notNull() but matches the
    // generated migration)
    embeddingNotNullCheck: check('docs_embedding_not_null_check', sql`"embedding" IS NOT NULL`),
    headerLevelCheck: check('docs_header_level_check', sql`"header_level" >= 1 AND "header_level" <= 6`),
  })
)

View File

@@ -23,9 +23,9 @@ export class DocsChunker {
/**
 * Configure the chunker. Defaults target small, embedding-friendly chunks
 * (~300 tokens, 50-token overlap) for docs.simstudio.ai content.
 *
 * @param options - Optional overrides for chunk sizing and the base URL.
 */
constructor(options: DocsChunkerOptions = {}) {
  // Use the existing TextChunker for chunking logic.
  // BUG FIX: the object literal previously contained duplicate `chunkSize`
  // and `overlap` keys (stale lines left over from an edit); only the
  // intended values remain.
  this.textChunker = new TextChunker({
    chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
    minChunkSize: options.minChunkSize ?? 100,
    overlap: options.overlap ?? 50,
  })
  this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
}
@@ -209,16 +209,25 @@ export class DocsChunker {
}
/**
 * Split content into chunks using the existing TextChunker with table
 * awareness.
 *
 * Pipeline: clean → locate table boundaries → generic TextChunker pass →
 * merge chunks that would split a table → enforce the ~300-token cap.
 *
 * @param content - Raw section content.
 * @returns Chunk texts ready for embedding.
 */
private async splitContent(content: string): Promise<string[]> {
  // Clean the content first
  const cleanedContent = this.cleanContent(content)

  // Detect table boundaries to avoid splitting them
  const tableBoundaries = this.detectTableBoundaries(cleanedContent)

  // Use the existing TextChunker for the initial pass
  const chunks = await this.textChunker.chunk(cleanedContent)

  // BUG FIX: a stray early `return chunks.map((chunk) => chunk.text)` made
  // the table-merge and size-limit steps below unreachable; it was removed.
  const processedChunks = this.mergeTableChunks(
    chunks.map((chunk) => chunk.text),
    tableBoundaries,
    cleanedContent
  )

  // Ensure no chunk exceeds ~300 estimated tokens
  return this.enforceSizeLimit(processedChunks)
}
/**
@@ -239,20 +248,20 @@ export class DocsChunker {
)
}
/**
* Parse frontmatter from MDX content
*/
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
if (!match) {
return { data: {}, content }
}
const [, frontmatterText, markdownContent] = match
const data: Frontmatter = {}
// Simple YAML parsing for title and description
const lines = frontmatterText.split('\n')
for (const line of lines) {
@@ -266,7 +275,357 @@ export class DocsChunker {
data[key] = value
}
}
return { data, content: markdownContent }
}
/**
 * Split content into sections at H1-H3 markdown headers.
 *
 * Deeper headers (H4-H6) deliberately stay inside their parent section so
 * small subsections are not fragmented into tiny chunks.
 *
 * @param content - Markdown/MDX body text.
 * @returns Ordered sections; `header` is null (and `level` 0) for any text
 *          before the first header. Whitespace-only sections are dropped.
 */
private splitByHeaders(content: string): Array<{ header: string | null; content: string; level: number }> {
  const sections: Array<{ header: string | null; content: string; level: number }> = []
  let pendingHeader: string | null = null
  let pendingLevel = 0
  let buffer: string[] = []

  // Flush buffered lines as one section (only when something was buffered,
  // matching the original guard).
  const flush = () => {
    if (buffer.length > 0) {
      sections.push({
        header: pendingHeader,
        content: buffer.join('\n').trim(),
        level: pendingLevel,
      })
    }
  }

  for (const rawLine of content.split('\n')) {
    // Only H1-H3 start a new section; H4-H6 remain in the current one.
    const match = rawLine.match(/^(#{1,3})\s+(.+)$/)
    if (match) {
      flush()
      pendingHeader = rawLine
      pendingLevel = match[1].length
      buffer = []
    } else {
      buffer.push(rawLine)
    }
  }
  flush()

  // Drop sections whose body is only whitespace.
  return sections.filter((section) => section.content.trim().length > 0)
}
/**
 * Rough token estimate for a piece of text.
 *
 * @param text - Text to measure.
 * @returns Estimated token count (heuristic: ~4 characters per token).
 */
private estimateTokens(text: string): number {
  // Heuristic: English prose averages roughly four characters per token.
  const charsPerToken = 4
  return Math.ceil(text.length / charsPerToken)
}
/**
 * Greedily merge adjacent chunks (joined with a blank line) until a merged
 * chunk would exceed ~500 estimated tokens, reducing tiny-chunk churn.
 *
 * @param chunks - Ordered chunk texts.
 * @returns Merged, trimmed chunks; empty results are dropped.
 */
private mergeSmallChunks(chunks: string[]): string[] {
  const merged: string[] = []
  let pending = ''

  // Emit the accumulator when it holds non-whitespace content.
  const flushPending = () => {
    const trimmed = pending.trim()
    if (trimmed) {
      merged.push(trimmed)
    }
  }

  for (const piece of chunks) {
    const pendingTokens = this.estimateTokens(pending)
    const wouldOverflow =
      pendingTokens > 0 && pendingTokens + this.estimateTokens(piece) > 500

    if (wouldOverflow) {
      // Adding this piece would push past the target size: emit and restart.
      flushPending()
      pending = piece
    } else {
      pending = pending ? `${pending}\n\n${piece}` : piece
    }
  }

  flushPending()
  return merged
}
/**
* Chunk a section while preserving tables and structure
*/
private async chunkSection(section: { header: string | null; content: string; level: number }): Promise<string[]> {
const content = section.content
const header = section.header
// Check if content contains tables
const hasTable = this.containsTable(content)
if (hasTable) {
// Split by tables and handle each part
return this.splitContentWithTables(content, header)
} else {
// Regular chunking for text-only content
const chunks = await this.textChunker.chunk(content)
return chunks.map((chunk, index) => {
// Add header to first chunk only
if (index === 0 && header) {
return `${header}\n\n${chunk.text}`.trim()
}
return chunk.text
})
}
}
/**
 * Check whether content contains a markdown table: a pipe-delimited row
 * immediately followed by a separator row containing pipes and dashes.
 *
 * @param content - Text to inspect.
 * @returns true when at least one table header/separator pair is found.
 */
private containsTable(content: string): boolean {
  const rows = content.split('\n')
  for (let i = 0; i < rows.length - 1; i++) {
    const row = rows[i]
    const separator = rows[i + 1]
    const looksLikeTableRow = row.includes('|') && row.split('|').length >= 3
    if (looksLikeTableRow && separator.includes('|') && separator.includes('-')) {
      return true
    }
  }
  return false
}
/**
 * Split content that contains markdown tables into chunks, emitting each
 * detected table as a chunk of its own so a table is never cut in half.
 *
 * @param content - Section body known (or suspected) to contain tables.
 * @param header - Section header; prepended (with a blank line) to the very
 *                 first chunk produced, if present.
 * @returns Chunks whose trimmed length exceeds 50 characters.
 */
private splitContentWithTables(content: string, header: string | null): string[] {
  const lines = content.split('\n')
  const chunks: string[] = []
  let currentChunk: string[] = []
  let inTable = false
  let tableLines: string[] = []

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]

    // Detect table start: a pipe-delimited row followed by a separator row.
    if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
      const nextLine = lines[i + 1]
      if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
        inTable = true
        // Save current prose chunk if it has meaningful content (>50 chars).
        if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
          const chunkText = currentChunk.join('\n').trim()
          // Only the first chunk of the section carries the header.
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
          chunks.push(withHeader)
          currentChunk = []
        }
        tableLines = [line]
        continue
      }
    }

    if (inTable) {
      tableLines.push(line)
      // Detect table end: first blank or non-pipe line after the table body.
      // NOTE(review): a non-blank terminating line is pushed into tableLines
      // above AND used to start the next prose chunk below, so it appears in
      // both chunks — confirm this duplication is intended.
      if (!line.includes('|') || line.trim() === '') {
        inTable = false
        // Save table as its own chunk
        const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
        if (tableText.length > 0) {
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
          chunks.push(withHeader)
        }
        tableLines = []
        // Start new chunk if current line has content
        if (line.trim() !== '') {
          currentChunk = [line]
        }
      }
    } else {
      currentChunk.push(line)
      // If chunk is getting large (~250 estimated tokens), save it.
      if (this.estimateTokens(currentChunk.join('\n')) > 250) {
        const chunkText = currentChunk.join('\n').trim()
        if (chunkText.length > 50) {
          const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
          chunks.push(withHeader)
        }
        currentChunk = []
      }
    }
  }

  // Handle remaining content: either an unterminated table or trailing prose.
  if (inTable && tableLines.length > 0) {
    const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
    if (tableText.length > 0) {
      const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
      chunks.push(withHeader)
    }
  } else if (currentChunk.length > 0) {
    const chunkText = currentChunk.join('\n').trim()
    if (chunkText.length > 50) {
      const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
      chunks.push(withHeader)
    }
  }

  return chunks.filter(chunk => chunk.trim().length > 50)
}
/**
 * Locate markdown tables in `content` as character ranges so later chunk
 * merging can avoid splitting a table across chunks.
 *
 * A table starts at a pipe-delimited row whose next line is a separator row
 * (pipes + dashes) and ends at the first blank line, header line, or
 * non-pipe line.
 *
 * @param content - Cleaned section content.
 * @returns Character-offset ranges of each detected table.
 */
private detectTableBoundaries(content: string): { start: number; end: number }[] {
  const tables: { start: number; end: number }[] = []
  const lines = content.split('\n')
  let inTable = false
  let tableStart = -1

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()

    // Detect table start (markdown table row with pipes), confirmed by the
    // separator row on the following line.
    if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
      const nextLine = lines[i + 1]?.trim()
      if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
        inTable = true
        tableStart = i
      }
    }
    // Detect table end (empty line, header, or non-table content).
    else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
      // BUG FIX: the original wrote `pos + lines[i-1]?.length || 0`, which
      // parses as `(pos + len) || 0` — if len were undefined the sum becomes
      // NaN and the end collapses to 0. Parenthesize the fallback instead.
      tables.push({
        start: this.getCharacterPosition(lines, tableStart),
        end: this.getCharacterPosition(lines, i - 1) + (lines[i - 1]?.length ?? 0),
      })
      inTable = false
    }
  }

  // A table running to the end of the content has no terminating line.
  if (inTable && tableStart >= 0) {
    tables.push({
      start: this.getCharacterPosition(lines, tableStart),
      end: content.length,
    })
  }

  return tables
}
/**
 * Character offset of the start of line `lineIndex` within the joined
 * content: each preceding line contributes its length plus one for '\n'.
 *
 * @param lines - Content split on newlines.
 * @param lineIndex - Zero-based line number.
 * @returns Character offset of that line's first character.
 */
private getCharacterPosition(lines: string[], lineIndex: number): number {
  let offset = 0
  for (let i = 0; i < lineIndex; i++) {
    offset += lines[i].length + 1
  }
  return offset
}
/**
 * Re-assemble chunks that would otherwise split a markdown table: any chunk
 * overlapping a table range is expanded to cover the complete table(s).
 *
 * @param chunks - Chunk texts produced by the generic chunker.
 * @param tableBoundaries - Character ranges of tables in `originalContent`.
 * @param originalContent - The content the chunks were derived from.
 * @returns Chunks with tables intact; fragments of 50 chars or less dropped.
 */
private mergeTableChunks(chunks: string[], tableBoundaries: { start: number; end: number }[], originalContent: string): string[] {
  if (tableBoundaries.length === 0) {
    return chunks
  }

  // True when [chunkStart, chunkEnd] and the table's range overlap in any way
  // (start inside, end inside, or chunk fully containing the table).
  const overlaps = (table: { start: number; end: number }, chunkStart: number, chunkEnd: number): boolean =>
    (chunkStart >= table.start && chunkStart <= table.end) ||
    (chunkEnd >= table.start && chunkEnd <= table.end) ||
    (chunkStart <= table.start && chunkEnd >= table.end)

  const mergedChunks: string[] = []
  let currentPosition = 0

  for (const chunk of chunks) {
    const chunkStart = originalContent.indexOf(chunk, currentPosition)

    // BUG FIX: indexOf returns -1 when the chunk text was normalized and no
    // longer appears verbatim in the original content; the original code then
    // ran the overlap tests with bogus coordinates. Pass such chunks through
    // unchanged and leave the search position where it was.
    if (chunkStart === -1) {
      mergedChunks.push(chunk)
      continue
    }

    const chunkEnd = chunkStart + chunk.length
    const affectedTables = tableBoundaries.filter((table) => overlaps(table, chunkStart, chunkEnd))

    if (affectedTables.length > 0) {
      // Expand the chunk so every table it touches is included in full.
      const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
      const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
      const completeChunk = originalContent.slice(minStart, maxEnd).trim()

      // Skip content already covered by an earlier merged chunk.
      if (!mergedChunks.some((existing) => existing.includes(completeChunk))) {
        mergedChunks.push(completeChunk)
      }
    } else {
      mergedChunks.push(chunk)
    }

    currentPosition = chunkEnd
  }

  // Filter out tiny chunks
  return mergedChunks.filter((chunk) => chunk.length > 50)
}
/**
 * Enforce the ~300 estimated-token ceiling on every chunk.
 *
 * Oversized chunks are re-split line-by-line, greedily packing lines while
 * the estimate stays within the limit. Chunks whose trimmed text is 100
 * characters or fewer are dropped from the final result.
 *
 * @param chunks - Candidate chunk texts.
 * @returns Chunks each within the token limit.
 */
private enforceSizeLimit(chunks: string[]): string[] {
  const limited: string[] = []

  for (const chunk of chunks) {
    if (this.estimateTokens(chunk) <= 300) {
      // Within the limit: keep as-is.
      limited.push(chunk)
      continue
    }

    // Too large: rebuild from individual lines without crossing the limit.
    let accumulator = ''
    for (const line of chunk.split('\n')) {
      const candidate = accumulator ? `${accumulator}\n${line}` : line
      if (this.estimateTokens(candidate) <= 300) {
        accumulator = candidate
      } else {
        // Adding this line would exceed the limit; emit and restart.
        if (accumulator.trim()) {
          limited.push(accumulator.trim())
        }
        accumulator = line
      }
    }
    // Emit whatever remains after the last line.
    if (accumulator.trim()) {
      limited.push(accumulator.trim())
    }
  }

  return limited.filter((chunk) => chunk.trim().length > 100)
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env bun
import path from 'path'
import { DocsChunker } from '@/lib/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console-logger'
import { db } from '@/db'
import { docsEmbeddings } from '@/db/schema'
import { sql } from 'drizzle-orm'
// Scoped logger for this script's console output.
const logger = createLogger('ProcessDocsEmbeddings')

/** Options accepted by processDocsEmbeddings; every field has a default. */
interface ProcessingOptions {
  /** Clear existing docs embeddings before processing (default: false) */
  clearExisting?: boolean
  /** Path to docs directory (default: ../../apps/docs/content/docs relative to cwd) */
  docsPath?: string
  /** Base URL for generating links (default: https://docs.simstudio.ai) */
  baseUrl?: string
  /** Chunk size in tokens (default: 300) */
  chunkSize?: number
  /** Minimum chunk size in tokens (default: 100) */
  minChunkSize?: number
  /** Overlap between chunks in tokens (default: 50) */
  overlap?: number
}
/**
 * Production script to process documentation and save embeddings to database.
 *
 * Pipeline: chunk all .mdx docs via DocsChunker → insert the chunks (with
 * embeddings) into docs_embeddings in batches → verify the row count and log
 * a per-document breakdown.
 *
 * @param options - Overrides for paths, chunk sizing, and clearing behavior.
 * @returns Summary with success flag, processed/failed counts, and timing.
 *          Never throws: fatal errors are captured in the returned object.
 */
async function processDocsEmbeddings(options: ProcessingOptions = {}) {
  const startTime = Date.now()
  let processedChunks = 0
  let failedChunks = 0

  try {
    // Configuration with production defaults.
    const config = {
      clearExisting: options.clearExisting ?? false,
      docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs'),
      baseUrl: options.baseUrl ?? 'https://docs.simstudio.ai',
      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
      minChunkSize: options.minChunkSize ?? 100,
      overlap: options.overlap ?? 50,
    }

    logger.info('🚀 Starting docs embedding processing...')
    logger.info(`Configuration:`, {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
    })

    // Clear existing embeddings if requested.
    // (The unused `deleteResult` binding was removed — delete() resolves
    // without a usable row count here.)
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      await db.delete(docsEmbeddings)
      logger.info(`Deleted existing embeddings`)
    }

    // Initialize the docs chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all .mdx files
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Save chunks to database in batches for better performance
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)
      try {
        // Map chunker output onto the docs_embeddings columns.
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        // Insert batch
        await db.insert(docsEmbeddings).values(batchData)
        processedChunks += batch.length

        // Log progress every 5 batches and on the final batch.
        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`)
        }
      } catch (error) {
        // A failed batch is recorded but does not abort the remaining batches.
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify results (await directly instead of mixing await with .then()).
    const countRows = await db
      .select({ count: sql<number>`count(*)` })
      .from(docsEmbeddings)
    const savedCount = countRows[0]?.count ?? 0

    const duration = Date.now() - startTime
    logger.info(`✅ Processing complete!`)
    logger.info(`📊 Results:`)
    logger.info(` • Total chunks processed: ${chunks.length}`)
    logger.info(` • Successfully saved: ${processedChunks}`)
    logger.info(` • Failed: ${failedChunks}`)
    logger.info(` • Database total: ${savedCount}`)
    logger.info(` • Duration: ${Math.round(duration / 1000)}s`)

    // Summary by document (typed reduce seed instead of `{} as Record<...>`).
    const documentStats = chunks.reduce<Record<string, { chunks: number; tokens: number }>>(
      (acc, chunk) => {
        if (!acc[chunk.sourceDocument]) {
          acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
        }
        acc[chunk.sourceDocument].chunks++
        acc[chunk.sourceDocument].tokens += chunk.tokenCount
        return acc
      },
      {}
    )

    logger.info(`📋 Document breakdown:`)
    Object.entries(documentStats)
      .sort(([, a], [, b]) => b.chunks - a.chunks)
      .slice(0, 10) // Top 10 documents
      .forEach(([doc, stats]) => {
        logger.info(`${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
      })

    if (Object.keys(documentStats).length > 10) {
      logger.info(` • ... and ${Object.keys(documentStats).length - 10} more documents`)
    }

    return {
      success: failedChunks === 0,
      processedChunks,
      failedChunks,
      totalChunks: chunks.length,
      databaseCount: savedCount,
      duration,
    }
  } catch (error) {
    logger.error('💥 Fatal error during processing:', error)
    return {
      success: false,
      processedChunks,
      failedChunks,
      error: error instanceof Error ? error.message : 'Unknown error',
    }
  }
}
/**
 * CLI entry point. Recognized flags:
 *   --clear     wipe the docs_embeddings table before re-processing
 *   --help, -h  print usage and exit
 * Exits with code 1 when processing reports failure.
 */
async function main() {
  const args = process.argv.slice(2)

  // Help short-circuits everything else (the original behaved the same:
  // it exited on --help before any processing ran).
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Usage: bun run scripts/process-docs-embeddings.ts [options]
Options:
--clear Clear existing docs embeddings before processing
--help, -h Show this help message
Examples:
bun run scripts/process-docs-embeddings.ts
bun run scripts/process-docs-embeddings.ts --clear
`)
    process.exit(0)
  }

  const options: ProcessingOptions = {}
  if (args.includes('--clear')) {
    options.clearExisting = true
  }

  const result = await processDocsEmbeddings(options)
  if (!result.success) {
    process.exit(1)
  }
}
// Run the script only when executed directly (not when imported).
// NOTE(review): matching on the script filename substring in argv[1] is the
// detection used here — confirm it behaves as intended under renamed or
// symlinked paths.
if (process.argv[1]?.includes('process-docs-embeddings')) {
  main().catch((error) => {
    console.error('Script failed:', error)
    process.exit(1)
  })
}

export { processDocsEmbeddings }