Add db migration
mirror of https://github.com/simstudioai/sim.git
apps/sim/db/migrations/0051_typical_expediter.sql (new file, 26 lines)
@@ -0,0 +1,26 @@
CREATE TABLE "docs_embeddings" (
    "chunk_id" uuid PRIMARY KEY DEFAULT gen_random_uuid() NOT NULL,
    "chunk_text" text NOT NULL,
    "source_document" text NOT NULL,
    "source_link" text NOT NULL,
    "header_text" text NOT NULL,
    "header_level" integer NOT NULL,
    "token_count" integer NOT NULL,
    "embedding" vector(1536) NOT NULL,
    "embedding_model" text DEFAULT 'text-embedding-3-small' NOT NULL,
    "metadata" jsonb DEFAULT '{}' NOT NULL,
    "chunk_text_tsv" "tsvector" GENERATED ALWAYS AS (to_tsvector('english', "docs_embeddings"."chunk_text")) STORED,
    "created_at" timestamp DEFAULT now() NOT NULL,
    "updated_at" timestamp DEFAULT now() NOT NULL,
    CONSTRAINT "docs_embedding_not_null_check" CHECK ("embedding" IS NOT NULL),
    CONSTRAINT "docs_header_level_check" CHECK ("header_level" >= 1 AND "header_level" <= 6)
);
--> statement-breakpoint
CREATE INDEX "docs_emb_source_document_idx" ON "docs_embeddings" USING btree ("source_document");--> statement-breakpoint
CREATE INDEX "docs_emb_header_level_idx" ON "docs_embeddings" USING btree ("header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_source_header_idx" ON "docs_embeddings" USING btree ("source_document","header_level");--> statement-breakpoint
CREATE INDEX "docs_emb_model_idx" ON "docs_embeddings" USING btree ("embedding_model");--> statement-breakpoint
CREATE INDEX "docs_emb_created_at_idx" ON "docs_embeddings" USING btree ("created_at");--> statement-breakpoint
CREATE INDEX "docs_embedding_vector_hnsw_idx" ON "docs_embeddings" USING hnsw ("embedding" vector_cosine_ops) WITH (m=16,ef_construction=64);--> statement-breakpoint
CREATE INDEX "docs_emb_metadata_gin_idx" ON "docs_embeddings" USING gin ("metadata");--> statement-breakpoint
CREATE INDEX "docs_emb_chunk_text_fts_idx" ON "docs_embeddings" USING gin ("chunk_text_tsv");
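The HNSW index with vector_cosine_ops is what serves nearest-neighbor lookups over the 1536-dimension embeddings. A minimal, illustrative sketch (not part of the commit) of how such a cosine-distance search could be issued through the app's Drizzle client; the function name and the pre-computed query embedding are assumptions:

// Illustrative sketch only: cosine nearest-neighbor query served by
// docs_embedding_vector_hnsw_idx. Assumes a query embedding was already computed
// (e.g. with text-embedding-3-small, matching the 1536-dimension column).
import { sql } from 'drizzle-orm'
import { db } from '@/db'

async function nearestDocChunks(queryEmbedding: number[], limit = 5) {
  const vectorLiteral = JSON.stringify(queryEmbedding) // e.g. '[0.01,-0.02,...]'
  return db.execute(sql`
    SELECT chunk_text, source_link, embedding <=> ${vectorLiteral}::vector AS distance
    FROM docs_embeddings
    ORDER BY embedding <=> ${vectorLiteral}::vector
    LIMIT ${limit}
  `)
}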
apps/sim/db/migrations/meta/0051_snapshot.json (new file, 4910 lines)
File diff suppressed because it is too large
@@ -351,6 +351,13 @@
       "when": 1751659528896,
       "tag": "0050_big_mattie_franklin",
       "breakpoints": true
-    }
+    },
+    {
+      "idx": 51,
+      "version": "7",
+      "when": 1752014976338,
+      "tag": "0051_typical_expediter",
+      "breakpoints": true
+    }
   ]
 }
@@ -13,6 +13,7 @@ import {
  text,
  timestamp,
  uniqueIndex,
  uuid,
  vector,
} from 'drizzle-orm/pg-core'

@@ -909,3 +910,66 @@ export const embedding = pgTable(
    embeddingNotNullCheck: check('embedding_not_null_check', sql`"embedding" IS NOT NULL`),
  })
)

export const docsEmbeddings = pgTable(
  'docs_embeddings',
  {
    chunkId: uuid('chunk_id').primaryKey().defaultRandom(),
    chunkText: text('chunk_text').notNull(),
    sourceDocument: text('source_document').notNull(),
    sourceLink: text('source_link').notNull(),
    headerText: text('header_text').notNull(),
    headerLevel: integer('header_level').notNull(),
    tokenCount: integer('token_count').notNull(),

    // Vector embedding - optimized for text-embedding-3-small with HNSW support
    embedding: vector('embedding', { dimensions: 1536 }).notNull(),
    embeddingModel: text('embedding_model').notNull().default('text-embedding-3-small'),

    // Metadata for flexible filtering
    metadata: jsonb('metadata').notNull().default('{}'),

    // Full-text search support - generated tsvector column
    chunkTextTsv: tsvector('chunk_text_tsv').generatedAlwaysAs(
      (): SQL => sql`to_tsvector('english', ${docsEmbeddings.chunkText})`
    ),

    // Timestamps
    createdAt: timestamp('created_at').notNull().defaultNow(),
    updatedAt: timestamp('updated_at').notNull().defaultNow(),
  },
  (table) => ({
    // Source document queries
    sourceDocumentIdx: index('docs_emb_source_document_idx').on(table.sourceDocument),

    // Header level filtering
    headerLevelIdx: index('docs_emb_header_level_idx').on(table.headerLevel),

    // Combined source and header queries
    sourceHeaderIdx: index('docs_emb_source_header_idx').on(table.sourceDocument, table.headerLevel),

    // Model-specific queries
    modelIdx: index('docs_emb_model_idx').on(table.embeddingModel),

    // Timestamp queries
    createdAtIdx: index('docs_emb_created_at_idx').on(table.createdAt),

    // Vector similarity search indexes (HNSW) - optimized for documentation embeddings
    embeddingVectorHnswIdx: index('docs_embedding_vector_hnsw_idx')
      .using('hnsw', table.embedding.op('vector_cosine_ops'))
      .with({
        m: 16,
        ef_construction: 64,
      }),

    // GIN index for JSONB metadata queries
    metadataGinIdx: index('docs_emb_metadata_gin_idx').using('gin', table.metadata),

    // Full-text search index
    chunkTextFtsIdx: index('docs_emb_chunk_text_fts_idx').using('gin', table.chunkTextTsv),

    // Constraints
    embeddingNotNullCheck: check('docs_embedding_not_null_check', sql`"embedding" IS NOT NULL`),
    headerLevelCheck: check('docs_header_level_check', sql`"header_level" >= 1 AND "header_level" <= 6`),
  })
)
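A minimal sketch (not part of the commit) of how the generated tsvector column and the JSONB metadata GIN index could be combined from this schema; the helper name and the hypothetical "section" metadata key are assumptions:

// Illustrative sketch only: full-text search over chunk_text_tsv plus an optional
// JSONB containment filter. The "section" key in metadata is hypothetical.
import { and, sql } from 'drizzle-orm'
import { db } from '@/db'
import { docsEmbeddings } from '@/db/schema'

async function searchDocsText(query: string, section?: string, limit = 10) {
  const conditions = [
    // Served by docs_emb_chunk_text_fts_idx
    sql`${docsEmbeddings.chunkTextTsv} @@ plainto_tsquery('english', ${query})`,
  ]
  if (section) {
    // Served by docs_emb_metadata_gin_idx (jsonb containment)
    conditions.push(sql`${docsEmbeddings.metadata} @> ${JSON.stringify({ section })}::jsonb`)
  }
  return db
    .select({
      chunkText: docsEmbeddings.chunkText,
      sourceLink: docsEmbeddings.sourceLink,
      headerText: docsEmbeddings.headerText,
    })
    .from(docsEmbeddings)
    .where(and(...conditions))
    .orderBy(docsEmbeddings.createdAt)
    .limit(limit)
}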
@@ -23,9 +23,9 @@ export class DocsChunker {
   constructor(options: DocsChunkerOptions = {}) {
     // Use the existing TextChunker for chunking logic
     this.textChunker = new TextChunker({
-      chunkSize: options.chunkSize ?? 1024,
+      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
       minChunkSize: options.minChunkSize ?? 100,
-      overlap: options.overlap ?? 200,
+      overlap: options.overlap ?? 50,
     })
     this.baseUrl = options.baseUrl ?? 'https://docs.simstudio.ai'
   }
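For a sense of scale, a small illustrative snippet (not from the commit): using the same 1 token ≈ 4 characters approximation the chunker applies in estimateTokens further down, the new defaults put a chunk at roughly 1200 characters with about 200 characters of overlap.

// Rough character budgets implied by the new defaults (illustrative only,
// reusing the 1 token ≈ 4 characters estimate from estimateTokens below).
const APPROX_CHARS_PER_TOKEN = 4
const maxChunkChars = 300 * APPROX_CHARS_PER_TOKEN // ≈ 1200 characters per chunk
const overlapChars = 50 * APPROX_CHARS_PER_TOKEN // ≈ 200 characters of overlap between chunks
console.log({ maxChunkChars, overlapChars })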
@@ -209,16 +209,25 @@ export class DocsChunker {
   }

   /**
-   * Split content into chunks using the existing TextChunker
+   * Split content into chunks using the existing TextChunker with table awareness
    */
   private async splitContent(content: string): Promise<string[]> {
     // Clean the content first
     const cleanedContent = this.cleanContent(content)

+    // Detect table boundaries to avoid splitting them
+    const tableBoundaries = this.detectTableBoundaries(cleanedContent)
+
     // Use the existing TextChunker
     const chunks = await this.textChunker.chunk(cleanedContent)

-    return chunks.map((chunk) => chunk.text)
+    // Post-process chunks to ensure tables aren't split
+    const processedChunks = this.mergeTableChunks(chunks.map(chunk => chunk.text), tableBoundaries, cleanedContent)
+
+    // Ensure no chunk exceeds 300 tokens
+    const finalChunks = this.enforceSizeLimit(processedChunks)
+
+    return finalChunks
   }

   /**

@@ -239,20 +248,20 @@ export class DocsChunker {
    )
  }

  /**
   * Parse frontmatter from MDX content
   */
  private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
    const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
    const match = content.match(frontmatterRegex)

    if (!match) {
      return { data: {}, content }
    }

    const [, frontmatterText, markdownContent] = match
    const data: Frontmatter = {}

    // Simple YAML parsing for title and description
    const lines = frontmatterText.split('\n')
    for (const line of lines) {

@@ -266,7 +275,357 @@ export class DocsChunker {
        data[key] = value
      }
    }

    return { data, content: markdownContent }
  }

  /**
   * Split content by headers to respect document structure
   */
  private splitByHeaders(content: string): Array<{ header: string | null; content: string; level: number }> {
    const lines = content.split('\n')
    const sections: Array<{ header: string | null; content: string; level: number }> = []

    let currentHeader: string | null = null
    let currentLevel = 0
    let currentContent: string[] = []

    for (const line of lines) {
      const headerMatch = line.match(/^(#{1,3})\s+(.+)$/) // Only split on H1-H3, not H4-H6

      if (headerMatch) {
        // Save previous section
        if (currentContent.length > 0) {
          sections.push({
            header: currentHeader,
            content: currentContent.join('\n').trim(),
            level: currentLevel
          })
        }

        // Start new section
        currentHeader = line
        currentLevel = headerMatch[1].length
        currentContent = []
      } else {
        currentContent.push(line)
      }
    }

    // Add final section
    if (currentContent.length > 0) {
      sections.push({
        header: currentHeader,
        content: currentContent.join('\n').trim(),
        level: currentLevel
      })
    }

    return sections.filter(section => section.content.trim().length > 0)
  }

  /**
   * Estimate token count (rough approximation)
   */
  private estimateTokens(text: string): number {
    // Rough approximation: 1 token ≈ 4 characters
    return Math.ceil(text.length / 4)
  }

  /**
   * Merge small adjacent chunks to reach target size
   */
  private mergeSmallChunks(chunks: string[]): string[] {
    const merged: string[] = []
    let currentChunk = ''

    for (const chunk of chunks) {
      const currentTokens = this.estimateTokens(currentChunk)
      const chunkTokens = this.estimateTokens(chunk)

      // If adding this chunk would exceed target size, save current and start new
      if (currentTokens > 0 && currentTokens + chunkTokens > 500) {
        if (currentChunk.trim()) {
          merged.push(currentChunk.trim())
        }
        currentChunk = chunk
      } else {
        // Merge with current chunk
        currentChunk = currentChunk ? `${currentChunk}\n\n${chunk}` : chunk
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      merged.push(currentChunk.trim())
    }

    return merged
  }

  /**
   * Chunk a section while preserving tables and structure
   */
  private async chunkSection(section: { header: string | null; content: string; level: number }): Promise<string[]> {
    const content = section.content
    const header = section.header

    // Check if content contains tables
    const hasTable = this.containsTable(content)

    if (hasTable) {
      // Split by tables and handle each part
      return this.splitContentWithTables(content, header)
    } else {
      // Regular chunking for text-only content
      const chunks = await this.textChunker.chunk(content)
      return chunks.map((chunk, index) => {
        // Add header to first chunk only
        if (index === 0 && header) {
          return `${header}\n\n${chunk.text}`.trim()
        }
        return chunk.text
      })
    }
  }

  /**
   * Check if content contains markdown tables
   */
  private containsTable(content: string): boolean {
    const lines = content.split('\n')
    return lines.some((line, index) => {
      if (line.includes('|') && line.split('|').length >= 3) {
        const nextLine = lines[index + 1]
        return nextLine && nextLine.includes('|') && nextLine.includes('-')
      }
      return false
    })
  }

  /**
   * Split content that contains tables, keeping tables intact
   */
  private splitContentWithTables(content: string, header: string | null): string[] {
    const lines = content.split('\n')
    const chunks: string[] = []
    let currentChunk: string[] = []
    let inTable = false
    let tableLines: string[] = []

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]

      // Detect table start
      if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
        const nextLine = lines[i + 1]
        if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
          inTable = true

          // Save current chunk if it has content
          if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
            const chunkText = currentChunk.join('\n').trim()
            const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
            chunks.push(withHeader)
            currentChunk = []
          }

          tableLines = [line]
          continue
        }
      }

      if (inTable) {
        tableLines.push(line)

        // Detect table end
        if (!line.includes('|') || line.trim() === '') {
          inTable = false

          // Save table as its own chunk
          const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
          if (tableText.length > 0) {
            const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
            chunks.push(withHeader)
          }

          tableLines = []

          // Start new chunk if current line has content
          if (line.trim() !== '') {
            currentChunk = [line]
          }
        }
      } else {
        currentChunk.push(line)

        // If chunk is getting large, save it
        if (this.estimateTokens(currentChunk.join('\n')) > 250) {
          const chunkText = currentChunk.join('\n').trim()
          if (chunkText.length > 50) {
            const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
            chunks.push(withHeader)
          }
          currentChunk = []
        }
      }
    }

    // Handle remaining content
    if (inTable && tableLines.length > 0) {
      const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
      if (tableText.length > 0) {
        const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
        chunks.push(withHeader)
      }
    } else if (currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n').trim()
      if (chunkText.length > 50) {
        const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
        chunks.push(withHeader)
      }
    }

    return chunks.filter(chunk => chunk.trim().length > 50)
  }

  /**
   * Detect table boundaries in markdown content to avoid splitting them
   */
  private detectTableBoundaries(content: string): { start: number; end: number }[] {
    const tables: { start: number; end: number }[] = []
    const lines = content.split('\n')

    let inTable = false
    let tableStart = -1

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim()

      // Detect table start (markdown table row with pipes)
      if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
        // Check if next line is table separator (contains dashes and pipes)
        const nextLine = lines[i + 1]?.trim()
        if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
          inTable = true
          tableStart = i
        }
      }
      // Detect table end (empty line or non-table content)
      else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
        tables.push({
          start: this.getCharacterPosition(lines, tableStart),
          end: this.getCharacterPosition(lines, i - 1) + lines[i - 1]?.length || 0
        })
        inTable = false
      }
    }

    // Handle table at end of content
    if (inTable && tableStart >= 0) {
      tables.push({
        start: this.getCharacterPosition(lines, tableStart),
        end: content.length
      })
    }

    return tables
  }

  /**
   * Get character position from line number
   */
  private getCharacterPosition(lines: string[], lineIndex: number): number {
    return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0)
  }

  /**
   * Merge chunks that would split tables
   */
  private mergeTableChunks(chunks: string[], tableBoundaries: { start: number; end: number }[], originalContent: string): string[] {
    if (tableBoundaries.length === 0) {
      return chunks
    }

    const mergedChunks: string[] = []
    let currentPosition = 0

    for (const chunk of chunks) {
      const chunkStart = originalContent.indexOf(chunk, currentPosition)
      const chunkEnd = chunkStart + chunk.length

      // Check if this chunk intersects with any table
      const intersectsTable = tableBoundaries.some(table =>
        (chunkStart >= table.start && chunkStart <= table.end) ||
        (chunkEnd >= table.start && chunkEnd <= table.end) ||
        (chunkStart <= table.start && chunkEnd >= table.end)
      )

      if (intersectsTable) {
        // Find which table(s) this chunk intersects with
        const affectedTables = tableBoundaries.filter(table =>
          (chunkStart >= table.start && chunkStart <= table.end) ||
          (chunkEnd >= table.start && chunkEnd <= table.end) ||
          (chunkStart <= table.start && chunkEnd >= table.end)
        )

        // Create a chunk that includes the complete table(s)
        const minStart = Math.min(chunkStart, ...affectedTables.map(t => t.start))
        const maxEnd = Math.max(chunkEnd, ...affectedTables.map(t => t.end))
        const completeChunk = originalContent.slice(minStart, maxEnd)

        // Only add if we haven't already included this content
        if (!mergedChunks.some(existing => existing.includes(completeChunk.trim()))) {
          mergedChunks.push(completeChunk.trim())
        }
      } else {
        mergedChunks.push(chunk)
      }

      currentPosition = chunkEnd
    }

    return mergedChunks.filter(chunk => chunk.length > 50) // Filter out tiny chunks
  }

  /**
   * Enforce 300 token size limit on chunks
   */
  private enforceSizeLimit(chunks: string[]): string[] {
    const finalChunks: string[] = []

    for (const chunk of chunks) {
      const tokens = this.estimateTokens(chunk)

      if (tokens <= 300) {
        // Chunk is within limit
        finalChunks.push(chunk)
      } else {
        // Chunk is too large - split it
        const lines = chunk.split('\n')
        let currentChunk = ''

        for (const line of lines) {
          const testChunk = currentChunk ? `${currentChunk}\n${line}` : line

          if (this.estimateTokens(testChunk) <= 300) {
            currentChunk = testChunk
          } else {
            // Adding this line would exceed limit
            if (currentChunk.trim()) {
              finalChunks.push(currentChunk.trim())
            }
            currentChunk = line
          }
        }

        // Add final chunk if it has content
        if (currentChunk.trim()) {
          finalChunks.push(currentChunk.trim())
        }
      }
    }

    return finalChunks.filter(chunk => chunk.trim().length > 100)
  }
}
apps/sim/scripts/process-docs-embeddings.ts (new file, 215 lines)
@@ -0,0 +1,215 @@
#!/usr/bin/env bun

import path from 'path'
import { DocsChunker } from '@/lib/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console-logger'
import { db } from '@/db'
import { docsEmbeddings } from '@/db/schema'
import { sql } from 'drizzle-orm'

const logger = createLogger('ProcessDocsEmbeddings')

interface ProcessingOptions {
  /** Clear existing docs embeddings before processing */
  clearExisting?: boolean
  /** Path to docs directory */
  docsPath?: string
  /** Base URL for generating links */
  baseUrl?: string
  /** Chunk size in tokens */
  chunkSize?: number
  /** Minimum chunk size in tokens */
  minChunkSize?: number
  /** Overlap between chunks in tokens */
  overlap?: number
}

/**
 * Production script to process documentation and save embeddings to database
 */
async function processDocsEmbeddings(options: ProcessingOptions = {}) {
  const startTime = Date.now()
  let processedChunks = 0
  let failedChunks = 0

  try {
    // Configuration
    const config = {
      clearExisting: options.clearExisting ?? false,
      docsPath: options.docsPath ?? path.join(process.cwd(), '../../apps/docs/content/docs'),
      baseUrl: options.baseUrl ?? 'https://docs.simstudio.ai',
      chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
      minChunkSize: options.minChunkSize ?? 100,
      overlap: options.overlap ?? 50,
    }

    logger.info('🚀 Starting docs embedding processing...')
    logger.info(`Configuration:`, {
      docsPath: config.docsPath,
      baseUrl: config.baseUrl,
      chunkSize: config.chunkSize,
      clearExisting: config.clearExisting,
    })

    // Clear existing embeddings if requested
    if (config.clearExisting) {
      logger.info('🗑️ Clearing existing docs embeddings...')
      const deleteResult = await db.delete(docsEmbeddings)
      logger.info(`Deleted existing embeddings`)
    }

    // Initialize the docs chunker
    const chunker = new DocsChunker({
      chunkSize: config.chunkSize,
      minChunkSize: config.minChunkSize,
      overlap: config.overlap,
      baseUrl: config.baseUrl,
    })

    // Process all .mdx files
    logger.info(`📚 Processing docs from: ${config.docsPath}`)
    const chunks = await chunker.chunkAllDocs(config.docsPath)

    if (chunks.length === 0) {
      logger.warn('⚠️ No chunks generated from docs')
      return { success: false, processedChunks: 0, failedChunks: 0 }
    }

    logger.info(`📊 Generated ${chunks.length} chunks with embeddings`)

    // Save chunks to database in batches for better performance
    const batchSize = 10
    logger.info(`💾 Saving chunks to database (batch size: ${batchSize})...`)

    for (let i = 0; i < chunks.length; i += batchSize) {
      const batch = chunks.slice(i, i + batchSize)

      try {
        // Prepare batch data
        const batchData = batch.map((chunk) => ({
          chunkText: chunk.text,
          sourceDocument: chunk.sourceDocument,
          sourceLink: chunk.headerLink,
          headerText: chunk.headerText,
          headerLevel: chunk.headerLevel,
          tokenCount: chunk.tokenCount,
          embedding: chunk.embedding,
          embeddingModel: chunk.embeddingModel,
          metadata: chunk.metadata,
        }))

        // Insert batch
        await db.insert(docsEmbeddings).values(batchData)

        processedChunks += batch.length

        if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
          logger.info(` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`)
        }
      } catch (error) {
        logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
        failedChunks += batch.length
      }
    }

    // Verify results
    const savedCount = await db
      .select({ count: sql<number>`count(*)` })
      .from(docsEmbeddings)
      .then((result) => result[0]?.count || 0)

    const duration = Date.now() - startTime

    logger.info(`✅ Processing complete!`)
    logger.info(`📊 Results:`)
    logger.info(`  • Total chunks processed: ${chunks.length}`)
    logger.info(`  • Successfully saved: ${processedChunks}`)
    logger.info(`  • Failed: ${failedChunks}`)
    logger.info(`  • Database total: ${savedCount}`)
    logger.info(`  • Duration: ${Math.round(duration / 1000)}s`)

    // Summary by document
    const documentStats = chunks.reduce((acc, chunk) => {
      if (!acc[chunk.sourceDocument]) {
        acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
      }
      acc[chunk.sourceDocument].chunks++
      acc[chunk.sourceDocument].tokens += chunk.tokenCount
      return acc
    }, {} as Record<string, { chunks: number; tokens: number }>)

    logger.info(`📋 Document breakdown:`)
    Object.entries(documentStats)
      .sort(([, a], [, b]) => b.chunks - a.chunks)
      .slice(0, 10) // Top 10 documents
      .forEach(([doc, stats]) => {
        logger.info(`  • ${doc}: ${stats.chunks} chunks, ${stats.tokens} tokens`)
      })

    if (Object.keys(documentStats).length > 10) {
      logger.info(`  • ... and ${Object.keys(documentStats).length - 10} more documents`)
    }

    return {
      success: failedChunks === 0,
      processedChunks,
      failedChunks,
      totalChunks: chunks.length,
      databaseCount: savedCount,
      duration,
    }
  } catch (error) {
    logger.error('💥 Fatal error during processing:', error)
    return {
      success: false,
      processedChunks,
      failedChunks,
      error: error instanceof Error ? error.message : 'Unknown error',
    }
  }
}

/**
 * Main function - handle command line arguments
 */
async function main() {
  const args = process.argv.slice(2)
  const options: ProcessingOptions = {}

  // Parse command line arguments
  if (args.includes('--clear')) {
    options.clearExisting = true
  }

  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
Usage: bun run scripts/process-docs-embeddings.ts [options]

Options:
  --clear      Clear existing docs embeddings before processing
  --help, -h   Show this help message

Examples:
  bun run scripts/process-docs-embeddings.ts
  bun run scripts/process-docs-embeddings.ts --clear
`)
    process.exit(0)
  }

  const result = await processDocsEmbeddings(options)

  if (!result.success) {
    process.exit(1)
  }
}

// Run the script
if (process.argv[1]?.includes('process-docs-embeddings')) {
  main().catch((error) => {
    console.error('Script failed:', error)
    process.exit(1)
  })
}

export { processDocsEmbeddings }
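Since processDocsEmbeddings is exported, the script can also be driven programmatically rather than through the CLI flags above. A small illustrative sketch (not part of the commit); the relative import path is an assumption:

// Illustrative only: invoking the exported function from another Bun script,
// clearing previously stored docs embeddings before re-processing.
import { processDocsEmbeddings } from './process-docs-embeddings' // path is an assumption

const result = await processDocsEmbeddings({ clearExisting: true, chunkSize: 300, overlap: 50 })
if (!result.success) {
  console.error('Docs embedding run failed', result)
  process.exit(1)
}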