This commit is contained in:
Siddharth Ganesan
2025-07-08 16:11:55 -07:00
parent b9fa50b4de
commit 70a5f4ec31
5 changed files with 249 additions and 417 deletions

View File

@@ -93,12 +93,8 @@
"name": "account_user_id_user_id_fk",
"tableFrom": "account",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -170,12 +166,8 @@
"name": "api_key_user_id_user_id_fk",
"tableFrom": "api_key",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -185,9 +177,7 @@
"api_key_key_unique": {
"name": "api_key_key_unique",
"nullsNotDistinct": false,
"columns": [
"key"
]
"columns": ["key"]
}
},
"policies": {},
@@ -312,12 +302,8 @@
"name": "chat_workflow_id_workflow_id_fk",
"tableFrom": "chat",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -325,12 +311,8 @@
"name": "chat_user_id_user_id_fk",
"tableFrom": "chat",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -396,12 +378,8 @@
"name": "custom_tools_user_id_user_id_fk",
"tableFrom": "custom_tools",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -838,12 +816,8 @@
"name": "document_knowledge_base_id_knowledge_base_id_fk",
"tableFrom": "document",
"tableTo": "knowledge_base",
"columnsFrom": [
"knowledge_base_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["knowledge_base_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1140,12 +1114,8 @@
"name": "embedding_knowledge_base_id_knowledge_base_id_fk",
"tableFrom": "embedding",
"tableTo": "knowledge_base",
"columnsFrom": [
"knowledge_base_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["knowledge_base_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -1153,12 +1123,8 @@
"name": "embedding_document_id_document_id_fk",
"tableFrom": "embedding",
"tableTo": "document",
"columnsFrom": [
"document_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["document_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1210,12 +1176,8 @@
"name": "environment_user_id_user_id_fk",
"tableFrom": "environment",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1225,9 +1187,7 @@
"environment_user_id_unique": {
"name": "environment_user_id_unique",
"nullsNotDistinct": false,
"columns": [
"user_id"
]
"columns": ["user_id"]
}
},
"policies": {},
@@ -1294,12 +1254,8 @@
"name": "invitation_inviter_id_user_id_fk",
"tableFrom": "invitation",
"tableTo": "user",
"columnsFrom": [
"inviter_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["inviter_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -1307,12 +1263,8 @@
"name": "invitation_organization_id_organization_id_fk",
"tableFrom": "invitation",
"tableTo": "organization",
"columnsFrom": [
"organization_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["organization_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1479,12 +1431,8 @@
"name": "knowledge_base_user_id_user_id_fk",
"tableFrom": "knowledge_base",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -1492,12 +1440,8 @@
"name": "knowledge_base_workspace_id_workspace_id_fk",
"tableFrom": "knowledge_base",
"tableTo": "workspace",
"columnsFrom": [
"workspace_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workspace_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1588,12 +1532,8 @@
"name": "marketplace_workflow_id_workflow_id_fk",
"tableFrom": "marketplace",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -1601,12 +1541,8 @@
"name": "marketplace_author_id_user_id_fk",
"tableFrom": "marketplace",
"tableTo": "user",
"columnsFrom": [
"author_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["author_id"],
"columnsTo": ["id"],
"onDelete": "no action",
"onUpdate": "no action"
}
@@ -1659,12 +1595,8 @@
"name": "member_user_id_user_id_fk",
"tableFrom": "member",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -1672,12 +1604,8 @@
"name": "member_organization_id_organization_id_fk",
"tableFrom": "member",
"tableTo": "organization",
"columnsFrom": [
"organization_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["organization_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -1801,12 +1729,8 @@
"name": "memory_workflow_id_workflow_id_fk",
"tableFrom": "memory",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -2069,12 +1993,8 @@
"name": "permissions_user_id_user_id_fk",
"tableFrom": "permissions",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -2150,12 +2070,8 @@
"name": "session_user_id_user_id_fk",
"tableFrom": "session",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -2163,12 +2079,8 @@
"name": "session_active_organization_id_organization_id_fk",
"tableFrom": "session",
"tableTo": "organization",
"columnsFrom": [
"active_organization_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["active_organization_id"],
"columnsTo": ["id"],
"onDelete": "set null",
"onUpdate": "no action"
}
@@ -2178,9 +2090,7 @@
"session_token_unique": {
"name": "session_token_unique",
"nullsNotDistinct": false,
"columns": [
"token"
]
"columns": ["token"]
}
},
"policies": {},
@@ -2280,12 +2190,8 @@
"name": "settings_user_id_user_id_fk",
"tableFrom": "settings",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -2295,9 +2201,7 @@
"settings_user_id_unique": {
"name": "settings_user_id_unique",
"nullsNotDistinct": false,
"columns": [
"user_id"
]
"columns": ["user_id"]
}
},
"policies": {},
@@ -2482,9 +2386,7 @@
"user_email_unique": {
"name": "user_email_unique",
"nullsNotDistinct": false,
"columns": [
"email"
]
"columns": ["email"]
}
},
"policies": {},
@@ -2617,12 +2519,8 @@
"name": "user_stats_user_id_user_id_fk",
"tableFrom": "user_stats",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -2632,9 +2530,7 @@
"user_stats_user_id_unique": {
"name": "user_stats_user_id_unique",
"nullsNotDistinct": false,
"columns": [
"user_id"
]
"columns": ["user_id"]
}
},
"policies": {},
@@ -2735,9 +2631,7 @@
"waitlist_email_unique": {
"name": "waitlist_email_unique",
"nullsNotDistinct": false,
"columns": [
"email"
]
"columns": ["email"]
}
},
"policies": {},
@@ -2822,12 +2716,8 @@
"name": "webhook_workflow_id_workflow_id_fk",
"tableFrom": "webhook",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -2975,12 +2865,8 @@
"name": "workflow_user_id_user_id_fk",
"tableFrom": "workflow",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -2988,12 +2874,8 @@
"name": "workflow_workspace_id_workspace_id_fk",
"tableFrom": "workflow",
"tableTo": "workspace",
"columnsFrom": [
"workspace_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workspace_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -3001,12 +2883,8 @@
"name": "workflow_folder_id_workflow_folder_id_fk",
"tableFrom": "workflow",
"tableTo": "workflow_folder",
"columnsFrom": [
"folder_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["folder_id"],
"columnsTo": ["id"],
"onDelete": "set null",
"onUpdate": "no action"
}
@@ -3219,12 +3097,8 @@
"name": "workflow_blocks_workflow_id_workflow_id_fk",
"tableFrom": "workflow_blocks",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -3377,12 +3251,8 @@
"name": "workflow_edges_workflow_id_workflow_id_fk",
"tableFrom": "workflow_edges",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -3390,12 +3260,8 @@
"name": "workflow_edges_source_block_id_workflow_blocks_id_fk",
"tableFrom": "workflow_edges",
"tableTo": "workflow_blocks",
"columnsFrom": [
"source_block_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["source_block_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -3403,12 +3269,8 @@
"name": "workflow_edges_target_block_id_workflow_blocks_id_fk",
"tableFrom": "workflow_edges",
"tableTo": "workflow_blocks",
"columnsFrom": [
"target_block_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["target_block_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -3717,12 +3579,8 @@
"name": "workflow_execution_blocks_workflow_id_workflow_id_fk",
"tableFrom": "workflow_execution_blocks",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -3991,12 +3849,8 @@
"name": "workflow_execution_logs_workflow_id_workflow_id_fk",
"tableFrom": "workflow_execution_logs",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -4004,12 +3858,8 @@
"name": "workflow_execution_logs_state_snapshot_id_workflow_execution_snapshots_id_fk",
"tableFrom": "workflow_execution_logs",
"tableTo": "workflow_execution_snapshots",
"columnsFrom": [
"state_snapshot_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["state_snapshot_id"],
"columnsTo": ["id"],
"onDelete": "no action",
"onUpdate": "no action"
}
@@ -4129,12 +3979,8 @@
"name": "workflow_execution_snapshots_workflow_id_workflow_id_fk",
"tableFrom": "workflow_execution_snapshots",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4279,12 +4125,8 @@
"name": "workflow_folder_user_id_user_id_fk",
"tableFrom": "workflow_folder",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -4292,12 +4134,8 @@
"name": "workflow_folder_workspace_id_workspace_id_fk",
"tableFrom": "workflow_folder",
"tableTo": "workspace",
"columnsFrom": [
"workspace_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workspace_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4374,12 +4212,8 @@
"name": "workflow_logs_workflow_id_workflow_id_fk",
"tableFrom": "workflow_logs",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4478,12 +4312,8 @@
"name": "workflow_schedule_workflow_id_workflow_id_fk",
"tableFrom": "workflow_schedule",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4493,9 +4323,7 @@
"workflow_schedule_workflow_id_unique": {
"name": "workflow_schedule_workflow_id_unique",
"nullsNotDistinct": false,
"columns": [
"workflow_id"
]
"columns": ["workflow_id"]
}
},
"policies": {},
@@ -4589,12 +4417,8 @@
"name": "workflow_subflows_workflow_id_workflow_id_fk",
"tableFrom": "workflow_subflows",
"tableTo": "workflow",
"columnsFrom": [
"workflow_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workflow_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4648,12 +4472,8 @@
"name": "workspace_owner_id_user_id_fk",
"tableFrom": "workspace",
"tableTo": "user",
"columnsFrom": [
"owner_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["owner_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4747,12 +4567,8 @@
"name": "workspace_invitation_workspace_id_workspace_id_fk",
"tableFrom": "workspace_invitation",
"tableTo": "workspace",
"columnsFrom": [
"workspace_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workspace_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -4760,12 +4576,8 @@
"name": "workspace_invitation_inviter_id_user_id_fk",
"tableFrom": "workspace_invitation",
"tableTo": "user",
"columnsFrom": [
"inviter_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["inviter_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4775,9 +4587,7 @@
"workspace_invitation_token_unique": {
"name": "workspace_invitation_token_unique",
"nullsNotDistinct": false,
"columns": [
"token"
]
"columns": ["token"]
}
},
"policies": {},
@@ -4856,12 +4666,8 @@
"name": "workspace_member_workspace_id_workspace_id_fk",
"tableFrom": "workspace_member",
"tableTo": "workspace",
"columnsFrom": [
"workspace_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["workspace_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
},
@@ -4869,12 +4675,8 @@
"name": "workspace_member_user_id_user_id_fk",
"tableFrom": "workspace_member",
"tableTo": "user",
"columnsFrom": [
"user_id"
],
"columnsTo": [
"id"
],
"columnsFrom": ["user_id"],
"columnsTo": ["id"],
"onDelete": "cascade",
"onUpdate": "no action"
}
@@ -4890,11 +4692,7 @@
"public.permission_type": {
"name": "permission_type",
"schema": "public",
"values": [
"admin",
"write",
"read"
]
"values": ["admin", "write", "read"]
}
},
"schemas": {},
@@ -4907,4 +4705,4 @@
"schemas": {},
"tables": {}
}
}
}

View File

@@ -360,4 +360,4 @@
"breakpoints": true
}
]
}
}

View File

@@ -946,7 +946,10 @@ export const docsEmbeddings = pgTable(
headerLevelIdx: index('docs_emb_header_level_idx').on(table.headerLevel),
// Combined source and header queries
sourceHeaderIdx: index('docs_emb_source_header_idx').on(table.sourceDocument, table.headerLevel),
sourceHeaderIdx: index('docs_emb_source_header_idx').on(
table.sourceDocument,
table.headerLevel
),
// Model-specific queries
modelIdx: index('docs_emb_model_idx').on(table.embeddingModel),
@@ -970,6 +973,9 @@ export const docsEmbeddings = pgTable(
// Constraints
embeddingNotNullCheck: check('docs_embedding_not_null_check', sql`"embedding" IS NOT NULL`),
headerLevelCheck: check('docs_header_level_check', sql`"header_level" >= 1 AND "header_level" <= 6`),
headerLevelCheck: check(
'docs_header_level_check',
sql`"header_level" >= 1 AND "header_level" <= 6`
),
})
)

View File

@@ -214,19 +214,23 @@ export class DocsChunker {
private async splitContent(content: string): Promise<string[]> {
  // Pipeline: clean -> locate tables -> chunk -> re-join split tables -> cap chunk size.
  const cleaned = this.cleanContent(content)

  // Table ranges are computed on the cleaned text so they line up with the chunker's input
  const tableRanges = this.detectTableBoundaries(cleaned)

  // Delegate the actual splitting to the shared TextChunker and keep only the chunk text
  const rawChunks = await this.textChunker.chunk(cleaned)
  const chunkTexts = rawChunks.map((piece) => piece.text)

  // Widen any chunk that cuts through a table so the table stays whole
  const tableSafeChunks = this.mergeTableChunks(chunkTexts, tableRanges, cleaned)

  // Final pass: ensure no chunk exceeds 300 tokens
  return this.enforceSizeLimit(tableSafeChunks)
}
@@ -248,20 +252,20 @@ export class DocsChunker {
)
}
/**
/**
* Parse frontmatter from MDX content
*/
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
if (!match) {
return { data: {}, content }
}
const [, frontmatterText, markdownContent] = match
const data: Frontmatter = {}
// Simple YAML parsing for title and description
const lines = frontmatterText.split('\n')
for (const line of lines) {
@@ -275,34 +279,36 @@ export class DocsChunker {
data[key] = value
}
}
return { data, content: markdownContent }
}
/**
* Split content by headers to respect document structure
*/
private splitByHeaders(content: string): Array<{ header: string | null; content: string; level: number }> {
private splitByHeaders(
content: string
): Array<{ header: string | null; content: string; level: number }> {
const lines = content.split('\n')
const sections: Array<{ header: string | null; content: string; level: number }> = []
let currentHeader: string | null = null
let currentLevel = 0
let currentContent: string[] = []
for (const line of lines) {
const headerMatch = line.match(/^(#{1,3})\s+(.+)$/) // Only split on H1-H3, not H4-H6
if (headerMatch) {
// Save previous section
if (currentContent.length > 0) {
sections.push({
header: currentHeader,
content: currentContent.join('\n').trim(),
level: currentLevel
level: currentLevel,
})
}
// Start new section
currentHeader = line
currentLevel = headerMatch[1].length
@@ -311,17 +317,17 @@ export class DocsChunker {
currentContent.push(line)
}
}
// Add final section
if (currentContent.length > 0) {
sections.push({
header: currentHeader,
content: currentContent.join('\n').trim(),
level: currentLevel
level: currentLevel,
})
}
return sections.filter(section => section.content.trim().length > 0)
return sections.filter((section) => section.content.trim().length > 0)
}
/**
@@ -338,11 +344,11 @@ export class DocsChunker {
private mergeSmallChunks(chunks: string[]): string[] {
const merged: string[] = []
let currentChunk = ''
for (const chunk of chunks) {
const currentTokens = this.estimateTokens(currentChunk)
const chunkTokens = this.estimateTokens(chunk)
// If adding this chunk would exceed target size, save current and start new
if (currentTokens > 0 && currentTokens + chunkTokens > 500) {
if (currentChunk.trim()) {
@@ -354,39 +360,42 @@ export class DocsChunker {
currentChunk = currentChunk ? `${currentChunk}\n\n${chunk}` : chunk
}
}
// Add final chunk
if (currentChunk.trim()) {
merged.push(currentChunk.trim())
}
return merged
}
/**
* Chunk a section while preserving tables and structure
*/
private async chunkSection(section: {
  header: string | null
  content: string
  level: number
}): Promise<string[]> {
  // Turns one header-delimited section into chunk strings, keeping tables and structure intact.
  const { content: body, header: heading } = section

  // Sections containing a markdown table go through the table-aware splitter
  if (this.containsTable(body)) {
    return this.splitContentWithTables(body, heading)
  }

  // Text-only sections use the regular chunker
  const pieces = await this.textChunker.chunk(body)

  // Only the first chunk carries the section header
  return pieces.map((piece, position) =>
    position === 0 && heading ? `${heading}\n\n${piece.text}`.trim() : piece.text
  )
}
/**
@@ -397,7 +406,7 @@ export class DocsChunker {
return lines.some((line, index) => {
if (line.includes('|') && line.split('|').length >= 3) {
const nextLine = lines[index + 1]
return nextLine && nextLine.includes('|') && nextLine.includes('-')
return nextLine?.includes('|') && nextLine.includes('-')
}
return false
})
@@ -412,45 +421,50 @@ export class DocsChunker {
let currentChunk: string[] = []
let inTable = false
let tableLines: string[] = []
for (let i = 0; i < lines.length; i++) {
const line = lines[i]
// Detect table start
if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
const nextLine = lines[i + 1]
if (nextLine && nextLine.includes('|') && nextLine.includes('-')) {
if (nextLine?.includes('|') && nextLine.includes('-')) {
inTable = true
// Save current chunk if it has content
if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
const chunkText = currentChunk.join('\n').trim()
const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
const withHeader =
chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
chunks.push(withHeader)
currentChunk = []
}
tableLines = [line]
continue
}
}
if (inTable) {
tableLines.push(line)
// Detect table end
if (!line.includes('|') || line.trim() === '') {
inTable = false
// Save table as its own chunk
const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
const tableText = tableLines
.filter((l) => l.trim())
.join('\n')
.trim()
if (tableText.length > 0) {
const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
const withHeader =
chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
chunks.push(withHeader)
}
tableLines = []
// Start new chunk if current line has content
if (line.trim() !== '') {
currentChunk = [line]
@@ -458,22 +472,26 @@ export class DocsChunker {
}
} else {
currentChunk.push(line)
// If chunk is getting large, save it
if (this.estimateTokens(currentChunk.join('\n')) > 250) {
const chunkText = currentChunk.join('\n').trim()
if (chunkText.length > 50) {
const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
const withHeader =
chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
chunks.push(withHeader)
}
currentChunk = []
}
}
}
// Handle remaining content
if (inTable && tableLines.length > 0) {
const tableText = tableLines.filter(l => l.trim()).join('\n').trim()
const tableText = tableLines
.filter((l) => l.trim())
.join('\n')
.trim()
if (tableText.length > 0) {
const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
chunks.push(withHeader)
@@ -485,8 +503,8 @@ export class DocsChunker {
chunks.push(withHeader)
}
}
return chunks.filter(chunk => chunk.trim().length > 50)
return chunks.filter((chunk) => chunk.trim().length > 50)
}
/**
@@ -495,40 +513,40 @@ export class DocsChunker {
private detectTableBoundaries(content: string): { start: number; end: number }[] {
  // Scans `content` line by line and returns one { start, end } range per markdown table.
  // A table starts on a row with at least two '|' separators whose following line contains
  // both '|' and '-'. It ends at the first later line that has no '|', is empty, or starts
  // with '#'. Offsets are built from this.getCharacterPosition(lines, lineIndex).
  // NOTE(review): presumably that helper returns the character offset at which the given
  // line starts; confirm against its definition.
  const tables: { start: number; end: number }[] = []
  const lines = content.split('\n')

  let inTable = false
  let tableStart = -1

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()

    // Detect table start (markdown table row with pipes)
    if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
      // Check if next line is table separator (contains dashes and pipes)
      const nextLine = lines[i + 1]?.trim()
      if (nextLine?.includes('|') && nextLine.includes('-')) {
        inTable = true
        tableStart = i
      }
    }
    // Detect table end (empty line or non-table content)
    else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
      // The table's last row is the line before the terminating one.
      // The fallback is parenthesised so it applies to the optional length only;
      // unparenthesised, `a + b?.length || 0` evaluates as `(a + b?.length) || 0`.
      const lastRowLength = lines[i - 1]?.length ?? 0
      tables.push({
        start: this.getCharacterPosition(lines, tableStart),
        end: this.getCharacterPosition(lines, i - 1) + lastRowLength,
      })
      inTable = false
    }
  }

  // Handle table at end of content: it runs to the end of the text
  if (inTable && tableStart >= 0) {
    tables.push({
      start: this.getCharacterPosition(lines, tableStart),
      end: content.length,
    })
  }

  return tables
}
@@ -542,50 +560,56 @@ export class DocsChunker {
/**
 * Merge chunks that would split tables.
 *
 * Each chunk is located in `originalContent` (searching forward from the end of the
 * previously located chunk). A chunk that overlaps one or more table ranges is replaced
 * by the slice of `originalContent` covering the chunk plus every overlapped table, so
 * the table text is never cut. Chunks that overlap no table are kept unchanged.
 *
 * @param chunks - Chunk texts in document order.
 * @param tableBoundaries - Character ranges of tables within `originalContent`.
 * @param originalContent - The text the chunks were produced from.
 * @returns The adjusted chunks. When `tableBoundaries` is empty the input array is
 *   returned as-is; otherwise results of 50 characters or fewer are filtered out.
 */
private mergeTableChunks(
  chunks: string[],
  tableBoundaries: { start: number; end: number }[],
  originalContent: string
): string[] {
  if (tableBoundaries.length === 0) {
    return chunks
  }

  const mergedChunks: string[] = []
  let currentPosition = 0

  for (const chunk of chunks) {
    const chunkStart = originalContent.indexOf(chunk, currentPosition)

    // The chunk text may not occur verbatim after the cursor (e.g. if the chunker
    // altered it). Without a real position no table overlap can be computed, and using
    // -1 as an offset would produce bogus ranges, so keep the chunk untouched and leave
    // the search cursor where it is.
    if (chunkStart === -1) {
      mergedChunks.push(chunk)
      continue
    }

    const chunkEnd = chunkStart + chunk.length

    // Tables this chunk touches: chunk starts inside, ends inside, or fully spans the table
    const affectedTables = tableBoundaries.filter(
      (table) =>
        (chunkStart >= table.start && chunkStart <= table.end) ||
        (chunkEnd >= table.start && chunkEnd <= table.end) ||
        (chunkStart <= table.start && chunkEnd >= table.end)
    )

    if (affectedTables.length > 0) {
      // Create a chunk that includes the complete table(s)
      const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
      const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
      const completeChunk = originalContent.slice(minStart, maxEnd).trim()

      // Only add if we haven't already included this content
      if (!mergedChunks.some((existing) => existing.includes(completeChunk))) {
        mergedChunks.push(completeChunk)
      }
    } else {
      mergedChunks.push(chunk)
    }

    currentPosition = chunkEnd
  }

  return mergedChunks.filter((chunk) => chunk.length > 50) // Filter out tiny chunks
}
/**
@@ -593,10 +617,10 @@ export class DocsChunker {
*/
private enforceSizeLimit(chunks: string[]): string[] {
const finalChunks: string[] = []
for (const chunk of chunks) {
const tokens = this.estimateTokens(chunk)
if (tokens <= 300) {
// Chunk is within limit
finalChunks.push(chunk)
@@ -604,10 +628,10 @@ export class DocsChunker {
// Chunk is too large - split it
const lines = chunk.split('\n')
let currentChunk = ''
for (const line of lines) {
const testChunk = currentChunk ? `${currentChunk}\n${line}` : line
if (this.estimateTokens(testChunk) <= 300) {
currentChunk = testChunk
} else {
@@ -618,14 +642,14 @@ export class DocsChunker {
currentChunk = line
}
}
// Add final chunk if it has content
if (currentChunk.trim()) {
finalChunks.push(currentChunk.trim())
}
}
}
return finalChunks.filter(chunk => chunk.trim().length > 100)
return finalChunks.filter((chunk) => chunk.trim().length > 100)
}
}

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env bun
import path from 'path'
import { sql } from 'drizzle-orm'
import { DocsChunker } from '@/lib/documents/docs-chunker'
import { createLogger } from '@/lib/logs/console-logger'
import { db } from '@/db'
import { docsEmbeddings } from '@/db/schema'
import { sql } from 'drizzle-orm'
const logger = createLogger('ProcessDocsEmbeddings')
@@ -83,7 +83,7 @@ async function processDocsEmbeddings(options: ProcessingOptions = {}) {
for (let i = 0; i < chunks.length; i += batchSize) {
const batch = chunks.slice(i, i + batchSize)
try {
// Prepare batch data
const batchData = batch.map((chunk) => ({
@@ -100,11 +100,13 @@ async function processDocsEmbeddings(options: ProcessingOptions = {}) {
// Insert batch
await db.insert(docsEmbeddings).values(batchData)
processedChunks += batch.length
if (i % (batchSize * 5) === 0 || i + batchSize >= chunks.length) {
logger.info(` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`)
logger.info(
` 💾 Saved ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`
)
}
} catch (error) {
logger.error(`❌ Failed to save batch ${Math.floor(i / batchSize) + 1}:`, error)
@@ -119,7 +121,7 @@ async function processDocsEmbeddings(options: ProcessingOptions = {}) {
.then((result) => result[0]?.count || 0)
const duration = Date.now() - startTime
logger.info(`✅ Processing complete!`)
logger.info(`📊 Results:`)
logger.info(` • Total chunks processed: ${chunks.length}`)
@@ -127,16 +129,19 @@ async function processDocsEmbeddings(options: ProcessingOptions = {}) {
logger.info(` • Failed: ${failedChunks}`)
logger.info(` • Database total: ${savedCount}`)
logger.info(` • Duration: ${Math.round(duration / 1000)}s`)
// Summary by document
const documentStats = chunks.reduce((acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
}
acc[chunk.sourceDocument].chunks++
acc[chunk.sourceDocument].tokens += chunk.tokenCount
return acc
}, {} as Record<string, { chunks: number; tokens: number }>)
const documentStats = chunks.reduce(
(acc, chunk) => {
if (!acc[chunk.sourceDocument]) {
acc[chunk.sourceDocument] = { chunks: 0, tokens: 0 }
}
acc[chunk.sourceDocument].chunks++
acc[chunk.sourceDocument].tokens += chunk.tokenCount
return acc
},
{} as Record<string, { chunks: number; tokens: number }>
)
logger.info(`📋 Document breakdown:`)
Object.entries(documentStats)
@@ -158,7 +163,6 @@ async function processDocsEmbeddings(options: ProcessingOptions = {}) {
databaseCount: savedCount,
duration,
}
} catch (error) {
logger.error('💥 Fatal error during processing:', error)
return {
@@ -212,4 +216,4 @@ if (process.argv[1]?.includes('process-docs-embeddings')) {
})
}
export { processDocsEmbeddings }
export { processDocsEmbeddings }