mirror of
https://github.com/simstudioai/sim.git
synced 2026-01-08 22:48:14 -05:00
feat(docs): added vector search (#2583)
* feat(docs): added vector search * ack comments
This commit is contained in:
@@ -1,16 +1,126 @@
|
||||
import { createFromSource } from 'fumadocs-core/search/server'
|
||||
import { source } from '@/lib/source'
|
||||
import { sql } from 'drizzle-orm'
|
||||
import { type NextRequest, NextResponse } from 'next/server'
|
||||
import { db, docsEmbeddings } from '@/lib/db'
|
||||
import { generateSearchEmbedding } from '@/lib/embeddings'
|
||||
|
||||
export const revalidate = 3600 // Revalidate every hour
|
||||
export const runtime = 'nodejs'
|
||||
export const revalidate = 0
|
||||
|
||||
export const { GET } = createFromSource(source, {
|
||||
localeMap: {
|
||||
en: { language: 'english' },
|
||||
es: { language: 'spanish' },
|
||||
fr: { language: 'french' },
|
||||
de: { language: 'german' },
|
||||
// ja and zh are not supported by the stemmer library, so we'll skip language config for them
|
||||
ja: {},
|
||||
zh: {},
|
||||
},
|
||||
})
|
||||
/**
|
||||
* Hybrid search API endpoint
|
||||
* - English: Vector embeddings + keyword search
|
||||
* - Other languages: Keyword search only
|
||||
*/
|
||||
export async function GET(request: NextRequest) {
|
||||
try {
|
||||
const searchParams = request.nextUrl.searchParams
|
||||
const query = searchParams.get('query') || searchParams.get('q') || ''
|
||||
const locale = searchParams.get('locale') || 'en'
|
||||
const limit = Number.parseInt(searchParams.get('limit') || '10', 10)
|
||||
|
||||
if (!query || query.trim().length === 0) {
|
||||
return NextResponse.json([])
|
||||
}
|
||||
|
||||
const candidateLimit = limit * 3
|
||||
const similarityThreshold = 0.6
|
||||
|
||||
const localeMap: Record<string, string> = {
|
||||
en: 'english',
|
||||
es: 'spanish',
|
||||
fr: 'french',
|
||||
de: 'german',
|
||||
ja: 'simple', // PostgreSQL doesn't have Japanese support, use simple
|
||||
zh: 'simple', // PostgreSQL doesn't have Chinese support, use simple
|
||||
}
|
||||
const tsConfig = localeMap[locale] || 'simple'
|
||||
|
||||
const useVectorSearch = locale === 'en'
|
||||
let vectorResults: Array<{
|
||||
chunkId: string
|
||||
chunkText: string
|
||||
sourceDocument: string
|
||||
sourceLink: string
|
||||
headerText: string
|
||||
headerLevel: number
|
||||
similarity: number
|
||||
searchType: string
|
||||
}> = []
|
||||
|
||||
if (useVectorSearch) {
|
||||
const queryEmbedding = await generateSearchEmbedding(query)
|
||||
vectorResults = await db
|
||||
.select({
|
||||
chunkId: docsEmbeddings.chunkId,
|
||||
chunkText: docsEmbeddings.chunkText,
|
||||
sourceDocument: docsEmbeddings.sourceDocument,
|
||||
sourceLink: docsEmbeddings.sourceLink,
|
||||
headerText: docsEmbeddings.headerText,
|
||||
headerLevel: docsEmbeddings.headerLevel,
|
||||
similarity: sql<number>`1 - (${docsEmbeddings.embedding} <=> ${JSON.stringify(queryEmbedding)}::vector)`,
|
||||
searchType: sql<string>`'vector'`,
|
||||
})
|
||||
.from(docsEmbeddings)
|
||||
.where(
|
||||
sql`1 - (${docsEmbeddings.embedding} <=> ${JSON.stringify(queryEmbedding)}::vector) >= ${similarityThreshold}`
|
||||
)
|
||||
.orderBy(sql`${docsEmbeddings.embedding} <=> ${JSON.stringify(queryEmbedding)}::vector`)
|
||||
.limit(candidateLimit)
|
||||
}
|
||||
|
||||
const keywordResults = await db
|
||||
.select({
|
||||
chunkId: docsEmbeddings.chunkId,
|
||||
chunkText: docsEmbeddings.chunkText,
|
||||
sourceDocument: docsEmbeddings.sourceDocument,
|
||||
sourceLink: docsEmbeddings.sourceLink,
|
||||
headerText: docsEmbeddings.headerText,
|
||||
headerLevel: docsEmbeddings.headerLevel,
|
||||
similarity: sql<number>`ts_rank(${docsEmbeddings.chunkTextTsv}, plainto_tsquery(${tsConfig}, ${query}))`,
|
||||
searchType: sql<string>`'keyword'`,
|
||||
})
|
||||
.from(docsEmbeddings)
|
||||
.where(sql`${docsEmbeddings.chunkTextTsv} @@ plainto_tsquery(${tsConfig}, ${query})`)
|
||||
.orderBy(
|
||||
sql`ts_rank(${docsEmbeddings.chunkTextTsv}, plainto_tsquery(${tsConfig}, ${query})) DESC`
|
||||
)
|
||||
.limit(candidateLimit)
|
||||
|
||||
const seenIds = new Set<string>()
|
||||
const mergedResults = []
|
||||
|
||||
for (let i = 0; i < Math.max(vectorResults.length, keywordResults.length); i++) {
|
||||
if (i < vectorResults.length && !seenIds.has(vectorResults[i].chunkId)) {
|
||||
mergedResults.push(vectorResults[i])
|
||||
seenIds.add(vectorResults[i].chunkId)
|
||||
}
|
||||
if (i < keywordResults.length && !seenIds.has(keywordResults[i].chunkId)) {
|
||||
mergedResults.push(keywordResults[i])
|
||||
seenIds.add(keywordResults[i].chunkId)
|
||||
}
|
||||
}
|
||||
|
||||
const filteredResults = mergedResults.slice(0, limit)
|
||||
const searchResults = filteredResults.map((result) => {
|
||||
const title = result.headerText || result.sourceDocument.replace('.mdx', '')
|
||||
const pathParts = result.sourceDocument
|
||||
.replace('.mdx', '')
|
||||
.split('/')
|
||||
.map((part) => part.charAt(0).toUpperCase() + part.slice(1))
|
||||
|
||||
return {
|
||||
id: result.chunkId,
|
||||
type: 'page' as const,
|
||||
url: result.sourceLink,
|
||||
content: title,
|
||||
breadcrumbs: pathParts,
|
||||
}
|
||||
})
|
||||
|
||||
return NextResponse.json(searchResults)
|
||||
} catch (error) {
|
||||
console.error('Semantic search error:', error)
|
||||
|
||||
return NextResponse.json([])
|
||||
}
|
||||
}
|
||||
|
||||
4
apps/docs/lib/db.ts
Normal file
4
apps/docs/lib/db.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
import { db } from '@sim/db'
|
||||
import { docsEmbeddings } from '@sim/db/schema'
|
||||
|
||||
export { db, docsEmbeddings }
|
||||
40
apps/docs/lib/embeddings.ts
Normal file
40
apps/docs/lib/embeddings.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* Generate embeddings for search queries using OpenAI API
|
||||
*/
|
||||
export async function generateSearchEmbedding(query: string): Promise<number[]> {
|
||||
const apiKey = process.env.OPENAI_API_KEY
|
||||
|
||||
if (!apiKey) {
|
||||
throw new Error('OPENAI_API_KEY environment variable is required')
|
||||
}
|
||||
|
||||
const response = await fetch('https://api.openai.com/v1/embeddings', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
input: query,
|
||||
model: 'text-embedding-3-small',
|
||||
encoding_format: 'float',
|
||||
}),
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text()
|
||||
throw new Error(`OpenAI API failed: ${response.status} ${response.statusText} - ${errorText}`)
|
||||
}
|
||||
|
||||
const data = await response.json()
|
||||
|
||||
if (!data?.data || !Array.isArray(data.data) || data.data.length === 0) {
|
||||
throw new Error('OpenAI API returned invalid response structure: missing or empty data array')
|
||||
}
|
||||
|
||||
if (!data.data[0]?.embedding || !Array.isArray(data.data[0].embedding)) {
|
||||
throw new Error('OpenAI API returned invalid response structure: missing or invalid embedding')
|
||||
}
|
||||
|
||||
return data.data[0].embedding
|
||||
}
|
||||
@@ -11,16 +11,19 @@
|
||||
"type-check": "tsc --noEmit"
|
||||
},
|
||||
"dependencies": {
|
||||
"@sim/db": "workspace:*",
|
||||
"@tabler/icons-react": "^3.31.0",
|
||||
"@vercel/og": "^0.6.5",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"drizzle-orm": "^0.44.5",
|
||||
"fumadocs-core": "16.2.3",
|
||||
"fumadocs-mdx": "14.1.0",
|
||||
"fumadocs-ui": "16.2.3",
|
||||
"lucide-react": "^0.511.0",
|
||||
"next": "16.1.0-canary.21",
|
||||
"next-themes": "^0.4.6",
|
||||
"postgres": "^3.4.5",
|
||||
"react": "19.2.1",
|
||||
"react-dom": "19.2.1",
|
||||
"tailwind-merge": "^3.0.2"
|
||||
|
||||
3
bun.lock
3
bun.lock
@@ -44,16 +44,19 @@
|
||||
"name": "docs",
|
||||
"version": "0.0.0",
|
||||
"dependencies": {
|
||||
"@sim/db": "workspace:*",
|
||||
"@tabler/icons-react": "^3.31.0",
|
||||
"@vercel/og": "^0.6.5",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"drizzle-orm": "^0.44.5",
|
||||
"fumadocs-core": "16.2.3",
|
||||
"fumadocs-mdx": "14.1.0",
|
||||
"fumadocs-ui": "16.2.3",
|
||||
"lucide-react": "^0.511.0",
|
||||
"next": "16.1.0-canary.21",
|
||||
"next-themes": "^0.4.6",
|
||||
"postgres": "^3.4.5",
|
||||
"react": "19.2.1",
|
||||
"react-dom": "19.2.1",
|
||||
"tailwind-merge": "^3.0.2",
|
||||
|
||||
Reference in New Issue
Block a user