add vector search in store

This commit is contained in:
Swifty
2025-09-02 22:48:44 +02:00
parent 7db3db6b2e
commit df50c7352d
7 changed files with 736 additions and 183 deletions

View File

@@ -1,4 +1,5 @@
import logging
import os
from datetime import datetime, timezone
from typing import List, Dict, Any, Tuple
import re
@@ -26,11 +27,29 @@ from backend.data.notifications import (
NotificationEventModel,
)
from backend.notifications.notifications import queue_notification_async
from backend.server.v2.store.embeddings import SearchFieldType, StoreAgentSearchService
from backend.server.v2.store.embeddings import (
SubmissionStatus as SearchSubmissionStatus,
)
from backend.server.v2.store.embeddings import create_embedding
from backend.util.settings import Settings
logger = logging.getLogger(__name__)
settings = Settings()
# Initialize the search service with database URL
search_service: StoreAgentSearchService | None = None
def get_search_service() -> StoreAgentSearchService:
    """Return the process-wide search service, creating it on first use.

    The instance is cached in the module-level ``search_service`` global so
    the underlying async engine and connection pool are built only once per
    process.
    """
    global search_service
    if search_service is not None:
        return search_service
    # Same DATABASE_URL environment variable that Prisma reads; the fallback
    # is a bare local Postgres for development.
    database_url = os.getenv("DATABASE_URL", "postgresql://localhost:5432")
    search_service = StoreAgentSearchService(database_url)
    return search_service
# Constants for default admin values
DEFAULT_ADMIN_NAME = "AutoGPT Admin"
@@ -56,163 +75,76 @@ def sanitize_query(query: str | None) -> str | None:
)
class AdvancedSearchEngine:
"""Advanced search engine with multiple search algorithms"""
@staticmethod
def tokenize(text: str) -> List[str]:
"""Tokenize text into searchable words"""
# Convert to lowercase and split on word boundaries
text = text.lower()
# Remove special characters but keep spaces, numbers, and letters
text = re.sub(r'[^\w\s-]', ' ', text)
# Split and filter empty tokens
tokens = [token.strip() for token in text.split() if token.strip()]
return tokens
@staticmethod
def calculate_fuzzy_score(query: str, target: str, threshold: float = 0.6) -> float:
"""Calculate fuzzy matching score using sequence matching"""
query_lower = query.lower()
target_lower = target.lower()
# Direct substring match gets highest score
if query_lower in target_lower:
return 1.0
# Check word-level matches
query_tokens = AdvancedSearchEngine.tokenize(query)
target_tokens = AdvancedSearchEngine.tokenize(target)
# Calculate token overlap score
matching_tokens = sum(1 for qt in query_tokens if any(qt in tt for tt in target_tokens))
if query_tokens:
token_score = matching_tokens / len(query_tokens)
else:
token_score = 0
# Use SequenceMatcher for fuzzy matching
sequence_score = SequenceMatcher(None, query_lower, target_lower).ratio()
# Combine scores with weights
combined_score = (token_score * 0.6) + (sequence_score * 0.4)
return combined_score if combined_score >= threshold else 0
@staticmethod
def calculate_relevance_score(
agent: prisma.models.StoreAgent,
query: str,
search_fields: Dict[str, float]
) -> float:
"""Calculate relevance score for an agent based on search query"""
total_score = 0.0
query_lower = query.lower()
for field_name, weight in search_fields.items():
field_value = getattr(agent, field_name, None)
if field_value is None:
continue
if isinstance(field_value, list):
# For categories and other list fields
field_text = ' '.join(str(v) for v in field_value)
async def search_store_agents(
search_query: str,
) -> backend.server.v2.store.model.StoreAgentsResponse:
"""
Search for store agents using embeddings with SQLAlchemy
"""
query_embedding = create_embedding(search_query)
# Use SQLAlchemy service for search
service = get_search_service()
results = await service.search_by_embedding(query_embedding, limit=30)
# Convert raw results to StoreAgent models
agents = []
for row in results:
try:
# Handle agent_image - it could be a list or a single string
agent_image = row.get("agent_image", "")
if isinstance(agent_image, list) and agent_image:
agent_image = str(agent_image[0])
elif not agent_image:
agent_image = ""
else:
field_text = str(field_value)
# Calculate field score
field_score = AdvancedSearchEngine.calculate_fuzzy_score(query, field_text)
# Apply weight
total_score += field_score * weight
# Boost score for exact matches in important fields
if query_lower in agent.agent_name.lower():
total_score *= 1.5
# Boost for high-rated agents
if agent.rating > 4.0:
total_score *= 1.1
# Boost for popular agents
if agent.runs > 100:
total_score *= 1.05
return total_score
@staticmethod
def generate_search_variants(query: str) -> List[str]:
"""Generate search variants for typo tolerance"""
variants = [query]
query_lower = query.lower()
# Add common typo patterns
# Handle missing spaces (e.g., "chatgpt" -> "chat gpt")
for i in range(1, len(query_lower)):
variant = query_lower[:i] + ' ' + query_lower[i:]
variants.append(variant)
# Handle extra spaces (e.g., "chat gpt" -> "chatgpt")
variants.append(query_lower.replace(' ', ''))
# Handle common letter swaps
for i in range(len(query_lower) - 1):
chars = list(query_lower)
chars[i], chars[i + 1] = chars[i + 1], chars[i]
variants.append(''.join(chars))
return list(set(variants))[:10] # Limit to 10 variants
@staticmethod
def build_advanced_search_query(
query: str,
use_fuzzy: bool = True
) -> List[Dict[str, Any]]:
"""Build advanced search conditions for Prisma"""
conditions = []
# Generate search variants for typo tolerance
if use_fuzzy:
search_variants = AdvancedSearchEngine.generate_search_variants(query)
else:
search_variants = [query]
for variant in search_variants:
sanitized = sanitize_query(variant)
if not sanitized:
continue
# Tokenize for better matching
tokens = AdvancedSearchEngine.tokenize(sanitized)
# Build conditions for each token
for token in tokens:
# Search in agent name with highest priority
conditions.append({
"agent_name": {"contains": token, "mode": "insensitive"}
})
# Search in description
conditions.append({
"description": {"contains": token, "mode": "insensitive"}
})
# Search in sub_heading
conditions.append({
"sub_heading": {"contains": token, "mode": "insensitive"}
})
# Search in categories
conditions.append({
"categories": {"has": token}
})
# Search in creator username
conditions.append({
"creator_username": {"contains": token, "mode": "insensitive"}
})
return conditions
agent_image = str(agent_image)
agent = backend.server.v2.store.model.StoreAgent(
slug=row.get("slug", ""),
agent_name=row.get("agent_name", ""),
agent_image=agent_image,
creator=row.get("creator_username", "Needs Profile"),
creator_avatar=row.get("creator_avatar", ""),
sub_heading=row.get("sub_heading", ""),
description=row.get("description", ""),
runs=row.get("runs", 0),
rating=(
float(row.get("rating", 0.0))
if row.get("rating") is not None
else 0.0
),
)
agents.append(agent)
except Exception as e:
logger.error(f"Error creating StoreAgent from search result: {e}")
continue
return backend.server.v2.store.model.StoreAgentsResponse(
agents=agents,
pagination=backend.server.v2.store.model.Pagination(
current_page=1,
total_items=len(agents),
total_pages=1,
page_size=20,
),
)
async def search_agents(
    search_query: str,
    featured: bool | None = None,
    creators: list[str] | None = None,
    category: str | None = None,
    page: int = 1,
    page_size: int = 20,
) -> backend.server.v2.store.model.StoreAgentsResponse:
    """
    Search for store agents using embeddings with optional filters

    Args:
        search_query: Free-text query to embed and match against the index.
        featured: Accepted for interface compatibility; currently ignored.
        creators: Accepted for interface compatibility; currently ignored.
        category: Accepted for interface compatibility; currently ignored.
        page: Accepted for interface compatibility; currently ignored.
        page_size: Accepted for interface compatibility; currently ignored.

    Returns:
        StoreAgentsResponse produced by the embedding search. NOTE(review):
        pagination in the response reflects search_store_agents' fixed
        single-page behavior, not the ``page``/``page_size`` arguments.
    """
    # For now, just use the search_store_agents function
    # In the future, we can add filtering logic here
    return await search_store_agents(search_query)
async def get_store_agents(
@@ -226,19 +158,19 @@ async def get_store_agents(
) -> backend.server.v2.store.model.StoreAgentsResponse:
"""
Get PUBLIC store agents from the StoreAgent view with advanced search capabilities
Features:
- Fuzzy matching with typo tolerance
- Token-based search across multiple fields
- Relevance scoring and ranking
- Category and creator filtering
- Popularity boosting for high-rated/popular agents
Note: For production deployments with Supabase, you can enable PostgreSQL extensions
for even better search performance:
- pg_trgm: For trigram-based fuzzy matching (available in Supabase)
- Full-text search: Built into PostgreSQL
These can be enabled via Supabase dashboard or raw SQL queries if needed.
"""
logger.debug(
@@ -259,23 +191,23 @@ async def get_store_agents(
search_conditions = AdvancedSearchEngine.build_advanced_search_query(
search_query, use_fuzzy=True
)
if search_conditions:
where_clause["OR"] = search_conditions
# Define search relevance fields and their weights
search_fields = {
"agent_name": 3.0, # Highest weight for name matches
"description": 2.0, # High weight for description
"sub_heading": 1.5, # Medium weight for sub-heading
"categories": 1.0, # Lower weight for categories
"creator_username": 0.5 # Lower weight for creator name
"agent_name": 3.0, # Highest weight for name matches
"description": 2.0, # High weight for description
"sub_heading": 1.5, # Medium weight for sub-heading
"categories": 1.0, # Lower weight for categories
"creator_username": 0.5, # Lower weight for creator name
}
try:
# Fetch more results initially for relevance scoring if searching
fetch_limit = page_size * 3 if search_query else page_size
# Build order_by clause
order_by = []
if sorted_by == "rating":
@@ -287,7 +219,7 @@ async def get_store_agents(
# Default to rating for better UX when no sort specified
elif not sorted_by and not search_query:
order_by.append({"rating": "desc"})
# Fetch agents from database
agents = await prisma.models.StoreAgent.prisma().find_many(
where=prisma.types.StoreAgentWhereInput(**where_clause),
@@ -306,21 +238,21 @@ async def get_store_agents(
)
if score > 0: # Only include agents with positive scores
scored_agents.append((score, agent))
# Sort by relevance score (highest first)
scored_agents.sort(key=lambda x: x[0], reverse=True)
# Apply secondary sorting if specified
if sorted_by == "rating":
scored_agents.sort(key=lambda x: (x[0], x[1].rating), reverse=True)
elif sorted_by == "runs":
scored_agents.sort(key=lambda x: (x[0], x[1].runs), reverse=True)
# Extract agents for the requested page
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
page_agents = [agent for _, agent in scored_agents[start_idx:end_idx]]
# Update total count for pagination
total = len(scored_agents)
else:
@@ -329,7 +261,7 @@ async def get_store_agents(
total = await prisma.models.StoreAgent.prisma().count(
where=prisma.types.StoreAgentWhereInput(**where_clause)
)
total_pages = (total + page_size - 1) // page_size
# Convert to response models
@@ -1585,6 +1517,40 @@ async def review_store_submission(
f"Failed to update store listing version {store_listing_version_id}"
)
# Create embeddings if approved
if is_approved and submission.StoreListing:
service = get_search_service()
# Create embeddings for the approved listing
fields_to_embed = [
("name", submission.name, SearchFieldType.NAME),
("description", submission.description, SearchFieldType.DESCRIPTION),
]
if submission.subHeading:
fields_to_embed.append(
("subHeading", submission.subHeading, SearchFieldType.SUBHEADING)
)
if submission.categories:
categories_text = ", ".join(submission.categories)
fields_to_embed.append(
("categories", categories_text, SearchFieldType.CATEGORIES)
)
for field_name, field_value, field_type in fields_to_embed:
embedding = create_embedding(field_value)
await service.upsert_search_record(
store_listing_version_id=store_listing_version_id,
store_listing_id=submission.StoreListing.id,
field_name=field_name,
field_value=field_value,
embedding=embedding,
field_type=field_type,
submission_status=SearchSubmissionStatus.APPROVED,
is_available=submission.isAvailable,
)
# Send email notification to the agent creator
if store_listing_version.AgentGraph and store_listing_version.AgentGraph.User:
agent_creator = store_listing_version.AgentGraph.User

View File

@@ -0,0 +1,390 @@
"""Store search functionality with embeddings and pgvector using SQLAlchemy"""
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence
from uuid import uuid4
from openai import OpenAI
from pgvector.sqlalchemy import Vector
from sqlalchemy import Boolean, Column, DateTime
from sqlalchemy import Enum as SQLEnum
from sqlalchemy import Index, String, UniqueConstraint, and_, select, text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.ext.declarative import declarative_base
# OpenAI client for embeddings
client = OpenAI()
def create_embedding(text: str) -> list[float]:
    """Embed ``text`` with OpenAI's ``text-embedding-3-small`` model.

    Returns the raw embedding vector (1536 floats) suitable for storage in a
    pgvector ``vector(1536)`` column.
    """
    api_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return api_response.data[0].embedding
# SQLAlchemy models
Base = declarative_base()
class SubmissionStatus(str, Enum):
    """Review state of a store listing version.

    Values mirror the database-level ``SubmissionStatus`` enum so they can be
    written/compared directly in SQL.
    """

    PENDING = "PENDING"
    APPROVED = "APPROVED"
    REJECTED = "REJECTED"
class SearchFieldType(str, Enum):
    """Which listing field a search-index row's embedding was generated from.

    Values mirror the database-level ``SearchFieldType`` enum.
    """

    NAME = "NAME"
    DESCRIPTION = "DESCRIPTION"
    CATEGORIES = "CATEGORIES"
    SUBHEADING = "SUBHEADING"
class StoreAgentSearch(Base):
    """One embedded search field for a store listing version.

    Each row holds the text of a single field (name, description, ...) plus
    its embedding; a listing version therefore owns up to one row per field
    name (enforced by the unique constraint below).
    """

    __tablename__ = "StoreAgentSearch"

    id = Column(String, primary_key=True, default=lambda: str(uuid4()))
    createdAt = Column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    updatedAt = Column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )
    # Relations (foreign keys exist in DB but not modeled here)
    storeListingVersionId = Column(String, nullable=False)
    storeListingId = Column(String, nullable=False)
    # Searchable fields
    fieldName = Column(String, nullable=False)
    fieldValue = Column(String, nullable=False)
    # Vector embedding for similarity search
    # text-embedding-3-small produces 1536-dimensional embeddings
    # NOTE(review): declared NOT NULL here, but the SQL migration creates the
    # column as nullable — confirm which is intended.
    embedding = Column(Vector(1536), nullable=False)
    # Metadata
    fieldType = Column(
        SQLEnum(SearchFieldType, name="SearchFieldType", schema="platform"),
        nullable=False,
    )
    submissionStatus = Column(
        SQLEnum(SubmissionStatus, name="SubmissionStatus", schema="platform"),
        nullable=False,
    )
    isAvailable = Column(Boolean, nullable=False)

    # Constraints and schema
    __table_args__ = (
        # One row per (listing version, field) pair — upserts key on this.
        UniqueConstraint(
            "storeListingVersionId",
            "fieldName",
            name="_store_listing_version_field_unique",
        ),
        Index("ix_store_agent_search_listing_version", "storeListingVersionId"),
        Index("ix_store_agent_search_listing", "storeListingId"),
        Index("ix_store_agent_search_field_name", "fieldName"),
        Index("ix_store_agent_search_field_type", "fieldType"),
        Index(
            "ix_store_agent_search_status_available", "submissionStatus", "isAvailable"
        ),
        {"schema": "platform"},  # Specify the schema
    )
class StoreAgentSearchService:
    """Service class for Store Agent Search operations using SQLAlchemy.

    Owns an async engine + session factory over the same Postgres database
    Prisma uses, and provides CRUD plus pgvector similarity search for
    ``StoreAgentSearch`` rows.
    """

    def __init__(self, database_url: str):
        """Initialize the search service with async database connection.

        ``database_url`` is the Prisma-style URL; ``schema`` and
        ``connect_timeout`` query parameters are stripped and re-applied via
        asyncpg's ``connect_args`` since the asyncpg driver does not accept
        them in the URL.
        """
        # Parse the URL to handle schema and other params separately
        from urllib.parse import parse_qs, urlparse, urlunparse

        parsed = urlparse(database_url)
        query_params = parse_qs(parsed.query)
        # Extract schema if present
        schema = query_params.pop("schema", ["public"])[0]
        # Remove connect_timeout from query params (will be handled in connect_args)
        connect_timeout = query_params.pop("connect_timeout", [None])[0]
        # Rebuild query string without schema and connect_timeout
        # (parse_qs yields lists; only the first value of each param is kept)
        new_query = "&".join([f"{k}={v[0]}" for k, v in query_params.items()])
        # Rebuild URL without schema and connect_timeout parameters
        clean_url = urlunparse(
            (
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                new_query,
                parsed.fragment,
            )
        )
        # Convert to async URL for asyncpg
        if clean_url.startswith("postgresql://"):
            clean_url = clean_url.replace("postgresql://", "postgresql+asyncpg://")
        elif clean_url.startswith("postgres://"):
            clean_url = clean_url.replace("postgres://", "postgresql+asyncpg://")
        # Build connect_args: the schema becomes the connection's search_path
        connect_args: dict[str, Any] = {"server_settings": {"search_path": schema}}
        # Add timeout if present (asyncpg uses 'timeout' not 'connect_timeout')
        if connect_timeout:
            connect_args["timeout"] = float(connect_timeout)
        # Create engine with schema in connect_args
        self.engine = create_async_engine(
            clean_url,
            echo=False,  # Set to True for debugging SQL queries
            future=True,
            connect_args=connect_args,
        )
        self.async_session = async_sessionmaker(
            self.engine, class_=AsyncSession, expire_on_commit=False
        )

    async def create_search_record(
        self,
        store_listing_version_id: str,
        store_listing_id: str,
        field_name: str,
        field_value: str,
        embedding: list[float],
        field_type: SearchFieldType,
        submission_status: SubmissionStatus,
        is_available: bool,
    ) -> StoreAgentSearch:
        """Create a new search record with embedding.

        Raises on unique-constraint violation if a record already exists for
        (store_listing_version_id, field_name) — use upsert_search_record to
        overwrite.
        """
        async with self.async_session() as session:
            search_record = StoreAgentSearch(
                storeListingVersionId=store_listing_version_id,
                storeListingId=store_listing_id,
                fieldName=field_name,
                fieldValue=field_value,
                embedding=embedding,
                fieldType=field_type,
                submissionStatus=submission_status,
                isAvailable=is_available,
            )
            session.add(search_record)
            await session.commit()
            # Refresh so DB-generated defaults (id, timestamps) are populated
            await session.refresh(search_record)
            return search_record

    async def batch_create_search_records(
        self, records: list[dict]
    ) -> list[StoreAgentSearch]:
        """Batch create multiple search records in a single transaction.

        Each dict must carry the camelCase column keys used below; only
        ``fieldType`` is optional (defaults to NAME).
        """
        async with self.async_session() as session:
            search_records = []
            for record in records:
                search_record = StoreAgentSearch(
                    storeListingVersionId=record["storeListingVersionId"],
                    storeListingId=record["storeListingId"],
                    fieldName=record["fieldName"],
                    fieldValue=record["fieldValue"],
                    embedding=record["embedding"],
                    fieldType=record.get("fieldType", SearchFieldType.NAME),
                    submissionStatus=record["submissionStatus"],
                    isAvailable=record["isAvailable"],
                )
                session.add(search_record)
                search_records.append(search_record)
            await session.commit()
            return search_records

    async def search_by_embedding(
        self, query_embedding: List[float], limit: int = 30
    ) -> List[Dict[str, Any]]:
        """
        Search for store agents using vector similarity

        Returns the best matching store listings based on embedding similarity.
        Scoring: for each listing, the minimum cosine distance (`<=>`) across
        all of its indexed fields; results are joined against the StoreAgent
        view and ordered best-match first (smallest distance).
        """
        async with self.async_session() as session:
            # For vector similarity search, we still need to use raw SQL
            # because SQLAlchemy doesn't yet fully support pgvector operators
            # However, we can use SQLAlchemy's text() for safer query construction
            query = text(
                """
                WITH similarity_scores AS (
                    SELECT
                        sas."storeListingId",
                        MIN(sas.embedding <=> CAST(:embedding AS vector)) AS similarity_score
                    FROM platform."StoreAgentSearch" sas
                    WHERE
                        sas."submissionStatus" = 'APPROVED'
                        AND sas."isAvailable" = true
                    GROUP BY sas."storeListingId"
                    ORDER BY similarity_score
                    LIMIT :limit
                )
                SELECT
                    sa.listing_id as id,
                    sa.slug,
                    sa.agent_name,
                    sa.agent_image,
                    sa.description,
                    sa.sub_heading,
                    sa.featured,
                    sa.runs,
                    sa.rating,
                    sa.creator_username,
                    sa.creator_avatar,
                    ss.similarity_score
                FROM similarity_scores ss
                INNER JOIN platform."StoreAgent" sa
                    ON sa.listing_id = ss."storeListingId"
                ORDER BY ss.similarity_score;
                """
            )
            # Format embedding as PostgreSQL array (pgvector's text input form)
            embedding_str = "[" + ",".join(map(str, query_embedding)) + "]"
            result = await session.execute(
                query, {"embedding": embedding_str, "limit": limit}
            )
            rows = result.fetchall()
            # Convert rows to dictionaries
            return [dict(row._mapping) for row in rows]

    async def get_search_records(
        self,
        store_listing_version_id: Optional[str] = None,
        field_name: Optional[str] = None,
        is_available: Optional[bool] = None,
    ) -> Sequence[StoreAgentSearch]:
        """
        Get search records using SQLAlchemy ORM.

        All filters are optional and ANDed together; with no filters, every
        record is returned.
        """
        async with self.async_session() as session:
            stmt = select(StoreAgentSearch)
            # Build filters
            filters = []
            if store_listing_version_id:
                filters.append(
                    StoreAgentSearch.storeListingVersionId == store_listing_version_id
                )
            if field_name:
                filters.append(StoreAgentSearch.fieldName == field_name)
            if is_available is not None:
                filters.append(StoreAgentSearch.isAvailable == is_available)
            if filters:
                stmt = stmt.where(and_(*filters))
            result = await session.execute(stmt)
            return result.scalars().all()

    async def update_search_embeddings(
        self, store_listing_version_id: str, updates: Dict[str, List[float]]
    ) -> None:
        """Update embeddings for existing search records.

        ``updates`` maps field name -> new embedding vector. Rows that do not
        exist are silently skipped (UPDATE matches zero rows).
        """
        async with self.async_session() as session:
            for field_name, embedding in updates.items():
                # For vector updates, we still need raw SQL due to pgvector
                # Use $ parameters for asyncpg
                query = text(
                    """
                    UPDATE platform."StoreAgentSearch"
                    SET embedding = CAST(:embedding AS vector),
                        "updatedAt" = CAST(:updated_at AS TIMESTAMPTZ)
                    WHERE "storeListingVersionId" = :version_id
                    AND "fieldName" = :field_name
                    """
                )
                embedding_str = "[" + ",".join(map(str, embedding)) + "]"
                await session.execute(
                    query,
                    {
                        "embedding": embedding_str,
                        "updated_at": datetime.now(timezone.utc),
                        "version_id": store_listing_version_id,
                        "field_name": field_name,
                    },
                )
            await session.commit()

    async def delete_search_records(self, store_listing_version_id: str) -> None:
        """Delete all search records for a store listing version using SQLAlchemy ORM"""
        async with self.async_session() as session:
            # Use SQLAlchemy ORM for deletion
            stmt = select(StoreAgentSearch).where(
                StoreAgentSearch.storeListingVersionId == store_listing_version_id
            )
            result = await session.execute(stmt)
            records = result.scalars().all()
            for record in records:
                await session.delete(record)
            await session.commit()

    async def upsert_search_record(
        self,
        store_listing_version_id: str,
        store_listing_id: str,
        field_name: str,
        field_value: str,
        embedding: List[float],
        field_type: SearchFieldType,
        submission_status: SubmissionStatus,
        is_available: bool,
    ) -> StoreAgentSearch:
        """Upsert a search record (update if exists, create if not).

        Keyed on the (storeListingVersionId, fieldName) unique pair. Note the
        check-then-write is two statements, not an atomic ON CONFLICT upsert.
        """
        async with self.async_session() as session:
            # Check if record exists
            stmt = select(StoreAgentSearch).where(
                and_(
                    StoreAgentSearch.storeListingVersionId == store_listing_version_id,
                    StoreAgentSearch.fieldName == field_name,
                )
            )
            result = await session.execute(stmt)
            existing_record = result.scalar_one_or_none()
            if existing_record:
                # Update existing record
                existing_record.fieldValue = field_value  # type: ignore[attr-defined]
                existing_record.embedding = embedding  # type: ignore[attr-defined]
                existing_record.fieldType = field_type  # type: ignore[attr-defined]
                existing_record.submissionStatus = submission_status  # type: ignore[attr-defined]
                existing_record.isAvailable = is_available  # type: ignore[attr-defined]
                existing_record.updatedAt = datetime.now(timezone.utc)  # type: ignore[attr-defined]
                await session.commit()
                await session.refresh(existing_record)
                return existing_record
            else:
                # Create new record
                return await self.create_search_record(
                    store_listing_version_id=store_listing_version_id,
                    store_listing_id=store_listing_id,
                    field_name=field_name,
                    field_value=field_value,
                    embedding=embedding,
                    field_type=field_type,
                    submission_status=submission_status,
                    is_available=is_available,
                )

    async def close(self):
        """Close the database connection (disposes the engine's pool)."""
        await self.engine.dispose()

View File

@@ -154,15 +154,26 @@ async def get_agents(
)
try:
agents = await backend.server.v2.store.db.get_store_agents(
featured=featured,
creators=[creator] if creator else None,
sorted_by=sorted_by,
search_query=search_query,
category=category,
page=page,
page_size=page_size,
)
# Use vector similarity search if we have a search query, otherwise use traditional search
if search_query and search_query.strip():
agents = await backend.server.v2.store.db.search_agents(
search_query=search_query,
featured=featured,
creators=[creator] if creator else None,
category=category,
page=page,
page_size=page_size,
)
else:
agents = await backend.server.v2.store.db.get_store_agents(
featured=featured,
creators=[creator] if creator else None,
sorted_by=sorted_by,
search_query=search_query,
category=category,
page=page,
page_size=page_size,
)
return agents
except Exception as e:
logger.exception("Failed to retrieve store agents: %s", e)

View File

@@ -0,0 +1,49 @@
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;

-- CreateEnum
CREATE TYPE "SearchFieldType" AS ENUM ('NAME', 'DESCRIPTION', 'CATEGORIES', 'SUBHEADING');

-- CreateTable
-- NOTE(review): "embedding" is created nullable here, but the application's
-- SQLAlchemy model declares it NOT NULL — confirm which is intended.
-- NOTE(review): "submissionStatus" relies on the "SubmissionStatus" enum
-- already existing from an earlier migration.
CREATE TABLE "StoreAgentSearch" (
    "id" TEXT NOT NULL,
    "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "updatedAt" TIMESTAMP(3) NOT NULL,
    "storeListingVersionId" TEXT NOT NULL,
    "storeListingId" TEXT NOT NULL,
    "fieldName" TEXT NOT NULL,
    "fieldValue" TEXT NOT NULL,
    "embedding" vector(1536),
    "fieldType" "SearchFieldType" NOT NULL,
    "submissionStatus" "SubmissionStatus" NOT NULL,
    "isAvailable" BOOLEAN NOT NULL,
    CONSTRAINT "StoreAgentSearch_pkey" PRIMARY KEY ("id")
);

-- CreateIndex
CREATE INDEX "StoreAgentSearch_storeListingVersionId_idx" ON "StoreAgentSearch"("storeListingVersionId");
-- CreateIndex
CREATE INDEX "StoreAgentSearch_storeListingId_idx" ON "StoreAgentSearch"("storeListingId");
-- CreateIndex
CREATE INDEX "StoreAgentSearch_fieldName_idx" ON "StoreAgentSearch"("fieldName");
-- CreateIndex
CREATE INDEX "StoreAgentSearch_fieldType_idx" ON "StoreAgentSearch"("fieldType");
-- CreateIndex
CREATE INDEX "StoreAgentSearch_submissionStatus_isAvailable_idx" ON "StoreAgentSearch"("submissionStatus", "isAvailable");
-- CreateIndex
-- One search row per (listing version, field) pair; upserts key on this.
CREATE UNIQUE INDEX "StoreAgentSearch_storeListingVersionId_fieldName_key" ON "StoreAgentSearch"("storeListingVersionId", "fieldName");

-- Create HNSW index for vector similarity search (pgvector)
-- vector_cosine_ops matches the application's use of the <=> operator.
CREATE INDEX "StoreAgentSearch_embedding_idx" ON "StoreAgentSearch" USING hnsw (embedding vector_cosine_ops);

-- AddForeignKey
ALTER TABLE "StoreAgentSearch" ADD CONSTRAINT "StoreAgentSearch_storeListingVersionId_fkey" FOREIGN KEY ("storeListingVersionId") REFERENCES "StoreListingVersion"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey
ALTER TABLE "StoreAgentSearch" ADD CONSTRAINT "StoreAgentSearch_storeListingId_fkey" FOREIGN KEY ("storeListingId") REFERENCES "StoreListing"("id") ON DELETE CASCADE ON UPDATE CASCADE;

View File

@@ -311,6 +311,73 @@ files = [
{file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"},
]
[[package]]
name = "asyncpg"
version = "0.30.0"
description = "An asyncio PostgreSQL driver"
optional = false
python-versions = ">=3.8.0"
groups = ["main"]
files = [
{file = "asyncpg-0.30.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e"},
{file = "asyncpg-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0"},
{file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f"},
{file = "asyncpg-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af"},
{file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75"},
{file = "asyncpg-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f"},
{file = "asyncpg-0.30.0-cp310-cp310-win32.whl", hash = "sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf"},
{file = "asyncpg-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50"},
{file = "asyncpg-0.30.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a"},
{file = "asyncpg-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed"},
{file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a"},
{file = "asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956"},
{file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056"},
{file = "asyncpg-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454"},
{file = "asyncpg-0.30.0-cp311-cp311-win32.whl", hash = "sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d"},
{file = "asyncpg-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f"},
{file = "asyncpg-0.30.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e"},
{file = "asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a"},
{file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3"},
{file = "asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737"},
{file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a"},
{file = "asyncpg-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af"},
{file = "asyncpg-0.30.0-cp312-cp312-win32.whl", hash = "sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e"},
{file = "asyncpg-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305"},
{file = "asyncpg-0.30.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70"},
{file = "asyncpg-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3"},
{file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33"},
{file = "asyncpg-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4"},
{file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4"},
{file = "asyncpg-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba"},
{file = "asyncpg-0.30.0-cp313-cp313-win32.whl", hash = "sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590"},
{file = "asyncpg-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e"},
{file = "asyncpg-0.30.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d"},
{file = "asyncpg-0.30.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168"},
{file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb"},
{file = "asyncpg-0.30.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f"},
{file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38"},
{file = "asyncpg-0.30.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"},
{file = "asyncpg-0.30.0-cp38-cp38-win32.whl", hash = "sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4"},
{file = "asyncpg-0.30.0-cp38-cp38-win_amd64.whl", hash = "sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b"},
{file = "asyncpg-0.30.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad"},
{file = "asyncpg-0.30.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff"},
{file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708"},
{file = "asyncpg-0.30.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144"},
{file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb"},
{file = "asyncpg-0.30.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547"},
{file = "asyncpg-0.30.0-cp39-cp39-win32.whl", hash = "sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a"},
{file = "asyncpg-0.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773"},
{file = "asyncpg-0.30.0.tar.gz", hash = "sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851"},
]
[package.dependencies]
async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.11.0\""}
[package.extras]
docs = ["Sphinx (>=8.1.3,<8.2.0)", "sphinx-rtd-theme (>=1.2.2)"]
gssauth = ["gssapi ; platform_system != \"Windows\"", "sspilib ; platform_system == \"Windows\""]
test = ["distro (>=1.9.0,<1.10.0)", "flake8 (>=6.1,<7.0)", "flake8-pyi (>=24.1.0,<24.2.0)", "gssapi ; platform_system == \"Linux\"", "k5test ; platform_system == \"Linux\"", "mypy (>=1.8.0,<1.9.0)", "sspilib ; platform_system == \"Windows\"", "uvloop (>=0.15.3) ; platform_system != \"Windows\" and python_version < \"3.14.0\""]
[[package]]
name = "attrs"
version = "25.3.0"
@@ -3600,6 +3667,21 @@ all = ["pbs-installer[download,install]"]
download = ["httpx (>=0.27.0,<1)"]
install = ["zstandard (>=0.21.0)"]
[[package]]
name = "pgvector"
version = "0.4.1"
description = "pgvector support for Python"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "pgvector-0.4.1-py3-none-any.whl", hash = "sha256:34bb4e99e1b13d08a2fe82dda9f860f15ddcd0166fbb25bffe15821cbfeb7362"},
{file = "pgvector-0.4.1.tar.gz", hash = "sha256:83d3a1c044ff0c2f1e95d13dfb625beb0b65506cfec0941bfe81fd0ad44f4003"},
]
[package.dependencies]
numpy = "*"
[[package]]
name = "pika"
version = "1.3.2"
@@ -7132,4 +7214,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.14"
content-hash = "892daa57d7126d9a9d5308005b07328a39b8c4cd7fe198f9b5ab10f957787c48"
content-hash = "e5e5cd7f11c2e2351c084aec7e4ddec6c154dc7274797af6ee616fa14db5676d"

View File

@@ -79,6 +79,8 @@ firecrawl-py = "^2.16.3"
exa-py = "^1.14.20"
croniter = "^6.0.0"
stagehand = "^0.5.1"
pgvector = "^0.4.1"
asyncpg = "^0.30.0"
[tool.poetry.group.dev.dependencies]
aiohappyeyeballs = "^2.6.1"

View File

@@ -724,6 +724,7 @@ model StoreListing {
// Relations
Versions StoreListingVersion[] @relation("ListingVersions")
SearchIndexes StoreAgentSearch[] @relation("SearchIndexes")
// Unique index on agentId to ensure only one listing per agent, regardless of number of versions the agent has.
@@unique([agentGraphId])
@@ -781,6 +782,9 @@ model StoreListingVersion {
// Reviews for this specific version
Reviews StoreListingReview[]
// Search index entries for vector search
SearchIndexes StoreAgentSearch[] @relation("SearchIndexes")
@@unique([storeListingId, version])
@@index([storeListingId, submissionStatus, isAvailable])
@@ -789,6 +793,55 @@ model StoreListingVersion {
@@index([agentGraphId, agentGraphVersion]) // Non-unique index for efficient lookups
}
// Per-field vector-search index row for a store listing version.
// Each indexed field (name, description, categories, subheading) of a
// StoreListingVersion gets its own row with its own embedding, so similarity
// search can weight/filter by field. Rows are cascade-deleted with their
// parent version/listing.
model StoreAgentSearch {
id String @id @default(uuid())
createdAt DateTime @default(now())
updatedAt DateTime @updatedAt
// Relation to StoreListingVersion
// onDelete: Cascade — index rows are removed when the version is deleted
storeListingVersionId String
StoreListingVersion StoreListingVersion @relation("SearchIndexes", fields: [storeListingVersionId], references: [id], onDelete: Cascade)
// Direct relation to StoreListing for easier joins
// (denormalized: avoids going through StoreListingVersion at query time)
storeListingId String
StoreListing StoreListing @relation("SearchIndexes", fields: [storeListingId], references: [id], onDelete: Cascade)
// Searchable fields that will be embedded
fieldName String // The field being indexed (name, description, categories, subheading)
fieldValue String // The actual text content for this field
// Vector embedding for similarity search using pgvector
// text-embedding-3-small produces 1536-dimensional embeddings
// NOTE(review): Float[] is a plain float array column, not the native
// pgvector `vector(1536)` type — confirm the manual migration (see note at
// the bottom of this model) converts/casts the column so vector operators
// and the similarity index actually apply.
embedding Float[] // pgvector field for storing 1536-dimensional embeddings
// Metadata for search optimization
fieldType SearchFieldType // Type of field for filtering
// Denormalized fields for filtering without joins
// NOTE(review): these copies must be kept in sync with the parent
// StoreListingVersion whenever its status/availability changes — verify the
// write path updates them.
submissionStatus SubmissionStatus // Copy from StoreListingVersion for filtering approved content
isAvailable Boolean // Copy from StoreListingVersion for filtering available content
// Unique constraint to prevent duplicate indexing of the same field
// (one row per (version, field) pair)
@@unique([storeListingVersionId, fieldName])
// Indexes for performance
@@index([storeListingVersionId])
@@index([storeListingId])
@@index([fieldName])
@@index([fieldType])
@@index([submissionStatus, isAvailable]) // For filtering approved and available content
// Note: pgvector index will be created manually in migration for vector similarity search
}
// Enum for searchable field types
// Closed set of listing fields that receive their own embedding row in
// StoreAgentSearch (stored in its `fieldType` column).
enum SearchFieldType {
NAME // Agent name
DESCRIPTION // Agent description
CATEGORIES // Agent categories
SUBHEADING // Agent subheading
}
model StoreListingReview {
id String @id @default(uuid())
createdAt DateTime @default(now())