Compare commits

...

2 Commits

Author SHA1 Message Date
Zamil Majdy
0a48c49902 test(backend): Update test to match removed SET search_path call
The SET search_path call was removed in favor of explicit schema
qualification, so the test now expects only 1 execute_raw call.
2026-01-19 21:31:40 -05:00
Zamil Majdy
1fc1102eb4 fix(backend): Use explicit schema qualification for pgvector types
Fix intermittent "type 'vector' does not exist" errors when using
PgBouncer in transaction mode. The issue was that SET search_path
and the actual query could run on different backend connections.

Changes:
- Add {schema} placeholder for raw schema name (e.g., platform)
- Use explicit type casting: {schema}.vector instead of ::vector
- Use explicit operator: OPERATOR({schema}.<=>) instead of <=>
- Remove set_public_search_path parameter (no longer needed)
- Remove search_path manipulation from DATABASE_URL

Tested on both local and dev environments via kubectl exec.

Fixes: AUTOGPT-SERVER-763, AUTOGPT-SERVER-764
2026-01-19 18:14:29 -05:00
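
The failure mode is easiest to see as two round-trips. A minimal sketch of what the commit message describes (the table "Emb", the `vec` value, and the bare `db` client are illustrative; `execute_raw` is the Prisma client call used throughout the diffs below):

# Under PgBouncer in transaction mode, each autocommit statement is its own
# transaction and may be routed to a different server backend, so session
# state (like search_path) set by one statement can be gone for the next.

# BROKEN: search_path is set on backend A, but the INSERT may land on
# backend B, where ::vector resolves against the default search_path and
# fails with 'type "vector" does not exist'.
await db.execute_raw("SET search_path = platform, public")
await db.execute_raw('INSERT INTO "Emb" (v) VALUES ($1::vector)', vec)

# FIXED: no session state; the pgvector type is schema-qualified inline,
# so the statement is self-contained on whichever backend runs it.
await db.execute_raw('INSERT INTO "Emb" (v) VALUES ($1::platform.vector)', vec)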
5 changed files with 69 additions and 87 deletions

View File

@@ -154,15 +154,16 @@ async def store_content_embedding(
     # Upsert the embedding
     # WHERE clause in DO UPDATE prevents PostgreSQL 15 bug with NULLS NOT DISTINCT
+    # Use {schema}.vector for explicit pgvector type qualification
     await execute_raw_with_schema(
         """
         INSERT INTO {schema_prefix}"UnifiedContentEmbedding" (
             "id", "contentType", "contentId", "userId", "embedding", "searchableText", "metadata", "createdAt", "updatedAt"
         )
-        VALUES (gen_random_uuid()::text, $1::{schema_prefix}"ContentType", $2, $3, $4::vector, $5, $6::jsonb, NOW(), NOW())
+        VALUES (gen_random_uuid()::text, $1::{schema_prefix}"ContentType", $2, $3, $4::{schema}.vector, $5, $6::jsonb, NOW(), NOW())
         ON CONFLICT ("contentType", "contentId", "userId")
         DO UPDATE SET
-            "embedding" = $4::vector,
+            "embedding" = $4::{schema}.vector,
             "searchableText" = $5,
             "metadata" = $6::jsonb,
             "updatedAt" = NOW()
@@ -177,7 +178,6 @@ async def store_content_embedding(
         searchable_text,
         metadata_json,
         client=client,
-        set_public_search_path=True,
     )
     logger.info(f"Stored embedding for {content_type}:{content_id}")
@@ -236,7 +236,6 @@ async def get_content_embedding(
         content_type,
         content_id,
         user_id,
-        set_public_search_path=True,
     )
     if result and len(result) > 0:
@@ -871,31 +870,34 @@ async def semantic_search(
     # Add content type parameters and build placeholders dynamically
     content_type_start_idx = len(params) + 1
     content_type_placeholders = ", ".join(
-        f'${content_type_start_idx + i}::{{{{schema_prefix}}}}"ContentType"'
+        '$' + str(content_type_start_idx + i) + '::{schema_prefix}"ContentType"'
         for i in range(len(content_types))
     )
     params.extend([ct.value for ct in content_types])
-    sql = f"""
+    # Build min_similarity param index before appending
+    min_similarity_idx = len(params) + 1
+    params.append(min_similarity)
+    # Use regular string (not f-string) for template to preserve {schema_prefix} and {schema} placeholders
+    # Use OPERATOR({schema}.<=>) for explicit operator schema qualification
+    sql = """
         SELECT
             "contentId" as content_id,
             "contentType" as content_type,
             "searchableText" as searchable_text,
             metadata,
-            1 - (embedding <=> '{embedding_str}'::vector) as similarity
-        FROM {{{{schema_prefix}}}}"UnifiedContentEmbedding"
-        WHERE "contentType" IN ({content_type_placeholders})
-        {user_filter}
-            AND 1 - (embedding <=> '{embedding_str}'::vector) >= ${len(params) + 1}
+            1 - (embedding OPERATOR({schema}.<=>) '""" + embedding_str + """'::{schema}.vector) as similarity
+        FROM {schema_prefix}"UnifiedContentEmbedding"
+        WHERE "contentType" IN (""" + content_type_placeholders + """)
+        """ + user_filter + """
+            AND 1 - (embedding OPERATOR({schema}.<=>) '""" + embedding_str + """'::{schema}.vector) >= $""" + str(min_similarity_idx) + """
         ORDER BY similarity DESC
         LIMIT $1
     """
-    params.append(min_similarity)
     try:
-        results = await query_raw_with_schema(
-            sql, *params, set_public_search_path=True
-        )
+        results = await query_raw_with_schema(sql, *params)
         return [
             {
                 "content_id": row["content_id"],
@@ -922,31 +924,33 @@ async def semantic_search(
     # Add content type parameters and build placeholders dynamically
     content_type_start_idx = len(params_lexical) + 1
     content_type_placeholders_lexical = ", ".join(
-        f'${content_type_start_idx + i}::{{{{schema_prefix}}}}"ContentType"'
+        '$' + str(content_type_start_idx + i) + '::{schema_prefix}"ContentType"'
         for i in range(len(content_types))
     )
     params_lexical.extend([ct.value for ct in content_types])
-    sql_lexical = f"""
+    # Build query param index before appending
+    query_param_idx = len(params_lexical) + 1
+    params_lexical.append(f"%{query}%")
+    # Use regular string (not f-string) for template to preserve {schema_prefix} placeholders
+    sql_lexical = """
        SELECT
            "contentId" as content_id,
            "contentType" as content_type,
            "searchableText" as searchable_text,
            metadata,
            0.0 as similarity
-        FROM {{{{schema_prefix}}}}"UnifiedContentEmbedding"
-        WHERE "contentType" IN ({content_type_placeholders_lexical})
-        {user_filter}
-            AND "searchableText" ILIKE ${len(params_lexical) + 1}
+        FROM {schema_prefix}"UnifiedContentEmbedding"
+        WHERE "contentType" IN (""" + content_type_placeholders_lexical + """)
+        """ + user_filter + """
+            AND "searchableText" ILIKE $""" + str(query_param_idx) + """
        ORDER BY "updatedAt" DESC
        LIMIT $1
    """
-    params_lexical.append(f"%{query}%")
    try:
-        results = await query_raw_with_schema(
-            sql_lexical, *params_lexical, set_public_search_path=True
-        )
+        results = await query_raw_with_schema(sql_lexical, *params_lexical)
        return [
            {
                "content_id": row["content_id"],

View File

@@ -155,18 +155,14 @@ async def test_store_embedding_success(mocker):
     )
     assert result is True
-    # execute_raw is called twice: once for SET search_path, once for INSERT
-    assert mock_client.execute_raw.call_count == 2
-    # First call: SET search_path
-    first_call_args = mock_client.execute_raw.call_args_list[0][0]
-    assert "SET search_path" in first_call_args[0]
-
-    # Second call: INSERT query with the actual data
-    second_call_args = mock_client.execute_raw.call_args_list[1][0]
-    assert "test-version-id" in second_call_args
-    assert "[0.1,0.2,0.3]" in second_call_args
-    assert None in second_call_args  # userId should be None for store agents
+    # execute_raw is called once for INSERT (no separate SET search_path needed)
+    assert mock_client.execute_raw.call_count == 1
+    # Verify the INSERT query with the actual data
+    call_args = mock_client.execute_raw.call_args_list[0][0]
+    assert "test-version-id" in call_args
+    assert "[0.1,0.2,0.3]" in call_args
+    assert None in call_args  # userId should be None for store agents

 @pytest.mark.asyncio(loop_scope="session")

View File

@@ -295,7 +295,7 @@ async def unified_hybrid_search(
             FROM {{schema_prefix}}"UnifiedContentEmbedding" uce
             WHERE uce."contentType" = ANY({content_types_param}::{{schema_prefix}}"ContentType"[])
             {user_filter}
-            ORDER BY uce.embedding <=> {embedding_param}::vector
+            ORDER BY uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector
             LIMIT 200
         )
     ),
@@ -307,7 +307,7 @@ async def unified_hybrid_search(
             uce.metadata,
             uce."updatedAt" as updated_at,
             -- Semantic score: cosine similarity (1 - distance)
-            COALESCE(1 - (uce.embedding <=> {embedding_param}::vector), 0) as semantic_score,
+            COALESCE(1 - (uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector), 0) as semantic_score,
             -- Lexical score: ts_rank_cd
             COALESCE(ts_rank_cd(uce.search, plainto_tsquery('english', {query_param})), 0) as lexical_raw,
             -- Category match from metadata
@@ -363,9 +363,7 @@ async def unified_hybrid_search(
         LIMIT {limit_param} OFFSET {offset_param}
     """
-    results = await query_raw_with_schema(
-        sql_query, *params, set_public_search_path=True
-    )
+    results = await query_raw_with_schema(sql_query, *params)
     total = results[0]["total_count"] if results else 0
     # Apply BM25 reranking
@@ -585,7 +583,7 @@ async def hybrid_search(
             WHERE uce."contentType" = 'STORE_AGENT'::{{schema_prefix}}"ContentType"
                 AND uce."userId" IS NULL
                 AND {where_clause}
-            ORDER BY uce.embedding <=> {embedding_param}::vector
+            ORDER BY uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector
             LIMIT 200
         ) uce
     ),
@@ -607,7 +605,7 @@ async def hybrid_search(
             -- Searchable text for BM25 reranking
             COALESCE(sa.agent_name, '') || ' ' || COALESCE(sa.sub_heading, '') || ' ' || COALESCE(sa.description, '') as searchable_text,
             -- Semantic score
-            COALESCE(1 - (uce.embedding <=> {embedding_param}::vector), 0) as semantic_score,
+            COALESCE(1 - (uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector), 0) as semantic_score,
             -- Lexical score (raw, will normalize)
             COALESCE(ts_rank_cd(uce.search, plainto_tsquery('english', {query_param})), 0) as lexical_raw,
             -- Category match
@@ -688,9 +686,7 @@ async def hybrid_search(
         LIMIT {limit_param} OFFSET {offset_param}
     """
-    results = await query_raw_with_schema(
-        sql_query, *params, set_public_search_path=True
-    )
+    results = await query_raw_with_schema(sql_query, *params)
     total = results[0]["total_count"] if results else 0
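
The `OPERATOR({{schema}}.<=>)` form is needed because PostgreSQL does not accept `platform.<=>` as a plainly qualified operator; schema-qualifying an operator requires the `OPERATOR(schema.name)` syntax. These templates are f-strings (they interpolate `{embedding_param}` and friends), so the braces are doubled and collapse to single `{schema}` placeholders for the later `.format()` call. A sketch of the final substitution (the `$2` index is illustrative):

template = 'ORDER BY uce.embedding OPERATOR({schema}.<=>) $2::{schema}.vector'
print(template.format(schema="platform"))
# ORDER BY uce.embedding OPERATOR(platform.<=>) $2::platform.vector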

View File

@@ -38,20 +38,6 @@ POOL_TIMEOUT = os.getenv("DB_POOL_TIMEOUT")
 if POOL_TIMEOUT:
     DATABASE_URL = add_param(DATABASE_URL, "pool_timeout", POOL_TIMEOUT)
-# Add public schema to search_path for pgvector type access
-# The vector extension is in public schema, but search_path is determined by schema parameter
-# Extract the schema from DATABASE_URL or default to 'public' (matching get_database_schema())
-parsed_url = urlparse(DATABASE_URL)
-url_params = dict(parse_qsl(parsed_url.query))
-db_schema = url_params.get("schema", "public")
-# Build search_path, avoiding duplicates if db_schema is already 'public'
-search_path_schemas = list(
-    dict.fromkeys([db_schema, "public"])
-)  # Preserves order, removes duplicates
-search_path = ",".join(search_path_schemas)
-# This allows using ::vector without schema qualification
-DATABASE_URL = add_param(DATABASE_URL, "options", f"-c search_path={search_path}")
 HTTP_TIMEOUT = int(POOL_TIMEOUT) if POOL_TIMEOUT else None
 prisma = Prisma(
@@ -127,38 +113,43 @@ async def _raw_with_schema(
     *args,
     execute: bool = False,
     client: Prisma | None = None,
-    set_public_search_path: bool = False,
 ) -> list[dict] | int:
     """Internal: Execute raw SQL with proper schema handling.
     Use query_raw_with_schema() or execute_raw_with_schema() instead.
+    Supports placeholders:
+    - {schema_prefix}: Table/type prefix (e.g., "platform".)
+    - {schema}: Raw schema name (e.g., platform) for pgvector types and operators
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
         execute: If False, executes SELECT query. If True, executes INSERT/UPDATE/DELETE.
         client: Optional Prisma client for transactions (only used when execute=True).
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         - list[dict] if execute=False (query results)
        - int if execute=True (number of affected rows)
+    Example with vector type:
+        await execute_raw_with_schema(
+            'INSERT INTO {schema_prefix}"Embedding" (vec) VALUES ($1::{schema}.vector)',
+            embedding_data
+        )
     """
     schema = get_database_schema()
     schema_prefix = f'"{schema}".' if schema != "public" else ""
-    formatted_query = query_template.format(schema_prefix=schema_prefix)
+    formatted_query = query_template.format(
+        schema_prefix=schema_prefix,
+        schema=schema,
+    )
     import prisma as prisma_module
     db_client = client if client else prisma_module.get_client()
-    # Set search_path to include public schema if requested
-    # Prisma doesn't support the 'options' connection parameter, so we set it per-session
-    # This is idempotent and safe to call multiple times
-    if set_public_search_path:
-        await db_client.execute_raw(f"SET search_path = {schema}, public")  # type: ignore
     if execute:
         result = await db_client.execute_raw(formatted_query, *args)  # type: ignore
     else:
@@ -167,16 +158,12 @@ async def _raw_with_schema(
     return result
-async def query_raw_with_schema(
-    query_template: str, *args, set_public_search_path: bool = False
-) -> list[dict]:
+async def query_raw_with_schema(query_template: str, *args) -> list[dict]:
     """Execute raw SQL SELECT query with proper schema handling.
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         List of result rows as dictionaries
@@ -187,23 +174,20 @@ async def query_raw_with_schema(
             user_id
         )
     """
-    return await _raw_with_schema(query_template, *args, execute=False, set_public_search_path=set_public_search_path)  # type: ignore
+    return await _raw_with_schema(query_template, *args, execute=False)  # type: ignore
 async def execute_raw_with_schema(
     query_template: str,
     *args,
     client: Prisma | None = None,
-    set_public_search_path: bool = False,
 ) -> int:
     """Execute raw SQL command (INSERT/UPDATE/DELETE) with proper schema handling.
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
         client: Optional Prisma client for transactions
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         Number of affected rows
@@ -215,7 +199,7 @@ async def execute_raw_with_schema(
             client=tx  # Optional transaction client
         )
     """
-    return await _raw_with_schema(query_template, *args, execute=True, client=client, set_public_search_path=set_public_search_path)  # type: ignore
+    return await _raw_with_schema(query_template, *args, execute=True, client=client)  # type: ignore
 class BaseDbModel(BaseModel):
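
Putting the two placeholders together, a hedged usage sketch (the "MyVec" table and its columns are hypothetical; the helper signature matches the diff above):

# {schema_prefix} qualifies tables and enum types; {schema} qualifies
# pgvector types and operators. Both are filled in per call by
# _raw_with_schema(), so no session-level search_path is required.
rows = await query_raw_with_schema(
    """
    SELECT id, 1 - (vec OPERATOR({schema}.<=>) $1::{schema}.vector) AS sim
    FROM {schema_prefix}"MyVec"
    ORDER BY sim DESC
    LIMIT 10
    """,
    embedding_str,  # pgvector literal such as '[0.1,0.2,0.3]'
)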

View File

@@ -1,9 +1,10 @@
 -- CreateExtension
 -- Supabase: pgvector must be enabled via Dashboard → Database → Extensions first
--- Create in public schema so vector type is available across all schemas
+-- Creates extension in current schema (determined by search_path)
+-- The extension may already exist in a different schema (e.g., Supabase pre-enables it)
 DO $$
 BEGIN
-    CREATE EXTENSION IF NOT EXISTS "vector" WITH SCHEMA "public";
+    CREATE EXTENSION IF NOT EXISTS "vector";
 EXCEPTION WHEN OTHERS THEN
     RAISE NOTICE 'vector extension not available or already exists, skipping';
 END $$;
@@ -12,6 +13,7 @@ END $$;
 CREATE TYPE "ContentType" AS ENUM ('STORE_AGENT', 'BLOCK', 'INTEGRATION', 'DOCUMENTATION', 'LIBRARY_AGENT');
 -- CreateTable
+-- Note: vector type is unqualified - relies on search_path including the schema where pgvector is installed
 CREATE TABLE "UnifiedContentEmbedding" (
     "id" TEXT NOT NULL,
     "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
@@ -19,7 +21,7 @@ CREATE TABLE "UnifiedContentEmbedding" (
     "contentType" "ContentType" NOT NULL,
     "contentId" TEXT NOT NULL,
     "userId" TEXT,
-    "embedding" public.vector(1536) NOT NULL,
+    "embedding" vector(1536) NOT NULL,
     "searchableText" TEXT NOT NULL,
     "metadata" JSONB NOT NULL DEFAULT '{}',
@@ -45,4 +47,4 @@ CREATE UNIQUE INDEX "UnifiedContentEmbedding_contentType_contentId_userId_key" O
 -- Uses cosine distance operator (<=>), which matches the query in hybrid_search.py
 -- Note: Drop first in case Prisma created a btree index (Prisma doesn't support HNSW)
 DROP INDEX IF EXISTS "UnifiedContentEmbedding_embedding_idx";
-CREATE INDEX "UnifiedContentEmbedding_embedding_idx" ON "UnifiedContentEmbedding" USING hnsw ("embedding" public.vector_cosine_ops);
+CREATE INDEX "UnifiedContentEmbedding_embedding_idx" ON "UnifiedContentEmbedding" USING hnsw ("embedding" vector_cosine_ops);