Compare commits

...

2 Commits

Author SHA1 Message Date
Zamil Majdy
0a48c49902 test(backend): Update test to match removed SET search_path call
The SET search_path call was removed in favor of explicit schema
qualification, so the test now expects only 1 execute_raw call.
2026-01-19 21:31:40 -05:00
Zamil Majdy
1fc1102eb4 fix(backend): Use explicit schema qualification for pgvector types
Fix intermittent "type 'vector' does not exist" errors when using
PgBouncer in transaction mode. The issue was that SET search_path
and the actual query could run on different backend connections.

Changes:
- Add {schema} placeholder for raw schema name (e.g., platform)
- Use explicit type casting: {schema}.vector instead of ::vector
- Use explicit operator: OPERATOR({schema}.<=>) instead of <=>
- Remove set_public_search_path parameter (no longer needed)
- Remove search_path manipulation from DATABASE_URL

Tested on both local and dev environments via kubectl exec.

Fixes: AUTOGPT-SERVER-763, AUTOGPT-SERVER-764
2026-01-19 18:14:29 -05:00
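
The failure mode is easiest to see as two round-trips. A minimal sketch of what the commit message describes (the table "Emb", the `vec` value, and the bare `db` client are illustrative; `execute_raw` is the Prisma client call used throughout the diffs below):

# Under PgBouncer in transaction mode, each autocommit statement is its own
# transaction and may be routed to a different server backend, so session
# state (like search_path) set by one statement can be gone for the next.

# BROKEN: search_path is set on backend A, but the INSERT may land on
# backend B, where ::vector resolves against the default search_path and
# fails with 'type "vector" does not exist'.
await db.execute_raw("SET search_path = platform, public")
await db.execute_raw('INSERT INTO "Emb" (v) VALUES ($1::vector)', vec)

# FIXED: no session state; the pgvector type is schema-qualified inline,
# so the statement is self-contained on whichever backend runs it.
await db.execute_raw('INSERT INTO "Emb" (v) VALUES ($1::platform.vector)', vec)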
5 changed files with 69 additions and 87 deletions

View File

@@ -154,15 +154,16 @@ async def store_content_embedding(
     # Upsert the embedding
     # WHERE clause in DO UPDATE prevents PostgreSQL 15 bug with NULLS NOT DISTINCT
+    # Use {schema}.vector for explicit pgvector type qualification
     await execute_raw_with_schema(
         """
         INSERT INTO {schema_prefix}"UnifiedContentEmbedding" (
             "id", "contentType", "contentId", "userId", "embedding", "searchableText", "metadata", "createdAt", "updatedAt"
         )
-        VALUES (gen_random_uuid()::text, $1::{schema_prefix}"ContentType", $2, $3, $4::vector, $5, $6::jsonb, NOW(), NOW())
+        VALUES (gen_random_uuid()::text, $1::{schema_prefix}"ContentType", $2, $3, $4::{schema}.vector, $5, $6::jsonb, NOW(), NOW())
         ON CONFLICT ("contentType", "contentId", "userId")
         DO UPDATE SET
-            "embedding" = $4::vector,
+            "embedding" = $4::{schema}.vector,
             "searchableText" = $5,
             "metadata" = $6::jsonb,
             "updatedAt" = NOW()
@@ -177,7 +178,6 @@ async def store_content_embedding(
         searchable_text,
         metadata_json,
         client=client,
-        set_public_search_path=True,
     )
     logger.info(f"Stored embedding for {content_type}:{content_id}")
@@ -236,7 +236,6 @@ async def get_content_embedding(
         content_type,
         content_id,
         user_id,
-        set_public_search_path=True,
     )
     if result and len(result) > 0:
@@ -871,31 +870,34 @@ async def semantic_search(
     # Add content type parameters and build placeholders dynamically
     content_type_start_idx = len(params) + 1
     content_type_placeholders = ", ".join(
-        f'${content_type_start_idx + i}::{{{{schema_prefix}}}}"ContentType"'
+        '$' + str(content_type_start_idx + i) + '::{schema_prefix}"ContentType"'
         for i in range(len(content_types))
     )
     params.extend([ct.value for ct in content_types])
-    sql = f"""
+    # Build min_similarity param index before appending
+    min_similarity_idx = len(params) + 1
+    params.append(min_similarity)
+    # Use regular string (not f-string) for template to preserve {schema_prefix} and {schema} placeholders
+    # Use OPERATOR({schema}.<=>) for explicit operator schema qualification
+    sql = """
         SELECT
             "contentId" as content_id,
             "contentType" as content_type,
             "searchableText" as searchable_text,
             metadata,
-            1 - (embedding <=> '{embedding_str}'::vector) as similarity
-        FROM {{{{schema_prefix}}}}"UnifiedContentEmbedding"
-        WHERE "contentType" IN ({content_type_placeholders})
-        {user_filter}
-            AND 1 - (embedding <=> '{embedding_str}'::vector) >= ${len(params) + 1}
+            1 - (embedding OPERATOR({schema}.<=>) '""" + embedding_str + """'::{schema}.vector) as similarity
+        FROM {schema_prefix}"UnifiedContentEmbedding"
+        WHERE "contentType" IN (""" + content_type_placeholders + """)
+        """ + user_filter + """
+            AND 1 - (embedding OPERATOR({schema}.<=>) '""" + embedding_str + """'::{schema}.vector) >= $""" + str(min_similarity_idx) + """
         ORDER BY similarity DESC
         LIMIT $1
     """
-    params.append(min_similarity)
     try:
-        results = await query_raw_with_schema(
-            sql, *params, set_public_search_path=True
-        )
+        results = await query_raw_with_schema(sql, *params)
         return [
             {
                 "content_id": row["content_id"],
@@ -922,31 +924,33 @@ async def semantic_search(
     # Add content type parameters and build placeholders dynamically
     content_type_start_idx = len(params_lexical) + 1
     content_type_placeholders_lexical = ", ".join(
-        f'${content_type_start_idx + i}::{{{{schema_prefix}}}}"ContentType"'
+        '$' + str(content_type_start_idx + i) + '::{schema_prefix}"ContentType"'
         for i in range(len(content_types))
     )
     params_lexical.extend([ct.value for ct in content_types])
-    sql_lexical = f"""
+    # Build query param index before appending
+    query_param_idx = len(params_lexical) + 1
+    params_lexical.append(f"%{query}%")
+    # Use regular string (not f-string) for template to preserve {schema_prefix} placeholders
+    sql_lexical = """
        SELECT
            "contentId" as content_id,
            "contentType" as content_type,
            "searchableText" as searchable_text,
            metadata,
            0.0 as similarity
-        FROM {{{{schema_prefix}}}}"UnifiedContentEmbedding"
-        WHERE "contentType" IN ({content_type_placeholders_lexical})
-        {user_filter}
-            AND "searchableText" ILIKE ${len(params_lexical) + 1}
+        FROM {schema_prefix}"UnifiedContentEmbedding"
+        WHERE "contentType" IN (""" + content_type_placeholders_lexical + """)
+        """ + user_filter + """
+            AND "searchableText" ILIKE $""" + str(query_param_idx) + """
        ORDER BY "updatedAt" DESC
        LIMIT $1
    """
-    params_lexical.append(f"%{query}%")
    try:
-        results = await query_raw_with_schema(
-            sql_lexical, *params_lexical, set_public_search_path=True
-        )
+        results = await query_raw_with_schema(sql_lexical, *params_lexical)
        return [
            {
                "content_id": row["content_id"],

View File

@@ -155,18 +155,14 @@ async def test_store_embedding_success(mocker):
     )
     assert result is True
-    # execute_raw is called twice: once for SET search_path, once for INSERT
-    assert mock_client.execute_raw.call_count == 2
-    # First call: SET search_path
-    first_call_args = mock_client.execute_raw.call_args_list[0][0]
-    assert "SET search_path" in first_call_args[0]
-
-    # Second call: INSERT query with the actual data
-    second_call_args = mock_client.execute_raw.call_args_list[1][0]
-    assert "test-version-id" in second_call_args
-    assert "[0.1,0.2,0.3]" in second_call_args
-    assert None in second_call_args  # userId should be None for store agents
+    # execute_raw is called once for INSERT (no separate SET search_path needed)
+    assert mock_client.execute_raw.call_count == 1
+    # Verify the INSERT query with the actual data
+    call_args = mock_client.execute_raw.call_args_list[0][0]
+    assert "test-version-id" in call_args
+    assert "[0.1,0.2,0.3]" in call_args
+    assert None in call_args  # userId should be None for store agents

 @pytest.mark.asyncio(loop_scope="session")

View File

@@ -295,7 +295,7 @@ async def unified_hybrid_search(
             FROM {{schema_prefix}}"UnifiedContentEmbedding" uce
             WHERE uce."contentType" = ANY({content_types_param}::{{schema_prefix}}"ContentType"[])
             {user_filter}
-            ORDER BY uce.embedding <=> {embedding_param}::vector
+            ORDER BY uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector
             LIMIT 200
         )
     ),
@@ -307,7 +307,7 @@ async def unified_hybrid_search(
             uce.metadata,
             uce."updatedAt" as updated_at,
             -- Semantic score: cosine similarity (1 - distance)
-            COALESCE(1 - (uce.embedding <=> {embedding_param}::vector), 0) as semantic_score,
+            COALESCE(1 - (uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector), 0) as semantic_score,
             -- Lexical score: ts_rank_cd
             COALESCE(ts_rank_cd(uce.search, plainto_tsquery('english', {query_param})), 0) as lexical_raw,
             -- Category match from metadata
@@ -363,9 +363,7 @@ async def unified_hybrid_search(
         LIMIT {limit_param} OFFSET {offset_param}
     """
-    results = await query_raw_with_schema(
-        sql_query, *params, set_public_search_path=True
-    )
+    results = await query_raw_with_schema(sql_query, *params)
     total = results[0]["total_count"] if results else 0
     # Apply BM25 reranking
@@ -585,7 +583,7 @@ async def hybrid_search(
             WHERE uce."contentType" = 'STORE_AGENT'::{{schema_prefix}}"ContentType"
                 AND uce."userId" IS NULL
                 AND {where_clause}
-            ORDER BY uce.embedding <=> {embedding_param}::vector
+            ORDER BY uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector
             LIMIT 200
         ) uce
     ),
@@ -607,7 +605,7 @@ async def hybrid_search(
             -- Searchable text for BM25 reranking
             COALESCE(sa.agent_name, '') || ' ' || COALESCE(sa.sub_heading, '') || ' ' || COALESCE(sa.description, '') as searchable_text,
             -- Semantic score
-            COALESCE(1 - (uce.embedding <=> {embedding_param}::vector), 0) as semantic_score,
+            COALESCE(1 - (uce.embedding OPERATOR({{schema}}.<=>) {embedding_param}::{{schema}}.vector), 0) as semantic_score,
             -- Lexical score (raw, will normalize)
             COALESCE(ts_rank_cd(uce.search, plainto_tsquery('english', {query_param})), 0) as lexical_raw,
             -- Category match
@@ -688,9 +686,7 @@ async def hybrid_search(
         LIMIT {limit_param} OFFSET {offset_param}
     """
-    results = await query_raw_with_schema(
-        sql_query, *params, set_public_search_path=True
-    )
+    results = await query_raw_with_schema(sql_query, *params)
     total = results[0]["total_count"] if results else 0
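
The `OPERATOR({{schema}}.<=>)` form is needed because PostgreSQL does not accept `platform.<=>` as a plainly qualified operator; schema-qualifying an operator requires the `OPERATOR(schema.name)` syntax. These templates are f-strings (they interpolate `{embedding_param}` and friends), so the braces are doubled and collapse to single `{schema}` placeholders for the later `.format()` call. A sketch of the final substitution (the `$2` index is illustrative):

template = 'ORDER BY uce.embedding OPERATOR({schema}.<=>) $2::{schema}.vector'
print(template.format(schema="platform"))
# ORDER BY uce.embedding OPERATOR(platform.<=>) $2::platform.vector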

View File

@@ -38,20 +38,6 @@ POOL_TIMEOUT = os.getenv("DB_POOL_TIMEOUT")
 if POOL_TIMEOUT:
     DATABASE_URL = add_param(DATABASE_URL, "pool_timeout", POOL_TIMEOUT)
-# Add public schema to search_path for pgvector type access
-# The vector extension is in public schema, but search_path is determined by schema parameter
-# Extract the schema from DATABASE_URL or default to 'public' (matching get_database_schema())
-parsed_url = urlparse(DATABASE_URL)
-url_params = dict(parse_qsl(parsed_url.query))
-db_schema = url_params.get("schema", "public")
-# Build search_path, avoiding duplicates if db_schema is already 'public'
-search_path_schemas = list(
-    dict.fromkeys([db_schema, "public"])
-)  # Preserves order, removes duplicates
-search_path = ",".join(search_path_schemas)
-# This allows using ::vector without schema qualification
-DATABASE_URL = add_param(DATABASE_URL, "options", f"-c search_path={search_path}")
 HTTP_TIMEOUT = int(POOL_TIMEOUT) if POOL_TIMEOUT else None
 prisma = Prisma(
@@ -127,38 +113,43 @@ async def _raw_with_schema(
     *args,
     execute: bool = False,
     client: Prisma | None = None,
-    set_public_search_path: bool = False,
 ) -> list[dict] | int:
     """Internal: Execute raw SQL with proper schema handling.
     Use query_raw_with_schema() or execute_raw_with_schema() instead.
+    Supports placeholders:
+    - {schema_prefix}: Table/type prefix (e.g., "platform".)
+    - {schema}: Raw schema name (e.g., platform) for pgvector types and operators
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
         execute: If False, executes SELECT query. If True, executes INSERT/UPDATE/DELETE.
         client: Optional Prisma client for transactions (only used when execute=True).
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         - list[dict] if execute=False (query results)
        - int if execute=True (number of affected rows)
+    Example with vector type:
+        await execute_raw_with_schema(
+            'INSERT INTO {schema_prefix}"Embedding" (vec) VALUES ($1::{schema}.vector)',
+            embedding_data
+        )
     """
     schema = get_database_schema()
     schema_prefix = f'"{schema}".' if schema != "public" else ""
-    formatted_query = query_template.format(schema_prefix=schema_prefix)
+    formatted_query = query_template.format(
+        schema_prefix=schema_prefix,
+        schema=schema,
+    )
     import prisma as prisma_module
     db_client = client if client else prisma_module.get_client()
-    # Set search_path to include public schema if requested
-    # Prisma doesn't support the 'options' connection parameter, so we set it per-session
-    # This is idempotent and safe to call multiple times
-    if set_public_search_path:
-        await db_client.execute_raw(f"SET search_path = {schema}, public")  # type: ignore
     if execute:
         result = await db_client.execute_raw(formatted_query, *args)  # type: ignore
     else:
@@ -167,16 +158,12 @@ async def _raw_with_schema(
     return result
-async def query_raw_with_schema(
-    query_template: str, *args, set_public_search_path: bool = False
-) -> list[dict]:
+async def query_raw_with_schema(query_template: str, *args) -> list[dict]:
     """Execute raw SQL SELECT query with proper schema handling.
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         List of result rows as dictionaries
@@ -187,23 +174,20 @@ async def query_raw_with_schema(
             user_id
         )
     """
-    return await _raw_with_schema(query_template, *args, execute=False, set_public_search_path=set_public_search_path)  # type: ignore
+    return await _raw_with_schema(query_template, *args, execute=False)  # type: ignore
 async def execute_raw_with_schema(
     query_template: str,
     *args,
     client: Prisma | None = None,
-    set_public_search_path: bool = False,
 ) -> int:
     """Execute raw SQL command (INSERT/UPDATE/DELETE) with proper schema handling.
     Args:
-        query_template: SQL query with {schema_prefix} placeholder
+        query_template: SQL query with {schema_prefix} and/or {schema} placeholders
         *args: Query parameters
         client: Optional Prisma client for transactions
-        set_public_search_path: If True, sets search_path to include public schema.
-            Needed for pgvector types and other public schema objects.
     Returns:
         Number of affected rows
@@ -215,7 +199,7 @@ async def execute_raw_with_schema(
             client=tx  # Optional transaction client
         )
     """
-    return await _raw_with_schema(query_template, *args, execute=True, client=client, set_public_search_path=set_public_search_path)  # type: ignore
+    return await _raw_with_schema(query_template, *args, execute=True, client=client)  # type: ignore
 class BaseDbModel(BaseModel):
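
Putting the two placeholders together, a hedged usage sketch (the "MyVec" table and its columns are hypothetical; the helper signature matches the diff above):

# {schema_prefix} qualifies tables and enum types; {schema} qualifies
# pgvector types and operators. Both are filled in per call by
# _raw_with_schema(), so no session-level search_path is required.
rows = await query_raw_with_schema(
    """
    SELECT id, 1 - (vec OPERATOR({schema}.<=>) $1::{schema}.vector) AS sim
    FROM {schema_prefix}"MyVec"
    ORDER BY sim DESC
    LIMIT 10
    """,
    embedding_str,  # pgvector literal such as '[0.1,0.2,0.3]'
)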

View File

@@ -1,9 +1,10 @@
 -- CreateExtension
 -- Supabase: pgvector must be enabled via Dashboard → Database → Extensions first
--- Create in public schema so vector type is available across all schemas
+-- Creates extension in current schema (determined by search_path)
+-- The extension may already exist in a different schema (e.g., Supabase pre-enables it)
 DO $$
 BEGIN
-    CREATE EXTENSION IF NOT EXISTS "vector" WITH SCHEMA "public";
+    CREATE EXTENSION IF NOT EXISTS "vector";
 EXCEPTION WHEN OTHERS THEN
     RAISE NOTICE 'vector extension not available or already exists, skipping';
 END $$;
@@ -12,6 +13,7 @@ END $$;
 CREATE TYPE "ContentType" AS ENUM ('STORE_AGENT', 'BLOCK', 'INTEGRATION', 'DOCUMENTATION', 'LIBRARY_AGENT');
 -- CreateTable
+-- Note: vector type is unqualified - relies on search_path including the schema where pgvector is installed
 CREATE TABLE "UnifiedContentEmbedding" (
     "id" TEXT NOT NULL,
     "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
@@ -19,7 +21,7 @@ CREATE TABLE "UnifiedContentEmbedding" (
     "contentType" "ContentType" NOT NULL,
     "contentId" TEXT NOT NULL,
     "userId" TEXT,
-    "embedding" public.vector(1536) NOT NULL,
+    "embedding" vector(1536) NOT NULL,
     "searchableText" TEXT NOT NULL,
     "metadata" JSONB NOT NULL DEFAULT '{}',
@@ -45,4 +47,4 @@ CREATE UNIQUE INDEX "UnifiedContentEmbedding_contentType_contentId_userId_key" O
 -- Uses cosine distance operator (<=>), which matches the query in hybrid_search.py
 -- Note: Drop first in case Prisma created a btree index (Prisma doesn't support HNSW)
 DROP INDEX IF EXISTS "UnifiedContentEmbedding_embedding_idx";
-CREATE INDEX "UnifiedContentEmbedding_embedding_idx" ON "UnifiedContentEmbedding" USING hnsw ("embedding" public.vector_cosine_ops);
+CREATE INDEX "UnifiedContentEmbedding_embedding_idx" ON "UnifiedContentEmbedding" USING hnsw ("embedding" vector_cosine_ops);