Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
ba76dd081e chore(deps): bump actions/cache from 4 to 5
Bumps [actions/cache](https://github.com/actions/cache) from 4 to 5.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v4...v5)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-version: '5'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-15 04:40:39 +00:00
20 changed files with 264 additions and 1492 deletions

View File

@@ -83,7 +83,7 @@ jobs:
- name: Set up Python dependency cache
# On Windows, unpacking cached dependencies takes longer than just installing them
if: runner.os != 'Windows'
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
key: poetry-${{ runner.os }}-${{ hashFiles('classic/original_autogpt/poetry.lock') }}

View File

@@ -55,7 +55,7 @@ jobs:
- name: Set up Python dependency cache
# On Windows, unpacking cached dependencies takes longer than just installing them
if: runner.os != 'Windows'
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
key: poetry-${{ runner.os }}-${{ hashFiles('classic/benchmark/poetry.lock') }}

View File

@@ -107,7 +107,7 @@ jobs:
- name: Set up Python dependency cache
# On Windows, unpacking cached dependencies takes longer than just installing them
if: runner.os != 'Windows'
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ${{ runner.os == 'macOS' && '~/Library/Caches/pypoetry' || '~/.cache/pypoetry' }}
key: poetry-${{ runner.os }}-${{ hashFiles('classic/forge/poetry.lock') }}

View File

@@ -78,7 +78,7 @@ jobs:
python-version: ${{ env.min-python-version }}
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: ${{ runner.os }}-poetry-${{ hashFiles(format('{0}/poetry.lock', matrix.sub-package)) }}
@@ -130,7 +130,7 @@ jobs:
python-version: ${{ env.min-python-version }}
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: ${{ runner.os }}-poetry-${{ hashFiles(format('{0}/poetry.lock', matrix.sub-package)) }}

View File

@@ -41,7 +41,7 @@ jobs:
python-version: "3.11" # Use standard version matching CI
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('autogpt_platform/backend/poetry.lock') }}
@@ -91,7 +91,7 @@ jobs:
echo "PNPM_HOME=$HOME/.pnpm-store" >> $GITHUB_ENV
- name: Cache frontend dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ runner.os }}-pnpm-${{ hashFiles('autogpt_platform/frontend/pnpm-lock.yaml', 'autogpt_platform/frontend/package.json') }}
@@ -124,7 +124,7 @@ jobs:
# Phase 1: Cache and load Docker images for faster setup
- name: Set up Docker image cache
id: docker-cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/docker-cache
# Use a versioned key for cache invalidation when image list changes

View File

@@ -57,7 +57,7 @@ jobs:
python-version: "3.11" # Use standard version matching CI
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('autogpt_platform/backend/poetry.lock') }}
@@ -107,7 +107,7 @@ jobs:
echo "PNPM_HOME=$HOME/.pnpm-store" >> $GITHUB_ENV
- name: Cache frontend dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ runner.os }}-pnpm-${{ hashFiles('autogpt_platform/frontend/pnpm-lock.yaml', 'autogpt_platform/frontend/package.json') }}
@@ -140,7 +140,7 @@ jobs:
# Phase 1: Cache and load Docker images for faster setup
- name: Set up Docker image cache
id: docker-cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/docker-cache
# Use a versioned key for cache invalidation when image list changes

View File

@@ -39,7 +39,7 @@ jobs:
python-version: "3.11" # Use standard version matching CI
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('autogpt_platform/backend/poetry.lock') }}
@@ -89,7 +89,7 @@ jobs:
echo "PNPM_HOME=$HOME/.pnpm-store" >> $GITHUB_ENV
- name: Cache frontend dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ runner.os }}-pnpm-${{ hashFiles('autogpt_platform/frontend/pnpm-lock.yaml', 'autogpt_platform/frontend/package.json') }}
@@ -132,7 +132,7 @@ jobs:
# Phase 1: Cache and load Docker images for faster setup
- name: Set up Docker image cache
id: docker-cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/docker-cache
# Use a versioned key for cache invalidation when image list changes

View File

@@ -88,7 +88,7 @@ jobs:
run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
- name: Set up Python dependency cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.cache/pypoetry
key: poetry-${{ runner.os }}-${{ hashFiles('autogpt_platform/backend/poetry.lock') }}
@@ -209,6 +209,7 @@ jobs:
PLAIN_OUTPUT: True
RUN_ENV: local
PORT: 8080
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# We know these are here, don't report this as a security vulnerability
# This is used as the default credential for the entire system's RabbitMQ instance
# If you want to replace this, you can do so by making our entire system generate

View File

@@ -45,7 +45,7 @@ jobs:
run: echo "key=${{ runner.os }}-pnpm-${{ hashFiles('autogpt_platform/frontend/pnpm-lock.yaml', 'autogpt_platform/frontend/package.json') }}" >> $GITHUB_OUTPUT
- name: Cache dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ steps.cache-key.outputs.key }}
@@ -73,7 +73,7 @@ jobs:
run: corepack enable
- name: Restore dependencies cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ needs.setup.outputs.cache-key }}
@@ -108,7 +108,7 @@ jobs:
run: corepack enable
- name: Restore dependencies cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ needs.setup.outputs.cache-key }}
@@ -164,7 +164,7 @@ jobs:
uses: docker/setup-buildx-action@v3
- name: Cache Docker layers
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: /tmp/.buildx-cache
key: ${{ runner.os }}-buildx-frontend-test-${{ hashFiles('autogpt_platform/docker-compose.yml', 'autogpt_platform/backend/Dockerfile', 'autogpt_platform/backend/pyproject.toml', 'autogpt_platform/backend/poetry.lock') }}
@@ -219,7 +219,7 @@ jobs:
fi
- name: Restore dependencies cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ needs.setup.outputs.cache-key }}

View File

@@ -44,7 +44,7 @@ jobs:
run: echo "key=${{ runner.os }}-pnpm-${{ hashFiles('autogpt_platform/frontend/pnpm-lock.yaml', 'autogpt_platform/frontend/package.json') }}" >> $GITHUB_OUTPUT
- name: Cache dependencies
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ steps.cache-key.outputs.key }}
@@ -88,7 +88,7 @@ jobs:
docker compose -f ../docker-compose.yml --profile local --profile deps_backend up -d
- name: Restore dependencies cache
uses: actions/cache@v4
uses: actions/cache@v5
with:
path: ~/.pnpm-store
key: ${{ needs.setup.outputs.cache-key }}

View File

@@ -1,417 +0,0 @@
"""
Content Type Handlers for Unified Embeddings
Pluggable system for different content sources (store agents, blocks, docs).
Each handler knows how to fetch and process its content type for embedding.
"""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from prisma.enums import ContentType
from backend.data.db import query_raw_with_schema
logger = logging.getLogger(__name__)
@dataclass
class ContentItem:
"""Represents a piece of content to be embedded."""
content_id: str # Unique identifier (DB ID or file path)
content_type: ContentType
searchable_text: str # Combined text for embedding
metadata: dict[str, Any] # Content-specific metadata
user_id: str | None = None # For user-scoped content
class ContentHandler(ABC):
"""Base handler for fetching and processing content for embeddings."""
@property
@abstractmethod
def content_type(self) -> ContentType:
"""The ContentType this handler manages."""
pass
@abstractmethod
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""
Fetch items that don't have embeddings yet.
Args:
batch_size: Maximum number of items to return
Returns:
List of ContentItem objects ready for embedding
"""
pass
@abstractmethod
async def get_stats(self) -> dict[str, int]:
"""
Get statistics about embedding coverage.
Returns:
Dict with keys: total, with_embeddings, without_embeddings
"""
pass
class StoreAgentHandler(ContentHandler):
"""Handler for marketplace store agent listings."""
@property
def content_type(self) -> ContentType:
return ContentType.STORE_AGENT
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch approved store listings without embeddings."""
from backend.api.features.store.embeddings import build_searchable_text
missing = await query_raw_with_schema(
"""
SELECT
slv.id,
slv.name,
slv.description,
slv."subHeading",
slv.categories
FROM {schema_prefix}"StoreListingVersion" slv
LEFT JOIN {schema_prefix}"UnifiedContentEmbedding" uce
ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
AND uce."contentId" IS NULL
LIMIT $1
""",
batch_size,
)
return [
ContentItem(
content_id=row["id"],
content_type=ContentType.STORE_AGENT,
searchable_text=build_searchable_text(
name=row["name"],
description=row["description"],
sub_heading=row["subHeading"],
categories=row["categories"] or [],
),
metadata={
"name": row["name"],
"categories": row["categories"] or [],
},
user_id=None, # Store agents are public
)
for row in missing
]
async def get_stats(self) -> dict[str, int]:
"""Get statistics about store agent embedding coverage."""
# Count approved versions
approved_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion"
WHERE "submissionStatus" = 'APPROVED'
AND "isDeleted" = false
"""
)
total_approved = approved_result[0]["count"] if approved_result else 0
# Count versions with embeddings
embedded_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion" slv
JOIN {schema_prefix}"UnifiedContentEmbedding" uce ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
"""
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_approved,
"with_embeddings": with_embeddings,
"without_embeddings": total_approved - with_embeddings,
}
class BlockHandler(ContentHandler):
"""Handler for block definitions (Python classes)."""
@property
def content_type(self) -> ContentType:
return ContentType.BLOCK
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch blocks without embeddings."""
from backend.data.block import get_blocks
# Get all available blocks
all_blocks = get_blocks()
# Check which ones have embeddings
if not all_blocks:
return []
block_ids = list(all_blocks.keys())
# Query for existing embeddings
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
existing_result = await query_raw_with_schema(
f"""
SELECT "contentId"
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'BLOCK'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*block_ids,
)
existing_ids = {row["contentId"] for row in existing_result}
missing_blocks = [
(block_id, block_cls)
for block_id, block_cls in all_blocks.items()
if block_id not in existing_ids
]
# Convert to ContentItem
items = []
for block_id, block_cls in missing_blocks[:batch_size]:
try:
block_instance = block_cls()
# Build searchable text from block metadata
parts = []
if hasattr(block_instance, "name") and block_instance.name:
parts.append(block_instance.name)
if (
hasattr(block_instance, "description")
and block_instance.description
):
parts.append(block_instance.description)
if hasattr(block_instance, "categories") and block_instance.categories:
# Convert BlockCategory enum to strings
parts.append(
" ".join(str(cat.value) for cat in block_instance.categories)
)
# Add input/output schema info
if hasattr(block_instance, "input_schema"):
schema = block_instance.input_schema
if hasattr(schema, "model_json_schema"):
schema_dict = schema.model_json_schema()
if "properties" in schema_dict:
for prop_name, prop_info in schema_dict[
"properties"
].items():
if "description" in prop_info:
parts.append(
f"{prop_name}: {prop_info['description']}"
)
searchable_text = " ".join(parts)
items.append(
ContentItem(
content_id=block_id,
content_type=ContentType.BLOCK,
searchable_text=searchable_text,
metadata={
"name": getattr(block_instance, "name", ""),
"categories": getattr(block_instance, "categories", []),
},
user_id=None, # Blocks are public
)
)
except Exception as e:
logger.warning(f"Failed to process block {block_id}: {e}")
continue
return items
async def get_stats(self) -> dict[str, int]:
"""Get statistics about block embedding coverage."""
from backend.data.block import get_blocks
all_blocks = get_blocks()
total_blocks = len(all_blocks)
if total_blocks == 0:
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
block_ids = list(all_blocks.keys())
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
embedded_result = await query_raw_with_schema(
f"""
SELECT COUNT(*) as count
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'BLOCK'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*block_ids,
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_blocks,
"with_embeddings": with_embeddings,
"without_embeddings": total_blocks - with_embeddings,
}
class DocumentationHandler(ContentHandler):
"""Handler for documentation files (.md/.mdx)."""
@property
def content_type(self) -> ContentType:
return ContentType.DOCUMENTATION
def _get_docs_root(self) -> Path:
"""Get the documentation root directory."""
# Assuming docs are in /docs relative to project root
backend_root = Path(__file__).parent.parent.parent.parent
docs_root = backend_root.parent.parent / "docs"
return docs_root
def _extract_title_and_content(self, file_path: Path) -> tuple[str, str]:
"""Extract title and content from markdown file."""
try:
content = file_path.read_text(encoding="utf-8")
# Try to extract title from first # heading
lines = content.split("\n")
title = ""
body_lines = []
for line in lines:
if line.startswith("# ") and not title:
title = line[2:].strip()
else:
body_lines.append(line)
# If no title found, use filename
if not title:
title = file_path.stem.replace("-", " ").replace("_", " ").title()
body = "\n".join(body_lines)
return title, body
except Exception as e:
logger.warning(f"Failed to read {file_path}: {e}")
return file_path.stem, ""
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch documentation files without embeddings."""
docs_root = self._get_docs_root()
if not docs_root.exists():
logger.warning(f"Documentation root not found: {docs_root}")
return []
# Find all .md and .mdx files
all_docs = list(docs_root.rglob("*.md")) + list(docs_root.rglob("*.mdx"))
# Get relative paths for content IDs
doc_paths = [str(doc.relative_to(docs_root)) for doc in all_docs]
if not doc_paths:
return []
# Check which ones have embeddings
placeholders = ",".join([f"${i+1}" for i in range(len(doc_paths))])
existing_result = await query_raw_with_schema(
f"""
SELECT "contentId"
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'DOCUMENTATION'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*doc_paths,
)
existing_ids = {row["contentId"] for row in existing_result}
missing_docs = [
(doc_path, doc_file)
for doc_path, doc_file in zip(doc_paths, all_docs)
if doc_path not in existing_ids
]
# Convert to ContentItem
items = []
for doc_path, doc_file in missing_docs[:batch_size]:
try:
title, content = self._extract_title_and_content(doc_file)
# Build searchable text
searchable_text = f"{title} {content}"
items.append(
ContentItem(
content_id=doc_path,
content_type=ContentType.DOCUMENTATION,
searchable_text=searchable_text,
metadata={
"title": title,
"path": doc_path,
},
user_id=None, # Documentation is public
)
)
except Exception as e:
logger.warning(f"Failed to process doc {doc_path}: {e}")
continue
return items
async def get_stats(self) -> dict[str, int]:
"""Get statistics about documentation embedding coverage."""
docs_root = self._get_docs_root()
if not docs_root.exists():
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
# Count all .md and .mdx files
all_docs = list(docs_root.rglob("*.md")) + list(docs_root.rglob("*.mdx"))
total_docs = len(all_docs)
if total_docs == 0:
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
doc_paths = [str(doc.relative_to(docs_root)) for doc in all_docs]
placeholders = ",".join([f"${i+1}" for i in range(len(doc_paths))])
embedded_result = await query_raw_with_schema(
f"""
SELECT COUNT(*) as count
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'DOCUMENTATION'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*doc_paths,
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_docs,
"with_embeddings": with_embeddings,
"without_embeddings": total_docs - with_embeddings,
}
# Content handler registry
CONTENT_HANDLERS: dict[ContentType, ContentHandler] = {
ContentType.STORE_AGENT: StoreAgentHandler(),
ContentType.BLOCK: BlockHandler(),
ContentType.DOCUMENTATION: DocumentationHandler(),
}

View File

@@ -1,215 +0,0 @@
"""
Integration tests for content handlers using real DB.
Run with: poetry run pytest backend/api/features/store/content_handlers_integration_test.py -xvs
These tests use the real database but mock OpenAI calls.
"""
from unittest.mock import patch
import pytest
from backend.api.features.store.content_handlers import (
CONTENT_HANDLERS,
BlockHandler,
DocumentationHandler,
StoreAgentHandler,
)
from backend.api.features.store.embeddings import (
EMBEDDING_DIM,
backfill_all_content_types,
ensure_content_embedding,
get_embedding_stats,
)
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_real_db():
"""Test StoreAgentHandler with real database queries."""
handler = StoreAgentHandler()
# Get stats from real DB
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list (may be empty if all have embeddings)
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None
assert item.content_type.value == "STORE_AGENT"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_real_db():
"""Test BlockHandler with real database queries."""
handler = BlockHandler()
# Get stats from real DB
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0 # Should have at least some blocks
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None # Should be block UUID
assert item.content_type.value == "BLOCK"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_real_fs():
"""Test DocumentationHandler with real filesystem."""
handler = DocumentationHandler()
# Get stats from real filesystem
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None # Should be relative path
assert item.content_type.value == "DOCUMENTATION"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_get_embedding_stats_all_types():
"""Test get_embedding_stats aggregates all content types."""
stats = await get_embedding_stats()
# Should have structure with by_type and totals
assert "by_type" in stats
assert "totals" in stats
# Check each content type is present
by_type = stats["by_type"]
assert "STORE_AGENT" in by_type
assert "BLOCK" in by_type
assert "DOCUMENTATION" in by_type
# Check totals are aggregated
totals = stats["totals"]
assert totals["total"] >= 0
assert totals["with_embeddings"] >= 0
assert totals["without_embeddings"] >= 0
assert "coverage_percent" in totals
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.generate_embedding")
async def test_ensure_content_embedding_blocks(mock_generate):
"""Test creating embeddings for blocks (mocked OpenAI)."""
# Mock OpenAI to return fake embedding
mock_generate.return_value = [0.1] * EMBEDDING_DIM
# Get one block without embedding
handler = BlockHandler()
items = await handler.get_missing_items(batch_size=1)
if not items:
pytest.skip("No blocks without embeddings")
item = items[0]
# Try to create embedding (OpenAI mocked)
result = await ensure_content_embedding(
content_type=item.content_type,
content_id=item.content_id,
searchable_text=item.searchable_text,
metadata=item.metadata,
user_id=item.user_id,
)
# Should succeed with mocked OpenAI
assert result is True
mock_generate.assert_called_once()
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.generate_embedding")
async def test_backfill_all_content_types_dry_run(mock_generate):
"""Test backfill_all_content_types processes all handlers in order."""
# Mock OpenAI to return fake embedding
mock_generate.return_value = [0.1] * EMBEDDING_DIM
# Run backfill with batch_size=1 to process max 1 per type
result = await backfill_all_content_types(batch_size=1)
# Should have results for all content types
assert "by_type" in result
assert "totals" in result
by_type = result["by_type"]
assert "BLOCK" in by_type
assert "STORE_AGENT" in by_type
assert "DOCUMENTATION" in by_type
# Each type should have correct structure
for content_type, type_result in by_type.items():
assert "processed" in type_result
assert "success" in type_result
assert "failed" in type_result
# Totals should aggregate
totals = result["totals"]
assert totals["processed"] >= 0
assert totals["success"] >= 0
assert totals["failed"] >= 0
@pytest.mark.asyncio(loop_scope="session")
async def test_content_handler_registry():
"""Test all handlers are registered in correct order."""
from prisma.enums import ContentType
# All three types should be registered
assert ContentType.STORE_AGENT in CONTENT_HANDLERS
assert ContentType.BLOCK in CONTENT_HANDLERS
assert ContentType.DOCUMENTATION in CONTENT_HANDLERS
# Check handler types
assert isinstance(CONTENT_HANDLERS[ContentType.STORE_AGENT], StoreAgentHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.BLOCK], BlockHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.DOCUMENTATION], DocumentationHandler)

View File

@@ -1,324 +0,0 @@
"""
E2E tests for content handlers (blocks, store agents, documentation).
Tests the full flow: discovering content → generating embeddings → storing.
"""
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from prisma.enums import ContentType
from backend.api.features.store.content_handlers import (
CONTENT_HANDLERS,
BlockHandler,
DocumentationHandler,
StoreAgentHandler,
)
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_get_missing_items(mocker):
"""Test StoreAgentHandler fetches approved agents without embeddings."""
handler = StoreAgentHandler()
# Mock database query
mock_missing = [
{
"id": "agent-1",
"name": "Test Agent",
"description": "A test agent",
"subHeading": "Test heading",
"categories": ["AI", "Testing"],
}
]
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_missing,
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].content_id == "agent-1"
assert items[0].content_type == ContentType.STORE_AGENT
assert "Test Agent" in items[0].searchable_text
assert "A test agent" in items[0].searchable_text
assert items[0].metadata["name"] == "Test Agent"
assert items[0].user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_get_stats(mocker):
"""Test StoreAgentHandler returns correct stats."""
handler = StoreAgentHandler()
# Mock approved count query
mock_approved = [{"count": 50}]
# Mock embedded count query
mock_embedded = [{"count": 30}]
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
side_effect=[mock_approved, mock_embedded],
):
stats = await handler.get_stats()
assert stats["total"] == 50
assert stats["with_embeddings"] == 30
assert stats["without_embeddings"] == 20
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_get_missing_items(mocker):
"""Test BlockHandler discovers blocks without embeddings."""
handler = BlockHandler()
# Mock get_blocks to return test blocks
mock_block_class = MagicMock()
mock_block_instance = MagicMock()
mock_block_instance.name = "Calculator Block"
mock_block_instance.description = "Performs calculations"
mock_block_instance.categories = [MagicMock(value="MATH")]
mock_block_instance.input_schema.model_json_schema.return_value = {
"properties": {"expression": {"description": "Math expression to evaluate"}}
}
mock_block_class.return_value = mock_block_instance
mock_blocks = {"block-uuid-1": mock_block_class}
# Mock existing embeddings query (no embeddings exist)
mock_existing = []
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_existing,
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].content_id == "block-uuid-1"
assert items[0].content_type == ContentType.BLOCK
assert "Calculator Block" in items[0].searchable_text
assert "Performs calculations" in items[0].searchable_text
assert "MATH" in items[0].searchable_text
assert "expression: Math expression" in items[0].searchable_text
assert items[0].user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_get_stats(mocker):
"""Test BlockHandler returns correct stats."""
handler = BlockHandler()
# Mock get_blocks
mock_blocks = {
"block-1": MagicMock(),
"block-2": MagicMock(),
"block-3": MagicMock(),
}
# Mock embedded count query (2 blocks have embeddings)
mock_embedded = [{"count": 2}]
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_embedded,
):
stats = await handler.get_stats()
assert stats["total"] == 3
assert stats["with_embeddings"] == 2
assert stats["without_embeddings"] == 1
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_get_missing_items(tmp_path, mocker):
"""Test DocumentationHandler discovers docs without embeddings."""
handler = DocumentationHandler()
# Create temporary docs directory with test files
docs_root = tmp_path / "docs"
docs_root.mkdir()
(docs_root / "guide.md").write_text("# Getting Started\n\nThis is a guide.")
(docs_root / "api.mdx").write_text("# API Reference\n\nAPI documentation.")
# Mock _get_docs_root to return temp dir
with patch.object(handler, "_get_docs_root", return_value=docs_root):
# Mock existing embeddings query (no embeddings exist)
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 2
# Check guide.md
guide_item = next(
(item for item in items if item.content_id == "guide.md"), None
)
assert guide_item is not None
assert guide_item.content_type == ContentType.DOCUMENTATION
assert "Getting Started" in guide_item.searchable_text
assert "This is a guide" in guide_item.searchable_text
assert guide_item.metadata["title"] == "Getting Started"
assert guide_item.user_id is None
# Check api.mdx
api_item = next(
(item for item in items if item.content_id == "api.mdx"), None
)
assert api_item is not None
assert "API Reference" in api_item.searchable_text
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_get_stats(tmp_path, mocker):
"""Test DocumentationHandler returns correct stats."""
handler = DocumentationHandler()
# Create temporary docs directory
docs_root = tmp_path / "docs"
docs_root.mkdir()
(docs_root / "doc1.md").write_text("# Doc 1")
(docs_root / "doc2.md").write_text("# Doc 2")
(docs_root / "doc3.mdx").write_text("# Doc 3")
# Mock embedded count query (1 doc has embedding)
mock_embedded = [{"count": 1}]
with patch.object(handler, "_get_docs_root", return_value=docs_root):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_embedded,
):
stats = await handler.get_stats()
assert stats["total"] == 3
assert stats["with_embeddings"] == 1
assert stats["without_embeddings"] == 2
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_title_extraction(tmp_path):
"""Test DocumentationHandler extracts title from markdown heading."""
handler = DocumentationHandler()
# Test with heading
doc_with_heading = tmp_path / "with_heading.md"
doc_with_heading.write_text("# My Title\n\nContent here")
title, content = handler._extract_title_and_content(doc_with_heading)
assert title == "My Title"
assert "# My Title" not in content
assert "Content here" in content
# Test without heading
doc_without_heading = tmp_path / "no-heading.md"
doc_without_heading.write_text("Just content, no heading")
title, content = handler._extract_title_and_content(doc_without_heading)
assert title == "No Heading" # Uses filename
assert "Just content" in content
@pytest.mark.asyncio(loop_scope="session")
async def test_content_handlers_registry():
"""Test all content types are registered."""
assert ContentType.STORE_AGENT in CONTENT_HANDLERS
assert ContentType.BLOCK in CONTENT_HANDLERS
assert ContentType.DOCUMENTATION in CONTENT_HANDLERS
assert isinstance(CONTENT_HANDLERS[ContentType.STORE_AGENT], StoreAgentHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.BLOCK], BlockHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.DOCUMENTATION], DocumentationHandler)
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_handles_missing_attributes():
"""Test BlockHandler gracefully handles blocks with missing attributes."""
handler = BlockHandler()
# Mock block with minimal attributes
mock_block_class = MagicMock()
mock_block_instance = MagicMock()
mock_block_instance.name = "Minimal Block"
# No description, categories, or schema
del mock_block_instance.description
del mock_block_instance.categories
del mock_block_instance.input_schema
mock_block_class.return_value = mock_block_instance
mock_blocks = {"block-minimal": mock_block_class}
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].searchable_text == "Minimal Block"
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_skips_failed_blocks():
"""Test BlockHandler skips blocks that fail to instantiate."""
handler = BlockHandler()
# Mock one good block and one bad block
good_block = MagicMock()
good_instance = MagicMock()
good_instance.name = "Good Block"
good_instance.description = "Works fine"
good_instance.categories = []
good_block.return_value = good_instance
bad_block = MagicMock()
bad_block.side_effect = Exception("Instantiation failed")
mock_blocks = {"good-block": good_block, "bad-block": bad_block}
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
# Should only get the good block
assert len(items) == 1
assert items[0].content_id == "good-block"
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_missing_docs_directory():
"""Test DocumentationHandler handles missing docs directory gracefully."""
handler = DocumentationHandler()
# Mock _get_docs_root to return non-existent path
fake_path = Path("/nonexistent/docs")
with patch.object(handler, "_get_docs_root", return_value=fake_path):
items = await handler.get_missing_items(batch_size=10)
assert items == []
stats = await handler.get_stats()
assert stats["total"] == 0
assert stats["with_embeddings"] == 0
assert stats["without_embeddings"] == 0

View File

@@ -14,7 +14,6 @@ import prisma
from prisma.enums import ContentType
from tiktoken import encoding_for_model
from backend.api.features.store.content_handlers import CONTENT_HANDLERS
from backend.data.db import execute_raw_with_schema, query_raw_with_schema
from backend.util.clients import get_openai_client
from backend.util.json import dumps
@@ -24,9 +23,6 @@ logger = logging.getLogger(__name__)
# OpenAI embedding model configuration
EMBEDDING_MODEL = "text-embedding-3-small"
# Embedding dimension for the model above
# text-embedding-3-small: 1536, text-embedding-3-large: 3072
EMBEDDING_DIM = 1536
# OpenAI embedding token limit (8,191 with 1 token buffer for safety)
EMBEDDING_MAX_TOKENS = 8191
@@ -373,69 +369,55 @@ async def delete_content_embedding(
async def get_embedding_stats() -> dict[str, Any]:
"""
Get statistics about embedding coverage for all content types.
Get statistics about embedding coverage.
Returns stats per content type and overall totals.
Returns counts of:
- Total approved listing versions
- Versions with embeddings
- Versions without embeddings
"""
try:
stats_by_type = {}
total_items = 0
total_with_embeddings = 0
total_without_embeddings = 0
# Count approved versions
approved_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion"
WHERE "submissionStatus" = 'APPROVED'
AND "isDeleted" = false
"""
)
total_approved = approved_result[0]["count"] if approved_result else 0
# Aggregate stats from all handlers
for content_type, handler in CONTENT_HANDLERS.items():
try:
stats = await handler.get_stats()
stats_by_type[content_type.value] = {
"total": stats["total"],
"with_embeddings": stats["with_embeddings"],
"without_embeddings": stats["without_embeddings"],
"coverage_percent": (
round(stats["with_embeddings"] / stats["total"] * 100, 1)
if stats["total"] > 0
else 0
),
}
total_items += stats["total"]
total_with_embeddings += stats["with_embeddings"]
total_without_embeddings += stats["without_embeddings"]
except Exception as e:
logger.error(f"Failed to get stats for {content_type.value}: {e}")
stats_by_type[content_type.value] = {
"total": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
"error": str(e),
}
# Count versions with embeddings
embedded_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion" slv
JOIN {schema_prefix}"UnifiedContentEmbedding" uce ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
"""
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"by_type": stats_by_type,
"totals": {
"total": total_items,
"with_embeddings": total_with_embeddings,
"without_embeddings": total_without_embeddings,
"coverage_percent": (
round(total_with_embeddings / total_items * 100, 1)
if total_items > 0
else 0
),
},
"total_approved": total_approved,
"with_embeddings": with_embeddings,
"without_embeddings": total_approved - with_embeddings,
"coverage_percent": (
round(with_embeddings / total_approved * 100, 1)
if total_approved > 0
else 0
),
}
except Exception as e:
logger.error(f"Failed to get embedding stats: {e}")
return {
"by_type": {},
"totals": {
"total": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
},
"total_approved": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
"error": str(e),
}
@@ -444,118 +426,73 @@ async def backfill_missing_embeddings(batch_size: int = 10) -> dict[str, Any]:
"""
Generate embeddings for approved listings that don't have them.
BACKWARD COMPATIBILITY: Maintained for existing usage.
This now delegates to backfill_all_content_types() to process all content types.
Args:
batch_size: Number of embeddings to generate per content type
batch_size: Number of embeddings to generate in one call
Returns:
Dict with success/failure counts aggregated across all content types
Dict with success/failure counts
"""
# Delegate to the new generic backfill system
result = await backfill_all_content_types(batch_size)
try:
# Find approved versions without embeddings
missing = await query_raw_with_schema(
"""
SELECT
slv.id,
slv.name,
slv.description,
slv."subHeading",
slv.categories
FROM {schema_prefix}"StoreListingVersion" slv
LEFT JOIN {schema_prefix}"UnifiedContentEmbedding" uce
ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
AND uce."contentId" IS NULL
LIMIT $1
""",
batch_size,
)
# Return in the old format for backward compatibility
return result["totals"]
async def backfill_all_content_types(batch_size: int = 10) -> dict[str, Any]:
"""
Generate embeddings for all content types using registered handlers.
Processes content types in order: BLOCK → STORE_AGENT → DOCUMENTATION.
This ensures foundational content (blocks) are searchable first.
Args:
batch_size: Number of embeddings to generate per content type
Returns:
Dict with stats per content type and overall totals
"""
results_by_type = {}
total_processed = 0
total_success = 0
total_failed = 0
# Process content types in explicit order
processing_order = [
ContentType.BLOCK,
ContentType.STORE_AGENT,
ContentType.DOCUMENTATION,
]
for content_type in processing_order:
handler = CONTENT_HANDLERS.get(content_type)
if not handler:
logger.warning(f"No handler registered for {content_type.value}")
continue
try:
logger.info(f"Processing {content_type.value} content type...")
# Get missing items from handler
missing_items = await handler.get_missing_items(batch_size)
if not missing_items:
results_by_type[content_type.value] = {
"processed": 0,
"success": 0,
"failed": 0,
"message": "No missing embeddings",
}
continue
# Process embeddings concurrently for better performance
embedding_tasks = [
ensure_content_embedding(
content_type=item.content_type,
content_id=item.content_id,
searchable_text=item.searchable_text,
metadata=item.metadata,
user_id=item.user_id,
)
for item in missing_items
]
results = await asyncio.gather(*embedding_tasks, return_exceptions=True)
success = sum(1 for result in results if result is True)
failed = len(results) - success
results_by_type[content_type.value] = {
"processed": len(missing_items),
"success": success,
"failed": failed,
"message": f"Backfilled {success} embeddings, {failed} failed",
}
total_processed += len(missing_items)
total_success += success
total_failed += failed
logger.info(
f"{content_type.value}: processed {len(missing_items)}, "
f"success {success}, failed {failed}"
)
except Exception as e:
logger.error(f"Failed to process {content_type.value}: {e}")
results_by_type[content_type.value] = {
if not missing:
return {
"processed": 0,
"success": 0,
"failed": 0,
"error": str(e),
"message": "No missing embeddings",
}
return {
"by_type": results_by_type,
"totals": {
"processed": total_processed,
"success": total_success,
"failed": total_failed,
"message": f"Overall: {total_success} succeeded, {total_failed} failed",
},
}
# Process embeddings concurrently for better performance
embedding_tasks = [
ensure_embedding(
version_id=row["id"],
name=row["name"],
description=row["description"],
sub_heading=row["subHeading"],
categories=row["categories"] or [],
)
for row in missing
]
results = await asyncio.gather(*embedding_tasks, return_exceptions=True)
success = sum(1 for result in results if result is True)
failed = len(results) - success
return {
"processed": len(missing),
"success": success,
"failed": failed,
"message": f"Backfilled {success} embeddings, {failed} failed",
}
except Exception as e:
logger.error(f"Failed to backfill embeddings: {e}")
return {
"processed": 0,
"success": 0,
"failed": 0,
"error": str(e),
}
async def embed_query(query: str) -> list[float] | None:
@@ -629,109 +566,3 @@ async def ensure_content_embedding(
except Exception as e:
logger.error(f"Failed to ensure embedding for {content_type}:{content_id}: {e}")
return False
async def cleanup_orphaned_embeddings() -> dict[str, Any]:
"""
Clean up embeddings for blocks and docs that no longer exist.
Compares current blocks/docs with embeddings in database and removes orphaned records.
Store agents are NOT cleaned up - they're properly filtered during search.
Returns:
Dict with cleanup statistics per content type
"""
from backend.api.features.store.content_handlers import CONTENT_HANDLERS
from backend.data.db import query_raw_with_schema
results_by_type = {}
total_deleted = 0
# Only cleanup BLOCK and DOCUMENTATION - store agents are filtered during search
cleanup_types = [ContentType.BLOCK, ContentType.DOCUMENTATION]
for content_type in cleanup_types:
try:
handler = CONTENT_HANDLERS.get(content_type)
if not handler:
logger.warning(f"No handler registered for {content_type}")
results_by_type[content_type.value] = {
"deleted": 0,
"error": "No handler registered",
}
continue
# Get all current content IDs from handler
if content_type == ContentType.BLOCK:
from backend.data.block import get_blocks
current_ids = set(get_blocks().keys())
elif content_type == ContentType.DOCUMENTATION:
from pathlib import Path
backend_root = Path(__file__).parent.parent.parent.parent
docs_root = backend_root.parent.parent / "docs"
if docs_root.exists():
all_docs = list(docs_root.rglob("*.md")) + list(
docs_root.rglob("*.mdx")
)
current_ids = {str(doc.relative_to(docs_root)) for doc in all_docs}
else:
current_ids = set()
else:
current_ids = set()
# Get all embedding IDs from database
db_embeddings = await query_raw_with_schema(
"""
SELECT "contentId"
FROM {schema_prefix}"UnifiedContentEmbedding"
WHERE "contentType" = $1::{schema_prefix}"ContentType"
""",
content_type,
)
db_ids = {row["contentId"] for row in db_embeddings}
# Find orphaned embeddings (in DB but not in current content)
orphaned_ids = db_ids - current_ids
if not orphaned_ids:
logger.info(f"{content_type.value}: No orphaned embeddings found")
results_by_type[content_type.value] = {
"deleted": 0,
"message": "No orphaned embeddings",
}
continue
# Delete orphaned embeddings
deleted = 0
for content_id in orphaned_ids:
if await delete_content_embedding(content_type, content_id):
deleted += 1
logger.info(
f"{content_type.value}: Deleted {deleted}/{len(orphaned_ids)} orphaned embeddings"
)
results_by_type[content_type.value] = {
"deleted": deleted,
"orphaned": len(orphaned_ids),
"message": f"Deleted {deleted} orphaned embeddings",
}
total_deleted += deleted
except Exception as e:
logger.error(f"Failed to cleanup {content_type.value}: {e}")
results_by_type[content_type.value] = {
"deleted": 0,
"error": str(e),
}
return {
"by_type": results_by_type,
"totals": {
"deleted": total_deleted,
"message": f"Deleted {total_deleted} orphaned embeddings",
},
}

View File

@@ -4,13 +4,12 @@ Integration tests for embeddings with schema handling.
These tests verify that embeddings operations work correctly across different database schemas.
"""
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, patch
import pytest
from prisma.enums import ContentType
from backend.api.features.store import embeddings
from backend.api.features.store.embeddings import EMBEDDING_DIM
# Schema prefix tests removed - functionality moved to db.raw_with_schema() helper
@@ -29,7 +28,7 @@ async def test_store_content_embedding_with_schema():
result = await embeddings.store_content_embedding(
content_type=ContentType.STORE_AGENT,
content_id="test-id",
embedding=[0.1] * EMBEDDING_DIM,
embedding=[0.1] * 1536,
searchable_text="test text",
metadata={"test": "data"},
user_id=None,
@@ -126,69 +125,84 @@ async def test_delete_content_embedding_with_schema():
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_get_embedding_stats_with_schema():
"""Test embedding statistics with proper schema handling via content handlers."""
# Mock handler to return stats
mock_handler = MagicMock()
mock_handler.get_stats = AsyncMock(
return_value={
"total": 100,
"with_embeddings": 80,
"without_embeddings": 20,
}
)
"""Test embedding statistics with proper schema handling."""
with patch("backend.data.db.get_database_schema") as mock_schema:
mock_schema.return_value = "platform"
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
result = await embeddings.get_embedding_stats()
with patch("prisma.get_client") as mock_get_client:
mock_client = AsyncMock()
# Mock both query results
mock_client.query_raw.side_effect = [
[{"count": 100}], # total_approved
[{"count": 80}], # with_embeddings
]
mock_get_client.return_value = mock_client
# Verify handler was called
mock_handler.get_stats.assert_called_once()
result = await embeddings.get_embedding_stats()
# Verify new result structure
assert "by_type" in result
assert "totals" in result
assert result["totals"]["total"] == 100
assert result["totals"]["with_embeddings"] == 80
assert result["totals"]["without_embeddings"] == 20
assert result["totals"]["coverage_percent"] == 80.0
# Verify both queries were called
assert mock_client.query_raw.call_count == 2
# Get both SQL queries
first_call = mock_client.query_raw.call_args_list[0]
second_call = mock_client.query_raw.call_args_list[1]
first_sql = first_call[0][0]
second_sql = second_call[0][0]
# Verify schema prefix in both queries
assert '"platform"."StoreListingVersion"' in first_sql
assert '"platform"."StoreListingVersion"' in second_sql
assert '"platform"."UnifiedContentEmbedding"' in second_sql
# Verify results
assert result["total_approved"] == 100
assert result["with_embeddings"] == 80
assert result["without_embeddings"] == 20
assert result["coverage_percent"] == 80.0
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_backfill_missing_embeddings_with_schema():
"""Test backfilling embeddings via content handlers."""
from backend.api.features.store.content_handlers import ContentItem
"""Test backfilling embeddings with proper schema handling."""
with patch("backend.data.db.get_database_schema") as mock_schema:
mock_schema.return_value = "platform"
# Create mock content item
mock_item = ContentItem(
content_id="version-1",
content_type=ContentType.STORE_AGENT,
searchable_text="Test Agent Test description",
metadata={"name": "Test Agent"},
)
with patch("prisma.get_client") as mock_get_client:
mock_client = AsyncMock()
# Mock missing embeddings query
mock_client.query_raw.return_value = [
{
"id": "version-1",
"name": "Test Agent",
"description": "Test description",
"subHeading": "Test heading",
"categories": ["test"],
}
]
mock_get_client.return_value = mock_client
# Mock handler
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=[mock_item])
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
with patch(
"backend.api.features.store.embeddings.generate_embedding",
return_value=[0.1] * EMBEDDING_DIM,
):
with patch(
"backend.api.features.store.embeddings.store_content_embedding",
return_value=True,
):
"backend.api.features.store.embeddings.ensure_embedding"
) as mock_ensure:
mock_ensure.return_value = True
result = await embeddings.backfill_missing_embeddings(batch_size=10)
# Verify handler was called
mock_handler.get_missing_items.assert_called_once_with(10)
# Verify the query was called
assert mock_client.query_raw.called
# Get the SQL query
call_args = mock_client.query_raw.call_args
sql_query = call_args[0][0]
# Verify schema prefix in query
assert '"platform"."StoreListingVersion"' in sql_query
assert '"platform"."UnifiedContentEmbedding"' in sql_query
# Verify ensure_embedding was called
assert mock_ensure.called
# Verify results
assert result["processed"] == 1
@@ -212,7 +226,7 @@ async def test_ensure_content_embedding_with_schema():
with patch(
"backend.api.features.store.embeddings.generate_embedding"
) as mock_generate:
mock_generate.return_value = [0.1] * EMBEDDING_DIM
mock_generate.return_value = [0.1] * 1536
with patch(
"backend.api.features.store.embeddings.store_content_embedding"
@@ -246,7 +260,7 @@ async def test_backward_compatibility_store_embedding():
result = await embeddings.store_embedding(
version_id="test-version-id",
embedding=[0.1] * EMBEDDING_DIM,
embedding=[0.1] * 1536,
tx=None,
)
@@ -301,7 +315,7 @@ async def test_schema_handling_error_cases():
result = await embeddings.store_content_embedding(
content_type=ContentType.STORE_AGENT,
content_id="test-id",
embedding=[0.1] * EMBEDDING_DIM,
embedding=[0.1] * 1536,
searchable_text="test",
metadata=None,
user_id=None,

View File

@@ -63,7 +63,7 @@ async def test_generate_embedding_success():
result = await embeddings.generate_embedding("test text")
assert result is not None
assert len(result) == embeddings.EMBEDDING_DIM
assert len(result) == 1536
assert result[0] == 0.1
mock_client.embeddings.create.assert_called_once_with(
@@ -110,7 +110,7 @@ async def test_generate_embedding_text_truncation():
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.data = [MagicMock()]
mock_response.data[0].embedding = [0.1] * embeddings.EMBEDDING_DIM
mock_response.data[0].embedding = [0.1] * 1536
# Use AsyncMock for async embeddings.create method
mock_client.embeddings.create = AsyncMock(return_value=mock_response)
@@ -297,92 +297,72 @@ async def test_ensure_embedding_generation_fails(mock_get, mock_generate):
@pytest.mark.asyncio(loop_scope="session")
async def test_get_embedding_stats():
"""Test embedding statistics retrieval."""
# Mock handler stats for each content type
mock_handler = MagicMock()
mock_handler.get_stats = AsyncMock(
return_value={
"total": 100,
"with_embeddings": 75,
"without_embeddings": 25,
}
)
# Mock approved count query and embedded count query
mock_approved_result = [{"count": 100}]
mock_embedded_result = [{"count": 75}]
# Patch the CONTENT_HANDLERS where it's used (in embeddings module)
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
"backend.api.features.store.embeddings.query_raw_with_schema",
side_effect=[mock_approved_result, mock_embedded_result],
):
result = await embeddings.get_embedding_stats()
assert "by_type" in result
assert "totals" in result
assert result["totals"]["total"] == 100
assert result["totals"]["with_embeddings"] == 75
assert result["totals"]["without_embeddings"] == 25
assert result["totals"]["coverage_percent"] == 75.0
assert result["total_approved"] == 100
assert result["with_embeddings"] == 75
assert result["without_embeddings"] == 25
assert result["coverage_percent"] == 75.0
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.store_content_embedding")
async def test_backfill_missing_embeddings_success(mock_store):
@patch("backend.api.features.store.embeddings.ensure_embedding")
async def test_backfill_missing_embeddings_success(mock_ensure):
"""Test backfill with successful embedding generation."""
# Mock ContentItem from handlers
from backend.api.features.store.content_handlers import ContentItem
mock_items = [
ContentItem(
content_id="version-1",
content_type=ContentType.STORE_AGENT,
searchable_text="Agent 1 Description 1",
metadata={"name": "Agent 1"},
),
ContentItem(
content_id="version-2",
content_type=ContentType.STORE_AGENT,
searchable_text="Agent 2 Description 2",
metadata={"name": "Agent 2"},
),
# Mock missing embeddings query
mock_missing = [
{
"id": "version-1",
"name": "Agent 1",
"description": "Description 1",
"subHeading": "Heading 1",
"categories": ["AI"],
},
{
"id": "version-2",
"name": "Agent 2",
"description": "Description 2",
"subHeading": "Heading 2",
"categories": ["Productivity"],
},
]
# Mock handler to return missing items
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=mock_items)
# Mock store_content_embedding to succeed for first, fail for second
mock_store.side_effect = [True, False]
# Mock ensure_embedding to succeed for first, fail for second
mock_ensure.side_effect = [True, False]
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
"backend.api.features.store.embeddings.query_raw_with_schema",
return_value=mock_missing,
):
with patch(
"backend.api.features.store.embeddings.generate_embedding",
return_value=[0.1] * embeddings.EMBEDDING_DIM,
):
result = await embeddings.backfill_missing_embeddings(batch_size=5)
result = await embeddings.backfill_missing_embeddings(batch_size=5)
assert result["processed"] == 2
assert result["success"] == 1
assert result["failed"] == 1
assert mock_store.call_count == 2
assert result["processed"] == 2
assert result["success"] == 1
assert result["failed"] == 1
assert mock_ensure.call_count == 2
@pytest.mark.asyncio(loop_scope="session")
async def test_backfill_missing_embeddings_no_missing():
"""Test backfill when no embeddings are missing."""
# Mock handler to return no missing items
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=[])
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
"backend.api.features.store.embeddings.query_raw_with_schema",
return_value=[],
):
result = await embeddings.backfill_missing_embeddings(batch_size=5)
assert result["processed"] == 0
assert result["success"] == 0
assert result["failed"] == 0
assert result["message"] == "No missing embeddings"
@pytest.mark.asyncio(loop_scope="session")

View File

@@ -11,7 +11,6 @@ from datetime import datetime
from typing import Any, Literal
from backend.api.features.store.embeddings import (
EMBEDDING_DIM,
embed_query,
embedding_to_vector_string,
)
@@ -179,39 +178,15 @@ async def hybrid_search(
# No user input is concatenated directly into the SQL string
where_clause = " AND ".join(where_parts)
# Graceful degradation: fall back to lexical-only search if embedding unavailable
# Embedding is required for hybrid search - fail fast if unavailable
if query_embedding is None or not query_embedding:
logger.warning(
"Failed to generate query embedding - falling back to lexical-only search. "
# Log detailed error server-side
logger.error(
"Failed to generate query embedding. "
"Check that openai_internal_api_key is configured and OpenAI API is accessible."
)
# Use zero embedding (semantic score will be 0)
query_embedding = [0.0] * EMBEDDING_DIM
# Adjust weights: redistribute semantic weight to other components
# Semantic becomes 0, lexical increases proportionally
total_non_semantic = (
weights.lexical + weights.category + weights.recency + weights.popularity
)
if total_non_semantic > 0:
# Redistribute semantic weight proportionally to other components
redistribution_factor = 1.0 / total_non_semantic
weights = HybridSearchWeights(
semantic=0.0,
lexical=weights.lexical * redistribution_factor,
category=weights.category * redistribution_factor,
recency=weights.recency * redistribution_factor,
popularity=weights.popularity * redistribution_factor,
)
else:
# Fallback: all weight to lexical if other components are also 0
weights = HybridSearchWeights(
semantic=0.0,
lexical=1.0,
category=0.0,
recency=0.0,
popularity=0.0,
)
# Raise generic error to client
raise ValueError("Search service temporarily unavailable")
# Add embedding parameter
embedding_str = embedding_to_vector_string(query_embedding)

View File

@@ -8,7 +8,6 @@ from unittest.mock import patch
import pytest
from backend.api.features.store import embeddings
from backend.api.features.store.hybrid_search import HybridSearchWeights, hybrid_search
@@ -50,7 +49,7 @@ async def test_hybrid_search_with_schema_handling():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM # Mock embedding
mock_embed.return_value = [0.1] * 1536 # Mock embedding
results, total = await hybrid_search(
query=query,
@@ -86,7 +85,7 @@ async def test_hybrid_search_with_public_schema():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
results, total = await hybrid_search(
query="test",
@@ -117,7 +116,7 @@ async def test_hybrid_search_with_custom_schema():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
results, total = await hybrid_search(
query="test",
@@ -135,52 +134,22 @@ async def test_hybrid_search_with_custom_schema():
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_hybrid_search_without_embeddings():
"""Test hybrid search gracefully degrades when embeddings are unavailable."""
# Mock database to return some results
mock_results = [
{
"slug": "test-agent",
"agent_name": "Test Agent",
"agent_image": "test.png",
"creator_username": "creator",
"creator_avatar": "avatar.png",
"sub_heading": "Test heading",
"description": "Test description",
"runs": 100,
"rating": 4.5,
"categories": ["AI"],
"featured": False,
"is_available": True,
"updated_at": "2025-01-01T00:00:00Z",
"semantic_score": 0.0, # Zero because no embedding
"lexical_score": 0.5,
"category_score": 0.0,
"recency_score": 0.1,
"popularity_score": 0.2,
"combined_score": 0.3,
"total_count": 1,
}
]
"""Test hybrid search fails fast when embeddings are unavailable."""
# Patch where the function is used, not where it's defined
with patch("backend.api.features.store.hybrid_search.embed_query") as mock_embed:
with patch(
"backend.api.features.store.hybrid_search.query_raw_with_schema"
) as mock_query:
# Simulate embedding failure
mock_embed.return_value = None
mock_query.return_value = mock_results
# Simulate embedding failure
mock_embed.return_value = None
# Should NOT raise - graceful degradation
results, total = await hybrid_search(
# Should raise ValueError with helpful message
with pytest.raises(ValueError) as exc_info:
await hybrid_search(
query="test",
page=1,
page_size=20,
)
# Verify it returns results even without embeddings
assert len(results) == 1
assert results[0]["slug"] == "test-agent"
assert total == 1
# Verify error message is generic (doesn't leak implementation details)
assert "Search service temporarily unavailable" in str(exc_info.value)
@pytest.mark.asyncio(loop_scope="session")
@@ -195,7 +164,7 @@ async def test_hybrid_search_with_filters():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
# Test with featured filter
results, total = await hybrid_search(
@@ -235,7 +204,7 @@ async def test_hybrid_search_weights():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
results, total = await hybrid_search(
query="test",
@@ -279,7 +248,7 @@ async def test_hybrid_search_min_score_filtering():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
# Test with custom min_score
results, total = await hybrid_search(
@@ -314,7 +283,7 @@ async def test_hybrid_search_pagination():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
# Test page 2 with page_size 10
results, total = await hybrid_search(
@@ -348,7 +317,7 @@ async def test_hybrid_search_error_handling():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
mock_embed.return_value = [0.1] * 1536
# Should raise exception
with pytest.raises(Exception) as exc_info:

View File

@@ -9,7 +9,6 @@ from backend.api.features.library.db import (
from backend.api.features.store.db import get_store_agent_details, get_store_agents
from backend.api.features.store.embeddings import (
backfill_missing_embeddings,
cleanup_orphaned_embeddings,
get_embedding_stats,
)
from backend.data import db
@@ -222,7 +221,6 @@ class DatabaseManager(AppService):
# Store Embeddings
get_embedding_stats = _(get_embedding_stats)
backfill_missing_embeddings = _(backfill_missing_embeddings)
cleanup_orphaned_embeddings = _(cleanup_orphaned_embeddings)
# Summary data - async
get_user_execution_summary_data = _(get_user_execution_summary_data)
@@ -278,7 +276,6 @@ class DatabaseManagerClient(AppServiceClient):
# Store Embeddings
get_embedding_stats = _(d.get_embedding_stats)
backfill_missing_embeddings = _(d.backfill_missing_embeddings)
cleanup_orphaned_embeddings = _(d.cleanup_orphaned_embeddings)
class DatabaseManagerAsyncClient(AppServiceClient):

View File

@@ -28,7 +28,6 @@ from backend.data.auth.oauth import cleanup_expired_oauth_tokens
from backend.data.block import BlockInput
from backend.data.execution import GraphExecutionWithNodes
from backend.data.model import CredentialsMetaInput
from backend.data.onboarding import increment_onboarding_runs
from backend.executor import utils as execution_utils
from backend.monitoring import (
NotificationJobArgs,
@@ -157,7 +156,6 @@ async def _execute_graph(**kwargs):
inputs=args.input_data,
graph_credentials_inputs=args.input_credentials,
)
await increment_onboarding_runs(args.user_id)
elapsed = asyncio.get_event_loop().time() - start_time
logger.info(
f"Graph execution started with ID {graph_exec.id} for graph {args.graph_id} "
@@ -257,14 +255,14 @@ def execution_accuracy_alerts():
def ensure_embeddings_coverage():
"""
Ensure all content types (store agents, blocks, docs) have embeddings for search.
Ensure approved store agents have embeddings for hybrid search.
Processes ALL missing embeddings in batches of 10 per content type until 100% coverage.
Missing embeddings = content invisible in hybrid search.
Processes ALL missing embeddings in batches of 10 until 100% coverage.
Missing embeddings = agents invisible in hybrid search.
Schedule: Runs every 6 hours (balanced between coverage and API costs).
- Catches new content added between scheduled runs
- Batch size 10 per content type: gradual processing to avoid rate limits
- Catches agents approved between scheduled runs
- Batch size 10: gradual processing to avoid rate limits
- Manual trigger available via execute_ensure_embeddings_coverage endpoint
"""
db_client = get_database_manager_client()
@@ -277,27 +275,13 @@ def ensure_embeddings_coverage():
)
return {"processed": 0, "success": 0, "failed": 0, "error": stats["error"]}
# Extract totals from new stats structure
totals = stats.get("totals", {})
without_embeddings = totals.get("without_embeddings", 0)
coverage_percent = totals.get("coverage_percent", 0)
if without_embeddings == 0:
logger.info("All content has embeddings, skipping backfill")
if stats["without_embeddings"] == 0:
logger.info("All approved agents have embeddings, skipping backfill")
return {"processed": 0, "success": 0, "failed": 0}
# Log per-content-type stats for visibility
by_type = stats.get("by_type", {})
for content_type, type_stats in by_type.items():
if type_stats.get("without_embeddings", 0) > 0:
logger.info(
f"{content_type}: {type_stats['without_embeddings']} items without embeddings "
f"({type_stats['coverage_percent']}% coverage)"
)
logger.info(
f"Total: {without_embeddings} items without embeddings "
f"({coverage_percent}% coverage) - processing all"
f"Found {stats['without_embeddings']} agents without embeddings "
f"({stats['coverage_percent']}% coverage) - processing all"
)
total_processed = 0
@@ -330,33 +314,10 @@ def ensure_embeddings_coverage():
f"Embedding backfill completed: {total_success}/{total_processed} succeeded, "
f"{total_failed} failed"
)
# Clean up orphaned embeddings for blocks and docs
logger.info("Running cleanup for orphaned embeddings (blocks/docs)...")
cleanup_result = db_client.cleanup_orphaned_embeddings()
cleanup_totals = cleanup_result.get("totals", {})
cleanup_deleted = cleanup_totals.get("deleted", 0)
if cleanup_deleted > 0:
logger.info(f"Cleanup completed: deleted {cleanup_deleted} orphaned embeddings")
by_type = cleanup_result.get("by_type", {})
for content_type, type_result in by_type.items():
if type_result.get("deleted", 0) > 0:
logger.info(
f"{content_type}: deleted {type_result['deleted']} orphaned embeddings"
)
else:
logger.info("Cleanup completed: no orphaned embeddings found")
return {
"backfill": {
"processed": total_processed,
"success": total_success,
"failed": total_failed,
},
"cleanup": {
"deleted": cleanup_deleted,
},
"processed": total_processed,
"success": total_success,
"failed": total_failed,
}