Compare commits

...

69 Commits

Author SHA1 Message Date
Zamil Majdy
a7d2e1edcb feat(backend): add cleanup for orphaned block/doc embeddings
- Add cleanup_orphaned_embeddings() function to detect and delete embeddings
  for blocks that were removed from code and docs that were deleted from filesystem
- Integrated into ensure_embeddings_coverage() scheduler job (runs every 6 hours)
- Cleanup runs after backfill: backfill adds missing, cleanup removes orphaned
- Store agents are NOT cleaned up: they are already filtered by is_available in search

How it works:
1. Compares current blocks (from get_blocks()) vs embeddings in DB
2. Compares current docs (from filesystem scan) vs embeddings in DB
3. Deletes orphaned embeddings that no longer have corresponding content
4. Logs deletions per content type for visibility

Prevents:
- Search returning results for removed blocks
- Search returning results for deleted docs
- Database bloat from orphaned embedding records
2026-01-14 23:18:38 -06:00
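A rough sketch of the shape such a cleanup could take (cleanup_orphaned_embeddings, get_blocks, and the *_with_schema helpers appear elsewhere in this changeset; the docs-scan helper and the exact queries here are hypothetical):

import logging

from prisma.enums import ContentType

from backend.data.block import get_blocks
from backend.data.db import execute_raw_with_schema, query_raw_with_schema

logger = logging.getLogger(__name__)


async def cleanup_orphaned_embeddings() -> None:
    # Sketch: current block IDs come from the code registry, current doc IDs from a
    # filesystem scan (see DocumentationHandler later in this diff). Store agents are
    # intentionally skipped - search already filters them by is_available.
    current_ids = {
        ContentType.BLOCK: set(get_blocks().keys()),
        ContentType.DOCUMENTATION: scan_doc_paths(),  # hypothetical helper wrapping the .md/.mdx scan
    }
    for content_type, live_ids in current_ids.items():
        rows = await query_raw_with_schema(
            """
            SELECT "contentId" FROM {schema_prefix}"UnifiedContentEmbedding"
            WHERE "contentType" = $1::{schema_prefix}"ContentType"
            """,
            content_type.value,
        )
        orphans = [r["contentId"] for r in rows if r["contentId"] not in live_ids]
        for content_id in orphans:
            await execute_raw_with_schema(
                """
                DELETE FROM {schema_prefix}"UnifiedContentEmbedding"
                WHERE "contentType" = $1::{schema_prefix}"ContentType"
                    AND "contentId" = $2
                """,
                content_type.value,
                content_id,
            )
        logger.info(f"Deleted {len(orphans)} orphaned {content_type.value} embeddings")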
Zamil Majdy
ffa9262bc4 ci(platform): remove OpenAI API key from backend CI
- Search now uses graceful degradation to lexical-only when embeddings unavailable
- Makes CI faster by avoiding OpenAI API calls
- Removes external API dependency from CI
- Search functionality still works correctly without embeddings

The hybrid search implementation includes graceful degradation that redistributes
semantic weight to lexical/category/recency components when embeddings fail,
ensuring search continues to work without requiring OpenAI access in CI.
2026-01-14 23:11:19 -06:00
Zamil Majdy
546a6cce42 fix(backend): replace hardcoded embedding dimension with EMBEDDING_DIM constant
- Add EMBEDDING_DIM constant to embeddings.py with documentation
- Update hybrid_search.py to import and use EMBEDDING_DIM
- Replace all hardcoded 1536 values in test files with embeddings.EMBEDDING_DIM
- Prevents runtime crash if embedding model is changed to different dimension

Fixes HIGH severity bug where changing from text-embedding-3-small (1536-d)
to text-embedding-3-large (3072-d) would cause pgvector dimension mismatch.
2026-01-14 23:08:43 -06:00
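For reference, the constant described above amounts to something like the sketch below (the 1536 value and import path are taken from this changeset; the surrounding usage is illustrative):

# embeddings.py
# Dimension of vectors produced by the embedding model. Must match the pgvector
# column width; switching to text-embedding-3-large (3072-d) would also require
# a column migration, which is why a single named constant matters.
EMBEDDING_DIM = 1536  # text-embedding-3-small

# hybrid_search.py and tests reference the constant instead of a magic number:
from backend.api.features.store.embeddings import EMBEDDING_DIM

zero_vector = [0.0] * EMBEDDING_DIM  # e.g. the placeholder vector when embeddings are unavailable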
Zamil Majdy
988cd9dac8 test(backend): fix tests for content handlers and graceful degradation
- Fixed mock paths for get_blocks (backend.data.block, not content_handlers)
- Updated stats tests to work with new per-content-type structure
- Updated backfill tests to use content handler architecture
- Changed hybrid_search test to verify graceful degradation (no ValueError)
- Fixed CONTENT_HANDLERS patching to patch where it's used (embeddings module)
- Added missing MagicMock import to embeddings_schema_test.py

All 10 previously failing tests now pass.
2026-01-14 23:03:32 -06:00
Zamil Majdy
ff80adb455 feat(backend/store): add graceful degradation to hybrid search
- Fall back to lexical-only search when query embedding generation fails
- Redistribute semantic weight (30%) proportionally to other components
- Use zero embedding vector to keep SQL query structure unchanged
- Log warning instead of raising error for better UX
- Enables search to work without OpenAI API key (useful for CI/testing)

Benefits:
- Better production resilience if OpenAI API is down
- CI can run without OpenAI API key (faster, free, more reliable)
- Still tests lexical search, category matching, scoring logic
- Users get results instead of "search temporarily unavailable" error
2026-01-14 22:43:56 -06:00
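A minimal sketch of the degradation step described above, assuming the HybridSearchWeights fields and embed_query helper referenced later in this log (not the project's exact code):

query_embedding = await embed_query(query)  # may fail or return None without an OpenAI key
if not query_embedding:
    logger.warning("Query embedding unavailable; degrading to lexical-only search")
    # Redistribute the semantic share proportionally so the weights still sum to 1.0.
    remaining = weights.lexical + weights.category + weights.recency + weights.popularity
    scale = 1.0 / remaining
    weights = HybridSearchWeights(
        semantic=0.0,
        lexical=weights.lexical * scale,
        category=weights.category * scale,
        recency=weights.recency * scale,
        popularity=weights.popularity * scale,
    )
    # A zero vector keeps the SQL parameter list and ::vector cast unchanged.
    query_embedding = [0.0] * EMBEDDING_DIM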
Zamil Majdy
c371243a17 Merge branch 'dev' into feat/backfill_block_and_docs
- Resolved conflicts in favor of content handlers system
- Kept set_public_search_path parameter from dev
- Included OpenAI key fix for CI from hackathon branch (via dev)
- Fixed increment_runs -> increment_onboarding_runs import
2026-01-14 22:43:09 -06:00
Zamil Majdy
06b07604b4 Merge branch 'hackathon-copilot-search' of github.com:Significant-Gravitas/AutoGPT into feat/backfill_block_and_docs 2026-01-14 15:39:31 -06:00
Zamil Majdy
9f0c8c06c5 test(backend): fix embeddings tests to mock query_raw_with_schema directly
- Changed from patching prisma.get_client() to patching query_raw_with_schema
- Follows the pattern used in hybrid_search_test.py
- Tests now properly exercise the schema-prefixing wrapper logic
- Fixes issue where SET search_path call was unmocked
- Removed unused mocker parameters
- All 18 tests passing
2026-01-14 15:39:01 -06:00
Zamil Majdy
3ba374286c Merge branch 'hackathon-copilot-search' into feat/backfill_block_and_docs 2026-01-14 15:29:55 -06:00
Zamil Majdy
f4da46cb57 test(backend): update embeddings test for set_public_search_path
- Updated test_store_embedding_success to expect 2 execute_raw calls
- First call sets search_path, second call performs INSERT
- All 18 embeddings tests now passing
2026-01-14 15:29:31 -06:00
Zamil Majdy
10e385612e Merge branch 'hackathon-copilot-search' of github.com:Significant-Gravitas/AutoGPT into feat/backfill_block_and_docs 2026-01-14 15:20:19 -06:00
Zamil Majdy
0db134fdd9 fix(backend): add set_public_search_path parameter for pgvector type resolution
- Added set_public_search_path parameter to query_raw_with_schema and execute_raw_with_schema
- Fixed hybrid_search to use set_public_search_path=True for vector similarity operations
- Fixed embeddings to use set_public_search_path=True for vector insert/select operations
- Resolves 'type vector does not exist' errors in frontend tests
- Only enabled for queries using ::vector casts or other public schema objects
2026-01-14 15:17:15 -06:00
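Roughly, the flag described above could be implemented along these lines inside the raw-query wrapper (a sketch; get_client and SCHEMA_PREFIX are stand-ins for whatever backend/data/db.py actually uses):

async def query_raw_with_schema(
    sql: str, *params, set_public_search_path: bool = False
) -> list[dict]:
    client = get_client()  # hypothetical accessor for the Prisma client
    if set_public_search_path:
        # pgvector lives in the public schema; widen search_path for this session
        # so ::vector casts resolve even when the URL schema param is e.g. "platform".
        await client.execute_raw("SET search_path TO platform, public")
    return await client.query_raw(sql.format(schema_prefix=SCHEMA_PREFIX), *params)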
Zamil Majdy
461bf25bc1 feat(backend): extend embedding system to blocks and documentation
- Created pluggable ContentHandler architecture for different content types
- Implemented StoreAgentHandler, BlockHandler, and DocumentationHandler
- Added backfill support for all content types with explicit processing order (blocks → agents → docs)
- Updated scheduler to process all content types automatically
- Fixed pgvector type resolution by adding set_public_search_path parameter
- Added comprehensive integration tests
- Updated stats aggregation to cover all content types
2026-01-14 15:07:44 -06:00
Swifty
f45ef091e2 Merge branch 'dev' into hackathon-copilot-search 2026-01-14 11:46:33 +01:00
Zamil Majdy
83f46d373d fix(backend/store): wrap semantic SELECT in subquery to fix UNION ORDER BY
- ORDER BY uce.embedding was applying to UNION result, not just semantic SELECT
- uce table only exists in semantic SELECT, causing 'missing FROM-clause' error
- Wrapped semantic SELECT in subquery so ORDER BY applies within correct scope
- UNION can now properly combine lexical and semantic candidates

Fixes marketplace search completely failing and falling back to lexical-only
2026-01-13 18:32:42 -06:00
Zamil Majdy
07153d5536 fix(backend/store): add schema-qualified ContentType cast in embeddings stats
- Cast 'STORE_AGENT' to ContentType enum in get_embedding_stats (line 394)
- Cast 'STORE_AGENT' to ContentType enum in backfill_missing_embeddings (line 445)
- Fixes scheduler job ensure_embeddings_coverage() failures every 6 hours
- Prevents embeddings from not being generated for new marketplace agents

Reported by Sentry as critical issue
2026-01-13 18:23:36 -06:00
Zamil Majdy
f3c747027b fix(backend/store): update embedding truncation test for tiktoken
- Test now uses varied text (word0, word1, etc.) that exceeds 8191 tokens
- Verifies tiktoken-based truncation instead of character-based (32k chars)
- Repeated 'a' characters are token-efficient (35k chars = only 4375 tokens)
- Asserts truncated text is 8100-8191 tokens (at/near limit)
2026-01-13 18:20:22 -06:00
Zamil Majdy
764e1026e5 fix(backend/store): add schema-qualified ContentType cast in hybrid search
- Cast 'STORE_AGENT' to ContentType enum with schema prefix in JOIN conditions
- Fixes 'missing FROM-clause entry for table uce' error in marketplace search
- Matches fix pattern from embeddings.py
2026-01-13 18:15:15 -06:00
Zamil Majdy
0890ce00b5 fix(backend/db): avoid duplicate 'public' in search_path
- Use dict.fromkeys() to remove duplicates while preserving order
- If schema=public in URL, results in search_path=public (not public,public)
- If schema=platform in URL, results in search_path=platform,public
- Handles edge case where db_schema is already 'public'
2026-01-13 18:01:48 -06:00
Zamil Majdy
7f952900ae fix(backend/db): extract schema dynamically from DATABASE_URL for search_path
- Parse schema parameter from DATABASE_URL instead of hardcoding 'platform'
- Use extracted schema in search_path: f'-c search_path={db_schema},public'
- Defaults to 'platform' if schema parameter not found
- Makes search_path configuration dynamic based on DATABASE_URL
2026-01-13 17:55:41 -06:00
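Taken together, the two search_path commits above boil down to something like this sketch (standard-library parsing; the options wiring is illustrative):

from urllib.parse import parse_qs, urlparse

parsed = urlparse(database_url)
db_schema = parse_qs(parsed.query).get("schema", ["platform"])[0]

# dict.fromkeys() de-duplicates while preserving order, so schema=public yields
# "public" rather than "public,public", and schema=platform yields "platform,public".
search_path = ",".join(dict.fromkeys([db_schema, "public"]))
options = f"-c search_path={search_path}"  # appended to DATABASE_URL as the options parameter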
Zamil Majdy
dc5da41703 fix(backend): add public to search_path for vector type access
Critical Fix for AUTOGPT-SERVER-73K:
- Add public schema to search_path via DATABASE_URL options parameter
- Allows runtime code to use ::vector without schema qualification
- Tested in dev: SET search_path TO platform,public enables ::vector cast

Changes:
- backend/data/db.py: Add options=-c search_path=platform,public to DATABASE_URL
- backend/api/features/store/embeddings.py: Use ::vector (works at runtime)
- migrations: Keep public.vector (Prisma CLI doesn't use db.py config)

Why this works:
- Vector extension is in public schema
- Default search_path is 'platform' only (set by schema param in DATABASE_URL)
- Adding public to search_path makes vector type accessible
- Migrations still need public.vector since they run via Prisma CLI

Fixes AUTOGPT-SERVER-73K
2026-01-13 17:54:14 -06:00
Zamil Majdy
1f3a9d0922 fix(backend/store): use tiktoken for embedding truncation and add user_id to delete
Critical:
- Replace character-based truncation (32k chars) with token-based (8,191 tokens)
- Fixes potential API failures when text has high token-to-char ratio
- Use tiktoken.encoding_for_model() to match OpenAI's token counting

Security:
- Add user_id parameter to delete_content_embedding()
- Prevents accidental deletion of other users' embeddings for LIBRARY_AGENT
- WHERE clause now filters by user_id for user-scoped content types

Addresses CodeRabbit security and critical issues
2026-01-13 17:43:54 -06:00
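The token-based truncation described above, sketched with tiktoken (limit and model name are from the commit messages):

import tiktoken

MAX_EMBEDDING_TOKENS = 8191  # input limit for text-embedding-3-small


def truncate_for_embedding(text: str, model: str = "text-embedding-3-small") -> str:
    # Count and cut by tokens rather than characters, matching OpenAI's accounting;
    # 32k characters of dense text can exceed 8,191 tokens, while repeated characters
    # can land far under it.
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    if len(tokens) <= MAX_EMBEDDING_TOKENS:
        return text
    return encoding.decode(tokens[:MAX_EMBEDDING_TOKENS])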
Zamil Majdy
c5c1d8d605 fix(backend/migrations): use WITH SCHEMA public for vector extension
- Restore WITH SCHEMA public pattern that was working before
- Wrap in DO block with exception handling like other Supabase extensions
- Ensures vector extension exists in public schema consistently
- Qualify vector types as public.vector in table and index definitions
- Fixes 'type vector does not exist' error when search_path excludes public
2026-01-13 17:39:24 -06:00
Zamil Majdy
9ae54e2975 fix(backend/store): qualify vector type with public schema
- Change $4::vector to $4::public.vector in store_content_embedding SQL
- Fixes 'ERROR: type "vector" does not exist' when search_path is platform only
- Vector extension exists in public schema, must be explicitly qualified
- Resolves 85% embedding generation failure rate (17/20 failures)
2026-01-13 17:35:58 -06:00
Zamil Majdy
8063bb4503 fix(backend/executor): prevent infinite loop in embedding backfill
- Remove CLI script (no longer needed with scheduled job)
- Add check to break loop when all embedding attempts fail
- Prevents infinite loop on API failures or malformed content
- Logs error when batch completely fails to aid debugging
2026-01-13 17:12:00 -06:00
Zamil Majdy
2b28023266 fix(backend/store): fix ClientAlreadyRegisteredError in backfill CLI
- Use backend.data.db.connect() instead of creating new Prisma client
- Fixes prisma.errors.ClientAlreadyRegisteredError when running backfill script
- CLI command: poetry run python -m backend.api.features.store.backfill_embeddings
2026-01-13 17:11:01 -06:00
Zamil Majdy
1b8d8e3772 fix(backend/executor): expose embedding functions via sync DatabaseManager client
- Add get_embedding_stats and backfill_missing_embeddings to DatabaseManagerClient (sync)
- Update scheduler to use sync client instead of async client
- Simplifies ensure_embeddings_coverage() by removing async/await complexity
- Fixes 'Client is not connected to the query engine' error in scheduler jobs
2026-01-13 17:06:40 -06:00
Zamil Majdy
34eb6bdca1 revert: remove rollback files from git, keep local only
- Remove committed rollback SQL files
- Add rollback*.sql to .gitignore
- Keep rollback_local.sql untracked for local testing
2026-01-13 16:45:27 -06:00
Zamil Majdy
44610bb778 docs(backend/migrations): add rollback SQL for add_docs_embedding migration
- Add rollback.sql for public schema (CI/local)
- Add rollback_platform_schema.sql for platform schema (Supabase)
- Add comprehensive ROLLBACK_README.md with usage instructions
- Includes safety warnings about data loss and pgvector extension

Use case: Testing migration rollback in dev environment
2026-01-13 16:42:49 -06:00
Zamil Majdy
9afa8a739b fix(backend/tests): fix remaining embedding test mocks
- Fix test_generate_embedding_no_api_key mock
- Fix test_generate_embedding_api_error mock
- Use AsyncMock for side_effect in error test
- All 4 embedding tests now pass without calling real OpenAI API
2026-01-13 16:41:16 -06:00
Zamil Majdy
a76fa0f0a9 fix(backend/tests): fix embedding test mocks and remove hardcoded dimension check
Fixes AUTOGPT-SERVER-73F

- Fix test mocks to patch at point of use (embeddings.get_openai_client)
- Remove cache.clear() attempts (not working with @cached decorator)
- Use context manager with proper patch location
- Remove hardcoded 1536 dimension validation in hybrid_search
- Add empty list check for query_embedding
- Tests now properly mock OpenAI client instead of calling real API
2026-01-13 16:32:48 -06:00
Zamil Majdy
b0b556e24e fix(backend): critical fixes for PostgreSQL 15 bug and test failures
1. CRITICAL: Fix PostgreSQL 15 infinite loop bug with ON CONFLICT + NULLS NOT DISTINCT
   - Add WHERE clause to DO UPDATE to prevent database crash when approving store listings
   - Bug occurs when NULL userId triggers conflict on NULLS NOT DISTINCT unique index
   - Without fix: database enters infinite loop, high CPU, potential crash
   - With fix: safe upsert behavior for NULL values

2. Fix test failures in embeddings_test.py
   - Use AsyncMock for async embeddings.create() method
   - Fixes 'assert None is not None' and AttributeError in tests
   - Tests now properly mock async OpenAI client calls

References:
- PostgreSQL bug: https://www.postgresql.org/message-id/17245-e726837da98d7bfa%40postgresql.org
- Sentry issue: Store listing approval triggers infinite loop
2026-01-13 16:21:19 -06:00
Zamil Majdy
60ba50431d fix(backend/migrations): remove explicit schema from pgvector extension
- Change from 'CREATE EXTENSION ... WITH SCHEMA public' to 'CREATE EXTENSION ...'
- Remove public. prefix from vector type and vector_cosine_ops
- Aligns with Supabase extension creation behavior where extensions are installed without explicit schema
- Fixes migration failure when user lacks SUPERUSER privileges for cross-schema operations

Context: Supabase requires extensions to be enabled via Dashboard first, then migrations verify existence.
2026-01-13 16:17:54 -06:00
Zamil Majdy
4b8332a14f fix(backend): add schema prefix to ContentType enum casts in SQL queries
- Fix INSERT, SELECT, and DELETE queries to use {schema_prefix}"ContentType"
- Ensures queries work correctly in platform schema (Supabase)
- Fixes 'type ContentType does not exist' error in production

Resolves errors in get_content_embedding, store_content_embedding, and delete_content_embedding functions.
2026-01-13 16:14:55 -06:00
Zamil Majdy
7097cedc1d Try more things 2026-01-13 16:05:55 -06:00
Zamil Majdy
5a60618c2d Try stupid zht 2026-01-13 15:49:12 -06:00
Zamil Majdy
547c6f93d4 refactor(backend): remove unused EMBEDDING_DIM constant 2026-01-13 15:37:58 -06:00
Zamil Majdy
6dbd45eaf0 fix(backend/tests): update embedding and hybrid search tests
- Update embeddings_test.py to mock backend.util.clients.get_openai_client instead of non-existent embeddings.OpenAI
- Fix hybrid_search_test.py weights validation by adding popularity=0.0 to sum to 1.0

Fixes 5 test failures after moving OpenAI client to centralized clients.py
2026-01-13 15:33:24 -06:00
Zamil Majdy
ca398f3cc5 Try stupid sht 2026-01-13 15:31:11 -06:00
Zamil Majdy
16a14ca09e refactor(backend): move OpenAI client to centralized clients.py
Organizational improvement:
- Moved get_openai_client() from embeddings.py to backend/util/clients.py
- Follows established pattern for external service clients (like Supabase)
- Uses @cached(ttl_seconds=3600) for process-level caching with TTL
- Makes OpenAI client reusable across codebase

Benefits:
- Consistency with existing client patterns
- Centralized location for all external service clients
- Better organization and maintainability
- Reusable for future use cases (block embeddings, library agents, etc.)

Pattern alignment:
- Similar to get_supabase() - external API client with caching
- Uses same caching decorator as other service clients
- Thread-safe process-level cache

Files changed:
- backend/util/clients.py: Add get_openai_client() with @cached decorator
- backend/api/features/store/embeddings.py: Import from clients instead of local definition

No functional changes - purely organizational refactor.
2026-01-13 15:18:05 -06:00
Zamil Majdy
704b8a9207 fix(backend): use AsyncOpenAI to prevent blocking event loop
Critical async fix:
- Changed from sync OpenAI client to AsyncOpenAI
- Added await to embeddings.create() call
- Prevents blocking the event loop during API calls

Impact:
- Before: API calls blocked entire event loop (200-500ms per embedding)
- After: Non-blocking concurrent request handling
- Aligns with async patterns used elsewhere (llm.py, codex.py, chat/service.py)

Location: backend/api/features/store/embeddings.py:15, 31, 93

Testing:
- Verify embeddings still generate correctly
- Check concurrent request handling improves
2026-01-13 15:16:32 -06:00
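The async pattern referred to above, roughly (uses the openai SDK; in the real code the client comes from the cached get_openai_client() helper):

from openai import AsyncOpenAI

client = AsyncOpenAI()  # stand-in for get_openai_client()


async def generate_embedding(text: str) -> list[float]:
    # Awaiting the async client keeps the event loop free during the 200-500 ms API call.
    response = await client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return response.data[0].embedding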
Zamil Majdy
1a5abcc36a feat(backend): observability, validation, and documentation improvements
Improvements from code review (all remaining items):

1. Timing logs for embedding generation:
   - Log embedding dimensions, input length, and API latency
   - Helps monitor OpenAI API performance and identify slow requests
   - Location: backend/api/features/store/embeddings.py:99-110

2. Weights validation in HybridSearchWeights:
   - Added __post_init__ validation ensuring weights are non-negative
   - Validates weights sum to approximately 1.0 (0.99-1.01 tolerance)
   - Catches configuration errors early
   - Location: backend/api/features/store/hybrid_search.py:32-55

3. Document searchable_text backward compatibility:
   - Clarified store_embedding() is deprecated (empty searchable_text)
   - New code should use ensure_embedding() which populates searchable_text
   - Location: backend/api/features/store/embeddings.py:123-137

4. Enhanced ensure_embeddings_coverage docstring:
   - Explains 6-hour schedule choice (balance coverage vs API costs)
   - Documents batch size of 10 and manual trigger endpoint
   - Location: backend/executor/scheduler.py:261-272

5. NO retry logic (design decision):
   - Decided against retry decorator to maintain fail-fast consistency
   - User search already has fallback, admin operations should fail immediately
   - Simpler code, aligns with documented philosophy

Impact:
- Better observability of embedding system performance
- Early detection of misconfigured weights
- Clearer documentation for future maintainers
- Consistent fail-fast behavior

Files changed:
- backend/api/features/store/embeddings.py: timing logs, deprecation docs
- backend/api/features/store/hybrid_search.py: weights validation
- backend/executor/scheduler.py: enhanced docstring
2026-01-13 15:13:56 -06:00
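Item 2 (weights validation) sketched as a dataclass; the default weights are the ones listed later in this log and the tolerance is from the commit message:

from dataclasses import dataclass


@dataclass
class HybridSearchWeights:
    semantic: float = 0.30
    lexical: float = 0.30
    category: float = 0.20
    recency: float = 0.10
    popularity: float = 0.10

    def __post_init__(self) -> None:
        weights = [self.semantic, self.lexical, self.category, self.recency, self.popularity]
        if any(w < 0 for w in weights):
            raise ValueError("Search weights must be non-negative")
        total = sum(weights)
        if not 0.99 <= total <= 1.01:
            raise ValueError(f"Search weights must sum to ~1.0, got {total}")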
Zamil Majdy
419b966db1 docs(backend): clarify fallback behavior and SQL safety
Documentation improvements from code review:

1. Document fallback behavior in get_store_agents():
   - Added detailed docstring explaining hybrid search → lexical fallback
   - Clarifies this is intentional UX decision (availability > accuracy)
   - Contrasts with admin operations (fail-fast to prevent inconsistency)
   - Location: backend/api/features/store/db.py:53-62

2. Add SQL safety comment in hybrid_search.py:
   - Clarifies WHERE clause construction is safe from SQL injection
   - where_parts only contains hardcoded strings with $N placeholders
   - No user input concatenated directly into SQL string
   - Location: backend/api/features/store/hybrid_search.py:152-154

Addresses code review concerns:
- "Inconsistent fallback behavior" - Now documented as intentional
- "Potential SQL injection" - Clarified as safe, added comment

Files changed:
- backend/api/features/store/db.py: Enhanced docstring
- backend/api/features/store/hybrid_search.py: Added safety comment
2026-01-13 15:09:52 -06:00
Zamil Majdy
9b8d917d99 fix(backend): critical transaction bug + OpenAI client reuse
Two critical fixes for store listing approval flow:

1. Fix AgentGraph update missing transaction (Sentry HIGH severity):
   - AgentGraph.prisma().update() was missing tx parameter
   - Update committed immediately, outside transaction scope
   - If subsequent embedding generation failed, AgentGraph stayed updated but listing stayed pending
   - Fix: Changed to prisma(tx).update() to include in transaction
   - Impact: Now atomic - AgentGraph update + embedding succeed together or both roll back
   - Location: backend/api/features/store/db.py:1531

2. Performance: OpenAI client singleton for connection reuse:
   - Previously created new OpenAI client on every embedding generation
   - Added @cache decorator for singleton pattern (cleaner than global state)
   - Reuses HTTP connections for better performance
   - Reduces connection overhead and improves latency (~100-200ms per call)
   - Location: backend/api/features/store/embeddings.py:29-40

Files changed:
- backend/api/features/store/db.py: Add tx parameter to AgentGraph update
- backend/api/features/store/embeddings.py: Add @cache singleton + use in generate_embedding()

Testing:
- Transaction atomicity: If embedding fails, AgentGraph update rolls back
- Performance: Connection reuse reduces latency by ~100-200ms per call
2026-01-13 15:08:55 -06:00
Zamil Majdy
6432d35db2 feat(backend): expose endpoint to manually trigger embedding backfill
Add @expose decorator to ensure_embeddings_coverage for consistency with other scheduled jobs.

Allows manual triggering via scheduler service RPC:
- HTTP: POST http://localhost:8003/execute_ensure_embeddings_coverage
- Python: scheduler_client.call("execute_ensure_embeddings_coverage")

Useful for:
- Testing the backfill job without waiting 6 hours
- Operational debugging of embedding coverage issues
- Manual intervention when embeddings need immediate sync

Follows existing pattern:
- execute_cleanup_expired_files
- execute_cleanup_oauth_tokens
- execute_report_execution_accuracy_alerts
- execute_ensure_embeddings_coverage (NEW)

Files changed:
- backend/executor/scheduler.py: Add @expose method
2026-01-13 14:52:03 -06:00
Zamil Majdy
7d46a5c1dc fix(backend): improve embedding backfill error handling and prevent overlapping runs
Fixes 3 issues identified by automated code review:

1. Error detection in scheduled job (scheduler.py):
   - Check for "error" field in get_embedding_stats() before checking "without_embeddings"
   - Previously: when stats query failed, returned {"without_embeddings": 0, "error": "..."}
   - Bug: code treated this as "0 missing embeddings" and silently skipped backfill
   - Fix: detect error field first and log failure

2. Error detection in CLI script (backfill_embeddings.py):
   - Same issue as #1 - check for error field before proceeding
   - Return exit code 1 when stats query fails (initial check)
   - Add error handling for final stats logging (non-critical, just logging)

3. Prevent overlapping backfill runs (scheduler.py):
   - Add max_instances=1 to ensure_embeddings_coverage scheduled job
   - Prevents concurrent backfill runs if previous run times out or is slow
   - Global default is max_instances=1000 which allows dangerous overlaps

Impact:
- Embedding failures are now properly detected and logged (not silently ignored)
- Only one backfill job can run at a time (prevents race conditions)
- Better observability of embedding system health

Files changed:
- backend/executor/scheduler.py: error check + max_instances=1
- backend/api/features/store/backfill_embeddings.py: error checks
2026-01-13 12:52:31 -06:00
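Item 3 as it might look at the registration site, assuming an APScheduler-style add_job call (names are illustrative):

scheduler.add_job(
    ensure_embeddings_coverage,
    trigger="interval",
    hours=6,                          # matches the every-6-hours schedule
    id="ensure_embeddings_coverage",
    replace_existing=True,
    max_instances=1,                  # never start a new backfill while one is still running
)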
Zamil Majdy
a63370bc30 fix(backend): move embedding generation inside transaction + fix test failures
Critical transaction bug fix and test isolation improvements:

1. Transaction atomicity fix:
   - Move ensure_embedding() call INSIDE transaction block in store listing approval
   - Pass tx parameter to ensure atomic operation (both approve + embed succeed or both rollback)
   - Prevents inconsistent state where listing is approved but embedding fails

2. Test fixture improvements:
   - Add session-scoped mock for ensure_embedding in 3 test files to avoid DB dependency
   - Mock at import location (backend.api.features.store.db) not definition location
   - Fixes 12 test failures caused by missing UnifiedContentEmbedding table in test DB

Files changed:
- backend/api/features/store/db.py: Move embedding inside transaction
- backend/api/features/chat/tools/run_agent_test.py: Add session-scoped mock
- backend/data/graph_test.py: Add session-scoped mock
- backend/executor/manager_test.py: Add session-scoped mock

All affected tests now pass:
 2 graph tests (test_access_store_listing_graph, test_clean_graph)
 11 run_agent tests (all store submission/approval tests)
 31 OAuth tests (isolation issue resolved)
2026-01-13 12:38:33 -06:00
Zamil Majdy
6a86f2e3ea Merge branch 'dev' of github.com:Significant-Gravitas/AutoGPT into hackathon-copilot-search 2026-01-13 09:40:41 -06:00
Zamil Majdy
679c7806f2 fix(backend): address 5 code review issues in hybrid search
Fixes all automated code review issues from coderabbitai bot:

1. Input Validation (Major):
   - Validate and strip query (empty query returns no results)
   - Clamp page >= 1 and page_size between 1-100
   - Prevents tsquery errors and negative offsets

2. HNSW Index Usage (Major - Performance):
   - Added ORDER BY embedding <=> vector LIMIT 200 to semantic branch
   - Enables HNSW index acceleration for KNN search
   - Significantly faster on large datasets (10x+ speedup)

3. Remove Pointless Try/Catch + Fix Logging (Major):
   - Removed try/except that only re-raised exception
   - Changed logging to exclude sensitive query content
   - Now logs: "Hybrid search: X results, Y total" (no PII)

4. Error Message Security (Minor):
   - Generic error to client: "Search service temporarily unavailable"
   - Detailed error logged server-side only
   - Doesn't leak openai_internal_api_key or implementation details

5. Parameterize Weights (Minor):
   - All weights and min_score now use SQL parameters ($N)
   - Changed from f-string interpolation for consistency
   - Prevents potential misuse if exposed to user input

Test Updates:
- Updated test assertions to check params instead of SQL literals
- All tests verify parameterization is used

All tests passing (9 hybrid_search + 3 db search).
2026-01-13 09:22:59 -06:00
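Items 1 and 2 above, sketched (not the repository's exact code):

# 1. Input validation: strip the query and clamp paging before any SQL is built.
query = (query or "").strip()
if not query:
    return [], 0  # empty query returns no results (illustrative return shape)
page = max(1, page)
page_size = max(1, min(page_size, 100))
offset = (page - 1) * page_size  # can no longer go negative

# 2. Bounded KNN in the semantic branch so the HNSW index is actually used.
semantic_sql = """
    SELECT ...
    FROM {schema_prefix}"UnifiedContentEmbedding" uce
    ORDER BY uce.embedding <=> $1::vector
    LIMIT 200
"""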
Zamil Majdy
5c7391fcd7 feat(backend): fix embedding SLA priorities and backfill completeness
Aligns embedding generation behavior with proper SLA priorities:
- User search: High SLA (never fail)
- Admin approval: Low SLA (can wait for OpenAI)

Changes:

1. User Search - Add Fallback (db.py:67-87):
   - Falls back to lexical-only search if OpenAI unavailable
   - Logs error for monitoring but doesn't break user experience
   - Users always get results (degraded but functional)

2. Admin Approval - Block on Failure (db.py:1553-1567):
   - Approval now fails if embedding generation fails
   - Guarantees all approved agents have embeddings
   - Clear error message tells admin to retry when OpenAI back
   - Prevents agents from being invisible in search

3. Scheduled Backfill - Process All + Run Every 6h (scheduler.py:261-311, 535-545):
   - Loops until ALL missing embeddings processed (not just one batch)
   - Runs every 6 hours instead of daily
   - Missing embeddings fixed within 6 hours max
   - Free when nothing missing (just DB query)

4. Manual Backfill - Process All (backfill_embeddings.py):
   - Loops until ALL missing embeddings processed
   - Replaced print() with proper logging
   - Cleaner, more concise output
   - No more "run it 10 times manually"

Result: Users never see errors, admins can wait, system guarantees consistency.

All tests passing (9 hybrid_search + 3 db search).
2026-01-13 09:11:18 -06:00
Zamil Majdy
faf9ad9b57 fix(backend): reduce scheduled embedding backfill batch size to 10
Changed from 50 to 10 to match the default and avoid OpenAI rate limits.
For a daily scheduled maintenance job, reliability is more important than speed.
2026-01-13 08:45:59 -06:00
Zamil Majdy
f5899acac0 feat(backend): add scheduled embedding backfill and popularity scoring
Implements two enhancements to the store search system:

1. Scheduled Embedding Backfill Job:
   - Runs daily at 2 AM UTC via APScheduler
   - Smart: checks if work is needed before processing
   - Small batch size (50) to avoid rate limits
   - Reuses existing backfill_missing_embeddings infrastructure
   - Ensures approved agents always have embeddings for hybrid search

2. Popularity Scoring (PageRank-like):
   - Adds popularity as 5th search signal (10% weight)
   - Adjusts existing weights: semantic=0.30, lexical=0.30, category=0.20, recency=0.10
   - Uses logarithmic scaling: LN(1 + runs) / LN(1 + max_runs)
   - Prevents viral agents from dominating search results
   - Better surfaces both relevant AND popular content

Changes:
- backend/executor/scheduler.py: Add ensure_embeddings_coverage job
- backend/api/features/store/hybrid_search.py: Add popularity scoring to hybrid search

All tests passing (9 hybrid_search tests + 3 db search tests).
2026-01-13 08:42:12 -06:00
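The popularity signal above as a worked formula (Python equivalent of the SQL expression quoted in the commit):

import math


def popularity_score(runs: int, max_runs: int) -> float:
    # LN(1 + runs) / LN(1 + max_runs): grows with usage but saturates, so one viral
    # agent cannot drown out every other result.
    if max_runs <= 0:
        return 0.0
    return math.log(1 + runs) / math.log(1 + max_runs)


# With max_runs = 10_000: 100 runs -> ~0.50, 1_000 runs -> ~0.75, 10_000 runs -> 1.0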
Zamil Majdy
72783dcc02 fix(backend/store): fix test mocking and reinforce fail-fast approach
- Fix all hybrid_search tests to mock embed_query at import location
- Remove graceful degradation in db.py - fail fast instead
- Add clear comment explaining why we don't use fallback

Why NO graceful degradation:
1. Silent fallbacks hide production issues (search degrades without visibility)
2. Makes testing unclear (tests can pass even when hybrid search is broken)
3. Inconsistent search quality confuses users
4. If embeddings fail, it's a real infrastructure issue that needs fixing

How we prevent failures instead:
- Embedding generation in approval flow (db.py:1545)
- Error logging with logger.error (not warning)
- Clear error messages (ValueError tells exactly what's wrong)
- Proper monitoring/alerting on errors

All tests pass: 9/9 hybrid_search_test.py, db_test.py search tests 
2026-01-12 21:19:27 -06:00
Zamil Majdy
af13badf8f fix(backend/store): remove silent fallbacks, enforce fail-fast behavior
Critical changes:
- Remove lexical-only fallback in hybrid_search - now raises ValueError if embeddings fail
- Change missing API key from warning to error (still returns None for backwards compat)
- Update test to verify ValueError is raised with helpful error message

Why this matters:
- Silent fallbacks hid production issues - search would degrade to worse quality without alerts
- Tests were passing even when embeddings were broken
- No visibility into failures = no way to fix them

Before: embed_query fails → silently use lexical-only → worse results, no alerts
After: embed_query fails → ValueError with clear message → fails fast, forces fix

All 9 hybrid_search tests pass 
2026-01-12 19:41:36 -06:00
Zamil Majdy
b491610ebf fix(backend/store): change embedding failure log level from warning to error
Even though approval continues on embedding failure (graceful degradation),
this is still an error condition that needs attention - the approved agent
won't be searchable, which is a significant problem requiring investigation.
2026-01-12 19:32:50 -06:00
Zamil Majdy
0b022073eb ci: fix backend CI to use prisma migrate deploy instead of dev
The migrate dev command requires interactive mode and fails in CI.
migrate deploy is the correct command for CI/production environments.
2026-01-12 19:28:39 -06:00
Zamil Majdy
01eef83809 fix(backend/store): address code review feedback for hybrid search
Critical fixes:
- Fix UNION ALL causing duplicate agents in search results
- Add HNSW index for fast vector similarity search (improves query performance)
- Fix UNIQUE constraint with NULLS NOT DISTINCT to prevent duplicate public embeddings

Other improvements:
- Fix incorrect module path in backfill_embeddings docstring
- Remove duplicate embedding_to_vector_string implementation
- Align recency calculation between hybrid and lexical fallback (linear decay)
- Add @@index([embedding]) to schema.prisma to prevent migration drift

Migration updates:
- Added HNSW index: CREATE INDEX USING hnsw (embedding vector_cosine_ops)
- Added NULLS NOT DISTINCT to UNIQUE constraint (requires PostgreSQL 15+)
2026-01-12 18:43:32 -06:00
Zamil Majdy
4644c09b9e fix(backend): make pgvector migration schema-agnostic for CI compatibility
- Remove schema specification from pgvector extension creation
- Extension now creates in current schema (public for CI, platform for production)
- Remove unnecessary try-except that just re-raised exceptions
- Update schema.prisma to not hardcode platform schema

Fixes:
- CI builds now work with public schema
- Production still works with platform schema
- Simpler error handling (let exceptions propagate naturally)
- Migration: CREATE EXTENSION IF NOT EXISTS "vector" (no WITH SCHEMA)
2026-01-12 18:10:50 -06:00
Zamil Majdy
374860ff2c fix(backend): remove silent fallback in hybrid search and standardize test naming
- Change silent fallback to raise HTTPException when hybrid search fails
- Log error with full context instead of just warning
- This ensures we catch production issues instead of degrading silently
- Rename hybrid_search_integration_test.py to hybrid_search_test.py for consistency

Changes:
- backend/api/features/store/db.py: Replace silent fallback with explicit error
- All 9 hybrid_search_test.py tests pass
- Verified hybrid search is actually working (not using fallback)
- 100% embedding coverage confirmed
2026-01-12 18:09:14 -06:00
Zamil Majdy
e7e09ef4e1 make sure platform schema exist 2026-01-12 18:05:13 -06:00
Zamil Majdy
5e691661a8 feat(backend): fix pgvector schema access and add Supabase extension migrations
- Move pgvector extension to platform schema to avoid search_path issues with Prisma connection pooling
- Add ContentType enum casts in SQL queries (store_content_embedding, get_content_embedding, delete_content_embedding)
- Add UUID generation with gen_random_uuid() for UnifiedContentEmbedding inserts
- Create migration to acknowledge Supabase-managed extensions (pg_graphql, pg_net, etc.) to prevent Prisma drift warnings
- Update schema.prisma to declare only pgvector extension in platform schema

Fixes:
- pgvector extension now accessible in platform schema without search_path modifications
- Automatic embedding generation on store listing approval verified working
- Backfill job successfully processes all approved agents (tested with 100% coverage)
- Hybrid search combining semantic + lexical signals working correctly
2026-01-12 17:58:28 -06:00
Zamil Majdy
b0e8c17419 perf(backend): Optimize hybrid search query for 2-5x performance improvement
**Performance Optimizations:**
1. Changed UNION to UNION ALL - eliminates unnecessary deduplication
2. Optimized category matching with EXISTS + unnest - more efficient than array_to_string + LIKE
3. Pre-calculated max lexical score in separate CTE - avoids expensive window function recalculation
4. Simplified recency calculation to linear decay with GREATEST - faster than EXP()

**Technical Details:**
- UNION ALL is safe because DISTINCT is already in subqueries
- EXISTS + unnest leverages PostgreSQL array operations efficiently
- Pre-calculating max avoids computing MAX() for every row
- Linear decay provides similar UX with better performance

**Testing:**
- All 86 existing store tests pass
- All 9 hybrid search integration tests pass
- All 9 embeddings schema tests pass
- No functionality changes, only query optimization

**Expected Impact:**
- Faster search response times at scale
- Better database resource utilization
- Improved user experience with large agent catalogs
2026-01-12 16:19:42 -06:00
Zamil Majdy
5a7c1e39dd fix(backend): Fix schema handling in embeddings and add comprehensive tests
**Schema Handling Improvements:**
- Removed hardcoded `platform.` schema references in embeddings.py
- Added `_raw_with_schema()` unified helper in db.py with execute flag
- Created public wrappers: `query_raw_with_schema()` and `execute_raw_with_schema()`
- Transaction support via optional client parameter in execute_raw_with_schema

**Changes:**
- backend/api/features/store/embeddings.py:
  - Removed `_get_schema_prefix()` function
  - Updated all raw SQL queries to use new db helpers
  - Eliminated all `# type: ignore` comments from business logic

- backend/data/db.py:
  - Added `_raw_with_schema()` internal function
  - Added `query_raw_with_schema()` for SELECT queries
  - Added `execute_raw_with_schema()` for INSERT/UPDATE/DELETE with transaction support
  - Centralized schema handling logic

**Testing:**
- Added hybrid_search_integration_test.py (9 tests)
- Added embeddings_schema_test.py (9 tests)
- All 18 integration tests passing
- Tests cover: schema handling, transactions, backward compatibility, error cases

**Benefits:**
- Dynamic schema support (public, platform, custom schemas)
- Type-safe with proper return types
- Clean separation of concerns
- Transaction support maintained
- No SQL injection via f-strings in business logic
2026-01-12 16:12:13 -06:00
Zamil Majdy
53b03e746a Merge branch 'dev' of github.com:Significant-Gravitas/AutoGPT into hackathon-copilot-search 2026-01-12 15:46:45 -06:00
Zamil Majdy
5aaf07fbaf feat(backend): implement unified content embeddings with userId support
- Replace StoreListingEmbedding with UnifiedContentEmbedding table
- Add ContentType enum (STORE_AGENT, BLOCK, INTEGRATION, DOCUMENTATION, LIBRARY_AGENT)
- Support user-specific content with optional userId field for access control
- Maintain backward compatibility with wrapper functions for existing store APIs
- Update hybrid search to use unified embedding table with proper ContentType filtering
- Add comprehensive tests for new embedding service functionality
- Use proper Prisma ContentType enum instead of strings for type safety

The unified architecture enables future expansion to semantic search for blocks,
documentation, and library agents while maintaining existing store functionality.
2026-01-09 14:15:09 -06:00
Swifty
0d2996e501 Merge branch 'dev' into hackathon-copilot-search 2026-01-09 16:31:59 +01:00
Zamil Majdy
9e37a66bca feat(backend): fix hybrid search implementation and add comprehensive tests
- Fix configuration to use settings.py instead of getenv for OpenAI API key
- Improve performance by using asyncio.gather for concurrent embedding generation (~10x faster)
- Move all local imports to top-level for better test mocking
- Add graceful degradation when hybrid search fails (fallback to basic text search)
- Create comprehensive test suite with 18 test cases covering all scenarios
- Fix pytest plugin conflicts by disabling syrupy to avoid --snapshot-update collision
- Resolve database variable binding issues with proper initialization
- Ensure all 27 store/embeddings tests pass consistently

Fixes:
- Store listings now use standardized hybrid search (embeddings + BM25)
- Performance improved from sequential to concurrent embedding processing
- Database migrations and table dependencies properly handled
- Test coverage complete for embedding functionality

Next: Extend hybrid search standardization to builder blocks and docs (currently 33% complete)
2026-01-08 14:25:40 -06:00
Zamil Majdy
429a074848 Merge branch 'dev' of github.com:Significant-Gravitas/AutoGPT into hackathon-copilot-search 2026-01-08 13:22:20 -06:00
Swifty
7f1245dc42 adding hybrid based searching 2026-01-07 12:45:55 +01:00
11 changed files with 1471 additions and 243 deletions

View File

@@ -209,7 +209,6 @@ jobs:
PLAIN_OUTPUT: True
RUN_ENV: local
PORT: 8080
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
# We know these are here, don't report this as a security vulnerability
# This is used as the default credential for the entire system's RabbitMQ instance
# If you want to replace this, you can do so by making our entire system generate

View File

@@ -0,0 +1,417 @@
"""
Content Type Handlers for Unified Embeddings
Pluggable system for different content sources (store agents, blocks, docs).
Each handler knows how to fetch and process its content type for embedding.
"""
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from prisma.enums import ContentType
from backend.data.db import query_raw_with_schema
logger = logging.getLogger(__name__)
@dataclass
class ContentItem:
"""Represents a piece of content to be embedded."""
content_id: str # Unique identifier (DB ID or file path)
content_type: ContentType
searchable_text: str # Combined text for embedding
metadata: dict[str, Any] # Content-specific metadata
user_id: str | None = None # For user-scoped content
class ContentHandler(ABC):
"""Base handler for fetching and processing content for embeddings."""
@property
@abstractmethod
def content_type(self) -> ContentType:
"""The ContentType this handler manages."""
pass
@abstractmethod
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""
Fetch items that don't have embeddings yet.
Args:
batch_size: Maximum number of items to return
Returns:
List of ContentItem objects ready for embedding
"""
pass
@abstractmethod
async def get_stats(self) -> dict[str, int]:
"""
Get statistics about embedding coverage.
Returns:
Dict with keys: total, with_embeddings, without_embeddings
"""
pass
class StoreAgentHandler(ContentHandler):
"""Handler for marketplace store agent listings."""
@property
def content_type(self) -> ContentType:
return ContentType.STORE_AGENT
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch approved store listings without embeddings."""
from backend.api.features.store.embeddings import build_searchable_text
missing = await query_raw_with_schema(
"""
SELECT
slv.id,
slv.name,
slv.description,
slv."subHeading",
slv.categories
FROM {schema_prefix}"StoreListingVersion" slv
LEFT JOIN {schema_prefix}"UnifiedContentEmbedding" uce
ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
AND uce."contentId" IS NULL
LIMIT $1
""",
batch_size,
)
return [
ContentItem(
content_id=row["id"],
content_type=ContentType.STORE_AGENT,
searchable_text=build_searchable_text(
name=row["name"],
description=row["description"],
sub_heading=row["subHeading"],
categories=row["categories"] or [],
),
metadata={
"name": row["name"],
"categories": row["categories"] or [],
},
user_id=None, # Store agents are public
)
for row in missing
]
async def get_stats(self) -> dict[str, int]:
"""Get statistics about store agent embedding coverage."""
# Count approved versions
approved_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion"
WHERE "submissionStatus" = 'APPROVED'
AND "isDeleted" = false
"""
)
total_approved = approved_result[0]["count"] if approved_result else 0
# Count versions with embeddings
embedded_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion" slv
JOIN {schema_prefix}"UnifiedContentEmbedding" uce ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
"""
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_approved,
"with_embeddings": with_embeddings,
"without_embeddings": total_approved - with_embeddings,
}
class BlockHandler(ContentHandler):
"""Handler for block definitions (Python classes)."""
@property
def content_type(self) -> ContentType:
return ContentType.BLOCK
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch blocks without embeddings."""
from backend.data.block import get_blocks
# Get all available blocks
all_blocks = get_blocks()
# Check which ones have embeddings
if not all_blocks:
return []
block_ids = list(all_blocks.keys())
# Query for existing embeddings
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
existing_result = await query_raw_with_schema(
f"""
SELECT "contentId"
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'BLOCK'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*block_ids,
)
existing_ids = {row["contentId"] for row in existing_result}
missing_blocks = [
(block_id, block_cls)
for block_id, block_cls in all_blocks.items()
if block_id not in existing_ids
]
# Convert to ContentItem
items = []
for block_id, block_cls in missing_blocks[:batch_size]:
try:
block_instance = block_cls()
# Build searchable text from block metadata
parts = []
if hasattr(block_instance, "name") and block_instance.name:
parts.append(block_instance.name)
if (
hasattr(block_instance, "description")
and block_instance.description
):
parts.append(block_instance.description)
if hasattr(block_instance, "categories") and block_instance.categories:
# Convert BlockCategory enum to strings
parts.append(
" ".join(str(cat.value) for cat in block_instance.categories)
)
# Add input/output schema info
if hasattr(block_instance, "input_schema"):
schema = block_instance.input_schema
if hasattr(schema, "model_json_schema"):
schema_dict = schema.model_json_schema()
if "properties" in schema_dict:
for prop_name, prop_info in schema_dict[
"properties"
].items():
if "description" in prop_info:
parts.append(
f"{prop_name}: {prop_info['description']}"
)
searchable_text = " ".join(parts)
items.append(
ContentItem(
content_id=block_id,
content_type=ContentType.BLOCK,
searchable_text=searchable_text,
metadata={
"name": getattr(block_instance, "name", ""),
"categories": getattr(block_instance, "categories", []),
},
user_id=None, # Blocks are public
)
)
except Exception as e:
logger.warning(f"Failed to process block {block_id}: {e}")
continue
return items
async def get_stats(self) -> dict[str, int]:
"""Get statistics about block embedding coverage."""
from backend.data.block import get_blocks
all_blocks = get_blocks()
total_blocks = len(all_blocks)
if total_blocks == 0:
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
block_ids = list(all_blocks.keys())
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
embedded_result = await query_raw_with_schema(
f"""
SELECT COUNT(*) as count
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'BLOCK'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*block_ids,
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_blocks,
"with_embeddings": with_embeddings,
"without_embeddings": total_blocks - with_embeddings,
}
class DocumentationHandler(ContentHandler):
"""Handler for documentation files (.md/.mdx)."""
@property
def content_type(self) -> ContentType:
return ContentType.DOCUMENTATION
def _get_docs_root(self) -> Path:
"""Get the documentation root directory."""
# Assuming docs are in /docs relative to project root
backend_root = Path(__file__).parent.parent.parent.parent
docs_root = backend_root.parent.parent / "docs"
return docs_root
def _extract_title_and_content(self, file_path: Path) -> tuple[str, str]:
"""Extract title and content from markdown file."""
try:
content = file_path.read_text(encoding="utf-8")
# Try to extract title from first # heading
lines = content.split("\n")
title = ""
body_lines = []
for line in lines:
if line.startswith("# ") and not title:
title = line[2:].strip()
else:
body_lines.append(line)
# If no title found, use filename
if not title:
title = file_path.stem.replace("-", " ").replace("_", " ").title()
body = "\n".join(body_lines)
return title, body
except Exception as e:
logger.warning(f"Failed to read {file_path}: {e}")
return file_path.stem, ""
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
"""Fetch documentation files without embeddings."""
docs_root = self._get_docs_root()
if not docs_root.exists():
logger.warning(f"Documentation root not found: {docs_root}")
return []
# Find all .md and .mdx files
all_docs = list(docs_root.rglob("*.md")) + list(docs_root.rglob("*.mdx"))
# Get relative paths for content IDs
doc_paths = [str(doc.relative_to(docs_root)) for doc in all_docs]
if not doc_paths:
return []
# Check which ones have embeddings
placeholders = ",".join([f"${i+1}" for i in range(len(doc_paths))])
existing_result = await query_raw_with_schema(
f"""
SELECT "contentId"
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'DOCUMENTATION'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*doc_paths,
)
existing_ids = {row["contentId"] for row in existing_result}
missing_docs = [
(doc_path, doc_file)
for doc_path, doc_file in zip(doc_paths, all_docs)
if doc_path not in existing_ids
]
# Convert to ContentItem
items = []
for doc_path, doc_file in missing_docs[:batch_size]:
try:
title, content = self._extract_title_and_content(doc_file)
# Build searchable text
searchable_text = f"{title} {content}"
items.append(
ContentItem(
content_id=doc_path,
content_type=ContentType.DOCUMENTATION,
searchable_text=searchable_text,
metadata={
"title": title,
"path": doc_path,
},
user_id=None, # Documentation is public
)
)
except Exception as e:
logger.warning(f"Failed to process doc {doc_path}: {e}")
continue
return items
async def get_stats(self) -> dict[str, int]:
"""Get statistics about documentation embedding coverage."""
docs_root = self._get_docs_root()
if not docs_root.exists():
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
# Count all .md and .mdx files
all_docs = list(docs_root.rglob("*.md")) + list(docs_root.rglob("*.mdx"))
total_docs = len(all_docs)
if total_docs == 0:
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
doc_paths = [str(doc.relative_to(docs_root)) for doc in all_docs]
placeholders = ",".join([f"${i+1}" for i in range(len(doc_paths))])
embedded_result = await query_raw_with_schema(
f"""
SELECT COUNT(*) as count
FROM {{schema_prefix}}"UnifiedContentEmbedding"
WHERE "contentType" = 'DOCUMENTATION'::{{schema_prefix}}"ContentType"
AND "contentId" = ANY(ARRAY[{placeholders}])
""",
*doc_paths,
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
return {
"total": total_docs,
"with_embeddings": with_embeddings,
"without_embeddings": total_docs - with_embeddings,
}
# Content handler registry
CONTENT_HANDLERS: dict[ContentType, ContentHandler] = {
ContentType.STORE_AGENT: StoreAgentHandler(),
ContentType.BLOCK: BlockHandler(),
ContentType.DOCUMENTATION: DocumentationHandler(),
}
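For context, a sketch of how the backfill job might consume this registry in the blocks → agents → docs order mentioned in the commit messages (the real backfill_all_content_types lives in embeddings.py and is not shown in this diff; ensure_content_embedding's signature is taken from the tests below):

BACKFILL_ORDER = [ContentType.BLOCK, ContentType.STORE_AGENT, ContentType.DOCUMENTATION]


async def backfill_once(batch_size: int = 10) -> dict[str, dict[str, int]]:
    results: dict[str, dict[str, int]] = {}
    for content_type in BACKFILL_ORDER:
        handler = CONTENT_HANDLERS[content_type]
        items = await handler.get_missing_items(batch_size)
        success = 0
        for item in items:
            ok = await ensure_content_embedding(
                content_type=item.content_type,
                content_id=item.content_id,
                searchable_text=item.searchable_text,
                metadata=item.metadata,
                user_id=item.user_id,
            )
            success += int(ok)
        results[content_type.value] = {
            "processed": len(items),
            "success": success,
            "failed": len(items) - success,
        }
    return results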

View File

@@ -0,0 +1,215 @@
"""
Integration tests for content handlers using real DB.
Run with: poetry run pytest backend/api/features/store/content_handlers_integration_test.py -xvs
These tests use the real database but mock OpenAI calls.
"""
from unittest.mock import patch
import pytest
from backend.api.features.store.content_handlers import (
CONTENT_HANDLERS,
BlockHandler,
DocumentationHandler,
StoreAgentHandler,
)
from backend.api.features.store.embeddings import (
EMBEDDING_DIM,
backfill_all_content_types,
ensure_content_embedding,
get_embedding_stats,
)
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_real_db():
"""Test StoreAgentHandler with real database queries."""
handler = StoreAgentHandler()
# Get stats from real DB
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list (may be empty if all have embeddings)
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None
assert item.content_type.value == "STORE_AGENT"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_real_db():
"""Test BlockHandler with real database queries."""
handler = BlockHandler()
# Get stats from real DB
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0 # Should have at least some blocks
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None # Should be block UUID
assert item.content_type.value == "BLOCK"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_real_fs():
"""Test DocumentationHandler with real filesystem."""
handler = DocumentationHandler()
# Get stats from real filesystem
stats = await handler.get_stats()
# Stats should have correct structure
assert "total" in stats
assert "with_embeddings" in stats
assert "without_embeddings" in stats
assert stats["total"] >= 0
assert stats["with_embeddings"] >= 0
assert stats["without_embeddings"] >= 0
# Get missing items (max 1 to keep test fast)
items = await handler.get_missing_items(batch_size=1)
# Items should be list
assert isinstance(items, list)
if items:
item = items[0]
assert item.content_id is not None # Should be relative path
assert item.content_type.value == "DOCUMENTATION"
assert item.searchable_text != ""
assert item.user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_get_embedding_stats_all_types():
"""Test get_embedding_stats aggregates all content types."""
stats = await get_embedding_stats()
# Should have structure with by_type and totals
assert "by_type" in stats
assert "totals" in stats
# Check each content type is present
by_type = stats["by_type"]
assert "STORE_AGENT" in by_type
assert "BLOCK" in by_type
assert "DOCUMENTATION" in by_type
# Check totals are aggregated
totals = stats["totals"]
assert totals["total"] >= 0
assert totals["with_embeddings"] >= 0
assert totals["without_embeddings"] >= 0
assert "coverage_percent" in totals
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.generate_embedding")
async def test_ensure_content_embedding_blocks(mock_generate):
"""Test creating embeddings for blocks (mocked OpenAI)."""
# Mock OpenAI to return fake embedding
mock_generate.return_value = [0.1] * EMBEDDING_DIM
# Get one block without embedding
handler = BlockHandler()
items = await handler.get_missing_items(batch_size=1)
if not items:
pytest.skip("No blocks without embeddings")
item = items[0]
# Try to create embedding (OpenAI mocked)
result = await ensure_content_embedding(
content_type=item.content_type,
content_id=item.content_id,
searchable_text=item.searchable_text,
metadata=item.metadata,
user_id=item.user_id,
)
# Should succeed with mocked OpenAI
assert result is True
mock_generate.assert_called_once()
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.generate_embedding")
async def test_backfill_all_content_types_dry_run(mock_generate):
"""Test backfill_all_content_types processes all handlers in order."""
# Mock OpenAI to return fake embedding
mock_generate.return_value = [0.1] * EMBEDDING_DIM
# Run backfill with batch_size=1 to process max 1 per type
result = await backfill_all_content_types(batch_size=1)
# Should have results for all content types
assert "by_type" in result
assert "totals" in result
by_type = result["by_type"]
assert "BLOCK" in by_type
assert "STORE_AGENT" in by_type
assert "DOCUMENTATION" in by_type
# Each type should have correct structure
for content_type, type_result in by_type.items():
assert "processed" in type_result
assert "success" in type_result
assert "failed" in type_result
# Totals should aggregate
totals = result["totals"]
assert totals["processed"] >= 0
assert totals["success"] >= 0
assert totals["failed"] >= 0
@pytest.mark.asyncio(loop_scope="session")
async def test_content_handler_registry():
"""Test all handlers are registered in correct order."""
from prisma.enums import ContentType
# All three types should be registered
assert ContentType.STORE_AGENT in CONTENT_HANDLERS
assert ContentType.BLOCK in CONTENT_HANDLERS
assert ContentType.DOCUMENTATION in CONTENT_HANDLERS
# Check handler types
assert isinstance(CONTENT_HANDLERS[ContentType.STORE_AGENT], StoreAgentHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.BLOCK], BlockHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.DOCUMENTATION], DocumentationHandler)
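
For reference, a minimal sketch of how callers consume the registry these tests verify — it assumes the package is importable and a database connection is configured, since get_stats() queries the DB; print_coverage is illustrative only:

import asyncio

from backend.api.features.store.content_handlers import CONTENT_HANDLERS


async def print_coverage() -> None:
    # Each handler exposes async get_stats() and get_missing_items(batch_size).
    for content_type, handler in CONTENT_HANDLERS.items():
        stats = await handler.get_stats()
        print(f"{content_type.value}: {stats['with_embeddings']}/{stats['total']} embedded")


asyncio.run(print_coverage())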

View File

@@ -0,0 +1,324 @@
"""
E2E tests for content handlers (blocks, store agents, documentation).
Tests the full flow: discovering content → generating embeddings → storing.
"""
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from prisma.enums import ContentType
from backend.api.features.store.content_handlers import (
CONTENT_HANDLERS,
BlockHandler,
DocumentationHandler,
StoreAgentHandler,
)
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_get_missing_items(mocker):
"""Test StoreAgentHandler fetches approved agents without embeddings."""
handler = StoreAgentHandler()
# Mock database query
mock_missing = [
{
"id": "agent-1",
"name": "Test Agent",
"description": "A test agent",
"subHeading": "Test heading",
"categories": ["AI", "Testing"],
}
]
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_missing,
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].content_id == "agent-1"
assert items[0].content_type == ContentType.STORE_AGENT
assert "Test Agent" in items[0].searchable_text
assert "A test agent" in items[0].searchable_text
assert items[0].metadata["name"] == "Test Agent"
assert items[0].user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_store_agent_handler_get_stats(mocker):
"""Test StoreAgentHandler returns correct stats."""
handler = StoreAgentHandler()
# Mock approved count query
mock_approved = [{"count": 50}]
# Mock embedded count query
mock_embedded = [{"count": 30}]
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
side_effect=[mock_approved, mock_embedded],
):
stats = await handler.get_stats()
assert stats["total"] == 50
assert stats["with_embeddings"] == 30
assert stats["without_embeddings"] == 20
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_get_missing_items(mocker):
"""Test BlockHandler discovers blocks without embeddings."""
handler = BlockHandler()
# Mock get_blocks to return test blocks
mock_block_class = MagicMock()
mock_block_instance = MagicMock()
mock_block_instance.name = "Calculator Block"
mock_block_instance.description = "Performs calculations"
mock_block_instance.categories = [MagicMock(value="MATH")]
mock_block_instance.input_schema.model_json_schema.return_value = {
"properties": {"expression": {"description": "Math expression to evaluate"}}
}
mock_block_class.return_value = mock_block_instance
mock_blocks = {"block-uuid-1": mock_block_class}
# Mock existing embeddings query (no embeddings exist)
mock_existing = []
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_existing,
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].content_id == "block-uuid-1"
assert items[0].content_type == ContentType.BLOCK
assert "Calculator Block" in items[0].searchable_text
assert "Performs calculations" in items[0].searchable_text
assert "MATH" in items[0].searchable_text
assert "expression: Math expression" in items[0].searchable_text
assert items[0].user_id is None
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_get_stats(mocker):
"""Test BlockHandler returns correct stats."""
handler = BlockHandler()
# Mock get_blocks
mock_blocks = {
"block-1": MagicMock(),
"block-2": MagicMock(),
"block-3": MagicMock(),
}
# Mock embedded count query (2 blocks have embeddings)
mock_embedded = [{"count": 2}]
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_embedded,
):
stats = await handler.get_stats()
assert stats["total"] == 3
assert stats["with_embeddings"] == 2
assert stats["without_embeddings"] == 1
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_get_missing_items(tmp_path, mocker):
"""Test DocumentationHandler discovers docs without embeddings."""
handler = DocumentationHandler()
# Create temporary docs directory with test files
docs_root = tmp_path / "docs"
docs_root.mkdir()
(docs_root / "guide.md").write_text("# Getting Started\n\nThis is a guide.")
(docs_root / "api.mdx").write_text("# API Reference\n\nAPI documentation.")
# Mock _get_docs_root to return temp dir
with patch.object(handler, "_get_docs_root", return_value=docs_root):
# Mock existing embeddings query (no embeddings exist)
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 2
# Check guide.md
guide_item = next(
(item for item in items if item.content_id == "guide.md"), None
)
assert guide_item is not None
assert guide_item.content_type == ContentType.DOCUMENTATION
assert "Getting Started" in guide_item.searchable_text
assert "This is a guide" in guide_item.searchable_text
assert guide_item.metadata["title"] == "Getting Started"
assert guide_item.user_id is None
# Check api.mdx
api_item = next(
(item for item in items if item.content_id == "api.mdx"), None
)
assert api_item is not None
assert "API Reference" in api_item.searchable_text
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_get_stats(tmp_path, mocker):
"""Test DocumentationHandler returns correct stats."""
handler = DocumentationHandler()
# Create temporary docs directory
docs_root = tmp_path / "docs"
docs_root.mkdir()
(docs_root / "doc1.md").write_text("# Doc 1")
(docs_root / "doc2.md").write_text("# Doc 2")
(docs_root / "doc3.mdx").write_text("# Doc 3")
# Mock embedded count query (1 doc has embedding)
mock_embedded = [{"count": 1}]
with patch.object(handler, "_get_docs_root", return_value=docs_root):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=mock_embedded,
):
stats = await handler.get_stats()
assert stats["total"] == 3
assert stats["with_embeddings"] == 1
assert stats["without_embeddings"] == 2
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_title_extraction(tmp_path):
"""Test DocumentationHandler extracts title from markdown heading."""
handler = DocumentationHandler()
# Test with heading
doc_with_heading = tmp_path / "with_heading.md"
doc_with_heading.write_text("# My Title\n\nContent here")
title, content = handler._extract_title_and_content(doc_with_heading)
assert title == "My Title"
assert "# My Title" not in content
assert "Content here" in content
# Test without heading
doc_without_heading = tmp_path / "no-heading.md"
doc_without_heading.write_text("Just content, no heading")
title, content = handler._extract_title_and_content(doc_without_heading)
assert title == "No Heading" # Uses filename
assert "Just content" in content
@pytest.mark.asyncio(loop_scope="session")
async def test_content_handlers_registry():
"""Test all content types are registered."""
assert ContentType.STORE_AGENT in CONTENT_HANDLERS
assert ContentType.BLOCK in CONTENT_HANDLERS
assert ContentType.DOCUMENTATION in CONTENT_HANDLERS
assert isinstance(CONTENT_HANDLERS[ContentType.STORE_AGENT], StoreAgentHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.BLOCK], BlockHandler)
assert isinstance(CONTENT_HANDLERS[ContentType.DOCUMENTATION], DocumentationHandler)
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_handles_missing_attributes():
"""Test BlockHandler gracefully handles blocks with missing attributes."""
handler = BlockHandler()
# Mock block with minimal attributes
mock_block_class = MagicMock()
mock_block_instance = MagicMock()
mock_block_instance.name = "Minimal Block"
# No description, categories, or schema
del mock_block_instance.description
del mock_block_instance.categories
del mock_block_instance.input_schema
mock_block_class.return_value = mock_block_instance
mock_blocks = {"block-minimal": mock_block_class}
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
assert len(items) == 1
assert items[0].searchable_text == "Minimal Block"
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_skips_failed_blocks():
"""Test BlockHandler skips blocks that fail to instantiate."""
handler = BlockHandler()
# Mock one good block and one bad block
good_block = MagicMock()
good_instance = MagicMock()
good_instance.name = "Good Block"
good_instance.description = "Works fine"
good_instance.categories = []
good_block.return_value = good_instance
bad_block = MagicMock()
bad_block.side_effect = Exception("Instantiation failed")
mock_blocks = {"good-block": good_block, "bad-block": bad_block}
with patch(
"backend.data.block.get_blocks",
return_value=mock_blocks,
):
with patch(
"backend.api.features.store.content_handlers.query_raw_with_schema",
return_value=[],
):
items = await handler.get_missing_items(batch_size=10)
# Should only get the good block
assert len(items) == 1
assert items[0].content_id == "good-block"
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_missing_docs_directory():
"""Test DocumentationHandler handles missing docs directory gracefully."""
handler = DocumentationHandler()
# Mock _get_docs_root to return non-existent path
fake_path = Path("/nonexistent/docs")
with patch.object(handler, "_get_docs_root", return_value=fake_path):
items = await handler.get_missing_items(batch_size=10)
assert items == []
stats = await handler.get_stats()
assert stats["total"] == 0
assert stats["with_embeddings"] == 0
assert stats["without_embeddings"] == 0

View File

@@ -14,6 +14,7 @@ import prisma
from prisma.enums import ContentType
from tiktoken import encoding_for_model
from backend.api.features.store.content_handlers import CONTENT_HANDLERS
from backend.data.db import execute_raw_with_schema, query_raw_with_schema
from backend.util.clients import get_openai_client
from backend.util.json import dumps
@@ -23,6 +24,9 @@ logger = logging.getLogger(__name__)
# OpenAI embedding model configuration
EMBEDDING_MODEL = "text-embedding-3-small"
# Embedding dimension for the model above
# text-embedding-3-small: 1536, text-embedding-3-large: 3072
EMBEDDING_DIM = 1536
# OpenAI embedding token limit (8,191 with 1 token buffer for safety)
EMBEDDING_MAX_TOKENS = 8191
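
As a quick illustration of why the constant matters, a hypothetical guard that rejects vectors of the wrong width before they reach pgvector; check_dimension is illustrative only and is not defined in the module:

from backend.api.features.store.embeddings import EMBEDDING_DIM


def check_dimension(embedding: list[float]) -> list[float]:
    # Reject vectors whose width does not match the configured model,
    # e.g. after switching to a model with a different output dimension.
    if len(embedding) != EMBEDDING_DIM:
        raise ValueError(
            f"Expected a {EMBEDDING_DIM}-dimensional embedding, got {len(embedding)}"
        )
    return embedding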
@@ -369,55 +373,69 @@ async def delete_content_embedding(
async def get_embedding_stats() -> dict[str, Any]:
"""
Get statistics about embedding coverage.
Get statistics about embedding coverage for all content types.
Returns counts of:
- Total approved listing versions
- Versions with embeddings
- Versions without embeddings
Returns stats per content type and overall totals.
"""
try:
# Count approved versions
approved_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion"
WHERE "submissionStatus" = 'APPROVED'
AND "isDeleted" = false
"""
)
total_approved = approved_result[0]["count"] if approved_result else 0
stats_by_type = {}
total_items = 0
total_with_embeddings = 0
total_without_embeddings = 0
# Count versions with embeddings
embedded_result = await query_raw_with_schema(
"""
SELECT COUNT(*) as count
FROM {schema_prefix}"StoreListingVersion" slv
JOIN {schema_prefix}"UnifiedContentEmbedding" uce ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
"""
)
with_embeddings = embedded_result[0]["count"] if embedded_result else 0
# Aggregate stats from all handlers
for content_type, handler in CONTENT_HANDLERS.items():
try:
stats = await handler.get_stats()
stats_by_type[content_type.value] = {
"total": stats["total"],
"with_embeddings": stats["with_embeddings"],
"without_embeddings": stats["without_embeddings"],
"coverage_percent": (
round(stats["with_embeddings"] / stats["total"] * 100, 1)
if stats["total"] > 0
else 0
),
}
total_items += stats["total"]
total_with_embeddings += stats["with_embeddings"]
total_without_embeddings += stats["without_embeddings"]
except Exception as e:
logger.error(f"Failed to get stats for {content_type.value}: {e}")
stats_by_type[content_type.value] = {
"total": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
"error": str(e),
}
return {
"total_approved": total_approved,
"with_embeddings": with_embeddings,
"without_embeddings": total_approved - with_embeddings,
"coverage_percent": (
round(with_embeddings / total_approved * 100, 1)
if total_approved > 0
else 0
),
"by_type": stats_by_type,
"totals": {
"total": total_items,
"with_embeddings": total_with_embeddings,
"without_embeddings": total_without_embeddings,
"coverage_percent": (
round(total_with_embeddings / total_items * 100, 1)
if total_items > 0
else 0
),
},
}
except Exception as e:
logger.error(f"Failed to get embedding stats: {e}")
return {
"total_approved": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
"by_type": {},
"totals": {
"total": 0,
"with_embeddings": 0,
"without_embeddings": 0,
"coverage_percent": 0,
},
"error": str(e),
}
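
For reference, the stats payload consumed by the tests above and the scheduler below has roughly this shape — the numbers are illustrative, not from a real deployment:

stats = {
    "by_type": {
        "BLOCK": {"total": 120, "with_embeddings": 120, "without_embeddings": 0, "coverage_percent": 100.0},
        "STORE_AGENT": {"total": 50, "with_embeddings": 30, "without_embeddings": 20, "coverage_percent": 60.0},
        "DOCUMENTATION": {"total": 40, "with_embeddings": 35, "without_embeddings": 5, "coverage_percent": 87.5},
    },
    "totals": {"total": 210, "with_embeddings": 185, "without_embeddings": 25, "coverage_percent": 88.1},
}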
@@ -426,73 +444,118 @@ async def backfill_missing_embeddings(batch_size: int = 10) -> dict[str, Any]:
"""
Generate embeddings for approved listings that don't have them.
BACKWARD COMPATIBILITY: Maintained for existing usage.
This now delegates to backfill_all_content_types() to process all content types.
Args:
batch_size: Number of embeddings to generate in one call
batch_size: Number of embeddings to generate per content type
Returns:
Dict with success/failure counts
Dict with success/failure counts aggregated across all content types
"""
try:
# Find approved versions without embeddings
missing = await query_raw_with_schema(
"""
SELECT
slv.id,
slv.name,
slv.description,
slv."subHeading",
slv.categories
FROM {schema_prefix}"StoreListingVersion" slv
LEFT JOIN {schema_prefix}"UnifiedContentEmbedding" uce
ON slv.id = uce."contentId" AND uce."contentType" = 'STORE_AGENT'::{schema_prefix}"ContentType"
WHERE slv."submissionStatus" = 'APPROVED'
AND slv."isDeleted" = false
AND uce."contentId" IS NULL
LIMIT $1
""",
batch_size,
)
# Delegate to the new generic backfill system
result = await backfill_all_content_types(batch_size)
if not missing:
return {
# Return in the old format for backward compatibility
return result["totals"]
async def backfill_all_content_types(batch_size: int = 10) -> dict[str, Any]:
"""
Generate embeddings for all content types using registered handlers.
Processes content types in order: BLOCK → STORE_AGENT → DOCUMENTATION.
This ensures foundational content (blocks) is searchable first.
Args:
batch_size: Number of embeddings to generate per content type
Returns:
Dict with stats per content type and overall totals
"""
results_by_type = {}
total_processed = 0
total_success = 0
total_failed = 0
# Process content types in explicit order
processing_order = [
ContentType.BLOCK,
ContentType.STORE_AGENT,
ContentType.DOCUMENTATION,
]
for content_type in processing_order:
handler = CONTENT_HANDLERS.get(content_type)
if not handler:
logger.warning(f"No handler registered for {content_type.value}")
continue
try:
logger.info(f"Processing {content_type.value} content type...")
# Get missing items from handler
missing_items = await handler.get_missing_items(batch_size)
if not missing_items:
results_by_type[content_type.value] = {
"processed": 0,
"success": 0,
"failed": 0,
"message": "No missing embeddings",
}
continue
# Process embeddings concurrently for better performance
embedding_tasks = [
ensure_content_embedding(
content_type=item.content_type,
content_id=item.content_id,
searchable_text=item.searchable_text,
metadata=item.metadata,
user_id=item.user_id,
)
for item in missing_items
]
results = await asyncio.gather(*embedding_tasks, return_exceptions=True)
success = sum(1 for result in results if result is True)
failed = len(results) - success
results_by_type[content_type.value] = {
"processed": len(missing_items),
"success": success,
"failed": failed,
"message": f"Backfilled {success} embeddings, {failed} failed",
}
total_processed += len(missing_items)
total_success += success
total_failed += failed
logger.info(
f"{content_type.value}: processed {len(missing_items)}, "
f"success {success}, failed {failed}"
)
except Exception as e:
logger.error(f"Failed to process {content_type.value}: {e}")
results_by_type[content_type.value] = {
"processed": 0,
"success": 0,
"failed": 0,
"message": "No missing embeddings",
"error": str(e),
}
# Process embeddings concurrently for better performance
embedding_tasks = [
ensure_embedding(
version_id=row["id"],
name=row["name"],
description=row["description"],
sub_heading=row["subHeading"],
categories=row["categories"] or [],
)
for row in missing
]
results = await asyncio.gather(*embedding_tasks, return_exceptions=True)
success = sum(1 for result in results if result is True)
failed = len(results) - success
return {
"processed": len(missing),
"success": success,
"failed": failed,
"message": f"Backfilled {success} embeddings, {failed} failed",
}
except Exception as e:
logger.error(f"Failed to backfill embeddings: {e}")
return {
"processed": 0,
"success": 0,
"failed": 0,
"error": str(e),
}
return {
"by_type": results_by_type,
"totals": {
"processed": total_processed,
"success": total_success,
"failed": total_failed,
"message": f"Overall: {total_success} succeeded, {total_failed} failed",
},
}
async def embed_query(query: str) -> list[float] | None:
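
A minimal usage sketch for the new backfill entry point, assuming a configured database and OpenAI client; run_backfill is illustrative only:

import asyncio

from backend.api.features.store.embeddings import backfill_all_content_types


async def run_backfill() -> None:
    result = await backfill_all_content_types(batch_size=10)
    for content_type, type_result in result["by_type"].items():
        print(content_type, type_result.get("message", ""))
    print(result["totals"]["message"])


asyncio.run(run_backfill())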
@@ -566,3 +629,109 @@ async def ensure_content_embedding(
except Exception as e:
logger.error(f"Failed to ensure embedding for {content_type}:{content_id}: {e}")
return False
async def cleanup_orphaned_embeddings() -> dict[str, Any]:
"""
Clean up embeddings for blocks and docs that no longer exist.
Compares current blocks/docs with embeddings in the database and removes orphaned records.
Store agents are NOT cleaned up - they're properly filtered during search.
Returns:
Dict with cleanup statistics per content type
"""
from backend.api.features.store.content_handlers import CONTENT_HANDLERS
from backend.data.db import query_raw_with_schema
results_by_type = {}
total_deleted = 0
# Only clean up BLOCK and DOCUMENTATION - store agents are filtered during search
cleanup_types = [ContentType.BLOCK, ContentType.DOCUMENTATION]
for content_type in cleanup_types:
try:
handler = CONTENT_HANDLERS.get(content_type)
if not handler:
logger.warning(f"No handler registered for {content_type}")
results_by_type[content_type.value] = {
"deleted": 0,
"error": "No handler registered",
}
continue
# Get all current content IDs from handler
if content_type == ContentType.BLOCK:
from backend.data.block import get_blocks
current_ids = set(get_blocks().keys())
elif content_type == ContentType.DOCUMENTATION:
from pathlib import Path
backend_root = Path(__file__).parent.parent.parent.parent
docs_root = backend_root.parent.parent / "docs"
if docs_root.exists():
all_docs = list(docs_root.rglob("*.md")) + list(
docs_root.rglob("*.mdx")
)
current_ids = {str(doc.relative_to(docs_root)) for doc in all_docs}
else:
current_ids = set()
else:
current_ids = set()
# Get all embedding IDs from database
db_embeddings = await query_raw_with_schema(
"""
SELECT "contentId"
FROM {schema_prefix}"UnifiedContentEmbedding"
WHERE "contentType" = $1::{schema_prefix}"ContentType"
""",
content_type,
)
db_ids = {row["contentId"] for row in db_embeddings}
# Find orphaned embeddings (in DB but not in current content)
orphaned_ids = db_ids - current_ids
if not orphaned_ids:
logger.info(f"{content_type.value}: No orphaned embeddings found")
results_by_type[content_type.value] = {
"deleted": 0,
"message": "No orphaned embeddings",
}
continue
# Delete orphaned embeddings
deleted = 0
for content_id in orphaned_ids:
if await delete_content_embedding(content_type, content_id):
deleted += 1
logger.info(
f"{content_type.value}: Deleted {deleted}/{len(orphaned_ids)} orphaned embeddings"
)
results_by_type[content_type.value] = {
"deleted": deleted,
"orphaned": len(orphaned_ids),
"message": f"Deleted {deleted} orphaned embeddings",
}
total_deleted += deleted
except Exception as e:
logger.error(f"Failed to cleanup {content_type.value}: {e}")
results_by_type[content_type.value] = {
"deleted": 0,
"error": str(e),
}
return {
"by_type": results_by_type,
"totals": {
"deleted": total_deleted,
"message": f"Deleted {total_deleted} orphaned embeddings",
},
}
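
A matching usage sketch for the cleanup path (illustrative only, assuming the same database setup as above):

import asyncio

from backend.api.features.store.embeddings import cleanup_orphaned_embeddings


async def run_cleanup() -> None:
    result = await cleanup_orphaned_embeddings()
    for content_type, type_result in result["by_type"].items():
        print(content_type, "deleted:", type_result.get("deleted", 0))
    print(result["totals"]["message"])


asyncio.run(run_cleanup())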

View File

@@ -4,12 +4,13 @@ Integration tests for embeddings with schema handling.
These tests verify that embeddings operations work correctly across different database schemas.
"""
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from prisma.enums import ContentType
from backend.api.features.store import embeddings
from backend.api.features.store.embeddings import EMBEDDING_DIM
# Schema prefix tests removed - functionality moved to db.raw_with_schema() helper
@@ -28,7 +29,7 @@ async def test_store_content_embedding_with_schema():
result = await embeddings.store_content_embedding(
content_type=ContentType.STORE_AGENT,
content_id="test-id",
embedding=[0.1] * 1536,
embedding=[0.1] * EMBEDDING_DIM,
searchable_text="test text",
metadata={"test": "data"},
user_id=None,
@@ -125,84 +126,69 @@ async def test_delete_content_embedding_with_schema():
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_get_embedding_stats_with_schema():
"""Test embedding statistics with proper schema handling."""
with patch("backend.data.db.get_database_schema") as mock_schema:
mock_schema.return_value = "platform"
"""Test embedding statistics with proper schema handling via content handlers."""
# Mock handler to return stats
mock_handler = MagicMock()
mock_handler.get_stats = AsyncMock(
return_value={
"total": 100,
"with_embeddings": 80,
"without_embeddings": 20,
}
)
with patch("prisma.get_client") as mock_get_client:
mock_client = AsyncMock()
# Mock both query results
mock_client.query_raw.side_effect = [
[{"count": 100}], # total_approved
[{"count": 80}], # with_embeddings
]
mock_get_client.return_value = mock_client
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
result = await embeddings.get_embedding_stats()
result = await embeddings.get_embedding_stats()
# Verify handler was called
mock_handler.get_stats.assert_called_once()
# Verify both queries were called
assert mock_client.query_raw.call_count == 2
# Get both SQL queries
first_call = mock_client.query_raw.call_args_list[0]
second_call = mock_client.query_raw.call_args_list[1]
first_sql = first_call[0][0]
second_sql = second_call[0][0]
# Verify schema prefix in both queries
assert '"platform"."StoreListingVersion"' in first_sql
assert '"platform"."StoreListingVersion"' in second_sql
assert '"platform"."UnifiedContentEmbedding"' in second_sql
# Verify results
assert result["total_approved"] == 100
assert result["with_embeddings"] == 80
assert result["without_embeddings"] == 20
assert result["coverage_percent"] == 80.0
# Verify new result structure
assert "by_type" in result
assert "totals" in result
assert result["totals"]["total"] == 100
assert result["totals"]["with_embeddings"] == 80
assert result["totals"]["without_embeddings"] == 20
assert result["totals"]["coverage_percent"] == 80.0
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_backfill_missing_embeddings_with_schema():
"""Test backfilling embeddings with proper schema handling."""
with patch("backend.data.db.get_database_schema") as mock_schema:
mock_schema.return_value = "platform"
"""Test backfilling embeddings via content handlers."""
from backend.api.features.store.content_handlers import ContentItem
with patch("prisma.get_client") as mock_get_client:
mock_client = AsyncMock()
# Mock missing embeddings query
mock_client.query_raw.return_value = [
{
"id": "version-1",
"name": "Test Agent",
"description": "Test description",
"subHeading": "Test heading",
"categories": ["test"],
}
]
mock_get_client.return_value = mock_client
# Create mock content item
mock_item = ContentItem(
content_id="version-1",
content_type=ContentType.STORE_AGENT,
searchable_text="Test Agent Test description",
metadata={"name": "Test Agent"},
)
# Mock handler
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=[mock_item])
with patch(
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
with patch(
"backend.api.features.store.embeddings.generate_embedding",
return_value=[0.1] * EMBEDDING_DIM,
):
with patch(
"backend.api.features.store.embeddings.ensure_embedding"
) as mock_ensure:
mock_ensure.return_value = True
"backend.api.features.store.embeddings.store_content_embedding",
return_value=True,
):
result = await embeddings.backfill_missing_embeddings(batch_size=10)
# Verify the query was called
assert mock_client.query_raw.called
# Get the SQL query
call_args = mock_client.query_raw.call_args
sql_query = call_args[0][0]
# Verify schema prefix in query
assert '"platform"."StoreListingVersion"' in sql_query
assert '"platform"."UnifiedContentEmbedding"' in sql_query
# Verify ensure_embedding was called
assert mock_ensure.called
# Verify handler was called
mock_handler.get_missing_items.assert_called_once_with(10)
# Verify results
assert result["processed"] == 1
@@ -226,7 +212,7 @@ async def test_ensure_content_embedding_with_schema():
with patch(
"backend.api.features.store.embeddings.generate_embedding"
) as mock_generate:
mock_generate.return_value = [0.1] * 1536
mock_generate.return_value = [0.1] * EMBEDDING_DIM
with patch(
"backend.api.features.store.embeddings.store_content_embedding"
@@ -260,7 +246,7 @@ async def test_backward_compatibility_store_embedding():
result = await embeddings.store_embedding(
version_id="test-version-id",
embedding=[0.1] * 1536,
embedding=[0.1] * EMBEDDING_DIM,
tx=None,
)
@@ -315,7 +301,7 @@ async def test_schema_handling_error_cases():
result = await embeddings.store_content_embedding(
content_type=ContentType.STORE_AGENT,
content_id="test-id",
embedding=[0.1] * 1536,
embedding=[0.1] * EMBEDDING_DIM,
searchable_text="test",
metadata=None,
user_id=None,

View File

@@ -63,7 +63,7 @@ async def test_generate_embedding_success():
result = await embeddings.generate_embedding("test text")
assert result is not None
assert len(result) == 1536
assert len(result) == embeddings.EMBEDDING_DIM
assert result[0] == 0.1
mock_client.embeddings.create.assert_called_once_with(
@@ -110,7 +110,7 @@ async def test_generate_embedding_text_truncation():
mock_client = MagicMock()
mock_response = MagicMock()
mock_response.data = [MagicMock()]
mock_response.data[0].embedding = [0.1] * 1536
mock_response.data[0].embedding = [0.1] * embeddings.EMBEDDING_DIM
# Use AsyncMock for async embeddings.create method
mock_client.embeddings.create = AsyncMock(return_value=mock_response)
@@ -297,72 +297,92 @@ async def test_ensure_embedding_generation_fails(mock_get, mock_generate):
@pytest.mark.asyncio(loop_scope="session")
async def test_get_embedding_stats():
"""Test embedding statistics retrieval."""
# Mock approved count query and embedded count query
mock_approved_result = [{"count": 100}]
mock_embedded_result = [{"count": 75}]
# Mock handler stats for each content type
mock_handler = MagicMock()
mock_handler.get_stats = AsyncMock(
return_value={
"total": 100,
"with_embeddings": 75,
"without_embeddings": 25,
}
)
# Patch the CONTENT_HANDLERS where it's used (in embeddings module)
with patch(
"backend.api.features.store.embeddings.query_raw_with_schema",
side_effect=[mock_approved_result, mock_embedded_result],
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
result = await embeddings.get_embedding_stats()
assert result["total_approved"] == 100
assert result["with_embeddings"] == 75
assert result["without_embeddings"] == 25
assert result["coverage_percent"] == 75.0
assert "by_type" in result
assert "totals" in result
assert result["totals"]["total"] == 100
assert result["totals"]["with_embeddings"] == 75
assert result["totals"]["without_embeddings"] == 25
assert result["totals"]["coverage_percent"] == 75.0
@pytest.mark.asyncio(loop_scope="session")
@patch("backend.api.features.store.embeddings.ensure_embedding")
async def test_backfill_missing_embeddings_success(mock_ensure):
@patch("backend.api.features.store.embeddings.store_content_embedding")
async def test_backfill_missing_embeddings_success(mock_store):
"""Test backfill with successful embedding generation."""
# Mock missing embeddings query
mock_missing = [
{
"id": "version-1",
"name": "Agent 1",
"description": "Description 1",
"subHeading": "Heading 1",
"categories": ["AI"],
},
{
"id": "version-2",
"name": "Agent 2",
"description": "Description 2",
"subHeading": "Heading 2",
"categories": ["Productivity"],
},
# Mock ContentItem from handlers
from backend.api.features.store.content_handlers import ContentItem
mock_items = [
ContentItem(
content_id="version-1",
content_type=ContentType.STORE_AGENT,
searchable_text="Agent 1 Description 1",
metadata={"name": "Agent 1"},
),
ContentItem(
content_id="version-2",
content_type=ContentType.STORE_AGENT,
searchable_text="Agent 2 Description 2",
metadata={"name": "Agent 2"},
),
]
# Mock ensure_embedding to succeed for first, fail for second
mock_ensure.side_effect = [True, False]
# Mock handler to return missing items
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=mock_items)
# Mock store_content_embedding to succeed for first, fail for second
mock_store.side_effect = [True, False]
with patch(
"backend.api.features.store.embeddings.query_raw_with_schema",
return_value=mock_missing,
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
result = await embeddings.backfill_missing_embeddings(batch_size=5)
with patch(
"backend.api.features.store.embeddings.generate_embedding",
return_value=[0.1] * embeddings.EMBEDDING_DIM,
):
result = await embeddings.backfill_missing_embeddings(batch_size=5)
assert result["processed"] == 2
assert result["success"] == 1
assert result["failed"] == 1
assert mock_ensure.call_count == 2
assert result["processed"] == 2
assert result["success"] == 1
assert result["failed"] == 1
assert mock_store.call_count == 2
@pytest.mark.asyncio(loop_scope="session")
async def test_backfill_missing_embeddings_no_missing():
"""Test backfill when no embeddings are missing."""
# Mock handler to return no missing items
mock_handler = MagicMock()
mock_handler.get_missing_items = AsyncMock(return_value=[])
with patch(
"backend.api.features.store.embeddings.query_raw_with_schema",
return_value=[],
"backend.api.features.store.embeddings.CONTENT_HANDLERS",
{ContentType.STORE_AGENT: mock_handler},
):
result = await embeddings.backfill_missing_embeddings(batch_size=5)
assert result["processed"] == 0
assert result["success"] == 0
assert result["failed"] == 0
assert result["message"] == "No missing embeddings"
@pytest.mark.asyncio(loop_scope="session")

View File

@@ -11,6 +11,7 @@ from datetime import datetime
from typing import Any, Literal
from backend.api.features.store.embeddings import (
EMBEDDING_DIM,
embed_query,
embedding_to_vector_string,
)
@@ -178,15 +179,39 @@ async def hybrid_search(
# No user input is concatenated directly into the SQL string
where_clause = " AND ".join(where_parts)
# Embedding is required for hybrid search - fail fast if unavailable
# Graceful degradation: fall back to lexical-only search if embedding unavailable
if query_embedding is None or not query_embedding:
# Log detailed error server-side
logger.error(
"Failed to generate query embedding. "
logger.warning(
"Failed to generate query embedding - falling back to lexical-only search. "
"Check that openai_internal_api_key is configured and OpenAI API is accessible."
)
# Raise generic error to client
raise ValueError("Search service temporarily unavailable")
# Use zero embedding (semantic score will be 0)
query_embedding = [0.0] * EMBEDDING_DIM
# Adjust weights: redistribute semantic weight to other components
# Semantic becomes 0; the remaining components scale up proportionally
total_non_semantic = (
weights.lexical + weights.category + weights.recency + weights.popularity
)
if total_non_semantic > 0:
# Redistribute semantic weight proportionally to other components
redistribution_factor = 1.0 / total_non_semantic
weights = HybridSearchWeights(
semantic=0.0,
lexical=weights.lexical * redistribution_factor,
category=weights.category * redistribution_factor,
recency=weights.recency * redistribution_factor,
popularity=weights.popularity * redistribution_factor,
)
else:
# Fallback: all weight to lexical if other components are also 0
weights = HybridSearchWeights(
semantic=0.0,
lexical=1.0,
category=0.0,
recency=0.0,
popularity=0.0,
)
# Add embedding parameter
embedding_str = embedding_to_vector_string(query_embedding)
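
To make the redistribution concrete, a worked sketch with hypothetical starting weights — the real defaults live in HybridSearchWeights and may differ:

# Hypothetical starting weights, for illustration only.
semantic, lexical, category, recency, popularity = 0.30, 0.40, 0.15, 0.10, 0.05

total_non_semantic = lexical + category + recency + popularity  # 0.70
factor = 1.0 / total_non_semantic                               # ~1.4286

# After degradation the non-semantic weights sum back to 1.0:
lexical, category, recency, popularity = (
    lexical * factor,     # ~0.571
    category * factor,    # ~0.214
    recency * factor,     # ~0.143
    popularity * factor,  # ~0.071
)
semantic = 0.0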

View File

@@ -8,6 +8,7 @@ from unittest.mock import patch
import pytest
from backend.api.features.store import embeddings
from backend.api.features.store.hybrid_search import HybridSearchWeights, hybrid_search
@@ -49,7 +50,7 @@ async def test_hybrid_search_with_schema_handling():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536 # Mock embedding
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM # Mock embedding
results, total = await hybrid_search(
query=query,
@@ -85,7 +86,7 @@ async def test_hybrid_search_with_public_schema():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
results, total = await hybrid_search(
query="test",
@@ -116,7 +117,7 @@ async def test_hybrid_search_with_custom_schema():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
results, total = await hybrid_search(
query="test",
@@ -134,22 +135,52 @@ async def test_hybrid_search_with_custom_schema():
@pytest.mark.asyncio(loop_scope="session")
@pytest.mark.integration
async def test_hybrid_search_without_embeddings():
"""Test hybrid search fails fast when embeddings are unavailable."""
# Patch where the function is used, not where it's defined
with patch("backend.api.features.store.hybrid_search.embed_query") as mock_embed:
# Simulate embedding failure
mock_embed.return_value = None
"""Test hybrid search gracefully degrades when embeddings are unavailable."""
# Mock database to return some results
mock_results = [
{
"slug": "test-agent",
"agent_name": "Test Agent",
"agent_image": "test.png",
"creator_username": "creator",
"creator_avatar": "avatar.png",
"sub_heading": "Test heading",
"description": "Test description",
"runs": 100,
"rating": 4.5,
"categories": ["AI"],
"featured": False,
"is_available": True,
"updated_at": "2025-01-01T00:00:00Z",
"semantic_score": 0.0, # Zero because no embedding
"lexical_score": 0.5,
"category_score": 0.0,
"recency_score": 0.1,
"popularity_score": 0.2,
"combined_score": 0.3,
"total_count": 1,
}
]
# Should raise ValueError with helpful message
with pytest.raises(ValueError) as exc_info:
await hybrid_search(
with patch("backend.api.features.store.hybrid_search.embed_query") as mock_embed:
with patch(
"backend.api.features.store.hybrid_search.query_raw_with_schema"
) as mock_query:
# Simulate embedding failure
mock_embed.return_value = None
mock_query.return_value = mock_results
# Should NOT raise - graceful degradation
results, total = await hybrid_search(
query="test",
page=1,
page_size=20,
)
# Verify error message is generic (doesn't leak implementation details)
assert "Search service temporarily unavailable" in str(exc_info.value)
# Verify it returns results even without embeddings
assert len(results) == 1
assert results[0]["slug"] == "test-agent"
assert total == 1
@pytest.mark.asyncio(loop_scope="session")
@@ -164,7 +195,7 @@ async def test_hybrid_search_with_filters():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
# Test with featured filter
results, total = await hybrid_search(
@@ -204,7 +235,7 @@ async def test_hybrid_search_weights():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
results, total = await hybrid_search(
query="test",
@@ -248,7 +279,7 @@ async def test_hybrid_search_min_score_filtering():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
# Test with custom min_score
results, total = await hybrid_search(
@@ -283,7 +314,7 @@ async def test_hybrid_search_pagination():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
# Test page 2 with page_size 10
results, total = await hybrid_search(
@@ -317,7 +348,7 @@ async def test_hybrid_search_error_handling():
with patch(
"backend.api.features.store.hybrid_search.embed_query"
) as mock_embed:
mock_embed.return_value = [0.1] * 1536
mock_embed.return_value = [0.1] * embeddings.EMBEDDING_DIM
# Should raise exception
with pytest.raises(Exception) as exc_info:

View File

@@ -9,6 +9,7 @@ from backend.api.features.library.db import (
from backend.api.features.store.db import get_store_agent_details, get_store_agents
from backend.api.features.store.embeddings import (
backfill_missing_embeddings,
cleanup_orphaned_embeddings,
get_embedding_stats,
)
from backend.data import db
@@ -221,6 +222,7 @@ class DatabaseManager(AppService):
# Store Embeddings
get_embedding_stats = _(get_embedding_stats)
backfill_missing_embeddings = _(backfill_missing_embeddings)
cleanup_orphaned_embeddings = _(cleanup_orphaned_embeddings)
# Summary data - async
get_user_execution_summary_data = _(get_user_execution_summary_data)
@@ -276,6 +278,7 @@ class DatabaseManagerClient(AppServiceClient):
# Store Embeddings
get_embedding_stats = _(d.get_embedding_stats)
backfill_missing_embeddings = _(d.backfill_missing_embeddings)
cleanup_orphaned_embeddings = _(d.cleanup_orphaned_embeddings)
class DatabaseManagerAsyncClient(AppServiceClient):
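
For context, a hedged sketch of how a caller might use the newly exposed client methods; report_embedding_health is illustrative, and db_client is whatever instance get_database_manager_client() returns in the scheduler diff below:

def report_embedding_health(db_client) -> None:
    # db_client is the DatabaseManagerClient used by the scheduler.
    stats = db_client.get_embedding_stats()
    cleanup = db_client.cleanup_orphaned_embeddings()
    print("coverage:", stats["totals"]["coverage_percent"], "%")
    print("orphans deleted:", cleanup["totals"]["deleted"])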

View File

@@ -28,6 +28,7 @@ from backend.data.auth.oauth import cleanup_expired_oauth_tokens
from backend.data.block import BlockInput
from backend.data.execution import GraphExecutionWithNodes
from backend.data.model import CredentialsMetaInput
from backend.data.onboarding import increment_onboarding_runs
from backend.executor import utils as execution_utils
from backend.monitoring import (
NotificationJobArgs,
@@ -156,6 +157,7 @@ async def _execute_graph(**kwargs):
inputs=args.input_data,
graph_credentials_inputs=args.input_credentials,
)
await increment_onboarding_runs(args.user_id)
elapsed = asyncio.get_event_loop().time() - start_time
logger.info(
f"Graph execution started with ID {graph_exec.id} for graph {args.graph_id} "
@@ -255,14 +257,14 @@ def execution_accuracy_alerts():
def ensure_embeddings_coverage():
"""
Ensure approved store agents have embeddings for hybrid search.
Ensure all content types (store agents, blocks, docs) have embeddings for search.
Processes ALL missing embeddings in batches of 10 until 100% coverage.
Missing embeddings = agents invisible in hybrid search.
Processes ALL missing embeddings in batches of 10 per content type until 100% coverage.
Missing embeddings = content invisible in hybrid search.
Schedule: Runs every 6 hours (balanced between coverage and API costs).
- Catches agents approved between scheduled runs
- Batch size 10: gradual processing to avoid rate limits
- Catches new content added between scheduled runs
- Batch size 10 per content type: gradual processing to avoid rate limits
- Manual trigger available via execute_ensure_embeddings_coverage endpoint
"""
db_client = get_database_manager_client()
@@ -275,13 +277,27 @@ def ensure_embeddings_coverage():
)
return {"processed": 0, "success": 0, "failed": 0, "error": stats["error"]}
if stats["without_embeddings"] == 0:
logger.info("All approved agents have embeddings, skipping backfill")
# Extract totals from new stats structure
totals = stats.get("totals", {})
without_embeddings = totals.get("without_embeddings", 0)
coverage_percent = totals.get("coverage_percent", 0)
if without_embeddings == 0:
logger.info("All content has embeddings, skipping backfill")
return {"processed": 0, "success": 0, "failed": 0}
# Log per-content-type stats for visibility
by_type = stats.get("by_type", {})
for content_type, type_stats in by_type.items():
if type_stats.get("without_embeddings", 0) > 0:
logger.info(
f"{content_type}: {type_stats['without_embeddings']} items without embeddings "
f"({type_stats['coverage_percent']}% coverage)"
)
logger.info(
f"Found {stats['without_embeddings']} agents without embeddings "
f"({stats['coverage_percent']}% coverage) - processing all"
f"Total: {without_embeddings} items without embeddings "
f"({coverage_percent}% coverage) - processing all"
)
total_processed = 0
@@ -314,10 +330,33 @@ def ensure_embeddings_coverage():
f"Embedding backfill completed: {total_success}/{total_processed} succeeded, "
f"{total_failed} failed"
)
# Clean up orphaned embeddings for blocks and docs
logger.info("Running cleanup for orphaned embeddings (blocks/docs)...")
cleanup_result = db_client.cleanup_orphaned_embeddings()
cleanup_totals = cleanup_result.get("totals", {})
cleanup_deleted = cleanup_totals.get("deleted", 0)
if cleanup_deleted > 0:
logger.info(f"Cleanup completed: deleted {cleanup_deleted} orphaned embeddings")
by_type = cleanup_result.get("by_type", {})
for content_type, type_result in by_type.items():
if type_result.get("deleted", 0) > 0:
logger.info(
f"{content_type}: deleted {type_result['deleted']} orphaned embeddings"
)
else:
logger.info("Cleanup completed: no orphaned embeddings found")
return {
"processed": total_processed,
"success": total_success,
"failed": total_failed,
"backfill": {
"processed": total_processed,
"success": total_success,
"failed": total_failed,
},
"cleanup": {
"deleted": cleanup_deleted,
},
}
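
With the cleanup step wired in, the job's return value is now a nested summary along these lines (values illustrative):

result = {
    "backfill": {"processed": 12, "success": 11, "failed": 1},
    "cleanup": {"deleted": 3},
}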