Files
AutoGPT/autogpt_platform/backend/backend/api/features/builder/db.py
Otto 943a1df815 dx(backend): Make Builder and Marketplace search work without embeddings (#12479)
When OpenAI credentials are unavailable (fork PRs, dev envs without API
keys), both builder block search and store agent functionality break:

1. **Block search returns wrong results.** `unified_hybrid_search` falls
back to a zero vector when embedding generation fails. With ~200 blocks
in `UnifiedContentEmbedding`, the zero-vector semantic scores are
garbage, and lexical matching on short block names is too weak — "Store
Value" doesn't appear in the top results for query "Store Value".

2. **Store submission approval fails entirely.**
`review_store_submission` calls `ensure_embedding()` inside a
transaction. When it throws, the entire transaction rolls back — no
store submissions get approved, the `StoreAgent` materialized view stays
empty, and all marketplace e2e tests fail.

3. **Store search returns nothing.** Even when store data exists,
`hybrid_search` queries `UnifiedContentEmbedding` which has no store
agent rows (backfill failed). It succeeds with zero results rather than
throwing, so the existing exception-based fallback never triggers.

### Changes 🏗️

- Replace `unified_hybrid_search` with in-memory text search in
`_hybrid_search_blocks` (-> `_text_search_blocks`). All ~200 blocks are
already loaded in memory, and `_score_primary_fields` provides correct
deterministic text relevance scoring against block name, description,
and input schema field descriptions — the same rich text the embedding
pipeline uses. CamelCase block names are split via `split_camelcase()`
to match the tokenization from PR #12400.

- Make embedding generation in `review_store_submission` best-effort:
catch failures and log a warning instead of rolling back the approval
transaction. The backfill scheduler retries later when credentials
become available.

- Fall through to direct DB search when `hybrid_search` returns empty
results (not just when it throws). The fallback uses ad-hoc
`to_tsvector`/`plainto_tsquery` with `ts_rank_cd` ranking on
`StoreAgent` view fields, restoring the search quality of the original
pre-hybrid implementation (stemming, stop-word removal, relevance
ranking).

- Fix Playwright artifact upload in end-to-end test CI

### Checklist 📋

#### For code changes:
- [x] I have clearly listed my changes in the PR description
- [x] I have made a test plan
- [x] I have tested my changes according to the test plan:
  - [x] `build.spec.ts`: 8/8 pass locally (was 0/7 before fix)
  - [x] All 79 e2e tests pass in CI (was 15 failures before fix)

---
Co-authored-by: Reinier van der Leer (@Pwuts)

---------

Co-authored-by: Reinier van der Leer <pwuts@agpt.co>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 00:11:06 +00:00

762 lines
23 KiB
Python

import logging
from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import Any, Sequence, get_args, get_origin
import prisma
from prisma.models import mv_suggested_blocks
import backend.api.features.library.db as library_db
import backend.api.features.library.model as library_model
import backend.api.features.store.db as store_db
import backend.api.features.store.model as store_model
from backend.blocks import load_all_blocks
from backend.blocks._base import (
AnyBlockSchema,
BlockCategory,
BlockInfo,
BlockSchema,
BlockType,
)
from backend.blocks.llm import LlmModel
from backend.integrations.providers import ProviderName
from backend.util.cache import cached
from backend.util.models import Pagination
from backend.util.text import split_camelcase
from .model import (
BlockCategoryResponse,
BlockResponse,
BlockTypeFilter,
CountResponse,
FilterType,
Provider,
ProviderResponse,
SearchEntry,
)
logger = logging.getLogger(__name__)
llm_models = [name.name.lower().replace("_", " ") for name in LlmModel]
MAX_LIBRARY_AGENT_RESULTS = 100
MAX_MARKETPLACE_AGENT_RESULTS = 100
MIN_SCORE_FOR_FILTERED_RESULTS = 10.0
# Boost blocks over marketplace agents in search results
BLOCK_SCORE_BOOST = 50.0
# Block IDs to exclude from search results
EXCLUDED_BLOCK_IDS = frozenset(
{
"e189baac-8c20-45a1-94a7-55177ea42565", # AgentExecutorBlock
}
)
SearchResultItem = BlockInfo | library_model.LibraryAgent | store_model.StoreAgent
@dataclass
class _ScoredItem:
item: SearchResultItem
filter_type: FilterType
score: float
sort_key: str
@dataclass
class _SearchCacheEntry:
items: list[SearchResultItem]
total_items: dict[FilterType, int]
def get_block_categories(category_blocks: int = 3) -> list[BlockCategoryResponse]:
categories: dict[BlockCategory, BlockCategoryResponse] = {}
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
# Skip disabled and excluded blocks
if block.disabled or block.id in EXCLUDED_BLOCK_IDS:
continue
# Skip blocks that don't have categories (all should have at least one)
if not block.categories:
continue
# Add block to the categories
for category in block.categories:
if category not in categories:
categories[category] = BlockCategoryResponse(
name=category.name.lower(),
total_blocks=0,
blocks=[],
)
categories[category].total_blocks += 1
# Append if the category has less than the specified number of blocks
if len(categories[category].blocks) < category_blocks:
categories[category].blocks.append(block.get_info())
# Sort categories by name
return sorted(categories.values(), key=lambda x: x.name)
def get_blocks(
*,
category: str | None = None,
type: BlockTypeFilter | None = None,
provider: ProviderName | None = None,
page: int = 1,
page_size: int = 50,
) -> BlockResponse:
"""
Get blocks based on either category, type or provider.
Providing nothing fetches all block types.
"""
# Only one of category, type, or provider can be specified
if (category and type) or (category and provider) or (type and provider):
raise ValueError("Only one of category, type, or provider can be specified")
blocks: list[AnyBlockSchema] = []
skip = (page - 1) * page_size
take = page_size
total = 0
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
# Skip disabled blocks
if block.disabled:
continue
# Skip excluded blocks
if block.id in EXCLUDED_BLOCK_IDS:
continue
# Skip blocks that don't match the category
if category and category not in {c.name.lower() for c in block.categories}:
continue
# Skip blocks that don't match the type
if (
(type == "input" and block.block_type.value != "Input")
or (type == "output" and block.block_type.value != "Output")
or (type == "action" and block.block_type.value in ("Input", "Output"))
):
continue
# Skip blocks that don't match the provider
if provider:
credentials_info = block.input_schema.get_credentials_fields_info().values()
if not any(provider in info.provider for info in credentials_info):
continue
total += 1
if skip > 0:
skip -= 1
continue
if take > 0:
take -= 1
blocks.append(block)
return BlockResponse(
blocks=[b.get_info() for b in blocks],
pagination=Pagination(
total_items=total,
total_pages=(total + page_size - 1) // page_size,
current_page=page,
page_size=page_size,
),
)
def get_block_by_id(block_id: str) -> BlockInfo | None:
"""
Get a specific block by its ID.
"""
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
if block.id == block_id:
return block.get_info()
return None
async def update_search(user_id: str, search: SearchEntry) -> str:
"""
Upsert a search request for the user and return the search ID.
"""
if search.search_id:
# Update existing search
await prisma.models.BuilderSearchHistory.prisma().update(
where={
"id": search.search_id,
},
data={
"searchQuery": search.search_query or "",
"filter": search.filter or [], # type: ignore
"byCreator": search.by_creator or [],
},
)
return search.search_id
else:
# Create new search
new_search = await prisma.models.BuilderSearchHistory.prisma().create(
data={
"userId": user_id,
"searchQuery": search.search_query or "",
"filter": search.filter or [], # type: ignore
"byCreator": search.by_creator or [],
}
)
return new_search.id
async def get_recent_searches(user_id: str, limit: int = 5) -> list[SearchEntry]:
"""
Get the user's most recent search requests.
"""
searches = await prisma.models.BuilderSearchHistory.prisma().find_many(
where={
"userId": user_id,
},
order={
"updatedAt": "desc",
},
take=limit,
)
return [
SearchEntry(
search_query=s.searchQuery,
filter=s.filter, # type: ignore
by_creator=s.byCreator,
search_id=s.id,
)
for s in searches
]
async def get_sorted_search_results(
*,
user_id: str,
search_query: str | None,
filters: Sequence[FilterType],
by_creator: Sequence[str] | None = None,
) -> _SearchCacheEntry:
normalized_filters: tuple[FilterType, ...] = tuple(sorted(set(filters or [])))
normalized_creators: tuple[str, ...] = tuple(sorted(set(by_creator or [])))
return await _build_cached_search_results(
user_id=user_id,
search_query=search_query or "",
filters=normalized_filters,
by_creator=normalized_creators,
)
@cached(ttl_seconds=300, shared_cache=True)
async def _build_cached_search_results(
user_id: str,
search_query: str,
filters: tuple[FilterType, ...],
by_creator: tuple[str, ...],
) -> _SearchCacheEntry:
normalized_query = (search_query or "").strip().lower()
include_blocks = "blocks" in filters
include_integrations = "integrations" in filters
include_library_agents = "my_agents" in filters
include_marketplace_agents = "marketplace_agents" in filters
scored_items: list[_ScoredItem] = []
total_items: dict[FilterType, int] = {
"blocks": 0,
"integrations": 0,
"marketplace_agents": 0,
"my_agents": 0,
}
# Use hybrid search when query is present, otherwise list all blocks
if (include_blocks or include_integrations) and normalized_query:
block_results, block_total, integration_total = await _text_search_blocks(
query=search_query,
include_blocks=include_blocks,
include_integrations=include_integrations,
)
scored_items.extend(block_results)
total_items["blocks"] = block_total
total_items["integrations"] = integration_total
elif include_blocks or include_integrations:
# No query - list all blocks using in-memory approach
block_results, block_total, integration_total = _collect_block_results(
include_blocks=include_blocks,
include_integrations=include_integrations,
)
scored_items.extend(block_results)
total_items["blocks"] = block_total
total_items["integrations"] = integration_total
if include_library_agents:
library_response = await library_db.list_library_agents(
user_id=user_id,
search_term=search_query or None,
page=1,
page_size=MAX_LIBRARY_AGENT_RESULTS,
)
total_items["my_agents"] = library_response.pagination.total_items
scored_items.extend(
_build_library_items(
agents=library_response.agents,
normalized_query=normalized_query,
)
)
if include_marketplace_agents:
marketplace_response = await store_db.get_store_agents(
creators=list(by_creator) or None,
search_query=search_query or None,
page=1,
page_size=MAX_MARKETPLACE_AGENT_RESULTS,
)
total_items["marketplace_agents"] = marketplace_response.pagination.total_items
scored_items.extend(
_build_marketplace_items(
agents=marketplace_response.agents,
normalized_query=normalized_query,
)
)
sorted_items = sorted(
scored_items,
key=lambda entry: (-entry.score, entry.sort_key, entry.filter_type),
)
return _SearchCacheEntry(
items=[entry.item for entry in sorted_items],
total_items=total_items,
)
def _collect_block_results(
*,
include_blocks: bool,
include_integrations: bool,
) -> tuple[list[_ScoredItem], int, int]:
"""
Collect all blocks for listing (no search query).
All blocks get BLOCK_SCORE_BOOST to prioritize them over marketplace agents.
"""
results: list[_ScoredItem] = []
block_count = 0
integration_count = 0
if not include_blocks and not include_integrations:
return results, block_count, integration_count
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
if block.disabled:
continue
# Skip excluded blocks
if block.id in EXCLUDED_BLOCK_IDS:
continue
block_info = block.get_info()
credentials = list(block.input_schema.get_credentials_fields().values())
is_integration = len(credentials) > 0
if is_integration and not include_integrations:
continue
if not is_integration and not include_blocks:
continue
filter_type: FilterType = "integrations" if is_integration else "blocks"
if is_integration:
integration_count += 1
else:
block_count += 1
results.append(
_ScoredItem(
item=block_info,
filter_type=filter_type,
score=BLOCK_SCORE_BOOST,
sort_key=block_info.name.lower(),
)
)
return results, block_count, integration_count
async def _text_search_blocks(
*,
query: str,
include_blocks: bool,
include_integrations: bool,
) -> tuple[list[_ScoredItem], int, int]:
"""
Search blocks using in-memory text matching over the block registry.
All blocks are already loaded in memory, so this is fast and reliable
regardless of whether OpenAI embeddings are available.
Scoring:
- Base: text relevance via _score_primary_fields, plus BLOCK_SCORE_BOOST
to prioritize blocks over marketplace agents in combined results
- +20 if the block has an LlmModel field and the query matches an LLM model name
"""
results: list[_ScoredItem] = []
if not include_blocks and not include_integrations:
return results, 0, 0
normalized_query = query.strip().lower()
all_results, _, _ = _collect_block_results(
include_blocks=include_blocks,
include_integrations=include_integrations,
)
all_blocks = load_all_blocks()
for item in all_results:
block_info = item.item
assert isinstance(block_info, BlockInfo)
name = split_camelcase(block_info.name).lower()
# Build rich description including input field descriptions,
# matching the searchable text that the embedding pipeline uses
desc_parts = [block_info.description or ""]
block_cls = all_blocks.get(block_info.id)
if block_cls is not None:
block: AnyBlockSchema = block_cls()
desc_parts += [
f"{f}: {info.description}"
for f, info in block.input_schema.model_fields.items()
if info.description
]
description = " ".join(desc_parts).lower()
score = _score_primary_fields(name, description, normalized_query)
# Add LLM model match bonus
if block_cls is not None and _matches_llm_model(
block_cls().input_schema, normalized_query
):
score += 20
if score >= MIN_SCORE_FOR_FILTERED_RESULTS:
results.append(
_ScoredItem(
item=block_info,
filter_type=item.filter_type,
score=score + BLOCK_SCORE_BOOST,
sort_key=name,
)
)
block_count = sum(1 for r in results if r.filter_type == "blocks")
integration_count = sum(1 for r in results if r.filter_type == "integrations")
return results, block_count, integration_count
def _build_library_items(
*,
agents: list[library_model.LibraryAgent],
normalized_query: str,
) -> list[_ScoredItem]:
results: list[_ScoredItem] = []
for agent in agents:
score = _score_library_agent(agent, normalized_query)
if not _should_include_item(score, normalized_query):
continue
results.append(
_ScoredItem(
item=agent,
filter_type="my_agents",
score=score,
sort_key=_get_item_name(agent),
)
)
return results
def _build_marketplace_items(
*,
agents: list[store_model.StoreAgent],
normalized_query: str,
) -> list[_ScoredItem]:
results: list[_ScoredItem] = []
for agent in agents:
score = _score_store_agent(agent, normalized_query)
if not _should_include_item(score, normalized_query):
continue
results.append(
_ScoredItem(
item=agent,
filter_type="marketplace_agents",
score=score,
sort_key=_get_item_name(agent),
)
)
return results
def get_providers(
query: str = "",
page: int = 1,
page_size: int = 50,
) -> ProviderResponse:
providers = []
query = query.lower()
skip = (page - 1) * page_size
take = page_size
all_providers = _get_all_providers()
for provider in all_providers.values():
if (
query not in provider.name.value.lower()
and query not in provider.description.lower()
):
continue
if skip > 0:
skip -= 1
continue
if take > 0:
take -= 1
providers.append(provider)
total = len(all_providers)
return ProviderResponse(
providers=providers,
pagination=Pagination(
total_items=total,
total_pages=(total + page_size - 1) // page_size,
current_page=page,
page_size=page_size,
),
)
async def get_counts(user_id: str) -> CountResponse:
my_agents = await prisma.models.LibraryAgent.prisma().count(
where={
"userId": user_id,
"isDeleted": False,
"isArchived": False,
}
)
counts = await _get_static_counts()
return CountResponse(
my_agents=my_agents,
**counts,
)
@cached(ttl_seconds=3600)
async def _get_static_counts():
"""
Get counts of blocks, integrations, and marketplace agents.
This is cached to avoid unnecessary database queries and calculations.
"""
all_blocks = 0
input_blocks = 0
action_blocks = 0
output_blocks = 0
integrations = 0
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
if block.disabled:
continue
if block.id in EXCLUDED_BLOCK_IDS:
continue
all_blocks += 1
if block.block_type.value == "Input":
input_blocks += 1
elif block.block_type.value == "Output":
output_blocks += 1
else:
action_blocks += 1
credentials = list(block.input_schema.get_credentials_fields().values())
if len(credentials) > 0:
integrations += 1
marketplace_agents = await prisma.models.StoreAgent.prisma().count()
return {
"all_blocks": all_blocks,
"input_blocks": input_blocks,
"action_blocks": action_blocks,
"output_blocks": output_blocks,
"integrations": integrations,
"marketplace_agents": marketplace_agents,
}
def _contains_type(annotation: Any, target: type) -> bool:
"""Check if an annotation is or contains the target type (handles Optional/Union/Annotated)."""
if annotation is target:
return True
origin = get_origin(annotation)
if origin is None:
return False
return any(_contains_type(arg, target) for arg in get_args(annotation))
def _matches_llm_model(schema_cls: type[BlockSchema], query: str) -> bool:
for field in schema_cls.model_fields.values():
if _contains_type(field.annotation, LlmModel):
# Check if query matches any value in llm_models
if any(query in name for name in llm_models):
return True
return False
def _score_library_agent(
agent: library_model.LibraryAgent,
normalized_query: str,
) -> float:
if not normalized_query:
return 0.0
name = agent.name.lower()
description = (agent.description or "").lower()
instructions = (agent.instructions or "").lower()
score = _score_primary_fields(name, description, normalized_query)
score += _score_additional_field(instructions, normalized_query, 15, 6)
score += _score_additional_field(
agent.creator_name.lower(), normalized_query, 10, 5
)
return score
def _score_store_agent(
agent: store_model.StoreAgent,
normalized_query: str,
) -> float:
if not normalized_query:
return 0.0
name = agent.agent_name.lower()
description = agent.description.lower()
sub_heading = agent.sub_heading.lower()
score = _score_primary_fields(name, description, normalized_query)
score += _score_additional_field(sub_heading, normalized_query, 12, 6)
score += _score_additional_field(agent.creator.lower(), normalized_query, 10, 5)
return score
def _score_primary_fields(name: str, description: str, query: str) -> float:
score = 0.0
if name == query:
score += 120
elif name.startswith(query):
score += 90
elif query in name:
score += 60
score += SequenceMatcher(None, name, query).ratio() * 50
if description:
if query in description:
score += 30
score += SequenceMatcher(None, description, query).ratio() * 25
return score
def _score_additional_field(
value: str,
query: str,
contains_weight: float,
similarity_weight: float,
) -> float:
if not value or not query:
return 0.0
score = 0.0
if query in value:
score += contains_weight
score += SequenceMatcher(None, value, query).ratio() * similarity_weight
return score
def _should_include_item(score: float, normalized_query: str) -> bool:
if not normalized_query:
return True
return score >= MIN_SCORE_FOR_FILTERED_RESULTS
def _get_item_name(item: SearchResultItem) -> str:
if isinstance(item, BlockInfo):
return item.name.lower()
if isinstance(item, library_model.LibraryAgent):
return item.name.lower()
return item.agent_name.lower()
@cached(ttl_seconds=3600)
def _get_all_providers() -> dict[ProviderName, Provider]:
providers: dict[ProviderName, Provider] = {}
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
if block.disabled:
continue
credentials_info = block.input_schema.get_credentials_fields_info().values()
for info in credentials_info:
for provider in info.provider: # provider is a ProviderName enum member
if provider in providers:
providers[provider].integration_count += 1
else:
providers[provider] = Provider(
name=provider, description="", integration_count=1
)
return providers
@cached(ttl_seconds=3600, shared_cache=True)
async def get_suggested_blocks(count: int = 5) -> list[BlockInfo]:
"""Return the most-executed blocks from the last 14 days.
Queries the mv_suggested_blocks materialized view (refreshed hourly via pg_cron)
and returns the top `count` blocks sorted by execution count, excluding
Input/Output/Agent block types and blocks in EXCLUDED_BLOCK_IDS.
"""
results = await mv_suggested_blocks.prisma().find_many()
# Get the top blocks based on execution count
# But ignore Input, Output, Agent, and excluded blocks
blocks: list[tuple[BlockInfo, int]] = []
execution_counts = {row.block_id: row.execution_count for row in results}
for block_type in load_all_blocks().values():
block: AnyBlockSchema = block_type()
if block.disabled or block.block_type in (
BlockType.INPUT,
BlockType.OUTPUT,
BlockType.AGENT,
):
continue
if block.id in EXCLUDED_BLOCK_IDS:
continue
execution_count = execution_counts.get(block.id, 0)
blocks.append((block.get_info(), execution_count))
# Sort blocks by execution count
blocks.sort(key=lambda x: x[1], reverse=True)
suggested_blocks = [block[0] for block in blocks]
# Return the top blocks
return suggested_blocks[:count]