mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-03-17 03:00:27 -04:00
Compare commits
59 Commits
fix/copilo
...
fix/block-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dc8f176a05 | ||
|
|
d2e54d51e4 | ||
|
|
40f5532b1e | ||
|
|
a08673cdbe | ||
|
|
c87e954644 | ||
|
|
349b9fc009 | ||
|
|
f2f48e98c8 | ||
|
|
dbad60933d | ||
|
|
d13ebbf64b | ||
|
|
ec0d9ab6ff | ||
|
|
e706c5afc2 | ||
|
|
93fc88ca1e | ||
|
|
28ff7b6057 | ||
|
|
378bd3afcc | ||
|
|
19e79dd236 | ||
|
|
f5f4c7d8f9 | ||
|
|
7d869d8288 | ||
|
|
40df7165d0 | ||
|
|
a56dc42a59 | ||
|
|
288ced743b | ||
|
|
41e2e80f60 | ||
|
|
c3eac2a6af | ||
|
|
5b6b68e469 | ||
|
|
296372b8b9 | ||
|
|
ee8896c818 | ||
|
|
699ecc8cec | ||
|
|
211be3aff1 | ||
|
|
3120981e4b | ||
|
|
5966d3669d | ||
|
|
c81ab1fc3b | ||
|
|
5446c7f18f | ||
|
|
2b0c9ba703 | ||
|
|
195c7011ae | ||
|
|
d4944fb22b | ||
|
|
a5ed8fefa9 | ||
|
|
a52a777b29 | ||
|
|
8bec7a6933 | ||
|
|
e73791efed | ||
|
|
2d161ce2b9 | ||
|
|
6fc4989654 | ||
|
|
976443bf6e | ||
|
|
4ceb15b3f1 | ||
|
|
3096f94996 | ||
|
|
6f90729612 | ||
|
|
ebf89dde8b | ||
|
|
5d057e97e5 | ||
|
|
1d2f641a26 | ||
|
|
dcb71ab0b9 | ||
|
|
8136b90860 | ||
|
|
4d179a7c37 | ||
|
|
f78adcdc65 | ||
|
|
40388b7520 | ||
|
|
dd7be1158b | ||
|
|
c0e59f0a6b | ||
|
|
104d1f1bf4 | ||
|
|
d9e9cd4c98 | ||
|
|
ca416300ec | ||
|
|
c589cd0c43 | ||
|
|
b6d863fcd2 |
@@ -5,16 +5,26 @@ Pluggable system for different content sources (store agents, blocks, docs).
|
||||
Each handler knows how to fetch and process its content type for embedding.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import itertools
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, get_args, get_origin
|
||||
from typing import TYPE_CHECKING, Any, get_args, get_origin
|
||||
|
||||
from prisma.enums import ContentType
|
||||
|
||||
from backend.blocks import get_blocks
|
||||
from backend.blocks.llm import LlmModel
|
||||
from backend.data.db import query_raw_with_schema
|
||||
from backend.util.text import split_camelcase
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from backend.blocks._base import AnyBlockSchema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -154,6 +164,28 @@ class StoreAgentHandler(ContentHandler):
|
||||
}
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=1)
|
||||
def _get_enabled_blocks() -> dict[str, AnyBlockSchema]:
|
||||
"""Return ``{block_id: block_instance}`` for all enabled, instantiable blocks.
|
||||
|
||||
Disabled blocks and blocks that fail to instantiate are silently skipped
|
||||
(with a warning log), so callers never need their own try/except loop.
|
||||
|
||||
Results are cached for the process lifetime via ``lru_cache`` because
|
||||
blocks are registered at import time and never change while running.
|
||||
"""
|
||||
enabled: dict[str, AnyBlockSchema] = {}
|
||||
for block_id, block_cls in get_blocks().items():
|
||||
try:
|
||||
instance = block_cls()
|
||||
except Exception as e:
|
||||
logger.warning(f"Skipping block {block_id}: init failed: {e}")
|
||||
continue
|
||||
if not instance.disabled:
|
||||
enabled[block_id] = instance
|
||||
return enabled
|
||||
|
||||
|
||||
class BlockHandler(ContentHandler):
|
||||
"""Handler for block definitions (Python classes)."""
|
||||
|
||||
@@ -163,16 +195,14 @@ class BlockHandler(ContentHandler):
|
||||
|
||||
async def get_missing_items(self, batch_size: int) -> list[ContentItem]:
|
||||
"""Fetch blocks without embeddings."""
|
||||
from backend.blocks import get_blocks
|
||||
|
||||
# Get all available blocks
|
||||
all_blocks = get_blocks()
|
||||
|
||||
# Check which ones have embeddings
|
||||
if not all_blocks:
|
||||
# to_thread keeps the first (heavy) call off the event loop. On
|
||||
# subsequent calls the lru_cache makes this a dict lookup, so the
|
||||
# thread-pool overhead is negligible compared to the DB queries below.
|
||||
enabled = await asyncio.to_thread(_get_enabled_blocks)
|
||||
if not enabled:
|
||||
return []
|
||||
|
||||
block_ids = list(all_blocks.keys())
|
||||
block_ids = list(enabled.keys())
|
||||
|
||||
# Query for existing embeddings
|
||||
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
|
||||
@@ -187,52 +217,42 @@ class BlockHandler(ContentHandler):
|
||||
)
|
||||
|
||||
existing_ids = {row["contentId"] for row in existing_result}
|
||||
missing_blocks = [
|
||||
(block_id, block_cls)
|
||||
for block_id, block_cls in all_blocks.items()
|
||||
if block_id not in existing_ids
|
||||
]
|
||||
|
||||
# Convert to ContentItem
|
||||
# Convert to ContentItem — disabled filtering already done by
|
||||
# _get_enabled_blocks so batch_size won't be exhausted by disabled blocks.
|
||||
missing = ((bid, b) for bid, b in enabled.items() if bid not in existing_ids)
|
||||
items = []
|
||||
for block_id, block_cls in missing_blocks[:batch_size]:
|
||||
for block_id, block in itertools.islice(missing, batch_size):
|
||||
try:
|
||||
block_instance = block_cls()
|
||||
|
||||
if block_instance.disabled:
|
||||
continue
|
||||
|
||||
# Build searchable text from block metadata
|
||||
parts = []
|
||||
if block_instance.name:
|
||||
parts.append(block_instance.name)
|
||||
if block_instance.description:
|
||||
parts.append(block_instance.description)
|
||||
if block_instance.categories:
|
||||
parts.append(
|
||||
" ".join(str(cat.value) for cat in block_instance.categories)
|
||||
if not block.name:
|
||||
logger.warning(
|
||||
f"Block {block_id} has no name — using block_id as fallback"
|
||||
)
|
||||
display_name = split_camelcase(block.name) if block.name else ""
|
||||
parts = []
|
||||
if display_name:
|
||||
parts.append(display_name)
|
||||
if block.description:
|
||||
parts.append(block.description)
|
||||
if block.categories:
|
||||
parts.append(" ".join(str(cat.value) for cat in block.categories))
|
||||
|
||||
# Add input schema field descriptions
|
||||
block_input_fields = block_instance.input_schema.model_fields
|
||||
parts += [
|
||||
f"{field_name}: {field_info.description}"
|
||||
for field_name, field_info in block_input_fields.items()
|
||||
for field_name, field_info in block.input_schema.model_fields.items()
|
||||
if field_info.description
|
||||
]
|
||||
|
||||
searchable_text = " ".join(parts)
|
||||
|
||||
categories_list = (
|
||||
[cat.value for cat in block_instance.categories]
|
||||
if block_instance.categories
|
||||
else []
|
||||
[cat.value for cat in block.categories] if block.categories else []
|
||||
)
|
||||
|
||||
# Extract provider names from credentials fields
|
||||
credentials_info = (
|
||||
block_instance.input_schema.get_credentials_fields_info()
|
||||
)
|
||||
credentials_info = block.input_schema.get_credentials_fields_info()
|
||||
is_integration = len(credentials_info) > 0
|
||||
provider_names = [
|
||||
provider.value.lower()
|
||||
@@ -243,7 +263,7 @@ class BlockHandler(ContentHandler):
|
||||
# Check if block has LlmModel field in input schema
|
||||
has_llm_model_field = any(
|
||||
_contains_type(field.annotation, LlmModel)
|
||||
for field in block_instance.input_schema.model_fields.values()
|
||||
for field in block.input_schema.model_fields.values()
|
||||
)
|
||||
|
||||
items.append(
|
||||
@@ -252,13 +272,13 @@ class BlockHandler(ContentHandler):
|
||||
content_type=ContentType.BLOCK,
|
||||
searchable_text=searchable_text,
|
||||
metadata={
|
||||
"name": block_instance.name,
|
||||
"name": display_name or block.name or block_id,
|
||||
"categories": categories_list,
|
||||
"providers": provider_names,
|
||||
"has_llm_model_field": has_llm_model_field,
|
||||
"is_integration": is_integration,
|
||||
},
|
||||
user_id=None, # Blocks are public
|
||||
user_id=None,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -269,22 +289,13 @@ class BlockHandler(ContentHandler):
|
||||
|
||||
async def get_stats(self) -> dict[str, int]:
|
||||
"""Get statistics about block embedding coverage."""
|
||||
from backend.blocks import get_blocks
|
||||
|
||||
all_blocks = get_blocks()
|
||||
|
||||
# Filter out disabled blocks - they're not indexed
|
||||
enabled_block_ids = [
|
||||
block_id
|
||||
for block_id, block_cls in all_blocks.items()
|
||||
if not block_cls().disabled
|
||||
]
|
||||
total_blocks = len(enabled_block_ids)
|
||||
enabled = await asyncio.to_thread(_get_enabled_blocks)
|
||||
total_blocks = len(enabled)
|
||||
|
||||
if total_blocks == 0:
|
||||
return {"total": 0, "with_embeddings": 0, "without_embeddings": 0}
|
||||
|
||||
block_ids = enabled_block_ids
|
||||
block_ids = list(enabled.keys())
|
||||
placeholders = ",".join([f"${i+1}" for i in range(len(block_ids))])
|
||||
|
||||
embedded_result = await query_raw_with_schema(
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
"""
|
||||
E2E tests for content handlers (blocks, store agents, documentation).
|
||||
|
||||
Tests the full flow: discovering content → generating embeddings → storing.
|
||||
Tests for content handlers (blocks, store agents, documentation).
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
@@ -15,15 +13,103 @@ from backend.api.features.store.content_handlers import (
|
||||
BlockHandler,
|
||||
DocumentationHandler,
|
||||
StoreAgentHandler,
|
||||
_get_enabled_blocks,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _clear_block_cache():
|
||||
"""Clear the lru_cache on _get_enabled_blocks before each test."""
|
||||
_get_enabled_blocks.cache_clear()
|
||||
yield
|
||||
_get_enabled_blocks.cache_clear()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper to build a mock block class that returns a pre-configured instance
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_block_class(
|
||||
*,
|
||||
name: str = "Block",
|
||||
description: str = "",
|
||||
disabled: bool = False,
|
||||
categories: list[MagicMock] | None = None,
|
||||
fields: dict[str, str] | None = None,
|
||||
raise_on_init: Exception | None = None,
|
||||
) -> MagicMock:
|
||||
cls = MagicMock()
|
||||
if raise_on_init is not None:
|
||||
cls.side_effect = raise_on_init
|
||||
return cls
|
||||
inst = MagicMock()
|
||||
inst.name = name
|
||||
inst.disabled = disabled
|
||||
inst.description = description
|
||||
inst.categories = categories or []
|
||||
field_mocks = {
|
||||
fname: MagicMock(description=fdesc) for fname, fdesc in (fields or {}).items()
|
||||
}
|
||||
inst.input_schema.model_fields = field_mocks
|
||||
inst.input_schema.get_credentials_fields_info.return_value = {}
|
||||
cls.return_value = inst
|
||||
return cls
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _get_enabled_blocks
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_get_enabled_blocks_filters_disabled():
|
||||
"""Disabled blocks are excluded."""
|
||||
blocks = {
|
||||
"enabled": _make_block_class(name="E", disabled=False),
|
||||
"disabled": _make_block_class(name="D", disabled=True),
|
||||
}
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
result = _get_enabled_blocks()
|
||||
assert list(result.keys()) == ["enabled"]
|
||||
|
||||
|
||||
def test_get_enabled_blocks_skips_broken():
|
||||
"""Blocks that raise on init are skipped without crashing."""
|
||||
blocks = {
|
||||
"good": _make_block_class(name="Good"),
|
||||
"bad": _make_block_class(raise_on_init=RuntimeError("boom")),
|
||||
}
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
result = _get_enabled_blocks()
|
||||
assert list(result.keys()) == ["good"]
|
||||
|
||||
|
||||
def test_get_enabled_blocks_cached():
|
||||
"""_get_enabled_blocks() calls get_blocks() only once across multiple calls."""
|
||||
blocks = {"b1": _make_block_class(name="B1")}
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
) as mock_get_blocks:
|
||||
result1 = _get_enabled_blocks()
|
||||
result2 = _get_enabled_blocks()
|
||||
assert result1 is result2
|
||||
mock_get_blocks.assert_called_once()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# StoreAgentHandler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_store_agent_handler_get_missing_items(mocker):
|
||||
"""Test StoreAgentHandler fetches approved agents without embeddings."""
|
||||
handler = StoreAgentHandler()
|
||||
|
||||
# Mock database query
|
||||
mock_missing = [
|
||||
{
|
||||
"id": "agent-1",
|
||||
@@ -54,9 +140,7 @@ async def test_store_agent_handler_get_stats(mocker):
|
||||
"""Test StoreAgentHandler returns correct stats."""
|
||||
handler = StoreAgentHandler()
|
||||
|
||||
# Mock approved count query
|
||||
mock_approved = [{"count": 50}]
|
||||
# Mock embedded count query
|
||||
mock_embedded = [{"count": 30}]
|
||||
|
||||
with patch(
|
||||
@@ -70,74 +154,130 @@ async def test_store_agent_handler_get_stats(mocker):
|
||||
assert stats["without_embeddings"] == 20
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# BlockHandler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_get_missing_items(mocker):
|
||||
async def test_block_handler_get_missing_items():
|
||||
"""Test BlockHandler discovers blocks without embeddings."""
|
||||
handler = BlockHandler()
|
||||
|
||||
# Mock get_blocks to return test blocks
|
||||
mock_block_class = MagicMock()
|
||||
mock_block_instance = MagicMock()
|
||||
mock_block_instance.name = "Calculator Block"
|
||||
mock_block_instance.description = "Performs calculations"
|
||||
mock_block_instance.categories = [MagicMock(value="MATH")]
|
||||
mock_block_instance.disabled = False
|
||||
mock_field = MagicMock()
|
||||
mock_field.description = "Math expression to evaluate"
|
||||
mock_block_instance.input_schema.model_fields = {"expression": mock_field}
|
||||
mock_block_instance.input_schema.get_credentials_fields_info.return_value = {}
|
||||
mock_block_class.return_value = mock_block_instance
|
||||
|
||||
mock_blocks = {"block-uuid-1": mock_block_class}
|
||||
|
||||
# Mock existing embeddings query (no embeddings exist)
|
||||
mock_existing = []
|
||||
blocks = {
|
||||
"block-uuid-1": _make_block_class(
|
||||
name="CalculatorBlock",
|
||||
description="Performs calculations",
|
||||
categories=[MagicMock(value="MATH")],
|
||||
fields={"expression": "Math expression to evaluate"},
|
||||
),
|
||||
}
|
||||
|
||||
with patch(
|
||||
"backend.blocks.get_blocks",
|
||||
return_value=mock_blocks,
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=mock_existing,
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
assert items[0].content_id == "block-uuid-1"
|
||||
assert items[0].content_type == ContentType.BLOCK
|
||||
# CamelCase should be split in searchable text and metadata name
|
||||
assert "Calculator Block" in items[0].searchable_text
|
||||
assert "Performs calculations" in items[0].searchable_text
|
||||
assert "MATH" in items[0].searchable_text
|
||||
assert "expression: Math expression" in items[0].searchable_text
|
||||
assert items[0].metadata["name"] == "Calculator Block"
|
||||
assert items[0].user_id is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_get_stats(mocker):
|
||||
async def test_block_handler_get_missing_items_splits_camelcase():
|
||||
"""CamelCase block names are split for better search indexing."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {
|
||||
"ai-block": _make_block_class(name="AITextGeneratorBlock"),
|
||||
}
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
assert "AI Text Generator Block" in items[0].searchable_text
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_get_missing_items_batch_size_zero():
|
||||
"""batch_size=0 returns an empty list; the DB is still queried to find missing IDs."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {"b1": _make_block_class(name="B1")}
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
) as mock_query:
|
||||
items = await handler.get_missing_items(batch_size=0)
|
||||
assert items == []
|
||||
# DB query is still issued to learn which blocks lack embeddings;
|
||||
# the empty result comes from itertools.islice limiting to 0 items.
|
||||
mock_query.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_disabled_dont_exhaust_batch():
|
||||
"""Disabled blocks don't consume batch budget, so enabled blocks get indexed."""
|
||||
handler = BlockHandler()
|
||||
|
||||
# 5 disabled + 3 enabled, batch_size=2
|
||||
blocks = {
|
||||
**{
|
||||
f"dis-{i}": _make_block_class(name=f"D{i}", disabled=True) for i in range(5)
|
||||
},
|
||||
**{f"en-{i}": _make_block_class(name=f"E{i}") for i in range(3)},
|
||||
}
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=2)
|
||||
|
||||
assert len(items) == 2
|
||||
assert all(item.content_id.startswith("en-") for item in items)
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_get_stats():
|
||||
"""Test BlockHandler returns correct stats."""
|
||||
handler = BlockHandler()
|
||||
|
||||
# Mock get_blocks - each block class returns an instance with disabled=False
|
||||
def make_mock_block_class():
|
||||
mock_class = MagicMock()
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.disabled = False
|
||||
mock_class.return_value = mock_instance
|
||||
return mock_class
|
||||
|
||||
mock_blocks = {
|
||||
"block-1": make_mock_block_class(),
|
||||
"block-2": make_mock_block_class(),
|
||||
"block-3": make_mock_block_class(),
|
||||
blocks = {
|
||||
"block-1": _make_block_class(name="B1"),
|
||||
"block-2": _make_block_class(name="B2"),
|
||||
"block-3": _make_block_class(name="B3"),
|
||||
}
|
||||
|
||||
# Mock embedded count query (2 blocks have embeddings)
|
||||
mock_embedded = [{"count": 2}]
|
||||
|
||||
with patch(
|
||||
"backend.blocks.get_blocks",
|
||||
return_value=mock_blocks,
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
@@ -150,21 +290,123 @@ async def test_block_handler_get_stats(mocker):
|
||||
assert stats["without_embeddings"] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_get_stats_skips_broken():
|
||||
"""get_stats skips broken blocks instead of crashing."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {
|
||||
"good": _make_block_class(name="Good"),
|
||||
"bad": _make_block_class(raise_on_init=RuntimeError("boom")),
|
||||
}
|
||||
|
||||
mock_embedded = [{"count": 1}]
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=mock_embedded,
|
||||
):
|
||||
stats = await handler.get_stats()
|
||||
|
||||
assert stats["total"] == 1 # only the good block
|
||||
assert stats["with_embeddings"] == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_handles_none_name():
|
||||
"""When block.name is None the fallback display name logic is used."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {
|
||||
"none-name-block": _make_block_class(
|
||||
name="placeholder", # will be overridden to None below
|
||||
description="A block with no name",
|
||||
),
|
||||
}
|
||||
# Override the name to None after construction so _make_block_class
|
||||
# doesn't interfere with the mock wiring.
|
||||
blocks["none-name-block"].return_value.name = None
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
# display_name should be "" because block.name is None
|
||||
# searchable_text should still contain the description
|
||||
assert "A block with no name" in items[0].searchable_text
|
||||
# metadata["name"] falls back to block_id when both display_name
|
||||
# and block.name are falsy, ensuring it is always a non-empty string.
|
||||
assert items[0].metadata["name"] == "none-name-block"
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_handles_empty_attributes():
|
||||
"""Test BlockHandler handles blocks with empty/falsy attribute values."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {"block-minimal": _make_block_class(name="Minimal Block")}
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
assert items[0].searchable_text == "Minimal Block"
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_skips_failed_blocks():
|
||||
"""Test BlockHandler skips blocks that fail to instantiate."""
|
||||
handler = BlockHandler()
|
||||
|
||||
blocks = {
|
||||
"good-block": _make_block_class(name="Good Block", description="Works fine"),
|
||||
"bad-block": _make_block_class(raise_on_init=Exception("Instantiation failed")),
|
||||
}
|
||||
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.get_blocks", return_value=blocks
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
assert items[0].content_id == "good-block"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DocumentationHandler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_documentation_handler_get_missing_items(tmp_path, mocker):
|
||||
"""Test DocumentationHandler discovers docs without embeddings."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
# Create temporary docs directory with test files
|
||||
docs_root = tmp_path / "docs"
|
||||
docs_root.mkdir()
|
||||
|
||||
(docs_root / "guide.md").write_text("# Getting Started\n\nThis is a guide.")
|
||||
(docs_root / "api.mdx").write_text("# API Reference\n\nAPI documentation.")
|
||||
|
||||
# Mock _get_docs_root to return temp dir
|
||||
with patch.object(handler, "_get_docs_root", return_value=docs_root):
|
||||
# Mock existing embeddings query (no embeddings exist)
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
@@ -173,7 +415,6 @@ async def test_documentation_handler_get_missing_items(tmp_path, mocker):
|
||||
|
||||
assert len(items) == 2
|
||||
|
||||
# Check guide.md (content_id format: doc_path::section_index)
|
||||
guide_item = next(
|
||||
(item for item in items if item.content_id == "guide.md::0"), None
|
||||
)
|
||||
@@ -184,7 +425,6 @@ async def test_documentation_handler_get_missing_items(tmp_path, mocker):
|
||||
assert guide_item.metadata["doc_title"] == "Getting Started"
|
||||
assert guide_item.user_id is None
|
||||
|
||||
# Check api.mdx (content_id format: doc_path::section_index)
|
||||
api_item = next(
|
||||
(item for item in items if item.content_id == "api.mdx::0"), None
|
||||
)
|
||||
@@ -197,14 +437,12 @@ async def test_documentation_handler_get_stats(tmp_path, mocker):
|
||||
"""Test DocumentationHandler returns correct stats."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
# Create temporary docs directory
|
||||
docs_root = tmp_path / "docs"
|
||||
docs_root.mkdir()
|
||||
(docs_root / "doc1.md").write_text("# Doc 1")
|
||||
(docs_root / "doc2.md").write_text("# Doc 2")
|
||||
(docs_root / "doc3.mdx").write_text("# Doc 3")
|
||||
|
||||
# Mock embedded count query (1 doc has embedding)
|
||||
mock_embedded = [{"count": 1}]
|
||||
|
||||
with patch.object(handler, "_get_docs_root", return_value=docs_root):
|
||||
@@ -224,13 +462,11 @@ async def test_documentation_handler_title_extraction(tmp_path):
|
||||
"""Test DocumentationHandler extracts title from markdown heading."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
# Test with heading
|
||||
doc_with_heading = tmp_path / "with_heading.md"
|
||||
doc_with_heading.write_text("# My Title\n\nContent here")
|
||||
title = handler._extract_doc_title(doc_with_heading)
|
||||
assert title == "My Title"
|
||||
|
||||
# Test without heading
|
||||
doc_without_heading = tmp_path / "no-heading.md"
|
||||
doc_without_heading.write_text("Just content, no heading")
|
||||
title = handler._extract_doc_title(doc_without_heading)
|
||||
@@ -242,7 +478,6 @@ async def test_documentation_handler_markdown_chunking(tmp_path):
|
||||
"""Test DocumentationHandler chunks markdown by headings."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
# Test document with multiple sections
|
||||
doc_with_sections = tmp_path / "sections.md"
|
||||
doc_with_sections.write_text(
|
||||
"# Document Title\n\n"
|
||||
@@ -254,7 +489,6 @@ async def test_documentation_handler_markdown_chunking(tmp_path):
|
||||
)
|
||||
sections = handler._chunk_markdown_by_headings(doc_with_sections)
|
||||
|
||||
# Should have 3 sections: intro (with doc title), section one, section two
|
||||
assert len(sections) == 3
|
||||
assert sections[0].title == "Document Title"
|
||||
assert sections[0].index == 0
|
||||
@@ -268,7 +502,6 @@ async def test_documentation_handler_markdown_chunking(tmp_path):
|
||||
assert sections[2].index == 2
|
||||
assert "Content for section two" in sections[2].content
|
||||
|
||||
# Test document without headings
|
||||
doc_no_sections = tmp_path / "no-sections.md"
|
||||
doc_no_sections.write_text("Just plain content without any headings.")
|
||||
sections = handler._chunk_markdown_by_headings(doc_no_sections)
|
||||
@@ -282,21 +515,39 @@ async def test_documentation_handler_section_content_ids():
|
||||
"""Test DocumentationHandler creates and parses section content IDs."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
# Test making content ID
|
||||
content_id = handler._make_section_content_id("docs/guide.md", 2)
|
||||
assert content_id == "docs/guide.md::2"
|
||||
|
||||
# Test parsing content ID
|
||||
doc_path, section_index = handler._parse_section_content_id("docs/guide.md::2")
|
||||
assert doc_path == "docs/guide.md"
|
||||
assert section_index == 2
|
||||
|
||||
# Test parsing legacy format (no section index)
|
||||
doc_path, section_index = handler._parse_section_content_id("docs/old-format.md")
|
||||
assert doc_path == "docs/old-format.md"
|
||||
assert section_index == 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_documentation_handler_missing_docs_directory():
|
||||
"""Test DocumentationHandler handles missing docs directory gracefully."""
|
||||
handler = DocumentationHandler()
|
||||
|
||||
fake_path = Path("/nonexistent/docs")
|
||||
with patch.object(handler, "_get_docs_root", return_value=fake_path):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
assert items == []
|
||||
|
||||
stats = await handler.get_stats()
|
||||
assert stats["total"] == 0
|
||||
assert stats["with_embeddings"] == 0
|
||||
assert stats["without_embeddings"] == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_content_handlers_registry():
|
||||
"""Test all content types are registered."""
|
||||
@@ -307,88 +558,3 @@ async def test_content_handlers_registry():
|
||||
assert isinstance(CONTENT_HANDLERS[ContentType.STORE_AGENT], StoreAgentHandler)
|
||||
assert isinstance(CONTENT_HANDLERS[ContentType.BLOCK], BlockHandler)
|
||||
assert isinstance(CONTENT_HANDLERS[ContentType.DOCUMENTATION], DocumentationHandler)
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_block_handler_handles_empty_attributes():
|
||||
"""Test BlockHandler handles blocks with empty/falsy attribute values."""
|
||||
handler = BlockHandler()
|
||||
|
||||
# Mock block with empty values (all attributes exist but are falsy)
|
||||
mock_block_class = MagicMock()
|
||||
mock_block_instance = MagicMock()
|
||||
mock_block_instance.name = "Minimal Block"
|
||||
mock_block_instance.disabled = False
|
||||
mock_block_instance.description = ""
|
||||
mock_block_instance.categories = set()
|
||||
mock_block_instance.input_schema.model_fields = {}
|
||||
mock_block_instance.input_schema.get_credentials_fields_info.return_value = {}
|
||||
mock_block_class.return_value = mock_block_instance
|
||||
|
||||
mock_blocks = {"block-minimal": mock_block_class}
|
||||
|
||||
with patch(
|
||||
"backend.blocks.get_blocks",
|
||||
return_value=mock_blocks,
|
||||
):
|
||||
with patch(
|
||||
"backend.api.features.store.content_handlers.query_raw_with_schema",
|
||||
return_value=[],
|
||||
):
|
||||
items = await handler.get_missing_items(batch_size=10)
|
||||
|
||||
assert len(items) == 1
|
||||
assert items[0].searchable_text == "Minimal Block"
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
async def test_block_handler_skips_failed_blocks():
    """BlockHandler drops blocks whose class raises on instantiation.

    One mocked block instantiates normally and one raises; only the working
    block is expected among the returned items.
    """
    handler = BlockHandler()

    # A block that instantiates cleanly.
    working_instance = MagicMock()
    working_instance.name = "Good Block"
    working_instance.description = "Works fine"
    working_instance.categories = []
    working_instance.disabled = False
    working_instance.input_schema.model_fields = {}
    working_instance.input_schema.get_credentials_fields_info.return_value = {}
    working_class = MagicMock(return_value=working_instance)

    # A block whose constructor always raises.
    failing_class = MagicMock(side_effect=Exception("Instantiation failed"))

    registry = {"good-block": working_class, "bad-block": failing_class}

    with patch("backend.blocks.get_blocks", return_value=registry), patch(
        "backend.api.features.store.content_handlers.query_raw_with_schema",
        return_value=[],
    ):
        items = await handler.get_missing_items(batch_size=10)

    # Only the block that could be instantiated survives.
    assert len(items) == 1
    assert items[0].content_id == "good-block"
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
async def test_documentation_handler_missing_docs_directory():
    """DocumentationHandler copes with a docs root that does not exist.

    With ``_get_docs_root`` patched to a non-existent path, no missing items
    are reported and every stats counter is zero.
    """
    handler = DocumentationHandler()
    missing_root = Path("/nonexistent/docs")

    # Point the handler at a directory that is not on disk.
    with patch.object(handler, "_get_docs_root", return_value=missing_root):
        assert await handler.get_missing_items(batch_size=10) == []

        stats = await handler.get_stats()
        for counter in ("total", "with_embeddings", "without_embeddings"):
            assert stats[counter] == 0
|
||||
|
||||
@@ -15,6 +15,7 @@ from prisma.enums import ContentType
|
||||
from tiktoken import encoding_for_model
|
||||
|
||||
from backend.api.features.store.content_handlers import CONTENT_HANDLERS
|
||||
from backend.blocks import get_blocks
|
||||
from backend.data.db import execute_raw_with_schema, query_raw_with_schema
|
||||
from backend.util.clients import get_openai_client
|
||||
from backend.util.json import dumps
|
||||
@@ -662,8 +663,6 @@ async def cleanup_orphaned_embeddings() -> dict[str, Any]:
|
||||
)
|
||||
current_ids = {row["id"] for row in valid_agents}
|
||||
elif content_type == ContentType.BLOCK:
|
||||
from backend.blocks import get_blocks
|
||||
|
||||
current_ids = set(get_blocks().keys())
|
||||
elif content_type == ContentType.DOCUMENTATION:
|
||||
# Use DocumentationHandler to get section-based content IDs
|
||||
|
||||
@@ -31,12 +31,10 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tokenize(text: str) -> list[str]:
    """Tokenize text for BM25.

    The text is lowercased and every maximal run of word characters
    (letters, digits, underscore) delimited by word boundaries becomes one
    token. CamelCase identifiers are therefore kept as a single token.

    Args:
        text: The string to tokenize; may be empty.

    Returns:
        The lowercase tokens in order of appearance, or an empty list when
        ``text`` is empty.
    """
    if not text:
        return []
    return re.findall(r"\b\w+\b", text.lower())
|
||||
|
||||
|
||||
def bm25_rerank(
|
||||
|
||||
@@ -14,9 +14,27 @@ from backend.api.features.store.hybrid_search import (
|
||||
HybridSearchWeights,
|
||||
UnifiedSearchWeights,
|
||||
hybrid_search,
|
||||
tokenize,
|
||||
unified_hybrid_search,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# tokenize (BM25)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_text", "expected"),
    [
        # CamelCase identifiers stay a single lowercase token.
        ("AITextGeneratorBlock", ["aitextgeneratorblock"]),
        ("hello world", ["hello", "world"]),
        # Empty input produces no tokens.
        ("", []),
        ("HTTPRequest", ["httprequest"]),
    ],
)
def test_tokenize(input_text: str, expected: list[str]):
    """tokenize() lowercases its input and splits it into word tokens."""
    actual = tokenize(input_text)
    assert actual == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
@pytest.mark.integration
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
"""Backward-compatibility shim — ``split_camelcase`` now lives in backend.util.text."""
|
||||
|
||||
from backend.util.text import split_camelcase # noqa: F401
|
||||
|
||||
__all__ = ["split_camelcase"]
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Tests for split_camelcase (now in backend.util.text)."""
|
||||
|
||||
import pytest
|
||||
|
||||
from backend.util.text import split_camelcase
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# split_camelcase
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("input_text", "expected"),
    [
        ("AITextGeneratorBlock", "AI Text Generator Block"),
        ("HTTPRequestBlock", "HTTP Request Block"),
        ("simpleWord", "simple Word"),
        ("already spaced", "already spaced"),
        ("XMLParser", "XML Parser"),
        ("getHTTPResponse", "get HTTP Response"),
        ("Block", "Block"),
        ("", ""),
        ("OAuth2Block", "OAuth2 Block"),
        ("IOError", "IO Error"),
        ("getHTTPSResponse", "get HTTPS Response"),
        # Known limitation: a single uppercase letter in front of a word is
        # NOT split off. "ABlock" stays "ABlock" because the left part of an
        # uppercase run must keep at least 2 uppercase characters.
        ("ABlock", "ABlock"),
        # Digit followed by an uppercase letter starts a new word.
        ("Base64Encoder", "Base64 Encoder"),
        ("UTF8Decoder", "UTF8 Decoder"),
        # Digits only: there is no camelCase boundary to split on.
        ("123", "123"),
        # Known limitation: a single uppercase letter after a digit is not
        # separated from the word that follows it. "3D" holds only one
        # uppercase character, so the uppercase-run rule cannot fire and the
        # result is "3 DRenderer" rather than the ideal "3D Renderer".
        ("3DRenderer", "3 DRenderer"),
        # Exception list: compound terms that must stay in one piece.
        ("YouTubeBlock", "YouTube Block"),
        ("OpenAIBlock", "OpenAI Block"),
        ("AutoGPTAgent", "AutoGPT Agent"),
        ("GitHubIntegration", "GitHub Integration"),
        ("LinkedInBlock", "LinkedIn Block"),
    ],
)
def test_split_camelcase(input_text: str, expected: str):
    """split_camelcase() yields the expected spaced form for each sample."""
    actual = split_camelcase(input_text)
    assert actual == expected
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
import bleach
|
||||
from bleach.css_sanitizer import CSSSanitizer
|
||||
@@ -154,3 +155,76 @@ class TextFormatter:
|
||||
)
|
||||
|
||||
return rendered_subject_template, rendered_base_template
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CamelCase splitting
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Map of split forms back to their canonical compound terms.
|
||||
# Mirrors the frontend exception list in frontend/src/lib/utils.ts.
|
||||
_CAMELCASE_EXCEPTIONS: dict[str, str] = {
    "Auto GPT": "AutoGPT",
    "Open AI": "OpenAI",
    "You Tube": "YouTube",
    "Git Hub": "GitHub",
    "Linked In": "LinkedIn",
}

# Alternation of the (escaped) split forms above; used to locate candidate
# compounds in text that has already been split.
_CAMELCASE_EXCEPTION_RE = re.compile(
    "|".join(re.escape(k) for k in _CAMELCASE_EXCEPTIONS),
)


def split_camelcase(text: str) -> str:
    """Split CamelCase into separate words.

    Uses a single-pass character-by-character algorithm to avoid any
    regex backtracking concerns (guaranteed O(n) time).

    After splitting, known compound terms are restored via an exception
    list (e.g. ``"YouTube"`` stays ``"YouTube"`` instead of becoming
    ``"You Tube"``). Only spaces inserted by this function are ever
    removed again: whitespace that was already present in ``text`` is left
    untouched, so ``"Linked Inbox"`` is not glued into ``"LinkedInbox"``.

    Args:
        text: The string to split; may be empty or already contain spaces.

    Returns:
        ``text`` with a single space inserted at each detected word
        boundary, except inside the compound terms listed in
        ``_CAMELCASE_EXCEPTIONS``.

    Examples::

        >>> split_camelcase("AITextGeneratorBlock")
        'AI Text Generator Block'
        >>> split_camelcase("OAuth2Block")
        'OAuth2 Block'
        >>> split_camelcase("YouTubeBlock")
        'YouTube Block'
        >>> split_camelcase("Linked Inbox")
        'Linked Inbox'
    """
    if len(text) <= 1:
        return text

    # Indices into ``text`` at which a new word starts (strictly increasing).
    cuts: list[int] = []
    prev = 0
    for i in range(1, len(text)):
        # Boundary between lowercase/digit and uppercase: "camelCase" -> "camel|Case"
        if (text[i - 1].islower() or text[i - 1].isdigit()) and text[i].isupper():
            prev = i
            cuts.append(prev)
        # Boundary between an uppercase run and uppercase+lowercase:
        # "AIText" -> "AI|Text". The run must be long enough that the left
        # part keeps 2+ uppercase chars
        # (mirrors the original regex r"([A-Z]{2,})([A-Z][a-z])").
        elif (
            i >= 2
            and text[i - 2].isupper()
            and text[i - 1].isupper()
            and text[i].islower()
            and (i - 1 - prev) >= 2  # left part must retain at least 2 upper chars
        ):
            prev = i - 1
            cuts.append(prev)

    if not cuts:
        # Nothing was split, so there is nothing to restore either.
        return text

    bounds = [0, *cuts, len(text)]
    result = " ".join(text[start:end] for start, end in zip(bounds, bounds[1:]))

    # Position, within ``result``, of every space this function inserted:
    # the n-th cut is preceded by n earlier inserted spaces.
    inserted = {cut + n for n, cut in enumerate(cuts)}

    def _restore(match: "re.Match[str]") -> str:
        """Rejoin a compound only if all of its spaces were inserted here."""
        split_form = match.group()
        only_inserted = all(
            match.start() + offset in inserted
            for offset, char in enumerate(split_form)
            if char == " "
        )
        return _CAMELCASE_EXCEPTIONS[split_form] if only_inserted else split_form

    # Restore known compound terms that should not be split.
    return _CAMELCASE_EXCEPTION_RE.sub(_restore, result)
|
||||
|
||||
Reference in New Issue
Block a user