mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(backend): remove split_camelcase from BM25 tokenizer to fix long-text truncation
split_camelcase has a 500-char safety cap designed for short block names. Calling it inside the BM25 tokenize() function silently truncated long documentation text, degrading search relevance for any keyword appearing after the first 500 characters. Block names are already split via split_camelcase at indexing time in content_handlers.py, so the tokenizer doesn't need it.
This commit is contained in:
@@ -20,7 +20,6 @@ from backend.api.features.store.embeddings import (
|
||||
embed_query,
|
||||
embedding_to_vector_string,
|
||||
)
|
||||
from backend.api.features.store.text_utils import split_camelcase
|
||||
from backend.data.db import query_raw_with_schema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -32,19 +31,10 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tokenize(text: str) -> list[str]:
|
||||
"""Simple tokenizer for BM25 — lowercase and split on word boundaries.
|
||||
|
||||
CamelCase is split first so "AITextGeneratorBlock" becomes
|
||||
``["ai", "text", "generator", "block"]``.
|
||||
|
||||
``split_camelcase`` is applied to *all* content types (agents, blocks,
|
||||
docs), not only block names. For normal prose the function is effectively
|
||||
a no-op (no CamelCase boundaries to split), while for code identifiers
|
||||
that may appear in documentation it improves token coverage.
|
||||
"""
|
||||
"""Tokenize text for BM25."""
|
||||
if not text:
|
||||
return []
|
||||
return re.findall(r"\b\w+\b", split_camelcase(text).lower())
|
||||
return re.findall(r"\b\w+\b", text.lower())
|
||||
|
||||
|
||||
def bm25_rerank(
|
||||
|
||||
@@ -26,10 +26,10 @@ from backend.api.features.store.hybrid_search import (
|
||||
@pytest.mark.parametrize(
|
||||
"input_text, expected",
|
||||
[
|
||||
("AITextGeneratorBlock", ["ai", "text", "generator", "block"]),
|
||||
("AITextGeneratorBlock", ["aitextgeneratorblock"]),
|
||||
("hello world", ["hello", "world"]),
|
||||
("", []),
|
||||
("HTTPRequest", ["http", "request"]),
|
||||
("HTTPRequest", ["httprequest"]),
|
||||
],
|
||||
)
|
||||
def test_tokenize(input_text: str, expected: list[str]):
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
"""Small text helpers shared across store search modules."""
|
||||
|
||||
# Safety cap; block names are typically < 50 chars
|
||||
_MAX_CAMELCASE_INPUT_LEN = 500
|
||||
|
||||
|
||||
def split_camelcase(text: str) -> str:
|
||||
"""Split CamelCase into separate words.
|
||||
@@ -27,10 +24,6 @@ def split_camelcase(text: str) -> str:
|
||||
>>> split_camelcase("OAuth2Block")
|
||||
'OAuth2 Block'
|
||||
"""
|
||||
# Truncate to safety cap before processing
|
||||
if len(text) > _MAX_CAMELCASE_INPUT_LEN:
|
||||
text = text[:_MAX_CAMELCASE_INPUT_LEN]
|
||||
|
||||
if len(text) <= 1:
|
||||
return text
|
||||
|
||||
|
||||
Reference in New Issue
Block a user