fix(backend): split CamelCase block names for search indexing and BM25

Block names like "AITextGeneratorBlock" were indexed as single tokens,
making them invisible to PostgreSQL tsvector lexical search and BM25
reranking when users searched for "text generator" or "AI text". Now
CamelCase names are split into separate words before indexing
(e.g. "AI Text Generator Block") so both lexical and BM25 matching
work correctly.
This commit is contained in:
Zamil Majdy
2026-03-13 16:10:50 +07:00
parent ba301a3912
commit 70dfe64c6d
3 changed files with 19 additions and 14943 deletions

View File

@@ -6,6 +6,7 @@ Each handler knows how to fetch and process its content type for embedding.
"""
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -205,7 +206,14 @@ class BlockHandler(ContentHandler):
# Build searchable text from block metadata
parts = []
if block_instance.name:
parts.append(block_instance.name)
# Split CamelCase names into separate words so that
# lexical search (tsvector) and BM25 can match individual
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
split_name = re.sub(
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
)
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
parts.append(split_name)
if block_instance.description:
parts.append(block_instance.description)
if block_instance.categories:

View File

@@ -31,10 +31,17 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Tokenize *text* for BM25 scoring.

    Lowercases the input and splits on non-alphanumeric characters.
    CamelCase runs are split into separate words beforehand, so e.g.
    "AITextGeneratorBlock" yields ["ai", "text", "generator", "block"].
    Note: the split words REPLACE the original CamelCase token; the
    unsplit form ("aitextgeneratorblock") is not kept.

    Args:
        text: Arbitrary input string; may be empty.

    Returns:
        List of lowercase word tokens; empty list for empty input.
    """
    if not text:
        return []
    # Insert a space at each lower/digit -> Upper boundary ("tG" -> "t G")
    # so "TextGenerator" becomes "Text Generator".
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    # Split an acronym from a following capitalized word
    # ("AIText" -> "AI Text", "HTTPServer" -> "HTTP Server").
    text = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", text)
    # \w+ keeps underscores inside a token, matching the original tokenizer.
    return re.findall(r"\b\w+\b", text.lower())

File diff suppressed because one or more lines are too long