refactor(backend): extract shared split_camelcase helper, fix review comments

- Extract `split_camelcase()` into text_utils.py to avoid circular imports
  and share between block indexer and BM25 tokenizer (CodeRabbit nitpick)
- Fix docstring accuracy in tokenize() (CodeRabbit nitpick)
- Add exception handling for block_cls() in disabled-block filter
  to prevent a single broken block from crashing the backfill (Sentry)
- ReDoS not a concern: both regex patterns are linear-time (CodeQL)
This commit is contained in:
Zamil Majdy
2026-03-13 16:25:24 +07:00
parent afcce75aff
commit a982fb8436
3 changed files with 37 additions and 22 deletions

View File

@@ -6,7 +6,6 @@ Each handler knows how to fetch and process its content type for embedding.
"""
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -14,6 +13,7 @@ from typing import Any, get_args, get_origin
from prisma.enums import ContentType
from backend.api.features.store.text_utils import split_camelcase
from backend.blocks.llm import LlmModel
from backend.data.db import query_raw_with_schema
@@ -191,11 +191,17 @@ class BlockHandler(ContentHandler):
# Filter disabled blocks before applying batch_size so that a large
# number of disabled blocks can't exhaust the batch budget and prevent
# enabled blocks from being indexed.
missing_blocks = [
(block_id, block_cls)
for block_id, block_cls in all_blocks.items()
if block_id not in existing_ids and not block_cls().disabled
]
missing_blocks: list[tuple[str, type]] = []
for block_id, block_cls in all_blocks.items():
if block_id in existing_ids:
continue
try:
if block_cls().disabled:
continue
except Exception as e:
logger.warning(f"Skipping block {block_id}: failed to init: {e}")
continue
missing_blocks.append((block_id, block_cls))
# Convert to ContentItem
items = []
@@ -206,14 +212,7 @@ class BlockHandler(ContentHandler):
# Build searchable text from block metadata
parts = []
if block_instance.name:
# Split CamelCase names into separate words so that
# lexical search (tsvector) and BM25 can match individual
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
split_name = re.sub(
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
)
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
parts.append(split_name)
parts.append(split_camelcase(block_instance.name))
if block_instance.description:
parts.append(block_instance.description)
if block_instance.categories:

View File

@@ -31,18 +31,16 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Simple tokenizer for BM25 — lowercase and split on word boundaries.

    CamelCase is split first so "AITextGeneratorBlock" becomes
    ``["ai", "text", "generator", "block"]``.

    Args:
        text: Raw text to tokenize; may be empty or falsy.

    Returns:
        Lowercased word tokens, or an empty list when *text* is empty.
    """
    if not text:
        return []
    # NOTE(review): import is function-local, presumably to avoid an import
    # cycle between the store search modules — confirm before hoisting to
    # module level.
    from backend.api.features.store.text_utils import split_camelcase

    # Split CamelCase before tokenizing so individual words are matchable,
    # then lowercase and extract word-character runs.
    return re.findall(r"\b\w+\b", split_camelcase(text).lower())

View File

@@ -0,0 +1,18 @@
"""Small text helpers shared across store search modules."""
import re
def split_camelcase(text: str) -> str:
    """Insert spaces at CamelCase word boundaries.

    Two passes: first break a lower/digit -> upper transition ("aB" ->
    "a B"), then break the tail of an acronym run ("HTTPRe" -> "HTTP Re").

    Examples::

        >>> split_camelcase("AITextGeneratorBlock")
        'AI Text Generator Block'
        >>> split_camelcase("HTTPRequestBlock")
        'HTTP Request Block'
    """
    spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    return re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", spaced)