mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
refactor(backend): extract shared split_camelcase helper, fix review comments
- Extract `split_camelcase()` into text_utils.py to avoid circular imports and share between block indexer and BM25 tokenizer (CodeRabbit nitpick) - Fix docstring accuracy in tokenize() (CodeRabbit nitpick) - Add exception handling for block_cls() in disabled-block filter to prevent a single broken block from crashing the backfill (Sentry) - ReDoS not a concern: both regex patterns are linear-time (CodeQL)
This commit is contained in:
@@ -6,7 +6,6 @@ Each handler knows how to fetch and process its content type for embedding.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -14,6 +13,7 @@ from typing import Any, get_args, get_origin
|
||||
|
||||
from prisma.enums import ContentType
|
||||
|
||||
from backend.api.features.store.text_utils import split_camelcase
|
||||
from backend.blocks.llm import LlmModel
|
||||
from backend.data.db import query_raw_with_schema
|
||||
|
||||
@@ -191,11 +191,17 @@ class BlockHandler(ContentHandler):
|
||||
# Filter disabled blocks before applying batch_size so that a large
|
||||
# number of disabled blocks can't exhaust the batch budget and prevent
|
||||
# enabled blocks from being indexed.
|
||||
missing_blocks = [
|
||||
(block_id, block_cls)
|
||||
for block_id, block_cls in all_blocks.items()
|
||||
if block_id not in existing_ids and not block_cls().disabled
|
||||
]
|
||||
missing_blocks: list[tuple[str, type]] = []
|
||||
for block_id, block_cls in all_blocks.items():
|
||||
if block_id in existing_ids:
|
||||
continue
|
||||
try:
|
||||
if block_cls().disabled:
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.warning(f"Skipping block {block_id}: failed to init: {e}")
|
||||
continue
|
||||
missing_blocks.append((block_id, block_cls))
|
||||
|
||||
# Convert to ContentItem
|
||||
items = []
|
||||
@@ -206,14 +212,7 @@ class BlockHandler(ContentHandler):
|
||||
# Build searchable text from block metadata
|
||||
parts = []
|
||||
if block_instance.name:
|
||||
# Split CamelCase names into separate words so that
|
||||
# lexical search (tsvector) and BM25 can match individual
|
||||
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
|
||||
split_name = re.sub(
|
||||
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
|
||||
)
|
||||
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
|
||||
parts.append(split_name)
|
||||
parts.append(split_camelcase(block_instance.name))
|
||||
if block_instance.description:
|
||||
parts.append(block_instance.description)
|
||||
if block_instance.categories:
|
||||
|
||||
@@ -31,18 +31,16 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tokenize(text: str) -> list[str]:
    """Simple tokenizer for BM25 — lowercase and split on word boundaries.

    CamelCase is split first so "AITextGeneratorBlock" becomes
    ``["ai", "text", "generator", "block"]``.
    """
    if not text:
        return []
    # Function-local import to avoid a circular import between the store
    # search modules; split_camelcase is shared with the block indexer.
    from backend.api.features.store.text_utils import split_camelcase

    # Split CamelCase before tokenizing so individual words are matchable,
    # then lowercase and split on \b\w+\b word boundaries.
    tokens = re.findall(r"\b\w+\b", split_camelcase(text).lower())
    return tokens
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
"""Small text helpers shared across store search modules."""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def split_camelcase(text: str) -> str:
    """Split CamelCase into separate words.

    Examples::

        >>> split_camelcase("AITextGeneratorBlock")
        'AI Text Generator Block'
        >>> split_camelcase("HTTPRequestBlock")
        'HTTP Request Block'
    """
    # Two passes: first break a lower/digit→upper boundary ("tG" -> "t G"),
    # then break a trailing acronym before a capitalized word ("AIT" -> "AI T").
    patterns = (
        r"([a-z0-9])([A-Z])",
        r"([A-Z]+)([A-Z][a-z])",
    )
    for boundary in patterns:
        text = re.sub(boundary, r"\1 \2", text)
    return text
|
||||
Reference in New Issue
Block a user