refactor(backend): extract shared split_camelcase helper, fix review comments

- Extract `split_camelcase()` into text_utils.py to avoid circular imports
  and share between block indexer and BM25 tokenizer (CodeRabbit nitpick)
- Fix docstring accuracy in tokenize() (CodeRabbit nitpick)
- Add exception handling for block_cls() in disabled-block filter
  to prevent a single broken block from crashing the backfill (Sentry)
- ReDoS not a concern: both regex patterns are linear-time (CodeQL)
This commit is contained in:
Zamil Majdy
2026-03-13 16:25:24 +07:00
parent afcce75aff
commit a982fb8436
3 changed files with 37 additions and 22 deletions

View File

@@ -6,7 +6,6 @@ Each handler knows how to fetch and process its content type for embedding.
"""
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -14,6 +13,7 @@ from typing import Any, get_args, get_origin
from prisma.enums import ContentType
from backend.api.features.store.text_utils import split_camelcase
from backend.blocks.llm import LlmModel
from backend.data.db import query_raw_with_schema
@@ -191,11 +191,17 @@ class BlockHandler(ContentHandler):
# Filter disabled blocks before applying batch_size so that a large
# number of disabled blocks can't exhaust the batch budget and prevent
# enabled blocks from being indexed.
missing_blocks = [
(block_id, block_cls)
for block_id, block_cls in all_blocks.items()
if block_id not in existing_ids and not block_cls().disabled
]
missing_blocks: list[tuple[str, type]] = []
for block_id, block_cls in all_blocks.items():
if block_id in existing_ids:
continue
try:
if block_cls().disabled:
continue
except Exception as e:
logger.warning(f"Skipping block {block_id}: failed to init: {e}")
continue
missing_blocks.append((block_id, block_cls))
# Convert to ContentItem
items = []
@@ -206,14 +212,7 @@ class BlockHandler(ContentHandler):
# Build searchable text from block metadata
parts = []
if block_instance.name:
# Split CamelCase names into separate words so that
# lexical search (tsvector) and BM25 can match individual
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
split_name = re.sub(
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
)
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
parts.append(split_name)
parts.append(split_camelcase(block_instance.name))
if block_instance.description:
parts.append(block_instance.description)
if block_instance.categories:

View File

@@ -31,18 +31,16 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Simple tokenizer for BM25 — lowercase and split on word boundaries.

    CamelCase is split first so "AITextGeneratorBlock" becomes
    ``["ai", "text", "generator", "block"]``.

    Args:
        text: Raw text to tokenize; may be empty or falsy.

    Returns:
        Lowercased word tokens, or an empty list when *text* is empty.
    """
    if not text:
        return []
    # NOTE(review): import is function-local, presumably to avoid an import
    # cycle between the store search modules — confirm before hoisting to
    # module level.
    from backend.api.features.store.text_utils import split_camelcase

    # Split CamelCase before tokenizing so individual words are matchable,
    # then lowercase and extract word-character runs.
    return re.findall(r"\b\w+\b", split_camelcase(text).lower())

View File

@@ -0,0 +1,18 @@
"""Small text helpers shared across store search modules."""
import re
def split_camelcase(text: str) -> str:
    """Insert spaces at CamelCase word boundaries.

    Two passes: first break a lower/digit -> upper transition ("aB" ->
    "a B"), then break the tail of an acronym run ("HTTPRe" -> "HTTP Re").

    Examples::

        >>> split_camelcase("AITextGeneratorBlock")
        'AI Text Generator Block'
        >>> split_camelcase("HTTPRequestBlock")
        'HTTP Request Block'
    """
    spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    return re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", spaced)