fix(backend): remove split_camelcase from BM25 tokenizer to fix long-text truncation

split_camelcase truncates its input to a 500-char safety cap that was
designed for short block names. Calling it inside the BM25 tokenize()
function therefore silently truncated long documentation text: keywords
beyond the first 500 chars were never tokenized, degrading search relevance.

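Block names are already split via split_camelcase at indexing time in
content_handlers.py, so the tokenizer doesn't need it.

This commit also removes the 500-char cap (_MAX_CAMELCASE_INPUT_LEN) from
split_camelcase itself and updates the tokenize() tests to match.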
This commit is contained in:
Zamil Majdy
2026-03-16 17:21:42 +07:00
parent 349b9fc009
commit c87e954644
3 changed files with 4 additions and 21 deletions

View File

@@ -20,7 +20,6 @@ from backend.api.features.store.embeddings import (
embed_query,
embedding_to_vector_string,
)
from backend.api.features.store.text_utils import split_camelcase
from backend.data.db import query_raw_with_schema
logger = logging.getLogger(__name__)
@@ -32,19 +31,10 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
"""Simple tokenizer for BM25 — lowercase and split on word boundaries.
CamelCase is split first so "AITextGeneratorBlock" becomes
``["ai", "text", "generator", "block"]``.
``split_camelcase`` is applied to *all* content types (agents, blocks,
docs), not only block names. For normal prose the function is effectively
a no-op (no CamelCase boundaries to split), while for code identifiers
that may appear in documentation it improves token coverage.
"""
"""Tokenize text for BM25."""
if not text:
return []
return re.findall(r"\b\w+\b", split_camelcase(text).lower())
return re.findall(r"\b\w+\b", text.lower())
def bm25_rerank(

View File

@@ -26,10 +26,10 @@ from backend.api.features.store.hybrid_search import (
@pytest.mark.parametrize(
"input_text, expected",
[
("AITextGeneratorBlock", ["ai", "text", "generator", "block"]),
("AITextGeneratorBlock", ["aitextgeneratorblock"]),
("hello world", ["hello", "world"]),
("", []),
("HTTPRequest", ["http", "request"]),
("HTTPRequest", ["httprequest"]),
],
)
def test_tokenize(input_text: str, expected: list[str]):

View File

@@ -1,8 +1,5 @@
"""Small text helpers shared across store search modules."""
# Safety cap; block names are typically < 50 chars
_MAX_CAMELCASE_INPUT_LEN = 500
def split_camelcase(text: str) -> str:
"""Split CamelCase into separate words.
@@ -27,10 +24,6 @@ def split_camelcase(text: str) -> str:
>>> split_camelcase("OAuth2Block")
'OAuth2 Block'
"""
# Truncate to safety cap before processing
if len(text) > _MAX_CAMELCASE_INPUT_LEN:
text = text[:_MAX_CAMELCASE_INPUT_LEN]
if len(text) <= 1:
return text