mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(backend): remove split_camelcase from BM25 tokenizer to fix long-text truncation
split_camelcase has a 500-char safety cap designed for short block names. Calling it inside the BM25 tokenize() function silently truncated long documentation text, degrading search relevance for any keyword appearing after the first 500 characters. Block names are already split via split_camelcase at indexing time in content_handlers.py, so the tokenizer doesn't need it.
This commit is contained in:
@@ -20,7 +20,6 @@ from backend.api.features.store.embeddings import (
|
||||
embed_query,
|
||||
embedding_to_vector_string,
|
||||
)
|
||||
from backend.api.features.store.text_utils import split_camelcase
|
||||
from backend.data.db import query_raw_with_schema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -32,19 +31,10 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def tokenize(text: str) -> list[str]:
|
||||
"""Simple tokenizer for BM25 — lowercase and split on word boundaries.
|
||||
|
||||
CamelCase is split first so "AITextGeneratorBlock" becomes
|
||||
``["ai", "text", "generator", "block"]``.
|
||||
|
||||
``split_camelcase`` is applied to *all* content types (agents, blocks,
|
||||
docs), not only block names. For normal prose the function is effectively
|
||||
a no-op (no CamelCase boundaries to split), while for code identifiers
|
||||
that may appear in documentation it improves token coverage.
|
||||
"""
|
||||
"""Tokenize text for BM25."""
|
||||
if not text:
|
||||
return []
|
||||
return re.findall(r"\b\w+\b", split_camelcase(text).lower())
|
||||
return re.findall(r"\b\w+\b", text.lower())
|
||||
|
||||
|
||||
def bm25_rerank(
|
||||
|
||||
@@ -26,10 +26,10 @@ from backend.api.features.store.hybrid_search import (
|
||||
@pytest.mark.parametrize(
|
||||
"input_text, expected",
|
||||
[
|
||||
("AITextGeneratorBlock", ["ai", "text", "generator", "block"]),
|
||||
("AITextGeneratorBlock", ["aitextgeneratorblock"]),
|
||||
("hello world", ["hello", "world"]),
|
||||
("", []),
|
||||
("HTTPRequest", ["http", "request"]),
|
||||
("HTTPRequest", ["httprequest"]),
|
||||
],
|
||||
)
|
||||
def test_tokenize(input_text: str, expected: list[str]):
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
"""Small text helpers shared across store search modules."""
|
||||
|
||||
# Safety cap; block names are typically < 50 chars
|
||||
_MAX_CAMELCASE_INPUT_LEN = 500
|
||||
|
||||
|
||||
def split_camelcase(text: str) -> str:
|
||||
"""Split CamelCase into separate words.
|
||||
@@ -27,10 +24,6 @@ def split_camelcase(text: str) -> str:
|
||||
>>> split_camelcase("OAuth2Block")
|
||||
'OAuth2 Block'
|
||||
"""
|
||||
# Truncate to safety cap before processing
|
||||
if len(text) > _MAX_CAMELCASE_INPUT_LEN:
|
||||
text = text[:_MAX_CAMELCASE_INPUT_LEN]
|
||||
|
||||
if len(text) <= 1:
|
||||
return text
|
||||
|
||||
|
||||
Reference in New Issue
Block a user