mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
fix(backend): split CamelCase block names for search indexing and BM25
Block names like "AITextGeneratorBlock" were indexed as single tokens, making them invisible to PostgreSQL tsvector lexical search and BM25 reranking when users searched for "text generator" or "AI text". Now CamelCase names are split into separate words before indexing (e.g. "AI Text Generator Block") so both lexical and BM25 matching work correctly.
This commit is contained in:
@@ -6,6 +6,7 @@ Each handler knows how to fetch and process its content type for embedding.
"""

import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -205,7 +206,14 @@ class BlockHandler(ContentHandler):
        # Build searchable text from block metadata
        parts = []
        if block_instance.name:
            parts.append(block_instance.name)
            # Split CamelCase names into separate words so that
            # lexical search (tsvector) and BM25 can match individual
            # terms. "AITextGeneratorBlock" → "AI Text Generator Block"
            split_name = re.sub(
                r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
            )
            split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
            parts.append(split_name)
        if block_instance.description:
            parts.append(block_instance.description)
        if block_instance.categories:
||||
@@ -31,10 +31,17 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Simple tokenizer for BM25 - lowercase and split on non-alphanumeric.

    CamelCase tokens are split into their component words before tokenizing,
    so e.g. "AITextGeneratorBlock" yields ["ai", "text", "generator",
    "block"], improving recall for exact-term queries. Note: only the split
    words are emitted — the original unsplit token is NOT kept, since the
    CamelCase boundaries are replaced with spaces before token extraction.

    Args:
        text: Arbitrary input text; may be empty.

    Returns:
        Lowercased word tokens, in order of appearance. Empty list for
        empty input.
    """
    if not text:
        return []
    # Split CamelCase before tokenizing so individual words are matchable:
    # first insert a space at every lower/digit→Upper boundary ("tG" → "t G"),
    # then split trailing acronyms from the following word ("AIText" → "AI Text").
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    text = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", text)
    # Lowercase and split on non-alphanumeric characters
    tokens = re.findall(r"\b\w+\b", text.lower())
    return tokens
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user