fix(backend): split CamelCase block names for search indexing and BM25

Block names like "AITextGeneratorBlock" were indexed as single tokens,
making them invisible to PostgreSQL tsvector lexical search and BM25
reranking when users searched for "text generator" or "AI text". Now
CamelCase names are split into separate words before indexing
(e.g. "AI Text Generator Block") so both lexical and BM25 matching
work correctly.
This commit is contained in:
Zamil Majdy
2026-03-13 16:10:50 +07:00
parent 699ecc8cec
commit ee8896c818
3 changed files with 19 additions and 15041 deletions

View File

@@ -6,6 +6,7 @@ Each handler knows how to fetch and process its content type for embedding.
"""
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -205,7 +206,14 @@ class BlockHandler(ContentHandler):
# Build searchable text from block metadata
parts = []
if block_instance.name:
parts.append(block_instance.name)
# Split CamelCase names into separate words so that
# lexical search (tsvector) and BM25 can match individual
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
split_name = re.sub(
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
)
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
parts.append(split_name)
if block_instance.description:
parts.append(block_instance.description)
if block_instance.categories:

View File

@@ -31,10 +31,17 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Simple tokenizer for BM25 - lowercase and split on non-alphanumeric.

    CamelCase tokens are split into their component words before
    tokenizing, so e.g. "AITextGeneratorBlock" yields
    ["ai", "text", "generator", "block"], improving recall for
    exact-term queries. Note: the split words REPLACE the original
    CamelCase token; the unsplit token is not emitted.
    """
    if not text:
        return []
    # Pass 1: insert a space at a lower/digit -> upper boundary
    # ("textGenerator" -> "text Generator", "v2Block" -> "v2 Block").
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    # Pass 2: split a trailing acronym off a following capitalized word
    # ("AIText" -> "AI Text"); must run after pass 1 so acronyms survive intact.
    text = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", text)
    # Lowercase and split on non-alphanumeric characters.
    tokens = re.findall(r"\b\w+\b", text.lower())
    return tokens

File diff suppressed because one or more lines are too long