fix(backend): split CamelCase block names for search indexing and BM25

Block names like "AITextGeneratorBlock" were indexed as single tokens,
making them invisible to PostgreSQL tsvector lexical search and BM25
reranking when users searched for "text generator" or "AI text". Now
CamelCase names are split into separate words before indexing
(e.g. "AI Text Generator Block") so both lexical and BM25 matching
work correctly.
This commit is contained in:
Zamil Majdy
2026-03-13 16:10:50 +07:00
parent ba301a3912
commit 70dfe64c6d
3 changed files with 19 additions and 14943 deletions

View File

@@ -6,6 +6,7 @@ Each handler knows how to fetch and process its content type for embedding.
"""
import logging
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
@@ -205,7 +206,14 @@ class BlockHandler(ContentHandler):
# Build searchable text from block metadata
parts = []
if block_instance.name:
parts.append(block_instance.name)
# Split CamelCase names into separate words so that
# lexical search (tsvector) and BM25 can match individual
# terms. "AITextGeneratorBlock" → "AI Text Generator Block"
split_name = re.sub(
r"([a-z0-9])([A-Z])", r"\1 \2", block_instance.name
)
split_name = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", split_name)
parts.append(split_name)
if block_instance.description:
parts.append(block_instance.description)
if block_instance.categories:

View File

@@ -31,10 +31,17 @@ logger = logging.getLogger(__name__)
def tokenize(text: str) -> list[str]:
    """Tokenize *text* for BM25 scoring.

    Lowercases the input and splits on non-alphanumeric characters.
    CamelCase runs are split into separate words beforehand, so e.g.
    "AITextGeneratorBlock" yields ["ai", "text", "generator", "block"].
    Note: the split words REPLACE the original CamelCase token; the
    unsplit form ("aitextgeneratorblock") is not kept.

    Args:
        text: Arbitrary input string; may be empty.

    Returns:
        List of lowercase word tokens; empty list for empty input.
    """
    if not text:
        return []
    # Insert a space at each lower/digit -> Upper boundary ("tG" -> "t G")
    # so "TextGenerator" becomes "Text Generator".
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", text)
    # Split an acronym from a following capitalized word
    # ("AIText" -> "AI Text", "HTTPServer" -> "HTTP Server").
    text = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", text)
    # \w+ keeps underscores inside a token, matching the original tokenizer.
    return re.findall(r"\b\w+\b", text.lower())

File diff suppressed because one or more lines are too long