Use lucene syntax for searches

This uses the l2cs Python library to convert
search queries written in "lucene" syntax to
Amazon's CloudSearch syntax.

A mouseover blurb on the search results page
shows the CloudSearch query that the search was converted to.
This commit is contained in:
Keith Mitchell
2012-05-11 14:41:28 -07:00
parent 2243af7306
commit 27c31666f3
7 changed files with 101 additions and 82 deletions

View File

@@ -705,11 +705,13 @@ class FrontController(RedditController):
search_help_page = "/help/search"
verify_langs_regex = re.compile(r"\A[a-z][a-z](,[a-z][a-z])*\Z")
@base_listing
@validate(query = VLength('q', max_length=512),
sort = VMenu('sort', SearchSortMenu, remember=False),
restrict_sr = VBoolean('restrict_sr', default=False))
@validate(query=VLength('q', max_length=512),
sort=VMenu('sort', SearchSortMenu, remember=False),
restrict_sr=VBoolean('restrict_sr', default=False),
syntax=VOneOf('syntax', options=SearchQuery.known_syntaxes))
@api_doc(api_section.search, extensions=['json', 'xml'])
def GET_search(self, query, num, reverse, after, count, sort, restrict_sr):
def GET_search(self, query, num, reverse, after, count, sort, restrict_sr,
syntax):
"""Search links page."""
if query and '.' in query:
url = sanitize_url(query, require_scheme = True)
@@ -720,16 +722,19 @@ class FrontController(RedditController):
site = DefaultSR()
else:
site = c.site
if not syntax:
syntax = SearchQuery.default_syntax
try:
cleanup_message = None
try:
q = SearchQuery(query, site, sort)
q = SearchQuery(query, site, sort, syntax=syntax)
num, t, spane = self._search(q, num=num, after=after,
reverse = reverse, count = count)
except InvalidQuery:
# strip the query down to a whitelist
cleaned = re.sub("[^\w\s]+", "", query)
cleaned = re.sub("[^\w\s]+", " ", query)
cleaned = cleaned.lower()
# if it was nothing but mess, we have to stop
@@ -738,7 +743,7 @@ class FrontController(RedditController):
cleanup_message = strings.completely_invalid_search_query
else:
q = SearchQuery(cleaned, site, sort)
num, t, spane = self._search(q, num=num, after=after,
num, t, spane = self._search(q, num=num, after=after,
reverse=reverse, count=count)
cleanup_message = strings.invalid_search_query % {
"clean_query": cleaned
@@ -749,11 +754,13 @@ class FrontController(RedditController):
}
res = SearchPage(_('search results'), query, t, num, content=spane,
nav_menus = [SearchSortMenu(default=sort)],
search_params = dict(sort = sort),
nav_menus=[SearchSortMenu(default=sort)],
search_params=dict(sort=sort),
infotext=cleanup_message,
simple=False, site=c.site,
restrict_sr=restrict_sr,
syntax=syntax,
converted_data=q.converted_data
).render()
return res

View File

@@ -4,17 +4,18 @@ import httplib
import json
from lxml import etree
from pylons import g, c
import random
import re
import time
import urllib
import l2cs
from r2.lib import amqp
from r2.lib.db.operators import desc
import r2.lib.utils as r2utils
from r2.models import Account, Link, Subreddit, Thing, \
All, DefaultSR, MultiReddit, DomainSR, Friends, ModContribSR, \
FakeSubreddit, NotFound
from r2.models import (Account, Link, Subreddit, Thing, All, DefaultSR,
MultiReddit, DomainSR, Friends, ModContribSR,
FakeSubreddit, NotFound)
_CHUNK_SIZE = 4000000 # Approx. 4 MB, to stay under the 5MB limit
@@ -182,7 +183,8 @@ def xml_from_things(things):
def delete_ids(ids):
'''Delete documents from the index. 'ids' should be a list of fullnames'''
version = _version()
deletes = [etree.Element("delete", id=id_, version=str(version)) for id_ in ids]
deletes = [etree.Element("delete", id=id_, version=str(version))
for id_ in ids]
batch = etree.Element("batch")
batch.extend(deletes)
return send_documents(batch)
@@ -373,7 +375,7 @@ def _to_fn(cls, id_):
require an instance of the class)
'''
return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
r2utils.to36(id_))
@@ -389,7 +391,8 @@ def basic_query(query=None, bq=None, facets=("reddit",), facet_count=10,
timer = None
if record_stats:
timer = g.stats.get_timer("cloudsearch_timer")
timer.start()
if timer:
timer.start()
connection = httplib.HTTPConnection(g.CLOUDSEARCH_SEARCH_API, 80)
try:
connection.request('GET', path)
@@ -454,8 +457,19 @@ class CloudSearchQuery(object):
'top': 3,
}
def __init__(self, query, sr, sort):
lucene_parser = l2cs.make_parser(int_fields=['timestamp'],
yesno_fields=['over18', 'is_self'])
known_syntaxes = ("cloudsearch", "lucene")
default_syntax = "lucene"
def __init__(self, query, sr, sort, syntax=None):
if syntax is None:
syntax = self.default_syntax
elif syntax not in self.known_syntaxes:
raise ValueError("Unknown search syntax: %s" % syntax)
self.query = query.encode("utf-8") if query else ''
self.converted_data = None
self.syntax = syntax
self.sr = sr
self._sort = sort
self.sort = self.sorts[sort]
@@ -475,8 +489,8 @@ class CloudSearchQuery(object):
self.results = Results(after_docs, hits, facets)
return self.results
@staticmethod
def create_boolean_query(base_query, subreddit_query):
@classmethod
def create_boolean_query(cls, query, subreddit_query):
'''Join a (user-entered) text query with the generated subreddit query
Input:
@@ -489,19 +503,10 @@ class CloudSearchQuery(object):
without parens "author:'foo'"
'''
is_boolean_query = any([x in base_query for x in ":()"])
query = base_query.strip()
if not is_boolean_query:
query = query.replace("\\", "")
query = query.replace("'", "\\'")
query = "(field text '%s')" % query
if subreddit_query:
bq = "(and %s %s)" % (query, subreddit_query)
else:
bq = query
return bq
@staticmethod
@@ -527,7 +532,8 @@ class CloudSearchQuery(object):
# The query limit is roughly 8k bytes. Limit to 200 friends to
# avoid getting too close to that limit
friend_ids = c.user.friends[:200]
friends = ["author_fullname:'%s'" % _to_fn(Account, id_) for id_ in friend_ids]
friends = ["author_fullname:'%s'" % _to_fn(Account, id_)
for id_ in friend_ids]
bq.extend(friends)
bq.append(")")
elif isinstance(sr, ModContribSR):
@@ -543,7 +549,13 @@ class CloudSearchQuery(object):
def _run(self, start=0, num=1000, _update=False):
'''Run the search against self.query'''
subreddit_query = self._get_sr_restriction(self.sr)
self.bq = self.create_boolean_query(self.query, subreddit_query)
if self.syntax == "cloudsearch":
base_query = self.query
elif self.syntax == "lucene":
base_query = l2cs.convert(self.query, self.lucene_parser)
self.converted_data = {"syntax": "cloudsearch",
"converted": base_query}
self.bq = self.create_boolean_query(base_query, subreddit_query)
if g.sqlprinting:
g.log.info("%s", self)
return self._run_cached(self.bq, self.sort, start=start, num=num,
@@ -551,7 +563,8 @@ class CloudSearchQuery(object):
def __repr__(self):
'''Return a string representation of this query'''
result = ["<", self.__class__.__name__, "> query:", repr(self.query), " "]
result = ["<", self.__class__.__name__, "> query:",
repr(self.query), " "]
if self.bq:
result.append(" bq:")
result.append(repr(self.bq))
@@ -612,23 +625,3 @@ class CloudSearchQuery(object):
results = Results(docs, hits, facets)
return results
def test_create_boolean_query():
tests = [('steve holt', None),
('steve holt', '(or sr_id:1 sr_id:2 sr_id:3)'),
('steve holt', "author:'qgyh2'"),
("can't help myself", None),
("can't help myself", '(or sr_id:1 sr_id:2 sr_id:3)'),
("can't help myself", "author:'qgyh2'"),
("text:'steve holt'", None),
("text:'steve holt'", '(or sr_id:1 sr_id:2 sr_id:3)'),
("text:'steve holt'", "author:'qgyh2'"),
("(or text:'steve holt' text:'nintendo')", None),
("(or text:'steve holt' text:'nintendo')", '(or sr_id:1 sr_id:2 sr_id:3)'),
("(or text:'steve holt' text:'nintendo')", "author:'qgyh2'")]
for test in tests:
print "Trying: %r" % (test,)
bq = CloudSearchQuery.create_boolean_query(*test)
print "Query: %r" % bq
basic_query(bq=bq, size=1)

View File

@@ -822,13 +822,15 @@ class SearchPage(BoringPage):
def __init__(self, pagename, prev_search, elapsed_time,
num_results, search_params = {},
simple=False, restrict_sr = False, site=None,
syntax=None, converted_data=None,
*a, **kw):
self.searchbar = SearchBar(prev_search = prev_search,
elapsed_time = elapsed_time,
num_results = num_results,
search_params = search_params,
show_feedback = True, site=site,
simple=simple, restrict_sr=restrict_sr)
self.searchbar = SearchBar(prev_search=prev_search,
elapsed_time=elapsed_time,
num_results=num_results,
search_params=search_params,
show_feedback=True, site=site,
simple=simple, restrict_sr=restrict_sr,
syntax=syntax, converted_data=converted_data)
BoringPage.__init__(self, pagename, robots='noindex', *a, **kw)
def content(self):
@@ -1728,26 +1730,26 @@ class PaneStack(Templated):
class SearchForm(Templated):
"""The simple search form in the header of the page. prev_search
is the previous search."""
def __init__(self, prev_search = '', search_params = {},
site=None, simple=True, restrict_sr=False,
subreddit_search=False):
Templated.__init__(self, prev_search = prev_search,
search_params = search_params, site=site,
simple=simple, restrict_sr=restrict_sr,
subreddit_search=subreddit_search)
def __init__(self, prev_search='', search_params={}, site=None,
simple=True, restrict_sr=False, subreddit_search=False,
syntax=None):
Templated.__init__(self, prev_search=prev_search,
search_params=search_params, site=site,
simple=simple, restrict_sr=restrict_sr,
subreddit_search=subreddit_search, syntax=syntax)
class SearchBar(Templated):
"""More detailed search box for /search and /reddits pages.
Displays the previous search as well as info of the elapsed_time
and num_results if any."""
def __init__(self, num_results = 0, prev_search = '', elapsed_time = 0,
search_params = {}, show_feedback=False,
simple=False, restrict_sr=False, site=None,
subreddit_search=False, **kw):
# not listed explicitly in args to ensure it translates properly
self.header = kw.get('header', _("previous search"))
def __init__(self, header=None, num_results=0, prev_search='',
elapsed_time=0, search_params={}, show_feedback=False,
simple=False, restrict_sr=False, site=None, syntax=None,
subreddit_search=False, converted_data=None, **kw):
if header is None:
header = _("previous search")
self.header = header
self.prev_search = prev_search
self.elapsed_time = elapsed_time
@@ -1759,9 +1761,11 @@ class SearchBar(Templated):
else:
self.num_results = num_results
Templated.__init__(self, search_params = search_params,
Templated.__init__(self, search_params=search_params,
simple=simple, restrict_sr=restrict_sr,
site=site, subreddit_search=subreddit_search)
site=site, syntax=syntax,
converted_data=converted_data,
subreddit_search=subreddit_search)
class Frame(Wrapped):
"""Frameset for the FrameToolbar used when a user hits /tb/. The

View File

@@ -2254,10 +2254,17 @@ label + #moresearchinfo {
.bottommenu { color: gray; font-size: smaller; clear: both}
.bottommenu a { color: gray; text-decoration: underline; }
.bottommenu.serverinfo { text-align:right; padding:5px; }
.bottommenu.serverinfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
.bottommenu.serverinfo .content { display:none; }
.bottommenu.serverinfo:hover .content { display:inline; }
.debuginfo {
text-align: right;
padding: 5px;
color: gray;
font-size: smaller;
clear: both;
}
.debuginfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
.debuginfo .content { display:none; }
.debuginfo:hover .content { display:inline; }
/* Buttons specific */

View File

@@ -54,5 +54,5 @@
dict(year=datetime.datetime.now().timetuple()[0])}
</p>
<p class="bottommenu">REDDIT and the ALIEN Logo are registered trademarks of reddit inc.</p>
<p class="bottommenu serverinfo"><span class="icon">&pi;</span>&nbsp;<span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
<p class="bottommenu debuginfo"><span class="icon">&pi;</span>&nbsp;<span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
</div>

View File

@@ -51,6 +51,14 @@
</div>
</div>
%endif
<div>
%if thing.converted_data:
<p class="debuginfo">
<span class="icon">&delta;</span>&nbsp;
<span class="content">${_('converted query to %(syntax)s syntax: %(converted)s') % thing.converted_data}</span>
</p>
%endif
</div>
</div>
%endif
@@ -58,9 +66,10 @@
<h4 style="color:gray">${thing.header}</h4>
<div id="previoussearch">
${SearchForm(prev_search = thing.prev_search,
search_params = thing.search_params,
${SearchForm(prev_search=thing.prev_search,
search_params=thing.search_params,
site=thing.site, subreddit_search=thing.subreddit_search,
simple=thing.simple, restrict_sr=thing.restrict_sr)}
simple=thing.simple, restrict_sr=thing.restrict_sr,
syntax=thing.syntax)}
</div>
</div>

View File

@@ -68,7 +68,6 @@
% endif
${search_faq()}
</div>
%else:
%if not thing.site or isinstance(thing.site, DefaultSR):