From 27c31666f3448264da9cd42a455ffb52437376c5 Mon Sep 17 00:00:00 2001
From: Keith Mitchell
Date: Fri, 11 May 2012 14:41:28 -0700
Subject: [PATCH] Use lucene syntax for searches
This uses the l2cs Python library to convert
search queries written in "lucene" syntax to
Amazon's CloudSearch syntax.
A mouseover blurb on the search results page
shows what the query was parsed as.
---
r2/r2/controllers/front.py | 25 ++++++----
r2/r2/lib/cloudsearch.py | 77 ++++++++++++++----------------
r2/r2/lib/pages/pages.py | 48 ++++++++++---------
r2/r2/public/static/css/reddit.css | 15 ++++--
r2/r2/templates/redditfooter.html | 2 +-
r2/r2/templates/searchbar.html | 15 ++++--
r2/r2/templates/searchform.html | 1 -
7 files changed, 101 insertions(+), 82 deletions(-)
diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py
index c8f7e372a..5da9df441 100755
--- a/r2/r2/controllers/front.py
+++ b/r2/r2/controllers/front.py
@@ -705,11 +705,13 @@ class FrontController(RedditController):
search_help_page = "/help/search"
verify_langs_regex = re.compile(r"\A[a-z][a-z](,[a-z][a-z])*\Z")
@base_listing
- @validate(query = VLength('q', max_length=512),
- sort = VMenu('sort', SearchSortMenu, remember=False),
- restrict_sr = VBoolean('restrict_sr', default=False))
+ @validate(query=VLength('q', max_length=512),
+ sort=VMenu('sort', SearchSortMenu, remember=False),
+ restrict_sr=VBoolean('restrict_sr', default=False),
+ syntax=VOneOf('syntax', options=SearchQuery.known_syntaxes))
@api_doc(api_section.search, extensions=['json', 'xml'])
- def GET_search(self, query, num, reverse, after, count, sort, restrict_sr):
+ def GET_search(self, query, num, reverse, after, count, sort, restrict_sr,
+ syntax):
"""Search links page."""
if query and '.' in query:
url = sanitize_url(query, require_scheme = True)
@@ -720,16 +722,19 @@ class FrontController(RedditController):
site = DefaultSR()
else:
site = c.site
+
+ if not syntax:
+ syntax = SearchQuery.default_syntax
try:
cleanup_message = None
try:
- q = SearchQuery(query, site, sort)
+ q = SearchQuery(query, site, sort, syntax=syntax)
num, t, spane = self._search(q, num=num, after=after,
reverse = reverse, count = count)
except InvalidQuery:
# strip the query down to a whitelist
- cleaned = re.sub("[^\w\s]+", "", query)
+ cleaned = re.sub("[^\w\s]+", " ", query)
cleaned = cleaned.lower()
# if it was nothing but mess, we have to stop
@@ -738,7 +743,7 @@ class FrontController(RedditController):
cleanup_message = strings.completely_invalid_search_query
else:
q = SearchQuery(cleaned, site, sort)
- num, t, spane = self._search(q, num=num, after=after,
+ num, t, spane = self._search(q, num=num, after=after,
reverse=reverse, count=count)
cleanup_message = strings.invalid_search_query % {
"clean_query": cleaned
@@ -749,11 +754,13 @@ class FrontController(RedditController):
}
res = SearchPage(_('search results'), query, t, num, content=spane,
- nav_menus = [SearchSortMenu(default=sort)],
- search_params = dict(sort = sort),
+ nav_menus=[SearchSortMenu(default=sort)],
+ search_params=dict(sort=sort),
infotext=cleanup_message,
simple=False, site=c.site,
restrict_sr=restrict_sr,
+ syntax=syntax,
+ converted_data=q.converted_data
).render()
return res
diff --git a/r2/r2/lib/cloudsearch.py b/r2/r2/lib/cloudsearch.py
index cfecd5951..94046314c 100644
--- a/r2/r2/lib/cloudsearch.py
+++ b/r2/r2/lib/cloudsearch.py
@@ -4,17 +4,18 @@ import httplib
import json
from lxml import etree
from pylons import g, c
-import random
import re
import time
import urllib
+import l2cs
+
from r2.lib import amqp
from r2.lib.db.operators import desc
import r2.lib.utils as r2utils
-from r2.models import Account, Link, Subreddit, Thing, \
- All, DefaultSR, MultiReddit, DomainSR, Friends, ModContribSR, \
- FakeSubreddit, NotFound
+from r2.models import (Account, Link, Subreddit, Thing, All, DefaultSR,
+ MultiReddit, DomainSR, Friends, ModContribSR,
+ FakeSubreddit, NotFound)
_CHUNK_SIZE = 4000000 # Approx. 4 MB, to stay under the 5MB limit
@@ -182,7 +183,8 @@ def xml_from_things(things):
def delete_ids(ids):
'''Delete documents from the index. 'ids' should be a list of fullnames'''
version = _version()
- deletes = [etree.Element("delete", id=id_, version=str(version)) for id_ in ids]
+ deletes = [etree.Element("delete", id=id_, version=str(version))
+ for id_ in ids]
batch = etree.Element("batch")
batch.extend(deletes)
return send_documents(batch)
@@ -373,7 +375,7 @@ def _to_fn(cls, id_):
require an instance of the class)
'''
- return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
+ return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
r2utils.to36(id_))
@@ -389,7 +391,8 @@ def basic_query(query=None, bq=None, facets=("reddit",), facet_count=10,
timer = None
if record_stats:
timer = g.stats.get_timer("cloudsearch_timer")
- timer.start()
+ if timer:
+ timer.start()
connection = httplib.HTTPConnection(g.CLOUDSEARCH_SEARCH_API, 80)
try:
connection.request('GET', path)
@@ -454,8 +457,19 @@ class CloudSearchQuery(object):
'top': 3,
}
- def __init__(self, query, sr, sort):
+ lucene_parser = l2cs.make_parser(int_fields=['timestamp'],
+ yesno_fields=['over18', 'is_self'])
+ known_syntaxes = ("cloudsearch", "lucene")
+ default_syntax = "lucene"
+
+ def __init__(self, query, sr, sort, syntax=None):
+ if syntax is None:
+ syntax = self.default_syntax
+ elif syntax not in self.known_syntaxes:
+ raise ValueError("Unknown search syntax: %s" % syntax)
self.query = query.encode("utf-8") if query else ''
+ self.converted_data = None
+ self.syntax = syntax
self.sr = sr
self._sort = sort
self.sort = self.sorts[sort]
@@ -475,8 +489,8 @@ class CloudSearchQuery(object):
self.results = Results(after_docs, hits, facets)
return self.results
- @staticmethod
- def create_boolean_query(base_query, subreddit_query):
+ @classmethod
+ def create_boolean_query(cls, query, subreddit_query):
'''Join a (user-entered) text query with the generated subreddit query
Input:
@@ -489,19 +503,10 @@ class CloudSearchQuery(object):
without parens "author:'foo'"
'''
- is_boolean_query = any([x in base_query for x in ":()"])
-
- query = base_query.strip()
- if not is_boolean_query:
- query = query.replace("\\", "")
- query = query.replace("'", "\\'")
- query = "(field text '%s')" % query
-
if subreddit_query:
bq = "(and %s %s)" % (query, subreddit_query)
else:
bq = query
-
return bq
@staticmethod
@@ -527,7 +532,8 @@ class CloudSearchQuery(object):
# The query limit is roughly 8k bytes. Limit to 200 friends to
# avoid getting too close to that limit
friend_ids = c.user.friends[:200]
- friends = ["author_fullname:'%s'" % _to_fn(Account, id_) for id_ in friend_ids]
+ friends = ["author_fullname:'%s'" % _to_fn(Account, id_)
+ for id_ in friend_ids]
bq.extend(friends)
bq.append(")")
elif isinstance(sr, ModContribSR):
@@ -543,7 +549,13 @@ class CloudSearchQuery(object):
def _run(self, start=0, num=1000, _update=False):
'''Run the search against self.query'''
subreddit_query = self._get_sr_restriction(self.sr)
- self.bq = self.create_boolean_query(self.query, subreddit_query)
+ if self.syntax == "cloudsearch":
+ base_query = self.query
+ elif self.syntax == "lucene":
+ base_query = l2cs.convert(self.query, self.lucene_parser)
+ self.converted_data = {"syntax": "cloudsearch",
+ "converted": base_query}
+ self.bq = self.create_boolean_query(base_query, subreddit_query)
if g.sqlprinting:
g.log.info("%s", self)
return self._run_cached(self.bq, self.sort, start=start, num=num,
@@ -551,7 +563,8 @@ class CloudSearchQuery(object):
def __repr__(self):
'''Return a string representation of this query'''
- result = ["<", self.__class__.__name__, "> query:", repr(self.query), " "]
+ result = ["<", self.__class__.__name__, "> query:",
+ repr(self.query), " "]
if self.bq:
result.append(" bq:")
result.append(repr(self.bq))
@@ -612,23 +625,3 @@ class CloudSearchQuery(object):
results = Results(docs, hits, facets)
return results
-
-
-def test_create_boolean_query():
- tests = [('steve holt', None),
- ('steve holt', '(or sr_id:1 sr_id:2 sr_id:3)'),
- ('steve holt', "author:'qgyh2'"),
- ("can't help myself", None),
- ("can't help myself", '(or sr_id:1 sr_id:2 sr_id:3)'),
- ("can't help myself", "author:'qgyh2'"),
- ("text:'steve holt'", None),
- ("text:'steve holt'", '(or sr_id:1 sr_id:2 sr_id:3)'),
- ("text:'steve holt'", "author:'qgyh2'"),
- ("(or text:'steve holt' text:'nintendo')", None),
- ("(or text:'steve holt' text:'nintendo')", '(or sr_id:1 sr_id:2 sr_id:3)'),
- ("(or text:'steve holt' text:'nintendo')", "author:'qgyh2'")]
- for test in tests:
- print "Trying: %r" % (test,)
- bq = CloudSearchQuery.create_boolean_query(*test)
- print "Query: %r" % bq
- basic_query(bq=bq, size=1)
diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py
index 85261048b..4ec8f51dd 100755
--- a/r2/r2/lib/pages/pages.py
+++ b/r2/r2/lib/pages/pages.py
@@ -822,13 +822,15 @@ class SearchPage(BoringPage):
def __init__(self, pagename, prev_search, elapsed_time,
num_results, search_params = {},
simple=False, restrict_sr = False, site=None,
+ syntax=None, converted_data=None,
*a, **kw):
- self.searchbar = SearchBar(prev_search = prev_search,
- elapsed_time = elapsed_time,
- num_results = num_results,
- search_params = search_params,
- show_feedback = True, site=site,
- simple=simple, restrict_sr=restrict_sr)
+ self.searchbar = SearchBar(prev_search=prev_search,
+ elapsed_time=elapsed_time,
+ num_results=num_results,
+ search_params=search_params,
+ show_feedback=True, site=site,
+ simple=simple, restrict_sr=restrict_sr,
+ syntax=syntax, converted_data=converted_data)
BoringPage.__init__(self, pagename, robots='noindex', *a, **kw)
def content(self):
@@ -1728,26 +1730,26 @@ class PaneStack(Templated):
class SearchForm(Templated):
"""The simple search form in the header of the page. prev_search
is the previous search."""
- def __init__(self, prev_search = '', search_params = {},
- site=None, simple=True, restrict_sr=False,
- subreddit_search=False):
- Templated.__init__(self, prev_search = prev_search,
- search_params = search_params, site=site,
- simple=simple, restrict_sr=restrict_sr,
- subreddit_search=subreddit_search)
+ def __init__(self, prev_search='', search_params={}, site=None,
+ simple=True, restrict_sr=False, subreddit_search=False,
+ syntax=None):
+ Templated.__init__(self, prev_search=prev_search,
+ search_params=search_params, site=site,
+ simple=simple, restrict_sr=restrict_sr,
+ subreddit_search=subreddit_search, syntax=syntax)
class SearchBar(Templated):
"""More detailed search box for /search and /reddits pages.
Displays the previous search as well as info of the elapsed_time
and num_results if any."""
- def __init__(self, num_results = 0, prev_search = '', elapsed_time = 0,
- search_params = {}, show_feedback=False,
- simple=False, restrict_sr=False, site=None,
- subreddit_search=False, **kw):
-
- # not listed explicitly in args to ensure it translates properly
- self.header = kw.get('header', _("previous search"))
+ def __init__(self, header=None, num_results=0, prev_search='',
+ elapsed_time=0, search_params={}, show_feedback=False,
+ simple=False, restrict_sr=False, site=None, syntax=None,
+ subreddit_search=False, converted_data=None, **kw):
+ if header is None:
+ header = _("previous search")
+ self.header = header
self.prev_search = prev_search
self.elapsed_time = elapsed_time
@@ -1759,9 +1761,11 @@ class SearchBar(Templated):
else:
self.num_results = num_results
- Templated.__init__(self, search_params = search_params,
+ Templated.__init__(self, search_params=search_params,
simple=simple, restrict_sr=restrict_sr,
- site=site, subreddit_search=subreddit_search)
+ site=site, syntax=syntax,
+ converted_data=converted_data,
+ subreddit_search=subreddit_search)
class Frame(Wrapped):
"""Frameset for the FrameToolbar used when a user hits /tb/. The
diff --git a/r2/r2/public/static/css/reddit.css b/r2/r2/public/static/css/reddit.css
index 1a0fa1ed5..db137d573 100755
--- a/r2/r2/public/static/css/reddit.css
+++ b/r2/r2/public/static/css/reddit.css
@@ -2254,10 +2254,17 @@ label + #moresearchinfo {
.bottommenu { color: gray; font-size: smaller; clear: both}
.bottommenu a { color: gray; text-decoration: underline; }
-.bottommenu.serverinfo { text-align:right; padding:5px; }
-.bottommenu.serverinfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
-.bottommenu.serverinfo .content { display:none; }
-.bottommenu.serverinfo:hover .content { display:inline; }
+
+.debuginfo {
+ text-align: right;
+ padding: 5px;
+ color: gray;
+ font-size: smaller;
+ clear: both;
+}
+.debuginfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
+.debuginfo .content { display:none; }
+.debuginfo:hover .content { display:inline; }
/* Buttons specific */
diff --git a/r2/r2/templates/redditfooter.html b/r2/r2/templates/redditfooter.html
index edf6772cc..10c1f8a17 100644
--- a/r2/r2/templates/redditfooter.html
+++ b/r2/r2/templates/redditfooter.html
@@ -54,5 +54,5 @@
dict(year=datetime.datetime.now().timetuple()[0])}
-
+
diff --git a/r2/r2/templates/searchbar.html b/r2/r2/templates/searchbar.html
index 42fe9cbe0..dc84f77bb 100644
--- a/r2/r2/templates/searchbar.html
+++ b/r2/r2/templates/searchbar.html
@@ -51,6 +51,14 @@
%endif
+
+ %if thing.converted_data:
+
+ δ
+ ${_('converted query to %(syntax)s syntax: %(converted)s') % thing.converted_data}
+
+ %endif
+
%endif
@@ -58,9 +66,10 @@
${thing.header}
- ${SearchForm(prev_search = thing.prev_search,
- search_params = thing.search_params,
+ ${SearchForm(prev_search=thing.prev_search,
+ search_params=thing.search_params,
site=thing.site, subreddit_search=thing.subreddit_search,
- simple=thing.simple, restrict_sr=thing.restrict_sr)}
+ simple=thing.simple, restrict_sr=thing.restrict_sr,
+ syntax=thing.syntax)}
diff --git a/r2/r2/templates/searchform.html b/r2/r2/templates/searchform.html
index e7c3b8fac..e75a4e0c9 100644
--- a/r2/r2/templates/searchform.html
+++ b/r2/r2/templates/searchform.html
@@ -68,7 +68,6 @@
% endif
${search_faq()}
-
%else:
%if not thing.site or isinstance(thing.site, DefaultSR):