From 27c31666f3448264da9cd42a455ffb52437376c5 Mon Sep 17 00:00:00 2001 From: Keith Mitchell Date: Fri, 11 May 2012 14:41:28 -0700 Subject: [PATCH] Use lucene syntax for searches This uses the l2cs python library to convert search queries written in "lucene" syntax to Amazon's CloudSearch syntax A mouseover blurb on the search results page shows what the query was parsed as --- r2/r2/controllers/front.py | 25 ++++++---- r2/r2/lib/cloudsearch.py | 77 ++++++++++++++---------------- r2/r2/lib/pages/pages.py | 48 ++++++++++--------- r2/r2/public/static/css/reddit.css | 15 ++++-- r2/r2/templates/redditfooter.html | 2 +- r2/r2/templates/searchbar.html | 15 ++++-- r2/r2/templates/searchform.html | 1 - 7 files changed, 101 insertions(+), 82 deletions(-) diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py index c8f7e372a..5da9df441 100755 --- a/r2/r2/controllers/front.py +++ b/r2/r2/controllers/front.py @@ -705,11 +705,13 @@ class FrontController(RedditController): search_help_page = "/help/search" verify_langs_regex = re.compile(r"\A[a-z][a-z](,[a-z][a-z])*\Z") @base_listing - @validate(query = VLength('q', max_length=512), - sort = VMenu('sort', SearchSortMenu, remember=False), - restrict_sr = VBoolean('restrict_sr', default=False)) + @validate(query=VLength('q', max_length=512), + sort=VMenu('sort', SearchSortMenu, remember=False), + restrict_sr=VBoolean('restrict_sr', default=False), + syntax=VOneOf('syntax', options=SearchQuery.known_syntaxes)) @api_doc(api_section.search, extensions=['json', 'xml']) - def GET_search(self, query, num, reverse, after, count, sort, restrict_sr): + def GET_search(self, query, num, reverse, after, count, sort, restrict_sr, + syntax): """Search links page.""" if query and '.' in query: url = sanitize_url(query, require_scheme = True) @@ -720,16 +722,19 @@ class FrontController(RedditController): site = DefaultSR() else: site = c.site + + if not syntax: + syntax = SearchQuery.default_syntax try: cleanup_message = None try: - q = SearchQuery(query, site, sort) + q = SearchQuery(query, site, sort, syntax=syntax) num, t, spane = self._search(q, num=num, after=after, reverse = reverse, count = count) except InvalidQuery: # strip the query down to a whitelist - cleaned = re.sub("[^\w\s]+", "", query) + cleaned = re.sub("[^\w\s]+", " ", query) cleaned = cleaned.lower() # if it was nothing but mess, we have to stop @@ -738,7 +743,7 @@ class FrontController(RedditController): cleanup_message = strings.completely_invalid_search_query else: q = SearchQuery(cleaned, site, sort) - num, t, spane = self._search(q, num=num, after=after, + num, t, spane = self._search(q, num=num, after=after, reverse=reverse, count=count) cleanup_message = strings.invalid_search_query % { "clean_query": cleaned @@ -749,11 +754,13 @@ class FrontController(RedditController): } res = SearchPage(_('search results'), query, t, num, content=spane, - nav_menus = [SearchSortMenu(default=sort)], - search_params = dict(sort = sort), + nav_menus=[SearchSortMenu(default=sort)], + search_params=dict(sort=sort), infotext=cleanup_message, simple=False, site=c.site, restrict_sr=restrict_sr, + syntax=syntax, + converted_data=q.converted_data ).render() return res diff --git a/r2/r2/lib/cloudsearch.py b/r2/r2/lib/cloudsearch.py index cfecd5951..94046314c 100644 --- a/r2/r2/lib/cloudsearch.py +++ b/r2/r2/lib/cloudsearch.py @@ -4,17 +4,18 @@ import httplib import json from lxml import etree from pylons import g, c -import random import re import time import urllib +import l2cs + from r2.lib import amqp from r2.lib.db.operators import desc import r2.lib.utils as r2utils -from r2.models import Account, Link, Subreddit, Thing, \ - All, DefaultSR, MultiReddit, DomainSR, Friends, ModContribSR, \ - FakeSubreddit, NotFound +from r2.models import (Account, Link, Subreddit, Thing, All, DefaultSR, + MultiReddit, DomainSR, Friends, ModContribSR, + FakeSubreddit, NotFound) _CHUNK_SIZE = 4000000 # Approx. 4 MB, to stay under the 5MB limit @@ -182,7 +183,8 @@ def xml_from_things(things): def delete_ids(ids): '''Delete documents from the index. 'ids' should be a list of fullnames''' version = _version() - deletes = [etree.Element("delete", id=id_, version=str(version)) for id_ in ids] + deletes = [etree.Element("delete", id=id_, version=str(version)) + for id_ in ids] batch = etree.Element("batch") batch.extend(deletes) return send_documents(batch) @@ -373,7 +375,7 @@ def _to_fn(cls, id_): require an instance of the class) ''' - return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' + + return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' + r2utils.to36(id_)) @@ -389,7 +391,8 @@ def basic_query(query=None, bq=None, facets=("reddit",), facet_count=10, timer = None if record_stats: timer = g.stats.get_timer("cloudsearch_timer") - timer.start() + if timer: + timer.start() connection = httplib.HTTPConnection(g.CLOUDSEARCH_SEARCH_API, 80) try: connection.request('GET', path) @@ -454,8 +457,19 @@ class CloudSearchQuery(object): 'top': 3, } - def __init__(self, query, sr, sort): + lucene_parser = l2cs.make_parser(int_fields=['timestamp'], + yesno_fields=['over18', 'is_self']) + known_syntaxes = ("cloudsearch", "lucene") + default_syntax = "lucene" + + def __init__(self, query, sr, sort, syntax=None): + if syntax is None: + syntax = self.default_syntax + elif syntax not in self.known_syntaxes: + raise ValueError("Unknown search syntax: %s" % syntax) self.query = query.encode("utf-8") if query else '' + self.converted_data = None + self.syntax = syntax self.sr = sr self._sort = sort self.sort = self.sorts[sort] @@ -475,8 +489,8 @@ class CloudSearchQuery(object): self.results = Results(after_docs, hits, facets) return self.results - @staticmethod - def create_boolean_query(base_query, subreddit_query): + @classmethod + def create_boolean_query(cls, query, subreddit_query): '''Join a (user-entered) text query with the generated subreddit query Input: @@ -489,19 +503,10 @@ class CloudSearchQuery(object): without parens "author:'foo'" ''' - is_boolean_query = any([x in base_query for x in ":()"]) - - query = base_query.strip() - if not is_boolean_query: - query = query.replace("\\", "") - query = query.replace("'", "\\'") - query = "(field text '%s')" % query - if subreddit_query: bq = "(and %s %s)" % (query, subreddit_query) else: bq = query - return bq @staticmethod @@ -527,7 +532,8 @@ class CloudSearchQuery(object): # The query limit is roughly 8k bytes. Limit to 200 friends to # avoid getting too close to that limit friend_ids = c.user.friends[:200] - friends = ["author_fullname:'%s'" % _to_fn(Account, id_) for id_ in friend_ids] + friends = ["author_fullname:'%s'" % _to_fn(Account, id_) + for id_ in friend_ids] bq.extend(friends) bq.append(")") elif isinstance(sr, ModContribSR): @@ -543,7 +549,13 @@ class CloudSearchQuery(object): def _run(self, start=0, num=1000, _update=False): '''Run the search against self.query''' subreddit_query = self._get_sr_restriction(self.sr) - self.bq = self.create_boolean_query(self.query, subreddit_query) + if self.syntax == "cloudsearch": + base_query = self.query + elif self.syntax == "lucene": + base_query = l2cs.convert(self.query, self.lucene_parser) + self.converted_data = {"syntax": "cloudsearch", + "converted": base_query} + self.bq = self.create_boolean_query(base_query, subreddit_query) if g.sqlprinting: g.log.info("%s", self) return self._run_cached(self.bq, self.sort, start=start, num=num, @@ -551,7 +563,8 @@ class CloudSearchQuery(object): def __repr__(self): '''Return a string representation of this query''' - result = ["<", self.__class__.__name__, "> query:", repr(self.query), " "] + result = ["<", self.__class__.__name__, "> query:", + repr(self.query), " "] if self.bq: result.append(" bq:") result.append(repr(self.bq)) @@ -612,23 +625,3 @@ class CloudSearchQuery(object): results = Results(docs, hits, facets) return results - - -def test_create_boolean_query(): - tests = [('steve holt', None), - ('steve holt', '(or sr_id:1 sr_id:2 sr_id:3)'), - ('steve holt', "author:'qgyh2'"), - ("can't help myself", None), - ("can't help myself", '(or sr_id:1 sr_id:2 sr_id:3)'), - ("can't help myself", "author:'qgyh2'"), - ("text:'steve holt'", None), - ("text:'steve holt'", '(or sr_id:1 sr_id:2 sr_id:3)'), - ("text:'steve holt'", "author:'qgyh2'"), - ("(or text:'steve holt' text:'nintendo')", None), - ("(or text:'steve holt' text:'nintendo')", '(or sr_id:1 sr_id:2 sr_id:3)'), - ("(or text:'steve holt' text:'nintendo')", "author:'qgyh2'")] - for test in tests: - print "Trying: %r" % (test,) - bq = CloudSearchQuery.create_boolean_query(*test) - print "Query: %r" % bq - basic_query(bq=bq, size=1) diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py index 85261048b..4ec8f51dd 100755 --- a/r2/r2/lib/pages/pages.py +++ b/r2/r2/lib/pages/pages.py @@ -822,13 +822,15 @@ class SearchPage(BoringPage): def __init__(self, pagename, prev_search, elapsed_time, num_results, search_params = {}, simple=False, restrict_sr = False, site=None, + syntax=None, converted_data=None, *a, **kw): - self.searchbar = SearchBar(prev_search = prev_search, - elapsed_time = elapsed_time, - num_results = num_results, - search_params = search_params, - show_feedback = True, site=site, - simple=simple, restrict_sr=restrict_sr) + self.searchbar = SearchBar(prev_search=prev_search, + elapsed_time=elapsed_time, + num_results=num_results, + search_params=search_params, + show_feedback=True, site=site, + simple=simple, restrict_sr=restrict_sr, + syntax=syntax, converted_data=converted_data) BoringPage.__init__(self, pagename, robots='noindex', *a, **kw) def content(self): @@ -1728,26 +1730,26 @@ class PaneStack(Templated): class SearchForm(Templated): """The simple search form in the header of the page. prev_search is the previous search.""" - def __init__(self, prev_search = '', search_params = {}, - site=None, simple=True, restrict_sr=False, - subreddit_search=False): - Templated.__init__(self, prev_search = prev_search, - search_params = search_params, site=site, - simple=simple, restrict_sr=restrict_sr, - subreddit_search=subreddit_search) + def __init__(self, prev_search='', search_params={}, site=None, + simple=True, restrict_sr=False, subreddit_search=False, + syntax=None): + Templated.__init__(self, prev_search=prev_search, + search_params=search_params, site=site, + simple=simple, restrict_sr=restrict_sr, + subreddit_search=subreddit_search, syntax=syntax) class SearchBar(Templated): """More detailed search box for /search and /reddits pages. Displays the previous search as well as info of the elapsed_time and num_results if any.""" - def __init__(self, num_results = 0, prev_search = '', elapsed_time = 0, - search_params = {}, show_feedback=False, - simple=False, restrict_sr=False, site=None, - subreddit_search=False, **kw): - - # not listed explicitly in args to ensure it translates properly - self.header = kw.get('header', _("previous search")) + def __init__(self, header=None, num_results=0, prev_search='', + elapsed_time=0, search_params={}, show_feedback=False, + simple=False, restrict_sr=False, site=None, syntax=None, + subreddit_search=False, converted_data=None, **kw): + if header is None: + header = _("previous search") + self.header = header self.prev_search = prev_search self.elapsed_time = elapsed_time @@ -1759,9 +1761,11 @@ class SearchBar(Templated): else: self.num_results = num_results - Templated.__init__(self, search_params = search_params, + Templated.__init__(self, search_params=search_params, simple=simple, restrict_sr=restrict_sr, - site=site, subreddit_search=subreddit_search) + site=site, syntax=syntax, + converted_data=converted_data, + subreddit_search=subreddit_search) class Frame(Wrapped): """Frameset for the FrameToolbar used when a user hits /tb/. The diff --git a/r2/r2/public/static/css/reddit.css b/r2/r2/public/static/css/reddit.css index 1a0fa1ed5..db137d573 100755 --- a/r2/r2/public/static/css/reddit.css +++ b/r2/r2/public/static/css/reddit.css @@ -2254,10 +2254,17 @@ label + #moresearchinfo { .bottommenu { color: gray; font-size: smaller; clear: both} .bottommenu a { color: gray; text-decoration: underline; } -.bottommenu.serverinfo { text-align:right; padding:5px; } -.bottommenu.serverinfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; } -.bottommenu.serverinfo .content { display:none; } -.bottommenu.serverinfo:hover .content { display:inline; } + +.debuginfo { + text-align: right; + padding: 5px; + color: gray; + font-size: smaller; + clear: both; +} +.debuginfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; } +.debuginfo .content { display:none; } +.debuginfo:hover .content { display:inline; } /* Buttons specific */ diff --git a/r2/r2/templates/redditfooter.html b/r2/r2/templates/redditfooter.html index edf6772cc..10c1f8a17 100644 --- a/r2/r2/templates/redditfooter.html +++ b/r2/r2/templates/redditfooter.html @@ -54,5 +54,5 @@ dict(year=datetime.datetime.now().timetuple()[0])}

REDDIT and the ALIEN Logo are registered trademarks of reddit inc.

-

π Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.

+

π Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.

diff --git a/r2/r2/templates/searchbar.html b/r2/r2/templates/searchbar.html index 42fe9cbe0..dc84f77bb 100644 --- a/r2/r2/templates/searchbar.html +++ b/r2/r2/templates/searchbar.html @@ -51,6 +51,14 @@ %endif +
+ %if thing.converted_data: +

+ δ  + ${_('converted query to %(syntax)s syntax: %(converted)s') % thing.converted_data} +

+ %endif +
%endif @@ -58,9 +66,10 @@

${thing.header}

- ${SearchForm(prev_search = thing.prev_search, - search_params = thing.search_params, + ${SearchForm(prev_search=thing.prev_search, + search_params=thing.search_params, site=thing.site, subreddit_search=thing.subreddit_search, - simple=thing.simple, restrict_sr=thing.restrict_sr)} + simple=thing.simple, restrict_sr=thing.restrict_sr, + syntax=thing.syntax)}
diff --git a/r2/r2/templates/searchform.html b/r2/r2/templates/searchform.html index e7c3b8fac..e75a4e0c9 100644 --- a/r2/r2/templates/searchform.html +++ b/r2/r2/templates/searchform.html @@ -68,7 +68,6 @@ % endif ${search_faq()} - %else: %if not thing.site or isinstance(thing.site, DefaultSR):