From 27c31666f3448264da9cd42a455ffb52437376c5 Mon Sep 17 00:00:00 2001
From: Keith Mitchell <kemitche@reddit.com>
Date: Fri, 11 May 2012 14:41:28 -0700
Subject: [PATCH] Use lucene syntax for searches

This uses the l2cs Python library to convert
search queries written in Lucene syntax into
Amazon CloudSearch syntax.

A mouseover blurb on the search results page
shows the CloudSearch query the input was converted to.
---
 r2/r2/controllers/front.py         | 25 ++++++----
 r2/r2/lib/cloudsearch.py           | 77 ++++++++++++++----------------
 r2/r2/lib/pages/pages.py           | 48 ++++++++++---------
 r2/r2/public/static/css/reddit.css | 15 ++++--
 r2/r2/templates/redditfooter.html  |  2 +-
 r2/r2/templates/searchbar.html     | 15 ++++--
 r2/r2/templates/searchform.html    |  1 -
 7 files changed, 101 insertions(+), 82 deletions(-)

diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py
index c8f7e372a..5da9df441 100755
--- a/r2/r2/controllers/front.py
+++ b/r2/r2/controllers/front.py
@@ -705,11 +705,13 @@ class FrontController(RedditController):
     search_help_page = "/help/search"
     verify_langs_regex = re.compile(r"\A[a-z][a-z](,[a-z][a-z])*\Z")
     @base_listing
-    @validate(query = VLength('q', max_length=512),
-              sort = VMenu('sort', SearchSortMenu, remember=False),
-              restrict_sr = VBoolean('restrict_sr', default=False))
+    @validate(query=VLength('q', max_length=512),
+              sort=VMenu('sort', SearchSortMenu, remember=False),
+              restrict_sr=VBoolean('restrict_sr', default=False),
+              syntax=VOneOf('syntax', options=SearchQuery.known_syntaxes))
     @api_doc(api_section.search, extensions=['json', 'xml'])
-    def GET_search(self, query, num, reverse, after, count, sort, restrict_sr):
+    def GET_search(self, query, num, reverse, after, count, sort, restrict_sr,
+                   syntax):
         """Search links page."""
         if query and '.' in query:
             url = sanitize_url(query, require_scheme = True)
@@ -720,16 +722,19 @@ class FrontController(RedditController):
             site = DefaultSR()
         else:
             site = c.site
+        
+        if not syntax:
+            syntax = SearchQuery.default_syntax
 
         try:
             cleanup_message = None
             try:
-                q = SearchQuery(query, site, sort)
+                q = SearchQuery(query, site, sort, syntax=syntax)
                 num, t, spane = self._search(q, num=num, after=after, 
                                              reverse = reverse, count = count)
             except InvalidQuery:
                 # strip the query down to a whitelist
-                cleaned = re.sub("[^\w\s]+", "", query)
+                cleaned = re.sub("[^\w\s]+", " ", query)
                 cleaned = cleaned.lower()
 
                 # if it was nothing but mess, we have to stop
@@ -738,7 +743,7 @@ class FrontController(RedditController):
                     cleanup_message = strings.completely_invalid_search_query
                 else:
                     q = SearchQuery(cleaned, site, sort)
-                    num, t, spane = self._search(q, num=num, after=after, 
+                    num, t, spane = self._search(q, num=num, after=after,
                                                  reverse=reverse, count=count)
                     cleanup_message = strings.invalid_search_query % {
                                           "clean_query": cleaned
@@ -749,11 +754,13 @@ class FrontController(RedditController):
                                                           }
             
             res = SearchPage(_('search results'), query, t, num, content=spane,
-                             nav_menus = [SearchSortMenu(default=sort)],
-                             search_params = dict(sort = sort), 
+                             nav_menus=[SearchSortMenu(default=sort)],
+                             search_params=dict(sort=sort),
                              infotext=cleanup_message,
                              simple=False, site=c.site,
                              restrict_sr=restrict_sr,
+                             syntax=syntax,
+                             converted_data=q.converted_data
                              ).render()
 
             return res
diff --git a/r2/r2/lib/cloudsearch.py b/r2/r2/lib/cloudsearch.py
index cfecd5951..94046314c 100644
--- a/r2/r2/lib/cloudsearch.py
+++ b/r2/r2/lib/cloudsearch.py
@@ -4,17 +4,18 @@ import httplib
 import json
 from lxml import etree
 from pylons import g, c
-import random
 import re
 import time
 import urllib
 
+import l2cs
+
 from r2.lib import amqp
 from r2.lib.db.operators import desc
 import r2.lib.utils as r2utils
-from r2.models import Account, Link, Subreddit, Thing, \
-    All, DefaultSR, MultiReddit, DomainSR, Friends, ModContribSR, \
-    FakeSubreddit, NotFound
+from r2.models import (Account, Link, Subreddit, Thing, All, DefaultSR,
+                       MultiReddit, DomainSR, Friends, ModContribSR,
+                       FakeSubreddit, NotFound)
 
 
 _CHUNK_SIZE = 4000000 # Approx. 4 MB, to stay under the 5MB limit
@@ -182,7 +183,8 @@ def xml_from_things(things):
 def delete_ids(ids):
     '''Delete documents from the index. 'ids' should be a list of fullnames'''
     version = _version()
-    deletes = [etree.Element("delete", id=id_, version=str(version)) for id_ in ids]
+    deletes = [etree.Element("delete", id=id_, version=str(version))
+               for id_ in ids]
     batch = etree.Element("batch")
     batch.extend(deletes)
     return send_documents(batch)
@@ -373,7 +375,7 @@ def _to_fn(cls, id_):
     require an instance of the class)
     
     '''
-    return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' + 
+    return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
             r2utils.to36(id_))
 
 
@@ -389,7 +391,8 @@ def basic_query(query=None, bq=None, facets=("reddit",), facet_count=10,
     timer = None
     if record_stats:
         timer = g.stats.get_timer("cloudsearch_timer")
-        timer.start()
+        if timer:
+            timer.start()
     connection = httplib.HTTPConnection(g.CLOUDSEARCH_SEARCH_API, 80)
     try:
         connection.request('GET', path)
@@ -454,8 +457,19 @@ class CloudSearchQuery(object):
                           'top': 3,
                           }
     
-    def __init__(self, query, sr, sort):
+    lucene_parser = l2cs.make_parser(int_fields=['timestamp'],
+                                     yesno_fields=['over18', 'is_self'])
+    known_syntaxes = ("cloudsearch", "lucene")
+    default_syntax = "lucene"
+    
+    def __init__(self, query, sr, sort, syntax=None):
+        if syntax is None:
+            syntax = self.default_syntax
+        elif syntax not in self.known_syntaxes:
+            raise ValueError("Unknown search syntax: %s" % syntax)
         self.query = query.encode("utf-8") if query else ''
+        self.converted_data = None
+        self.syntax = syntax
         self.sr = sr
         self._sort = sort
         self.sort = self.sorts[sort]
@@ -475,8 +489,8 @@ class CloudSearchQuery(object):
         self.results = Results(after_docs, hits, facets)
         return self.results
     
-    @staticmethod
-    def create_boolean_query(base_query, subreddit_query):
+    @classmethod
+    def create_boolean_query(cls, query, subreddit_query):
         '''Join a (user-entered) text query with the generated subreddit query
         
         Input:
@@ -489,19 +503,10 @@ class CloudSearchQuery(object):
                              without parens "author:'foo'"
         
         '''
-        is_boolean_query = any([x in base_query for x in ":()"])
-        
-        query = base_query.strip()
-        if not is_boolean_query:
-            query = query.replace("\\", "")
-            query = query.replace("'", "\\'")
-            query = "(field text '%s')" % query
-        
         if subreddit_query:
             bq = "(and %s %s)" % (query, subreddit_query)
         else:
             bq = query
-        
         return bq
     
     @staticmethod
@@ -527,7 +532,8 @@ class CloudSearchQuery(object):
             # The query limit is roughly 8k bytes. Limit to 200 friends to
             # avoid getting too close to that limit
             friend_ids = c.user.friends[:200]
-            friends = ["author_fullname:'%s'" % _to_fn(Account, id_) for id_ in friend_ids]
+            friends = ["author_fullname:'%s'" % _to_fn(Account, id_)
+                       for id_ in friend_ids]
             bq.extend(friends)
             bq.append(")")
         elif isinstance(sr, ModContribSR):
@@ -543,7 +549,13 @@ class CloudSearchQuery(object):
     def _run(self, start=0, num=1000, _update=False):
         '''Run the search against self.query'''
         subreddit_query = self._get_sr_restriction(self.sr)
-        self.bq = self.create_boolean_query(self.query, subreddit_query)
+        if self.syntax == "cloudsearch":
+            base_query = self.query
+        elif self.syntax == "lucene":
+            base_query = l2cs.convert(self.query, self.lucene_parser)
+            self.converted_data = {"syntax": "cloudsearch",
+                                   "converted": base_query}
+        self.bq = self.create_boolean_query(base_query, subreddit_query)
         if g.sqlprinting:
             g.log.info("%s", self)
         return self._run_cached(self.bq, self.sort, start=start, num=num,
@@ -551,7 +563,8 @@ class CloudSearchQuery(object):
     
     def __repr__(self):
         '''Return a string representation of this query'''
-        result = ["<", self.__class__.__name__, "> query:", repr(self.query), " "]
+        result = ["<", self.__class__.__name__, "> query:",
+                  repr(self.query), " "]
         if self.bq:
             result.append(" bq:")
             result.append(repr(self.bq))
@@ -612,23 +625,3 @@ class CloudSearchQuery(object):
         
         results = Results(docs, hits, facets)
         return results
-
-
-def test_create_boolean_query():
-    tests = [('steve holt', None),
-             ('steve holt', '(or sr_id:1 sr_id:2 sr_id:3)'),
-             ('steve holt', "author:'qgyh2'"),
-             ("can't help myself", None),
-             ("can't help myself", '(or sr_id:1 sr_id:2 sr_id:3)'),
-             ("can't help myself", "author:'qgyh2'"),
-             ("text:'steve holt'", None),
-             ("text:'steve holt'", '(or sr_id:1 sr_id:2 sr_id:3)'),
-             ("text:'steve holt'", "author:'qgyh2'"),
-             ("(or text:'steve holt' text:'nintendo')", None),
-             ("(or text:'steve holt' text:'nintendo')", '(or sr_id:1 sr_id:2 sr_id:3)'),
-             ("(or text:'steve holt' text:'nintendo')", "author:'qgyh2'")]
-    for test in tests:
-        print "Trying: %r" % (test,)
-        bq = CloudSearchQuery.create_boolean_query(*test)
-        print "Query: %r" % bq
-        basic_query(bq=bq, size=1)
diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py
index 85261048b..4ec8f51dd 100755
--- a/r2/r2/lib/pages/pages.py
+++ b/r2/r2/lib/pages/pages.py
@@ -822,13 +822,15 @@ class SearchPage(BoringPage):
     def __init__(self, pagename, prev_search, elapsed_time,
                  num_results, search_params = {},
                  simple=False, restrict_sr = False, site=None,
+                 syntax=None, converted_data=None,
                  *a, **kw):
-        self.searchbar = SearchBar(prev_search = prev_search,
-                                   elapsed_time = elapsed_time,
-                                   num_results = num_results,
-                                   search_params = search_params,
-                                   show_feedback = True, site=site,
-                                   simple=simple, restrict_sr=restrict_sr)
+        self.searchbar = SearchBar(prev_search=prev_search,
+                                   elapsed_time=elapsed_time,
+                                   num_results=num_results,
+                                   search_params=search_params,
+                                   show_feedback=True, site=site,
+                                   simple=simple, restrict_sr=restrict_sr,
+                                   syntax=syntax, converted_data=converted_data)
         BoringPage.__init__(self, pagename, robots='noindex', *a, **kw)
 
     def content(self):
@@ -1728,26 +1730,26 @@ class PaneStack(Templated):
 class SearchForm(Templated):
     """The simple search form in the header of the page.  prev_search
     is the previous search."""
-    def __init__(self, prev_search = '', search_params = {},
-                 site=None, simple=True, restrict_sr=False, 
-                 subreddit_search=False):
-        Templated.__init__(self, prev_search = prev_search,
-                           search_params = search_params, site=site,
-                           simple=simple, restrict_sr=restrict_sr, 
-                           subreddit_search=subreddit_search)
+    def __init__(self, prev_search='', search_params={}, site=None,
+                 simple=True, restrict_sr=False, subreddit_search=False,
+                 syntax=None):
+        Templated.__init__(self, prev_search=prev_search,
+                           search_params=search_params, site=site,
+                           simple=simple, restrict_sr=restrict_sr,
+                           subreddit_search=subreddit_search, syntax=syntax)
 
 
 class SearchBar(Templated):
     """More detailed search box for /search and /reddits pages.
     Displays the previous search as well as info of the elapsed_time
     and num_results if any."""
-    def __init__(self, num_results = 0, prev_search = '', elapsed_time = 0,
-                 search_params = {}, show_feedback=False,
-                 simple=False, restrict_sr=False, site=None,
-                 subreddit_search=False, **kw):
-
-        # not listed explicitly in args to ensure it translates properly
-        self.header = kw.get('header', _("previous search"))
+    def __init__(self, header=None, num_results=0, prev_search='',
+                 elapsed_time=0, search_params={}, show_feedback=False,
+                 simple=False, restrict_sr=False, site=None, syntax=None,
+                 subreddit_search=False, converted_data=None, **kw):
+        if header is None:
+            header = _("previous search")
+        self.header = header
 
         self.prev_search  = prev_search
         self.elapsed_time = elapsed_time
@@ -1759,9 +1761,11 @@ class SearchBar(Templated):
         else:
             self.num_results = num_results
 
-        Templated.__init__(self, search_params = search_params,
+        Templated.__init__(self, search_params=search_params,
                            simple=simple, restrict_sr=restrict_sr,
-                           site=site, subreddit_search=subreddit_search)
+                           site=site, syntax=syntax,
+                           converted_data=converted_data,
+                           subreddit_search=subreddit_search)
 
 class Frame(Wrapped):
     """Frameset for the FrameToolbar used when a user hits /tb/. The
diff --git a/r2/r2/public/static/css/reddit.css b/r2/r2/public/static/css/reddit.css
index 1a0fa1ed5..db137d573 100755
--- a/r2/r2/public/static/css/reddit.css
+++ b/r2/r2/public/static/css/reddit.css
@@ -2254,10 +2254,17 @@ label + #moresearchinfo {
 
 .bottommenu { color: gray; font-size: smaller; clear: both}
 .bottommenu a { color: gray; text-decoration: underline;   }
-.bottommenu.serverinfo { text-align:right; padding:5px; }
-.bottommenu.serverinfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
-.bottommenu.serverinfo .content { display:none; }
-.bottommenu.serverinfo:hover .content { display:inline; }
+
+.debuginfo {
+    text-align: right;
+    padding: 5px;
+    color: gray;
+    font-size: smaller;
+    clear: both;
+}
+.debuginfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
+.debuginfo .content { display:none; }
+.debuginfo:hover .content { display:inline; }
 
 
 /* Buttons specific */
diff --git a/r2/r2/templates/redditfooter.html b/r2/r2/templates/redditfooter.html
index edf6772cc..10c1f8a17 100644
--- a/r2/r2/templates/redditfooter.html
+++ b/r2/r2/templates/redditfooter.html
@@ -54,5 +54,5 @@
     dict(year=datetime.datetime.now().timetuple()[0])}
   </p>
   <p class="bottommenu">REDDIT and the ALIEN Logo are registered trademarks of reddit inc.</p>
-  <p class="bottommenu serverinfo"><span class="icon">&pi;</span>&nbsp;<span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
+  <p class="bottommenu debuginfo"><span class="icon">&pi;</span>&nbsp;<span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
 </div>
diff --git a/r2/r2/templates/searchbar.html b/r2/r2/templates/searchbar.html
index 42fe9cbe0..dc84f77bb 100644
--- a/r2/r2/templates/searchbar.html
+++ b/r2/r2/templates/searchbar.html
@@ -51,6 +51,14 @@
       </div>
     </div>
     %endif
+    <div>
+    %if thing.converted_data:
+    <p class="debuginfo">
+      <span class="icon">&delta;</span>&nbsp;
+      <span class="content">${_('converted query to %(syntax)s syntax: %(converted)s') % thing.converted_data}</span>
+    </p>
+    %endif
+    </div>
   </div>
 %endif
 
@@ -58,9 +66,10 @@
   <h4 style="color:gray">${thing.header}</h4>
 
   <div id="previoussearch">
-    ${SearchForm(prev_search = thing.prev_search,
-                 search_params = thing.search_params,
+    ${SearchForm(prev_search=thing.prev_search,
+                 search_params=thing.search_params,
                  site=thing.site, subreddit_search=thing.subreddit_search,
-                 simple=thing.simple, restrict_sr=thing.restrict_sr)}
+                 simple=thing.simple, restrict_sr=thing.restrict_sr,
+                 syntax=thing.syntax)}
   </div>
 </div>
diff --git a/r2/r2/templates/searchform.html b/r2/r2/templates/searchform.html
index e7c3b8fac..e75a4e0c9 100644
--- a/r2/r2/templates/searchform.html
+++ b/r2/r2/templates/searchform.html
@@ -68,7 +68,6 @@
       % endif
 
       ${search_faq()}
-
   </div>
   %else:
     %if not thing.site or isinstance(thing.site, DefaultSR):