mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-02-01 02:05:03 -05:00
Use lucene syntax for searches
This uses the l2cs Python library to convert search queries written in "lucene" syntax to Amazon's CloudSearch syntax. A mouseover blurb on the search results page shows what the query was parsed as.
This commit is contained in:
@@ -705,11 +705,13 @@ class FrontController(RedditController):
|
||||
search_help_page = "/help/search"
|
||||
verify_langs_regex = re.compile(r"\A[a-z][a-z](,[a-z][a-z])*\Z")
|
||||
@base_listing
|
||||
@validate(query = VLength('q', max_length=512),
|
||||
sort = VMenu('sort', SearchSortMenu, remember=False),
|
||||
restrict_sr = VBoolean('restrict_sr', default=False))
|
||||
@validate(query=VLength('q', max_length=512),
|
||||
sort=VMenu('sort', SearchSortMenu, remember=False),
|
||||
restrict_sr=VBoolean('restrict_sr', default=False),
|
||||
syntax=VOneOf('syntax', options=SearchQuery.known_syntaxes))
|
||||
@api_doc(api_section.search, extensions=['json', 'xml'])
|
||||
def GET_search(self, query, num, reverse, after, count, sort, restrict_sr):
|
||||
def GET_search(self, query, num, reverse, after, count, sort, restrict_sr,
|
||||
syntax):
|
||||
"""Search links page."""
|
||||
if query and '.' in query:
|
||||
url = sanitize_url(query, require_scheme = True)
|
||||
@@ -720,16 +722,19 @@ class FrontController(RedditController):
|
||||
site = DefaultSR()
|
||||
else:
|
||||
site = c.site
|
||||
|
||||
if not syntax:
|
||||
syntax = SearchQuery.default_syntax
|
||||
|
||||
try:
|
||||
cleanup_message = None
|
||||
try:
|
||||
q = SearchQuery(query, site, sort)
|
||||
q = SearchQuery(query, site, sort, syntax=syntax)
|
||||
num, t, spane = self._search(q, num=num, after=after,
|
||||
reverse = reverse, count = count)
|
||||
except InvalidQuery:
|
||||
# strip the query down to a whitelist
|
||||
cleaned = re.sub("[^\w\s]+", "", query)
|
||||
cleaned = re.sub("[^\w\s]+", " ", query)
|
||||
cleaned = cleaned.lower()
|
||||
|
||||
# if it was nothing but mess, we have to stop
|
||||
@@ -738,7 +743,7 @@ class FrontController(RedditController):
|
||||
cleanup_message = strings.completely_invalid_search_query
|
||||
else:
|
||||
q = SearchQuery(cleaned, site, sort)
|
||||
num, t, spane = self._search(q, num=num, after=after,
|
||||
num, t, spane = self._search(q, num=num, after=after,
|
||||
reverse=reverse, count=count)
|
||||
cleanup_message = strings.invalid_search_query % {
|
||||
"clean_query": cleaned
|
||||
@@ -749,11 +754,13 @@ class FrontController(RedditController):
|
||||
}
|
||||
|
||||
res = SearchPage(_('search results'), query, t, num, content=spane,
|
||||
nav_menus = [SearchSortMenu(default=sort)],
|
||||
search_params = dict(sort = sort),
|
||||
nav_menus=[SearchSortMenu(default=sort)],
|
||||
search_params=dict(sort=sort),
|
||||
infotext=cleanup_message,
|
||||
simple=False, site=c.site,
|
||||
restrict_sr=restrict_sr,
|
||||
syntax=syntax,
|
||||
converted_data=q.converted_data
|
||||
).render()
|
||||
|
||||
return res
|
||||
|
||||
@@ -4,17 +4,18 @@ import httplib
|
||||
import json
|
||||
from lxml import etree
|
||||
from pylons import g, c
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
import urllib
|
||||
|
||||
import l2cs
|
||||
|
||||
from r2.lib import amqp
|
||||
from r2.lib.db.operators import desc
|
||||
import r2.lib.utils as r2utils
|
||||
from r2.models import Account, Link, Subreddit, Thing, \
|
||||
All, DefaultSR, MultiReddit, DomainSR, Friends, ModContribSR, \
|
||||
FakeSubreddit, NotFound
|
||||
from r2.models import (Account, Link, Subreddit, Thing, All, DefaultSR,
|
||||
MultiReddit, DomainSR, Friends, ModContribSR,
|
||||
FakeSubreddit, NotFound)
|
||||
|
||||
|
||||
_CHUNK_SIZE = 4000000 # Approx. 4 MB, to stay under the 5MB limit
|
||||
@@ -182,7 +183,8 @@ def xml_from_things(things):
|
||||
def delete_ids(ids):
|
||||
'''Delete documents from the index. 'ids' should be a list of fullnames'''
|
||||
version = _version()
|
||||
deletes = [etree.Element("delete", id=id_, version=str(version)) for id_ in ids]
|
||||
deletes = [etree.Element("delete", id=id_, version=str(version))
|
||||
for id_ in ids]
|
||||
batch = etree.Element("batch")
|
||||
batch.extend(deletes)
|
||||
return send_documents(batch)
|
||||
@@ -373,7 +375,7 @@ def _to_fn(cls, id_):
|
||||
require an instance of the class)
|
||||
|
||||
'''
|
||||
return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
|
||||
return (cls._type_prefix + r2utils.to36(cls._type_id) + '_' +
|
||||
r2utils.to36(id_))
|
||||
|
||||
|
||||
@@ -389,7 +391,8 @@ def basic_query(query=None, bq=None, facets=("reddit",), facet_count=10,
|
||||
timer = None
|
||||
if record_stats:
|
||||
timer = g.stats.get_timer("cloudsearch_timer")
|
||||
timer.start()
|
||||
if timer:
|
||||
timer.start()
|
||||
connection = httplib.HTTPConnection(g.CLOUDSEARCH_SEARCH_API, 80)
|
||||
try:
|
||||
connection.request('GET', path)
|
||||
@@ -454,8 +457,19 @@ class CloudSearchQuery(object):
|
||||
'top': 3,
|
||||
}
|
||||
|
||||
def __init__(self, query, sr, sort):
|
||||
lucene_parser = l2cs.make_parser(int_fields=['timestamp'],
|
||||
yesno_fields=['over18', 'is_self'])
|
||||
known_syntaxes = ("cloudsearch", "lucene")
|
||||
default_syntax = "lucene"
|
||||
|
||||
def __init__(self, query, sr, sort, syntax=None):
|
||||
if syntax is None:
|
||||
syntax = self.default_syntax
|
||||
elif syntax not in self.known_syntaxes:
|
||||
raise ValueError("Unknown search syntax: %s" % syntax)
|
||||
self.query = query.encode("utf-8") if query else ''
|
||||
self.converted_data = None
|
||||
self.syntax = syntax
|
||||
self.sr = sr
|
||||
self._sort = sort
|
||||
self.sort = self.sorts[sort]
|
||||
@@ -475,8 +489,8 @@ class CloudSearchQuery(object):
|
||||
self.results = Results(after_docs, hits, facets)
|
||||
return self.results
|
||||
|
||||
@staticmethod
|
||||
def create_boolean_query(base_query, subreddit_query):
|
||||
@classmethod
|
||||
def create_boolean_query(cls, query, subreddit_query):
|
||||
'''Join a (user-entered) text query with the generated subreddit query
|
||||
|
||||
Input:
|
||||
@@ -489,19 +503,10 @@ class CloudSearchQuery(object):
|
||||
without parens "author:'foo'"
|
||||
|
||||
'''
|
||||
is_boolean_query = any([x in base_query for x in ":()"])
|
||||
|
||||
query = base_query.strip()
|
||||
if not is_boolean_query:
|
||||
query = query.replace("\\", "")
|
||||
query = query.replace("'", "\\'")
|
||||
query = "(field text '%s')" % query
|
||||
|
||||
if subreddit_query:
|
||||
bq = "(and %s %s)" % (query, subreddit_query)
|
||||
else:
|
||||
bq = query
|
||||
|
||||
return bq
|
||||
|
||||
@staticmethod
|
||||
@@ -527,7 +532,8 @@ class CloudSearchQuery(object):
|
||||
# The query limit is roughly 8k bytes. Limit to 200 friends to
|
||||
# avoid getting too close to that limit
|
||||
friend_ids = c.user.friends[:200]
|
||||
friends = ["author_fullname:'%s'" % _to_fn(Account, id_) for id_ in friend_ids]
|
||||
friends = ["author_fullname:'%s'" % _to_fn(Account, id_)
|
||||
for id_ in friend_ids]
|
||||
bq.extend(friends)
|
||||
bq.append(")")
|
||||
elif isinstance(sr, ModContribSR):
|
||||
@@ -543,7 +549,13 @@ class CloudSearchQuery(object):
|
||||
def _run(self, start=0, num=1000, _update=False):
|
||||
'''Run the search against self.query'''
|
||||
subreddit_query = self._get_sr_restriction(self.sr)
|
||||
self.bq = self.create_boolean_query(self.query, subreddit_query)
|
||||
if self.syntax == "cloudsearch":
|
||||
base_query = self.query
|
||||
elif self.syntax == "lucene":
|
||||
base_query = l2cs.convert(self.query, self.lucene_parser)
|
||||
self.converted_data = {"syntax": "cloudsearch",
|
||||
"converted": base_query}
|
||||
self.bq = self.create_boolean_query(base_query, subreddit_query)
|
||||
if g.sqlprinting:
|
||||
g.log.info("%s", self)
|
||||
return self._run_cached(self.bq, self.sort, start=start, num=num,
|
||||
@@ -551,7 +563,8 @@ class CloudSearchQuery(object):
|
||||
|
||||
def __repr__(self):
|
||||
'''Return a string representation of this query'''
|
||||
result = ["<", self.__class__.__name__, "> query:", repr(self.query), " "]
|
||||
result = ["<", self.__class__.__name__, "> query:",
|
||||
repr(self.query), " "]
|
||||
if self.bq:
|
||||
result.append(" bq:")
|
||||
result.append(repr(self.bq))
|
||||
@@ -612,23 +625,3 @@ class CloudSearchQuery(object):
|
||||
|
||||
results = Results(docs, hits, facets)
|
||||
return results
|
||||
|
||||
|
||||
def test_create_boolean_query():
|
||||
tests = [('steve holt', None),
|
||||
('steve holt', '(or sr_id:1 sr_id:2 sr_id:3)'),
|
||||
('steve holt', "author:'qgyh2'"),
|
||||
("can't help myself", None),
|
||||
("can't help myself", '(or sr_id:1 sr_id:2 sr_id:3)'),
|
||||
("can't help myself", "author:'qgyh2'"),
|
||||
("text:'steve holt'", None),
|
||||
("text:'steve holt'", '(or sr_id:1 sr_id:2 sr_id:3)'),
|
||||
("text:'steve holt'", "author:'qgyh2'"),
|
||||
("(or text:'steve holt' text:'nintendo')", None),
|
||||
("(or text:'steve holt' text:'nintendo')", '(or sr_id:1 sr_id:2 sr_id:3)'),
|
||||
("(or text:'steve holt' text:'nintendo')", "author:'qgyh2'")]
|
||||
for test in tests:
|
||||
print "Trying: %r" % (test,)
|
||||
bq = CloudSearchQuery.create_boolean_query(*test)
|
||||
print "Query: %r" % bq
|
||||
basic_query(bq=bq, size=1)
|
||||
|
||||
@@ -822,13 +822,15 @@ class SearchPage(BoringPage):
|
||||
def __init__(self, pagename, prev_search, elapsed_time,
|
||||
num_results, search_params = {},
|
||||
simple=False, restrict_sr = False, site=None,
|
||||
syntax=None, converted_data=None,
|
||||
*a, **kw):
|
||||
self.searchbar = SearchBar(prev_search = prev_search,
|
||||
elapsed_time = elapsed_time,
|
||||
num_results = num_results,
|
||||
search_params = search_params,
|
||||
show_feedback = True, site=site,
|
||||
simple=simple, restrict_sr=restrict_sr)
|
||||
self.searchbar = SearchBar(prev_search=prev_search,
|
||||
elapsed_time=elapsed_time,
|
||||
num_results=num_results,
|
||||
search_params=search_params,
|
||||
show_feedback=True, site=site,
|
||||
simple=simple, restrict_sr=restrict_sr,
|
||||
syntax=syntax, converted_data=converted_data)
|
||||
BoringPage.__init__(self, pagename, robots='noindex', *a, **kw)
|
||||
|
||||
def content(self):
|
||||
@@ -1728,26 +1730,26 @@ class PaneStack(Templated):
|
||||
class SearchForm(Templated):
|
||||
"""The simple search form in the header of the page. prev_search
|
||||
is the previous search."""
|
||||
def __init__(self, prev_search = '', search_params = {},
|
||||
site=None, simple=True, restrict_sr=False,
|
||||
subreddit_search=False):
|
||||
Templated.__init__(self, prev_search = prev_search,
|
||||
search_params = search_params, site=site,
|
||||
simple=simple, restrict_sr=restrict_sr,
|
||||
subreddit_search=subreddit_search)
|
||||
def __init__(self, prev_search='', search_params={}, site=None,
|
||||
simple=True, restrict_sr=False, subreddit_search=False,
|
||||
syntax=None):
|
||||
Templated.__init__(self, prev_search=prev_search,
|
||||
search_params=search_params, site=site,
|
||||
simple=simple, restrict_sr=restrict_sr,
|
||||
subreddit_search=subreddit_search, syntax=syntax)
|
||||
|
||||
|
||||
class SearchBar(Templated):
|
||||
"""More detailed search box for /search and /reddits pages.
|
||||
Displays the previous search as well as info of the elapsed_time
|
||||
and num_results if any."""
|
||||
def __init__(self, num_results = 0, prev_search = '', elapsed_time = 0,
|
||||
search_params = {}, show_feedback=False,
|
||||
simple=False, restrict_sr=False, site=None,
|
||||
subreddit_search=False, **kw):
|
||||
|
||||
# not listed explicitly in args to ensure it translates properly
|
||||
self.header = kw.get('header', _("previous search"))
|
||||
def __init__(self, header=None, num_results=0, prev_search='',
|
||||
elapsed_time=0, search_params={}, show_feedback=False,
|
||||
simple=False, restrict_sr=False, site=None, syntax=None,
|
||||
subreddit_search=False, converted_data=None, **kw):
|
||||
if header is None:
|
||||
header = _("previous search")
|
||||
self.header = header
|
||||
|
||||
self.prev_search = prev_search
|
||||
self.elapsed_time = elapsed_time
|
||||
@@ -1759,9 +1761,11 @@ class SearchBar(Templated):
|
||||
else:
|
||||
self.num_results = num_results
|
||||
|
||||
Templated.__init__(self, search_params = search_params,
|
||||
Templated.__init__(self, search_params=search_params,
|
||||
simple=simple, restrict_sr=restrict_sr,
|
||||
site=site, subreddit_search=subreddit_search)
|
||||
site=site, syntax=syntax,
|
||||
converted_data=converted_data,
|
||||
subreddit_search=subreddit_search)
|
||||
|
||||
class Frame(Wrapped):
|
||||
"""Frameset for the FrameToolbar used when a user hits /tb/. The
|
||||
|
||||
@@ -2254,10 +2254,17 @@ label + #moresearchinfo {
|
||||
|
||||
.bottommenu { color: gray; font-size: smaller; clear: both}
|
||||
.bottommenu a { color: gray; text-decoration: underline; }
|
||||
.bottommenu.serverinfo { text-align:right; padding:5px; }
|
||||
.bottommenu.serverinfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
|
||||
.bottommenu.serverinfo .content { display:none; }
|
||||
.bottommenu.serverinfo:hover .content { display:inline; }
|
||||
|
||||
.debuginfo {
|
||||
text-align: right;
|
||||
padding: 5px;
|
||||
color: gray;
|
||||
font-size: smaller;
|
||||
clear: both;
|
||||
}
|
||||
.debuginfo .icon { color:#a0a0a0; font:1.5em serif; padding:0 2px; }
|
||||
.debuginfo .content { display:none; }
|
||||
.debuginfo:hover .content { display:inline; }
|
||||
|
||||
|
||||
/* Buttons specific */
|
||||
|
||||
@@ -54,5 +54,5 @@
|
||||
dict(year=datetime.datetime.now().timetuple()[0])}
|
||||
</p>
|
||||
<p class="bottommenu">REDDIT and the ALIEN Logo are registered trademarks of reddit inc.</p>
|
||||
<p class="bottommenu serverinfo"><span class="icon">π</span> <span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
|
||||
<p class="bottommenu debuginfo"><span class="icon">π</span> <span class="content">Rendered by PID ${g.reddit_pid} on ${g.reddit_host} running ${g.short_version}.</span></p>
|
||||
</div>
|
||||
|
||||
@@ -51,6 +51,14 @@
|
||||
</div>
|
||||
</div>
|
||||
%endif
|
||||
<div>
|
||||
%if thing.converted_data:
|
||||
<p class="debuginfo">
|
||||
<span class="icon">δ</span>
|
||||
<span class="content">${_('converted query to %(syntax)s syntax: %(converted)s') % thing.converted_data}</span>
|
||||
</p>
|
||||
%endif
|
||||
</div>
|
||||
</div>
|
||||
%endif
|
||||
|
||||
@@ -58,9 +66,10 @@
|
||||
<h4 style="color:gray">${thing.header}</h4>
|
||||
|
||||
<div id="previoussearch">
|
||||
${SearchForm(prev_search = thing.prev_search,
|
||||
search_params = thing.search_params,
|
||||
${SearchForm(prev_search=thing.prev_search,
|
||||
search_params=thing.search_params,
|
||||
site=thing.site, subreddit_search=thing.subreddit_search,
|
||||
simple=thing.simple, restrict_sr=thing.restrict_sr)}
|
||||
simple=thing.simple, restrict_sr=thing.restrict_sr,
|
||||
syntax=thing.syntax)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -68,7 +68,6 @@
|
||||
% endif
|
||||
|
||||
${search_faq()}
|
||||
|
||||
</div>
|
||||
%else:
|
||||
%if not thing.site or isinstance(thing.site, DefaultSR):
|
||||
|
||||
Reference in New Issue
Block a user