Tools for webmasters to monitor their content on reddit (uses Solr for pulling domain information). Also includes a refactor of solrsearch.py and its usage, which should fix bug #179 as a side effect.
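In rough terms, the refactor replaces the pile of keyword arguments that search_things() took with small query objects that know how to run themselves against Solr. A minimal sketch of the intended usage, assuming the names introduced in the diff below (the literal query strings and example arguments are illustrative, and the query classes expect the usual pylons request context):

# Sketch only: the query classes are from the solrsearch.py hunk below; the
# example query text and arguments are assumptions, not part of the commit.
from r2.lib.solrsearch import LinkSearchQuery, DomainSearchQuery

# Controllers now build a query object up front instead of threading a dozen
# keyword arguments through search_things():
q = LinkSearchQuery(q = 'some search terms', timerange = 'week', langs = None)

# SearchBuilder (see the builder.py hunk) holds the object and calls
# q.run(after = ..., num = ..., reverse = ...) whenever it needs more results.
results = q.run(num = 25)          # pysolr.Results whose docs are fullnames

# Domain pages for webmasters reuse the same machinery:
domain_q = DomainSearchQuery('example.com', sort = 'hot desc', timerange = 'all')
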
@@ -385,6 +385,8 @@ CondeNet, Inc. All Rights Reserved.
<field name="hot" type="hotness" indexed="true" stored="true" required="true" reversed="true" />
<field name="controversy" type="sfloat" indexed="true" stored="true" required="true" reversed="true" />
<field name="points" type="sint" indexed="true" stored="true" required="true" reversed="true" />
<field name="spam" type="boolean" indexed="true" stored="true" required="false" />
<field name="deleted" type="boolean" indexed="true" stored="true" required="false" />
<!-- subreddit,link,comment -->
<field name="author_id" type="integer" indexed="true" stored="false" required="false" />
<field name="author" type="string" indexed="true" stored="false" required="false" />

@@ -34,6 +34,7 @@ from pylons.wsgiapp import PylonsApp, PylonsBaseWSGIApp
from r2.config.environment import load_environment
from r2.config.rewrites import rewrites
from r2.lib.utils import rstrips
from r2.lib.jsontemplates import api_type

#middleware stuff
from r2.lib.html_source import HTMLValidationParser
@@ -240,7 +241,7 @@ class DomainMiddleware(object):


class SubredditMiddleware(object):
sr_pattern = re.compile(r'^/r/([^/]+)')
sr_pattern = re.compile(r'^/r/([^/]{3,20})')

def __init__(self, app):
self.app = app
@@ -255,18 +256,50 @@ class SubredditMiddleware(object):
environ['subreddit'] = 'r'
return self.app(environ, start_response)

class DomainListingMiddleware(object):
domain_pattern = re.compile(r'^/domain/(([\w]+\.)+[\w]+)')

def __init__(self, app):
self.app = app

def __call__(self, environ, start_response):
if not environ.has_key('subreddit'):
path = environ['PATH_INFO']
domain = self.domain_pattern.match(path)
if domain:
environ['domain'] = domain.groups()[0]
environ['PATH_INFO'] = self.domain_pattern.sub('', path) or '/'
return self.app(environ, start_response)

class ExtensionMiddleware(object):
ext_pattern = re.compile(r'\.([^/]+)$')

extensions = {'rss' : ('xml', 'text/xml; charset=UTF-8'),
'xml' : ('xml', 'text/xml; charset=UTF-8'),
'js' : ('js', 'text/javascript; charset=UTF-8'),
'wired' : ('wired', 'text/javascript; charset=UTF-8'),
'embed' : ('htmllite', 'text/javascript; charset=UTF-8'),
'mobile' : ('mobile', 'text/html'),
'png' : ('png', 'image/png'),
'css' : ('css', 'text/css'),
'api' : (api_type(), 'application/json; charset=UTF-8'),
'json' : (api_type(), 'application/json; charset=UTF-8'),
'json-html' : (api_type('html'), 'application/json; charset=UTF-8')}

def __init__(self, app):
self.app = app

def __call__(self, environ, start_response):
path = environ['PATH_INFO']
ext = self.ext_pattern.findall(path)
if ext:
environ['extension'] = ext[0]
environ['PATH_INFO'] = self.ext_pattern.sub('', path) or '/'
domain_ext = environ.get('reddit-domain-extension')
for ext, val in self.extensions.iteritems():
if ext == domain_ext or path.endswith(ext):
environ['extension'] = ext
environ['render_style'] = val[0]
environ['content_type'] = val[1]
#strip off the extension
environ['PATH_INFO'] = path[:-(len(ext) + 1)]
break
return self.app(environ, start_response)

class RewriteMiddleware(object):
@@ -382,11 +415,11 @@ def make_app(global_conf, full_stack=True, **app_conf):
app = ProfilingMiddleware(app)
app = SourceViewMiddleware(app)

app = SubredditMiddleware(app)
app = DomainMiddleware(app)
app = DomainListingMiddleware(app)
app = SubredditMiddleware(app)
app = ExtensionMiddleware(app)


log_path = global_conf.get('log_path')
if log_path:
process_iden = global_conf.get('scgi_port', 'default')

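Taken together with the set_subreddit() and DomainSR changes further down, the new middleware is what gives /domain/ URLs their own listing. A small illustration of what DomainListingMiddleware does to a request path (the path itself is hypothetical):

import re

# domain_pattern as defined in DomainListingMiddleware above
domain_pattern = re.compile(r'^/domain/(([\w]+\.)+[\w]+)')

path = '/domain/example.com/new'                          # hypothetical request path
m = domain_pattern.match(path)
assert m.groups()[0] == 'example.com'                     # -> environ['domain']
assert (domain_pattern.sub('', path) or '/') == '/new'    # -> environ['PATH_INFO']

# set_subreddit() (in the reddit_base.py hunk below) then maps that domain to
# c.site = DomainSR(domain), whose get_links() delegates to
# queries.get_domain_links(), i.e. a Solr DomainSearchQuery.
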
@@ -32,12 +32,14 @@ from r2.lib.template_helpers import get_domain
from r2.lib.emailer import has_opted_out, Email
from r2.lib.db.operators import desc
from r2.lib.strings import strings
from r2.lib.solrsearch import RelatedSearchQuery, SubredditSearchQuery, LinkSearchQuery
import r2.lib.db.thing as thing
from listingcontroller import ListingController
from pylons import c, request

import random as rand
import re
import time as time_module
from urllib import quote_plus

from admin import admin_profile_query
@@ -292,6 +294,7 @@ class FrontController(RedditController):
def GET_related(self, num, article, after, reverse, count):
"""Related page: performs a search using title of article as
the search query."""

title = c.site.name + ((': ' + article.title) if hasattr(article, 'title') else '')

query = self.related_replace_regex.sub(self.related_replace_with,
@@ -301,24 +304,25 @@ class FrontController(RedditController):
# longer than this are typically ascii art anyway
query = query[0:1023]

num, t, pane = self._search(query, time = 'all',
count = count,
after = after, reverse = reverse, num = num,
ignore = [article._fullname],
types = [Link])
res = LinkInfoPage(link = article, content = pane).render()
return res
q = RelatedSearchQuery(query, ignore = [article._fullname])
num, t, pane = self._search(q,
num = num, after = after, reverse = reverse,
count = count)

return LinkInfoPage(link = article, content = pane).render()

@base_listing
@validate(query = nop('q'))
def GET_search_reddits(self, query, reverse, after, count, num):
"""Search reddits by title and description."""
num, t, spane = self._search(query, num = num, types = [Subreddit],
sort='points desc', time='all',
after = after, reverse = reverse,
# note that 'downs' is a measure of activity on subreddits
q = SubredditSearchQuery(query, sort = 'downs desc',
timerange = 'all')

num, t, spane = self._search(q, num = num, reverse = reverse, after = after,
count = count)

res = SubredditsPage(content=spane,
res = SubredditsPage(content=spane,
prev_search = query,
elapsed_time = t,
num_results = num,
@@ -327,7 +331,7 @@ class FrontController(RedditController):

verify_langs_regex = re.compile(r"^[a-z][a-z](,[a-z][a-z])*$")
@base_listing
@validate(query=nop('q'),
@validate(query = nop('q'),
time = VMenu('action', TimeMenu, remember = False),
langs = nop('langs'))
def GET_search(self, query, num, time, reverse, after, count, langs):
@@ -340,12 +344,12 @@ class FrontController(RedditController):
if langs and self.verify_langs_regex.match(langs):
langs = langs.split(',')
else:
langs = None
langs = c.content_langs

num, t, spane = self._search(query, time=time,
num = num, after = after,
reverse = reverse,
count = count, types = [Link])
q = LinkSearchQuery(q = query, timerange = time, langs = langs)

num, t, spane = self._search(q, num = num, after = after, reverse = reverse,
count = count)

if not isinstance(c.site,FakeSubreddit):
my_reddits_link = "/search%s" % query_string({'q': query})
@@ -365,26 +369,22 @@ class FrontController(RedditController):

return res

def _search(self, query = '', time=None,
sort = 'hot desc',
after = None, reverse = False, num = 25,
ignore = None, count=0, types = None,
langs = None):
def _search(self, query_obj, num, after, reverse, count=0):
"""Helper function for interfacing with search. Basically a
thin wrapper for SearchBuilder."""
builder = SearchBuilder(query, num = num,
sort = sort,
after = after, reverse = reverse,
count = count, types = types,
time = time, ignore = ignore,
langs = langs,
builder = SearchBuilder(query_obj,
after = after, num = num, reverse = reverse,
count = count,
wrap = ListingController.builder_wrapper)

listing = LinkListing(builder, show_nums=True)

# have to do it in two steps since total_num and timing are only
# computed after fetch_more
res = listing.listing()
return builder.total_num, builder.timing, res
timing = time_module.time() - builder.start_time

return builder.total_num, timing, res

@@ -33,6 +33,7 @@ from r2.lib.db.thing import Query, Merge, Relations
from r2.lib.db import queries
from r2.lib.strings import Score
from r2.lib import organic
from r2.lib.solrsearch import SearchQuery
from r2.lib.utils import iters, check_cheating

from admin import admin_profile_query
@@ -112,6 +113,8 @@ class ListingController(RedditController):
builder_cls = self.builder_cls
elif isinstance(self.query_obj, Query):
builder_cls = QueryBuilder
elif isinstance(self.query_obj, SearchQuery):
builder_cls = SearchBuilder
elif isinstance(self.query_obj, iters):
builder_cls = IDBuilder
elif isinstance(self.query_obj, queries.CachedResults):

@@ -212,13 +212,18 @@ def over18():
return True

def set_subreddit():
sr_name=request.environ.get("subreddit", request.params.get('r'))
#the r parameter gets added by javascript for POST requests so we
#can reference c.site in api.py
sr_name = request.environ.get("subreddit", request.POST.get('r'))
domain = request.environ.get("domain")

if not sr_name or sr_name == Default.name:
if not sr_name:
#check for cnames
sub_domain = request.environ.get('sub_domain')
sr = Subreddit._by_domain(sub_domain) if sub_domain else None
c.site = sr or Default
elif sr_name == 'r':
#reddits
c.site = Sub
else:
try:
@@ -227,6 +232,10 @@ def set_subreddit():
c.site = Default
redirect_to("/reddits/create?name=%s" % sr_name)

#if we didn't find a subreddit, check for a domain listing
if not sr_name and c.site == Default and domain:
c.site = DomainSR(domain)

if isinstance(c.site, FakeSubreddit):
c.default_sr = True

@@ -235,42 +244,16 @@ def set_subreddit():
abort(404, "not found")

def set_content_type():
c.extension = request.environ.get('extension') or \
request.environ.get('reddit-domain-extension') or ''
c.render_style = 'html'
if c.extension in ('rss', 'xml'):
c.render_style = 'xml'
c.response_content_type = 'text/xml; charset=UTF-8'
elif c.extension == 'js':
c.render_style = 'js'
c.response_content_type = 'text/javascript; charset=UTF-8'
elif c.extension.startswith('json') or c.extension == "api":
c.response_content_type = 'application/json; charset=UTF-8'
c.response_access_control = 'allow <*>'
if c.extension == 'json-html':
c.render_style = api_type('html')
else:
c.render_style = api_type()
elif c.extension == 'wired':
c.render_style = 'wired'
c.response_content_type = 'text/javascript; charset=UTF-8'
c.response_wrappers.append(utils.to_js)
elif c.extension == 'embed':
c.render_style = 'htmllite'
c.response_content_type = 'text/javascript; charset=UTF-8'
c.response_wrappers.append(utils.to_js)
elif c.extension == 'mobile':
c.render_style = 'mobile'
elif c.extension == 'png':
c.response_content_type = 'image/png'
c.render_style = 'png'
elif c.extension == 'css':
c.response_content_type = 'text/css'
c.render_style = 'css'
#Insert new extentions above this line
elif c.extension not in ('', 'html'):
# request.path already has the extension stripped off of it
redirect_to(request.path + utils.query_string(request.get))
e = request.environ
if e.has_key('extension'):
c.render_style = e['render_style']
c.response_content_type = e['content_type']

ext = e['extension']
if ext == 'api' or ext.startswith('json'):
c.response_access_control = 'allow <*>'
if ext in ('embed', 'wired'):
c.response_wrappers.append(utils.to_js)

def get_browser_langs():
browser_langs = []

@@ -118,7 +118,8 @@ class BaseController(WSGIController):
u.mk_cname(**kw)

# make sure the extensions agree with the current page
u.set_extension(c.extension)
if c.extension:
u.set_extension(c.extension)

# unparse and encode it un utf8
return _force_unicode(u.unparse()).encode('utf8')

@@ -225,7 +225,7 @@ def test_cache(cache):
# a cache that occasionally dumps itself to be used for long-running
# processes
class SelfEmptyingCache(LocalCache):
def __init__(self,max_size=50*1000):
def __init__(self,max_size=100*1000):
self.max_size = max_size

def maybe_reset(self):

@@ -5,6 +5,7 @@ from r2.lib.db.operators import asc, desc, timeago
from r2.lib.db import query_queue
from r2.lib.db.sorts import epoch_seconds
from r2.lib.utils import fetch_things2, worker
from r2.lib.solrsearch import DomainSearchQuery

from datetime import datetime

@@ -23,6 +24,12 @@ def db_sort(sort):
cls, col = db_sorts[sort]
return cls(col)

search_sort = dict(hot = 'hot desc',
new = 'date desc',
top = 'points desc',
controversial = 'controversy desc',
old = 'date asc')

db_times = dict(all = None,
hour = Thing.c._date >= timeago('1 hour'),
day = Thing.c._date >= timeago('1 day'),
@@ -176,6 +183,9 @@ def get_links(sr, sort, time):
q._filter(db_times[time])
return make_results(q)

def get_domain_links(domain, sort, time):
return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)

def user_query(kind, user, sort, time):
"""General profile-page query."""
q = kind._query(kind.c.author_id == user._id,

@@ -32,7 +32,7 @@ from r2.models import *
from r2.lib.contrib import pysolr
from r2.lib.contrib.pysolr import SolrError
from r2.lib.utils import timeago, set_emptying_cache, IteratorChunker
from r2.lib.utils import psave, pload, unicode_safe
from r2.lib.utils import psave, pload, unicode_safe, tup
from r2.lib.cache import SelfEmptyingCache
from Queue import Queue
from threading import Thread
@@ -125,6 +125,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
Field('lang'),
Field('ups', '_ups', is_number=True, reverse=True),
Field('downs', '_downs', is_number=True, reverse=True),
Field('spam','_spam'),
Field('deleted','_deleted'),
Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
Field('controversy', '_controversy', is_number=True, reverse=True),
Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
@@ -162,8 +164,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
# yes, it's a copy of 'hot'
is_number=True, reverse=True),
ThingField('author',Account,'author_id','name'),
#ThingField('subreddit',Subreddit,'sr_id','name'),
ThingField('reddit',Subreddit,'sr_id','name'))}
ThingField('subreddit',Subreddit,'sr_id','name'))}
#ThingField('reddit',Subreddit,'sr_id','name'))}

def tokenize_things(things,return_dict=False):
"""
@@ -276,6 +278,8 @@ def fetch_batches(t_class,size,since,until):
of `fetch_things`
"""
q=t_class._query(t_class.c._date >= since,
t_class.c._spam == (True,False),
t_class.c._deleted == (True,False),
t_class.c._date < until,
sort = desc('_date'),
limit = size,
@@ -375,8 +379,8 @@ def reindex_all(types = None, delete_all_first=False):
for batch in fetch_batches(cls,1000,
timeago("50 years"),
start_t):
r = tokenize_things([x for x in batch
if not x._spam and not x._deleted ])
r = tokenize_things([ x for x in batch
if not x._spam and not x._deleted ])

count += len(r)
print ("Processing %s #%d(%s): %s"
@@ -465,173 +469,241 @@ def combine_searchterms(terms):
def swap_strings(s,this,that):
"""
Just swaps substrings, like:
s = "sort(asc)"
swap_strings(s,'asc','desc')
s -> "sort desc"
s = "hot asc"
s = swap_strings(s,'asc','desc')
s == "hot desc"

uses 'tmp' as a replacment string, so don't use for anything
very complicated
"""
return s.replace(this,'tmp').replace(that,this).replace('tmp',that)

def search_things(q, sort = 'hot desc',
after = None,
subreddits = None,
authors = None,
num = 100, reverse = False,
timerange = None, langs = None,
types = None,
boost = []):
"""
Takes a given query and returns a list of Things that match
that query. See Builder for the use of `after`, `reverse`, and
`num`. Queries on params are OR queries, except `timerange`
and `types`
"""
if not q or not g.solr_url:
return pysolr.Results([],0)
class SearchQuery(object):
def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
types = [], timerange = None, spam = False, deleted = False):

# there are two parts to our query: what the user typed (parsed
# with Solr's DisMax parser), and what we are adding to it. The
# latter is called the "boost" (and is parsed using full Lucene
# syntax), and it can be added to via the `boost` parameter (which
# we have to copy since we append to it)
boost = list(boost)
self.q = q
self.fields = fields
self.sort = sort
self.subreddits = subreddits
self.authors = authors
self.types = types
self.spam = spam
self.deleted = deleted

# `score` refers to Solr's score (relevency to the search given),
# not our score (sums of ups and downs).
sort = "score desc, %s, date desc, fullname asc" % (sort,)
if reverse:
sort = swap_strings(sort,'asc','desc')

if timerange:
def time_to_searchstr(t):
if isinstance(t, datetime):
t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(t, date):
t = t.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(t,str):
t = t
return t

(fromtime, totime) = timerange
fromtime = time_to_searchstr(fromtime)
totime = time_to_searchstr(totime)
boost.append("+date:[%s TO %s]"
% (fromtime,totime))

if subreddits:
def subreddit_to_searchstr(sr):
if isinstance(sr,Subreddit):
return ('sr_id','%d' % sr.id)
elif isinstance(sr,str) or isinstance(sr,unicode):
return ('reddit',sr)
else:
return ('sr_id','%d' % sr)

if isinstance(subreddits,list) or isinstance(subreddits,tuple):
s_subreddits = map(subreddit_to_searchstr, subreddits)
if timerange in ['hour','week','day','month','year']:
self.timerange = (timeago("1 %s" % timerange),"NOW")
elif timerange == 'all' or timerange is None:
self.timerange = None
else:
s_subreddits = (subreddit_to_searchstr(subreddits),)
self.timerange = timerange

boost.append("+(%s)^2" % combine_searchterms(s_subreddits))
def run(self, after = None, num = 100, reverse = False):
if not self.q or not g.solr_url:
return pysolr.Results([],0)

if authors:
def author_to_searchstr(a):
if isinstance(a,Account):
return ('author_id','%d' % a.id)
elif isinstance(a,str) or isinstance(a,unicode):
return ('author',a)
# there are two parts to our query: what the user typed
# (parsed with Solr's DisMax parser), and what we are adding
# to it. The latter is called the "boost" (and is parsed using
# full Lucene syntax), and it can be added to via the `boost`
# parameter
boost = []

if not self.spam:
boost.append("-spam:true")
if not self.deleted:
boost.append("-deleted:true")

if self.timerange:
def time_to_searchstr(t):
if isinstance(t, datetime):
t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(t, date):
t = t.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(t,str):
t = t
return t

(fromtime, totime) = self.timerange
fromtime = time_to_searchstr(fromtime)
totime = time_to_searchstr(totime)
boost.append("+date:[%s TO %s]"
% (fromtime,totime))

if self.subreddits:
def subreddit_to_searchstr(sr):
if isinstance(sr,Subreddit):
return ('sr_id','%d' % sr.id)
elif isinstance(sr,str) or isinstance(sr,unicode):
return ('subreddit',sr)
else:
return ('sr_id','%d' % sr)

s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))

boost.append("+(%s)" % combine_searchterms(s_subreddits))

if self.authors:
def author_to_searchstr(a):
if isinstance(a,Account):
return ('author_id','%d' % a.id)
elif isinstance(a,str) or isinstance(a,unicode):
return ('author',a)
else:
return ('author_id','%d' % a)

s_authors = map(author_to_searchstr,tup(self.authors))

boost.append('+(%s)^2' % combine_searchterms(s_authors))


def type_to_searchstr(t):
if isinstance(t,str):
return ('type',t)
else:
return ('author_id','%d' % a)

if isinstance(authors,list) or isinstance(authors,tuple):
s_authors = map(author_to_searchstr,authors)
else:
s_authors = map(author_to_searchstr,(authors,))

boost.append('+(%s)^2' % combine_searchterms(s_authors))

# the set of languages is used to determine the fields to search,
# named ('contents_%s' % lang), but 'contents' (which is split
# only on whitespace) is always also searched. This means that
# all_langs and schema.xml must be kept in synch
default_fields = ['contents^1.5','contents_ws^3',
'site^1','author^1', 'reddit^1', 'url^1']
if langs == None:
# only search 'contents'
fields = default_fields
else:
if langs == 'all':
langs = searchable_langs
fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+ default_fields)

if not types:
types = indexed_types

def type_to_searchstr(t):
if isinstance(t,str):
return ('type',t)
else:
return ('type',t.__name__.lower())
return ('type',t.__name__.lower())

s_types = map(type_to_searchstr,types)
boost.append("+%s" % combine_searchterms(s_types))
s_types = map(type_to_searchstr,self.types)
boost.append("+%s" % combine_searchterms(s_types))

# everything else that solr needs to know
solr_params = dict(fl = 'fullname', # the field(s) to return
qt = 'dismax', # the query-handler (dismax supports 'bq' and 'qf')
# qb = '3',
bq = ' '.join(boost),
qf = ' '.join(fields),
mm = '75%') # minimum number of clauses that should match
q,solr_params = self.solr_params(self.q,boost)

with SolrConnection() as s:
if after:
# size of the pre-search to run in the case that we need
# to search more than once. A bigger one can reduce the
# number of searches that need to be run twice, but if
# it's bigger than the default display size, it could
# waste some
PRESEARCH_SIZE = num
try:
search = self.run_search(q, self.sort, solr_params,
reverse, after, num)
return search

# run a search and get back the number of hits, so that we
# can re-run the search with that max_count.
pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
except SolrError,e:
g.log.error(str(e))
return pysolr.Results([],0)

@classmethod
def run_search(cls, q, sort, solr_params, reverse, after, num):
"returns pysolr.Results(docs=[fullname()],hits=int())"

if reverse:
sort = swap_strings(sort,'asc','desc')

g.log.debug("Searching q=%s" % q)

with SolrConnection() as s:
if after:
# size of the pre-search to run in the case that we
# need to search more than once. A bigger one can
# reduce the number of searches that need to be run
# twice, but if it's bigger than the default display
# size, it could waste some
PRESEARCH_SIZE = num

# run a search and get back the number of hits, so
# that we can re-run the search with that max_count.
pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
other_params = solr_params)

if (PRESEARCH_SIZE >= pre_search.hits
or pre_search.hits == len(pre_search.docs)):
# don't run a second search if our pre-search
# found all of the elements anyway
search = pre_search
else:
# we have to run a second search, but we can limit
# the duplicated transfer of the first few records
# since we already have those from the pre_search
second_search = s.search(q,sort,
start=len(pre_search.docs),
rows=pre_search.hits - len(pre_search.docs),
other_params = solr_params)
search = pysolr.Results(pre_search.docs + second_search.docs,
pre_search.hits)

search.docs = [ i['fullname'] for i in search.docs ]
search.docs = get_after(search.docs, after._fullname, num)
else:
search = s.search(q,sort,rows=num,
other_params = solr_params)
search.docs = [ i['fullname'] for i in search.docs ]

if (PRESEARCH_SIZE >= pre_search.hits
or pre_search.hits == len(pre_search.docs)):
# don't run a second search if our pre-search found
# all of the elements anyway
search = pre_search
else:
# we have to run a second search, but we can limit the
# duplicated transfer of the first few records since
# we already have those from the pre_search
second_search = s.search(q,sort,
start=len(pre_search.docs),
rows=pre_search.hits - len(pre_search.docs),
other_params = solr_params)
search = pysolr.Results(pre_search.docs + second_search.docs,
pre_search.hits)
return search

fullname = after._fullname
for i, item in enumerate(search.docs):
if item['fullname'] == fullname:
search.docs = search.docs[i+1:i+1+num]
break
else:
g.log.debug("I got an after query, but the fullname was not present in the results")
search.docs = search.docs[0:num]
def solr_params(self,*k,**kw):
raise NotImplementedError

class UserSearchQuery(SearchQuery):
"Base class for queries that use the dismax parser; requires self.mm"
def __init__(self, q, sort=None, fields=[], langs=None, **kw):
default_fields = ['contents^1.5','contents_ws^3'] + fields

if sort is None:
sort = 'score desc, hot desc, date desc'

if langs is None:
fields = default_fields
else:
search = s.search(q,sort,rows=num,
other_params = solr_params)
if langs == 'all':
langs = searchable_langs
fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+ default_fields)

hits = search.hits
things = Thing._by_fullname([i['fullname'] for i in search.docs],
data = True, return_dict = False)
# default minimum match
self.mm = '75%'

return pysolr.Results(things,hits)
SearchQuery.__init__(self, q, sort, fields = fields, **kw)

def solr_params(self, q, boost):
return q, dict(fl = 'fullname',
qt = 'dismax',
bq = ' '.join(boost),
qf = ' '.join(self.fields),
mm = self.mm)

class LinkSearchQuery(UserSearchQuery):
def __init__(self, q, **kw):
additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1']

subreddits = None
authors = None
if c.site == subreddit.Default:
subreddits = Subreddit.user_subreddits(c.user)
elif c.site == subreddit.Friends and c.user.friends:
authors = c.user.friends
elif not isinstance(c.site,subreddit.FakeSubreddit):
subreddits = [c.site._id]

UserSearchQuery.__init__(self, q, fields = additional_fields,
subreddits = subreddits, authors = authors,
types=[Link], **kw)

class RelatedSearchQuery(LinkSearchQuery):
def __init__(self, q, ignore = [], **kw):
self.ignore = set(ignore) if ignore else set()

LinkSearchQuery.__init__(self, q, sort = 'score desc', **kw)

self.mm = '25%'

def run(self, *k, **kw):
search = LinkSearchQuery.run(self, *k, **kw)
search.docs = [ x for x in search.docs if x not in self.ignore ]
return search

class SubredditSearchQuery(UserSearchQuery):
def __init__(self, q, **kw):
UserSearchQuery.__init__(self, q, types=[Subreddit], **kw)

class DomainSearchQuery(SearchQuery):
def __init__(self, domain, **kw):
q = '+site:%s' % domain

SearchQuery.__init__(self, q=q, fields=['site'],types=[Link], **kw)

def solr_params(self, q, boost):
q = q + ' ' + ' '.join(boost)
return q, dict(fl='fullname',
qt='standard')

def get_after(fullnames, fullname, num):
for i, item in enumerate(fullnames):
if item == fullname:
return fullnames[i+1:i+num+1]
else:
return fullnames[:num]
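
get_after() is the pagination helper used by run_search() above; a tiny illustration with hypothetical fullnames:

# get_after() slices out the `num` fullnames following a given one, or the
# first `num` if it is not present (the values below are hypothetical).
names = ['t3_a', 't3_b', 't3_c', 't3_d', 't3_e']
assert get_after(names, 't3_b', 2) == ['t3_c', 't3_d']
assert get_after(names, 't3_zz', 2) == ['t3_a', 't3_b']
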
@@ -999,6 +999,7 @@ def title_to_url(title, max_length = 50):
return title

def debug_print(fn):
from pylons import g
def new_fn(*k,**kw):
ret = fn(*k,**kw)
g.log.debug("Fn: %s; k=%s; kw=%s\nRet: %s"

@@ -354,42 +354,26 @@ class IDBuilder(QueryBuilder):
return done, new_items

class SearchBuilder(QueryBuilder):
def __init__(self, query, wrap = Wrapped, sort = None, ignore = [],
time = time, types = None, langs = None, **kw):
QueryBuilder.__init__(self, query, wrap=wrap, **kw)
self.sort = sort
self.time = time
self.types = types
self.timing = 0
self.total_num = 0
self.langs = langs

self.ignore = set(x for x in (ignore if ignore else []))

def init_query(self):
subreddits = None
authors = None
if c.site == subreddit.Default:
subreddits = Subreddit.user_subreddits(c.user)
elif c.site == subreddit.Friends and c.user.friends:
authors = c.user.friends
elif not isinstance(c.site,subreddit.FakeSubreddit):
subreddits = c.site._id

self.subreddits = subreddits
self.authors = authors

self.skip = True
self.total_num = 0
self.start_time = time.time()

self.start_time = time.time()

def keep_item(self,item):
skip_if = item._spam or item._deleted or item._fullname in self.ignore
return not skip_if
# doesn't use the default keep_item because we want to keep
# things that were voted on, even if they've chosen to hide
# them in normal listings
if item._spam or item._deleted:
return False
else:
return True


def fetch_more(self, last_item, num_have):
from r2.lib import solrsearch

start_t = time.time()

done = False
limit = None
if self.num:
@@ -401,25 +385,13 @@ class SearchBuilder(QueryBuilder):
else:
done = True

langs = c.content_langs
if self.langs:
langs += self.langs
search = self.query.run(after = last_item or self.after,
reverse = self.reverse,
num = limit)

if self.time in ['hour','week','day','month']:
timerange = (timeago("1 %s" % self.time),"NOW")
else:
timerange = None
new_items = Thing._by_fullname(search.docs, data = True, return_dict=False)

new_items = solrsearch.search_things(q = self.query or '', sort = self.sort,
after = last_item,
subreddits = self.subreddits,
authors = self.authors,
num = limit, reverse = self.reverse,
timerange = timerange, langs = langs,
types = self.types)

self.total_num = new_items.hits
self.timing = time.time() - start_t
self.total_num = search.hits

return done, new_items

@@ -414,9 +414,12 @@ class Subreddit(Thing, Printable):

class FakeSubreddit(Subreddit):
over_18 = False
title = ''
_nodb = True

def __init__(self):
Subreddit.__init__(self)
self.title = ''

def is_moderator(self, user):
return c.user_is_loggedin and c.user_is_admin

@@ -568,6 +571,21 @@ class SubSR(FakeSubreddit):
@property
def path(self):
return "/reddits/"

class DomainSR(FakeSubreddit):
@property
def path(self):
return '/domain/' + self.domain

def __init__(self, domain):
FakeSubreddit.__init__(self)
self.domain = domain
self.name = domain
self.title = domain + ' ' + _('on reddit.com')

def get_links(self, sort, time):
from r2.lib.db import queries
return queries.get_domain_links(self.domain, sort, time)

Sub = SubSR()
Friends = FriendsSR()

@@ -222,7 +222,7 @@ input[type=checkbox], input[type=radio] { margin-top: .4em; }
padding: 2px 6px 1px 6px;
background-color: white;
border: 1px solid #5f99cf;
border-bottom: none;
border-bottom: 1px solid white;
}

#search {
@@ -588,6 +588,7 @@ before enabling */
padding: 5px 10px;
margin: 5px 310px 5px 0px;
border: 1px solid orange;
font-size: small;
}

.menuarea {
@@ -958,8 +959,7 @@ a.star { text-decoration: none; color: #ff8b60 }
.searchpane a { color: #369 }*/

.searchpane {
margin: 5px;
margin-right: 310px;
margin: 5px 310px 5px 0px;
}

.searchpane #search input[type=text] { }

@@ -45,7 +45,7 @@
var sr = {};

var logged = ${c.user_is_loggedin and ("'%s'" % c.user.name) or "false"};
var post_site = "${c.site.name}";
var post_site = "${c.site.name if not c.default_sr else ''}";
var cnameframe = ${'true' if c.cname else 'false'};
var modhash = ${"'%s'" % c.modhash or "false"};
var cur_domain = "${get_domain(cname = True, subreddit = False) if c.frameless_cname else g.domain}";