Tools for webmasters to monitor their content on reddit (uses Solr
for pulling domain information).

Also includes a refactor of solrsearch.py and its usage, which
should fix bug #179 as a side-effect.
This commit is contained in:
ketralnis
2008-09-30 10:10:05 -07:00
parent 870364b152
commit 90278abea3
14 changed files with 373 additions and 278 deletions
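
For a sense of the webmaster-facing flow, here is a minimal sketch of how the new /domain/ URLs are recognized and what gets sent to Solr, based on the DomainListingMiddleware regex and the DomainSearchQuery query string that appear in the diffs below. Illustrative only: route() and the example.com domain are made up for the sketch, and nothing here imports the r2 tree.

    # Illustrative sketch, not r2 code: mirrors DomainListingMiddleware's
    # path regex and the '+site:<domain>' query that DomainSearchQuery builds.
    import re

    domain_pattern = re.compile(r'^/domain/(([\w]+\.)+[\w]+)')

    def route(path):
        match = domain_pattern.match(path)
        if not match:
            return None
        domain = match.groups()[0]
        return {'domain': domain,
                'remaining_path': domain_pattern.sub('', path) or '/',
                'solr_q': '+site:%s' % domain}   # what DomainSearchQuery sends

    hit = route('/domain/example.com/.rss')
    assert hit == {'domain': 'example.com',
                   'remaining_path': '/.rss',
                   'solr_q': '+site:example.com'}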

View File

@@ -385,6 +385,8 @@ CondeNet, Inc. All Rights Reserved.
<field name="hot" type="hotness" indexed="true" stored="true" required="true" reversed="true" />
<field name="controversy" type="sfloat" indexed="true" stored="true" required="true" reversed="true" />
<field name="points" type="sint" indexed="true" stored="true" required="true" reversed="true" />
<field name="spam" type="boolean" indexed="true" stored="true" required="false" />
<field name="deleted" type="boolean" indexed="true" stored="true" required="false" />
<!-- subreddit,link,comment -->
<field name="author_id" type="integer" indexed="true" stored="false" required="false" />
<field name="author" type="string" indexed="true" stored="false" required="false" />

View File

@@ -34,6 +34,7 @@ from pylons.wsgiapp import PylonsApp, PylonsBaseWSGIApp
from r2.config.environment import load_environment
from r2.config.rewrites import rewrites
from r2.lib.utils import rstrips
from r2.lib.jsontemplates import api_type
#middleware stuff
from r2.lib.html_source import HTMLValidationParser
@@ -240,7 +241,7 @@ class DomainMiddleware(object):
class SubredditMiddleware(object):
sr_pattern = re.compile(r'^/r/([^/]+)')
sr_pattern = re.compile(r'^/r/([^/]{3,20})')
def __init__(self, app):
self.app = app
@@ -255,18 +256,50 @@ class SubredditMiddleware(object):
environ['subreddit'] = 'r'
return self.app(environ, start_response)
class DomainListingMiddleware(object):
domain_pattern = re.compile(r'^/domain/(([\w]+\.)+[\w]+)')
def __init__(self, app):
self.app = app
def __call__(self, environ, start_response):
if not environ.has_key('subreddit'):
path = environ['PATH_INFO']
domain = self.domain_pattern.match(path)
if domain:
environ['domain'] = domain.groups()[0]
environ['PATH_INFO'] = self.domain_pattern.sub('', path) or '/'
return self.app(environ, start_response)
class ExtensionMiddleware(object):
ext_pattern = re.compile(r'\.([^/]+)$')
extensions = {'rss' : ('xml', 'text/xml; charset=UTF-8'),
'xml' : ('xml', 'text/xml; charset=UTF-8'),
'js' : ('js', 'text/javascript; charset=UTF-8'),
'wired' : ('wired', 'text/javascript; charset=UTF-8'),
'embed' : ('htmllite', 'text/javascript; charset=UTF-8'),
'mobile' : ('mobile', 'text/html'),
'png' : ('png', 'image/png'),
'css' : ('css', 'text/css'),
'api' : (api_type(), 'application/json; charset=UTF-8'),
'json' : (api_type(), 'application/json; charset=UTF-8'),
'json-html' : (api_type('html'), 'application/json; charset=UTF-8')}
def __init__(self, app):
self.app = app
def __call__(self, environ, start_response):
path = environ['PATH_INFO']
ext = self.ext_pattern.findall(path)
if ext:
environ['extension'] = ext[0]
environ['PATH_INFO'] = self.ext_pattern.sub('', path) or '/'
domain_ext = environ.get('reddit-domain-extension')
for ext, val in self.extensions.iteritems():
if ext == domain_ext or path.endswith(ext):
environ['extension'] = ext
environ['render_style'] = val[0]
environ['content_type'] = val[1]
#strip off the extension
environ['PATH_INFO'] = path[:-(len(ext) + 1)]
break
return self.app(environ, start_response)
class RewriteMiddleware(object):
@@ -382,11 +415,11 @@ def make_app(global_conf, full_stack=True, **app_conf):
app = ProfilingMiddleware(app)
app = SourceViewMiddleware(app)
app = SubredditMiddleware(app)
app = DomainMiddleware(app)
app = DomainListingMiddleware(app)
app = SubredditMiddleware(app)
app = ExtensionMiddleware(app)
log_path = global_conf.get('log_path')
if log_path:
process_iden = global_conf.get('scgi_port', 'default')
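
The reworked chain annotates the WSGI environ up front instead of letting the controller re-derive render style and content type later. A standalone sketch (not the r2 classes) of what ExtensionMiddleware leaves behind for a .json request; only two extensions are shown, and the 'api' string is a stand-in for whatever api_type() returns.

    # Standalone sketch of the ExtensionMiddleware behaviour above; 'api'
    # stands in for api_type(), and the endswith() check mirrors the diff.
    extensions = {'json': ('api', 'application/json; charset=UTF-8'),
                  'rss':  ('xml', 'text/xml; charset=UTF-8')}

    def annotate(environ):
        path = environ['PATH_INFO']
        for ext, (render_style, content_type) in extensions.items():
            if path.endswith(ext):
                environ['extension'] = ext
                environ['render_style'] = render_style
                environ['content_type'] = content_type
                environ['PATH_INFO'] = path[:-(len(ext) + 1)]  # strip the extension
                break
        return environ

    env = annotate({'PATH_INFO': '/.json'})
    assert env['render_style'] == 'api' and env['PATH_INFO'] == '/'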

View File

@@ -32,12 +32,14 @@ from r2.lib.template_helpers import get_domain
from r2.lib.emailer import has_opted_out, Email
from r2.lib.db.operators import desc
from r2.lib.strings import strings
from r2.lib.solrsearch import RelatedSearchQuery, SubredditSearchQuery, LinkSearchQuery
import r2.lib.db.thing as thing
from listingcontroller import ListingController
from pylons import c, request
import random as rand
import re
import time as time_module
from urllib import quote_plus
from admin import admin_profile_query
@@ -292,6 +294,7 @@ class FrontController(RedditController):
def GET_related(self, num, article, after, reverse, count):
"""Related page: performs a search using title of article as
the search query."""
title = c.site.name + ((': ' + article.title) if hasattr(article, 'title') else '')
query = self.related_replace_regex.sub(self.related_replace_with,
@@ -301,24 +304,25 @@ class FrontController(RedditController):
# longer than this are typically ascii art anyway
query = query[0:1023]
num, t, pane = self._search(query, time = 'all',
count = count,
after = after, reverse = reverse, num = num,
ignore = [article._fullname],
types = [Link])
res = LinkInfoPage(link = article, content = pane).render()
return res
q = RelatedSearchQuery(query, ignore = [article._fullname])
num, t, pane = self._search(q,
num = num, after = after, reverse = reverse,
count = count)
return LinkInfoPage(link = article, content = pane).render()
@base_listing
@validate(query = nop('q'))
def GET_search_reddits(self, query, reverse, after, count, num):
"""Search reddits by title and description."""
num, t, spane = self._search(query, num = num, types = [Subreddit],
sort='points desc', time='all',
after = after, reverse = reverse,
# note that 'downs' is a measure of activity on subreddits
q = SubredditSearchQuery(query, sort = 'downs desc',
timerange = 'all')
num, t, spane = self._search(q, num = num, reverse = reverse, after = after,
count = count)
res = SubredditsPage(content=spane,
res = SubredditsPage(content=spane,
prev_search = query,
elapsed_time = t,
num_results = num,
@@ -327,7 +331,7 @@ class FrontController(RedditController):
verify_langs_regex = re.compile(r"^[a-z][a-z](,[a-z][a-z])*$")
@base_listing
@validate(query=nop('q'),
@validate(query = nop('q'),
time = VMenu('action', TimeMenu, remember = False),
langs = nop('langs'))
def GET_search(self, query, num, time, reverse, after, count, langs):
@@ -340,12 +344,12 @@ class FrontController(RedditController):
if langs and self.verify_langs_regex.match(langs):
langs = langs.split(',')
else:
langs = None
langs = c.content_langs
num, t, spane = self._search(query, time=time,
num = num, after = after,
reverse = reverse,
count = count, types = [Link])
q = LinkSearchQuery(q = query, timerange = time, langs = langs)
num, t, spane = self._search(q, num = num, after = after, reverse = reverse,
count = count)
if not isinstance(c.site,FakeSubreddit):
my_reddits_link = "/search%s" % query_string({'q': query})
@@ -365,26 +369,22 @@ class FrontController(RedditController):
return res
def _search(self, query = '', time=None,
sort = 'hot desc',
after = None, reverse = False, num = 25,
ignore = None, count=0, types = None,
langs = None):
def _search(self, query_obj, num, after, reverse, count=0):
"""Helper function for interfacing with search. Basically a
thin wrapper for SearchBuilder."""
builder = SearchBuilder(query, num = num,
sort = sort,
after = after, reverse = reverse,
count = count, types = types,
time = time, ignore = ignore,
langs = langs,
builder = SearchBuilder(query_obj,
after = after, num = num, reverse = reverse,
count = count,
wrap = ListingController.builder_wrapper)
listing = LinkListing(builder, show_nums=True)
# have to do it in two steps since total_num and timing are only
# computed after fetch_more
res = listing.listing()
return builder.total_num, builder.timing, res
timing = time_module.time() - builder.start_time
return builder.total_num, timing, res
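
The controller-side refactor is mostly about who owns the Solr knobs: sort, timerange, langs, and types now live on the query objects, and _search is reduced to paging and timing. A toy restatement of that split, using a hypothetical ToyQuery class rather than the real SearchQuery hierarchy, just to show the shape of the new contract:

    # Toy restatement of the new contract: the query object knows *what* to
    # search; _search only pages it and reports totals and timing.
    import time

    class ToyQuery(object):
        def __init__(self, q, timerange='all'):
            self.q = q
            self.timerange = timerange

        def run(self, after=None, num=25, reverse=False):
            # the real query objects hit Solr here; we return canned fullnames
            return ['t3_1', 't3_2', 't3_3'][:num]

    def _search(query_obj, num, after, reverse, count=0):
        start = time.time()
        docs = query_obj.run(after=after, num=num, reverse=reverse)
        return len(docs), time.time() - start, docs  # total_num, timing, listing

    total, elapsed, docs = _search(ToyQuery('pics'), num=2, after=None, reverse=False)
    assert docs == ['t3_1', 't3_2']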

View File

@@ -33,6 +33,7 @@ from r2.lib.db.thing import Query, Merge, Relations
from r2.lib.db import queries
from r2.lib.strings import Score
from r2.lib import organic
from r2.lib.solrsearch import SearchQuery
from r2.lib.utils import iters, check_cheating
from admin import admin_profile_query
@@ -112,6 +113,8 @@ class ListingController(RedditController):
builder_cls = self.builder_cls
elif isinstance(self.query_obj, Query):
builder_cls = QueryBuilder
elif isinstance(self.query_obj, SearchQuery):
builder_cls = SearchBuilder
elif isinstance(self.query_obj, iters):
builder_cls = IDBuilder
elif isinstance(self.query_obj, queries.CachedResults):
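
With SearchQuery instances now flowing through ListingController, the builder choice above is a plain isinstance dispatch. A stripped-down sketch with stand-in classes (not the r2 ones, and omitting the Query and CachedResults branches):

    # Stand-in classes only; shows the dispatch pattern used above.
    class SearchQuery(object): pass
    class SearchBuilder(object): pass
    class IDBuilder(object): pass

    def pick_builder(query_obj):
        if isinstance(query_obj, SearchQuery):
            return SearchBuilder
        elif isinstance(query_obj, (list, tuple)):
            return IDBuilder
        raise TypeError("no builder for %r" % (query_obj,))

    assert pick_builder(SearchQuery()) is SearchBuilder
    assert pick_builder(['t3_1', 't3_2']) is IDBuilder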

View File

@@ -212,13 +212,18 @@ def over18():
return True
def set_subreddit():
sr_name=request.environ.get("subreddit", request.params.get('r'))
#the r parameter gets added by javascript for POST requests so we
#can reference c.site in api.py
sr_name = request.environ.get("subreddit", request.POST.get('r'))
domain = request.environ.get("domain")
if not sr_name or sr_name == Default.name:
if not sr_name:
#check for cnames
sub_domain = request.environ.get('sub_domain')
sr = Subreddit._by_domain(sub_domain) if sub_domain else None
c.site = sr or Default
elif sr_name == 'r':
#reddits
c.site = Sub
else:
try:
@@ -227,6 +232,10 @@ def set_subreddit():
c.site = Default
redirect_to("/reddits/create?name=%s" % sr_name)
#if we didn't find a subreddit, check for a domain listing
if not sr_name and c.site == Default and domain:
c.site = DomainSR(domain)
if isinstance(c.site, FakeSubreddit):
c.default_sr = True
@@ -235,42 +244,16 @@ def set_subreddit():
abort(404, "not found")
def set_content_type():
c.extension = request.environ.get('extension') or \
request.environ.get('reddit-domain-extension') or ''
c.render_style = 'html'
if c.extension in ('rss', 'xml'):
c.render_style = 'xml'
c.response_content_type = 'text/xml; charset=UTF-8'
elif c.extension == 'js':
c.render_style = 'js'
c.response_content_type = 'text/javascript; charset=UTF-8'
elif c.extension.startswith('json') or c.extension == "api":
c.response_content_type = 'application/json; charset=UTF-8'
c.response_access_control = 'allow <*>'
if c.extension == 'json-html':
c.render_style = api_type('html')
else:
c.render_style = api_type()
elif c.extension == 'wired':
c.render_style = 'wired'
c.response_content_type = 'text/javascript; charset=UTF-8'
c.response_wrappers.append(utils.to_js)
elif c.extension == 'embed':
c.render_style = 'htmllite'
c.response_content_type = 'text/javascript; charset=UTF-8'
c.response_wrappers.append(utils.to_js)
elif c.extension == 'mobile':
c.render_style = 'mobile'
elif c.extension == 'png':
c.response_content_type = 'image/png'
c.render_style = 'png'
elif c.extension == 'css':
c.response_content_type = 'text/css'
c.render_style = 'css'
#Insert new extensions above this line
elif c.extension not in ('', 'html'):
# request.path already has the extension stripped off of it
redirect_to(request.path + utils.query_string(request.get))
e = request.environ
if e.has_key('extension'):
c.render_style = e['render_style']
c.response_content_type = e['content_type']
ext = e['extension']
if ext == 'api' or ext.startswith('json'):
c.response_access_control = 'allow <*>'
if ext in ('embed', 'wired'):
c.response_wrappers.append(utils.to_js)
def get_browser_langs():
browser_langs = []

View File

@@ -118,7 +118,8 @@ class BaseController(WSGIController):
u.mk_cname(**kw)
# make sure the extensions agree with the current page
u.set_extension(c.extension)
if c.extension:
u.set_extension(c.extension)
# unparse and encode it in utf8
return _force_unicode(u.unparse()).encode('utf8')

View File

@@ -225,7 +225,7 @@ def test_cache(cache):
# a cache that occasionally dumps itself to be used for long-running
# processes
class SelfEmptyingCache(LocalCache):
def __init__(self,max_size=50*1000):
def __init__(self,max_size=100*1000):
self.max_size = max_size
def maybe_reset(self):

View File

@@ -5,6 +5,7 @@ from r2.lib.db.operators import asc, desc, timeago
from r2.lib.db import query_queue
from r2.lib.db.sorts import epoch_seconds
from r2.lib.utils import fetch_things2, worker
from r2.lib.solrsearch import DomainSearchQuery
from datetime import datetime
@@ -23,6 +24,12 @@ def db_sort(sort):
cls, col = db_sorts[sort]
return cls(col)
search_sort = dict(hot = 'hot desc',
new = 'date desc',
top = 'points desc',
controversial = 'controversy desc',
old = 'date asc')
db_times = dict(all = None,
hour = Thing.c._date >= timeago('1 hour'),
day = Thing.c._date >= timeago('1 day'),
@@ -176,6 +183,9 @@ def get_links(sr, sort, time):
q._filter(db_times[time])
return make_results(q)
def get_domain_links(domain, sort, time):
return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)
def user_query(kind, user, sort, time):
"""General profile-page query."""
q = kind._query(kind.c.author_id == user._id,
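
A domain listing is effectively a canned search: the listing sort names map onto Solr sort clauses and get_domain_links hands back a DomainSearchQuery for the builder to run. An illustrative restatement that returns a plain dict where the real code returns a DomainSearchQuery:

    # Illustrative only: same sort mapping as above, returning a dict in
    # place of a real DomainSearchQuery.
    search_sort = dict(hot='hot desc', new='date desc', top='points desc',
                       controversial='controversy desc', old='date asc')

    def get_domain_links_sketch(domain, sort, time):
        return {'q': '+site:%s' % domain,    # DomainSearchQuery's query string
                'sort': search_sort[sort],
                'timerange': time}

    assert get_domain_links_sketch('example.com', 'top', 'week')['sort'] == 'points desc'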

View File

@@ -32,7 +32,7 @@ from r2.models import *
from r2.lib.contrib import pysolr
from r2.lib.contrib.pysolr import SolrError
from r2.lib.utils import timeago, set_emptying_cache, IteratorChunker
from r2.lib.utils import psave, pload, unicode_safe
from r2.lib.utils import psave, pload, unicode_safe, tup
from r2.lib.cache import SelfEmptyingCache
from Queue import Queue
from threading import Thread
@@ -125,6 +125,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
Field('lang'),
Field('ups', '_ups', is_number=True, reverse=True),
Field('downs', '_downs', is_number=True, reverse=True),
Field('spam','_spam'),
Field('deleted','_deleted'),
Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
Field('controversy', '_controversy', is_number=True, reverse=True),
Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
@@ -162,8 +164,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
# yes, it's a copy of 'hot'
is_number=True, reverse=True),
ThingField('author',Account,'author_id','name'),
#ThingField('subreddit',Subreddit,'sr_id','name'),
ThingField('reddit',Subreddit,'sr_id','name'))}
ThingField('subreddit',Subreddit,'sr_id','name'))}
#ThingField('reddit',Subreddit,'sr_id','name'))}
def tokenize_things(things,return_dict=False):
"""
@@ -276,6 +278,8 @@ def fetch_batches(t_class,size,since,until):
of `fetch_things`
"""
q=t_class._query(t_class.c._date >= since,
t_class.c._spam == (True,False),
t_class.c._deleted == (True,False),
t_class.c._date < until,
sort = desc('_date'),
limit = size,
@@ -375,8 +379,8 @@ def reindex_all(types = None, delete_all_first=False):
for batch in fetch_batches(cls,1000,
timeago("50 years"),
start_t):
r = tokenize_things([x for x in batch
if not x._spam and not x._deleted ])
r = tokenize_things([ x for x in batch
if not x._spam and not x._deleted ])
count += len(r)
print ("Processing %s #%d(%s): %s"
@@ -465,173 +469,241 @@ def combine_searchterms(terms):
def swap_strings(s,this,that):
"""
Just swaps substrings, like:
s = "sort(asc)"
swap_strings(s,'asc','desc')
s -> "sort desc"
s = "hot asc"
s = swap_strings(s,'asc','desc')
s == "hot desc"
uses 'tmp' as a replacement string, so don't use for anything
very complicated
"""
return s.replace(this,'tmp').replace(that,this).replace('tmp',that)
def search_things(q, sort = 'hot desc',
after = None,
subreddits = None,
authors = None,
num = 100, reverse = False,
timerange = None, langs = None,
types = None,
boost = []):
"""
Takes a given query and returns a list of Things that match
that query. See Builder for the use of `after`, `reverse`, and
`num`. Queries on params are OR queries, except `timerange`
and `types`
"""
if not q or not g.solr_url:
return pysolr.Results([],0)
class SearchQuery(object):
def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
types = [], timerange = None, spam = False, deleted = False):
# there are two parts to our query: what the user typed (parsed
# with Solr's DisMax parser), and what we are adding to it. The
# latter is called the "boost" (and is parsed using full Lucene
# syntax), and it can be added to via the `boost` parameter (which
# we have to copy since we append to it)
boost = list(boost)
self.q = q
self.fields = fields
self.sort = sort
self.subreddits = subreddits
self.authors = authors
self.types = types
self.spam = spam
self.deleted = deleted
# `score` refers to Solr's score (relevance to the given search),
# not our score (sums of ups and downs).
sort = "score desc, %s, date desc, fullname asc" % (sort,)
if reverse:
sort = swap_strings(sort,'asc','desc')
if timerange:
def time_to_searchstr(t):
if isinstance(t, datetime):
t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(t, date):
t = t.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(t,str):
t = t
return t
(fromtime, totime) = timerange
fromtime = time_to_searchstr(fromtime)
totime = time_to_searchstr(totime)
boost.append("+date:[%s TO %s]"
% (fromtime,totime))
if subreddits:
def subreddit_to_searchstr(sr):
if isinstance(sr,Subreddit):
return ('sr_id','%d' % sr.id)
elif isinstance(sr,str) or isinstance(sr,unicode):
return ('reddit',sr)
else:
return ('sr_id','%d' % sr)
if isinstance(subreddits,list) or isinstance(subreddits,tuple):
s_subreddits = map(subreddit_to_searchstr, subreddits)
if timerange in ['hour','week','day','month','year']:
self.timerange = (timeago("1 %s" % timerange),"NOW")
elif timerange == 'all' or timerange is None:
self.timerange = None
else:
s_subreddits = (subreddit_to_searchstr(subreddits),)
self.timerange = timerange
boost.append("+(%s)^2" % combine_searchterms(s_subreddits))
def run(self, after = None, num = 100, reverse = False):
if not self.q or not g.solr_url:
return pysolr.Results([],0)
if authors:
def author_to_searchstr(a):
if isinstance(a,Account):
return ('author_id','%d' % a.id)
elif isinstance(a,str) or isinstance(a,unicode):
return ('author',a)
# there are two parts to our query: what the user typed
# (parsed with Solr's DisMax parser), and what we are adding
# to it. The latter is called the "boost" (and is parsed using
# full Lucene syntax), and it can be added to via the `boost`
# parameter
boost = []
if not self.spam:
boost.append("-spam:true")
if not self.deleted:
boost.append("-deleted:true")
if self.timerange:
def time_to_searchstr(t):
if isinstance(t, datetime):
t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(t, date):
t = t.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(t,str):
t = t
return t
(fromtime, totime) = self.timerange
fromtime = time_to_searchstr(fromtime)
totime = time_to_searchstr(totime)
boost.append("+date:[%s TO %s]"
% (fromtime,totime))
if self.subreddits:
def subreddit_to_searchstr(sr):
if isinstance(sr,Subreddit):
return ('sr_id','%d' % sr.id)
elif isinstance(sr,str) or isinstance(sr,unicode):
return ('subreddit',sr)
else:
return ('sr_id','%d' % sr)
s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))
boost.append("+(%s)" % combine_searchterms(s_subreddits))
if self.authors:
def author_to_searchstr(a):
if isinstance(a,Account):
return ('author_id','%d' % a.id)
elif isinstance(a,str) or isinstance(a,unicode):
return ('author',a)
else:
return ('author_id','%d' % a)
s_authors = map(author_to_searchstr,tup(self.authors))
boost.append('+(%s)^2' % combine_searchterms(s_authors))
def type_to_searchstr(t):
if isinstance(t,str):
return ('type',t)
else:
return ('author_id','%d' % a)
if isinstance(authors,list) or isinstance(authors,tuple):
s_authors = map(author_to_searchstr,authors)
else:
s_authors = map(author_to_searchstr,(authors,))
boost.append('+(%s)^2' % combine_searchterms(s_authors))
# the set of languages is used to determine the fields to search,
# named ('contents_%s' % lang), but 'contents' (which is split
# only on whitespace) is always also searched. This means that
# all_langs and schema.xml must be kept in synch
default_fields = ['contents^1.5','contents_ws^3',
'site^1','author^1', 'reddit^1', 'url^1']
if langs == None:
# only search 'contents'
fields = default_fields
else:
if langs == 'all':
langs = searchable_langs
fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+ default_fields)
if not types:
types = indexed_types
def type_to_searchstr(t):
if isinstance(t,str):
return ('type',t)
else:
return ('type',t.__name__.lower())
return ('type',t.__name__.lower())
s_types = map(type_to_searchstr,types)
boost.append("+%s" % combine_searchterms(s_types))
s_types = map(type_to_searchstr,self.types)
boost.append("+%s" % combine_searchterms(s_types))
# everything else that solr needs to know
solr_params = dict(fl = 'fullname', # the field(s) to return
qt = 'dismax', # the query-handler (dismax supports 'bq' and 'qf')
# qb = '3',
bq = ' '.join(boost),
qf = ' '.join(fields),
mm = '75%') # minimum number of clauses that should match
q,solr_params = self.solr_params(self.q,boost)
with SolrConnection() as s:
if after:
# size of the pre-search to run in the case that we need
# to search more than once. A bigger one can reduce the
# number of searches that need to be run twice, but if
# it's bigger than the default display size, it could
# waste some
PRESEARCH_SIZE = num
try:
search = self.run_search(q, self.sort, solr_params,
reverse, after, num)
return search
# run a search and get back the number of hits, so that we
# can re-run the search with that max_count.
pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
except SolrError,e:
g.log.error(str(e))
return pysolr.Results([],0)
@classmethod
def run_search(cls, q, sort, solr_params, reverse, after, num):
"returns pysolr.Results(docs=[fullname()],hits=int())"
if reverse:
sort = swap_strings(sort,'asc','desc')
g.log.debug("Searching q=%s" % q)
with SolrConnection() as s:
if after:
# size of the pre-search to run in the case that we
# need to search more than once. A bigger one can
# reduce the number of searches that need to be run
# twice, but if it's bigger than the default display
# size, it could waste some
PRESEARCH_SIZE = num
# run a search and get back the number of hits, so
# that we can re-run the search with that max_count.
pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
other_params = solr_params)
if (PRESEARCH_SIZE >= pre_search.hits
or pre_search.hits == len(pre_search.docs)):
# don't run a second search if our pre-search
# found all of the elements anyway
search = pre_search
else:
# we have to run a second search, but we can limit
# the duplicated transfer of the first few records
# since we already have those from the pre_search
second_search = s.search(q,sort,
start=len(pre_search.docs),
rows=pre_search.hits - len(pre_search.docs),
other_params = solr_params)
search = pysolr.Results(pre_search.docs + second_search.docs,
pre_search.hits)
search.docs = [ i['fullname'] for i in search.docs ]
search.docs = get_after(search.docs, after._fullname, num)
else:
search = s.search(q,sort,rows=num,
other_params = solr_params)
search.docs = [ i['fullname'] for i in search.docs ]
if (PRESEARCH_SIZE >= pre_search.hits
or pre_search.hits == len(pre_search.docs)):
# don't run a second search if our pre-search found
# all of the elements anyway
search = pre_search
else:
# we have to run a second search, but we can limit the
# duplicated transfer of the first few records since
# we already have those from the pre_search
second_search = s.search(q,sort,
start=len(pre_search.docs),
rows=pre_search.hits - len(pre_search.docs),
other_params = solr_params)
search = pysolr.Results(pre_search.docs + second_search.docs,
pre_search.hits)
return search
fullname = after._fullname
for i, item in enumerate(search.docs):
if item['fullname'] == fullname:
search.docs = search.docs[i+1:i+1+num]
break
else:
g.log.debug("I got an after query, but the fullname was not present in the results")
search.docs = search.docs[0:num]
def solr_params(self,*k,**kw):
raise NotImplementedError
class UserSearchQuery(SearchQuery):
"Base class for queries that use the dismax parser; requires self.mm"
def __init__(self, q, sort=None, fields=[], langs=None, **kw):
default_fields = ['contents^1.5','contents_ws^3'] + fields
if sort is None:
sort = 'score desc, hot desc, date desc'
if langs is None:
fields = default_fields
else:
search = s.search(q,sort,rows=num,
other_params = solr_params)
if langs == 'all':
langs = searchable_langs
fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+ default_fields)
hits = search.hits
things = Thing._by_fullname([i['fullname'] for i in search.docs],
data = True, return_dict = False)
# default minimum match
self.mm = '75%'
return pysolr.Results(things,hits)
SearchQuery.__init__(self, q, sort, fields = fields, **kw)
def solr_params(self, q, boost):
return q, dict(fl = 'fullname',
qt = 'dismax',
bq = ' '.join(boost),
qf = ' '.join(self.fields),
mm = self.mm)
class LinkSearchQuery(UserSearchQuery):
def __init__(self, q, **kw):
additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1']
subreddits = None
authors = None
if c.site == subreddit.Default:
subreddits = Subreddit.user_subreddits(c.user)
elif c.site == subreddit.Friends and c.user.friends:
authors = c.user.friends
elif not isinstance(c.site,subreddit.FakeSubreddit):
subreddits = [c.site._id]
UserSearchQuery.__init__(self, q, fields = additional_fields,
subreddits = subreddits, authors = authors,
types=[Link], **kw)
class RelatedSearchQuery(LinkSearchQuery):
def __init__(self, q, ignore = [], **kw):
self.ignore = set(ignore) if ignore else set()
LinkSearchQuery.__init__(self, q, sort = 'score desc', **kw)
self.mm = '25%'
def run(self, *k, **kw):
search = LinkSearchQuery.run(self, *k, **kw)
search.docs = [ x for x in search.docs if x not in self.ignore ]
return search
class SubredditSearchQuery(UserSearchQuery):
def __init__(self, q, **kw):
UserSearchQuery.__init__(self, q, types=[Subreddit], **kw)
class DomainSearchQuery(SearchQuery):
def __init__(self, domain, **kw):
q = '+site:%s' % domain
SearchQuery.__init__(self, q=q, fields=['site'],types=[Link], **kw)
def solr_params(self, q, boost):
q = q + ' ' + ' '.join(boost)
return q, dict(fl='fullname',
qt='standard')
def get_after(fullnames, fullname, num):
for i, item in enumerate(fullnames):
if item == fullname:
return fullnames[i+1:i+num+1]
else:
return fullnames[:num]
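
The paging behaviour is worth a worked example: run_search fetches fullnames, and get_after slices everything after the 'after' item, falling back to the head of the list when that item is missing (for instance if it has since been excluded by the -spam:true/-deleted:true boosts). Restated standalone with toy data:

    # Worked example of the get_after helper above (same logic, toy data).
    def get_after(fullnames, fullname, num):
        for i, item in enumerate(fullnames):
            if item == fullname:
                return fullnames[i + 1:i + num + 1]
        else:  # for/else: runs only if the 'after' fullname was never found
            return fullnames[:num]

    docs = ['t3_a', 't3_b', 't3_c', 't3_d']
    assert get_after(docs, 't3_b', 2) == ['t3_c', 't3_d']
    assert get_after(docs, 't3_zzz', 2) == ['t3_a', 't3_b']  # falls back to the head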

View File

@@ -999,6 +999,7 @@ def title_to_url(title, max_length = 50):
return title
def debug_print(fn):
from pylons import g
def new_fn(*k,**kw):
ret = fn(*k,**kw)
g.log.debug("Fn: %s; k=%s; kw=%s\nRet: %s"

View File

@@ -354,42 +354,26 @@ class IDBuilder(QueryBuilder):
return done, new_items
class SearchBuilder(QueryBuilder):
def __init__(self, query, wrap = Wrapped, sort = None, ignore = [],
time = time, types = None, langs = None, **kw):
QueryBuilder.__init__(self, query, wrap=wrap, **kw)
self.sort = sort
self.time = time
self.types = types
self.timing = 0
self.total_num = 0
self.langs = langs
self.ignore = set(x for x in (ignore if ignore else []))
def init_query(self):
subreddits = None
authors = None
if c.site == subreddit.Default:
subreddits = Subreddit.user_subreddits(c.user)
elif c.site == subreddit.Friends and c.user.friends:
authors = c.user.friends
elif not isinstance(c.site,subreddit.FakeSubreddit):
subreddits = c.site._id
self.subreddits = subreddits
self.authors = authors
self.skip = True
self.total_num = 0
self.start_time = time.time()
self.start_time = time.time()
def keep_item(self,item):
skip_if = item._spam or item._deleted or item._fullname in self.ignore
return not skip_if
# doesn't use the default keep_item because we want to keep
# things that were voted on, even if they've chosen to hide
# them in normal listings
if item._spam or item._deleted:
return False
else:
return True
def fetch_more(self, last_item, num_have):
from r2.lib import solrsearch
start_t = time.time()
done = False
limit = None
if self.num:
@@ -401,25 +385,13 @@ class SearchBuilder(QueryBuilder):
else:
done = True
langs = c.content_langs
if self.langs:
langs += self.langs
search = self.query.run(after = last_item or self.after,
reverse = self.reverse,
num = limit)
if self.time in ['hour','week','day','month']:
timerange = (timeago("1 %s" % self.time),"NOW")
else:
timerange = None
new_items = Thing._by_fullname(search.docs, data = True, return_dict=False)
new_items = solrsearch.search_things(q = self.query or '', sort = self.sort,
after = last_item,
subreddits = self.subreddits,
authors = self.authors,
num = limit, reverse = self.reverse,
timerange = timerange, langs = langs,
types = self.types)
self.total_num = new_items.hits
self.timing = time.time() - start_t
self.total_num = search.hits
return done, new_items

View File

@@ -414,9 +414,12 @@ class Subreddit(Thing, Printable):
class FakeSubreddit(Subreddit):
over_18 = False
title = ''
_nodb = True
def __init__(self):
Subreddit.__init__(self)
self.title = ''
def is_moderator(self, user):
return c.user_is_loggedin and c.user_is_admin
@@ -568,6 +571,21 @@ class SubSR(FakeSubreddit):
@property
def path(self):
return "/reddits/"
class DomainSR(FakeSubreddit):
@property
def path(self):
return '/domain/' + self.domain
def __init__(self, domain):
FakeSubreddit.__init__(self)
self.domain = domain
self.name = domain
self.title = domain + ' ' + _('on reddit.com')
def get_links(self, sort, time):
from r2.lib.db import queries
return queries.get_domain_links(self.domain, sort, time)
Sub = SubSR()
Friends = FriendsSR()
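
DomainSR behaves like the other fake subreddits: it only carries the domain and delegates its listings to queries.get_domain_links. A toy stand-in (no FakeSubreddit inheritance, no r2 imports) showing the derived attributes:

    # Toy stand-in, not the real DomainSR, which inherits from FakeSubreddit.
    class DomainSRSketch(object):
        def __init__(self, domain):
            self.domain = domain
            self.name = domain
            self.title = domain + ' on reddit.com'

        @property
        def path(self):
            return '/domain/' + self.domain

    sr = DomainSRSketch('example.com')
    assert sr.path == '/domain/example.com'
    assert sr.title == 'example.com on reddit.com'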

View File

@@ -222,7 +222,7 @@ input[type=checkbox], input[type=radio] { margin-top: .4em; }
padding: 2px 6px 1px 6px;
background-color: white;
border: 1px solid #5f99cf;
border-bottom: none;
border-bottom: 1px solid white;
}
#search {
@@ -588,6 +588,7 @@ before enabling */
padding: 5px 10px;
margin: 5px 310px 5px 0px;
border: 1px solid orange;
font-size: small;
}
.menuarea {
@@ -958,8 +959,7 @@ a.star { text-decoration: none; color: #ff8b60 }
.searchpane a { color: #369 }*/
.searchpane {
margin: 5px;
margin-right: 310px;
margin: 5px 310px 5px 0px;
}
.searchpane #search input[type=text] { }

View File

@@ -45,7 +45,7 @@
var sr = {};
var logged = ${c.user_is_loggedin and ("'%s'" % c.user.name) or "false"};
var post_site = "${c.site.name}";
var post_site = "${c.site.name if not c.default_sr else ''}";
var cnameframe = ${'true' if c.cname else 'false'};
var modhash = ${"'%s'" % c.modhash or "false"};
var cur_domain = "${get_domain(cname = True, subreddit = False) if c.frameless_cname else g.domain}";