diff --git a/config/solr/schema.xml b/config/solr/schema.xml
index ecd6c0a38..7568c088a 100644
--- a/config/solr/schema.xml
+++ b/config/solr/schema.xml
@@ -385,6 +385,8 @@ CondeNet, Inc. All Rights Reserved.
+
+
diff --git a/r2/r2/config/middleware.py b/r2/r2/config/middleware.py
index a7b88647d..ef9953648 100644
--- a/r2/r2/config/middleware.py
+++ b/r2/r2/config/middleware.py
@@ -34,6 +34,7 @@ from pylons.wsgiapp import PylonsApp, PylonsBaseWSGIApp
 from r2.config.environment import load_environment
 from r2.config.rewrites import rewrites
 from r2.lib.utils import rstrips
+from r2.lib.jsontemplates import api_type

 #middleware stuff
 from r2.lib.html_source import HTMLValidationParser
@@ -240,7 +241,7 @@ class DomainMiddleware(object):

 class SubredditMiddleware(object):
-    sr_pattern = re.compile(r'^/r/([^/]+)')
+    sr_pattern = re.compile(r'^/r/([^/]{3,20})')

     def __init__(self, app):
         self.app = app
@@ -255,18 +256,50 @@ class SubredditMiddleware(object):
             environ['subreddit'] = 'r'
         return self.app(environ, start_response)

+class DomainListingMiddleware(object):
+    domain_pattern = re.compile(r'^/domain/(([\w]+\.)+[\w]+)')
+
+    def __init__(self, app):
+        self.app = app
+
+    def __call__(self, environ, start_response):
+        if not environ.has_key('subreddit'):
+            path = environ['PATH_INFO']
+            domain = self.domain_pattern.match(path)
+            if domain:
+                environ['domain'] = domain.groups()[0]
+                environ['PATH_INFO'] = self.domain_pattern.sub('', path) or '/'
+        return self.app(environ, start_response)
+
 class ExtensionMiddleware(object):
     ext_pattern = re.compile(r'\.([^/]+)$')

+    extensions = {'rss' : ('xml', 'text/xml; charset=UTF-8'),
+                  'xml' : ('xml', 'text/xml; charset=UTF-8'),
+                  'js' : ('js', 'text/javascript; charset=UTF-8'),
+                  'wired' : ('wired', 'text/javascript; charset=UTF-8'),
+                  'embed' : ('htmllite', 'text/javascript; charset=UTF-8'),
+                  'mobile' : ('mobile', 'text/html'),
+                  'png' : ('png', 'image/png'),
+                  'css' : ('css', 'text/css'),
+                  'api' : (api_type(), 'application/json; charset=UTF-8'),
+                  'json' : (api_type(), 'application/json; charset=UTF-8'),
+                  'json-html' : (api_type('html'), 'application/json; charset=UTF-8')}
+
     def __init__(self, app):
         self.app = app

     def __call__(self, environ, start_response):
         path = environ['PATH_INFO']
-        ext = self.ext_pattern.findall(path)
-        if ext:
-            environ['extension'] = ext[0]
-            environ['PATH_INFO'] = self.ext_pattern.sub('', path) or '/'
+        domain_ext = environ.get('reddit-domain-extension')
+        for ext, val in self.extensions.iteritems():
+            if ext == domain_ext or path.endswith(ext):
+                environ['extension'] = ext
+                environ['render_style'] = val[0]
+                environ['content_type'] = val[1]
+                #strip off the extension
+                environ['PATH_INFO'] = path[:-(len(ext) + 1)]
+                break
         return self.app(environ, start_response)

 class RewriteMiddleware(object):
@@ -382,11 +415,11 @@ def make_app(global_conf, full_stack=True, **app_conf):
         app = ProfilingMiddleware(app)
         app = SourceViewMiddleware(app)

-    app = SubredditMiddleware(app)
     app = DomainMiddleware(app)
+    app = DomainListingMiddleware(app)
+    app = SubredditMiddleware(app)
     app = ExtensionMiddleware(app)
-
     log_path = global_conf.get('log_path')
     if log_path:
         process_iden = global_conf.get('scgi_port', 'default')
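The middleware reordering in make_app is load-bearing. Each wrapper encloses the one assigned before it, so the middleware added last sees the request first: ExtensionMiddleware now resolves render style and content type up front, SubredditMiddleware claims /r/ paths (constrained to legal 3-20 character names), and DomainListingMiddleware only fires when no subreddit was set. A toy sketch of that inside-out dispatch order, with invented tagger names standing in for the real classes:

    # Minimal WSGI stack: the wrapper applied last is outermost and
    # therefore runs first on every request.
    def make_tagger(name, app):
        def tagger(environ, start_response):
            environ.setdefault('trace', []).append(name)
            return app(environ, start_response)
        return tagger

    def base_app(environ, start_response):
        start_response('200 OK', [('Content-Type', 'text/plain')])
        return [' -> '.join(environ['trace']).encode('utf8')]

    app = base_app
    app = make_tagger('domain', app)          # wrapped first: runs last
    app = make_tagger('domain_listing', app)
    app = make_tagger('subreddit', app)
    app = make_tagger('extension', app)       # wrapped last: runs first

    print(app({'PATH_INFO': '/domain/example.com'}, lambda s, h: None))
    # [b'extension -> subreddit -> domain_listing -> domain']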
diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py
index ac8182f77..a2c680c7e 100644
--- a/r2/r2/controllers/front.py
+++ b/r2/r2/controllers/front.py
@@ -32,12 +32,14 @@
 from r2.lib.template_helpers import get_domain
 from r2.lib.emailer import has_opted_out, Email
 from r2.lib.db.operators import desc
 from r2.lib.strings import strings
+from r2.lib.solrsearch import RelatedSearchQuery, SubredditSearchQuery, LinkSearchQuery
 import r2.lib.db.thing as thing
 from listingcontroller import ListingController
 from pylons import c, request

 import random as rand
 import re
+import time as time_module
 from urllib import quote_plus

 from admin import admin_profile_query
@@ -292,6 +294,7 @@
     def GET_related(self, num, article, after, reverse, count):
         """Related page: performs a search using title of article as
         the search query."""
+        title = c.site.name + ((': ' + article.title) if hasattr(article, 'title') else '')

         query = self.related_replace_regex.sub(self.related_replace_with,
@@ -301,24 +304,25 @@
         # longer than this are typically ascii art anyway
         query = query[0:1023]

-        num, t, pane = self._search(query, time = 'all',
-                                    count = count,
-                                    after = after, reverse = reverse, num = num,
-                                    ignore = [article._fullname],
-                                    types = [Link])
-        res = LinkInfoPage(link = article, content = pane).render()
-        return res
+        q = RelatedSearchQuery(query, ignore = [article._fullname])
+        num, t, pane = self._search(q,
+                                    num = num, after = after, reverse = reverse,
+                                    count = count)
+
+        return LinkInfoPage(link = article, content = pane).render()

     @base_listing
     @validate(query = nop('q'))
     def GET_search_reddits(self, query, reverse, after, count, num):
         """Search reddits by title and description."""
-        num, t, spane = self._search(query, num = num, types = [Subreddit],
-                                     sort='points desc', time='all',
-                                     after = after, reverse = reverse,
-                                     count = count)
+        # note that 'downs' is a measure of activity on subreddits
+        q = SubredditSearchQuery(query, sort = 'downs desc',
+                                 timerange = 'all')
+
+        num, t, spane = self._search(q, num = num, reverse = reverse,
+                                     after = after, count = count)

-        res = SubredditsPage(content=spane,
+        res = SubredditsPage(content=spane, prev_search = query,
                              elapsed_time = t,
                              num_results = num,
@@ -327,7 +331,7 @@
     verify_langs_regex = re.compile(r"^[a-z][a-z](,[a-z][a-z])*$")
     @base_listing
-    @validate(query=nop('q'),
+    @validate(query = nop('q'),
               time = VMenu('action', TimeMenu, remember = False),
               langs = nop('langs'))
     def GET_search(self, query, num, time, reverse, after, count, langs):
@@ -340,12 +344,12 @@
         if langs and self.verify_langs_regex.match(langs):
             langs = langs.split(',')
         else:
-            langs = None
+            langs = c.content_langs

-        num, t, spane = self._search(query, time=time,
-                                     num = num, after = after,
-                                     reverse = reverse,
-                                     count = count, types = [Link])
+        q = LinkSearchQuery(q = query, timerange = time, langs = langs)
+
+        num, t, spane = self._search(q, num = num, after = after, reverse = reverse,
+                                     count = count)

         if not isinstance(c.site,FakeSubreddit):
             my_reddits_link = "/search%s" % query_string({'q': query})
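The controller changes all follow one pattern: the grab-bag of keyword arguments that _search used to forward is replaced by a query object that carries its own Solr semantics, leaving the controller with only paging concerns. A minimal sketch of the shape (the names here are illustrative, not the real API):

    class FakeResults(object):
        def __init__(self, docs, hits):
            self.docs, self.hits = docs, hits

    class BaseQuery(object):
        def __init__(self, q, sort):
            self.q, self.sort = q, sort
        def run(self, after=None, num=25, reverse=False):
            # a real implementation would send q/sort to the search backend
            return FakeResults(['t3_%d' % i for i in range(num)], num)

    class NewestQuery(BaseQuery):
        def __init__(self, q):
            BaseQuery.__init__(self, q, sort='date desc')

    def do_search(query_obj, num, after=None, reverse=False):
        # paging stays in one place; search semantics live in the object
        return query_obj.run(after=after, num=num, reverse=reverse)

    print(do_search(NewestQuery('cats'), num=3).docs)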
@@ -365,26 +369,22 @@
         return res

-    def _search(self, query = '', time=None,
-                sort = 'hot desc',
-                after = None, reverse = False, num = 25,
-                ignore = None, count=0, types = None,
-                langs = None):
+    def _search(self, query_obj, num, after, reverse, count=0):
         """Helper function for interfacing with search.  Basically a
         thin wrapper for SearchBuilder."""
-        builder = SearchBuilder(query, num = num,
-                                sort = sort,
-                                after = after, reverse = reverse,
-                                count = count, types = types,
-                                time = time, ignore = ignore,
-                                langs = langs,
+        builder = SearchBuilder(query_obj,
+                                after = after, num = num, reverse = reverse,
+                                count = count,
                                 wrap = ListingController.builder_wrapper)
+
         listing = LinkListing(builder, show_nums=True)

         # have to do it in two steps since total_num and timing are only
         # computed after fetch_more
         res = listing.listing()
-        return builder.total_num, builder.timing, res
+        timing = time_module.time() - builder.start_time
+
+        return builder.total_num, timing, res
diff --git a/r2/r2/controllers/listingcontroller.py b/r2/r2/controllers/listingcontroller.py
index 6d3299ebc..ecb996624 100644
--- a/r2/r2/controllers/listingcontroller.py
+++ b/r2/r2/controllers/listingcontroller.py
@@ -33,6 +33,7 @@ from r2.lib.db.thing import Query, Merge, Relations
 from r2.lib.db import queries
 from r2.lib.strings import Score
 from r2.lib import organic
+from r2.lib.solrsearch import SearchQuery
 from r2.lib.utils import iters, check_cheating

 from admin import admin_profile_query
@@ -112,6 +113,8 @@ class ListingController(RedditController):
             builder_cls = self.builder_cls
         elif isinstance(self.query_obj, Query):
             builder_cls = QueryBuilder
+        elif isinstance(self.query_obj, SearchQuery):
+            builder_cls = SearchBuilder
         elif isinstance(self.query_obj, iters):
             builder_cls = IDBuilder
         elif isinstance(self.query_obj, queries.CachedResults):
diff --git a/r2/r2/controllers/reddit_base.py b/r2/r2/controllers/reddit_base.py
index 126ff4f49..8415ccc5d 100644
--- a/r2/r2/controllers/reddit_base.py
+++ b/r2/r2/controllers/reddit_base.py
@@ -212,13 +212,18 @@ def over18():
             return True

 def set_subreddit():
-    sr_name=request.environ.get("subreddit", request.params.get('r'))
+    #the r parameter gets added by javascript for POST requests so we
+    #can reference c.site in api.py
+    sr_name = request.environ.get("subreddit", request.POST.get('r'))
+    domain = request.environ.get("domain")

-    if not sr_name or sr_name == Default.name:
+    if not sr_name:
+        #check for cnames
         sub_domain = request.environ.get('sub_domain')
         sr = Subreddit._by_domain(sub_domain) if sub_domain else None
         c.site = sr or Default
     elif sr_name == 'r':
+        #reddits
         c.site = Sub
     else:
         try:
@@ -227,6 +232,10 @@ def set_subreddit():
             c.site = Default
             redirect_to("/reddits/create?name=%s" % sr_name)

+    #if we didn't find a subreddit, check for a domain listing
+    if not sr_name and c.site == Default and domain:
+        c.site = DomainSR(domain)
+
     if isinstance(c.site, FakeSubreddit):
         c.default_sr = True
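With _search reduced to paging plus timing, ListingController can treat a SearchQuery like any other query source: the isinstance chain above picks SearchBuilder for it exactly as it picks QueryBuilder for database queries. The dispatch reads naturally as a first-match-wins table (the class names below are bare stand-ins):

    class QueryBuilder(object): pass
    class SearchBuilder(object): pass
    class IDBuilder(object): pass

    class Query(object): pass        # stand-in for r2.lib.db.thing.Query
    class SearchQuery(object): pass  # stand-in for r2.lib.solrsearch.SearchQuery

    # ordered list: the first matching type wins, mirroring the elif chain
    BUILDER_DISPATCH = [
        (Query, QueryBuilder),
        (SearchQuery, SearchBuilder),
        ((list, tuple), IDBuilder),
    ]

    def builder_for(query_obj):
        for types, cls in BUILDER_DISPATCH:
            if isinstance(query_obj, types):
                return cls
        raise TypeError('no builder for %r' % (query_obj,))

    print(builder_for(SearchQuery()).__name__)  # SearchBuilder
    print(builder_for([1, 2, 3]).__name__)      # IDBuilder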
@@ -235,42 +244,16 @@ def set_subreddit():
         abort(404, "not found")

 def set_content_type():
-    c.extension = request.environ.get('extension') or \
-        request.environ.get('reddit-domain-extension') or ''
-    c.render_style = 'html'
-    if c.extension in ('rss', 'xml'):
-        c.render_style = 'xml'
-        c.response_content_type = 'text/xml; charset=UTF-8'
-    elif c.extension == 'js':
-        c.render_style = 'js'
-        c.response_content_type = 'text/javascript; charset=UTF-8'
-    elif c.extension.startswith('json') or c.extension == "api":
-        c.response_content_type = 'application/json; charset=UTF-8'
-        c.response_access_control = 'allow <*>'
-        if c.extension == 'json-html':
-            c.render_style = api_type('html')
-        else:
-            c.render_style = api_type()
-    elif c.extension == 'wired':
-        c.render_style = 'wired'
-        c.response_content_type = 'text/javascript; charset=UTF-8'
-        c.response_wrappers.append(utils.to_js)
-    elif c.extension == 'embed':
-        c.render_style = 'htmllite'
-        c.response_content_type = 'text/javascript; charset=UTF-8'
-        c.response_wrappers.append(utils.to_js)
-    elif c.extension == 'mobile':
-        c.render_style = 'mobile'
-    elif c.extension == 'png':
-        c.response_content_type = 'image/png'
-        c.render_style = 'png'
-    elif c.extension == 'css':
-        c.response_content_type = 'text/css'
-        c.render_style = 'css'
-    #Insert new extentions above this line
-    elif c.extension not in ('', 'html'):
-        # request.path already has the extension stripped off of it
-        redirect_to(request.path + utils.query_string(request.get))
+    e = request.environ
+    if e.has_key('extension'):
+        c.render_style = e['render_style']
+        c.response_content_type = e['content_type']
+
+        ext = e['extension']
+        if ext == 'api' or ext.startswith('json'):
+            c.response_access_control = 'allow <*>'
+        if ext in ('embed', 'wired'):
+            c.response_wrappers.append(utils.to_js)

 def get_browser_langs():
     browser_langs = []
diff --git a/r2/r2/lib/base.py b/r2/r2/lib/base.py
index 48a1eba84..10bf73bc0 100644
--- a/r2/r2/lib/base.py
+++ b/r2/r2/lib/base.py
@@ -118,7 +118,8 @@ class BaseController(WSGIController):
             u.mk_cname(**kw)

         # make sure the extensions agree with the current page
-        u.set_extension(c.extension)
+        if c.extension:
+            u.set_extension(c.extension)

         # unparse and encode it un utf8
         return _force_unicode(u.unparse()).encode('utf8')
diff --git a/r2/r2/lib/cache.py b/r2/r2/lib/cache.py
index ba9324eaa..75b10a0cb 100644
--- a/r2/r2/lib/cache.py
+++ b/r2/r2/lib/cache.py
@@ -225,7 +225,7 @@ def test_cache(cache):
 # a cache that occasionally dumps itself to be used for long-running
 # processes
 class SelfEmptyingCache(LocalCache):
-    def __init__(self,max_size=50*1000):
+    def __init__(self,max_size=100*1000):
         self.max_size = max_size

     def maybe_reset(self):
diff --git a/r2/r2/lib/db/queries.py b/r2/r2/lib/db/queries.py
index 824f60f92..95c1c34ed 100644
--- a/r2/r2/lib/db/queries.py
+++ b/r2/r2/lib/db/queries.py
@@ -5,6 +5,7 @@ from r2.lib.db.operators import asc, desc, timeago
 from r2.lib.db import query_queue
 from r2.lib.db.sorts import epoch_seconds
 from r2.lib.utils import fetch_things2, worker
+from r2.lib.solrsearch import DomainSearchQuery

 from datetime import datetime

@@ -23,6 +24,12 @@ def db_sort(sort):
     cls, col = db_sorts[sort]
     return cls(col)

+search_sort = dict(hot = 'hot desc',
+                   new = 'date desc',
+                   top = 'points desc',
+                   controversial = 'controversy desc',
+                   old = 'date asc')
+
 db_times = dict(all = None,
                 hour = Thing.c._date >= timeago('1 hour'),
                 day = Thing.c._date >= timeago('1 day'),
@@ -176,6 +183,9 @@ def get_links(sr, sort, time):
         q._filter(db_times[time])
     return make_results(q)

+def get_domain_links(domain, sort, time):
+    return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)
+
 def user_query(kind, user, sort, time):
     """General profile-page query."""
     q = kind._query(kind.c.author_id == user._id,
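get_domain_links deliberately returns the DomainSearchQuery unexecuted, so a domain listing flows through the same builder machinery as a database-backed one; the search_sort dict is what translates listing names into Solr sort clauses. A trivial sketch using the mapping above:

    search_sort = dict(hot = 'hot desc',
                       new = 'date desc',
                       top = 'points desc',
                       controversial = 'controversy desc',
                       old = 'date asc')

    def solr_sort_for(listing_name):
        # the UI speaks 'hot'/'new'/...; Solr wants '<field> <direction>'
        return search_sort[listing_name]

    print(solr_sort_for('top'))  # points desc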
diff --git a/r2/r2/lib/solrsearch.py b/r2/r2/lib/solrsearch.py
index 2409c50b1..d7090fd97 100644
--- a/r2/r2/lib/solrsearch.py
+++ b/r2/r2/lib/solrsearch.py
@@ -32,7 +32,7 @@ from r2.models import *
 from r2.lib.contrib import pysolr
 from r2.lib.contrib.pysolr import SolrError
 from r2.lib.utils import timeago, set_emptying_cache, IteratorChunker
-from r2.lib.utils import psave, pload, unicode_safe
+from r2.lib.utils import psave, pload, unicode_safe, tup
 from r2.lib.cache import SelfEmptyingCache
 from Queue import Queue
 from threading import Thread
@@ -125,6 +125,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
                        Field('lang'),
                        Field('ups', '_ups', is_number=True, reverse=True),
                        Field('downs', '_downs', is_number=True, reverse=True),
+                       Field('spam','_spam'),
+                       Field('deleted','_deleted'),
                        Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
                        Field('controversy', '_controversy', is_number=True, reverse=True),
                        Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
@@ -162,8 +164,8 @@ search_fields={Thing: (Field('fullname', '_fullname'),
                                   # yes, it's a copy of 'hot'
                                   is_number=True, reverse=True),
                        ThingField('author',Account,'author_id','name'),
-                       #ThingField('subreddit',Subreddit,'sr_id','name'),
-                       ThingField('reddit',Subreddit,'sr_id','name'))}
+                       ThingField('subreddit',Subreddit,'sr_id','name'))}
+                       #ThingField('reddit',Subreddit,'sr_id','name'))}

 def tokenize_things(things,return_dict=False):
     """
@@ -276,6 +278,8 @@ def fetch_batches(t_class,size,since,until):
     of `fetch_things`
     """
     q=t_class._query(t_class.c._date >= since,
+                     t_class.c._spam == (True,False),
+                     t_class.c._deleted == (True,False),
                      t_class.c._date < until,
                      sort = desc('_date'),
                      limit = size,
@@ -375,8 +379,8 @@ def reindex_all(types = None, delete_all_first=False):
         for batch in fetch_batches(cls,1000,
                                    timeago("50 years"),
                                    start_t):
-            r = tokenize_things([x for x in batch
-                                 if not x._spam and not x._deleted ])
+            r = tokenize_things([ x for x in batch
+                                  if not x._spam and not x._deleted ])

             count += len(r)
             print ("Processing %s #%d(%s): %s"
@@ -465,173 +469,241 @@ def combine_searchterms(terms):

 def swap_strings(s,this,that):
     """
     Just swaps substrings, like:
-       s = "sort(asc)"
-       swap_strings(s,'asc','desc')
-       s -> "sort desc"
+       s = "hot asc"
+       s = swap_strings(s,'asc','desc')
+       s == "hot desc"

     uses 'tmp' as a replacment string, so don't use for anything
     very complicated
     """
     return s.replace(this,'tmp').replace(that,this).replace('tmp',that)
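The corrected docstring above now matches how swap_strings is actually used: it returns a new string rather than mutating its argument, and the three-way replace through 'tmp' flips every asc/desc pair in a sort clause at once. A quick demonstration:

    def swap_strings(s, this, that):
        # same body as above; safe only while 'tmp' cannot occur in s
        return s.replace(this, 'tmp').replace(that, this).replace('tmp', that)

    sort = "score desc, hot desc, date desc, fullname asc"
    print(swap_strings(sort, 'asc', 'desc'))
    # score asc, hot asc, date asc, fullname desc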
-def search_things(q, sort = 'hot desc',
-                  after = None,
-                  subreddits = None,
-                  authors = None,
-                  num = 100, reverse = False,
-                  timerange = None, langs = None,
-                  types = None,
-                  boost = []):
-    """
-    Takes a given query and returns a list of Things that match
-    that query. See Builder for the use of `after`, `reverse`, and
-    `num`. Queries on params are OR queries, except `timerange`
-    and `types`
-    """
-    if not q or not g.solr_url:
-        return pysolr.Results([],0)
-
-    # there are two parts to our query: what the user typed (parsed
-    # with Solr's DisMax parser), and what we are adding to it. The
-    # latter is called the "boost" (and is parsed using full Lucene
-    # syntax), and it can be added to via the `boost` parameter (which
-    # we have to copy since we append to it)
-    boost = list(boost)
-
-    # `score` refers to Solr's score (relevency to the search given),
-    # not our score (sums of ups and downs).
-    sort = "score desc, %s, date desc, fullname asc" % (sort,)
-    if reverse:
-        sort = swap_strings(sort,'asc','desc')
-
-    if timerange:
-        def time_to_searchstr(t):
-            if isinstance(t, datetime):
-                t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
-            elif isinstance(t, date):
-                t = t.strftime('%Y-%m-%dT00:00:00.000Z')
-            elif isinstance(t,str):
-                t = t
-            return t
-
-        (fromtime, totime) = timerange
-        fromtime = time_to_searchstr(fromtime)
-        totime = time_to_searchstr(totime)
-        boost.append("+date:[%s TO %s]"
-                     % (fromtime,totime))
-
-    if subreddits:
-        def subreddit_to_searchstr(sr):
-            if isinstance(sr,Subreddit):
-                return ('sr_id','%d' % sr.id)
-            elif isinstance(sr,str) or isinstance(sr,unicode):
-                return ('reddit',sr)
-            else:
-                return ('sr_id','%d' % sr)
-
-        if isinstance(subreddits,list) or isinstance(subreddits,tuple):
-            s_subreddits = map(subreddit_to_searchstr, subreddits)
-        else:
-            s_subreddits = (subreddit_to_searchstr(subreddits),)
-
-        boost.append("+(%s)^2" % combine_searchterms(s_subreddits))
-
-    if authors:
-        def author_to_searchstr(a):
-            if isinstance(a,Account):
-                return ('author_id','%d' % a.id)
-            elif isinstance(a,str) or isinstance(a,unicode):
-                return ('author',a)
-            else:
-                return ('author_id','%d' % a)
-
-        if isinstance(authors,list) or isinstance(authors,tuple):
-            s_authors = map(author_to_searchstr,authors)
-        else:
-            s_authors = map(author_to_searchstr,(authors,))
-
-        boost.append('+(%s)^2' % combine_searchterms(s_authors))
+class SearchQuery(object):
+    def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
+                 types = [], timerange = None, spam = False, deleted = False):
+
+        self.q = q
+        self.fields = fields
+        self.sort = sort
+        self.subreddits = subreddits
+        self.authors = authors
+        self.types = types
+        self.spam = spam
+        self.deleted = deleted
+
+        if timerange in ['hour','week','day','month','year']:
+            self.timerange = (timeago("1 %s" % timerange),"NOW")
+        elif timerange == 'all' or timerange is None:
+            self.timerange = None
+        else:
+            self.timerange = timerange
+
+    def run(self, after = None, num = 100, reverse = False):
+        if not self.q or not g.solr_url:
+            return pysolr.Results([],0)
+
+        # there are two parts to our query: what the user typed
+        # (parsed with Solr's DisMax parser), and what we are adding
+        # to it. The latter is called the "boost" (and is parsed using
+        # full Lucene syntax), and it can be added to via the `boost`
+        # parameter
+        boost = []
+
+        if not self.spam:
+            boost.append("-spam:true")
+        if not self.deleted:
+            boost.append("-deleted:true")
+
+        if self.timerange:
+            def time_to_searchstr(t):
+                if isinstance(t, datetime):
+                    t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
+                elif isinstance(t, date):
+                    t = t.strftime('%Y-%m-%dT00:00:00.000Z')
+                elif isinstance(t,str):
+                    t = t
+                return t
+
+            (fromtime, totime) = self.timerange
+            fromtime = time_to_searchstr(fromtime)
+            totime = time_to_searchstr(totime)
+            boost.append("+date:[%s TO %s]"
+                         % (fromtime,totime))
+
+        if self.subreddits:
+            def subreddit_to_searchstr(sr):
+                if isinstance(sr,Subreddit):
+                    return ('sr_id','%d' % sr.id)
+                elif isinstance(sr,str) or isinstance(sr,unicode):
+                    return ('subreddit',sr)
+                else:
+                    return ('sr_id','%d' % sr)
+
+            s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))
+
+            boost.append("+(%s)" % combine_searchterms(s_subreddits))
+
+        if self.authors:
+            def author_to_searchstr(a):
+                if isinstance(a,Account):
+                    return ('author_id','%d' % a.id)
+                elif isinstance(a,str) or isinstance(a,unicode):
+                    return ('author',a)
+                else:
+                    return ('author_id','%d' % a)
+
+            s_authors = map(author_to_searchstr,tup(self.authors))
+
+            boost.append('+(%s)^2' % combine_searchterms(s_authors))
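SearchQuery.run assembles the machine-controlled half of the query as a list of Lucene clauses: a leading '-' excludes, '+(...)' requires, and a trailing '^2' weights a clause up. The new '-spam:true'/'-deleted:true' pair is the reason the spam and deleted fields are now indexed at all. A small illustration of how the fragments combine (combine_searchterms is simplified here to a plain OR join):

    def combine(terms):
        # simplified stand-in for combine_searchterms: OR field:value pairs
        return ' OR '.join('%s:%s' % (f, v) for f, v in terms)

    boost = []
    boost.append('-spam:true')      # hide spam unless asked otherwise
    boost.append('-deleted:true')   # likewise for deleted things
    boost.append('+date:[2008-01-01T00:00:00.000Z TO NOW]')
    boost.append('+(%s)' % combine([('sr_id', '42'), ('sr_id', '7')]))
    boost.append('+(%s)^2' % combine([('author', 'alice')]))

    print(' '.join(boost))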
-    # the set of languages is used to determine the fields to search,
-    # named ('contents_%s' % lang), but 'contents' (which is split
-    # only on whitespace) is always also searched.  This means that
-    # all_langs and schema.xml must be kept in synch
-    default_fields = ['contents^1.5','contents_ws^3',
-                      'site^1','author^1', 'reddit^1', 'url^1']
-    if langs == None:
-        # only search 'contents'
-        fields = default_fields
-    else:
-        if langs == 'all':
-            langs = searchable_langs
-        fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
-                     + default_fields)
-
-    if not types:
-        types = indexed_types
-
-    def type_to_searchstr(t):
-        if isinstance(t,str):
-            return ('type',t)
-        else:
-            return ('type',t.__name__.lower())
-
-    s_types = map(type_to_searchstr,types)
-    boost.append("+%s" % combine_searchterms(s_types))
-
-    # everything else that solr needs to know
-    solr_params = dict(fl = 'fullname', # the field(s) to return
-                       qt = 'dismax', # the query-handler (dismax supports 'bq' and 'qf')
-                       # qb = '3',
-                       bq = ' '.join(boost),
-                       qf = ' '.join(fields),
-                       mm = '75%') # minimum number of clauses that should match
-
-    with SolrConnection() as s:
-        if after:
-            # size of the pre-search to run in the case that we need
-            # to search more than once. A bigger one can reduce the
-            # number of searches that need to be run twice, but if
-            # it's bigger than the default display size, it could
-            # waste some
-            PRESEARCH_SIZE = num
-
-            # run a search and get back the number of hits, so that we
-            # can re-run the search with that max_count.
-            pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
-                                  other_params = solr_params)
+        def type_to_searchstr(t):
+            if isinstance(t,str):
+                return ('type',t)
+            else:
+                return ('type',t.__name__.lower())
+
+        s_types = map(type_to_searchstr,self.types)
+        boost.append("+%s" % combine_searchterms(s_types))
+
+        q,solr_params = self.solr_params(self.q,boost)
+
+        try:
+            search = self.run_search(q, self.sort, solr_params,
+                                     reverse, after, num)
+            return search
+
+        except SolrError,e:
+            g.log.error(str(e))
+            return pysolr.Results([],0)
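Note the failure mode: a SolrError is logged and swallowed, so a sick search cluster degrades to an empty listing instead of a traceback. A dependency-free sketch of that fail-soft pattern (EmptyResults and flaky_backend are invented for the example):

    class EmptyResults(object):
        docs, hits = [], 0

    def flaky_backend(q):
        raise RuntimeError('solr down')

    def run_query(q, backend=flaky_backend, log=print):
        try:
            return backend(q)
        except RuntimeError as e:
            log('search failed: %s' % e)
            return EmptyResults()  # page renders, just with no rows

    print(run_query('cats').hits)  # 0, not a traceback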
+    @classmethod
+    def run_search(cls, q, sort, solr_params, reverse, after, num):
+        "returns pysolr.Results(docs=[fullname()],hits=int())"
+
+        if reverse:
+            sort = swap_strings(sort,'asc','desc')
+
+        g.log.debug("Searching q=%s" % q)
+
+        with SolrConnection() as s:
+            if after:
+                # size of the pre-search to run in the case that we
+                # need to search more than once. A bigger one can
+                # reduce the number of searches that need to be run
+                # twice, but if it's bigger than the default display
+                # size, it could waste some
+                PRESEARCH_SIZE = num
+
+                # run a search and get back the number of hits, so
+                # that we can re-run the search with that max_count.
+                pre_search = s.search(q,sort,rows=PRESEARCH_SIZE,
+                                      other_params = solr_params)
+
+                if (PRESEARCH_SIZE >= pre_search.hits
+                    or pre_search.hits == len(pre_search.docs)):
+                    # don't run a second search if our pre-search
+                    # found all of the elements anyway
+                    search = pre_search
+                else:
+                    # we have to run a second search, but we can limit
+                    # the duplicated transfer of the first few records
+                    # since we already have those from the pre_search
+                    second_search = s.search(q,sort,
+                                             start=len(pre_search.docs),
+                                             rows=pre_search.hits - len(pre_search.docs),
+                                             other_params = solr_params)
+                    search = pysolr.Results(pre_search.docs + second_search.docs,
+                                            pre_search.hits)
+
+                search.docs = [ i['fullname'] for i in search.docs ]
+                search.docs = get_after(search.docs, after._fullname, num)
+            else:
+                search = s.search(q,sort,rows=num, other_params = solr_params)
+                search.docs = [ i['fullname'] for i in search.docs ]
+
+        return search
-            if (PRESEARCH_SIZE >= pre_search.hits
-                or pre_search.hits == len(pre_search.docs)):
-                # don't run a second search if our pre-search found
-                # all of the elements anyway
-                search = pre_search
-            else:
-                # we have to run a second search, but we can limit the
-                # duplicated transfer of the first few records since
-                # we already have those from the pre_search
-                second_search = s.search(q,sort,
-                                         start=len(pre_search.docs),
-                                         rows=pre_search.hits - len(pre_search.docs),
-                                         other_params = solr_params)
-                search = pysolr.Results(pre_search.docs + second_search.docs,
-                                        pre_search.hits)
-
-            fullname = after._fullname
-            for i, item in enumerate(search.docs):
-                if item['fullname'] == fullname:
-                    search.docs = search.docs[i+1:i+1+num]
-                    break
-            else:
-                g.log.debug("I got an after query, but the fullname was not present in the results")
-                search.docs = search.docs[0:num]
-        else:
-            search = s.search(q,sort,rows=num,
-                              other_params = solr_params)
-
-    hits = search.hits
-    things = Thing._by_fullname([i['fullname'] for i in search.docs],
-                                data = True, return_dict = False)
-
-    return pysolr.Results(things,hits)
+    def solr_params(self,*k,**kw):
+        raise NotImplementedError
+
+class UserSearchQuery(SearchQuery):
+    "Base class for queries that use the dismax parser; requires self.mm"
+    def __init__(self, q, sort=None, fields=[], langs=None, **kw):
+        default_fields = ['contents^1.5','contents_ws^3'] + fields
+
+        if sort is None:
+            sort = 'score desc, hot desc, date desc'
+
+        if langs is None:
+            fields = default_fields
+        else:
+            if langs == 'all':
+                langs = searchable_langs
+            fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+                         + default_fields)
+
+        # default minimum match
+        self.mm = '75%'
+
+        SearchQuery.__init__(self, q, sort, fields = fields, **kw)
+
+    def solr_params(self, q, boost):
+        return q, dict(fl = 'fullname',
+                       qt = 'dismax',
+                       bq = ' '.join(boost),
+                       qf = ' '.join(self.fields),
+                       mm = self.mm)
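run_search pages by fullname rather than by offset: with an 'after' anchor it runs a pre-search of num rows, and only when the anchor could lie beyond those rows does it issue a second query that starts where the first stopped, so no row is transferred twice. A self-contained sketch of that control flow, with a plain list standing in for Solr:

    def paged_fetch(all_docs, after, num):
        presearch = all_docs[:num]            # pre-search: rows=num
        if num >= len(all_docs) or len(presearch) == len(all_docs):
            docs = presearch                  # one round trip sufficed
        else:
            # the second query resumes at start=len(presearch)
            docs = presearch + all_docs[len(presearch):]
        # finally slice out the page that follows the anchor
        if after in docs:
            i = docs.index(after)
            return docs[i + 1:i + 1 + num]
        return docs[:num]

    print(paged_fetch(['t3_a', 't3_b', 't3_c', 't3_d'], 't3_b', 2))
    # ['t3_c', 't3_d']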
+class LinkSearchQuery(UserSearchQuery):
+    def __init__(self, q, **kw):
+        additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1']
+
+        subreddits = None
+        authors = None
+        if c.site == subreddit.Default:
+            subreddits = Subreddit.user_subreddits(c.user)
+        elif c.site == subreddit.Friends and c.user.friends:
+            authors = c.user.friends
+        elif not isinstance(c.site,subreddit.FakeSubreddit):
+            subreddits = [c.site._id]
+
+        UserSearchQuery.__init__(self, q, fields = additional_fields,
+                                 subreddits = subreddits, authors = authors,
+                                 types=[Link], **kw)
+
+class RelatedSearchQuery(LinkSearchQuery):
+    def __init__(self, q, ignore = [], **kw):
+        self.ignore = set(ignore) if ignore else set()
+
+        LinkSearchQuery.__init__(self, q, sort = 'score desc', **kw)
+
+        self.mm = '25%'
+
+    def run(self, *k, **kw):
+        search = LinkSearchQuery.run(self, *k, **kw)
+        search.docs = [ x for x in search.docs if x not in self.ignore ]
+        return search
+
+class SubredditSearchQuery(UserSearchQuery):
+    def __init__(self, q, **kw):
+        UserSearchQuery.__init__(self, q, types=[Subreddit], **kw)
+
+class DomainSearchQuery(SearchQuery):
+    def __init__(self, domain, **kw):
+        q = '+site:%s' % domain
+
+        SearchQuery.__init__(self, q=q, fields=['site'],types=[Link], **kw)
+
+    def solr_params(self, q, boost):
+        q = q + ' ' + ' '.join(boost)
+        return q, dict(fl='fullname',
+                       qt='standard')
+
+def get_after(fullnames, fullname, num):
+    for i, item in enumerate(fullnames):
+        if item == fullname:
+            return fullnames[i+1:i+num+1]
+    else:
+        return fullnames[:num]
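get_after is the anchor slicing that used to be inlined in search_things, now a pure function over fullnames: it returns the num entries following the anchor, falling back to the first num when the anchor is absent (the for/else only fires when the loop finds nothing). Usage:

    def get_after(fullnames, fullname, num):
        # copied from the diff above
        for i, item in enumerate(fullnames):
            if item == fullname:
                return fullnames[i+1:i+num+1]
        else:
            return fullnames[:num]

    print(get_after(['a', 'b', 'c', 'd'], 'b', 2))   # ['c', 'd']
    print(get_after(['a', 'b', 'c', 'd'], 'zz', 2))  # ['a', 'b']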
diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py
index 7920ee06c..25e294b99 100644
--- a/r2/r2/lib/utils/utils.py
+++ b/r2/r2/lib/utils/utils.py
@@ -999,6 +999,7 @@ def title_to_url(title, max_length = 50):
     return title

 def debug_print(fn):
+    from pylons import g
     def new_fn(*k,**kw):
         ret = fn(*k,**kw)
         g.log.debug("Fn: %s; k=%s; kw=%s\nRet: %s"
diff --git a/r2/r2/models/builder.py b/r2/r2/models/builder.py
index 2368331c8..5bbed1038 100644
--- a/r2/r2/models/builder.py
+++ b/r2/r2/models/builder.py
@@ -354,42 +354,26 @@ class IDBuilder(QueryBuilder):
         return done, new_items

 class SearchBuilder(QueryBuilder):
-    def __init__(self, query, wrap = Wrapped, sort = None, ignore = [],
-                 time = time, types = None, langs = None, **kw):
-        QueryBuilder.__init__(self, query, wrap=wrap, **kw)
-        self.sort = sort
-        self.time = time
-        self.types = types
-        self.timing = 0
-        self.total_num = 0
-        self.langs = langs
-
-        self.ignore = set(x for x in (ignore if ignore else []))
-
     def init_query(self):
-        subreddits = None
-        authors = None
-        if c.site == subreddit.Default:
-            subreddits = Subreddit.user_subreddits(c.user)
-        elif c.site == subreddit.Friends and c.user.friends:
-            authors = c.user.friends
-        elif not isinstance(c.site,subreddit.FakeSubreddit):
-            subreddits = c.site._id
-
-        self.subreddits = subreddits
-        self.authors = authors
-
         self.skip = True
+        self.total_num = 0
+        self.start_time = time.time()

     def keep_item(self,item):
-        skip_if = item._spam or item._deleted or item._fullname in self.ignore
-        return not skip_if
+        # doesn't use the default keep_item because we want to keep
+        # things that were voted on, even if they've chosen to hide
+        # them in normal listings
+        if item._spam or item._deleted:
+            return False
+        else:
+            return True

     def fetch_more(self, last_item, num_have):
         from r2.lib import solrsearch
-        start_t = time.time()
-
         done = False
         limit = None
         if self.num:
@@ -401,25 +385,13 @@ class SearchBuilder(QueryBuilder):
         else:
             done = True

-        langs = c.content_langs
-        if self.langs:
-            langs += self.langs
-
-        if self.time in ['hour','week','day','month']:
-            timerange = (timeago("1 %s" % self.time),"NOW")
-        else:
-            timerange = None
-
-        new_items = solrsearch.search_things(q = self.query or '', sort = self.sort,
-                                             after = last_item,
-                                             subreddits = self.subreddits,
-                                             authors = self.authors,
-                                             num = limit, reverse = self.reverse,
-                                             timerange = timerange, langs = langs,
-                                             types = self.types)
-
-        self.total_num = new_items.hits
-        self.timing = time.time() - start_t
+        search = self.query.run(after = last_item or self.after,
+                                reverse = self.reverse,
+                                num = limit)
+
+        new_items = Thing._by_fullname(search.docs, data = True, return_dict=False)
+
+        self.total_num = search.hits

         return done, new_items
diff --git a/r2/r2/models/subreddit.py b/r2/r2/models/subreddit.py
index 4c9e5cf52..6694f65b4 100644
--- a/r2/r2/models/subreddit.py
+++ b/r2/r2/models/subreddit.py
@@ -414,9 +414,12 @@ class Subreddit(Thing, Printable):

 class FakeSubreddit(Subreddit):
     over_18 = False
-    title = ''
     _nodb = True

+    def __init__(self):
+        Subreddit.__init__(self)
+        self.title = ''
+
     def is_moderator(self, user):
         return c.user_is_loggedin and c.user_is_admin

@@ -568,6 +571,21 @@ class SubSR(FakeSubreddit):
     @property
     def path(self):
         return "/reddits/"
+
+class DomainSR(FakeSubreddit):
+    @property
+    def path(self):
+        return '/domain/' + self.domain
+
+    def __init__(self, domain):
+        FakeSubreddit.__init__(self)
+        self.domain = domain
+        self.name = domain
+        self.title = domain + ' ' + _('on reddit.com')
+
+    def get_links(self, sort, time):
+        from r2.lib.db import queries
+        return queries.get_domain_links(self.domain, sort, time)

 Sub = SubSR()
 Friends = FriendsSR()
diff --git a/r2/r2/public/static/reddit.css b/r2/r2/public/static/reddit.css
index cc4b033f5..b04442c58 100644
--- a/r2/r2/public/static/reddit.css
+++ b/r2/r2/public/static/reddit.css
@@ -222,7 +222,7 @@ input[type=checkbox], input[type=radio] { margin-top: .4em; }
   padding: 2px 6px 1px 6px;
   background-color: white;
   border: 1px solid #5f99cf;
-  border-bottom: none;
+  border-bottom: 1px solid white;
 }

 #search {
@@ -588,6 +588,7 @@ before enabling */
   padding: 5px 10px;
   margin: 5px 310px 5px 0px;
   border: 1px solid orange;
+  font-size: small;
 }

 .menuarea {
@@ -958,8 +959,7 @@ a.star { text-decoration: none; color: #ff8b60 }
 .searchpane a { color: #369 }*/

 .searchpane {
-  margin: 5px;
-  margin-right: 310px;
+  margin: 5px 310px 5px 0px;
 }

 .searchpane #search input[type=text] {
 }
diff --git a/r2/r2/templates/base.html b/r2/r2/templates/base.html
index e9ecba136..e979dfc05 100644
--- a/r2/r2/templates/base.html
+++ b/r2/r2/templates/base.html
@@ -45,7 +45,7 @@
       var sr = {};
       var logged = ${c.user_is_loggedin and ("'%s'" % c.user.name) or "false"};
-      var post_site = "${c.site.name}";
+      var post_site = "${c.site.name if not c.default_sr else ''}";
       var cnameframe = ${'true' if c.cname else 'false'};
       var modhash = ${"'%s'" % c.modhash or "false"};
       var cur_domain = "${get_domain(cname = True, subreddit = False) if c.frameless_cname else g.domain}";
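Taken together: DomainSR is a FakeSubreddit whose listing comes from search rather than the database, which is why FakeSubreddit.title had to become an instance attribute, and why base.html stops emitting a post_site for fake subreddits. A trimmed, dependency-free mirror of the new class (the gettext _ call is dropped):

    class FakeSubreddit(object):
        def __init__(self):
            self.title = ''

    class DomainSR(FakeSubreddit):
        # a fake subreddit derived entirely from the requested domain
        def __init__(self, domain):
            FakeSubreddit.__init__(self)
            self.domain = domain
            self.name = domain
            self.title = domain + ' on reddit.com'

        @property
        def path(self):
            return '/domain/' + self.domain

    site = DomainSR('example.com')
    print(site.path, '|', site.title)
    # /domain/example.com | example.com on reddit.com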