From 973005d2df08720ca0dac4804870feb236d769ab Mon Sep 17 00:00:00 2001
From: Keith Mitchell
Date: Fri, 15 Jun 2012 16:37:16 -0700
Subject: [PATCH] Remove references to deprecated Solr index

---
 config/solr/schema.xml                 | 456 ----------------
 config/solr/server.xml                 | 387 --------------
 config/solr/solrconfig.xml             | 464 -----------------
 install-reddit.sh                      |   2 -
 r2/example.ini                         |   6 -
 r2/r2/controllers/front.py             |   4 +-
 r2/r2/controllers/listingcontroller.py |   3 +-
 r2/r2/controllers/reddit_base.py       |  11 +-
 r2/r2/lib/app_globals.py               |   1 -
 r2/r2/lib/contrib/pysolr.py            | 347 -------------
 r2/r2/lib/db/queries.py                |  10 -
 r2/r2/lib/solrsearch.py                | 692 ------------------------
 upstart/reddit-job-solrsearch.conf     |  13 -
 13 files changed, 3 insertions(+), 2393 deletions(-)
 delete mode 100644 config/solr/schema.xml
 delete mode 100644 config/solr/server.xml
 delete mode 100644 config/solr/solrconfig.xml
 delete mode 100644 r2/r2/lib/contrib/pysolr.py
 delete mode 100644 r2/r2/lib/solrsearch.py
 delete mode 100644 upstart/reddit-job-solrsearch.conf

diff --git a/config/solr/schema.xml b/config/solr/schema.xml
deleted file mode 100644
index ab4376050..000000000
--- a/config/solr/schema.xml
+++ /dev/null
@@ -1,456 +0,0 @@
[456 deleted lines of Solr schema XML; the markup was lost in this copy of the patch. The surviving fragments are the uniqueKey "fullname" and the default search field "contents".]
diff --git a/config/solr/server.xml b/config/solr/server.xml
deleted file mode 100644
index 8ea4525f6..000000000
--- a/config/solr/server.xml
+++ /dev/null
@@ -1,387 +0,0 @@
[387 deleted lines of Solr server configuration XML; the markup was lost in this copy of the patch.]
diff --git a/config/solr/solrconfig.xml b/config/solr/solrconfig.xml
deleted file mode 100644
index 4e41d249e..000000000
--- a/config/solr/solrconfig.xml
+++ /dev/null
@@ -1,464 +0,0 @@
[464 deleted lines of solrconfig.xml; the markup was lost in this copy of the patch. The surviving fragments (cache and merge settings, a dismax request handler with field boosts, facet examples, spellcheck, and a qt=standard&q=solrpingquery ping query) appear to match the stock Solr example configuration, with "contents" as the default search field.]
diff --git a/install-reddit.sh b/install-reddit.sh
index 377286baf..0f5211610 100755
--- a/install-reddit.sh
+++ b/install-reddit.sh
@@ -338,8 +338,6 @@ if [ !
-f /etc/cron.d/reddit ]; then # disabled by default, uncomment if you need these jobs #*/2 * * * * root /sbin/start --quiet reddit-job-google_checkout -#*/10 * * * * root /sbin/start --quiet reddit-job-solrsearch optimize=False -#0 0 * * * root /sbin/start --quiet reddit-job-solrsearch optimize=True #0 0 * * * root /sbin/start --quiet reddit-job-update_gold_users CRON fi diff --git a/r2/example.ini b/r2/example.ini index 6934226c1..5f5cc9114 100644 --- a/r2/example.ini +++ b/r2/example.ini @@ -354,12 +354,6 @@ png_optimizer = /usr/bin/env optipng # jpeg compressor jpeg_optimizer = -# -- search -- -# where is solor? -solr_url = -# how long do we cache search results (in seconds) -solr_cache_time = 300 - # Just a list of words. Used by errlog.py to make up names for new errors. words_file = /usr/dict/words diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py index c61571e7a..2bd94186a 100755 --- a/r2/r2/controllers/front.py +++ b/r2/r2/controllers/front.py @@ -40,8 +40,6 @@ from r2.lib.db.tdb_cassandra import MultiColumnQuery from r2.lib.strings import strings from r2.lib.search import (SearchQuery, SubredditSearchQuery, SearchException, InvalidQuery) -from r2.lib.solrsearch import RelatedSearchQuery -from r2.lib.contrib.pysolr import SolrError from r2.lib import jsontemplates from r2.lib import sup import r2.lib.db.thing as thing @@ -788,7 +786,7 @@ class FrontController(RedditController): # computed after fetch_more try: res = listing.listing() - except SearchException + (SolrError, socket.error) as e: + except SearchException + (socket.error,) as e: return self.search_fail(e) timing = time_module.time() - builder.start_time diff --git a/r2/r2/controllers/listingcontroller.py b/r2/r2/controllers/listingcontroller.py index eca1ff3b0..7fd5fcf40 100755 --- a/r2/r2/controllers/listingcontroller.py +++ b/r2/r2/controllers/listingcontroller.py @@ -36,7 +36,6 @@ from r2.lib.db.thing import Query, Merge, Relations from r2.lib.db import queries from r2.lib.strings import Score from r2.lib import organic -import r2.lib.solrsearch as solrsearch import r2.lib.search as search from r2.lib.utils import iters, check_cheating, timeago from r2.lib.utils.trial_utils import populate_spotlight @@ -130,7 +129,7 @@ class ListingController(RedditController): builder_cls = self.builder_cls elif isinstance(self.query_obj, Query): builder_cls = QueryBuilder - elif isinstance(self.query_obj, (solrsearch.SearchQuery, search.SearchQuery)): + elif isinstance(self.query_obj, search.SearchQuery): builder_cls = SearchBuilder elif isinstance(self.query_obj, iters): builder_cls = IDBuilder diff --git a/r2/r2/controllers/reddit_base.py b/r2/r2/controllers/reddit_base.py index 13515c70d..50dca5e38 100644 --- a/r2/r2/controllers/reddit_base.py +++ b/r2/r2/controllers/reddit_base.py @@ -931,17 +931,8 @@ class RedditController(MinimalController): abort(304, 'not modified') def search_fail(self, exception): - from r2.lib.contrib.pysolr import SolrError from r2.lib.search import SearchException - if isinstance(exception, SolrError): - errmsg = "SolrError: %r" % exception - - if (str(exception) == 'None'): - # Production error logs only get non-None errors - g.log.debug(errmsg) - else: - g.log.error(errmsg) - elif isinstance(exception, SearchException + (socket.error,)): + if isinstance(exception, SearchException + (socket.error,)): g.log.error("Search Error: %s" % repr(exception)) errpage = pages.RedditError(_("search failed"), diff --git a/r2/r2/lib/app_globals.py b/r2/r2/lib/app_globals.py index 
04b21045c..6f441bc04 100755 --- a/r2/r2/lib/app_globals.py +++ b/r2/r2/lib/app_globals.py @@ -47,7 +47,6 @@ class Globals(object): 'db_pool_size', 'db_pool_overflow_size', 'page_cache_time', - 'solr_cache_time', 'num_mc_clients', 'MIN_DOWN_LINK', 'MIN_UP_KARMA', diff --git a/r2/r2/lib/contrib/pysolr.py b/r2/r2/lib/contrib/pysolr.py deleted file mode 100644 index 46fd04ae4..000000000 --- a/r2/r2/lib/contrib/pysolr.py +++ /dev/null @@ -1,347 +0,0 @@ -# -*- coding: utf-8 -*- -""" -All we need to create a Solr connection is a url. - ->>> conn = Solr('http://127.0.0.1:8983/solr/') - -First, completely clear the index. - ->>> conn.delete(q='*:*') - -For now, we can only index python dictionaries. Each key in the dictionary -will correspond to a field in Solr. - ->>> docs = [ -... {'id': 'testdoc.1', 'order_i': 1, 'name': 'document 1', 'text': u'Paul Verlaine'}, -... {'id': 'testdoc.2', 'order_i': 2, 'name': 'document 2', 'text': u'Владимир Маякoвский'}, -... {'id': 'testdoc.3', 'order_i': 3, 'name': 'document 3', 'text': u'test'}, -... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'test'} -... ] - - -We can add documents to the index by passing a list of docs to the connection's -add method. - ->>> conn.add(docs) - ->>> results = conn.search('Verlaine') ->>> len(results) -1 - ->>> results = conn.search(u'Владимир') ->>> len(results) -1 - - -Simple tests for searching. We can optionally sort the results using Solr's -sort syntax, that is, the field name and either asc or desc. - ->>> results = conn.search('test', sort='order_i asc') ->>> for result in results: -... print result['name'] -document 3 -document 4 - ->>> results = conn.search('test', sort='order_i desc') ->>> for result in results: -... print result['name'] -document 4 -document 3 - - -To update documents, we just use the add method. - ->>> docs = [ -... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'blah'} -... ] ->>> conn.add(docs) - ->>> len(conn.search('blah')) -1 ->>> len(conn.search('test')) -1 - - -We can delete documents from the index by id, or by supplying a query. - ->>> conn.delete(id='testdoc.1') ->>> conn.delete(q='name:"document 2"') - ->>> results = conn.search('Verlaine') ->>> len(results) -0 - - -Docs can also have multiple values for any particular key. This lets us use -Solr's multiValue fields. - ->>> docs = [ -... {'id': 'testdoc.5', 'cat': ['poetry', 'science'], 'name': 'document 5', 'text': u''}, -... {'id': 'testdoc.6', 'cat': ['science-fiction',], 'name': 'document 6', 'text': u''}, -... ] - ->>> conn.add(docs) ->>> results = conn.search('cat:"poetry"') ->>> for result in results: -... print result['name'] -document 5 - ->>> results = conn.search('cat:"science-fiction"') ->>> for result in results: -... print result['name'] -document 6 - ->>> results = conn.search('cat:"science"') ->>> for result in results: -... print result['name'] -document 5 - -NOTE: PySolr is an open-source Python module - that falls under the New BSD -Licence , NOT the -licence covering the rest of Reddit. Reddit's modifications to this -module also fall under the New BSD Licence. The New BSD Licence -requires that re-distributions of the source, modified or not, display -the original copyright notice, but PySolr does not, as of import-time, -display a copyright notice or licence, except on its Google Code -information page. Therefore for licencing information, I point you to -PySolr's Google Code information page, URL above. - -""" - -# TODO: unicode support is pretty sloppy. define it better. 
- -from httplib import HTTPConnection -from urllib import urlencode -from urlparse import urlsplit -from datetime import datetime, date -from time import strptime, strftime -from r2.lib.utils import unicode_safe -try: - # for python 2.5 - from xml.etree import ElementTree - from xml.parsers.expat import ExpatError -except ImportError: - from elementtree import ElementTree,ExpatError - -__all__ = ['Solr'] - -class SolrError(Exception): - pass - -class Results(object): - def __init__(self, docs, hits): - self.docs = docs - self.hits = hits - - def __len__(self): - return len(self.docs) - - def __iter__(self): - return iter(self.docs) - - def __getitem__(self,x): - return self.docs[x] - -class Solr(object): - def __init__(self, url): - self.url = url - scheme, netloc, path, query, fragment = urlsplit(url) - netloc = netloc.split(':') - self.host = netloc[0] - if len(netloc) == 1: - self.host, self.port = netloc[0], None - else: - self.host, self.port = netloc - self.path = path.rstrip('/') - - def _select(self, params): - # encode the query as utf-8 so urlencode can handle it - params['q'] = unicode_safe(params['q']) - path = '%s/select/?%s' % (self.path, urlencode(params)) - conn = HTTPConnection(self.host, self.port) - conn.request('GET', path) - return conn.getresponse() - - def _update(self, message): - """ - Posts the given xml message to http://:/solr/update and - returns the result. - """ - path = '%s/update/' % self.path - conn = HTTPConnection(self.host, self.port) - conn.request('POST', path, message, {'Content-type': 'text/xml'}) - return conn.getresponse() - - def _extract_error(self, response): - """ - Extract the actual error message from a solr response. Unfortunately, - this means scraping the html. - """ - try: - et = ElementTree.parse(response) - error = et.findtext('body/pre') - return error - except ExpatError,e: - return "%s: %s (%d/%s)" % (e,response.read(),response.status,response.reason) - - # Converters ############################################################# - - @staticmethod - def _from_python(value): - """ - Converts python values to a form suitable for insertion into the xml - we send to solr. - """ - if isinstance(value, datetime): - value = value.strftime('%Y-%m-%dT%H:%M:%S.000Z') - elif isinstance(value, date): - value = value.strftime('%Y-%m-%dT00:00:00.000Z') - elif isinstance(value, bool): - if value: - value = 'true' - else: - value = 'false' - else: - value = unicode_safe(value) - return value - - def bool_to_python(self, value): - """ - Convert a 'bool' field from solr's xml format to python and return it. - """ - if value == 'true': - return True - elif value == 'false': - return False - - def str_to_python(self, value): - """ - Convert an 'str' field from solr's xml format to python and return it. - """ - return unicode_safe(value) - - def int_to_python(self, value): - """ - Convert an 'int' field from solr's xml format to python and return it. - """ - return int(value) - - def date_to_python(self, value): - """ - Convert a 'date' field from solr's xml format to python and return it. 
- """ - # this throws away fractions of a second - return datetime(*strptime(value[:-5], "%Y-%m-%dT%H:%M:%S")[0:6]) - - # API Methods ############################################################ - - def search(self, q, sort=None, start=0, rows=20, other_params = {}): - """Performs a search and returns the results.""" - params = {'q': q, 'start': start, 'rows': rows} - - for x,y in other_params.iteritems(): - params[x] = y - if sort: - params['sort'] = sort - - response = self._select(params) - if response.status != 200: - raise SolrError(self._extract_error(response)) - - # TODO: make result retrieval lazy and allow custom result objects - # also, this has become rather ugly and definitely needs some cleanup. - et = ElementTree.parse(response) - result = et.find('result') - hits = int(result.get('numFound')) - docs = result.findall('doc') - results = [] - for doc in docs: - result = {} - for element in doc.getchildren(): - if element.tag == 'arr': - result_val = [] - for array_element in element.getchildren(): - converter_name = '%s_to_python' % array_element.tag - converter = getattr(self, converter_name) - result_val.append(converter(array_element.text)) - else: - converter_name = '%s_to_python' % element.tag - converter = getattr(self, converter_name) - result_val = converter(element.text) - result[element.get('name')] = result_val - results.append(result) - return Results(results, hits) - - def add(self, docs, commit=False): - """Adds or updates documents. For now, docs is a list of dictionaies - where each key is the field name and each value is the value to index. - """ - message = ElementTree.Element('add') - for doc in docs: - message.append(doc_to_elemtree(doc)) - m = ElementTree.tostring(message) - response = self._update(m) - if response.status != 200: - raise SolrError(self._extract_error(response)) - # TODO: Supposedly, we can put a element in the same post body - # as the add element. That isn't working for some reason, and it would save us - # an extra trip to the server. This works for now. - if commit: - self.commit() - - def delete(self, id=None, q=None, commit=False): - """Deletes documents.""" - if id is None and q is None: - raise ValueError('You must specify "id" or "q".') - elif id is not None and q is not None: - raise ValueError('You many only specify "id" OR "q", not both.') - elif id is not None: - m = '%s' % id - elif q is not None: - m = '%s' % q - response = self._update(m) - if response.status != 200: - raise SolrError(self._extract_error(response)) - # TODO: Supposedly, we can put a element in the same post body - # as the delete element. That isn't working for some reason, and it would save us - # an extra trip to the server. This works for now. 
- if commit: - self.commit() - - def commit(self): - response = self._update('') - if response.status != 200: - raise SolrError(self._extract_error(response)) - - def optimize(self): - response = self._update('') - if response.status != 200: - raise SolrError(self._extract_error(response)) - -solr_magic_fields = ('boost',) -def doc_to_elemtree(doc): - d = ElementTree.Element('doc') - for key, value in doc.iteritems(): - - if key in solr_magic_fields: - # handle special fields that are attributes, not fields - d.set(key,Solr._from_python(value)) - elif (not isinstance(value,str)) and hasattr(value, '__iter__'): - # handle lists, tuples, and other iterabes - for v in value: - f = ElementTree.Element('field', name=key) - f.text = Solr._from_python(v) - d.append(f) - # handle strings and unicode - else: - f = ElementTree.Element('field', name=key) - f.text = Solr._from_python(value) - d.append(f) - - return d - - -if __name__ == "__main__": - import doctest - doctest.testmod() diff --git a/r2/r2/lib/db/queries.py b/r2/r2/lib/db/queries.py index b3a4d107e..66cd0e39d 100755 --- a/r2/r2/lib/db/queries.py +++ b/r2/r2/lib/db/queries.py @@ -5,7 +5,6 @@ from r2.lib.db.operators import asc, desc, timeago from r2.lib.db.sorts import epoch_seconds from r2.lib.utils import fetch_things2, tup, UniqueIterator, set_last_modified from r2.lib import utils -from r2.lib.solrsearch import DomainSearchQuery from r2.lib import amqp, sup, filters from r2.lib.comment_tree import add_comments, update_comment_votes from r2.models.query_cache import (cached_query, merged_cached_query, @@ -39,12 +38,6 @@ def db_sort(sort): cls, col = db_sorts[sort] return cls(col) -search_sort = dict(hot = 'hot desc', - new = 'date desc', - top = 'points desc', - controversial = 'controversy desc', - old = 'date asc') - db_times = dict(all = None, hour = Thing.c._date >= timeago('1 hour'), day = Thing.c._date >= timeago('1 day'), @@ -458,9 +451,6 @@ def get_modqueue(sr): q.append(get_spam_filtered_comments(sr)) return q -def get_domain_links_old(domain, sort, time): - return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time) - def get_domain_links(domain, sort, time): from r2.lib.db import operators q = Link._query(operators.domain(Link.c.url) == filters._force_utf8(domain), diff --git a/r2/r2/lib/solrsearch.py b/r2/r2/lib/solrsearch.py deleted file mode 100644 index 51d176d0d..000000000 --- a/r2/r2/lib/solrsearch.py +++ /dev/null @@ -1,692 +0,0 @@ -# The contents of this file are subject to the Common Public Attribution -# License Version 1.0. (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public -# License Version 1.1, but Sections 14 and 15 have been added to cover use of -# software over a computer network and provide for limited attribution for the -# Original Developer. In addition, Exhibit A has been modified to be consistent -# with Exhibit B. -# -# Software distributed under the License is distributed on an "AS IS" basis, -# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for -# the specific language governing rights and limitations under the License. -# -# The Original Code is Reddit. -# -# The Original Developer is the Initial Developer. The Initial Developer of the -# Original Code is CondeNet, Inc. -# -# All portions of the code written by CondeNet are Copyright (c) 2006-2010 -# CondeNet, Inc. All Rights Reserved. 
-################################################################################
-"""
-    Module for communication reddit-level communication with
-    Solr. Contains functions for indexing (`reindex_all`, `run_changed`)
-    and searching (`search_things`). Uses pysolr (placed in r2.lib)
-    for lower-level communication with Solr
-"""
-
-from __future__ import with_statement
-
-from Queue import Queue
-from threading import Thread
-import time
-from datetime import datetime, date
-from time import strftime
-
-from pylons import g, config
-
-from r2.models import *
-from r2.lib.contrib import pysolr
-from r2.lib.contrib.pysolr import SolrError
-from r2.lib.utils import timeago, UrlParser
-from r2.lib.utils import unicode_safe, tup, get_after, strordict_fullname
-from r2.lib.cache import SelfEmptyingCache
-from r2.lib import amqp
-
-solr_cache_time = g.solr_cache_time
-
-## Changes to the list of searchable languages will require changes to
-## Solr's configuration (specifically, the fields that are searched)
-searchable_langs = set(['dk','nl','en','fi','fr','de','it','no','nn','pt',
-                        'ru','es','sv','zh','ja','ko','cs','el','th'])
-
-## Adding types is a matter of adding the class to indexed_types here,
-## adding the fields from that type to search_fields below, and adding
-## those fields to Solr's configuration
-indexed_types = (Subreddit, Link)
-
-
-class Field(object):
-    """
-    Describes a field of a Thing that is searchable by Solr. Used
-    by `search_fields` below
-    """
-    def __init__(self, name, thing_attr_func = None, store = True,
-                 tokenize=False, is_number=False, reverse=False,
-                 is_date = False):
-        self.name = name
-        self.thing_attr_func = self.make_extractor(thing_attr_func)
-
-    def make_extractor(self,thing_attr_func):
-        if not thing_attr_func:
-            return self.make_extractor(self.name)
-        elif isinstance(thing_attr_func,str):
-            return (lambda x: getattr(x,thing_attr_func))
-        else:
-            return thing_attr_func
-
-    def extract_from(self,thing):
-        return self.thing_attr_func(thing)
-
-class ThingField(Field):
-    """
-    ThingField('field_name',Author,'author_id','name')
-      is like:
-    Field(name, lambda x: Author._byID(x.author_id,data=True).name)
-      but faster because lookups are done in batch
-    """
-    def __init__(self,name,cls,id_attr,lu_attr_name):
-        self.name = name
-
-        self.cls = cls # the class of the looked-up object
-        self.id_attr = id_attr # the attr of the source obj used to find the dest obj
-        self.lu_attr_name = lu_attr_name # the attr of the dest class that we want to return
-
-    def __str__(self):
-        return ("<ThingField: (%s,%s,%s,%s)>"
-                % (self.name,self.cls,self.id_attr,self.lu_attr_name))
-# Describes the fields of Thing objects and subclasses that are passed
-# to Solr for indexing. All must have a 'contents' field, since that
-# will be used for language-agnostic searching, and will be copied
-# into contents_en, contents_eo, et (see `tokenize_things` for a
-# discussion of multi-language search. 
The 'boost' field is a -# solr-magic field that ends up being an attribute on the -# message (rather than a field), and is used to do an index-time boost -# (this magic is done in pysolr.dor_to_elemtree) -search_fields={Thing: (Field('fullname', '_fullname'), - Field('date', '_date', is_date = True, reverse=True), - Field('lang'), - Field('ups', '_ups', is_number=True, reverse=True), - Field('downs', '_downs', is_number=True, reverse=True), - Field('spam','_spam'), - Field('deleted','_deleted'), - Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True), - Field('controversy', '_controversy', is_number=True, reverse=True), - Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)), - Subreddit: (Field('contents', - lambda s: ' '.join([unicode_safe(s.name), - unicode_safe(s.title), - unicode_safe(s.description), - unicode_safe(s.firsttext)]), - tokenize = True), - Field('boost', '_downs'), - #Field('title'), - #Field('firsttext'), - #Field('description'), - #Field('over_18'), - #Field('sr_type','type'), - ), - Link: (Field('contents','title', tokenize = True), - Field('boost', lambda t: int(t._hot*1000), - # yes, it's a copy of 'hot' - is_number=True, reverse=True), - Field('author_id'), - ThingField('author',Account,'author_id','name'), - ThingField('subreddit',Subreddit,'sr_id','name'), - #ThingField('reddit',Subreddit,'sr_id','name'), - Field('sr_id'), - Field('url', tokenize = True), - #Field('domain', - # lambda l: UrlParser(l.url).domain_permutations()), - Field('site', - lambda l: UrlParser(l.url).domain_permutations()), - #Field('is_self','is_self'), - ), - Comment: (Field('contents', 'body', tokenize = True), - Field('boost', lambda t: int(t._hot*1000), - # yes, it's a copy of 'hot' - is_number=True, reverse=True), - ThingField('author',Account,'author_id','name'), - ThingField('subreddit',Subreddit,'sr_id','name'))} - #ThingField('reddit',Subreddit,'sr_id','name'))} - -def strip_control_characters(text): - if not isinstance(text, basestring): - return text - return ''.join((c for c in text if ord(c) >= 0x20)) - -def tokenize_things(things,return_dict=False): - """ - Here, we take a list of things, and return a list of - dictionaries of fields, which will be sent to Solr. We take - the `search_fields` dictionary above, and look for all classes - for which each Thing is an instance (that is, a Comment will - pick up fields for Thing as well as Comment), and extract the - given fields. All tokenised Things are expected to have a - 'contents' attribute. That field is then copied to - contents_XX, where XX is your two-letter language code, which - becomes your default search field. 
Those language-specific - fields are also set up with the proper language-stemming and - tokenisers on Solr's end (in config/schema.xml), which allows - for language-specific searching - """ - global search_fields - - batched_classes = {} - ret = {} - for thing in things: - try: - t = {'type': []} - for cls in ((thing.__class__,) + thing.__class__.__bases__): - t['type'].append(cls.__name__.lower()) - - if cls in search_fields: - for field in search_fields[cls]: - if field.__class__ == Field: - try: - val = field.extract_from(thing) - val = strip_control_characters(val) - if val != None and val != '': - t[field.name] = val - except AttributeError,e: - print e - - elif field.__class__ == ThingField: - if not field.cls in batched_classes: - batched_classes[field.cls] = [] - batched_classes[field.cls].append((thing,field)) - - # copy 'contents' to ('contents_%s' % lang) and contents_ws - t[lang_to_fieldname(thing.lang)] = t['contents'] - t['contents_ws'] = t['contents'] - - ret[thing._fullname] = t - except AttributeError,e: - print e - except KeyError,e: - print e - - # batched_classes should now be a {cls: [(Thing,ThingField)]}. - # This ugliness is to make it possible to batch Thing lookups, as - # they were accounting for most of the indexing time - for cls in batched_classes: - ids = set() - for (thing,field) in batched_classes[cls]: - # extract the IDs - try: - id = getattr(thing,field.id_attr) - ids.add(id) - except AttributeError,e: - print e - found_batch = cls._byID(ids,data=True,return_dict=True) - - for (thing,field) in batched_classes[cls]: - try: - id = getattr(thing,field.id_attr) - ret[thing._fullname][field.name] = strip_control_characters( - getattr(found_batch[id],field.lu_attr_name)) - except AttributeError,e: - print e - except KeyError,e: - print e - - return ret if return_dict else ret.values() - -def lang_to_fieldname(l): - """ - Returns the field-name for the given language, or `contents` - if it isn't found - """ - global searchable_langs - - code = l[:2] - - if code in searchable_langs: - return ("contents_%s" % code) - else: - return "contents" - -def tokenize(thing): - return tokenize_things([thing]) - -def index_things(s=None,things=[]): - "Sends the given Things to Solr to be indexed" - tokenized = tokenize_things(things) - - if s: - s.add(tokenized) - else: - with SolrConnection(commit=True) as s: - s.add(tokenize_things(things)) - -def fetch_batches(t_class,size,since,until): - """ - Convenience function to fetch all Things of class t_class with - _date from `since` to `until`, returning them in batches of - `size`. TODO: move to lib/utils, and merge to be the backend - of `fetch_things` - """ - q=t_class._query(t_class.c._date >= since, - t_class.c._spam == (True,False), - t_class.c._deleted == (True,False), - t_class.c._date < until, - sort = desc('_date'), - limit = size, - data = True) - orig_rules = deepcopy(q._rules) - - things = list(q) - while things: - yield things - - q._rules = deepcopy(orig_rules) - q._after(things[len(things)-1]) - things = list(q) - -solr_queue=Queue() -for i in range(20): - solr_queue.put(pysolr.Solr(g.solr_url)) -class SolrConnection(object): - """ - Represents a connection to Solr, properly limited to N - concurrent connections. 
Used like - - with SolrConnection() as s: - s.add(things) - """ - def __init__(self,commit=False,optimize=False): - self.commit = commit - self.optimize = optimize - def __enter__(self): - self.conn = solr_queue.get() - return self.conn - def __exit__(self, _type, _value, _tb): - if self.commit: - self.conn.commit() - if self.optimize: - self.conn.optimize() - solr_queue.task_done() - solr_queue.put(self.conn) - -def indexer_worker(q,delete_all_first=False): - """ - The thread for mass-indexing that connects to Solr and submits - tokenised objects - """ - with SolrConnection(commit=True,optimize=True) as s: - count = 0 - - if delete_all_first: - s.delete(q='*:*') - - t = q.get() - while t != "done": - # if it's not a list or a dictionary, I don't know how to - # handle it, so die. It's probably an exception pushed in - # by the handler in my parent - if not (isinstance(t,list) and isinstance(t[0],dict)): - raise t - count += len(t) - s.add(t) - if count > 25000: - print "Committing... (q:%d)" % (q.qsize(),) - s.commit() - count = 0 - q.task_done() - - t=q.get() - q.task_done() - -def reindex_all(types = None, delete_all_first=False): - """ - Called from `paster run` to totally re-index everything in the - database. Spawns a thread to connect to Solr, and sends it - tokenised Things - """ - global indexed_types - - start_t = datetime.now() - - if not types: - types = indexed_types - - # We don't want the default thread-local cache (which is just a - # dict) to grow un-bounded (normally, we'd use - # utils.set_emptying_cache, except that that preserves memcached, - # and we don't even want to get memcached for total indexing, - # because it would dump out more recent stuff) - g.cache.caches = (SelfEmptyingCache(),) # + g.cache.caches[1:] - - count = 0 - q=Queue(100) - indexer=Thread(target=indexer_worker, - args=(q,delete_all_first)) - indexer.start() - - try: - for cls in types: - for batch in fetch_batches(cls,1000, - timeago("50 years"), - start_t): - r = tokenize_things([ x for x in batch - if not x._spam and not x._deleted ]) - - count += len(r) - print ("Processing %s #%d(%s): %s" - % (cls.__name__, count, q.qsize(), r[0]['contents'])) - - if indexer.isAlive(): - q.put(r) - else: - raise Exception("'tis a shame that I have but one thread to give") - q.put("done") - indexer.join() - - except object,e: - if indexer.isAlive(): - q.put(e,timeout=30) - raise e - except KeyboardInterrupt,e: # turns out KeyboardInterrupts aren't objects. Who knew? 
- if indexer.isAlive(): - q.put(e,timeout=30) - raise e - - -def combine_searchterms(terms): - """ - Convenience function to take a list like - [ sr_id:1, sr_id:2 sr_id:3 subreddit:reddit.com ] - and turn it into - sr_id:(1 2 3) OR subreddit:reddit.com - """ - combined = {} - - for (name,val) in terms: - combined[name] = combined.get(name,[]) + [val] - - ret = [] - - for (name,vals) in combined.iteritems(): - if len(vals) == 1: - ret.append("%s:%s" % (name,vals[0])) - else: - ret.append("%s:(%s)" % (name," ".join(vals))) - - if len(ret) > 1: - ret = "(%s)" % " OR ".join(ret) - else: - ret = " ".join(ret) - - return ret - -def swap_strings(s,this,that): - """ - Just swaps substrings, like: - s = "hot asc" - s = swap_strings(s,'asc','desc') - s == "hot desc" - - uses 'tmp' as a replacment string, so don't use for anything - very complicated - """ - return s.replace(this,'tmp').replace(that,this).replace('tmp',that) - -class SearchQuery(object): - def __init__(self, q, sort, fields = [], subreddits = [], authors = [], - types = [], timerange = None, spam = False, deleted = False): - - self.q = q - self.fields = fields - self.sort = sort - self.subreddits = subreddits - self.authors = authors - self.types = types - self.spam = spam - self.deleted = deleted - - if timerange in ['day','month','year']: - self.timerange = ('NOW-1%s/HOUR' % timerange.upper(),"NOW") - elif timerange == 'week': - self.timerange = ('NOW-7DAY/HOUR',"NOW") - elif timerange == 'hour': - self.timerange = ('NOW-1HOUR/MINUTE',"NOW") - elif timerange == 'all' or timerange is None: - self.timerange = None - else: - self.timerange = timerange - - def __repr__(self): - attrs = [ "***q=%s***" % self.q ] - - if self.subreddits is not None: - attrs.append("srs=" + '+'.join([ "%d" % s - for s in self.subreddits ])) - - if self.authors is not None: - attrs.append("authors=" + '+'.join([ "%d" % s - for s in self.authors ])) - - if self.timerange is not None: - attrs.append("timerange=%s" % str(self.timerange)) - - if self.sort is not None: - attrs.append("sort=%r" % self.sort) - - return "<%s(%s)>" % (self.__class__.__name__, ", ".join(attrs)) - - def run(self, after = None, num = 1000, reverse = False, - _update = False): - if not self.q: - return pysolr.Results([],0) - - if not g.solr_url: - raise SolrError("g.solr_url is not set") - - # there are two parts to our query: what the user typed - # (parsed with Solr's DisMax parser), and what we are adding - # to it. 
The latter is called the "boost" (and is parsed using - # full Lucene syntax), and it can be added to via the `boost` - # parameter - boost = [] - - if not self.spam: - boost.append("-spam:true") - if not self.deleted: - boost.append("-deleted:true") - - if self.timerange: - def time_to_searchstr(t): - if isinstance(t, datetime): - t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z') - elif isinstance(t, date): - t = t.strftime('%Y-%m-%dT00:00:00.000Z') - elif isinstance(t,str): - t = t - return t - - (fromtime, totime) = self.timerange - fromtime = time_to_searchstr(fromtime) - totime = time_to_searchstr(totime) - boost.append("+date:[%s TO %s]" - % (fromtime,totime)) - - if self.subreddits: - def subreddit_to_searchstr(sr): - if isinstance(sr,Subreddit): - return ('sr_id','%d' % sr.id) - elif isinstance(sr,str) or isinstance(sr,unicode): - return ('subreddit',sr) - else: - return ('sr_id','%d' % sr) - - s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits)) - - boost.append("+(%s)" % combine_searchterms(s_subreddits)) - - if self.authors: - def author_to_searchstr(a): - if isinstance(a,Account): - return ('author_id','%d' % a.id) - elif isinstance(a,str) or isinstance(a,unicode): - return ('author',a) - else: - return ('author_id','%d' % a) - - s_authors = map(author_to_searchstr,tup(self.authors)) - - boost.append('+(%s)^2' % combine_searchterms(s_authors)) - - - def type_to_searchstr(t): - if isinstance(t,str): - return ('type',t) - else: - return ('type',t.__name__.lower()) - - s_types = map(type_to_searchstr,self.types) - boost.append("+%s" % combine_searchterms(s_types)) - - q,solr_params = self.solr_params(self.q,boost) - - search = self.run_search(q, self.sort, solr_params, - reverse, after, num, - _update = _update) - return search - - @classmethod - def run_search(cls, q, sort, solr_params, reverse, after, num, - _update = False): - "returns pysolr.Results(docs=[fullname()],hits=int())" - - if reverse: - sort = swap_strings(sort,'asc','desc') - after = after._fullname if after else None - - search = cls.run_search_cached(q, sort, 0, num, solr_params, - _update = _update) - search.docs = get_after(search.docs, after, num) - - return search - - @staticmethod - @memoize('solr_search', solr_cache_time) - def run_search_cached(q, sort, start, rows, other_params): - with SolrConnection() as s: - g.log.debug(("Searching q = %r; sort = %r," - + " start = %r, rows = %r," - + " params = %r") - % (q,sort,start,rows,other_params)) - - res = s.search(q, sort, start = start, rows = rows, - other_params = other_params) - - # extract out the fullname in the 'docs' field, since that's - # all we care about - res = pysolr.Results(docs = [ i['fullname'] for i in res.docs ], - hits = res.hits) - - return res - - def solr_params(self,*k,**kw): - raise NotImplementedError - -class UserSearchQuery(SearchQuery): - "Base class for queries that use the dismax parser" - def __init__(self, q, mm, sort=None, fields=[], langs=None, **kw): - default_fields = ['contents^1.5','contents_ws^3'] + fields - - if langs is None: - fields = default_fields - else: - if langs == 'all': - langs = searchable_langs - fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs] - + default_fields) - - # minimum match. 
See http://lucene.apache.org/solr/api/org/apache/solr/util/doc-files/min-should-match.html - self.mm = mm - - SearchQuery.__init__(self, q, sort, fields = fields, **kw) - - def solr_params(self, q, boost): - return q, dict(fl = 'fullname', - qt = 'dismax', - bq = ' '.join(boost), - qf = ' '.join(self.fields), - mm = self.mm) - -class LinkSearchQuery(UserSearchQuery): - def __init__(self, q, mm = None, **kw): - additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1'] - - if mm is None: - mm = '4<75%' - - UserSearchQuery.__init__(self, q, mm = mm, fields = additional_fields, - types=[Link], **kw) - -class RelatedSearchQuery(LinkSearchQuery): - def __init__(self, q, ignore = [], **kw): - self.ignore = set(ignore) if ignore else set() - - LinkSearchQuery.__init__(self, q, mm = '3<100% 5<60% 8<50%', **kw) - - def run(self, *k, **kw): - search = LinkSearchQuery.run(self, *k, **kw) - search.docs = [ x for x in search.docs if x not in self.ignore ] - return search - -class SubredditSearchQuery(UserSearchQuery): - def __init__(self, q, **kw): - # note that 'downs' is a measure of activity on subreddits - UserSearchQuery.__init__(self, q, mm = '75%', sort = 'downs desc', - types=[Subreddit], **kw) - -class DomainSearchQuery(SearchQuery): - def __init__(self, domain, **kw): - q = '+site:%s' % domain - - SearchQuery.__init__(self, q = q, fields=['site'],types=[Link], **kw) - - def solr_params(self, q, boost): - q = q + ' ' + ' '.join(boost) - return q, dict(fl='fullname', - qt='standard') - -def run_commit(optimize=False): - with SolrConnection(commit=True, optimize=optimize) as s: - pass - - -def run_changed(drain=False): - """ - Run by `cron` (through `paster run`) on a schedule to update - all Things that have been created or have changed since the - last run. Note: unlike many queue-using functions, this one is - run from cron and totally drains the queue before terminating - """ - @g.stats.amqp_processor('solrsearch_changes') - def _run_changed(msgs, chan): - print "changed: Processing %d items" % len(msgs) - msgs = [strordict_fullname(msg.body) - for msg in msgs] - fullnames = set(msg['fullname'] for msg in msgs if not msg.get('boost_only')) - - things = Thing._by_fullname(fullnames, data=True, return_dict=False) - things = [x for x in things if isinstance(x, indexed_types)] - - update_things = [x for x in things if not x._spam and not x._deleted] - delete_things = [x for x in things if x._spam or x._deleted] - - with SolrConnection() as s: - if update_things: - tokenized = tokenize_things(update_things) - s.add(tokenized) - if delete_things: - for i in delete_things: - s.delete(id=i._fullname) - - amqp.handle_items('solrsearch_changes', _run_changed, limit=1000, - drain=drain) diff --git a/upstart/reddit-job-solrsearch.conf b/upstart/reddit-job-solrsearch.conf deleted file mode 100644 index 14f9876eb..000000000 --- a/upstart/reddit-job-solrsearch.conf +++ /dev/null @@ -1,13 +0,0 @@ -description "commit/optimize solr index" - -instance $optimize - -manual -task - -nice 10 - -script - . /etc/default/reddit - wrap-job paster run $REDDIT_INI -c "from r2.lib import solrsearch; solrsearch.run_commit(optimize=$optimize)" -end script
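
Note on the exception handling this patch keeps: the surviving clauses in front.py and reddit_base.py read `except SearchException + (socket.error,) as e:`, which only works because `SearchException` (imported from r2.lib.search) is a tuple of exception classes rather than a single class, so it can be concatenated with `(socket.error,)` and handed to one except clause. A minimal sketch of that pattern follows; the stand-in exception classes and the run_listing helper are hypothetical illustrations, not reddit's actual r2.lib.search contents:

    import logging
    import socket

    # Hypothetical stand-ins; in reddit, r2.lib.search defines the real set.
    class InvalidQuery(Exception):
        pass

    class SearchHTTPError(Exception):
        pass

    # A tuple, not a class, so callers can extend it inline in an except clause.
    SearchException = (InvalidQuery, SearchHTTPError)

    def run_listing(listing):
        try:
            return listing.listing()
        except SearchException + (socket.error,) as e:
            # One clause covers the search module's own errors plus
            # low-level socket failures, mirroring the patched controllers.
            logging.error("search failed: %r", e)
            return None

If SearchException were a plain exception class, the expression SearchException + (socket.error,) would raise a TypeError as soon as the handler was evaluated, which is why the patch rewrites the tuple rather than simply dropping SolrError from a list of caught exceptions.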