diff --git a/config/solr/schema.xml b/config/solr/schema.xml
deleted file mode 100644
index ab4376050..000000000
--- a/config/solr/schema.xml
+++ /dev/null
@@ -1,456 +0,0 @@
- [456-line Solr schema.xml: the XML markup was lost in extraction. The only element
-  values that survive are "fullname" and "contents" (by position, most likely the
-  uniqueKey and the default search field that r2.lib.solrsearch, removed below,
-  indexes into).]
diff --git a/config/solr/server.xml b/config/solr/server.xml
deleted file mode 100644
index 8ea4525f6..000000000
--- a/config/solr/server.xml
+++ /dev/null
@@ -1,387 +0,0 @@
- [387-line config/solr/server.xml: all element content was lost in extraction;
-  nothing recoverable remains.]
diff --git a/config/solr/solrconfig.xml b/config/solr/solrconfig.xml
deleted file mode 100644
index 4e41d249e..000000000
--- a/config/solr/solrconfig.xml
+++ /dev/null
@@ -1,464 +0,0 @@
- [464-line solrconfig.xml: the XML markup was lost in extraction. Surviving element
-  values include ${solr.abortOnConfigurationError:true}; index-tuning numbers (false,
-  10, 1000, 2147483647, 10000); a cache size of 1024; dismax request-handler defaults
-  that appear to be the stock Solr examples ("text^0.5 features^1.0 name^1.2 sku^1.5
-  id^10.0", "2<-1 5<-2 6<90%", "incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2",
-  "inStock:true", facets on cat / manu_exact and the price ranges [* TO 500] /
-  [500 TO *]); the value "contents"; a spellchecker on field "spell"; admin
-  gettableFiles "solrconfig.xml schema.xml admin-extra.html"; and the healthcheck
-  query "qt=standard&q=solrpingquery".]
diff --git a/install-reddit.sh b/install-reddit.sh
index 377286baf..0f5211610 100755
--- a/install-reddit.sh
+++ b/install-reddit.sh
@@ -338,8 +338,6 @@ if [ ! -f /etc/cron.d/reddit ]; then
# disabled by default, uncomment if you need these jobs
#*/2 * * * * root /sbin/start --quiet reddit-job-google_checkout
-#*/10 * * * * root /sbin/start --quiet reddit-job-solrsearch optimize=False
-#0 0 * * * root /sbin/start --quiet reddit-job-solrsearch optimize=True
#0 0 * * * root /sbin/start --quiet reddit-job-update_gold_users
CRON
fi
diff --git a/r2/example.ini b/r2/example.ini
index 6934226c1..5f5cc9114 100644
--- a/r2/example.ini
+++ b/r2/example.ini
@@ -354,12 +354,6 @@ png_optimizer = /usr/bin/env optipng
# jpeg compressor
jpeg_optimizer =
-# -- search --
-# where is solr?
-solr_url =
-# how long do we cache search results (in seconds)
-solr_cache_time = 300
-
# Just a list of words. Used by errlog.py to make up names for new errors.
words_file = /usr/dict/words
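
The two settings removed here fed the app globals: the app_globals.py hunk further down drops 'solr_cache_time' from the list of options coerced to int, and the deleted r2.lib.solrsearch reads g.solr_url and g.solr_cache_time. A simplified sketch of that flow (the class and attribute names are illustrative, not reddit's actual Globals implementation):

```python
# Illustrative only: roughly how an ini setting such as solr_cache_time ends
# up as an int attribute on the global `g` object that solrsearch.py reads.
class Globals(object):
    int_props = ('page_cache_time', 'solr_cache_time')  # cf. the app_globals.py hunk

    def __init__(self, global_conf):
        for key, value in global_conf.items():
            if key in self.int_props:
                value = int(value)
            setattr(self, key, value)

g = Globals({'solr_url': 'http://127.0.0.1:8983/solr/',
             'solr_cache_time': '300'})
print(g.solr_url, g.solr_cache_time)   # http://127.0.0.1:8983/solr/ 300
```
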
diff --git a/r2/r2/controllers/front.py b/r2/r2/controllers/front.py
index c61571e7a..2bd94186a 100755
--- a/r2/r2/controllers/front.py
+++ b/r2/r2/controllers/front.py
@@ -40,8 +40,6 @@ from r2.lib.db.tdb_cassandra import MultiColumnQuery
from r2.lib.strings import strings
from r2.lib.search import (SearchQuery, SubredditSearchQuery, SearchException,
InvalidQuery)
-from r2.lib.solrsearch import RelatedSearchQuery
-from r2.lib.contrib.pysolr import SolrError
from r2.lib import jsontemplates
from r2.lib import sup
import r2.lib.db.thing as thing
@@ -788,7 +786,7 @@ class FrontController(RedditController):
# computed after fetch_more
try:
res = listing.listing()
- except SearchException + (SolrError, socket.error) as e:
+ except SearchException + (socket.error,) as e:
return self.search_fail(e)
timing = time_module.time() - builder.start_time
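
The `except SearchException + (socket.error,)` form kept on the new line only parses as intended if r2.lib.search exposes SearchException as a tuple of exception classes rather than a single class; that definition is outside this diff. A minimal standalone sketch of the pattern, with hypothetical exception types:

```python
# Sketch only: `except` accepts a tuple of exception classes, so a module can
# export its error types as a tuple and callers can extend it by concatenation.
import socket

class InvalidQuery(Exception):
    pass

class SearchTimeout(Exception):
    pass

# hypothetical stand-in for r2.lib.search.SearchException
SearchException = (InvalidQuery, SearchTimeout)

def run_listing(listing):
    try:
        return listing()
    except SearchException + (socket.error,) as e:
        print("search failed: %r" % (e,))
        return None
```
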
diff --git a/r2/r2/controllers/listingcontroller.py b/r2/r2/controllers/listingcontroller.py
index eca1ff3b0..7fd5fcf40 100755
--- a/r2/r2/controllers/listingcontroller.py
+++ b/r2/r2/controllers/listingcontroller.py
@@ -36,7 +36,6 @@ from r2.lib.db.thing import Query, Merge, Relations
from r2.lib.db import queries
from r2.lib.strings import Score
from r2.lib import organic
-import r2.lib.solrsearch as solrsearch
import r2.lib.search as search
from r2.lib.utils import iters, check_cheating, timeago
from r2.lib.utils.trial_utils import populate_spotlight
@@ -130,7 +129,7 @@ class ListingController(RedditController):
builder_cls = self.builder_cls
elif isinstance(self.query_obj, Query):
builder_cls = QueryBuilder
- elif isinstance(self.query_obj, (solrsearch.SearchQuery, search.SearchQuery)):
+ elif isinstance(self.query_obj, search.SearchQuery):
builder_cls = SearchBuilder
elif isinstance(self.query_obj, iters):
builder_cls = IDBuilder
diff --git a/r2/r2/controllers/reddit_base.py b/r2/r2/controllers/reddit_base.py
index 13515c70d..50dca5e38 100644
--- a/r2/r2/controllers/reddit_base.py
+++ b/r2/r2/controllers/reddit_base.py
@@ -931,17 +931,8 @@ class RedditController(MinimalController):
abort(304, 'not modified')
def search_fail(self, exception):
- from r2.lib.contrib.pysolr import SolrError
from r2.lib.search import SearchException
- if isinstance(exception, SolrError):
- errmsg = "SolrError: %r" % exception
-
- if (str(exception) == 'None'):
- # Production error logs only get non-None errors
- g.log.debug(errmsg)
- else:
- g.log.error(errmsg)
- elif isinstance(exception, SearchException + (socket.error,)):
+ if isinstance(exception, SearchException + (socket.error,)):
g.log.error("Search Error: %s" % repr(exception))
errpage = pages.RedditError(_("search failed"),
diff --git a/r2/r2/lib/app_globals.py b/r2/r2/lib/app_globals.py
index 04b21045c..6f441bc04 100755
--- a/r2/r2/lib/app_globals.py
+++ b/r2/r2/lib/app_globals.py
@@ -47,7 +47,6 @@ class Globals(object):
'db_pool_size',
'db_pool_overflow_size',
'page_cache_time',
- 'solr_cache_time',
'num_mc_clients',
'MIN_DOWN_LINK',
'MIN_UP_KARMA',
diff --git a/r2/r2/lib/contrib/pysolr.py b/r2/r2/lib/contrib/pysolr.py
deleted file mode 100644
index 46fd04ae4..000000000
--- a/r2/r2/lib/contrib/pysolr.py
+++ /dev/null
@@ -1,347 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-All we need to create a Solr connection is a url.
-
->>> conn = Solr('http://127.0.0.1:8983/solr/')
-
-First, completely clear the index.
-
->>> conn.delete(q='*:*')
-
-For now, we can only index python dictionaries. Each key in the dictionary
-will correspond to a field in Solr.
-
->>> docs = [
-... {'id': 'testdoc.1', 'order_i': 1, 'name': 'document 1', 'text': u'Paul Verlaine'},
-... {'id': 'testdoc.2', 'order_i': 2, 'name': 'document 2', 'text': u'Владимир Маякoвский'},
-... {'id': 'testdoc.3', 'order_i': 3, 'name': 'document 3', 'text': u'test'},
-... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'test'}
-... ]
-
-
-We can add documents to the index by passing a list of docs to the connection's
-add method.
-
->>> conn.add(docs)
-
->>> results = conn.search('Verlaine')
->>> len(results)
-1
-
->>> results = conn.search(u'Владимир')
->>> len(results)
-1
-
-
-Simple tests for searching. We can optionally sort the results using Solr's
-sort syntax, that is, the field name and either asc or desc.
-
->>> results = conn.search('test', sort='order_i asc')
->>> for result in results:
-... print result['name']
-document 3
-document 4
-
->>> results = conn.search('test', sort='order_i desc')
->>> for result in results:
-... print result['name']
-document 4
-document 3
-
-
-To update documents, we just use the add method.
-
->>> docs = [
-... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'blah'}
-... ]
->>> conn.add(docs)
-
->>> len(conn.search('blah'))
-1
->>> len(conn.search('test'))
-1
-
-
-We can delete documents from the index by id, or by supplying a query.
-
->>> conn.delete(id='testdoc.1')
->>> conn.delete(q='name:"document 2"')
-
->>> results = conn.search('Verlaine')
->>> len(results)
-0
-
-
-Docs can also have multiple values for any particular key. This lets us use
-Solr's multiValue fields.
-
->>> docs = [
-... {'id': 'testdoc.5', 'cat': ['poetry', 'science'], 'name': 'document 5', 'text': u''},
-... {'id': 'testdoc.6', 'cat': ['science-fiction',], 'name': 'document 6', 'text': u''},
-... ]
-
->>> conn.add(docs)
->>> results = conn.search('cat:"poetry"')
->>> for result in results:
-... print result['name']
-document 5
-
->>> results = conn.search('cat:"science-fiction"')
->>> for result in results:
-... print result['name']
-document 6
-
->>> results = conn.search('cat:"science"')
->>> for result in results:
-... print result['name']
-document 5
-
-NOTE: PySolr is an open-source Python module that falls under the New BSD
-Licence, NOT the
-licence covering the rest of Reddit. Reddit's modifications to this
-module also fall under the New BSD Licence. The New BSD Licence
-requires that re-distributions of the source, modified or not, display
-the original copyright notice, but PySolr does not, as of import-time,
-display a copyright notice or licence, except on its Google Code
-information page. Therefore for licencing information, I point you to
-PySolr's Google Code information page.
-
-"""
-
-# TODO: unicode support is pretty sloppy. define it better.
-
-from httplib import HTTPConnection
-from urllib import urlencode
-from urlparse import urlsplit
-from datetime import datetime, date
-from time import strptime, strftime
-from r2.lib.utils import unicode_safe
-try:
- # for python 2.5
- from xml.etree import ElementTree
- from xml.parsers.expat import ExpatError
-except ImportError:
- from elementtree import ElementTree,ExpatError
-
-__all__ = ['Solr']
-
-class SolrError(Exception):
- pass
-
-class Results(object):
- def __init__(self, docs, hits):
- self.docs = docs
- self.hits = hits
-
- def __len__(self):
- return len(self.docs)
-
- def __iter__(self):
- return iter(self.docs)
-
- def __getitem__(self,x):
- return self.docs[x]
-
-class Solr(object):
- def __init__(self, url):
- self.url = url
- scheme, netloc, path, query, fragment = urlsplit(url)
- netloc = netloc.split(':')
- self.host = netloc[0]
- if len(netloc) == 1:
- self.host, self.port = netloc[0], None
- else:
- self.host, self.port = netloc
- self.path = path.rstrip('/')
-
- def _select(self, params):
- # encode the query as utf-8 so urlencode can handle it
- params['q'] = unicode_safe(params['q'])
- path = '%s/select/?%s' % (self.path, urlencode(params))
- conn = HTTPConnection(self.host, self.port)
- conn.request('GET', path)
- return conn.getresponse()
-
- def _update(self, message):
- """
-        Posts the given xml message to http://<host>:<port>/solr/update and
- returns the result.
- """
- path = '%s/update/' % self.path
- conn = HTTPConnection(self.host, self.port)
- conn.request('POST', path, message, {'Content-type': 'text/xml'})
- return conn.getresponse()
-
- def _extract_error(self, response):
- """
- Extract the actual error message from a solr response. Unfortunately,
- this means scraping the html.
- """
- try:
- et = ElementTree.parse(response)
- error = et.findtext('body/pre')
- return error
- except ExpatError,e:
- return "%s: %s (%d/%s)" % (e,response.read(),response.status,response.reason)
-
- # Converters #############################################################
-
- @staticmethod
- def _from_python(value):
- """
- Converts python values to a form suitable for insertion into the xml
- we send to solr.
- """
- if isinstance(value, datetime):
- value = value.strftime('%Y-%m-%dT%H:%M:%S.000Z')
- elif isinstance(value, date):
- value = value.strftime('%Y-%m-%dT00:00:00.000Z')
- elif isinstance(value, bool):
- if value:
- value = 'true'
- else:
- value = 'false'
- else:
- value = unicode_safe(value)
- return value
-
- def bool_to_python(self, value):
- """
- Convert a 'bool' field from solr's xml format to python and return it.
- """
- if value == 'true':
- return True
- elif value == 'false':
- return False
-
- def str_to_python(self, value):
- """
- Convert an 'str' field from solr's xml format to python and return it.
- """
- return unicode_safe(value)
-
- def int_to_python(self, value):
- """
- Convert an 'int' field from solr's xml format to python and return it.
- """
- return int(value)
-
- def date_to_python(self, value):
- """
- Convert a 'date' field from solr's xml format to python and return it.
- """
- # this throws away fractions of a second
- return datetime(*strptime(value[:-5], "%Y-%m-%dT%H:%M:%S")[0:6])
-
- # API Methods ############################################################
-
- def search(self, q, sort=None, start=0, rows=20, other_params = {}):
- """Performs a search and returns the results."""
- params = {'q': q, 'start': start, 'rows': rows}
-
- for x,y in other_params.iteritems():
- params[x] = y
- if sort:
- params['sort'] = sort
-
- response = self._select(params)
- if response.status != 200:
- raise SolrError(self._extract_error(response))
-
- # TODO: make result retrieval lazy and allow custom result objects
- # also, this has become rather ugly and definitely needs some cleanup.
- et = ElementTree.parse(response)
- result = et.find('result')
- hits = int(result.get('numFound'))
- docs = result.findall('doc')
- results = []
- for doc in docs:
- result = {}
- for element in doc.getchildren():
- if element.tag == 'arr':
- result_val = []
- for array_element in element.getchildren():
- converter_name = '%s_to_python' % array_element.tag
- converter = getattr(self, converter_name)
- result_val.append(converter(array_element.text))
- else:
- converter_name = '%s_to_python' % element.tag
- converter = getattr(self, converter_name)
- result_val = converter(element.text)
- result[element.get('name')] = result_val
- results.append(result)
- return Results(results, hits)
-
- def add(self, docs, commit=False):
-        """Adds or updates documents. For now, docs is a list of dictionaries
- where each key is the field name and each value is the value to index.
- """
- message = ElementTree.Element('add')
- for doc in docs:
- message.append(doc_to_elemtree(doc))
- m = ElementTree.tostring(message)
- response = self._update(m)
- if response.status != 200:
- raise SolrError(self._extract_error(response))
-        # TODO: Supposedly, we can put a <commit/> element in the same post body
-        # as the <add> element. That isn't working for some reason, and it would save us
- # an extra trip to the server. This works for now.
- if commit:
- self.commit()
-
- def delete(self, id=None, q=None, commit=False):
- """Deletes documents."""
- if id is None and q is None:
- raise ValueError('You must specify "id" or "q".')
- elif id is not None and q is not None:
- raise ValueError('You many only specify "id" OR "q", not both.')
- elif id is not None:
-            m = '<delete><id>%s</id></delete>' % id
- elif q is not None:
-            m = '<delete><query>%s</query></delete>' % q
- response = self._update(m)
- if response.status != 200:
- raise SolrError(self._extract_error(response))
-        # TODO: Supposedly, we can put a <commit/> element in the same post body
-        # as the <delete> element. That isn't working for some reason, and it would save us
- # an extra trip to the server. This works for now.
- if commit:
- self.commit()
-
- def commit(self):
-        response = self._update('<commit />')
- if response.status != 200:
- raise SolrError(self._extract_error(response))
-
- def optimize(self):
-        response = self._update('<optimize />')
- if response.status != 200:
- raise SolrError(self._extract_error(response))
-
-solr_magic_fields = ('boost',)
-def doc_to_elemtree(doc):
- d = ElementTree.Element('doc')
- for key, value in doc.iteritems():
-
- if key in solr_magic_fields:
- # handle special fields that are attributes, not fields
- d.set(key,Solr._from_python(value))
- elif (not isinstance(value,str)) and hasattr(value, '__iter__'):
-            # handle lists, tuples, and other iterables
- for v in value:
- f = ElementTree.Element('field', name=key)
- f.text = Solr._from_python(v)
- d.append(f)
- # handle strings and unicode
- else:
- f = ElementTree.Element('field', name=key)
- f.text = Solr._from_python(value)
- d.append(f)
-
- return d
-
-
-if __name__ == "__main__":
- import doctest
- doctest.testmod()
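
For orientation, this is roughly the `<add>` message that the deleted module's `add()` and `doc_to_elemtree()` build and POST to Solr's /solr/update handler. It is a simplified sketch using only the standard library; the document values are illustrative (the 'fullname' and 'contents' field names come from r2.lib.solrsearch, removed below).

```python
# Simplified sketch of the XML update message pysolr builds; not a drop-in
# replacement for the removed code.
from xml.etree import ElementTree

def doc_to_xml(doc, magic_fields=('boost',)):
    d = ElementTree.Element('doc')
    for key, value in doc.items():
        if key in magic_fields:
            d.set(key, str(value))          # index-time boost: an attribute, not a field
        elif not isinstance(value, str) and hasattr(value, '__iter__'):
            for v in value:                 # multiValued fields repeat <field>
                ElementTree.SubElement(d, 'field', name=key).text = str(v)
        else:
            ElementTree.SubElement(d, 'field', name=key).text = str(value)
    return d

add = ElementTree.Element('add')
add.append(doc_to_xml({'fullname': 't3_example', 'contents': 'hello world',
                       'type': ['link', 'thing'], 'boost': 2}))
print(ElementTree.tostring(add).decode())
# e.g. <add><doc boost="2"><field name="fullname">t3_example</field>...</doc></add>
```
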
diff --git a/r2/r2/lib/db/queries.py b/r2/r2/lib/db/queries.py
index b3a4d107e..66cd0e39d 100755
--- a/r2/r2/lib/db/queries.py
+++ b/r2/r2/lib/db/queries.py
@@ -5,7 +5,6 @@ from r2.lib.db.operators import asc, desc, timeago
from r2.lib.db.sorts import epoch_seconds
from r2.lib.utils import fetch_things2, tup, UniqueIterator, set_last_modified
from r2.lib import utils
-from r2.lib.solrsearch import DomainSearchQuery
from r2.lib import amqp, sup, filters
from r2.lib.comment_tree import add_comments, update_comment_votes
from r2.models.query_cache import (cached_query, merged_cached_query,
@@ -39,12 +38,6 @@ def db_sort(sort):
cls, col = db_sorts[sort]
return cls(col)
-search_sort = dict(hot = 'hot desc',
- new = 'date desc',
- top = 'points desc',
- controversial = 'controversy desc',
- old = 'date asc')
-
db_times = dict(all = None,
hour = Thing.c._date >= timeago('1 hour'),
day = Thing.c._date >= timeago('1 day'),
@@ -458,9 +451,6 @@ def get_modqueue(sr):
q.append(get_spam_filtered_comments(sr))
return q
-def get_domain_links_old(domain, sort, time):
- return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)
-
def get_domain_links(domain, sort, time):
from r2.lib.db import operators
q = Link._query(operators.domain(Link.c.url) == filters._force_utf8(domain),
diff --git a/r2/r2/lib/solrsearch.py b/r2/r2/lib/solrsearch.py
deleted file mode 100644
index 51d176d0d..000000000
--- a/r2/r2/lib/solrsearch.py
+++ /dev/null
@@ -1,692 +0,0 @@
-# The contents of this file are subject to the Common Public Attribution
-# License Version 1.0. (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
-# License Version 1.1, but Sections 14 and 15 have been added to cover use of
-# software over a computer network and provide for limited attribution for the
-# Original Developer. In addition, Exhibit A has been modified to be consistent
-# with Exhibit B.
-#
-# Software distributed under the License is distributed on an "AS IS" basis,
-# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
-# the specific language governing rights and limitations under the License.
-#
-# The Original Code is Reddit.
-#
-# The Original Developer is the Initial Developer. The Initial Developer of the
-# Original Code is CondeNet, Inc.
-#
-# All portions of the code written by CondeNet are Copyright (c) 2006-2010
-# CondeNet, Inc. All Rights Reserved.
-################################################################################
-"""
-    Module for reddit-level communication with
- Solr. Contains functions for indexing (`reindex_all`, `run_changed`)
- and searching (`search_things`). Uses pysolr (placed in r2.lib)
- for lower-level communication with Solr
-"""
-
-from __future__ import with_statement
-
-from Queue import Queue
-from threading import Thread
-import time
-from datetime import datetime, date
-from time import strftime
-
-from pylons import g, config
-
-from r2.models import *
-from r2.lib.contrib import pysolr
-from r2.lib.contrib.pysolr import SolrError
-from r2.lib.utils import timeago, UrlParser
-from r2.lib.utils import unicode_safe, tup, get_after, strordict_fullname
-from r2.lib.cache import SelfEmptyingCache
-from r2.lib import amqp
-
-solr_cache_time = g.solr_cache_time
-
-## Changes to the list of searchable languages will require changes to
-## Solr's configuration (specifically, the fields that are searched)
-searchable_langs = set(['dk','nl','en','fi','fr','de','it','no','nn','pt',
- 'ru','es','sv','zh','ja','ko','cs','el','th'])
-
-## Adding types is a matter of adding the class to indexed_types here,
-## adding the fields from that type to search_fields below, and adding
-## those fields to Solr's configuration
-indexed_types = (Subreddit, Link)
-
-
-class Field(object):
- """
- Describes a field of a Thing that is searchable by Solr. Used
- by `search_fields` below
- """
- def __init__(self, name, thing_attr_func = None, store = True,
- tokenize=False, is_number=False, reverse=False,
- is_date = False):
- self.name = name
- self.thing_attr_func = self.make_extractor(thing_attr_func)
-
- def make_extractor(self,thing_attr_func):
- if not thing_attr_func:
- return self.make_extractor(self.name)
- elif isinstance(thing_attr_func,str):
- return (lambda x: getattr(x,thing_attr_func))
- else:
- return thing_attr_func
-
- def extract_from(self,thing):
- return self.thing_attr_func(thing)
-
-class ThingField(Field):
- """
- ThingField('field_name',Author,'author_id','name')
- is like:
- Field(name, lambda x: Author._byID(x.author_id,data=True).name)
- but faster because lookups are done in batch
- """
- def __init__(self,name,cls,id_attr,lu_attr_name):
- self.name = name
-
- self.cls = cls # the class of the looked-up object
- self.id_attr = id_attr # the attr of the source obj used to find the dest obj
- self.lu_attr_name = lu_attr_name # the attr of the dest class that we want to return
-
- def __str__(self):
-        return ("<ThingField: (%s,%s,%s,%s)>"
- % (self.name,self.cls,self.id_attr,self.lu_attr_name))
-
-# Describes the fields of Thing objects and subclasses that are passed
-# to Solr for indexing. All must have a 'contents' field, since that
-# will be used for language-agnostic searching, and will be copied
-# into contents_en, contents_eo, etc. (see `tokenize_things` for a
-# discussion of multi-language search). The 'boost' field is a
-# solr-magic field that ends up being an attribute on the <doc>
-# message (rather than a field), and is used to do an index-time boost
-# (this magic is done in pysolr.doc_to_elemtree)
-search_fields={Thing: (Field('fullname', '_fullname'),
- Field('date', '_date', is_date = True, reverse=True),
- Field('lang'),
- Field('ups', '_ups', is_number=True, reverse=True),
- Field('downs', '_downs', is_number=True, reverse=True),
- Field('spam','_spam'),
- Field('deleted','_deleted'),
- Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
- Field('controversy', '_controversy', is_number=True, reverse=True),
- Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
- Subreddit: (Field('contents',
- lambda s: ' '.join([unicode_safe(s.name),
- unicode_safe(s.title),
- unicode_safe(s.description),
- unicode_safe(s.firsttext)]),
- tokenize = True),
- Field('boost', '_downs'),
- #Field('title'),
- #Field('firsttext'),
- #Field('description'),
- #Field('over_18'),
- #Field('sr_type','type'),
- ),
- Link: (Field('contents','title', tokenize = True),
- Field('boost', lambda t: int(t._hot*1000),
- # yes, it's a copy of 'hot'
- is_number=True, reverse=True),
- Field('author_id'),
- ThingField('author',Account,'author_id','name'),
- ThingField('subreddit',Subreddit,'sr_id','name'),
- #ThingField('reddit',Subreddit,'sr_id','name'),
- Field('sr_id'),
- Field('url', tokenize = True),
- #Field('domain',
- # lambda l: UrlParser(l.url).domain_permutations()),
- Field('site',
- lambda l: UrlParser(l.url).domain_permutations()),
- #Field('is_self','is_self'),
- ),
- Comment: (Field('contents', 'body', tokenize = True),
- Field('boost', lambda t: int(t._hot*1000),
- # yes, it's a copy of 'hot'
- is_number=True, reverse=True),
- ThingField('author',Account,'author_id','name'),
- ThingField('subreddit',Subreddit,'sr_id','name'))}
- #ThingField('reddit',Subreddit,'sr_id','name'))}
-
-def strip_control_characters(text):
- if not isinstance(text, basestring):
- return text
- return ''.join((c for c in text if ord(c) >= 0x20))
-
-def tokenize_things(things,return_dict=False):
- """
- Here, we take a list of things, and return a list of
- dictionaries of fields, which will be sent to Solr. We take
- the `search_fields` dictionary above, and look for all classes
- for which each Thing is an instance (that is, a Comment will
- pick up fields for Thing as well as Comment), and extract the
- given fields. All tokenised Things are expected to have a
- 'contents' attribute. That field is then copied to
- contents_XX, where XX is your two-letter language code, which
- becomes your default search field. Those language-specific
- fields are also set up with the proper language-stemming and
- tokenisers on Solr's end (in config/schema.xml), which allows
- for language-specific searching
- """
- global search_fields
-
- batched_classes = {}
- ret = {}
- for thing in things:
- try:
- t = {'type': []}
- for cls in ((thing.__class__,) + thing.__class__.__bases__):
- t['type'].append(cls.__name__.lower())
-
- if cls in search_fields:
- for field in search_fields[cls]:
- if field.__class__ == Field:
- try:
- val = field.extract_from(thing)
- val = strip_control_characters(val)
- if val != None and val != '':
- t[field.name] = val
- except AttributeError,e:
- print e
-
- elif field.__class__ == ThingField:
- if not field.cls in batched_classes:
- batched_classes[field.cls] = []
- batched_classes[field.cls].append((thing,field))
-
- # copy 'contents' to ('contents_%s' % lang) and contents_ws
- t[lang_to_fieldname(thing.lang)] = t['contents']
- t['contents_ws'] = t['contents']
-
- ret[thing._fullname] = t
- except AttributeError,e:
- print e
- except KeyError,e:
- print e
-
- # batched_classes should now be a {cls: [(Thing,ThingField)]}.
- # This ugliness is to make it possible to batch Thing lookups, as
- # they were accounting for most of the indexing time
- for cls in batched_classes:
- ids = set()
- for (thing,field) in batched_classes[cls]:
- # extract the IDs
- try:
- id = getattr(thing,field.id_attr)
- ids.add(id)
- except AttributeError,e:
- print e
- found_batch = cls._byID(ids,data=True,return_dict=True)
-
- for (thing,field) in batched_classes[cls]:
- try:
- id = getattr(thing,field.id_attr)
- ret[thing._fullname][field.name] = strip_control_characters(
- getattr(found_batch[id],field.lu_attr_name))
- except AttributeError,e:
- print e
- except KeyError,e:
- print e
-
- return ret if return_dict else ret.values()
-
-def lang_to_fieldname(l):
- """
- Returns the field-name for the given language, or `contents`
- if it isn't found
- """
- global searchable_langs
-
- code = l[:2]
-
- if code in searchable_langs:
- return ("contents_%s" % code)
- else:
- return "contents"
-
-def tokenize(thing):
- return tokenize_things([thing])
-
-def index_things(s=None,things=[]):
- "Sends the given Things to Solr to be indexed"
- tokenized = tokenize_things(things)
-
- if s:
- s.add(tokenized)
- else:
- with SolrConnection(commit=True) as s:
- s.add(tokenize_things(things))
-
-def fetch_batches(t_class,size,since,until):
- """
- Convenience function to fetch all Things of class t_class with
- _date from `since` to `until`, returning them in batches of
- `size`. TODO: move to lib/utils, and merge to be the backend
- of `fetch_things`
- """
- q=t_class._query(t_class.c._date >= since,
- t_class.c._spam == (True,False),
- t_class.c._deleted == (True,False),
- t_class.c._date < until,
- sort = desc('_date'),
- limit = size,
- data = True)
- orig_rules = deepcopy(q._rules)
-
- things = list(q)
- while things:
- yield things
-
- q._rules = deepcopy(orig_rules)
- q._after(things[len(things)-1])
- things = list(q)
-
-solr_queue=Queue()
-for i in range(20):
- solr_queue.put(pysolr.Solr(g.solr_url))
-class SolrConnection(object):
- """
- Represents a connection to Solr, properly limited to N
- concurrent connections. Used like
-
- with SolrConnection() as s:
- s.add(things)
- """
- def __init__(self,commit=False,optimize=False):
- self.commit = commit
- self.optimize = optimize
- def __enter__(self):
- self.conn = solr_queue.get()
- return self.conn
- def __exit__(self, _type, _value, _tb):
- if self.commit:
- self.conn.commit()
- if self.optimize:
- self.conn.optimize()
- solr_queue.task_done()
- solr_queue.put(self.conn)
-
-def indexer_worker(q,delete_all_first=False):
- """
- The thread for mass-indexing that connects to Solr and submits
- tokenised objects
- """
- with SolrConnection(commit=True,optimize=True) as s:
- count = 0
-
- if delete_all_first:
- s.delete(q='*:*')
-
- t = q.get()
- while t != "done":
- # if it's not a list or a dictionary, I don't know how to
- # handle it, so die. It's probably an exception pushed in
- # by the handler in my parent
- if not (isinstance(t,list) and isinstance(t[0],dict)):
- raise t
- count += len(t)
- s.add(t)
- if count > 25000:
- print "Committing... (q:%d)" % (q.qsize(),)
- s.commit()
- count = 0
- q.task_done()
-
- t=q.get()
- q.task_done()
-
-def reindex_all(types = None, delete_all_first=False):
- """
- Called from `paster run` to totally re-index everything in the
- database. Spawns a thread to connect to Solr, and sends it
- tokenised Things
- """
- global indexed_types
-
- start_t = datetime.now()
-
- if not types:
- types = indexed_types
-
- # We don't want the default thread-local cache (which is just a
- # dict) to grow un-bounded (normally, we'd use
- # utils.set_emptying_cache, except that that preserves memcached,
- # and we don't even want to get memcached for total indexing,
- # because it would dump out more recent stuff)
- g.cache.caches = (SelfEmptyingCache(),) # + g.cache.caches[1:]
-
- count = 0
- q=Queue(100)
- indexer=Thread(target=indexer_worker,
- args=(q,delete_all_first))
- indexer.start()
-
- try:
- for cls in types:
- for batch in fetch_batches(cls,1000,
- timeago("50 years"),
- start_t):
- r = tokenize_things([ x for x in batch
- if not x._spam and not x._deleted ])
-
- count += len(r)
- print ("Processing %s #%d(%s): %s"
- % (cls.__name__, count, q.qsize(), r[0]['contents']))
-
- if indexer.isAlive():
- q.put(r)
- else:
- raise Exception("'tis a shame that I have but one thread to give")
- q.put("done")
- indexer.join()
-
- except object,e:
- if indexer.isAlive():
- q.put(e,timeout=30)
- raise e
- except KeyboardInterrupt,e: # turns out KeyboardInterrupts aren't objects. Who knew?
- if indexer.isAlive():
- q.put(e,timeout=30)
- raise e
-
-
-def combine_searchterms(terms):
- """
- Convenience function to take a list like
- [ sr_id:1, sr_id:2 sr_id:3 subreddit:reddit.com ]
- and turn it into
- sr_id:(1 2 3) OR subreddit:reddit.com
- """
- combined = {}
-
- for (name,val) in terms:
- combined[name] = combined.get(name,[]) + [val]
-
- ret = []
-
- for (name,vals) in combined.iteritems():
- if len(vals) == 1:
- ret.append("%s:%s" % (name,vals[0]))
- else:
- ret.append("%s:(%s)" % (name," ".join(vals)))
-
- if len(ret) > 1:
- ret = "(%s)" % " OR ".join(ret)
- else:
- ret = " ".join(ret)
-
- return ret
-
-def swap_strings(s,this,that):
- """
- Just swaps substrings, like:
- s = "hot asc"
- s = swap_strings(s,'asc','desc')
- s == "hot desc"
-
-    uses 'tmp' as a replacement string, so don't use for anything
- very complicated
- """
- return s.replace(this,'tmp').replace(that,this).replace('tmp',that)
-
-class SearchQuery(object):
- def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
- types = [], timerange = None, spam = False, deleted = False):
-
- self.q = q
- self.fields = fields
- self.sort = sort
- self.subreddits = subreddits
- self.authors = authors
- self.types = types
- self.spam = spam
- self.deleted = deleted
-
- if timerange in ['day','month','year']:
- self.timerange = ('NOW-1%s/HOUR' % timerange.upper(),"NOW")
- elif timerange == 'week':
- self.timerange = ('NOW-7DAY/HOUR',"NOW")
- elif timerange == 'hour':
- self.timerange = ('NOW-1HOUR/MINUTE',"NOW")
- elif timerange == 'all' or timerange is None:
- self.timerange = None
- else:
- self.timerange = timerange
-
- def __repr__(self):
- attrs = [ "***q=%s***" % self.q ]
-
- if self.subreddits is not None:
- attrs.append("srs=" + '+'.join([ "%d" % s
- for s in self.subreddits ]))
-
- if self.authors is not None:
- attrs.append("authors=" + '+'.join([ "%d" % s
- for s in self.authors ]))
-
- if self.timerange is not None:
- attrs.append("timerange=%s" % str(self.timerange))
-
- if self.sort is not None:
- attrs.append("sort=%r" % self.sort)
-
- return "<%s(%s)>" % (self.__class__.__name__, ", ".join(attrs))
-
- def run(self, after = None, num = 1000, reverse = False,
- _update = False):
- if not self.q:
- return pysolr.Results([],0)
-
- if not g.solr_url:
- raise SolrError("g.solr_url is not set")
-
- # there are two parts to our query: what the user typed
- # (parsed with Solr's DisMax parser), and what we are adding
- # to it. The latter is called the "boost" (and is parsed using
- # full Lucene syntax), and it can be added to via the `boost`
- # parameter
- boost = []
-
- if not self.spam:
- boost.append("-spam:true")
- if not self.deleted:
- boost.append("-deleted:true")
-
- if self.timerange:
- def time_to_searchstr(t):
- if isinstance(t, datetime):
- t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
- elif isinstance(t, date):
- t = t.strftime('%Y-%m-%dT00:00:00.000Z')
- elif isinstance(t,str):
- t = t
- return t
-
- (fromtime, totime) = self.timerange
- fromtime = time_to_searchstr(fromtime)
- totime = time_to_searchstr(totime)
- boost.append("+date:[%s TO %s]"
- % (fromtime,totime))
-
- if self.subreddits:
- def subreddit_to_searchstr(sr):
- if isinstance(sr,Subreddit):
- return ('sr_id','%d' % sr.id)
- elif isinstance(sr,str) or isinstance(sr,unicode):
- return ('subreddit',sr)
- else:
- return ('sr_id','%d' % sr)
-
- s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))
-
- boost.append("+(%s)" % combine_searchterms(s_subreddits))
-
- if self.authors:
- def author_to_searchstr(a):
- if isinstance(a,Account):
- return ('author_id','%d' % a.id)
- elif isinstance(a,str) or isinstance(a,unicode):
- return ('author',a)
- else:
- return ('author_id','%d' % a)
-
- s_authors = map(author_to_searchstr,tup(self.authors))
-
- boost.append('+(%s)^2' % combine_searchterms(s_authors))
-
-
- def type_to_searchstr(t):
- if isinstance(t,str):
- return ('type',t)
- else:
- return ('type',t.__name__.lower())
-
- s_types = map(type_to_searchstr,self.types)
- boost.append("+%s" % combine_searchterms(s_types))
-
- q,solr_params = self.solr_params(self.q,boost)
-
- search = self.run_search(q, self.sort, solr_params,
- reverse, after, num,
- _update = _update)
- return search
-
- @classmethod
- def run_search(cls, q, sort, solr_params, reverse, after, num,
- _update = False):
- "returns pysolr.Results(docs=[fullname()],hits=int())"
-
- if reverse:
- sort = swap_strings(sort,'asc','desc')
- after = after._fullname if after else None
-
- search = cls.run_search_cached(q, sort, 0, num, solr_params,
- _update = _update)
- search.docs = get_after(search.docs, after, num)
-
- return search
-
- @staticmethod
- @memoize('solr_search', solr_cache_time)
- def run_search_cached(q, sort, start, rows, other_params):
- with SolrConnection() as s:
- g.log.debug(("Searching q = %r; sort = %r,"
- + " start = %r, rows = %r,"
- + " params = %r")
- % (q,sort,start,rows,other_params))
-
- res = s.search(q, sort, start = start, rows = rows,
- other_params = other_params)
-
- # extract out the fullname in the 'docs' field, since that's
- # all we care about
- res = pysolr.Results(docs = [ i['fullname'] for i in res.docs ],
- hits = res.hits)
-
- return res
-
- def solr_params(self,*k,**kw):
- raise NotImplementedError
-
-class UserSearchQuery(SearchQuery):
- "Base class for queries that use the dismax parser"
- def __init__(self, q, mm, sort=None, fields=[], langs=None, **kw):
- default_fields = ['contents^1.5','contents_ws^3'] + fields
-
- if langs is None:
- fields = default_fields
- else:
- if langs == 'all':
- langs = searchable_langs
- fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
- + default_fields)
-
- # minimum match. See http://lucene.apache.org/solr/api/org/apache/solr/util/doc-files/min-should-match.html
- self.mm = mm
-
- SearchQuery.__init__(self, q, sort, fields = fields, **kw)
-
- def solr_params(self, q, boost):
- return q, dict(fl = 'fullname',
- qt = 'dismax',
- bq = ' '.join(boost),
- qf = ' '.join(self.fields),
- mm = self.mm)
-
-class LinkSearchQuery(UserSearchQuery):
- def __init__(self, q, mm = None, **kw):
- additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1']
-
- if mm is None:
- mm = '4<75%'
-
- UserSearchQuery.__init__(self, q, mm = mm, fields = additional_fields,
- types=[Link], **kw)
-
-class RelatedSearchQuery(LinkSearchQuery):
- def __init__(self, q, ignore = [], **kw):
- self.ignore = set(ignore) if ignore else set()
-
- LinkSearchQuery.__init__(self, q, mm = '3<100% 5<60% 8<50%', **kw)
-
- def run(self, *k, **kw):
- search = LinkSearchQuery.run(self, *k, **kw)
- search.docs = [ x for x in search.docs if x not in self.ignore ]
- return search
-
-class SubredditSearchQuery(UserSearchQuery):
- def __init__(self, q, **kw):
- # note that 'downs' is a measure of activity on subreddits
- UserSearchQuery.__init__(self, q, mm = '75%', sort = 'downs desc',
- types=[Subreddit], **kw)
-
-class DomainSearchQuery(SearchQuery):
- def __init__(self, domain, **kw):
- q = '+site:%s' % domain
-
- SearchQuery.__init__(self, q = q, fields=['site'],types=[Link], **kw)
-
- def solr_params(self, q, boost):
- q = q + ' ' + ' '.join(boost)
- return q, dict(fl='fullname',
- qt='standard')
-
-def run_commit(optimize=False):
- with SolrConnection(commit=True, optimize=optimize) as s:
- pass
-
-
-def run_changed(drain=False):
- """
- Run by `cron` (through `paster run`) on a schedule to update
- all Things that have been created or have changed since the
- last run. Note: unlike many queue-using functions, this one is
- run from cron and totally drains the queue before terminating
- """
- @g.stats.amqp_processor('solrsearch_changes')
- def _run_changed(msgs, chan):
- print "changed: Processing %d items" % len(msgs)
- msgs = [strordict_fullname(msg.body)
- for msg in msgs]
- fullnames = set(msg['fullname'] for msg in msgs if not msg.get('boost_only'))
-
- things = Thing._by_fullname(fullnames, data=True, return_dict=False)
- things = [x for x in things if isinstance(x, indexed_types)]
-
- update_things = [x for x in things if not x._spam and not x._deleted]
- delete_things = [x for x in things if x._spam or x._deleted]
-
- with SolrConnection() as s:
- if update_things:
- tokenized = tokenize_things(update_things)
- s.add(tokenized)
- if delete_things:
- for i in delete_things:
- s.delete(id=i._fullname)
-
- amqp.handle_items('solrsearch_changes', _run_changed, limit=1000,
- drain=drain)
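
As a quick reference for the boost-query assembly in SearchQuery.run above, here is a standalone restatement of the removed combine_searchterms helper together with the example from its docstring. It is a sketch, not the original code verbatim; the order of the OR'd fields depends on dict ordering.

```python
# Sketch: collapse repeated field terms into field:(v1 v2 ...) and OR the
# distinct fields together, as the removed helper's docstring describes.
def combine_searchterms(terms):
    combined = {}
    for name, val in terms:
        combined.setdefault(name, []).append(val)

    parts = []
    for name, vals in combined.items():
        if len(vals) == 1:
            parts.append("%s:%s" % (name, vals[0]))
        else:
            parts.append("%s:(%s)" % (name, " ".join(vals)))

    if len(parts) > 1:
        return "(%s)" % " OR ".join(parts)
    return " ".join(parts)

print(combine_searchterms([("sr_id", "1"), ("sr_id", "2"), ("sr_id", "3"),
                           ("subreddit", "reddit.com")]))
# -> (sr_id:(1 2 3) OR subreddit:reddit.com)
```
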
diff --git a/upstart/reddit-job-solrsearch.conf b/upstart/reddit-job-solrsearch.conf
deleted file mode 100644
index 14f9876eb..000000000
--- a/upstart/reddit-job-solrsearch.conf
+++ /dev/null
@@ -1,13 +0,0 @@
-description "commit/optimize solr index"
-
-instance $optimize
-
-manual
-task
-
-nice 10
-
-script
- . /etc/default/reddit
- wrap-job paster run $REDDIT_INI -c "from r2.lib import solrsearch; solrsearch.run_commit(optimize=$optimize)"
-end script