Mirror of https://github.com/reddit-archive/reddit.git, synced 2026-01-14 17:38:04 -05:00
Remove references to deprecated Solr index
@@ -1,456 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.

The Original Code is Reddit.

The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.

All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->

<schema name="reddit" version="1.1">
  <types>
    <!-- field type definitions. The "name" attribute is
         just a label to be used by field definitions. The "class"
         attribute and any other attributes determine the real
         behavior of the fieldType.
         Class names starting with "solr" refer to java classes in the
         org.apache.solr.analysis package.
    -->

    <!-- The StrField type is not analyzed, but indexed/stored verbatim.
         - StrField and TextField support an optional compressThreshold which
           limits compression (if enabled in the derived fields) to values which
           exceed a certain size (in characters).
    -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

    <!-- boolean type: "true" or "false" -->
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>

    <!-- The optional sortMissingLast and sortMissingFirst attributes are
         currently supported on types that are sorted internally as strings.
         - If sortMissingLast="true", then a sort on this field will cause documents
           without the field to come after documents with the field,
           regardless of the requested sort order (asc or desc).
         - If sortMissingFirst="true", then a sort on this field will cause documents
           without the field to come before documents with the field,
           regardless of the requested sort order.
         - If sortMissingLast="false" and sortMissingFirst="false" (the default),
           then default lucene sorting will be used which places docs without the
           field first in an ascending sort and last in a descending sort.
    -->

    <!-- numeric field types that store and index the text
         value verbatim (and hence don't support range queries, since the
         lexicographic ordering isn't equal to the numeric ordering) -->
    <fieldType name="integer" class="solr.IntField" omitNorms="true"/>
    <fieldType name="long" class="solr.LongField" omitNorms="true"/>
    <fieldType name="float" class="solr.FloatField" omitNorms="true"/>
    <fieldType name="double" class="solr.DoubleField" omitNorms="true"/>
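    <!-- For example: if a field used the plain "integer" type above, a range
         query such as someint:[2 TO 10] would compare values as strings, and
         "9" sorts after "10" lexicographically, so numeric ranges and sorts
         come out wrong; the sortable types below encode each value so that
         string order matches numeric order. (someint is a hypothetical field
         name, used here only for illustration.) -->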

    <!-- Numeric field types that manipulate the value into
         a string value that isn't human-readable in its internal form,
         but with a lexicographic ordering the same as the numeric ordering,
         so that range queries work correctly. -->
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>

    <fieldType name="hotness" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="false"/>

    <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
         is a more restricted form of the canonical representation of dateTime
         http://www.w3.org/TR/xmlschema-2/#dateTime
         The trailing "Z" designates UTC time and is mandatory.
         Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
         All other components are mandatory.

         Expressions can also be used to denote calculations that should be
         performed relative to "NOW" to determine the value, ie...

         NOW/HOUR
            ... Round to the start of the current hour
         NOW-1DAY
            ... Exactly 1 day prior to now
         NOW/DAY+6MONTHS+3DAYS
            ... 6 months and 3 days in the future from the start of
                the current day

         Consult the DateField javadocs for more information.
    -->
    <fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
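    <!-- For example, a filter restricted to the last week could use the date
         math above against this field: fq=date:[NOW/DAY-7DAYS TO NOW]
         (a hypothetical request parameter, shown only to illustrate the
         syntax). -->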

    <!-- solr.TextField allows the specification of custom text analyzers
         specified as a tokenizer and a list of token filters. Different
         analyzers may be specified for indexing and querying.

         The optional positionIncrementGap puts space between multiple fields of
         this type on the same document, with the purpose of preventing false phrase
         matching across fields.

         For more info on customizing your analyzer chain, please see
         http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
    -->
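    <!-- For example, if a multiValued field of such a type held the two
         values "red car" and "blue bike", a positionIncrementGap of 100
         keeps the phrase query "car blue" from matching across the value
         boundary (illustrative values, not from the original config). -->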

    <!-- One can also specify an existing Analyzer class that has a
         default constructor via the class attribute on the analyzer element -->

    <!-- languages -->
    <fieldtype name="text_dk" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Danish" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_nl" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Dutch" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_en" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="English" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_fi" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Finnish" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_fr" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="French" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_de" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="German" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_it" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Italian" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_no" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Norwegian" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_nn" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Norwegian" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_pt" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Portuguese" />
      </analyzer>
    </fieldtype>
    <fieldType name="text_ru" class="solr.TextField">
      <analyzer class="org.apache.lucene.analysis.ru.RussianAnalyzer"/>
      <filter class="solr.SnowballPorterFilterFactory" language="Russian" />
    </fieldType>
    <fieldtype name="text_es" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Spanish" />
      </analyzer>
    </fieldtype>
    <fieldtype name="text_sv" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.ISOLatin1AccentFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.SnowballPorterFilterFactory" language="Swedish" />
      </analyzer>
    </fieldtype>

    <fieldType name="text_zh" class="solr.TextField">
      <tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
      <analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
    </fieldType>
    <fieldType name="text_ja" class="solr.TextField">
      <tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
      <analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
    </fieldType>
    <fieldType name="text_ko" class="solr.TextField">
      <tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
      <analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
    </fieldType>
    <fieldType name="text_cs" class="solr.TextField">
      <analyzer class="org.apache.lucene.analysis.cz.CzechAnalyzer"/>
    </fieldType>
    <fieldType name="text_el" class="solr.TextField">
      <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
    </fieldType>
    <fieldType name="text_th" class="solr.TextField">
      <analyzer class="org.apache.lucene.analysis.th.ThaiAnalyzer"/>
    </fieldType>

    <!-- A text field that only splits on whitespace for exact matching of words -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <filter class="solr.LowerCaseFilterFactory"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
         words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
         so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
         Synonyms and stopwords are customized by external files, and stemming is enabled.
         Duplicate tokens at the same position (which may result from Stemmed Synonyms or
         WordDelim parts) are removed.
    -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
        <!-- in this example, we will only use synonyms at query time
        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
        -->
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
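    <!-- A sketch of how the index-time chain above treats "Wi-Fi": the
         WordDelimiterFilter splits it into "Wi" and "Fi" and, because
         catenateWords="1", also emits "WiFi" at the same position, so both
         "wi fi" and "wifi" match after lowercasing; at query time the
         catenate options are "0" to avoid spurious phrase matches. -->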

    <!-- Less flexible matching, but less false matches. Probably not ideal for product names,
         but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- This is an example of using the KeywordTokenizer along
         with various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
    -->
    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <!-- KeywordTokenizer does no actual tokenizing, so the entire
             input string is preserved as a single token
        -->
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <!-- The LowerCase TokenFilter does what you expect, which can be
             useful when you want your sorting to be case insensitive
        -->
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- The TrimFilter removes any leading or trailing whitespace -->
        <filter class="solr.TrimFilterFactory" />
        <!-- The PatternReplaceFilter gives you the flexibility to use
             Java Regular expressions to replace any sequence of characters
             matching a pattern with an arbitrary replacement string,
             which may include back references to portions of the original
             string matched by the pattern.

             See the Java Regular Expression documentation for more
             information on pattern and replacement string syntax.

             http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
        -->
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldType>
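    <!-- For example, a raw value of "  Foo-Bar 42 " sorts as "foobar":
         the KeywordTokenizer keeps it as one token, the LowerCase and Trim
         filters normalize it, and the pattern ([^a-z]) strips everything
         that is not a lowercase letter (illustrative value only). -->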

    <!-- since fields of this type are by default not stored or indexed, any data added to
         them will be ignored outright
    -->
    <fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" />

  </types>

  <fields>
    <!-- Valid attributes for fields:
         name: mandatory - the name for the field
         type: mandatory - the name of a previously defined type from the <types> section
         indexed: true if this field should be indexed (searchable or sortable)
         stored: true if this field should be retrievable
         compressed: [false] if this field should be stored using gzip compression
           (this will only apply if the field type is compressible; among
           the standard field types, only TextField and StrField are)
         multiValued: true if this field may contain multiple values per document
         omitNorms: (expert) set to true to omit the norms associated with
           this field (this disables length normalization and index-time
           boosting for the field, and saves some memory). Only full-text
           fields or fields that need an index-time boost need norms.
    -->

    <!-- Thing -->
    <field name="fullname" type="string" indexed="true" stored="true" required="true" />
    <field name="type" type="string" indexed="true" stored="false" required="true" multiValued="true" />
    <field name="date" type="date" indexed="true" stored="true" required="true" reversed="true" />
    <field name="lang" type="string" indexed="true" stored="false" required="false" />
    <field name="ups" type="sint" indexed="true" stored="true" required="true" reversed="true" />
    <field name="downs" type="sint" indexed="true" stored="true" required="true" reversed="true" />
    <field name="hot" type="hotness" indexed="true" stored="true" required="true" reversed="true" />
    <field name="controversy" type="sfloat" indexed="true" stored="true" required="true" reversed="true" />
    <field name="points" type="sint" indexed="true" stored="true" required="true" reversed="true" />
    <field name="spam" type="boolean" indexed="true" stored="true" required="false" />
    <field name="deleted" type="boolean" indexed="true" stored="true" required="false" />
    <!-- subreddit,link,comment -->
    <field name="author_id" type="integer" indexed="true" stored="false" required="false" />
    <field name="author" type="string" indexed="true" stored="false" required="false" />
    <!-- subreddit -->
    <field name="title" type="text" indexed="true" stored="false" required="false" />
    <field name="description" type="text" indexed="true" stored="false" required="false" />
    <field name="firsttext" type="text" indexed="true" stored="false" required="false" />
    <field name="name" type="string" indexed="true" stored="false" required="false" />
    <field name="over_18" type="boolean" indexed="true" stored="false" required="false" />
    <field name="sr_type" type="string" indexed="true" stored="false" required="false" />
    <!-- link -->
    <field name="sr_id" type="integer" indexed="true" stored="false" required="false" />
    <field name="reddit" type="string" indexed="true" stored="false" required="false" />
    <field name="subreddit" type="string" indexed="true" stored="false" required="false" />
    <field name="url" type="text" indexed="true" stored="false" required="false" />
    <field name="domain" type="string" indexed="true" stored="false" required="false" multiValued="true" />
    <field name="site" type="string" indexed="true" stored="false" required="false" multiValued="true" />
    <field name="is_self" type="boolean" indexed="true" stored="false" required="false" />
    <!-- comment (none) -->

    <!-- all objects must have a 'contents' field, and most will also
         have a field for their particular languages. Searches are then
         done according to the fields in the languages that the user
         has specified -->

    <field name="contents" type="text" indexed="true" stored="false" required="true" />
    <field name="contents_ws" type="text_ws" indexed="true" stored="false" required="false" />
    <field name="contents_en" type="text_en" indexed="true" stored="false" required="false" />
    <field name="contents_cs" type="text_cs" indexed="true" stored="false" required="false" />
    <field name="contents_pt" type="text_pt" indexed="true" stored="false" required="false" />
    <field name="contents_zh" type="text_zh" indexed="true" stored="false" required="false" />
    <field name="contents_ja" type="text_ja" indexed="true" stored="false" required="false" />
    <field name="contents_ko" type="text_ko" indexed="true" stored="false" required="false" />
    <field name="contents_de" type="text_de" indexed="true" stored="false" required="false" />
    <field name="contents_fr" type="text_fr" indexed="true" stored="false" required="false" />
    <field name="contents_el" type="text_el" indexed="true" stored="false" required="false" />
    <field name="contents_nl" type="text_nl" indexed="true" stored="false" required="false" />
    <field name="contents_no" type="text_no" indexed="true" stored="false" required="false" />
    <field name="contents_nn" type="text_nn" indexed="true" stored="false" required="false" />
    <field name="contents_ru" type="text_ru" indexed="true" stored="false" required="false" />
    <field name="contents_it" type="text_it" indexed="true" stored="false" required="false" />
    <field name="contents_es" type="text_es" indexed="true" stored="false" required="false" />
    <field name="contents_sv" type="text_sv" indexed="true" stored="false" required="false" />
    <field name="contents_fi" type="text_fi" indexed="true" stored="false" required="false" />
    <field name="contents_dk" type="text_dk" indexed="true" stored="false" required="false" />
    <field name="contents_th" type="text_th" indexed="true" stored="false" required="false" />

  </fields>

  <!-- Field to use to determine and enforce document uniqueness.
       Unless this field is marked with required="false", it will be a required field
  -->
  <uniqueKey>fullname</uniqueKey>

  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>contents</defaultSearchField>

  <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>
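  <!-- With these defaults, a bare query such as q=giraffe searches the
       "contents" field, and q=giraffe monkey parses as
       contents:giraffe OR contents:monkey (hypothetical example queries). -->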

  <!-- Similarity is the scoring routine for each document vs. a query.
       A custom similarity may be specified here, but the default is fine
       for most applications. -->
  <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->

</schema>

@@ -1,387 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.

The Original Code is Reddit.

The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.

All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->

<Server port="8081" shutdown="SHUTDOWN">

  <!-- Comment these entries out to disable JMX MBeans support used for the
       administration web application -->
  <Listener className="org.apache.catalina.core.AprLifecycleListener" />
  <Listener className="org.apache.catalina.mbeans.ServerLifecycleListener" />
  <Listener className="org.apache.catalina.mbeans.GlobalResourcesLifecycleListener" />
  <Listener className="org.apache.catalina.storeconfig.StoreConfigLifecycleListener"/>

  <!-- Global JNDI resources -->
  <GlobalNamingResources>

    <!-- Test entry for demonstration purposes -->
    <Environment name="simpleValue" type="java.lang.Integer" value="30"/>

    <!-- Editable user database that can also be used by
         UserDatabaseRealm to authenticate users -->
    <Resource name="UserDatabase" auth="Container"
              type="org.apache.catalina.UserDatabase"
              description="User database that can be updated and saved"
              factory="org.apache.catalina.users.MemoryUserDatabaseFactory"
              pathname="conf/tomcat-users.xml" />

  </GlobalNamingResources>

  <!-- A "Service" is a collection of one or more "Connectors" that share
       a single "Container" (and therefore the web applications visible
       within that Container). Normally, that Container is an "Engine",
       but this is not required.

       Note: A "Service" is not itself a "Container", so you may not
       define subcomponents such as "Valves" or "Loggers" at this level.
  -->

  <!-- Define the Tomcat Stand-Alone Service -->
  <Service name="Catalina">

    <!-- A "Connector" represents an endpoint by which requests are received
         and responses are returned. Each Connector passes requests on to the
         associated "Container" (normally an Engine) for processing.

         By default, a non-SSL HTTP/1.1 Connector is established on port 8080.
         You can also enable an SSL HTTP/1.1 Connector on port 8443 by
         following the instructions below and uncommenting the second Connector
         entry. SSL support requires the following steps (see the SSL Config
         HOWTO in the Tomcat 5 documentation bundle for more detailed
         instructions):
         * If your JDK is version 1.3 or prior, download and install JSSE 1.0.2 or
           later, and put the JAR files into "$JAVA_HOME/jre/lib/ext".
         * Execute:
             %JAVA_HOME%\bin\keytool -genkey -alias tomcat -keyalg RSA (Windows)
             $JAVA_HOME/bin/keytool -genkey -alias tomcat -keyalg RSA (Unix)
           with a password value of "changeit" for both the certificate and
           the keystore itself.

         By default, DNS lookups are enabled when a web application calls
         request.getRemoteHost(). This can have an adverse impact on
         performance, so you can disable it by setting the
         "enableLookups" attribute to "false". When DNS lookups are disabled,
         request.getRemoteHost() will return the String version of the
         IP address of the remote client.
    -->

    <!-- Define a non-SSL HTTP/1.1 Connector on port 8080 -->
    <Connector port="8080" maxHttpHeaderSize="8192"
               maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
               enableLookups="false" redirectPort="8443" acceptCount="100"
               connectionTimeout="20000" disableUploadTimeout="true"
               URIEncoding="UTF-8" />
    <!-- Note : To disable connection timeouts, set connectionTimeout value
         to 0 -->

    <!-- Note : To use gzip compression you could set the following properties :

         compression="on"
         compressionMinSize="2048"
         noCompressionUserAgents="gozilla, traviata"
         compressableMimeType="text/html,text/xml"
    -->

    <!-- Define a SSL HTTP/1.1 Connector on port 8443 -->
    <!--
    <Connector port="8443" maxHttpHeaderSize="8192"
               maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
               enableLookups="false" disableUploadTimeout="true"
               acceptCount="100" scheme="https" secure="true"
               clientAuth="false" sslProtocol="TLS" />
    -->

    <!-- Define an AJP 1.3 Connector on port 8009 -->
    <Connector port="8009"
               enableLookups="false" redirectPort="8443" protocol="AJP/1.3" />

    <!-- Define a Proxied HTTP/1.1 Connector on port 8082 -->
    <!-- See proxy documentation for more information about using this. -->
    <!--
    <Connector port="8082"
               maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
               enableLookups="false" acceptCount="100" connectionTimeout="20000"
               proxyPort="80" disableUploadTimeout="true" />
    -->

    <!-- An Engine represents the entry point (within Catalina) that processes
         every request. The Engine implementation for Tomcat stand alone
         analyzes the HTTP headers included with the request, and passes them
         on to the appropriate Host (virtual host). -->

    <!-- You should set jvmRoute to support load-balancing via AJP ie :
         <Engine name="Standalone" defaultHost="localhost" jvmRoute="jvm1">
    -->

    <!-- Define the top level container in our container hierarchy -->
    <Engine name="Catalina" defaultHost="localhost">

      <!-- The request dumper valve dumps useful debugging information about
           the request headers and cookies that were received, and the response
           headers and cookies that were sent, for all requests received by
           this instance of Tomcat. If you care only about requests to a
           particular virtual host, or a particular application, nest this
           element inside the corresponding <Host> or <Context> entry instead.

           For a similar mechanism that is portable to all Servlet 2.4
           containers, check out the "RequestDumperFilter" Filter in the
           example application (the source for this filter may be found in
           "$CATALINA_HOME/webapps/examples/WEB-INF/classes/filters").

           Note that this Valve uses the platform's default character encoding.
           This may cause problems for developers in another encoding, e.g.
           UTF-8. Use the RequestDumperFilter instead.

           Also note that enabling this Valve will write a ton of stuff to your
           logs. They are likely to grow quite large. This extensive log writing
           will definitely slow down your server.

           Request dumping is disabled by default. Uncomment the following
           element to enable it. -->
      <!--
      <Valve className="org.apache.catalina.valves.RequestDumperValve"/>
      -->

      <!-- Because this Realm is here, an instance will be shared globally -->

      <!-- This Realm uses the UserDatabase configured in the global JNDI
           resources under the key "UserDatabase". Any edits
           that are performed against this UserDatabase are immediately
           available for use by the Realm. -->
      <Realm className="org.apache.catalina.realm.UserDatabaseRealm"
             resourceName="UserDatabase"/>

      <!-- Comment out the old realm but leave here for now in case we
           need to go back quickly -->
      <!--
      <Realm className="org.apache.catalina.realm.MemoryRealm" />
      -->

      <!-- Replace the above Realm with one of the following to get a Realm
           stored in a database and accessed via JDBC -->

      <!--
      <Realm className="org.apache.catalina.realm.JDBCRealm"
             driverName="org.gjt.mm.mysql.Driver"
             connectionURL="jdbc:mysql://localhost/authority"
             connectionName="test" connectionPassword="test"
             userTable="users" userNameCol="user_name" userCredCol="user_pass"
             userRoleTable="user_roles" roleNameCol="role_name" />
      -->

      <!--
      <Realm className="org.apache.catalina.realm.JDBCRealm"
             driverName="oracle.jdbc.driver.OracleDriver"
             connectionURL="jdbc:oracle:thin:@ntserver:1521:ORCL"
             connectionName="scott" connectionPassword="tiger"
             userTable="users" userNameCol="user_name" userCredCol="user_pass"
             userRoleTable="user_roles" roleNameCol="role_name" />
      -->

      <!--
      <Realm className="org.apache.catalina.realm.JDBCRealm"
             driverName="sun.jdbc.odbc.JdbcOdbcDriver"
             connectionURL="jdbc:odbc:CATALINA"
             userTable="users" userNameCol="user_name" userCredCol="user_pass"
             userRoleTable="user_roles" roleNameCol="role_name" />
      -->

      <!-- Define the default virtual host
           Note: XML Schema validation will not work with Xerces 2.2.
      -->
      <Host name="localhost" appBase="webapps"
            unpackWARs="true" autoDeploy="true"
            xmlValidation="false" xmlNamespaceAware="false">

        <!-- Defines a cluster for this node.
             Defining this element means that every manager will be changed,
             so when running a cluster, make sure that you only have webapps
             in there that need to be clustered, and remove the other ones.
             A cluster has the following parameters:

             className = the fully qualified name of the cluster class

             clusterName = a descriptive name for your cluster, can be anything

             mcastAddr = the multicast address, has to be the same for all the nodes

             mcastPort = the multicast port, has to be the same for all the nodes

             mcastBindAddress = bind the multicast socket to a specific address

             mcastTTL = the multicast TTL if you want to limit your broadcast

             mcastSoTimeout = the multicast read timeout

             mcastFrequency = the number of milliseconds in between sending an "I'm alive" heartbeat

             mcastDropTime = the number of milliseconds before a node is considered "dead" if no heartbeat is received

             tcpThreadCount = the number of threads to handle incoming replication requests; optimal would be the same number of threads as nodes

             tcpListenAddress = the listen address (bind address) for TCP cluster requests on this host,
                                in case of multiple ethernet cards.
                                auto means that address becomes
                                InetAddress.getLocalHost().getHostAddress()

             tcpListenPort = the tcp listen port

             tcpSelectorTimeout = the timeout (ms) for the Selector.select() method in case the OS
                                  has a wakeup bug in java.nio. Set to 0 for no timeout

             printToScreen = true means that managers will also print to std.out

             expireSessionsOnShutdown = true means that sessions will be expired when this node shuts down

             useDirtyFlag = true means that we only replicate a session after setAttribute,removeAttribute has been called.
                            false means to replicate the session after each request.
                            false means that replication would work for the following piece of code: (only for SimpleTcpReplicationManager)
                            <%
                            HashMap map = (HashMap)session.getAttribute("map");
                            map.put("key","value");
                            %>
             replicationMode = can be either 'pooled', 'synchronous' or 'asynchronous'.
                               * Pooled means that the replication happens using several sockets in a synchronous way. That is, the data gets
                                 replicated, then the request returns. This is the same as the 'synchronous' setting except it uses a pool of
                                 sockets, hence it is multithreaded. This is the fastest and safest configuration. To use this, also increase
                                 the number of tcp threads that you have dealing with replication.
                               * Synchronous means that the thread that executes the request is also the
                                 thread that replicates the data to the other nodes, and will not return until all
                                 nodes have received the information.
                               * Asynchronous means that there is a specific 'sender' thread for each cluster node,
                                 so the request thread will queue the replication request into a "smart" queue,
                                 and then return to the client.
                                 The "smart" queue is a queue where, when a session is added to the queue and the same session
                                 already exists in the queue from a previous request, that session will be replaced
                                 in the queue instead of replicating two requests. This almost never happens, unless there is a
                                 large network delay.
        -->
        <!--
             When configuring for clustering, you also add in a valve to catch all the requests
             coming in; at the end of the request, the session may or may not be replicated.
             A session is replicated if and only if all the conditions are met:
             1. useDirtyFlag is true or setAttribute or removeAttribute has been called AND
             2. a session exists (has been created)
             3. the request is not trapped by the "filter" attribute

             The filter attribute is to filter out requests that could not modify the session,
             hence we don't replicate the session after the end of this request.
             The filter is negative, ie, anything you put in the filter, you mean to filter out;
             ie, no replication will be done on requests that match one of the filters.
             The filter attribute is delimited by ;, so you can't escape out ; even if you wanted to.

             filter=".*\.gif;.*\.js;" means that we will not replicate the session after requests with the URI
             ending with .gif and .js are intercepted.

             The deployer element can be used to deploy apps cluster wide.
             Currently the deployment only deploys/undeploys to working members in the cluster,
             so no WARs are copied upon startup of a broken node.
             The deployer watches a directory (watchDir) for WAR files when watchEnabled="true".
             When a new war file is added, the war gets deployed to the local instance,
             and then deployed to the other instances in the cluster.
             When a war file is deleted from the watchDir, the war is undeployed locally
             and cluster wide.
        -->

        <!--
        <Cluster className="org.apache.catalina.cluster.tcp.SimpleTcpCluster"
                 managerClassName="org.apache.catalina.cluster.session.DeltaManager"
                 expireSessionsOnShutdown="false"
                 useDirtyFlag="true"
                 notifyListenersOnReplication="true">

            <Membership
                className="org.apache.catalina.cluster.mcast.McastService"
                mcastAddr="228.0.0.4"
                mcastPort="45564"
                mcastFrequency="500"
                mcastDropTime="3000"/>

            <Receiver
                className="org.apache.catalina.cluster.tcp.ReplicationListener"
                tcpListenAddress="auto"
                tcpListenPort="4001"
                tcpSelectorTimeout="100"
                tcpThreadCount="6"/>

            <Sender
                className="org.apache.catalina.cluster.tcp.ReplicationTransmitter"
                replicationMode="pooled"
                ackTimeout="15000"
                waitForAck="true"/>

            <Valve className="org.apache.catalina.cluster.tcp.ReplicationValve"
                   filter=".*\.gif;.*\.js;.*\.jpg;.*\.png;.*\.htm;.*\.html;.*\.css;.*\.txt;"/>

            <Deployer className="org.apache.catalina.cluster.deploy.FarmWarDeployer"
                      tempDir="/tmp/war-temp/"
                      deployDir="/tmp/war-deploy/"
                      watchDir="/tmp/war-listen/"
                      watchEnabled="false"/>

            <ClusterListener className="org.apache.catalina.cluster.session.ClusterSessionListener"/>
        </Cluster>
        -->

        <!-- Normally, users must authenticate themselves to each web app
             individually. Uncomment the following entry if you would like
             a user to be authenticated the first time they encounter a
             resource protected by a security constraint, and then have that
             user identity maintained across *all* web applications contained
             in this virtual host. -->
        <!--
        <Valve className="org.apache.catalina.authenticator.SingleSignOn" />
        -->

        <!-- Access log processes all requests for this virtual host. By
             default, log files are created in the "logs" directory relative to
             $CATALINA_HOME. If you wish, you can specify a different
             directory with the "directory" attribute. Specify either a relative
             (to $CATALINA_HOME) or absolute path to the desired directory.
        -->
        <!--
        <Valve className="org.apache.catalina.valves.AccessLogValve"
               directory="logs" prefix="localhost_access_log." suffix=".txt"
               pattern="common" resolveHosts="false"/>
        -->

        <!-- Access log processes all requests for this virtual host. By
             default, log files are created in the "logs" directory relative to
             $CATALINA_HOME. If you wish, you can specify a different
             directory with the "directory" attribute. Specify either a relative
             (to $CATALINA_HOME) or absolute path to the desired directory.
             This access log implementation is optimized for maximum performance,
             but is hardcoded to support only the "common" and "combined" patterns.
        -->
        <!--
        <Valve className="org.apache.catalina.valves.FastCommonAccessLogValve"
               directory="logs" prefix="localhost_access_log." suffix=".txt"
               pattern="common" resolveHosts="false"/>
        -->

      </Host>

    </Engine>

  </Service>

</Server>

@@ -1,464 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.

The Original Code is Reddit.

The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.

All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->
<config>
  <!-- Set this to 'false' if you want solr to continue working after it has
       encountered a severe configuration error. In a production environment,
       you may want solr to keep working even if one handler is mis-configured.

       You may also set this to false by setting the system property:
         -Dsolr.abortOnConfigurationError=false
  -->
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>

  <!-- Used to specify an alternate directory to hold all index data
       other than the default ./data under the Solr home.
       If replication is in use, this should match the replication configuration. -->
  <!--
  <dataDir>./solr/data</dataDir>
  -->

  <indexDefaults>
    <!-- Values here affect all index writers and act as a default unless overridden. -->
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
  </indexDefaults>

  <mainIndex>
    <!-- options specific to the main on-disk lucene index -->
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>

    <!-- If true, unlock any held write or commit locks on startup.
         This defeats the locking mechanism that allows multiple
         processes to safely access a lucene index, and should be
         used with care. -->
    <unlockOnStartup>false</unlockOnStartup>
  </mainIndex>

  <!-- the default high-performance update handler -->
  <updateHandler class="solr.DirectUpdateHandler2">

    <!-- A prefix of "solr." for class names is an alias that
         causes solr to search appropriate packages, including
         org.apache.solr.(search|update|request|core|analysis)
    -->

    <!-- autocommit pending docs if certain criteria are met
    <autoCommit>
      <maxDocs>10000</maxDocs>
      <maxTime>1000</maxTime>
    </autoCommit>
    -->

    <!-- The RunExecutableListener executes an external command.
         exe - the name of the executable to run
         dir - dir to use as the current working directory. default="."
         wait - the calling thread waits until the executable returns. default="true"
         args - the arguments to pass to the program. default=nothing
         env - environment variables to set. default=nothing
    -->
    <!-- A postCommit event is fired after every commit or optimize command
    <listener event="postCommit" class="solr.RunExecutableListener">
      <str name="exe">snapshooter</str>
      <str name="dir">solr/bin</str>
      <bool name="wait">true</bool>
      <arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
      <arr name="env"> <str>MYVAR=val1</str> </arr>
    </listener>
    -->
    <!-- A postOptimize event is fired only after every optimize command, useful
         in conjunction with index distribution to only distribute optimized indices
    <listener event="postOptimize" class="solr.RunExecutableListener">
      <str name="exe">snapshooter</str>
      <str name="dir">solr/bin</str>
      <bool name="wait">true</bool>
    </listener>
    -->

  </updateHandler>

  <query>
    <!-- Maximum number of clauses in a boolean query... can affect
         range or prefix queries that expand to big boolean
         queries. An exception is thrown if exceeded. -->
    <maxBooleanClauses>1024</maxBooleanClauses>

    <!-- Cache used by SolrIndexSearcher for filters (DocSets),
         unordered sets of *all* documents that match a query.
         When a new searcher is opened, its caches may be prepopulated
         or "autowarmed" using data from caches in the old searcher.
         autowarmCount is the number of items to prepopulate. For LRUCache,
         the autowarmed items will be the most recently accessed items.
         Parameters:
           class - the SolrCache implementation (currently only LRUCache)
           size - the maximum number of entries in the cache
           initialSize - the initial capacity (number of entries) of
             the cache. (see java.util.HashMap)
           autowarmCount - the number of entries to prepopulate from
             an old cache.
    -->
    <filterCache
        class="solr.LRUCache"
        size="512"
        initialSize="512"
        autowarmCount="256"/>
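    <!-- Each distinct filter query is cached here as a DocSet and reused
         across searches until evicted; e.g. repeated requests carrying
         fq=over_18:false would hit the same cached entry (a hypothetical
         request parameter, shown for illustration). -->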

    <!-- queryResultCache caches results of searches - ordered lists of
         document ids (DocList) based on a query, a sort, and the range
         of documents requested. -->
    <queryResultCache
        class="solr.LRUCache"
        size="512"
        initialSize="512"
        autowarmCount="256"/>

    <!-- documentCache caches Lucene Document objects (the stored fields for each document).
         Since Lucene internal document ids are transient, this cache will not be autowarmed. -->
    <documentCache
        class="solr.LRUCache"
        size="512"
        initialSize="512"
        autowarmCount="0"/>

    <!-- If true, stored fields that are not requested will be loaded lazily.

         This can result in a significant speed improvement if the usual case is to
         not load all stored fields, especially if the skipped fields are large compressed
         text fields.
    -->
    <enableLazyFieldLoading>true</enableLazyFieldLoading>

    <!-- Example of a generic cache. These caches may be accessed by name
         through SolrIndexSearcher.getCache(), cacheLookup(), and cacheInsert().
         The purpose is to enable easy caching of user/application level data.
         The regenerator argument should be specified as an implementation
         of solr.search.CacheRegenerator if autowarming is desired. -->
    <!--
    <cache name="myUserCache"
           class="solr.LRUCache"
           size="4096"
           initialSize="1024"
           autowarmCount="1024"
           regenerator="org.mycompany.mypackage.MyRegenerator"
    />
    -->

    <!-- An optimization that attempts to use a filter to satisfy a search.
         If the requested sort does not include score, then the filterCache
         will be checked for a filter matching the query. If found, the filter
         will be used as the source of document ids, and then the sort will be
         applied to that.
    <useFilterForSortedQuery>true</useFilterForSortedQuery>
    -->

    <!-- An optimization for use with the queryResultCache. When a search
         is requested, a superset of the requested number of document ids
         are collected. For example, if a search for a particular query
         requests matching documents 10 through 19, and queryWindowSize is 50,
         then documents 0 through 50 will be collected and cached. Any further
         requests in that range can be satisfied via the cache. -->
    <queryResultWindowSize>10</queryResultWindowSize>

    <!-- This entry enables an int hash representation for filters (DocSets)
         when the number of items in the set is less than maxSize. For smaller
         sets, this representation is more memory efficient, more efficient to
         iterate over, and faster to take intersections. -->
    <HashDocSet maxSize="3000" loadFactor="0.75"/>

    <!-- boolToFilterOptimizer converts boolean clauses with zero boost
         into cached filters if the number of docs selected by the clause exceeds
         the threshold (represented as a fraction of the total index) -->
    <boolTofilterOptimizer enabled="true" cacheSize="32" threshold=".05"/>

    <!-- a newSearcher event is fired whenever a new searcher is being prepared
         and there is a current searcher handling requests (aka registered). -->
    <!-- QuerySenderListener takes an array of NamedList and executes a
         local query request for each NamedList in sequence. -->
    <!--
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
        <lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
      </arr>
    </listener>
    -->

    <!-- a firstSearcher event is fired whenever a new searcher is being
         prepared but there is no current registered searcher to handle
         requests or to gain autowarming data from. -->
    <!--
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
      </arr>
    </listener>
    -->

    <!-- If a search request comes in and there is no current registered searcher,
         then immediately register the still warming searcher and use it. If
         "false" then all requests will block until the first searcher is done
         warming. -->
    <useColdSearcher>false</useColdSearcher>

    <!-- Maximum number of searchers that may be warming in the background
         concurrently. An error is returned if this limit is exceeded. Recommend
         1-2 for read-only slaves, higher for masters w/o cache warming. -->
    <maxWarmingSearchers>4</maxWarmingSearchers>

  </query>

  <!--
       Let the dispatch filter handle /select?qt=XXX
       handleSelect=true will use consistent error handling for /select and /update
       handleSelect=false will use solr1.1 style error formatting
  -->
  <requestDispatcher handleSelect="true" >
    <!-- Make sure your system has some authentication before enabling remote streaming! -->
    <requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048" />
  </requestDispatcher>
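  <!-- With handleSelect="true", a request such as /select?qt=dismax&q=wifi
       is routed by the dispatch filter to the handler registered under
       name="dismax" below (hypothetical example request). -->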

  <!-- requestHandler plugins... incoming queries will be dispatched to the
       correct handler based on the qt (query type) param matching the
       name of registered handlers.
       The "standard" request handler is the default and will be used if qt
       is not specified in the request.
  -->
  <requestHandler name="standard" class="solr.StandardRequestHandler">
    <!-- default values for query parameters -->
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <!--
      <int name="rows">10</int>
      <str name="fl">*</str>
      <str name="version">2.1</str>
      -->
    </lst>
  </requestHandler>

  <!-- DisMaxRequestHandler allows easy searching across multiple fields
       for simple user-entered phrases.
       see http://wiki.apache.org/solr/DisMaxRequestHandler
  -->
  <requestHandler name="dismax" class="solr.DisMaxRequestHandler" >
    <lst name="defaults">
      <str name="qf">contents</str>
      <!-- <str name="echoParams">explicit</str>
      <float name="tie">0.01</float>
      <str name="qf">
        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
      </str>
      <str name="pf">
        text^0.2 features^1.1 name^1.5 manu^1.4 manu_exact^1.9
      </str>
      <str name="bf">
        ord(popularity)^0.5 recip(rord(price),1,1000,1000)^0.3
      </str>
      <str name="fl">
        id,name,price,score
      </str>
      <str name="mm">
        2<-1 5<-2 6<90%
      </str>
      <int name="ps">100</int>
      <str name="q.alt">*:*</str> -->
    </lst>
  </requestHandler>

  <!-- Note how you can register the same handler multiple times with
       different names (and different init parameters)
  -->
  <requestHandler name="partitioned" class="solr.DisMaxRequestHandler" >
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="qf">text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0</str>
      <str name="mm">2<-1 5<-2 6<90%</str>
      <!-- This is an example of using Date Math to specify a constantly
           moving date range in a config...
      -->
      <str name="bq">incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2</str>
    </lst>
    <!-- In addition to defaults, "appends" params can be specified
         to identify values which should be appended to the list of
         multi-val params from the query (or the existing "defaults").

         In this example, the param "fq=instock:true" will be appended to
         any query time fq params the user may specify, as a mechanism for
         partitioning the index, independent of any user selected filtering
         that may also be desired (perhaps as a result of faceted searching).

         NOTE: there is *absolutely* nothing a client can do to prevent these
         "appends" values from being used, so don't use this mechanism
         unless you are sure you always want it.
    -->
    <lst name="appends">
      <str name="fq">inStock:true</str>
    </lst>
    <!-- "invariants" are a way of letting the Solr maintainer lock down
         the options available to Solr clients. Any params values
         specified here are used regardless of what values may be specified
         in either the query, the "defaults", or the "appends" params.

         In this example, the facet.field and facet.query params are fixed,
         limiting the facets clients can use. Faceting is not turned on by
         default - but if the client does specify facet=true in the request,
         these are the only facets they will be able to see counts for;
         regardless of what other facet.field or facet.query params they
         may specify.

         NOTE: there is *absolutely* nothing a client can do to prevent these
         "invariants" values from being used, so don't use this mechanism
         unless you are sure you always want it.
    -->
    <lst name="invariants">
      <str name="facet.field">cat</str>
      <str name="facet.field">manu_exact</str>
      <str name="facet.query">price:[* TO 500]</str>
      <str name="facet.query">price:[500 TO *]</str>
    </lst>
  </requestHandler>

  <requestHandler name="instock" class="solr.DisMaxRequestHandler" >
    <!-- for legacy reasons, DisMaxRequestHandler will assume all init
         params are "defaults" if you don't explicitly specify any defaults.
    -->
    <str name="fq">
      inStock:true
    </str>
    <str name="qf">
      text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
    </str>
    <str name="mm">
      2<-1 5<-2 6<90%
    </str>
  </requestHandler>
|
||||
|
||||
|
||||
<!-- SpellCheckerRequestHandler takes in a word (or several words) as the
|
||||
value of the "q" parameter and returns a list of alternative spelling
|
||||
suggestions. If invoked with a ...&cmd=rebuild, it will rebuild the
|
||||
spellchecker index.
|
||||
-->
|
||||
<requestHandler name="spellchecker" class="solr.SpellCheckerRequestHandler" startup="lazy">
|
||||
<!-- default values for query parameters -->
|
||||
<lst name="defaults">
|
||||
<int name="suggestionCount">1</int>
|
||||
<float name="accuracy">0.5</float>
|
||||
</lst>
|
||||
|
||||
<!-- Main init params for handler -->
|
||||
|
||||
<!-- The directory where your SpellChecker Index should live. -->
|
||||
<!-- May be absolute, or relative to the Solr "dataDir" directory. -->
|
||||
<!-- If this option is not specified, a RAM directory will be used -->
|
||||
<str name="spellcheckerIndexDir">spell</str>
|
||||
|
||||
<!-- the field in your schema that you want to be able to build -->
|
||||
<!-- your spell index on. This should be a field that uses a very -->
|
||||
<!-- simple FieldType without a lot of Analysis (ie: string) -->
|
||||
<str name="termSourceField">word</str>
|
||||
|
||||
</requestHandler>
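<!-- Editor's note: hypothetical requests, for illustration only. A query
     like /select?qt=spellchecker&q=documemt asks for spelling suggestions
     (at most suggestionCount of them, at the configured accuracy), and
     /select?qt=spellchecker&cmd=rebuild rebuilds the index from the
     termSourceField configured above. -->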


<!-- Update request handler.

     Note: Since solr1.1, requestHandlers require a valid content type header if posted in
     the body. For example, curl now requires: -H 'Content-type:text/xml; charset=utf-8'
     The response format differs from solr1.1 formatting and returns a standard error code.

     To enable solr1.1 behavior, remove the /update handler or change its path
-->
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />

<!-- CSV update handler, loaded on demand -->
<requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />


<!-- Admin Handlers. TODO? There could be a single handler that loads them all... -->
<!-- <requestHandler name="/admin/luke" class="org.apache.solr.handler.admin.LukeRequestHandler" />
<requestHandler name="/admin/system" class="org.apache.solr.handler.admin.SystemInfoHandler" />
<requestHandler name="/admin/plugins" class="org.apache.solr.handler.admin.PluginInfoHandler" />
<requestHandler name="/admin/threads" class="org.apache.solr.handler.admin.ThreadDumpHandler" />
<requestHandler name="/admin/properties" class="org.apache.solr.handler.admin.PropertiesRequestHandler" /> -->

<!-- Echo the request contents back to the client -->
<!-- <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
  <lst name="defaults">
    <str name="echoParams">explicit</str> --> <!-- for all params (including the default etc) use: 'all' -->
<!-- <str name="echoHandler">true</str>
  </lst>
</requestHandler> -->

<!-- queryResponseWriter plugins... query responses will be written using the
     writer specified by the 'wt' request parameter matching the name of a registered
     writer.
     The "standard" writer is the default and will be used if 'wt' is not specified
     in the request. XMLResponseWriter will be used if nothing is specified here.
     The json, python, and ruby writers are also available by default.

     <queryResponseWriter name="standard" class="org.apache.solr.request.XMLResponseWriter"/>
     <queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/>
     <queryResponseWriter name="python" class="org.apache.solr.request.PythonResponseWriter"/>
     <queryResponseWriter name="ruby" class="org.apache.solr.request.RubyResponseWriter"/>

     <queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
-->

<!-- XSLT response writer transforms the XML output by any xslt file found
     in Solr's conf/xslt directory. Changes to xslt files are checked for
     every xsltCacheLifetimeSeconds.
-->
<!-- <queryResponseWriter name="xslt" class="org.apache.solr.request.XSLTResponseWriter">
  <int name="xsltCacheLifetimeSeconds">5</int>
</queryResponseWriter> -->

<!-- config for the admin interface -->
<admin>
  <defaultQuery>solr</defaultQuery>
  <gettableFiles>solrconfig.xml schema.xml admin-extra.html</gettableFiles>
  <!-- pingQuery should be "URLish" ...
       & separated key=val pairs ... but there shouldn't be any
       URL escaping of the values -->
  <pingQuery>
    qt=standard&amp;q=solrpingquery
  </pingQuery>
  <!-- configure a healthcheck file for servers behind a loadbalancer
  <healthcheck type="file">server-enabled</healthcheck>
  -->
</admin>

</config>
@@ -338,8 +338,6 @@ if [ ! -f /etc/cron.d/reddit ]; then

# disabled by default, uncomment if you need these jobs
#*/2 * * * * root /sbin/start --quiet reddit-job-google_checkout
-#*/10 * * * * root /sbin/start --quiet reddit-job-solrsearch optimize=False
-#0 0 * * * root /sbin/start --quiet reddit-job-solrsearch optimize=True
#0 0 * * * root /sbin/start --quiet reddit-job-update_gold_users
CRON
fi

@@ -354,12 +354,6 @@ png_optimizer = /usr/bin/env optipng
# jpeg compressor
jpeg_optimizer =

-# -- search --
-# where is solr?
-solr_url =
-# how long do we cache search results (in seconds)
-solr_cache_time = 300
-
# Just a list of words. Used by errlog.py to make up names for new errors.
words_file = /usr/dict/words

@@ -40,8 +40,6 @@ from r2.lib.db.tdb_cassandra import MultiColumnQuery
from r2.lib.strings import strings
from r2.lib.search import (SearchQuery, SubredditSearchQuery, SearchException,
                           InvalidQuery)
-from r2.lib.solrsearch import RelatedSearchQuery
-from r2.lib.contrib.pysolr import SolrError
from r2.lib import jsontemplates
from r2.lib import sup
import r2.lib.db.thing as thing
@@ -788,7 +786,7 @@ class FrontController(RedditController):
        # computed after fetch_more
        try:
            res = listing.listing()
-       except SearchException + (SolrError, socket.error) as e:
+       except SearchException + (socket.error,) as e:
            return self.search_fail(e)
        timing = time_module.time() - builder.start_time


@@ -36,7 +36,6 @@ from r2.lib.db.thing import Query, Merge, Relations
from r2.lib.db import queries
from r2.lib.strings import Score
from r2.lib import organic
-import r2.lib.solrsearch as solrsearch
import r2.lib.search as search
from r2.lib.utils import iters, check_cheating, timeago
from r2.lib.utils.trial_utils import populate_spotlight
@@ -130,7 +129,7 @@ class ListingController(RedditController):
            builder_cls = self.builder_cls
        elif isinstance(self.query_obj, Query):
            builder_cls = QueryBuilder
-       elif isinstance(self.query_obj, (solrsearch.SearchQuery, search.SearchQuery)):
+       elif isinstance(self.query_obj, search.SearchQuery):
            builder_cls = SearchBuilder
        elif isinstance(self.query_obj, iters):
            builder_cls = IDBuilder

@@ -931,17 +931,8 @@ class RedditController(MinimalController):
            abort(304, 'not modified')

    def search_fail(self, exception):
-       from r2.lib.contrib.pysolr import SolrError
        from r2.lib.search import SearchException
-       if isinstance(exception, SolrError):
-           errmsg = "SolrError: %r" % exception
-
-           if (str(exception) == 'None'):
-               # Production error logs only get non-None errors
-               g.log.debug(errmsg)
-           else:
-               g.log.error(errmsg)
-       elif isinstance(exception, SearchException + (socket.error,)):
+       if isinstance(exception, SearchException + (socket.error,)):
            g.log.error("Search Error: %s" % repr(exception))

        errpage = pages.RedditError(_("search failed"),

@@ -47,7 +47,6 @@ class Globals(object):
                   'db_pool_size',
                   'db_pool_overflow_size',
                   'page_cache_time',
-                  'solr_cache_time',
                   'num_mc_clients',
                   'MIN_DOWN_LINK',
                   'MIN_UP_KARMA',

@@ -1,347 +0,0 @@
# -*- coding: utf-8 -*-
"""
All we need to create a Solr connection is a url.

>>> conn = Solr('http://127.0.0.1:8983/solr/')

First, completely clear the index.

>>> conn.delete(q='*:*')

For now, we can only index python dictionaries. Each key in the dictionary
will correspond to a field in Solr.

>>> docs = [
...     {'id': 'testdoc.1', 'order_i': 1, 'name': 'document 1', 'text': u'Paul Verlaine'},
...     {'id': 'testdoc.2', 'order_i': 2, 'name': 'document 2', 'text': u'Владимир Маякoвский'},
...     {'id': 'testdoc.3', 'order_i': 3, 'name': 'document 3', 'text': u'test'},
...     {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'test'}
... ]


We can add documents to the index by passing a list of docs to the connection's
add method.

>>> conn.add(docs)

>>> results = conn.search('Verlaine')
>>> len(results)
1

>>> results = conn.search(u'Владимир')
>>> len(results)
1


Simple tests for searching. We can optionally sort the results using Solr's
sort syntax, that is, the field name and either asc or desc.

>>> results = conn.search('test', sort='order_i asc')
>>> for result in results:
...     print result['name']
document 3
document 4

>>> results = conn.search('test', sort='order_i desc')
>>> for result in results:
...     print result['name']
document 4
document 3


To update documents, we just use the add method.

>>> docs = [
...     {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'blah'}
... ]
>>> conn.add(docs)

>>> len(conn.search('blah'))
1
>>> len(conn.search('test'))
1


We can delete documents from the index by id, or by supplying a query.

>>> conn.delete(id='testdoc.1')
>>> conn.delete(q='name:"document 2"')

>>> results = conn.search('Verlaine')
>>> len(results)
0


Docs can also have multiple values for any particular key. This lets us use
Solr's multiValue fields.

>>> docs = [
...     {'id': 'testdoc.5', 'cat': ['poetry', 'science'], 'name': 'document 5', 'text': u''},
...     {'id': 'testdoc.6', 'cat': ['science-fiction',], 'name': 'document 6', 'text': u''},
... ]

>>> conn.add(docs)
>>> results = conn.search('cat:"poetry"')
>>> for result in results:
...     print result['name']
document 5

>>> results = conn.search('cat:"science-fiction"')
>>> for result in results:
...     print result['name']
document 6

>>> results = conn.search('cat:"science"')
>>> for result in results:
...     print result['name']
document 5

NOTE: PySolr is an open-source Python module
<http://code.google.com/p/pysolr/> that falls under the New BSD
Licence <http://www.opensource.org/licenses/bsd-license.php>, NOT the
licence covering the rest of Reddit. Reddit's modifications to this
module also fall under the New BSD Licence. The New BSD Licence
requires that re-distributions of the source, modified or not, display
the original copyright notice, but PySolr does not, as of import-time,
display a copyright notice or licence, except on its Google Code
information page. Therefore for licencing information, I point you to
PySolr's Google Code information page, URL above.

"""

# TODO: unicode support is pretty sloppy. define it better.

from httplib import HTTPConnection
from urllib import urlencode
from urlparse import urlsplit
from datetime import datetime, date
from time import strptime, strftime
from r2.lib.utils import unicode_safe
try:
    # for python 2.5
    from xml.etree import ElementTree
    from xml.parsers.expat import ExpatError
except ImportError:
    from elementtree import ElementTree, ExpatError

__all__ = ['Solr']

class SolrError(Exception):
    pass

class Results(object):
    def __init__(self, docs, hits):
        self.docs = docs
        self.hits = hits

    def __len__(self):
        return len(self.docs)

    def __iter__(self):
        return iter(self.docs)

    def __getitem__(self, x):
        return self.docs[x]

class Solr(object):
    def __init__(self, url):
        self.url = url
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.split(':')
        self.host = netloc[0]
        if len(netloc) == 1:
            self.host, self.port = netloc[0], None
        else:
            self.host, self.port = netloc
        self.path = path.rstrip('/')
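        # Editor's note (illustrative, not in the original source): for the
        # doctest URL above, urlsplit yields netloc '127.0.0.1:8983' and path
        # '/solr/', so this leaves host='127.0.0.1', port='8983' and
        # path='/solr' (trailing slash stripped).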

    def _select(self, params):
        # encode the query as utf-8 so urlencode can handle it
        params['q'] = unicode_safe(params['q'])
        path = '%s/select/?%s' % (self.path, urlencode(params))
        conn = HTTPConnection(self.host, self.port)
        conn.request('GET', path)
        return conn.getresponse()

    def _update(self, message):
        """
        Posts the given xml message to http://<host>:<port>/solr/update and
        returns the result.
        """
        path = '%s/update/' % self.path
        conn = HTTPConnection(self.host, self.port)
        conn.request('POST', path, message, {'Content-type': 'text/xml'})
        return conn.getresponse()

    def _extract_error(self, response):
        """
        Extract the actual error message from a solr response. Unfortunately,
        this means scraping the html.
        """
        try:
            et = ElementTree.parse(response)
            error = et.findtext('body/pre')
            return error
        except ExpatError, e:
            return "%s: %s (%d/%s)" % (e, response.read(), response.status, response.reason)

    # Converters #############################################################

    @staticmethod
    def _from_python(value):
        """
        Converts python values to a form suitable for insertion into the xml
        we send to solr.
        """
        if isinstance(value, datetime):
            value = value.strftime('%Y-%m-%dT%H:%M:%S.000Z')
        elif isinstance(value, date):
            value = value.strftime('%Y-%m-%dT00:00:00.000Z')
        elif isinstance(value, bool):
            if value:
                value = 'true'
            else:
                value = 'false'
        else:
            value = unicode_safe(value)
        return value

    def bool_to_python(self, value):
        """
        Convert a 'bool' field from solr's xml format to python and return it.
        """
        if value == 'true':
            return True
        elif value == 'false':
            return False

    def str_to_python(self, value):
        """
        Convert an 'str' field from solr's xml format to python and return it.
        """
        return unicode_safe(value)

    def int_to_python(self, value):
        """
        Convert an 'int' field from solr's xml format to python and return it.
        """
        return int(value)

    def date_to_python(self, value):
        """
        Convert a 'date' field from solr's xml format to python and return it.
        """
        # this throws away fractions of a second
        return datetime(*strptime(value[:-5], "%Y-%m-%dT%H:%M:%S")[0:6])

    # API Methods ############################################################

    def search(self, q, sort=None, start=0, rows=20, other_params = {}):
        """Performs a search and returns the results."""
        params = {'q': q, 'start': start, 'rows': rows}

        for x, y in other_params.iteritems():
            params[x] = y
        if sort:
            params['sort'] = sort

        response = self._select(params)
        if response.status != 200:
            raise SolrError(self._extract_error(response))

        # TODO: make result retrieval lazy and allow custom result objects
        # also, this has become rather ugly and definitely needs some cleanup.
        et = ElementTree.parse(response)
        result = et.find('result')
        hits = int(result.get('numFound'))
        docs = result.findall('doc')
        results = []
        for doc in docs:
            result = {}
            for element in doc.getchildren():
                if element.tag == 'arr':
                    result_val = []
                    for array_element in element.getchildren():
                        converter_name = '%s_to_python' % array_element.tag
                        converter = getattr(self, converter_name)
                        result_val.append(converter(array_element.text))
                else:
                    converter_name = '%s_to_python' % element.tag
                    converter = getattr(self, converter_name)
                    result_val = converter(element.text)
                result[element.get('name')] = result_val
            results.append(result)
        return Results(results, hits)

    def add(self, docs, commit=False):
        """Adds or updates documents. For now, docs is a list of dictionaries
        where each key is the field name and each value is the value to index.
        """
        message = ElementTree.Element('add')
        for doc in docs:
            message.append(doc_to_elemtree(doc))
        m = ElementTree.tostring(message)
        response = self._update(m)
        if response.status != 200:
            raise SolrError(self._extract_error(response))
        # TODO: Supposedly, we can put a <commit /> element in the same post body
        # as the add element. That isn't working for some reason, and it would save us
        # an extra trip to the server. This works for now.
        if commit:
            self.commit()

    def delete(self, id=None, q=None, commit=False):
        """Deletes documents."""
        if id is None and q is None:
            raise ValueError('You must specify "id" or "q".')
        elif id is not None and q is not None:
            raise ValueError('You may only specify "id" OR "q", not both.')
        elif id is not None:
            m = '<delete><id>%s</id></delete>' % id
        elif q is not None:
            m = '<delete><query>%s</query></delete>' % q
        response = self._update(m)
        if response.status != 200:
            raise SolrError(self._extract_error(response))
        # TODO: Supposedly, we can put a <commit /> element in the same post body
        # as the delete element. That isn't working for some reason, and it would save us
        # an extra trip to the server. This works for now.
        if commit:
            self.commit()

    def commit(self):
        response = self._update('<commit />')
        if response.status != 200:
            raise SolrError(self._extract_error(response))

    def optimize(self):
        response = self._update('<optimize />')
        if response.status != 200:
            raise SolrError(self._extract_error(response))

solr_magic_fields = ('boost',)
def doc_to_elemtree(doc):
    d = ElementTree.Element('doc')
    for key, value in doc.iteritems():

        if key in solr_magic_fields:
            # handle special fields that are attributes, not fields
            d.set(key, Solr._from_python(value))
        elif (not isinstance(value, str)) and hasattr(value, '__iter__'):
            # handle lists, tuples, and other iterables
            for v in value:
                f = ElementTree.Element('field', name=key)
                f.text = Solr._from_python(v)
                d.append(f)
        # handle strings and unicode
        else:
            f = ElementTree.Element('field', name=key)
            f.text = Solr._from_python(value)
            d.append(f)

    return d
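
# Editor's note: an illustrative sketch, not in the original module, of what
# doc_to_elemtree builds. For a doc like {'id': 'testdoc.1', 'boost': 2},
# ElementTree.tostring(doc_to_elemtree(doc)) would give (modulo attribute
# ordering):
#
#   <doc boost="2"><field name="id">testdoc.1</field></doc>
#
# 'boost' is in solr_magic_fields, so it becomes an attribute on <doc>
# rather than a <field> element.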


if __name__ == "__main__":
    import doctest
    doctest.testmod()
@@ -5,7 +5,6 @@ from r2.lib.db.operators import asc, desc, timeago
from r2.lib.db.sorts import epoch_seconds
from r2.lib.utils import fetch_things2, tup, UniqueIterator, set_last_modified
from r2.lib import utils
-from r2.lib.solrsearch import DomainSearchQuery
from r2.lib import amqp, sup, filters
from r2.lib.comment_tree import add_comments, update_comment_votes
from r2.models.query_cache import (cached_query, merged_cached_query,
@@ -39,12 +38,6 @@ def db_sort(sort):
    cls, col = db_sorts[sort]
    return cls(col)

-search_sort = dict(hot = 'hot desc',
-                   new = 'date desc',
-                   top = 'points desc',
-                   controversial = 'controversy desc',
-                   old = 'date asc')
-
db_times = dict(all = None,
                hour = Thing.c._date >= timeago('1 hour'),
                day = Thing.c._date >= timeago('1 day'),
@@ -458,9 +451,6 @@ def get_modqueue(sr):
    q.append(get_spam_filtered_comments(sr))
    return q

-def get_domain_links_old(domain, sort, time):
-    return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)
-
def get_domain_links(domain, sort, time):
    from r2.lib.db import operators
    q = Link._query(operators.domain(Link.c.url) == filters._force_utf8(domain),

@@ -1,692 +0,0 @@
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2010
# CondeNet, Inc. All Rights Reserved.
################################################################################
"""
Module for reddit-level communication with Solr. Contains functions
for indexing (`reindex_all`, `run_changed`) and searching (the
`SearchQuery` classes below). Uses pysolr (placed in r2.lib.contrib)
for lower-level communication with Solr
"""

from __future__ import with_statement

from copy import deepcopy
from Queue import Queue
from threading import Thread
import time
from datetime import datetime, date
from time import strftime

from pylons import g, config

from r2.models import *
from r2.lib.contrib import pysolr
from r2.lib.contrib.pysolr import SolrError
from r2.lib.memoize import memoize
from r2.lib.utils import timeago, UrlParser
from r2.lib.utils import unicode_safe, tup, get_after, strordict_fullname
from r2.lib.cache import SelfEmptyingCache
from r2.lib import amqp

solr_cache_time = g.solr_cache_time

## Changes to the list of searchable languages will require changes to
## Solr's configuration (specifically, the fields that are searched)
searchable_langs = set(['dk','nl','en','fi','fr','de','it','no','nn','pt',
                        'ru','es','sv','zh','ja','ko','cs','el','th'])

## Adding types is a matter of adding the class to indexed_types here,
## adding the fields from that type to search_fields below, and adding
## those fields to Solr's configuration
indexed_types = (Subreddit, Link)


class Field(object):
    """
    Describes a field of a Thing that is searchable by Solr. Used
    by `search_fields` below
    """
    def __init__(self, name, thing_attr_func = None, store = True,
                 tokenize=False, is_number=False, reverse=False,
                 is_date = False):
        self.name = name
        self.thing_attr_func = self.make_extractor(thing_attr_func)

    def make_extractor(self, thing_attr_func):
        if not thing_attr_func:
            return self.make_extractor(self.name)
        elif isinstance(thing_attr_func, str):
            return (lambda x: getattr(x, thing_attr_func))
        else:
            return thing_attr_func

    def extract_from(self, thing):
        return self.thing_attr_func(thing)

class ThingField(Field):
    """
    ThingField('field_name',Author,'author_id','name')
    is like:
    Field(name, lambda x: Author._byID(x.author_id,data=True).name)
    but faster because lookups are done in batch
    """
    def __init__(self, name, cls, id_attr, lu_attr_name):
        self.name = name

        self.cls = cls # the class of the looked-up object
        self.id_attr = id_attr # the attr of the source obj used to find the dest obj
        self.lu_attr_name = lu_attr_name # the attr of the dest class that we want to return

    def __str__(self):
        return ("<ThingField: (%s,%s,%s,%s)>"
                % (self.name, self.cls, self.id_attr, self.lu_attr_name))

# Describes the fields of Thing objects and subclasses that are passed
# to Solr for indexing. All must have a 'contents' field, since that
# will be used for language-agnostic searching, and will be copied
# into contents_en, contents_eo, etc. (see `tokenize_things` for a
# discussion of multi-language search). The 'boost' field is a
# solr-magic field that ends up being an attribute on the <doc>
# message (rather than a field), and is used to do an index-time boost
# (this magic is done in pysolr.doc_to_elemtree)
search_fields={Thing: (Field('fullname', '_fullname'),
                       Field('date', '_date', is_date = True, reverse=True),
                       Field('lang'),
                       Field('ups', '_ups', is_number=True, reverse=True),
                       Field('downs', '_downs', is_number=True, reverse=True),
                       Field('spam','_spam'),
                       Field('deleted','_deleted'),
                       Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
                       Field('controversy', '_controversy', is_number=True, reverse=True),
                       Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
               Subreddit: (Field('contents',
                                 lambda s: ' '.join([unicode_safe(s.name),
                                                     unicode_safe(s.title),
                                                     unicode_safe(s.description),
                                                     unicode_safe(s.firsttext)]),
                                 tokenize = True),
                           Field('boost', '_downs'),
                           #Field('title'),
                           #Field('firsttext'),
                           #Field('description'),
                           #Field('over_18'),
                           #Field('sr_type','type'),
                           ),
               Link: (Field('contents','title', tokenize = True),
                      Field('boost', lambda t: int(t._hot*1000),
                            # yes, it's a copy of 'hot'
                            is_number=True, reverse=True),
                      Field('author_id'),
                      ThingField('author',Account,'author_id','name'),
                      ThingField('subreddit',Subreddit,'sr_id','name'),
                      #ThingField('reddit',Subreddit,'sr_id','name'),
                      Field('sr_id'),
                      Field('url', tokenize = True),
                      #Field('domain',
                      #      lambda l: UrlParser(l.url).domain_permutations()),
                      Field('site',
                            lambda l: UrlParser(l.url).domain_permutations()),
                      #Field('is_self','is_self'),
                      ),
               Comment: (Field('contents', 'body', tokenize = True),
                         Field('boost', lambda t: int(t._hot*1000),
                               # yes, it's a copy of 'hot'
                               is_number=True, reverse=True),
                         ThingField('author',Account,'author_id','name'),
                         ThingField('subreddit',Subreddit,'sr_id','name'))}
                         #ThingField('reddit',Subreddit,'sr_id','name'))}

def strip_control_characters(text):
    if not isinstance(text, basestring):
        return text
    return ''.join((c for c in text if ord(c) >= 0x20))

def tokenize_things(things, return_dict=False):
    """
    Here, we take a list of things, and return a list of
    dictionaries of fields, which will be sent to Solr. We take
    the `search_fields` dictionary above, and look for all classes
    for which each Thing is an instance (that is, a Comment will
    pick up fields for Thing as well as Comment), and extract the
    given fields. All tokenised Things are expected to have a
    'contents' attribute. That field is then copied to
    contents_XX, where XX is your two-letter language code, which
    becomes your default search field. Those language-specific
    fields are also set up with the proper language-stemming and
    tokenisers on Solr's end (in config/schema.xml), which allows
    for language-specific searching
    """
    global search_fields

    batched_classes = {}
    ret = {}
    for thing in things:
        try:
            t = {'type': []}
            for cls in ((thing.__class__,) + thing.__class__.__bases__):
                t['type'].append(cls.__name__.lower())

                if cls in search_fields:
                    for field in search_fields[cls]:
                        if field.__class__ == Field:
                            try:
                                val = field.extract_from(thing)
                                val = strip_control_characters(val)
                                if val != None and val != '':
                                    t[field.name] = val
                            except AttributeError, e:
                                print e

                        elif field.__class__ == ThingField:
                            if not field.cls in batched_classes:
                                batched_classes[field.cls] = []
                            batched_classes[field.cls].append((thing, field))

            # copy 'contents' to ('contents_%s' % lang) and contents_ws
            t[lang_to_fieldname(thing.lang)] = t['contents']
            t['contents_ws'] = t['contents']

            ret[thing._fullname] = t
        except AttributeError, e:
            print e
        except KeyError, e:
            print e

    # batched_classes should now be a {cls: [(Thing,ThingField)]}.
    # This ugliness is to make it possible to batch Thing lookups, as
    # they were accounting for most of the indexing time
    for cls in batched_classes:
        ids = set()
        for (thing, field) in batched_classes[cls]:
            # extract the IDs
            try:
                id = getattr(thing, field.id_attr)
                ids.add(id)
            except AttributeError, e:
                print e
        found_batch = cls._byID(ids, data=True, return_dict=True)

        for (thing, field) in batched_classes[cls]:
            try:
                id = getattr(thing, field.id_attr)
                ret[thing._fullname][field.name] = strip_control_characters(
                    getattr(found_batch[id], field.lu_attr_name))
            except AttributeError, e:
                print e
            except KeyError, e:
                print e

    return ret if return_dict else ret.values()
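
# Editor's note: an illustrative sketch, not in the original module, of the
# dictionary shape tokenize_things produces for an English-language Link
# (field names per search_fields above; the fullname value is hypothetical):
#
#   {'fullname': 't3_abc123',
#    'type': ['link', 'thing', ...],   # the class plus its base classes
#    'contents': 'the link title',
#    'contents_en': 'the link title',  # language-specific copy for stemming
#    'contents_ws': 'the link title',  # whitespace-tokenised copy
#    'author': 'someuser',             # looked up in batch via ThingField
#    ...}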

def lang_to_fieldname(l):
    """
    Returns the field-name for the given language, or `contents`
    if it isn't found
    """
    global searchable_langs

    code = l[:2]

    if code in searchable_langs:
        return ("contents_%s" % code)
    else:
        return "contents"

def tokenize(thing):
    return tokenize_things([thing])

def index_things(s=None, things=[]):
    "Sends the given Things to Solr to be indexed"
    tokenized = tokenize_things(things)

    if s:
        s.add(tokenized)
    else:
        with SolrConnection(commit=True) as s:
            s.add(tokenized)

def fetch_batches(t_class, size, since, until):
    """
    Convenience function to fetch all Things of class t_class with
    _date from `since` to `until`, returning them in batches of
    `size`. TODO: move to lib/utils, and merge to be the backend
    of `fetch_things`
    """
    q = t_class._query(t_class.c._date >= since,
                       t_class.c._spam == (True, False),
                       t_class.c._deleted == (True, False),
                       t_class.c._date < until,
                       sort = desc('_date'),
                       limit = size,
                       data = True)
    orig_rules = deepcopy(q._rules)

    things = list(q)
    while things:
        yield things

        q._rules = deepcopy(orig_rules)
        q._after(things[len(things)-1])
        things = list(q)

solr_queue = Queue()
for i in range(20):
    solr_queue.put(pysolr.Solr(g.solr_url))
class SolrConnection(object):
    """
    Represents a connection to Solr, properly limited to N
    concurrent connections. Used like

        with SolrConnection() as s:
            s.add(things)
    """
    def __init__(self, commit=False, optimize=False):
        self.commit = commit
        self.optimize = optimize
    def __enter__(self):
        self.conn = solr_queue.get()
        return self.conn
    def __exit__(self, _type, _value, _tb):
        if self.commit:
            self.conn.commit()
        if self.optimize:
            self.conn.optimize()
        solr_queue.task_done()
        solr_queue.put(self.conn)

def indexer_worker(q, delete_all_first=False):
    """
    The thread for mass-indexing that connects to Solr and submits
    tokenised objects
    """
    with SolrConnection(commit=True, optimize=True) as s:
        count = 0

        if delete_all_first:
            s.delete(q='*:*')

        t = q.get()
        while t != "done":
            # if it's not a list or a dictionary, I don't know how to
            # handle it, so die. It's probably an exception pushed in
            # by the handler in my parent
            if not (isinstance(t, list) and isinstance(t[0], dict)):
                raise t
            count += len(t)
            s.add(t)
            if count > 25000:
                print "Committing... (q:%d)" % (q.qsize(),)
                s.commit()
                count = 0
            q.task_done()

            t = q.get()
        q.task_done()

def reindex_all(types = None, delete_all_first=False):
    """
    Called from `paster run` to totally re-index everything in the
    database. Spawns a thread to connect to Solr, and sends it
    tokenised Things
    """
    global indexed_types

    start_t = datetime.now()

    if not types:
        types = indexed_types

    # We don't want the default thread-local cache (which is just a
    # dict) to grow un-bounded (normally, we'd use
    # utils.set_emptying_cache, except that that preserves memcached,
    # and we don't even want to get memcached for total indexing,
    # because it would dump out more recent stuff)
    g.cache.caches = (SelfEmptyingCache(),) # + g.cache.caches[1:]

    count = 0
    q = Queue(100)
    indexer = Thread(target=indexer_worker,
                     args=(q, delete_all_first))
    indexer.start()

    try:
        for cls in types:
            for batch in fetch_batches(cls, 1000,
                                       timeago("50 years"),
                                       start_t):
                r = tokenize_things([ x for x in batch
                                      if not x._spam and not x._deleted ])

                count += len(r)
                print ("Processing %s #%d(%s): %s"
                       % (cls.__name__, count, q.qsize(), r[0]['contents']))

                if indexer.isAlive():
                    q.put(r)
                else:
                    raise Exception("'tis a shame that I have but one thread to give")
        q.put("done")
        indexer.join()

    except object, e:
        if indexer.isAlive():
            q.put(e, timeout=30)
        raise e
    except KeyboardInterrupt, e: # turns out KeyboardInterrupts aren't objects. Who knew?
        if indexer.isAlive():
            q.put(e, timeout=30)
        raise e


def combine_searchterms(terms):
    """
    Convenience function to take a list like
        [ sr_id:1, sr_id:2, sr_id:3, subreddit:reddit.com ]
    and turn it into
        sr_id:(1 2 3) OR subreddit:reddit.com
    """
    combined = {}

    for (name, val) in terms:
        combined[name] = combined.get(name, []) + [val]

    ret = []

    for (name, vals) in combined.iteritems():
        if len(vals) == 1:
            ret.append("%s:%s" % (name, vals[0]))
        else:
            ret.append("%s:(%s)" % (name, " ".join(vals)))

    if len(ret) > 1:
        ret = "(%s)" % " OR ".join(ret)
    else:
        ret = " ".join(ret)

    return ret

def swap_strings(s, this, that):
    """
    Just swaps substrings, like:
        s = "hot asc"
        s = swap_strings(s, 'asc', 'desc')
        s == "hot desc"

    uses 'tmp' as a replacement string, so don't use for anything
    very complicated
    """
    return s.replace(this, 'tmp').replace(that, this).replace('tmp', that)

class SearchQuery(object):
    def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
                 types = [], timerange = None, spam = False, deleted = False):

        self.q = q
        self.fields = fields
        self.sort = sort
        self.subreddits = subreddits
        self.authors = authors
        self.types = types
        self.spam = spam
        self.deleted = deleted

        if timerange in ['day','month','year']:
            self.timerange = ('NOW-1%s/HOUR' % timerange.upper(), "NOW")
        elif timerange == 'week':
            self.timerange = ('NOW-7DAY/HOUR', "NOW")
        elif timerange == 'hour':
            self.timerange = ('NOW-1HOUR/MINUTE', "NOW")
        elif timerange == 'all' or timerange is None:
            self.timerange = None
        else:
            self.timerange = timerange
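        # Editor's note (illustrative gloss, not in the original source):
        # these tuples are Solr date-math expressions; 'NOW-7DAY/HOUR', for
        # example, means "seven days ago, rounded down to the hour". The
        # rounding keeps the generated query string stable between requests,
        # which makes the memoized search cache below more effective.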

    def __repr__(self):
        attrs = [ "***q=%s***" % self.q ]

        if self.subreddits is not None:
            attrs.append("srs=" + '+'.join([ "%d" % s
                                             for s in self.subreddits ]))

        if self.authors is not None:
            attrs.append("authors=" + '+'.join([ "%d" % s
                                                 for s in self.authors ]))

        if self.timerange is not None:
            attrs.append("timerange=%s" % str(self.timerange))

        if self.sort is not None:
            attrs.append("sort=%r" % self.sort)

        return "<%s(%s)>" % (self.__class__.__name__, ", ".join(attrs))

    def run(self, after = None, num = 1000, reverse = False,
            _update = False):
        if not self.q:
            return pysolr.Results([], 0)

        if not g.solr_url:
            raise SolrError("g.solr_url is not set")

        # there are two parts to our query: what the user typed
        # (parsed with Solr's DisMax parser), and what we are adding
        # to it. The latter is called the "boost" (and is parsed using
        # full Lucene syntax), and it can be added to via the `boost`
        # parameter
        boost = []

        if not self.spam:
            boost.append("-spam:true")
        if not self.deleted:
            boost.append("-deleted:true")

        if self.timerange:
            def time_to_searchstr(t):
                if isinstance(t, datetime):
                    t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
                elif isinstance(t, date):
                    t = t.strftime('%Y-%m-%dT00:00:00.000Z')
                elif isinstance(t, str):
                    t = t
                return t

            (fromtime, totime) = self.timerange
            fromtime = time_to_searchstr(fromtime)
            totime = time_to_searchstr(totime)
            boost.append("+date:[%s TO %s]"
                         % (fromtime, totime))

        if self.subreddits:
            def subreddit_to_searchstr(sr):
                if isinstance(sr, Subreddit):
                    return ('sr_id', '%d' % sr.id)
                elif isinstance(sr, str) or isinstance(sr, unicode):
                    return ('subreddit', sr)
                else:
                    return ('sr_id', '%d' % sr)

            s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))

            boost.append("+(%s)" % combine_searchterms(s_subreddits))

        if self.authors:
            def author_to_searchstr(a):
                if isinstance(a, Account):
                    return ('author_id', '%d' % a.id)
                elif isinstance(a, str) or isinstance(a, unicode):
                    return ('author', a)
                else:
                    return ('author_id', '%d' % a)

            s_authors = map(author_to_searchstr, tup(self.authors))

            boost.append('+(%s)^2' % combine_searchterms(s_authors))


        def type_to_searchstr(t):
            if isinstance(t, str):
                return ('type', t)
            else:
                return ('type', t.__name__.lower())

        s_types = map(type_to_searchstr, self.types)
        boost.append("+%s" % combine_searchterms(s_types))

        q, solr_params = self.solr_params(self.q, boost)

        search = self.run_search(q, self.sort, solr_params,
                                 reverse, after, num,
                                 _update = _update)
        return search

    @classmethod
    def run_search(cls, q, sort, solr_params, reverse, after, num,
                   _update = False):
        "returns pysolr.Results(docs=[fullname()],hits=int())"

        if reverse:
            sort = swap_strings(sort, 'asc', 'desc')
        after = after._fullname if after else None

        search = cls.run_search_cached(q, sort, 0, num, solr_params,
                                       _update = _update)
        search.docs = get_after(search.docs, after, num)

        return search

    @staticmethod
    @memoize('solr_search', solr_cache_time)
    def run_search_cached(q, sort, start, rows, other_params):
        with SolrConnection() as s:
            g.log.debug(("Searching q = %r; sort = %r,"
                         + " start = %r, rows = %r,"
                         + " params = %r")
                        % (q, sort, start, rows, other_params))

            res = s.search(q, sort, start = start, rows = rows,
                           other_params = other_params)

            # extract out the fullname in the 'docs' field, since that's
            # all we care about
            res = pysolr.Results(docs = [ i['fullname'] for i in res.docs ],
                                 hits = res.hits)

            return res

    def solr_params(self, *k, **kw):
        raise NotImplementedError

class UserSearchQuery(SearchQuery):
    "Base class for queries that use the dismax parser"
    def __init__(self, q, mm, sort=None, fields=[], langs=None, **kw):
        default_fields = ['contents^1.5', 'contents_ws^3'] + fields

        if langs is None:
            fields = default_fields
        else:
            if langs == 'all':
                langs = searchable_langs
            fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
                         + default_fields)

        # minimum match. See http://lucene.apache.org/solr/api/org/apache/solr/util/doc-files/min-should-match.html
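        # Editor's note (illustrative, not in the original source): e.g. the
        # '4<75%' spec used by LinkSearchQuery below means that queries with
        # up to four optional clauses require all of them to match, while
        # longer queries only require 75% of the clauses.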
        self.mm = mm

        SearchQuery.__init__(self, q, sort, fields = fields, **kw)

    def solr_params(self, q, boost):
        return q, dict(fl = 'fullname',
                       qt = 'dismax',
                       bq = ' '.join(boost),
                       qf = ' '.join(self.fields),
                       mm = self.mm)

class LinkSearchQuery(UserSearchQuery):
    def __init__(self, q, mm = None, **kw):
        additional_fields = ['site^1', 'author^1', 'subreddit^1', 'url^1']

        if mm is None:
            mm = '4<75%'

        UserSearchQuery.__init__(self, q, mm = mm, fields = additional_fields,
                                 types=[Link], **kw)

class RelatedSearchQuery(LinkSearchQuery):
    def __init__(self, q, ignore = [], **kw):
        self.ignore = set(ignore) if ignore else set()

        LinkSearchQuery.__init__(self, q, mm = '3<100% 5<60% 8<50%', **kw)

    def run(self, *k, **kw):
        search = LinkSearchQuery.run(self, *k, **kw)
        search.docs = [ x for x in search.docs if x not in self.ignore ]
        return search

class SubredditSearchQuery(UserSearchQuery):
    def __init__(self, q, **kw):
        # note that 'downs' is a measure of activity on subreddits
        UserSearchQuery.__init__(self, q, mm = '75%', sort = 'downs desc',
                                 types=[Subreddit], **kw)

class DomainSearchQuery(SearchQuery):
    def __init__(self, domain, **kw):
        q = '+site:%s' % domain

        SearchQuery.__init__(self, q = q, fields=['site'], types=[Link], **kw)

    def solr_params(self, q, boost):
        q = q + ' ' + ' '.join(boost)
        return q, dict(fl='fullname',
                       qt='standard')

def run_commit(optimize=False):
    with SolrConnection(commit=True, optimize=optimize) as s:
        pass


def run_changed(drain=False):
    """
    Run by `cron` (through `paster run`) on a schedule to update
    all Things that have been created or have changed since the
    last run. Note: unlike many queue-using functions, this one is
    run from cron and totally drains the queue before terminating
    """
    @g.stats.amqp_processor('solrsearch_changes')
    def _run_changed(msgs, chan):
        print "changed: Processing %d items" % len(msgs)
        msgs = [strordict_fullname(msg.body)
                for msg in msgs]
        fullnames = set(msg['fullname'] for msg in msgs if not msg.get('boost_only'))

        things = Thing._by_fullname(fullnames, data=True, return_dict=False)
        things = [x for x in things if isinstance(x, indexed_types)]

        update_things = [x for x in things if not x._spam and not x._deleted]
        delete_things = [x for x in things if x._spam or x._deleted]

        with SolrConnection() as s:
            if update_things:
                tokenized = tokenize_things(update_things)
                s.add(tokenized)
            if delete_things:
                for i in delete_things:
                    s.delete(id=i._fullname)

    amqp.handle_items('solrsearch_changes', _run_changed, limit=1000,
                      drain=drain)
@@ -1,13 +0,0 @@
description "commit/optimize solr index"

instance $optimize

manual
task

nice 10

script
    . /etc/default/reddit
    wrap-job paster run $REDDIT_INI -c "from r2.lib import solrsearch; solrsearch.run_commit(optimize=$optimize)"
end script