Remove references to deprecated Solr index

Keith Mitchell
2012-06-15 16:37:16 -07:00
parent 3a6f9e6011
commit 973005d2df
13 changed files with 3 additions and 2393 deletions

View File

@@ -1,456 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.
The Original Code is Reddit.
The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.
All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->
<schema name="reddit" version="1.1">
<types>
<!-- field type definitions. The "name" attribute is
just a label to be used by field definitions. The "class"
attribute and any other attributes determine the real
behavior of the fieldType.
Class names starting with "solr" refer to java classes in the
org.apache.solr.analysis package.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as strings.
- If sortMissingLast="true", then a sort on this field will cause documents
without the field to come after documents with the field,
regardless of the requested sort order (asc or desc).
- If sortMissingFirst="true", then a sort on this field will cause documents
without the field to come before documents with the field,
regardless of the requested sort order.
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
-->
<!-- numeric field types that store and index the text
value verbatim (and hence don't support range queries, since the
lexicographic ordering isn't equal to the numeric ordering) -->
<fieldType name="integer" class="solr.IntField" omitNorms="true"/>
<fieldType name="long" class="solr.LongField" omitNorms="true"/>
<fieldType name="float" class="solr.FloatField" omitNorms="true"/>
<fieldType name="double" class="solr.DoubleField" omitNorms="true"/>
<!-- Numeric field types that manipulate the value into
a string value that isn't human-readable in its internal form,
but with a lexicographic ordering the same as the numeric ordering,
so that range queries work correctly. -->
<fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="hotness" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="false"/>
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.
Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...
NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day
Consult the DateField javadocs for more information.
-->
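<!-- Editor's note (illustrative, not part of the original file): date math of this
form can also be used directly in range queries against fields of this type,
e.g. date:[NOW/DAY-7DAYS TO NOW] to match documents from the last week. -->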
<fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
The optional positionIncrementGap puts space between multiple fields of
this type on the same document, with the purpose of preventing false phrase
matching across fields.
For more info on customizing your analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!-- One can also specify an existing Analyzer class that has a
default constructor via the class attribute on the analyzer element -->
<!-- languages -->
<fieldtype name="text_dk" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Danish" />
</analyzer>
</fieldtype>
<fieldtype name="text_nl" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Dutch" />
</analyzer>
</fieldtype>
<fieldtype name="text_en" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" />
</analyzer>
</fieldtype>
<fieldtype name="text_fi" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Finnish" />
</analyzer>
</fieldtype>
<fieldtype name="text_fr" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="French" />
</analyzer>
</fieldtype>
<fieldtype name="text_de" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="German" />
</analyzer>
</fieldtype>
<fieldtype name="text_it" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Italian" />
</analyzer>
</fieldtype>
<fieldtype name="text_no" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian" />
</analyzer>
</fieldtype>
<fieldtype name="text_nn" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian" />
</analyzer>
</fieldtype>
<fieldtype name="text_pt" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Portuguese" />
</analyzer>
</fieldtype>
<fieldType name="text_ru" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.ru.RussianAnalyzer"/>
<filter class="solr.SnowballPorterFilterFactory" language="Russian" />
</fieldType>
<fieldtype name="text_es" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Spanish" />
</analyzer>
</fieldtype>
<fieldtype name="text_sv" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.ISOLatin1AccentFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="Swedish" />
</analyzer>
</fieldtype>
<fieldType name="text_zh" class="solr.TextField">
<tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
<analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
</fieldType>
<fieldType name="text_ja" class="solr.TextField">
<tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
<analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
</fieldType>
<fieldType name="text_ko" class="solr.TextField">
<tokenizer class="org.apache.lucene.analysis.cjk.CJKTokenizer" />
<analyzer class="org.apache.lucene.analysis.cjk.CJKAnalyzer"/>
</fieldType>
<fieldType name="text_cs" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.cz.CzechAnalyzer"/>
</fieldType>
<fieldType name="text_el" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
<fieldType name="text_th" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.th.ThaiAnalyzer"/>
</fieldType>
<!-- A text field that only splits on whitespace for exact matching of words -->
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<filter class="solr.LowerCaseFilterFactory"/>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
WordDelim parts) are removed.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<!-- <tokenizer class="solr.WhitespaceTokenizerFactory"/> -->
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.KeywordTokenizerFactory"/>
<!-- The LowerCase TokenFilter does what you expect, which can be useful
when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!-- The PatternReplaceFilter gives you the flexibility to use
Java Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string,
which may include back references to portions of the original
string matched by the pattern.
See the Java Regular Expression documentation for more
information on pattern and replacement string syntax.
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^a-z])" replacement="" replace="all"
/>
</analyzer>
</fieldType>
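<!-- Editor's note (illustrative): with the chain above, an input such as
" Foo-Bar 42 " is kept as a single token, lowercased, trimmed, and stripped
of non-letters, so it is indexed as the single sortable token "foobar". -->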
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
-->
<fieldtype name="ignored" stored="false" indexed="false" class="solr.StrField" />
</types>
<fields>
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a previously defined type from the <types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
compressed: [false] if this field should be stored using gzip compression
(this will only apply if the field type is compressable; among
the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
-->
<!-- Thing -->
<field name="fullname" type="string" indexed="true" stored="true" required="true" />
<field name="type" type="string" indexed="true" stored="false" required="true" multiValued="true" />
<field name="date" type="date" indexed="true" stored="true" required="true" reversed="true" />
<field name="lang" type="string" indexed="true" stored="false" required="false" />
<field name="ups" type="sint" indexed="true" stored="true" required="true" reversed="true" />
<field name="downs" type="sint" indexed="true" stored="true" required="true" reversed="true" />
<field name="hot" type="hotness" indexed="true" stored="true" required="true" reversed="true" />
<field name="controversy" type="sfloat" indexed="true" stored="true" required="true" reversed="true" />
<field name="points" type="sint" indexed="true" stored="true" required="true" reversed="true" />
<field name="spam" type="boolean" indexed="true" stored="true" required="false" />
<field name="deleted" type="boolean" indexed="true" stored="true" required="false" />
<!-- subreddit,link,comment -->
<field name="author_id" type="integer" indexed="true" stored="false" required="false" />
<field name="author" type="string" indexed="true" stored="false" required="false" />
<!-- subreddit -->
<field name="title" type="text" indexed="true" stored="false" required="false" />
<field name="description" type="text" indexed="true" stored="false" required="false" />
<field name="firsttext" type="text" indexed="true" stored="false" required="false" />
<field name="name" type="string" indexed="true" stored="false" required="false" />
<field name="over_18" type="boolean" indexed="true" stored="false" required="false" />
<field name="sr_type" type="string" indexed="true" stored="false" required="false" />
<!-- link -->
<field name="sr_id" type="integer" indexed="true" stored="false" required="false" />
<field name="reddit" type="string" indexed="true" stored="false" required="false" />
<field name="subreddit" type="string" indexed="true" stored="false" required="false" />
<field name="url" type="text" indexed="true" stored="false" required="false" />
<field name="domain" type="string" indexed="true" stored="false" required="false" multiValued="true" />
<field name="site" type="string" indexed="true" stored="false" required="false" multiValued="true" />
<field name="is_self" type="boolean" indexed="true" stored="false" required="false" />
<!-- comment (none) -->
<!-- all objects must have a 'contents' field, and most will also
have a field for their particular languages. Searches are then
done according to the fields in the languages that the user
has specified -->
<field name="contents" type="text" indexed="true" stored="false" required="true" />
<field name="contents_ws" type="text_ws" indexed="true" stored="false" required="false" />
<field name="contents_en" type="text_en" indexed="true" stored="false" required="false" />
<field name="contents_cs" type="text_cs" indexed="true" stored="false" required="false" />
<field name="contents_pt" type="text_pt" indexed="true" stored="false" required="false" />
<field name="contents_zh" type="text_zh" indexed="true" stored="false" required="false" />
<field name="contents_ja" type="text_ja" indexed="true" stored="false" required="false" />
<field name="contents_ko" type="text_ko" indexed="true" stored="false" required="false" />
<field name="contents_de" type="text_de" indexed="true" stored="false" required="false" />
<field name="contents_fr" type="text_fr" indexed="true" stored="false" required="false" />
<field name="contents_el" type="text_el" indexed="true" stored="false" required="false" />
<field name="contents_nl" type="text_nl" indexed="true" stored="false" required="false" />
<field name="contents_no" type="text_no" indexed="true" stored="false" required="false" />
<field name="contents_nn" type="text_nn" indexed="true" stored="false" required="false" />
<field name="contents_ru" type="text_ru" indexed="true" stored="false" required="false" />
<field name="contents_it" type="text_it" indexed="true" stored="false" required="false" />
<field name="contents_es" type="text_es" indexed="true" stored="false" required="false" />
<field name="contents_sv" type="text_sv" indexed="true" stored="false" required="false" />
<field name="contents_fi" type="text_fi" indexed="true" stored="false" required="false" />
<field name="contents_dk" type="text_dk" indexed="true" stored="false" required="false" />
<field name="contents_th" type="text_th" indexed="true" stored="false" required="false" />
</fields>
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>fullname</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>contents</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR"/>
<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity may be specified here, but the default is fine
for most applications. -->
<!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> -->
</schema>
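
Editor's note, for reference only (not part of this commit): a minimal sketch of how a document matching the schema above could have been indexed through the bundled pysolr wrapper shown at the end of this diff. The connection URL and field values are illustrative assumptions.

from r2.lib.contrib.pysolr import Solr

conn = Solr('http://127.0.0.1:8983/solr/')   # assumed local Solr core built from this schema
doc = {
    'fullname': 't3_example',                # uniqueKey
    'type': ['link'],                        # multiValued field
    'date': '2009-01-01T00:00:00Z',          # DateField values need the trailing Z
    'ups': 10, 'downs': 2, 'points': 8,
    'hot': 123.45, 'controversy': 0.2,
    'contents': 'example link title',        # default search field
    'contents_en': 'example link title',     # language-specific copy
}
conn.add([doc])                              # index the document
results = conn.search('example', sort='hot desc')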

View File

@@ -1,387 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.
The Original Code is Reddit.
The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.
All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->
<Server port="8081" shutdown="SHUTDOWN">
<!-- Comment these entries out to disable JMX MBeans support used for the
administration web application -->
<Listener className="org.apache.catalina.core.AprLifecycleListener" />
<Listener className="org.apache.catalina.mbeans.ServerLifecycleListener" />
<Listener className="org.apache.catalina.mbeans.GlobalResourcesLifecycleListener" />
<Listener className="org.apache.catalina.storeconfig.StoreConfigLifecycleListener"/>
<!-- Global JNDI resources -->
<GlobalNamingResources>
<!-- Test entry for demonstration purposes -->
<Environment name="simpleValue" type="java.lang.Integer" value="30"/>
<!-- Editable user database that can also be used by
UserDatabaseRealm to authenticate users -->
<Resource name="UserDatabase" auth="Container"
type="org.apache.catalina.UserDatabase"
description="User database that can be updated and saved"
factory="org.apache.catalina.users.MemoryUserDatabaseFactory"
pathname="conf/tomcat-users.xml" />
</GlobalNamingResources>
<!-- A "Service" is a collection of one or more "Connectors" that share
a single "Container" (and therefore the web applications visible
within that Container). Normally, that Container is an "Engine",
but this is not required.
Note: A "Service" is not itself a "Container", so you may not
define subcomponents such as "Valves" or "Loggers" at this level.
-->
<!-- Define the Tomcat Stand-Alone Service -->
<Service name="Catalina">
<!-- A "Connector" represents an endpoint by which requests are received
and responses are returned. Each Connector passes requests on to the
associated "Container" (normally an Engine) for processing.
By default, a non-SSL HTTP/1.1 Connector is established on port 8080.
You can also enable an SSL HTTP/1.1 Connector on port 8443 by
following the instructions below and uncommenting the second Connector
entry. SSL support requires the following steps (see the SSL Config
HOWTO in the Tomcat 5 documentation bundle for more detailed
instructions):
* If your JDK version is 1.3 or prior, download and install JSSE 1.0.2 or
later, and put the JAR files into "$JAVA_HOME/jre/lib/ext".
* Execute:
%JAVA_HOME%\bin\keytool -genkey -alias tomcat -keyalg RSA (Windows)
$JAVA_HOME/bin/keytool -genkey -alias tomcat -keyalg RSA (Unix)
with a password value of "changeit" for both the certificate and
the keystore itself.
By default, DNS lookups are enabled when a web application calls
request.getRemoteHost(). This can have an adverse impact on
performance, so you can disable it by setting the
"enableLookups" attribute to "false". When DNS lookups are disabled,
request.getRemoteHost() will return the String version of the
IP address of the remote client.
-->
<!-- Define a non-SSL HTTP/1.1 Connector on port 8080 -->
<Connector port="8080" maxHttpHeaderSize="8192"
maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
enableLookups="false" redirectPort="8443" acceptCount="100"
connectionTimeout="20000" disableUploadTimeout="true"
URIEncoding="UTF-8" />
<!-- Note : To disable connection timeouts, set connectionTimeout value
to 0 -->
<!-- Note : To use gzip compression you could set the following properties :
compression="on"
compressionMinSize="2048"
noCompressionUserAgents="gozilla, traviata"
compressableMimeType="text/html,text/xml"
-->
<!-- Define a SSL HTTP/1.1 Connector on port 8443 -->
<!--
<Connector port="8443" maxHttpHeaderSize="8192"
maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
enableLookups="false" disableUploadTimeout="true"
acceptCount="100" scheme="https" secure="true"
clientAuth="false" sslProtocol="TLS" />
-->
<!-- Define an AJP 1.3 Connector on port 8009 -->
<Connector port="8009"
enableLookups="false" redirectPort="8443" protocol="AJP/1.3" />
<!-- Define a Proxied HTTP/1.1 Connector on port 8082 -->
<!-- See proxy documentation for more information about using this. -->
<!--
<Connector port="8082"
maxThreads="150" minSpareThreads="25" maxSpareThreads="75"
enableLookups="false" acceptCount="100" connectionTimeout="20000"
proxyPort="80" disableUploadTimeout="true" />
-->
<!-- An Engine represents the entry point (within Catalina) that processes
every request. The Engine implementation for Tomcat stand alone
analyzes the HTTP headers included with the request, and passes them
on to the appropriate Host (virtual host). -->
<!-- You should set jvmRoute to support load-balancing via AJP ie :
<Engine name="Standalone" defaultHost="localhost" jvmRoute="jvm1">
-->
<!-- Define the top level container in our container hierarchy -->
<Engine name="Catalina" defaultHost="localhost">
<!-- The request dumper valve dumps useful debugging information about
the request headers and cookies that were received, and the response
headers and cookies that were sent, for all requests received by
this instance of Tomcat. If you care only about requests to a
particular virtual host, or a particular application, nest this
element inside the corresponding <Host> or <Context> entry instead.
For a similar mechanism that is portable to all Servlet 2.4
containers, check out the "RequestDumperFilter" Filter in the
example application (the source for this filter may be found in
"$CATALINA_HOME/webapps/examples/WEB-INF/classes/filters").
Note that this Valve uses the platform's default character encoding.
This may cause problems for developers in another encoding, e.g.
UTF-8. Use the RequestDumperFilter instead.
Also note that enabling this Valve will write a ton of stuff to your
logs. They are likely to grow quite large. This extensive log writing
will definitely slow down your server.
Request dumping is disabled by default. Uncomment the following
element to enable it. -->
<!--
<Valve className="org.apache.catalina.valves.RequestDumperValve"/>
-->
<!-- Because this Realm is here, an instance will be shared globally -->
<!-- This Realm uses the UserDatabase configured in the global JNDI
resources under the key "UserDatabase". Any edits
that are performed against this UserDatabase are immediately
available for use by the Realm. -->
<Realm className="org.apache.catalina.realm.UserDatabaseRealm"
resourceName="UserDatabase"/>
<!-- Comment out the old realm but leave here for now in case we
need to go back quickly -->
<!--
<Realm className="org.apache.catalina.realm.MemoryRealm" />
-->
<!-- Replace the above Realm with one of the following to get a Realm
stored in a database and accessed via JDBC -->
<!--
<Realm className="org.apache.catalina.realm.JDBCRealm"
driverName="org.gjt.mm.mysql.Driver"
connectionURL="jdbc:mysql://localhost/authority"
connectionName="test" connectionPassword="test"
userTable="users" userNameCol="user_name" userCredCol="user_pass"
userRoleTable="user_roles" roleNameCol="role_name" />
-->
<!--
<Realm className="org.apache.catalina.realm.JDBCRealm"
driverName="oracle.jdbc.driver.OracleDriver"
connectionURL="jdbc:oracle:thin:@ntserver:1521:ORCL"
connectionName="scott" connectionPassword="tiger"
userTable="users" userNameCol="user_name" userCredCol="user_pass"
userRoleTable="user_roles" roleNameCol="role_name" />
-->
<!--
<Realm className="org.apache.catalina.realm.JDBCRealm"
driverName="sun.jdbc.odbc.JdbcOdbcDriver"
connectionURL="jdbc:odbc:CATALINA"
userTable="users" userNameCol="user_name" userCredCol="user_pass"
userRoleTable="user_roles" roleNameCol="role_name" />
-->
<!-- Define the default virtual host
Note: XML Schema validation will not work with Xerces 2.2.
-->
<Host name="localhost" appBase="webapps"
unpackWARs="true" autoDeploy="true"
xmlValidation="false" xmlNamespaceAware="false">
<!-- Defines a cluster for this node,
By defining this element, every manager will be changed.
So when running a cluster, make sure that you only have webapps in there
that need to be clustered and remove the other ones.
A cluster has the following parameters:
className = the fully qualified name of the cluster class
clusterName = a descriptive name for your cluster, can be anything
mcastAddr = the multicast address, has to be the same for all the nodes
mcastPort = the multicast port, has to be the same for all the nodes
mcastBindAddress = bind the multicast socket to a specific address
mcastTTL = the multicast TTL if you want to limit your broadcast
mcastSoTimeout = the multicast read timeout
mcastFrequency = the number of milliseconds in between sending an "I'm alive" heartbeat
mcastDropTime = the number of milliseconds before a node is considered "dead" if no heartbeat is received
tcpThreadCount = the number of threads to handle incoming replication requests, optimal would be the same amount of threads as nodes
tcpListenAddress = the listen address (bind address) for TCP cluster request on this host,
in case of multiple ethernet cards.
auto means that address becomes
InetAddress.getLocalHost().getHostAddress()
tcpListenPort = the tcp listen port
tcpSelectorTimeout = the timeout (ms) for the Selector.select() method in case the OS
has a wakeup bug in java.nio. Set to 0 for no timeout
printToScreen = true means that managers will also print to std.out
expireSessionsOnShutdown = true means that
useDirtyFlag = true means that we only replicate a session after setAttribute,removeAttribute has been called.
false means to replicate the session after each request.
false means that replication would work for the following piece of code: (only for SimpleTcpReplicationManager)
<%
HashMap map = (HashMap)session.getAttribute("map");
map.put("key","value");
%>
replicationMode = can be either 'pooled', 'synchronous' or 'asynchronous'.
* Pooled means that the replication happens using several sockets in a synchronous way. I.e., the data gets replicated, then the request returns. This is the same as the 'synchronous' setting except it uses a pool of sockets, hence it is multithreaded. This is the fastest and safest configuration. To use this, also increase the number of tcp threads that you have dealing with replication.
* Synchronous means that the thread that executes the request, is also the
thread that replicates the data to the other nodes, and will not return until all
nodes have received the information.
* Asynchronous means that there is a specific 'sender' thread for each cluster node,
so the request thread will queue the replication request into a "smart" queue,
and then return to the client.
The "smart" queue is a queue where when a session is added to the queue, and the same session
already exists in the queue from a previous request, that session will be replaced
in the queue instead of replicating two requests. This almost never happens, unless there is a
large network delay.
-->
<!--
When configuring for clustering, you also add in a valve to catch all the requests
coming in; at the end of the request, the session may or may not be replicated.
A session is replicated if and only if all the conditions are met:
1. useDirtyFlag is true or setAttribute or removeAttribute has been called AND
2. a session exists (has been created)
3. the request is not trapped by the "filter" attribute
The filter attribute is to filter out requests that could not modify the session,
hence we don't replicate the session after the end of this request.
The filter is negative, ie, anything you put in the filter, you mean to filter out,
ie, no replication will be done on requests that match one of the filters.
The filter attribute is delimited by ;, so you can't escape out ; even if you wanted to.
filter=".*\.gif;.*\.js;" means that we will not replicate the session after requests with the URI
ending with .gif and .js are intercepted.
The deployer element can be used to deploy apps cluster wide.
Currently the deployment only deploys/undeploys to working members in the cluster
so no WARs are copied upon startup of a broken node.
The deployer watches a directory (watchDir) for WAR files when watchEnabled="true"
When a new war file is added the war gets deployed to the local instance,
and then deployed to the other instances in the cluster.
When a war file is deleted from the watchDir the war is undeployed locally
and cluster wide
-->
<!--
<Cluster className="org.apache.catalina.cluster.tcp.SimpleTcpCluster"
managerClassName="org.apache.catalina.cluster.session.DeltaManager"
expireSessionsOnShutdown="false"
useDirtyFlag="true"
notifyListenersOnReplication="true">
<Membership
className="org.apache.catalina.cluster.mcast.McastService"
mcastAddr="228.0.0.4"
mcastPort="45564"
mcastFrequency="500"
mcastDropTime="3000"/>
<Receiver
className="org.apache.catalina.cluster.tcp.ReplicationListener"
tcpListenAddress="auto"
tcpListenPort="4001"
tcpSelectorTimeout="100"
tcpThreadCount="6"/>
<Sender
className="org.apache.catalina.cluster.tcp.ReplicationTransmitter"
replicationMode="pooled"
ackTimeout="15000"
waitForAck="true"/>
<Valve className="org.apache.catalina.cluster.tcp.ReplicationValve"
filter=".*\.gif;.*\.js;.*\.jpg;.*\.png;.*\.htm;.*\.html;.*\.css;.*\.txt;"/>
<Deployer className="org.apache.catalina.cluster.deploy.FarmWarDeployer"
tempDir="/tmp/war-temp/"
deployDir="/tmp/war-deploy/"
watchDir="/tmp/war-listen/"
watchEnabled="false"/>
<ClusterListener className="org.apache.catalina.cluster.session.ClusterSessionListener"/>
</Cluster>
-->
<!-- Normally, users must authenticate themselves to each web app
individually. Uncomment the following entry if you would like
a user to be authenticated the first time they encounter a
resource protected by a security constraint, and then have that
user identity maintained across *all* web applications contained
in this virtual host. -->
<!--
<Valve className="org.apache.catalina.authenticator.SingleSignOn" />
-->
<!-- Access log processes all requests for this virtual host. By
default, log files are created in the "logs" directory relative to
$CATALINA_HOME. If you wish, you can specify a different
directory with the "directory" attribute. Specify either a relative
(to $CATALINA_HOME) or absolute path to the desired directory.
-->
<!--
<Valve className="org.apache.catalina.valves.AccessLogValve"
directory="logs" prefix="localhost_access_log." suffix=".txt"
pattern="common" resolveHosts="false"/>
-->
<!-- Access log processes all requests for this virtual host. By
default, log files are created in the "logs" directory relative to
$CATALINA_HOME. If you wish, you can specify a different
directory with the "directory" attribute. Specify either a relative
(to $CATALINA_HOME) or absolute path to the desired directory.
This access log implementation is optimized for maximum performance,
but is hardcoded to support only the "common" and "combined" patterns.
-->
<!--
<Valve className="org.apache.catalina.valves.FastCommonAccessLogValve"
directory="logs" prefix="localhost_access_log." suffix=".txt"
pattern="common" resolveHosts="false"/>
-->
</Host>
</Engine>
</Service>
</Server>

View File

@@ -1,464 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
"The contents of this file are subject to the Common Public Attribution
License Version 1.0. (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
License Version 1.1, but Sections 14 and 15 have been added to cover use of
software over a computer network and provide for limited attribution for the
Original Developer. In addition, Exhibit A has been modified to be consistent
with Exhibit B.
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
the specific language governing rights and limitations under the License.
The Original Code is Reddit.
The Original Developer is the Initial Developer. The Initial Developer of
the Original Code is CondeNet, Inc.
All portions of the code written by CondeNet are Copyright (c) 2006-2009
CondeNet, Inc. All Rights Reserved.
-->
<config>
<!-- Set this to 'false' if you want solr to continue working after it has
encountered a severe configuration error. In a production environment,
you may want solr to keep working even if one handler is mis-configured.
You may also set this to false by setting the system property:
-Dsolr.abortOnConfigurationError=false
-->
<abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
<!-- Used to specify an alternate directory to hold all index data
other than the default ./data under the Solr home.
If replication is in use, this should match the replication configuration. -->
<!--
<dataDir>./solr/data</dataDir>
-->
<indexDefaults>
<!-- Values here affect all index writers and act as a default unless overridden. -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<maxBufferedDocs>1000</maxBufferedDocs>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
</indexDefaults>
<mainIndex>
<!-- options specific to the main on-disk lucene index -->
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<maxBufferedDocs>1000</maxBufferedDocs>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<!-- If true, unlock any held write or commit locks on startup.
This defeats the locking mechanism that allows multiple
processes to safely access a lucene index, and should be
used with care. -->
<unlockOnStartup>false</unlockOnStartup>
</mainIndex>
<!-- the default high-performance update handler -->
<updateHandler class="solr.DirectUpdateHandler2">
<!-- A prefix of "solr." for class names is an alias that
causes solr to search appropriate packages, including
org.apache.solr.(search|update|request|core|analysis)
-->
<!-- autocommit pending docs if certain criteria are met
<autoCommit>
<maxDocs>10000</maxDocs>
<maxTime>1000</maxTime>
</autoCommit>
-->
<!-- The RunExecutableListener executes an external command.
exe - the name of the executable to run
dir - dir to use as the current working directory. default="."
wait - the calling thread waits until the executable returns. default="true"
args - the arguments to pass to the program. default=nothing
env - environment variables to set. default=nothing
-->
<!-- A postCommit event is fired after every commit or optimize command
<listener event="postCommit" class="solr.RunExecutableListener">
<str name="exe">snapshooter</str>
<str name="dir">solr/bin</str>
<bool name="wait">true</bool>
<arr name="args"> <str>arg1</str> <str>arg2</str> </arr>
<arr name="env"> <str>MYVAR=val1</str> </arr>
</listener>
-->
<!-- A postOptimize event is fired only after every optimize command, useful
in conjunction with index distribution to only distribute optimized indices
<listener event="postOptimize" class="solr.RunExecutableListener">
<str name="exe">snapshooter</str>
<str name="dir">solr/bin</str>
<bool name="wait">true</bool>
</listener>
-->
</updateHandler>
<query>
<!-- Maximum number of clauses in a boolean query... can affect
range or prefix queries that expand to big boolean
queries. An exception is thrown if exceeded. -->
<maxBooleanClauses>1024</maxBooleanClauses>
<!-- Cache used by SolrIndexSearcher for filters (DocSets),
unordered sets of *all* documents that match a query.
When a new searcher is opened, its caches may be prepopulated
or "autowarmed" using data from caches in the old searcher.
autowarmCount is the number of items to prepopulate. For LRUCache,
the autowarmed items will be the most recently accessed items.
Parameters:
class - the SolrCache implementation (currently only LRUCache)
size - the maximum number of entries in the cache
initialSize - the initial capacity (number of entries) of
the cache. (see java.util.HashMap)
autowarmCount - the number of entries to prepopulate from
an old cache.
-->
<filterCache
class="solr.LRUCache"
size="512"
initialSize="512"
autowarmCount="256"/>
<!-- queryResultCache caches results of searches - ordered lists of
document ids (DocList) based on a query, a sort, and the range
of documents requested. -->
<queryResultCache
class="solr.LRUCache"
size="512"
initialSize="512"
autowarmCount="256"/>
<!-- documentCache caches Lucene Document objects (the stored fields for each document).
Since Lucene internal document ids are transient, this cache will not be autowarmed. -->
<documentCache
class="solr.LRUCache"
size="512"
initialSize="512"
autowarmCount="0"/>
<!-- If true, stored fields that are not requested will be loaded lazily.
This can result in a significant speed improvement if the usual case is to
not load all stored fields, especially if the skipped fields are large compressed
text fields.
-->
<enableLazyFieldLoading>true</enableLazyFieldLoading>
<!-- Example of a generic cache. These caches may be accessed by name
through SolrIndexSearcher.getCache(), cacheLookup(), and cacheInsert().
The purpose is to enable easy caching of user/application level data.
The regenerator argument should be specified as an implementation
of solr.search.CacheRegenerator if autowarming is desired. -->
<!--
<cache name="myUserCache"
class="solr.LRUCache"
size="4096"
initialSize="1024"
autowarmCount="1024"
regenerator="org.mycompany.mypackage.MyRegenerator"
/>
-->
<!-- An optimization that attempts to use a filter to satisfy a search.
If the requested sort does not include score, then the filterCache
will be checked for a filter matching the query. If found, the filter
will be used as the source of document ids, and then the sort will be
applied to that.
<useFilterForSortedQuery>true</useFilterForSortedQuery>
-->
<!-- An optimization for use with the queryResultCache. When a search
is requested, a superset of the requested number of document ids
are collected. For example, if a search for a particular query
requests matching documents 10 through 19, and queryWindowSize is 50,
then documents 0 through 49 will be collected and cached. Any further
requests in that range can be satisfied via the cache. -->
<queryResultWindowSize>10</queryResultWindowSize>
<!-- This entry enables an int hash representation for filters (DocSets)
when the number of items in the set is less than maxSize. For smaller
sets, this representation is more memory efficient, more efficient to
iterate over, and faster to take intersections. -->
<HashDocSet maxSize="3000" loadFactor="0.75"/>
<!-- boolToFilterOptimizer converts boolean clauses with zero boost
into cached filters if the number of docs selected by the clause exceeds
the threshold (represented as a fraction of the total index) -->
<boolTofilterOptimizer enabled="true" cacheSize="32" threshold=".05"/>
<!-- a newSearcher event is fired whenever a new searcher is being prepared
and there is a current searcher handling requests (aka registered). -->
<!-- QuerySenderListener takes an array of NamedList and executes a
local query request for each NamedList in sequence. -->
<!--
<listener event="newSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
<lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
</arr>
</listener>
-->
<!-- a firstSearcher event is fired whenever a new searcher is being
prepared but there is no current registered searcher to handle
requests or to gain autowarming data from. -->
<!--
<listener event="firstSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
</arr>
</listener>
-->
<!-- If a search request comes in and there is no current registered searcher,
then immediately register the still warming searcher and use it. If
"false" then all requests will block until the first searcher is done
warming. -->
<useColdSearcher>false</useColdSearcher>
<!-- Maximum number of searchers that may be warming in the background
concurrently. An error is returned if this limit is exceeded. Recommend
1-2 for read-only slaves, higher for masters w/o cache warming. -->
<maxWarmingSearchers>4</maxWarmingSearchers>
</query>
<!--
Let the dispatch filter handle /select?qt=XXX
handleSelect=true will use consistent error handling for /select and /update
handleSelect=false will use solr1.1 style error formatting
-->
<requestDispatcher handleSelect="true" >
<!--Make sure your system has some authentication before enabling remote streaming! -->
<requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="2048" />
</requestDispatcher>
<!-- requestHandler plugins... incoming queries will be dispatched to the
correct handler based on the qt (query type) param matching the
name of registered handlers.
The "standard" request handler is the default and will be used if qt
is not specified in the request.
-->
<requestHandler name="standard" class="solr.StandardRequestHandler">
<!-- default values for query parameters -->
<lst name="defaults">
<str name="echoParams">explicit</str>
<!--
<int name="rows">10</int>
<str name="fl">*</str>
<str name="version">2.1</str>
-->
</lst>
</requestHandler>
<!-- DisMaxRequestHandler allows easy searching across multiple fields
for simple user-entered phrases.
see http://wiki.apache.org/solr/DisMaxRequestHandler
-->
<requestHandler name="dismax" class="solr.DisMaxRequestHandler" >
<lst name="defaults">
<str name="qf">contents</str>
<!-- <str name="echoParams">explicit</str>
<float name="tie">0.01</float>
<str name="qf">
text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
</str>
<str name="pf">
text^0.2 features^1.1 name^1.5 manu^1.4 manu_exact^1.9
</str>
<str name="bf">
ord(popularity)^0.5 recip(rord(price),1,1000,1000)^0.3
</str>
<str name="fl">
id,name,price,score
</str>
<str name="mm">
2&lt;-1 5&lt;-2 6&lt;90%
</str>
<int name="ps">100</int>
<str name="q.alt">*:*</str> -->
</lst>
</requestHandler>
<!-- Note how you can register the same handler multiple times with
different names (and different init parameters)
-->
<requestHandler name="partitioned" class="solr.DisMaxRequestHandler" >
<lst name="defaults">
<str name="echoParams">explicit</str>
<str name="qf">text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0</str>
<str name="mm">2&lt;-1 5&lt;-2 6&lt;90%</str>
<!-- This is an example of using Date Math to specify a constantly
moving date range in a config...
-->
<str name="bq">incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2</str>
</lst>
<!-- In addition to defaults, "appends" params can be specified
to identify values which should be appended to the list of
multi-val params from the query (or the existing "defaults").
In this example, the param "fq=instock:true" will be appended to
any query time fq params the user may specify, as a mechanism for
partitioning the index, independent of any user selected filtering
that may also be desired (perhaps as a result of faceted searching).
NOTE: there is *absolutely* nothing a client can do to prevent these
"appends" values from being used, so don't use this mechanism
unless you are sure you always want it.
-->
<lst name="appends">
<str name="fq">inStock:true</str>
</lst>
<!-- "invariants" are a way of letting the Solr maintainer lock down
the options available to Solr clients. Any params values
specified here are used regardless of what values may be specified
in either the query, the "defaults", or the "appends" params.
In this example, the facet.field and facet.query params are fixed,
limiting the facets clients can use. Faceting is not turned on by
default - but if the client does specify facet=true in the request,
these are the only facets they will be able to see counts for;
regardless of what other facet.field or facet.query params they
may specify.
NOTE: there is *absolutely* nothing a client can do to prevent these
"invariants" values from being used, so don't use this mechanism
unless you are sure you always want it.
-->
<lst name="invariants">
<str name="facet.field">cat</str>
<str name="facet.field">manu_exact</str>
<str name="facet.query">price:[* TO 500]</str>
<str name="facet.query">price:[500 TO *]</str>
</lst>
</requestHandler>
<requestHandler name="instock" class="solr.DisMaxRequestHandler" >
<!-- for legacy reasons, DisMaxRequestHandler will assume all init
params are "defaults" if you don't explicitly specify any defaults.
-->
<str name="fq">
inStock:true
</str>
<str name="qf">
text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
</str>
<str name="mm">
2&lt;-1 5&lt;-2 6&lt;90%
</str>
</requestHandler>
<!-- SpellCheckerRequestHandler takes in a word (or several words) as the
value of the "q" parameter and returns a list of alternative spelling
suggestions. If invoked with a ...&cmd=rebuild, it will rebuild the
spellchecker index.
-->
<requestHandler name="spellchecker" class="solr.SpellCheckerRequestHandler" startup="lazy">
<!-- default values for query parameters -->
<lst name="defaults">
<int name="suggestionCount">1</int>
<float name="accuracy">0.5</float>
</lst>
<!-- Main init params for handler -->
<!-- The directory where your SpellChecker Index should live. -->
<!-- May be absolute, or relative to the Solr "dataDir" directory. -->
<!-- If this option is not specified, a RAM directory will be used -->
<str name="spellcheckerIndexDir">spell</str>
<!-- the field in your schema that you want to be able to build -->
<!-- your spell index on. This should be a field that uses a very -->
<!-- simple FieldType without a lot of Analysis (ie: string) -->
<str name="termSourceField">word</str>
</requestHandler>
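<!-- Editor's note (illustrative, assuming Solr on localhost:8983): a spellcheck
request against this handler would look like
http://localhost:8983/solr/select?qt=spellchecker&q=documnet
and the handler's spelling index can be rebuilt with ...&cmd=rebuild -->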
<!-- Update request handler.
Note: Since solr1.1 requestHandlers requires a valid content type header if posted in
the body. For example, curl now requires: -H 'Content-type:text/xml; charset=utf-8'
The response format differs from solr1.1 formatting and returns a standard error code.
To enable solr1.1 behavior, remove the /update handler or change its path
-->
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
<!-- CSV update handler, loaded on demand -->
<requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
<!-- Admin Handlers. TODO? There could be a single handler that loads them all... -->
<!-- <requestHandler name="/admin/luke" class="org.apache.solr.handler.admin.LukeRequestHandler" />
<requestHandler name="/admin/system" class="org.apache.solr.handler.admin.SystemInfoHandler" />
<requestHandler name="/admin/plugins" class="org.apache.solr.handler.admin.PluginInfoHandler" />
<requestHandler name="/admin/threads" class="org.apache.solr.handler.admin.ThreadDumpHandler" />
<requestHandler name="/admin/properties" class="org.apache.solr.handler.admin.PropertiesRequestHandler" /> -->
<!-- Echo the request contents back to the client -->
<!-- <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
<lst name="defaults">
<str name="echoParams">explicit</str> --> <!-- for all params (including the default etc) use: 'all' -->
<!-- <str name="echoHandler">true</str>
</lst>
</requestHandler> -->
<!-- queryResponseWriter plugins... query responses will be written using the
writer specified by the 'wt' request parameter matching the name of a registered
writer.
The "standard" writer is the default and will be used if 'wt' is not specified
in the request. XMLResponseWriter will be used if nothing is specified here.
The json, python, and ruby writers are also available by default.
<queryResponseWriter name="standard" class="org.apache.solr.request.XMLResponseWriter"/>
<queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/>
<queryResponseWriter name="python" class="org.apache.solr.request.PythonResponseWriter"/>
<queryResponseWriter name="ruby" class="org.apache.solr.request.RubyResponseWriter"/>
<queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
-->
<!-- XSLT response writer transforms the XML output by any xslt file found
in Solr's conf/xslt directory. Changes to xslt files are checked for
every xsltCacheLifetimeSeconds.
-->
<!-- <queryResponseWriter name="xslt" class="org.apache.solr.request.XSLTResponseWriter">
<int name="xsltCacheLifetimeSeconds">5</int>
</queryResponseWriter> -->
<!-- config for the admin interface -->
<admin>
<defaultQuery>solr</defaultQuery>
<gettableFiles>solrconfig.xml schema.xml admin-extra.html</gettableFiles>
<!-- pingQuery should be "URLish" ...
&amp; separated key=val pairs ... but there shouldn't be any
URL escaping of the values -->
<pingQuery>
qt=standard&amp;q=solrpingquery
</pingQuery>
<!-- configure a healthcheck file for servers behind a loadbalancer
<healthcheck type="file">server-enabled</healthcheck>
-->
</admin>
</config>

View File

@@ -338,8 +338,6 @@ if [ ! -f /etc/cron.d/reddit ]; then
# disabled by default, uncomment if you need these jobs
#*/2 * * * * root /sbin/start --quiet reddit-job-google_checkout
#*/10 * * * * root /sbin/start --quiet reddit-job-solrsearch optimize=False
#0 0 * * * root /sbin/start --quiet reddit-job-solrsearch optimize=True
#0 0 * * * root /sbin/start --quiet reddit-job-update_gold_users
CRON
fi

View File

@@ -354,12 +354,6 @@ png_optimizer = /usr/bin/env optipng
# jpeg compressor
jpeg_optimizer =
# -- search --
# where is solr?
solr_url =
# how long do we cache search results (in seconds)
solr_cache_time = 300
# Just a list of words. Used by errlog.py to make up names for new errors.
words_file = /usr/dict/words

View File

@@ -40,8 +40,6 @@ from r2.lib.db.tdb_cassandra import MultiColumnQuery
from r2.lib.strings import strings
from r2.lib.search import (SearchQuery, SubredditSearchQuery, SearchException,
InvalidQuery)
from r2.lib.solrsearch import RelatedSearchQuery
from r2.lib.contrib.pysolr import SolrError
from r2.lib import jsontemplates
from r2.lib import sup
import r2.lib.db.thing as thing
@@ -788,7 +786,7 @@ class FrontController(RedditController):
# computed after fetch_more
try:
res = listing.listing()
except SearchException + (SolrError, socket.error) as e:
except SearchException + (socket.error,) as e:
return self.search_fail(e)
timing = time_module.time() - builder.start_time
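
Editor's note: the except clause above relies on SearchException being exported as a tuple of exception classes rather than a single class (an assumption inferred from the + concatenation). A minimal sketch of the pattern, with hypothetical names:

import socket

class InvalidQuery(Exception): pass
class SearchTimeout(Exception): pass

# a tuple of exception classes can be extended with + at the except site
SearchException = (InvalidQuery, SearchTimeout)

try:
    raise socket.error("search backend unreachable")
except SearchException + (socket.error,) as e:
    pass  # log the error and render the "search failed" page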

View File

@@ -36,7 +36,6 @@ from r2.lib.db.thing import Query, Merge, Relations
from r2.lib.db import queries
from r2.lib.strings import Score
from r2.lib import organic
import r2.lib.solrsearch as solrsearch
import r2.lib.search as search
from r2.lib.utils import iters, check_cheating, timeago
from r2.lib.utils.trial_utils import populate_spotlight
@@ -130,7 +129,7 @@ class ListingController(RedditController):
builder_cls = self.builder_cls
elif isinstance(self.query_obj, Query):
builder_cls = QueryBuilder
elif isinstance(self.query_obj, (solrsearch.SearchQuery, search.SearchQuery)):
elif isinstance(self.query_obj, search.SearchQuery):
builder_cls = SearchBuilder
elif isinstance(self.query_obj, iters):
builder_cls = IDBuilder

View File

@@ -931,17 +931,8 @@ class RedditController(MinimalController):
abort(304, 'not modified')
def search_fail(self, exception):
from r2.lib.contrib.pysolr import SolrError
from r2.lib.search import SearchException
if isinstance(exception, SolrError):
errmsg = "SolrError: %r" % exception
if (str(exception) == 'None'):
# Production error logs only get non-None errors
g.log.debug(errmsg)
else:
g.log.error(errmsg)
elif isinstance(exception, SearchException + (socket.error,)):
if isinstance(exception, SearchException + (socket.error,)):
g.log.error("Search Error: %s" % repr(exception))
errpage = pages.RedditError(_("search failed"),

View File

@@ -47,7 +47,6 @@ class Globals(object):
'db_pool_size',
'db_pool_overflow_size',
'page_cache_time',
'solr_cache_time',
'num_mc_clients',
'MIN_DOWN_LINK',
'MIN_UP_KARMA',

View File

@@ -1,347 +0,0 @@
# -*- coding: utf-8 -*-
"""
All we need to create a Solr connection is a url.
>>> conn = Solr('http://127.0.0.1:8983/solr/')
First, completely clear the index.
>>> conn.delete(q='*:*')
For now, we can only index python dictionaries. Each key in the dictionary
will correspond to a field in Solr.
>>> docs = [
... {'id': 'testdoc.1', 'order_i': 1, 'name': 'document 1', 'text': u'Paul Verlaine'},
... {'id': 'testdoc.2', 'order_i': 2, 'name': 'document 2', 'text': u'Владимир Маякoвский'},
... {'id': 'testdoc.3', 'order_i': 3, 'name': 'document 3', 'text': u'test'},
... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'test'}
... ]
We can add documents to the index by passing a list of docs to the connection's
add method.
>>> conn.add(docs)
>>> results = conn.search('Verlaine')
>>> len(results)
1
>>> results = conn.search(u'Владимир')
>>> len(results)
1
Simple tests for searching. We can optionally sort the results using Solr's
sort syntax, that is, the field name and either asc or desc.
>>> results = conn.search('test', sort='order_i asc')
>>> for result in results:
... print result['name']
document 3
document 4
>>> results = conn.search('test', sort='order_i desc')
>>> for result in results:
... print result['name']
document 4
document 3
To update documents, we just use the add method.
>>> docs = [
... {'id': 'testdoc.4', 'order_i': 4, 'name': 'document 4', 'text': u'blah'}
... ]
>>> conn.add(docs)
>>> len(conn.search('blah'))
1
>>> len(conn.search('test'))
1
We can delete documents from the index by id, or by supplying a query.
>>> conn.delete(id='testdoc.1')
>>> conn.delete(q='name:"document 2"')
>>> results = conn.search('Verlaine')
>>> len(results)
0
Docs can also have multiple values for any particular key. This lets us use
Solr's multiValue fields.
>>> docs = [
... {'id': 'testdoc.5', 'cat': ['poetry', 'science'], 'name': 'document 5', 'text': u''},
... {'id': 'testdoc.6', 'cat': ['science-fiction',], 'name': 'document 6', 'text': u''},
... ]
>>> conn.add(docs)
>>> results = conn.search('cat:"poetry"')
>>> for result in results:
... print result['name']
document 5
>>> results = conn.search('cat:"science-fiction"')
>>> for result in results:
... print result['name']
document 6
>>> results = conn.search('cat:"science"')
>>> for result in results:
... print result['name']
document 5
NOTE: PySolr is an open-source Python module
<http://code.google.com/p/pysolr/> that falls under the New BSD
Licence <http://www.opensource.org/licenses/bsd-license.php>, NOT the
licence covering the rest of Reddit. Reddit's modifications to this
module also fall under the New BSD Licence. The New BSD Licence
requires that re-distributions of the source, modified or not, display
the original copyright notice, but PySolr does not, as of import-time,
display a copyright notice or licence, except on its Google Code
information page. Therefore for licencing information, I point you to
PySolr's Google Code information page, URL above.
"""
# TODO: unicode support is pretty sloppy. define it better.
from httplib import HTTPConnection
from urllib import urlencode
from urlparse import urlsplit
from datetime import datetime, date
from time import strptime, strftime
from r2.lib.utils import unicode_safe
try:
# for python 2.5
from xml.etree import ElementTree
from xml.parsers.expat import ExpatError
except ImportError:
from elementtree import ElementTree,ExpatError
__all__ = ['Solr']
class SolrError(Exception):
pass
class Results(object):
def __init__(self, docs, hits):
self.docs = docs
self.hits = hits
def __len__(self):
return len(self.docs)
def __iter__(self):
return iter(self.docs)
def __getitem__(self,x):
return self.docs[x]
class Solr(object):
def __init__(self, url):
self.url = url
scheme, netloc, path, query, fragment = urlsplit(url)
netloc = netloc.split(':')
self.host = netloc[0]
if len(netloc) == 1:
self.host, self.port = netloc[0], None
else:
self.host, self.port = netloc
self.path = path.rstrip('/')
def _select(self, params):
# encode the query as utf-8 so urlencode can handle it
params['q'] = unicode_safe(params['q'])
path = '%s/select/?%s' % (self.path, urlencode(params))
conn = HTTPConnection(self.host, self.port)
conn.request('GET', path)
return conn.getresponse()
def _update(self, message):
"""
Posts the given xml message to http://<host>:<port>/solr/update and
returns the result.
"""
path = '%s/update/' % self.path
conn = HTTPConnection(self.host, self.port)
conn.request('POST', path, message, {'Content-type': 'text/xml'})
return conn.getresponse()
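# A hedged illustration of the message format _update expects (the id value is
# made up): an add followed by a commit is posted as two separate requests:
#   <add><doc><field name="id">testdoc.1</field></doc></add>
#   <commit />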
def _extract_error(self, response):
"""
Extract the actual error message from a solr response. Unfortunately,
this means scraping the html.
"""
try:
et = ElementTree.parse(response)
error = et.findtext('body/pre')
return error
except ExpatError,e:
return "%s: %s (%d/%s)" % (e,response.read(),response.status,response.reason)
# Converters #############################################################
@staticmethod
def _from_python(value):
"""
Converts python values to a form suitable for insertion into the xml
we send to solr.
"""
if isinstance(value, datetime):
value = value.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(value, date):
value = value.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(value, bool):
if value:
value = 'true'
else:
value = 'false'
else:
value = unicode_safe(value)
return value
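# Illustrative conversions (input values are made up):
#   _from_python(datetime(2010, 6, 15, 12, 30)) -> '2010-06-15T12:30:00.000Z'
#   _from_python(date(2010, 6, 15))             -> '2010-06-15T00:00:00.000Z'
#   _from_python(False)                         -> 'false'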
def bool_to_python(self, value):
"""
Convert a 'bool' field from solr's xml format to python and return it.
"""
if value == 'true':
return True
elif value == 'false':
return False
def str_to_python(self, value):
"""
Convert an 'str' field from solr's xml format to python and return it.
"""
return unicode_safe(value)
def int_to_python(self, value):
"""
Convert an 'int' field from solr's xml format to python and return it.
"""
return int(value)
def date_to_python(self, value):
"""
Convert a 'date' field from solr's xml format to python and return it.
"""
# this throws away fractions of a second
return datetime(*strptime(value[:-5], "%Y-%m-%dT%H:%M:%S")[0:6])
# API Methods ############################################################
def search(self, q, sort=None, start=0, rows=20, other_params = {}):
"""Performs a search and returns the results."""
params = {'q': q, 'start': start, 'rows': rows}
for x,y in other_params.iteritems():
params[x] = y
if sort:
params['sort'] = sort
response = self._select(params)
if response.status != 200:
raise SolrError(self._extract_error(response))
# TODO: make result retrieval lazy and allow custom result objects
# also, this has become rather ugly and definitely needs some cleanup.
et = ElementTree.parse(response)
result = et.find('result')
hits = int(result.get('numFound'))
docs = result.findall('doc')
results = []
for doc in docs:
result = {}
for element in doc.getchildren():
if element.tag == 'arr':
result_val = []
for array_element in element.getchildren():
converter_name = '%s_to_python' % array_element.tag
converter = getattr(self, converter_name)
result_val.append(converter(array_element.text))
else:
converter_name = '%s_to_python' % element.tag
converter = getattr(self, converter_name)
result_val = converter(element.text)
result[element.get('name')] = result_val
results.append(result)
return Results(results, hits)
def add(self, docs, commit=False):
"""Adds or updates documents. For now, docs is a list of dictionaies
where each key is the field name and each value is the value to index.
"""
message = ElementTree.Element('add')
for doc in docs:
message.append(doc_to_elemtree(doc))
m = ElementTree.tostring(message)
response = self._update(m)
if response.status != 200:
raise SolrError(self._extract_error(response))
# TODO: Supposedly, we can put a <commit /> element in the same post body
# as the add element. That isn't working for some reason, and it would save us
# an extra trip to the server. This works for now.
if commit:
self.commit()
def delete(self, id=None, q=None, commit=False):
"""Deletes documents."""
if id is None and q is None:
raise ValueError('You must specify "id" or "q".')
elif id is not None and q is not None:
raise ValueError('You may only specify "id" OR "q", not both.')
elif id is not None:
m = '<delete><id>%s</id></delete>' % id
elif q is not None:
m = '<delete><query>%s</query></delete>' % q
response = self._update(m)
if response.status != 200:
raise SolrError(self._extract_error(response))
# TODO: Supposedly, we can put a <commit /> element in the same post body
# as the delete element. That isn't working for some reason, and it would save us
# an extra trip to the server. This works for now.
if commit:
self.commit()
def commit(self):
response = self._update('<commit />')
if response.status != 200:
raise SolrError(self._extract_error(response))
def optimize(self):
response = self._update('<optimize />')
if response.status != 200:
raise SolrError(self._extract_error(response))
solr_magic_fields = ('boost',)
def doc_to_elemtree(doc):
d = ElementTree.Element('doc')
for key, value in doc.iteritems():
if key in solr_magic_fields:
# handle special fields that are attributes, not fields
d.set(key,Solr._from_python(value))
elif (not isinstance(value,str)) and hasattr(value, '__iter__'):
# handle lists, tuples, and other iterables
for v in value:
f = ElementTree.Element('field', name=key)
f.text = Solr._from_python(v)
d.append(f)
# handle strings and unicode
else:
f = ElementTree.Element('field', name=key)
f.text = Solr._from_python(value)
d.append(f)
return d
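# Illustrative example (field names and values are made up):
#   doc_to_elemtree({'id': 't3_1', 'cat': ['a', 'b'], 'boost': 2})
# builds the equivalent of
#   <doc boost="2">
#     <field name="id">t3_1</field>
#     <field name="cat">a</field>
#     <field name="cat">b</field>
#   </doc>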
if __name__ == "__main__":
import doctest
doctest.testmod()

View File

@@ -5,7 +5,6 @@ from r2.lib.db.operators import asc, desc, timeago
from r2.lib.db.sorts import epoch_seconds
from r2.lib.utils import fetch_things2, tup, UniqueIterator, set_last_modified
from r2.lib import utils
from r2.lib.solrsearch import DomainSearchQuery
from r2.lib import amqp, sup, filters
from r2.lib.comment_tree import add_comments, update_comment_votes
from r2.models.query_cache import (cached_query, merged_cached_query,
@@ -39,12 +38,6 @@ def db_sort(sort):
cls, col = db_sorts[sort]
return cls(col)
search_sort = dict(hot = 'hot desc',
new = 'date desc',
top = 'points desc',
controversial = 'controversy desc',
old = 'date asc')
db_times = dict(all = None,
hour = Thing.c._date >= timeago('1 hour'),
day = Thing.c._date >= timeago('1 day'),
@@ -458,9 +451,6 @@ def get_modqueue(sr):
q.append(get_spam_filtered_comments(sr))
return q
def get_domain_links_old(domain, sort, time):
return DomainSearchQuery(domain, sort=search_sort[sort], timerange=time)
def get_domain_links(domain, sort, time):
from r2.lib.db import operators
q = Link._query(operators.domain(Link.c.url) == filters._force_utf8(domain),

View File

@@ -1,692 +0,0 @@
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2010
# CondeNet, Inc. All Rights Reserved.
################################################################################
"""
Module for reddit-level communication with Solr. Contains
functions for indexing (`reindex_all`, `run_changed`) and the search
query classes (`SearchQuery` and its subclasses). Uses pysolr (placed
in r2.lib.contrib) for lower-level communication with Solr.
"""
from __future__ import with_statement
from Queue import Queue
from threading import Thread
import time
from datetime import datetime, date
from time import strftime
from pylons import g, config
from r2.models import *
from r2.lib.contrib import pysolr
from r2.lib.contrib.pysolr import SolrError
from r2.lib.utils import timeago, UrlParser
from r2.lib.utils import unicode_safe, tup, get_after, strordict_fullname
from r2.lib.cache import SelfEmptyingCache
from r2.lib import amqp
solr_cache_time = g.solr_cache_time
## Changes to the list of searchable languages will require changes to
## Solr's configuration (specifically, the fields that are searched)
searchable_langs = set(['dk','nl','en','fi','fr','de','it','no','nn','pt',
'ru','es','sv','zh','ja','ko','cs','el','th'])
## Adding types is a matter of adding the class to indexed_types here,
## adding the fields from that type to search_fields below, and adding
## those fields to Solr's configuration
indexed_types = (Subreddit, Link)
class Field(object):
"""
Describes a field of a Thing that is searchable by Solr. Used
by `search_fields` below
"""
def __init__(self, name, thing_attr_func = None, store = True,
tokenize=False, is_number=False, reverse=False,
is_date = False):
self.name = name
self.thing_attr_func = self.make_extractor(thing_attr_func)
def make_extractor(self,thing_attr_func):
if not thing_attr_func:
return self.make_extractor(self.name)
elif isinstance(thing_attr_func,str):
return (lambda x: getattr(x,thing_attr_func))
else:
return thing_attr_func
def extract_from(self,thing):
return self.thing_attr_func(thing)
class ThingField(Field):
"""
ThingField('field_name',Author,'author_id','name')
is like:
Field(name, lambda x: Author._byID(x.author_id,data=True).name)
but faster because lookups are done in batch
"""
def __init__(self,name,cls,id_attr,lu_attr_name):
self.name = name
self.cls = cls # the class of the looked-up object
self.id_attr = id_attr # the attr of the source obj used to find the dest obj
self.lu_attr_name = lu_attr_name # the attr of the dest class that we want to return
def __str__(self):
return ("<ThingField: (%s,%s,%s,%s)>"
% (self.name,self.cls,self.id_attr,self.lu_attr_name))
# Describes the fields of Thing objects and subclasses that are passed
# to Solr for indexing. All must have a 'contents' field, since that
# will be used for language-agnostic searching, and will be copied
# into contents_en, contents_eo, etc. (see `tokenize_things` for a
# discussion of multi-language search). The 'boost' field is a
# solr-magic field that ends up being an attribute on the <doc>
# message (rather than a field), and is used to do an index-time boost
# (this magic is done in pysolr.doc_to_elemtree)
search_fields={Thing: (Field('fullname', '_fullname'),
Field('date', '_date', is_date = True, reverse=True),
Field('lang'),
Field('ups', '_ups', is_number=True, reverse=True),
Field('downs', '_downs', is_number=True, reverse=True),
Field('spam','_spam'),
Field('deleted','_deleted'),
Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True),
Field('controversy', '_controversy', is_number=True, reverse=True),
Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)),
Subreddit: (Field('contents',
lambda s: ' '.join([unicode_safe(s.name),
unicode_safe(s.title),
unicode_safe(s.description),
unicode_safe(s.firsttext)]),
tokenize = True),
Field('boost', '_downs'),
#Field('title'),
#Field('firsttext'),
#Field('description'),
#Field('over_18'),
#Field('sr_type','type'),
),
Link: (Field('contents','title', tokenize = True),
Field('boost', lambda t: int(t._hot*1000),
# yes, it's a copy of 'hot'
is_number=True, reverse=True),
Field('author_id'),
ThingField('author',Account,'author_id','name'),
ThingField('subreddit',Subreddit,'sr_id','name'),
#ThingField('reddit',Subreddit,'sr_id','name'),
Field('sr_id'),
Field('url', tokenize = True),
#Field('domain',
# lambda l: UrlParser(l.url).domain_permutations()),
Field('site',
lambda l: UrlParser(l.url).domain_permutations()),
#Field('is_self','is_self'),
),
Comment: (Field('contents', 'body', tokenize = True),
Field('boost', lambda t: int(t._hot*1000),
# yes, it's a copy of 'hot'
is_number=True, reverse=True),
ThingField('author',Account,'author_id','name'),
ThingField('subreddit',Subreddit,'sr_id','name'))}
#ThingField('reddit',Subreddit,'sr_id','name'))}
def strip_control_characters(text):
if not isinstance(text, basestring):
return text
return ''.join((c for c in text if ord(c) >= 0x20))
def tokenize_things(things,return_dict=False):
"""
Here, we take a list of things, and return a list of
dictionaries of fields, which will be sent to Solr. We take
the `search_fields` dictionary above, and look for all classes
for which each Thing is an instance (that is, a Comment will
pick up fields for Thing as well as Comment), and extract the
given fields. All tokenised Things are expected to have a
'contents' attribute. That field is then copied to
contents_XX, where XX is your two-letter language code, which
becomes your default search field. Those language-specific
fields are also set up with the proper language-stemming and
tokenisers on Solr's end (in config/schema.xml), which allows
for language-specific searching
"""
global search_fields
batched_classes = {}
ret = {}
for thing in things:
try:
t = {'type': []}
for cls in ((thing.__class__,) + thing.__class__.__bases__):
t['type'].append(cls.__name__.lower())
if cls in search_fields:
for field in search_fields[cls]:
if field.__class__ == Field:
try:
val = field.extract_from(thing)
val = strip_control_characters(val)
if val != None and val != '':
t[field.name] = val
except AttributeError,e:
print e
elif field.__class__ == ThingField:
if not field.cls in batched_classes:
batched_classes[field.cls] = []
batched_classes[field.cls].append((thing,field))
# copy 'contents' to ('contents_%s' % lang) and contents_ws
t[lang_to_fieldname(thing.lang)] = t['contents']
t['contents_ws'] = t['contents']
ret[thing._fullname] = t
except AttributeError,e:
print e
except KeyError,e:
print e
# batched_classes should now be a {cls: [(Thing,ThingField)]}.
# This ugliness is to make it possible to batch Thing lookups, as
# they were accounting for most of the indexing time
for cls in batched_classes:
ids = set()
for (thing,field) in batched_classes[cls]:
# extract the IDs
try:
id = getattr(thing,field.id_attr)
ids.add(id)
except AttributeError,e:
print e
found_batch = cls._byID(ids,data=True,return_dict=True)
for (thing,field) in batched_classes[cls]:
try:
id = getattr(thing,field.id_attr)
ret[thing._fullname][field.name] = strip_control_characters(
getattr(found_batch[id],field.lu_attr_name))
except AttributeError,e:
print e
except KeyError,e:
print e
return ret if return_dict else ret.values()
def lang_to_fieldname(l):
"""
Returns the field-name for the given language, or `contents`
if it isn't found
"""
global searchable_langs
code = l[:2]
if code in searchable_langs:
return ("contents_%s" % code)
else:
return "contents"
def tokenize(thing):
return tokenize_things([thing])
def index_things(s=None,things=[]):
"Sends the given Things to Solr to be indexed"
tokenized = tokenize_things(things)
if s:
s.add(tokenized)
else:
with SolrConnection(commit=True) as s:
s.add(tokenize_things(things))
def fetch_batches(t_class,size,since,until):
"""
Convenience function to fetch all Things of class t_class with
_date from `since` to `until`, returning them in batches of
`size`. TODO: move to lib/utils, and merge to be the backend
of `fetch_things`
"""
q=t_class._query(t_class.c._date >= since,
t_class.c._spam == (True,False),
t_class.c._deleted == (True,False),
t_class.c._date < until,
sort = desc('_date'),
limit = size,
data = True)
orig_rules = deepcopy(q._rules)
things = list(q)
while things:
yield things
q._rules = deepcopy(orig_rules)
q._after(things[len(things)-1])
things = list(q)
solr_queue=Queue()
for i in range(20):
solr_queue.put(pysolr.Solr(g.solr_url))
class SolrConnection(object):
"""
Represents a connection to Solr, properly limited to N
concurrent connections. Used like
with SolrConnection() as s:
s.add(things)
"""
def __init__(self,commit=False,optimize=False):
self.commit = commit
self.optimize = optimize
def __enter__(self):
self.conn = solr_queue.get()
return self.conn
def __exit__(self, _type, _value, _tb):
if self.commit:
self.conn.commit()
if self.optimize:
self.conn.optimize()
solr_queue.task_done()
solr_queue.put(self.conn)
def indexer_worker(q,delete_all_first=False):
"""
The thread for mass-indexing that connects to Solr and submits
tokenised objects
"""
with SolrConnection(commit=True,optimize=True) as s:
count = 0
if delete_all_first:
s.delete(q='*:*')
t = q.get()
while t != "done":
# if it's not a list of dictionaries, I don't know how to
# handle it, so die. It's probably an exception pushed in
# by the handler in my parent
if not (isinstance(t,list) and isinstance(t[0],dict)):
raise t
count += len(t)
s.add(t)
if count > 25000:
print "Committing... (q:%d)" % (q.qsize(),)
s.commit()
count = 0
q.task_done()
t=q.get()
q.task_done()
def reindex_all(types = None, delete_all_first=False):
"""
Called from `paster run` to totally re-index everything in the
database. Spawns a thread to connect to Solr, and sends it
tokenised Things
"""
global indexed_types
start_t = datetime.now()
if not types:
types = indexed_types
# We don't want the default thread-local cache (which is just a
# dict) to grow un-bounded (normally, we'd use
# utils.set_emptying_cache, except that that preserves memcached,
# and we don't even want to get memcached for total indexing,
# because it would dump out more recent stuff)
g.cache.caches = (SelfEmptyingCache(),) # + g.cache.caches[1:]
count = 0
q=Queue(100)
indexer=Thread(target=indexer_worker,
args=(q,delete_all_first))
indexer.start()
try:
for cls in types:
for batch in fetch_batches(cls,1000,
timeago("50 years"),
start_t):
r = tokenize_things([ x for x in batch
if not x._spam and not x._deleted ])
count += len(r)
print ("Processing %s #%d(%s): %s"
% (cls.__name__, count, q.qsize(), r[0]['contents']))
if indexer.isAlive():
q.put(r)
else:
raise Exception("'tis a shame that I have but one thread to give")
q.put("done")
indexer.join()
except object,e:
if indexer.isAlive():
q.put(e,timeout=30)
raise e
except KeyboardInterrupt,e: # turns out KeyboardInterrupts aren't objects. Who knew?
if indexer.isAlive():
q.put(e,timeout=30)
raise e
def combine_searchterms(terms):
"""
Convenience function to take a list like
[ sr_id:1, sr_id:2, sr_id:3, subreddit:reddit.com ]
and turn it into
sr_id:(1 2 3) OR subreddit:reddit.com
"""
combined = {}
for (name,val) in terms:
combined[name] = combined.get(name,[]) + [val]
ret = []
for (name,vals) in combined.iteritems():
if len(vals) == 1:
ret.append("%s:%s" % (name,vals[0]))
else:
ret.append("%s:(%s)" % (name," ".join(vals)))
if len(ret) > 1:
ret = "(%s)" % " OR ".join(ret)
else:
ret = " ".join(ret)
return ret
def swap_strings(s,this,that):
"""
Just swaps substrings, like:
s = "hot asc"
s = swap_strings(s,'asc','desc')
s == "hot desc"
uses 'tmp' as a replacement string, so don't use for anything
very complicated
"""
return s.replace(this,'tmp').replace(that,this).replace('tmp',that)
class SearchQuery(object):
def __init__(self, q, sort, fields = [], subreddits = [], authors = [],
types = [], timerange = None, spam = False, deleted = False):
self.q = q
self.fields = fields
self.sort = sort
self.subreddits = subreddits
self.authors = authors
self.types = types
self.spam = spam
self.deleted = deleted
if timerange in ['day','month','year']:
self.timerange = ('NOW-1%s/HOUR' % timerange.upper(),"NOW")
elif timerange == 'week':
self.timerange = ('NOW-7DAY/HOUR',"NOW")
elif timerange == 'hour':
self.timerange = ('NOW-1HOUR/MINUTE',"NOW")
elif timerange == 'all' or timerange is None:
self.timerange = None
else:
self.timerange = timerange
def __repr__(self):
attrs = [ "***q=%s***" % self.q ]
if self.subreddits is not None:
attrs.append("srs=" + '+'.join([ "%d" % s
for s in self.subreddits ]))
if self.authors is not None:
attrs.append("authors=" + '+'.join([ "%d" % s
for s in self.authors ]))
if self.timerange is not None:
attrs.append("timerange=%s" % str(self.timerange))
if self.sort is not None:
attrs.append("sort=%r" % self.sort)
return "<%s(%s)>" % (self.__class__.__name__, ", ".join(attrs))
def run(self, after = None, num = 1000, reverse = False,
_update = False):
if not self.q:
return pysolr.Results([],0)
if not g.solr_url:
raise SolrError("g.solr_url is not set")
# there are two parts to our query: what the user typed
# (parsed with Solr's DisMax parser), and what we are adding
# to it. The latter is called the "boost" (and is parsed using
# full Lucene syntax), and it can be added to via the `boost`
# parameter
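# e.g. (illustrative values) a Link search restricted to one subreddit over
# the last week would build up boost like
#   ['-spam:true', '-deleted:true', '+date:[NOW-7DAY/HOUR TO NOW]',
#    '+(sr_id:42)', '+type:link']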
boost = []
if not self.spam:
boost.append("-spam:true")
if not self.deleted:
boost.append("-deleted:true")
if self.timerange:
def time_to_searchstr(t):
if isinstance(t, datetime):
t = t.strftime('%Y-%m-%dT%H:%M:%S.000Z')
elif isinstance(t, date):
t = t.strftime('%Y-%m-%dT00:00:00.000Z')
elif isinstance(t,str):
t = t
return t
(fromtime, totime) = self.timerange
fromtime = time_to_searchstr(fromtime)
totime = time_to_searchstr(totime)
boost.append("+date:[%s TO %s]"
% (fromtime,totime))
if self.subreddits:
def subreddit_to_searchstr(sr):
if isinstance(sr,Subreddit):
return ('sr_id','%d' % sr.id)
elif isinstance(sr,str) or isinstance(sr,unicode):
return ('subreddit',sr)
else:
return ('sr_id','%d' % sr)
s_subreddits = map(subreddit_to_searchstr, tup(self.subreddits))
boost.append("+(%s)" % combine_searchterms(s_subreddits))
if self.authors:
def author_to_searchstr(a):
if isinstance(a,Account):
return ('author_id','%d' % a.id)
elif isinstance(a,str) or isinstance(a,unicode):
return ('author',a)
else:
return ('author_id','%d' % a)
s_authors = map(author_to_searchstr,tup(self.authors))
boost.append('+(%s)^2' % combine_searchterms(s_authors))
def type_to_searchstr(t):
if isinstance(t,str):
return ('type',t)
else:
return ('type',t.__name__.lower())
s_types = map(type_to_searchstr,self.types)
boost.append("+%s" % combine_searchterms(s_types))
q,solr_params = self.solr_params(self.q,boost)
search = self.run_search(q, self.sort, solr_params,
reverse, after, num,
_update = _update)
return search
@classmethod
def run_search(cls, q, sort, solr_params, reverse, after, num,
_update = False):
"returns pysolr.Results(docs=[fullname()],hits=int())"
if reverse:
sort = swap_strings(sort,'asc','desc')
after = after._fullname if after else None
search = cls.run_search_cached(q, sort, 0, num, solr_params,
_update = _update)
search.docs = get_after(search.docs, after, num)
return search
@staticmethod
@memoize('solr_search', solr_cache_time)
def run_search_cached(q, sort, start, rows, other_params):
with SolrConnection() as s:
g.log.debug(("Searching q = %r; sort = %r,"
+ " start = %r, rows = %r,"
+ " params = %r")
% (q,sort,start,rows,other_params))
res = s.search(q, sort, start = start, rows = rows,
other_params = other_params)
# extract out the fullname in the 'docs' field, since that's
# all we care about
res = pysolr.Results(docs = [ i['fullname'] for i in res.docs ],
hits = res.hits)
return res
def solr_params(self,*k,**kw):
raise NotImplementedError
class UserSearchQuery(SearchQuery):
"Base class for queries that use the dismax parser"
def __init__(self, q, mm, sort=None, fields=[], langs=None, **kw):
default_fields = ['contents^1.5','contents_ws^3'] + fields
if langs is None:
fields = default_fields
else:
if langs == 'all':
langs = searchable_langs
fields = set([("%s^2" % lang_to_fieldname(lang)) for lang in langs]
+ default_fields)
# minimum match. See http://lucene.apache.org/solr/api/org/apache/solr/util/doc-files/min-should-match.html
self.mm = mm
SearchQuery.__init__(self, q, sort, fields = fields, **kw)
def solr_params(self, q, boost):
return q, dict(fl = 'fullname',
qt = 'dismax',
bq = ' '.join(boost),
qf = ' '.join(self.fields),
mm = self.mm)
class LinkSearchQuery(UserSearchQuery):
def __init__(self, q, mm = None, **kw):
additional_fields = ['site^1','author^1', 'subreddit^1', 'url^1']
if mm is None:
mm = '4<75%'
UserSearchQuery.__init__(self, q, mm = mm, fields = additional_fields,
types=[Link], **kw)
class RelatedSearchQuery(LinkSearchQuery):
def __init__(self, q, ignore = [], **kw):
self.ignore = set(ignore) if ignore else set()
LinkSearchQuery.__init__(self, q, mm = '3<100% 5<60% 8<50%', **kw)
def run(self, *k, **kw):
search = LinkSearchQuery.run(self, *k, **kw)
search.docs = [ x for x in search.docs if x not in self.ignore ]
return search
class SubredditSearchQuery(UserSearchQuery):
def __init__(self, q, **kw):
# note that 'downs' is a measure of activity on subreddits
UserSearchQuery.__init__(self, q, mm = '75%', sort = 'downs desc',
types=[Subreddit], **kw)
class DomainSearchQuery(SearchQuery):
def __init__(self, domain, **kw):
q = '+site:%s' % domain
SearchQuery.__init__(self, q = q, fields=['site'],types=[Link], **kw)
def solr_params(self, q, boost):
q = q + ' ' + ' '.join(boost)
return q, dict(fl='fullname',
qt='standard')
def run_commit(optimize=False):
with SolrConnection(commit=True, optimize=optimize) as s:
pass
def run_changed(drain=False):
"""
Run by `cron` (through `paster run`) on a schedule to update
all Things that have been created or have changed since the
last run. Note: unlike many queue-using functions, this one is
run from cron and totally drains the queue before terminating
"""
@g.stats.amqp_processor('solrsearch_changes')
def _run_changed(msgs, chan):
print "changed: Processing %d items" % len(msgs)
msgs = [strordict_fullname(msg.body)
for msg in msgs]
fullnames = set(msg['fullname'] for msg in msgs if not msg.get('boost_only'))
things = Thing._by_fullname(fullnames, data=True, return_dict=False)
things = [x for x in things if isinstance(x, indexed_types)]
update_things = [x for x in things if not x._spam and not x._deleted]
delete_things = [x for x in things if x._spam or x._deleted]
with SolrConnection() as s:
if update_things:
tokenized = tokenize_things(update_things)
s.add(tokenized)
if delete_things:
for i in delete_things:
s.delete(id=i._fullname)
amqp.handle_items('solrsearch_changes', _run_changed, limit=1000,
drain=drain)

View File

@@ -1,13 +0,0 @@
description "commit/optimize solr index"
instance $optimize
manual
task
nice 10
script
. /etc/default/reddit
wrap-job paster run $REDDIT_INI -c "from r2.lib import solrsearch; solrsearch.run_commit(optimize=$optimize)"
end script