Don't show spam in search listings (also avoids adding spam to the search index at all, which could speed up solr)

This commit is contained in:
ketralnis
2008-06-27 17:23:43 -07:00
parent 78ad7eb9a8
commit 355277e672
3 changed files with 18 additions and 10 deletions

2
.gitignore vendored
View File

@@ -19,6 +19,8 @@ r2/r2/public/static/frame.js
r2/r2/public/static/reddit.js
r2/r2/public/static/vote.js
r2/r2/public/static/reddit_rtl.css
r2/r2admin
r2/reddit_i18n
r2/data/*
r2/count.pickle
r2/srcount.pickle

View File

@@ -29,7 +29,6 @@
from __future__ import with_statement
from r2.models import *
from r2.models import thing_changes
from r2.lib.contrib import pysolr
from r2.lib.contrib.pysolr import SolrError
from r2.lib.utils import timeago, set_emptying_cache, IteratorChunker
@@ -382,7 +381,8 @@ def reindex_all(types = None, delete_all_first=False):
for batch in fetch_batches(cls,1000,
timeago("50 years"),
start_t):
r = tokenize_things(batch)
r = tokenize_things([x for x in batch
if not x._spam and not x._deleted ])
count += len(r)
print ("Processing %s #%d(%s): %s"
@@ -446,6 +446,7 @@ def changed(types=None,since=None,commit=True,optimize=False):
chunk = cls._by_fullname(chunk,
data=True, return_dict=False)
chunk = [x for x in chunk if not x._spam and not x._deleted]
to_delete = [x for x in chunk if x._spam or x._deleted]
# note: anything marked as spam or deleted is not
# updated in the search database. Since these are
@@ -454,6 +455,9 @@ def changed(types=None,since=None,commit=True,optimize=False):
chunk = tokenize_things(chunk)
s.add(chunk)
for i in to_delete:
s.delete(id=i._fullname)
save_last_run(start_t)
def combine_searchterms(terms):

View File

@@ -33,7 +33,6 @@ from r2.lib.wrapped import Wrapped
from r2.lib import utils
from r2.lib.db import operators
from r2.lib.cache import sgm
from r2.lib import solrsearch
from copy import deepcopy, copy
@@ -361,7 +360,7 @@ class SearchBuilder(QueryBuilder):
self.total_num = 0
self.langs = langs
self.ignore = ignore
self.ignore = set(x for x in (ignore if ignore else []))
def init_query(self):
subreddits = None
@@ -381,7 +380,15 @@ class SearchBuilder(QueryBuilder):
self.subreddits = subreddits
self.authors = authors
self.skip = True
def keep_item(self, item):
    """Return True when *item* should be kept, False when it should be dropped.

    An item is dropped if any of the following holds:
      * ``item._spam`` is truthy,
      * ``item._deleted`` is truthy,
      * ``item._fullname`` is a member of ``self.ignore``.
    """
    skip_if = item._spam or item._deleted or item._fullname in self.ignore
    return not skip_if
def fetch_more(self, last_item, num_have):
from r2.lib import solrsearch
start_t = time.time()
done = False
@@ -412,15 +419,10 @@ class SearchBuilder(QueryBuilder):
timerange = timerange, langs = langs,
types = self.types)
things = [x
for x in new_items
if not x._fullname in
(self.ignore if self.ignore else [])]
self.total_num = new_items.hits
self.timing = time.time() - start_t
return done, things
return done, new_items
class CommentBuilder(Builder):
def __init__(self, link, sort, comment = None, context = None):