Don't show spam in search listings (also avoids adding spam to the search index at all, which could speed up solr)

2026-04-05 03:00:15 -04:00 · 2008-06-27 17:23:43 -07:00
parent 78ad7eb9a8
commit 355277e672
3 changed files with 18 additions and 10 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,8 @@ r2/r2/public/static/frame.js
 r2/r2/public/static/reddit.js
 r2/r2/public/static/vote.js
 r2/r2/public/static/reddit_rtl.css
+r2/r2admin
+r2/reddit_i18n
 r2/data/*
 r2/count.pickle
 r2/srcount.pickle
--- a/r2/r2/lib/solrsearch.py
+++ b/r2/r2/lib/solrsearch.py
@@ -29,7 +29,6 @@
 from __future__ import with_statement

 from r2.models import *
-from r2.models import thing_changes
 from r2.lib.contrib import pysolr
 from r2.lib.contrib.pysolr import SolrError
 from r2.lib.utils import timeago, set_emptying_cache, IteratorChunker
@@ -382,7 +381,8 @@ def reindex_all(types = None, delete_all_first=False):
            for batch in fetch_batches(cls,1000,
                                       timeago("50 years"),
                                       start_t):
-                r = tokenize_things(batch)
+                r = tokenize_things([x for x in batch
+                                     if not x._spam and not x._deleted ])

                count += len(r)
                print ("Processing %s #%d(%s): %s"
@@ -446,6 +446,7 @@ def changed(types=None,since=None,commit=True,optimize=False):
                chunk = cls._by_fullname(chunk,
                                         data=True, return_dict=False)
                chunk = [x for x in chunk if not x._spam and not x._deleted]
+                to_delete = [x for x in chunk if x._spam or x._deleted]

                # note: anything marked as spam or deleted is not
                # updated in the search database. Since these are
@@ -454,6 +455,9 @@ def changed(types=None,since=None,commit=True,optimize=False):
                    chunk  = tokenize_things(chunk)
                    s.add(chunk)

+                for i in to_delete:
+                    s.delete(id=i._fullname)
+
    save_last_run(start_t)

 def combine_searchterms(terms):
--- a/r2/r2/models/builder.py
+++ b/r2/r2/models/builder.py
@@ -33,7 +33,6 @@ from r2.lib.wrapped import Wrapped
 from r2.lib import utils
 from r2.lib.db import operators
 from r2.lib.cache import sgm
-from r2.lib import solrsearch

 from copy import deepcopy, copy

@@ -361,7 +360,7 @@ class SearchBuilder(QueryBuilder):
        self.total_num = 0
        self.langs = langs

-        self.ignore = ignore
+        self.ignore = set(x for x in (ignore if ignore else []))

    def init_query(self):
        subreddits = None
@@ -381,7 +380,15 @@ class SearchBuilder(QueryBuilder):
        self.subreddits = subreddits
        self.authors = authors

+        self.skip = True
+
+    def keep_item(self,item):
+        skip_if = item._spam or item._deleted or item._fullname in self.ignore
+        return not skip_if
+
    def fetch_more(self, last_item, num_have):
+        from r2.lib import solrsearch
+
        start_t = time.time()

        done = False
@@ -412,15 +419,10 @@ class SearchBuilder(QueryBuilder):
                                             timerange = timerange, langs = langs,
                                             types = self.types)

-        things = [x
-                  for x in new_items
-                  if not x._fullname in
-                      (self.ignore if self.ignore else [])]
-
        self.total_num = new_items.hits
        self.timing = time.time() - start_t

-        return done, things
+        return done, new_items

 class CommentBuilder(Builder):
    def __init__(self, link, sort, comment = None, context = None):