Enable fetching recommendations for multiple srs at once.

This change allows you to pass a list of srid36s and get back recommendations for all of them at once, which is useful for multireddit suggestions. A function that accepted a single subreddit id as an argument and memoized the results was removed and its functionality replaced by sgm caching, which sets multiple items in memcache at once. A source prefix is added to the row key so multiple sets of recs can be stored in the same column family. Adult items are filtered unless one of the original subreddits was over_18. Recommendations for a set of subreddits are merged and sorted by the number of items matched. (User feedback showed that people perceive recommendations as incorrect if they don't match the "theme" of a large multi, and recs that match several srs in the multi instead of just one are more likely to fit the theme, so we show those first.)
2026-01-28 08:17:58 -05:00 · 2013-07-13 14:02:00 -07:00
parent c0d63cf803
commit 2083e88100
1 changed files with 82 additions and 28 deletions
--- a/r2/r2/lib/recommender.py
+++ b/r2/r2/lib/recommender.py
@@ -20,32 +20,54 @@
 # Inc. All Rights Reserved.
 ###############################################################################

+import itertools
+import math
+from collections import defaultdict
 from datetime import timedelta
+from operator import itemgetter

 from r2.models import Subreddit
+from r2.lib.sgm import sgm
 from r2.lib.db import tdb_cassandra
 from r2.lib.memoize import memoize
+from r2.lib.utils import tup
+
+from pylons import g
+
+SRC_LINKVOTES = 'lv'
+SRC_MULTIREDDITS = 'mr'
+
+
+def get_recommendations(srs, count=10, source=SRC_MULTIREDDITS, to_omit=None):
+    """Return subreddits recommended if you like the given subreddits.
+
+    Args:
+    - srs is one Subreddit object or a list of Subreddits
+    - count is total number of results to return
+    - source is a prefix telling which set of recommendations to use
+    - to_omit is one Subreddit object or a list of Subreddits that should not
+        be included. (Useful for omitting recs that were already rejected.)

-def get_recommendations(srs):
-    """
-    Return the subreddits recommended if you like the given subreddit
    """
+    srs = tup(srs)
+    to_omit = tup(to_omit) if to_omit else []
+    
+    # fetch more recs than requested because some might get filtered out
+    rec_id36s = SRRecommendation.for_srs([sr._id36 for sr in srs],
+                                         [o._id36 for o in to_omit],
+                                          count * 2,
+                                          source)

-    # for now, but keep the API open for multireddits later
-    assert len(srs) == 1 and srs[0].__class__ == Subreddit
+    # always check for private subreddits at runtime since type might change
+    rec_srs = Subreddit._byID36(rec_id36s, return_dict=False)
+    filtered = [sr for sr in rec_srs if sr.type != 'private']

-    sr = srs[0]
-    recs = _get_recommendations(sr._id36)
-    if not recs:
-        return []
+    # don't recommend adult srs unless one of the originals was over_18
+    if not any(sr.over_18 for sr in srs):
+        filtered = [sr for sr in filtered if not sr.over_18]

-    srs = Subreddit._byID36(recs, return_dict=True, data=True)
+    return filtered[:count]

-    return srs
-
-@memoize('_get_recommendations', stale=True)
-def _get_recommendations(srid36):
-    return SRRecommendation.for_sr(srid36)

 class SRRecommendation(tdb_cassandra.View):
    _use_db = True
@@ -54,28 +76,60 @@ class SRRecommendation(tdb_cassandra.View):

    # don't keep these around if a run hasn't happened lately, or if the last
    # N runs didn't generate recommendations for a given subreddit
-    _ttl = timedelta(days=2)
+    _ttl = timedelta(days=7, hours=12)

    # we know that we mess with these but it's okay
    _warn_on_partial_ttl = False

    @classmethod
-    def for_sr(cls, srid36, count=5):
+    def for_srs(cls, srid36, to_omit, count=10, source=SRC_MULTIREDDITS):
+        # It's usually better to use get_recommendations() than to call this
+        # function directly because it does privacy filtering.
+
+        srid36s = tup(srid36)
+        to_omit = set(to_omit)
+        to_omit.update(srid36s)  # don't show the originals
+        rowkeys = ['%s.%s' % (source, srid36) for srid36 in srid36s]
+
+        # fetch multiple sets of recommendations, one for each input srid36
+        d = sgm(g.cache, rowkeys, SRRecommendation._byID, prefix='srr.')
+        rows = d.values()
+
+        sorted_recs = SRRecommendation._merge_and_sort_by_count(rows)
+        
+        # heuristic: if the input set is large, rec should match more than one
+        min_count = math.floor(.1 * len(srid36s))
+        sorted_recs = (rec[0] for rec in sorted_recs if rec[1] > min_count)
+
+        # remove duplicates and ids listed in to_omit
+        filtered = []
+        for r in sorted_recs:
+            if r not in to_omit:
+                filtered.append(r)
+                to_omit.add(r)
+        return filtered[:count]
+
+    @classmethod
+    def _merge_and_sort_by_count(cls, rows):
+        """Combine and sort multiple sets of recs.
+
+        Combines multiple sets of recs and sorts by number of times each rec
+        appears, the reasoning being that an item recommended for several of
+        the original srs is more likely to match the "theme" of the set.
+
        """
-        Return the subreddits ID36s recommended by the sr whose id36 is passed
-        """
-
-        cq = tdb_cassandra.ColumnQuery(cls, [srid36],
-                                       column_count = count+1,
-                                       column_reversed = True)
-
-        recs = [ r.values()[0] for r in cq if r.values()[0] != srid36 ][:count]
-
-        return recs
+        # combine recs from all input srs
+        rank_id36_pairs = itertools.chain(*[row._values().iteritems()
+                                            for row in rows])
+        ranks = defaultdict(list)
+        for rank, id36 in rank_id36_pairs:
+            ranks[id36].append(rank)
+        recs = [(id36, len(ranks), max(ranks)) for id36, ranks in ranks.iteritems()]
+        # sort by number of times appeared, then max rank
+        return sorted(recs, key=itemgetter(1, 2), reverse=True)

    def _to_recs(self):
        recs = self._values() # [ {rank, srid} ]
        recs = sorted(recs.items(), key=lambda x: int(x[0]))
        recs = [x[1] for x in recs]
        return recs
-