From 2083e88100e1a542bc72040fe0e1b03f81f84f39 Mon Sep 17 00:00:00 2001
From: shlurbee <valerie.hajdik@gmail.com>
Date: Sat, 13 Jul 2013 14:02:00 -0700
Subject: [PATCH] Enable fetching recommendations for multiple srs at once.

This change allows you to pass a list of srid36s and get back recommendations
for all of them at once, which is useful for multireddit suggestions.

A function that accepted a single subreddit id as an argument and memoized the
results was removed and its functionality replaced by sgm caching, which sets
multiple items in memcache at once.

A source prefix is added to the row key so multiple sets of recs can be stored
in the same column family.

Adult items are filtered unless one of the original subreddits was over_18.

Recommendations for a set of subreddits are merged and sorted by the number of
items matched. (User feedback showed that people perceive recommendations as
incorrect if they don't match the "theme" of a large multi, and recs that
match several srs in the multi instead of just one are more likely to fit the
theme, so we show those first.)
---
 r2/r2/lib/recommender.py | 110 +++++++++++++++++++++++++++++----------
 1 file changed, 82 insertions(+), 28 deletions(-)

diff --git a/r2/r2/lib/recommender.py b/r2/r2/lib/recommender.py
index e2b46df59..66bd8406a 100644
--- a/r2/r2/lib/recommender.py
+++ b/r2/r2/lib/recommender.py
@@ -20,32 +20,54 @@
 # Inc. All Rights Reserved.
 ###############################################################################
 
+import itertools
+import math
+from collections import defaultdict
 from datetime import timedelta
+from operator import itemgetter
 
 from r2.models import Subreddit
+from r2.lib.sgm import sgm
 from r2.lib.db import tdb_cassandra
 from r2.lib.memoize import memoize
+from r2.lib.utils import tup
+
+from pylons import g
+
+SRC_LINKVOTES = 'lv'
+SRC_MULTIREDDITS = 'mr'
+
+
+def get_recommendations(srs, count=10, source=SRC_MULTIREDDITS, to_omit=None):
+    """Return subreddits recommended if you like the given subreddits.
+
+    Args:
+    - srs is one Subreddit object or a list of Subreddits
+    - count is total number of results to return
+    - source is a prefix telling which set of recommendations to use
+    - to_omit is one Subreddit object or a list of Subreddits that should not
+        be included. (Useful for omitting recs that were already rejected.)
 
-def get_recommendations(srs):
-    """
-    Return the subreddits recommended if you like the given subreddit
     """
+    srs = tup(srs)
+    to_omit = tup(to_omit) if to_omit else []
+    
+    # fetch more recs than requested because some might get filtered out
+    rec_id36s = SRRecommendation.for_srs([sr._id36 for sr in srs],
+                                         [o._id36 for o in to_omit],
+                                          count * 2,
+                                          source)
 
-    # for now, but keep the API open for multireddits later
-    assert len(srs) == 1 and srs[0].__class__ == Subreddit
+    # always check for private subreddits at runtime since type might change
+    rec_srs = Subreddit._byID36(rec_id36s, return_dict=False)
+    filtered = [sr for sr in rec_srs if sr.type != 'private']
 
-    sr = srs[0]
-    recs = _get_recommendations(sr._id36)
-    if not recs:
-        return []
+    # don't recommend adult srs unless one of the originals was over_18
+    if not any(sr.over_18 for sr in srs):
+        filtered = [sr for sr in filtered if not sr.over_18]
 
-    srs = Subreddit._byID36(recs, return_dict=True, data=True)
+    return filtered[:count]
 
-    return srs
-
-@memoize('_get_recommendations', stale=True)
-def _get_recommendations(srid36):
-    return SRRecommendation.for_sr(srid36)
 
 class SRRecommendation(tdb_cassandra.View):
     _use_db = True
@@ -54,28 +76,60 @@ class SRRecommendation(tdb_cassandra.View):
 
     # don't keep these around if a run hasn't happened lately, or if the last
     # N runs didn't generate recommendations for a given subreddit
-    _ttl = timedelta(days=2)
+    _ttl = timedelta(days=7, hours=12)
 
     # we know that we mess with these but it's okay
     _warn_on_partial_ttl = False
 
     @classmethod
-    def for_sr(cls, srid36, count=5):
+    def for_srs(cls, srid36, to_omit, count=10, source=SRC_MULTIREDDITS):
+        # It's usually better to use get_recommendations() than to call this
+        # function directly because it does privacy filtering.
+
+        srid36s = tup(srid36)
+        to_omit = set(to_omit)
+        to_omit.update(srid36s)  # don't show the originals
+        rowkeys = ['%s.%s' % (source, srid36) for srid36 in srid36s]
+
+        # fetch multiple sets of recommendations, one for each input srid36
+        d = sgm(g.cache, rowkeys, SRRecommendation._byID, prefix='srr.')
+        rows = d.values()
+
+        sorted_recs = SRRecommendation._merge_and_sort_by_count(rows)
+        
+        # heuristic: if the input set is large, rec should match more than one
+        min_count = math.floor(.1 * len(srid36s))
+        sorted_recs = (rec[0] for rec in sorted_recs if rec[1] > min_count)
+
+        # remove duplicates and ids listed in to_omit
+        filtered = []
+        for r in sorted_recs:
+            if r not in to_omit:
+                filtered.append(r)
+                to_omit.add(r)
+        return filtered[:count]
+
+    @classmethod
+    def _merge_and_sort_by_count(cls, rows):
+        """Combine and sort multiple sets of recs.
+
+        Combines multiple sets of recs and sorts by number of times each rec
+        appears, the reasoning being that an item recommended for several of
+        the original srs is more likely to match the "theme" of the set.
+
         """
-        Return the subreddits ID36s recommended by the sr whose id36 is passed
-        """
-
-        cq = tdb_cassandra.ColumnQuery(cls, [srid36],
-                                       column_count = count+1,
-                                       column_reversed = True)
-
-        recs = [ r.values()[0] for r in cq if r.values()[0] != srid36 ][:count]
-
-        return recs
+        # combine recs from all input srs
+        rank_id36_pairs = itertools.chain(*[row._values().iteritems()
+                                            for row in rows])
+        ranks = defaultdict(list)
+        for rank, id36 in rank_id36_pairs:
+            ranks[id36].append(rank)
+        recs = [(id36, len(ranks), max(ranks)) for id36, ranks in ranks.iteritems()]
+        # sort by number of times appeared, then max rank
+        return sorted(recs, key=itemgetter(1, 2), reverse=True)
 
     def _to_recs(self):
         recs = self._values() # [ {rank, srid} ]
         recs = sorted(recs.items(), key=lambda x: int(x[0]))
         recs = [x[1] for x in recs]
         return recs
-