LabeledMulti: Age-normalized hot

Update normalized_hot to allow certain LabeledMultis
to specify slightly different weighting between subreddits
compared to the current algorithm.

Current merge algorithm always results in an N-subreddit multi
having the first N results being the #1 result from each of
the individual subreddits; this is not always ideal for slow
subreddits (e.g., /r/announcements and /r/blog).

Age-weighting allows a LabeledMulti to scale those older posts
further down the list, and lets them drop off after a number of
days.

The age-weighting will require a change to the use of sgm
in normalized_hot before full deployment, because the calculated
ehot values are no longer global across all users and therefore
can no longer be shared through a single cache entry.
This commit is contained in:
Keith Mitchell
2015-01-06 15:30:23 -08:00
parent 1c8a27121a
commit f58b70fa25
3 changed files with 44 additions and 11 deletions

View File

@@ -417,7 +417,8 @@ class HotController(ListingWithPromos):
sr_ids = Subreddit.user_subreddits(c.user)
return normalized_hot(sr_ids)
elif isinstance(c.site, MultiReddit):
return normalized_hot(c.site.kept_sr_ids, obey_age_limit=False)
return normalized_hot(c.site.kept_sr_ids, obey_age_limit=False,
ageweight=c.site.normalized_age_weight)
else:
if c.site.sticky_fullname:
link_list = [c.site.sticky_fullname]

View File

@@ -26,6 +26,7 @@ from datetime import datetime, timedelta
from pylons import g
from r2.config import feature
from r2.lib.cache import sgm
from r2.lib.db.queries import _get_links, CachedResults
from r2.lib.db.sorts import epoch_seconds
@@ -35,38 +36,62 @@ MAX_PER_SUBREDDIT = 150
MAX_LINKS = 1000
def get_hot_tuples(sr_ids, ageweight=None):
    """Build per-subreddit lists of sortable hot tuples.

    Returns a dict mapping each sr_id to a list of
    ``(-effective_hot, -hot, link_name, timestamp)`` tuples, where
    ``effective_hot`` is the link's hot score divided by the
    subreddit's hot factor (computed from its #1 link — see
    get_hot_factor).  Normalizing by the top link keeps fast and
    slow subreddits comparable when their listings are merged.

    ageweight (optional float) is passed through to get_hot_factor;
    larger values push subreddits with an older #1 post further down
    the merged ranking.
    """
    queries_by_sr_id = {sr_id: _get_links(sr_id, sort='hot', time='all')
                        for sr_id in sr_ids}
    CachedResults.fetch_multi(queries_by_sr_id.values())
    tuples_by_srid = {sr_id: [] for sr_id in sr_ids}
    now_seconds = epoch_seconds(datetime.now(g.tz))

    for sr_id, q in queries_by_sr_id.iteritems():
        if not q.data:
            continue
        # hot factor is derived from the subreddit's current #1 link
        hot_factor = get_hot_factor(q.data[0], now_seconds, ageweight)
        for link_name, hot, timestamp in q.data[:MAX_PER_SUBREDDIT]:
            effective_hot = hot / hot_factor
            # heapq.merge sorts from smallest to largest so we need to flip
            # effective_hot and hot to get the hottest links first
            tuples_by_srid[sr_id].append(
                (-effective_hot, -hot, link_name, timestamp)
            )

    return tuples_by_srid
def normalized_hot(sr_ids, obey_age_limit=True):
def get_hot_factor(qdata, now, ageweight):
    """Return the normalization divisor for a subreddit's hot tuples.

    qdata is the subreddit's #1 hot tuple, ``(link_name, hot,
    timestamp)``.  The base factor is the link's hot score; when
    ageweight is truthy, the link's age in seconds (``now -
    timestamp``) is scaled by ageweight / 45000.0 and added on top,
    so subreddits whose top post is old get a larger divisor and
    their links a smaller effective_hot in get_hot_tuples.

    ageweight should be a float from 0.0 - 1.0.  Smaller values favor
    older #1 posts in multireddits; larger values drop older posts
    further in the ranking (or possibly off the ranking entirely).

    The result is clamped to at least 1.0 so division never inflates
    scores (or divides by zero for hot scores of 0).
    """
    ageweight = float(ageweight or 0.0)
    # the link name in the tuple is irrelevant to the factor
    _, hot, timestamp = qdata
    # NOTE(review): 45000.0 presumably matches the time divisor of the
    # hot-ranking formula — confirm against r2.lib.db.sorts
    return max(hot + ((now - timestamp) * ageweight) / 45000.0, 1.0)
def normalized_hot(sr_ids, obey_age_limit=True, ageweight=None):
timer = g.stats.get_timer("normalized_hot")
timer.start()
if not sr_ids:
return []
tuples_by_srid = sgm(g.cache, sr_ids, miss_fn=get_hot_tuples,
prefix='normalized_hot', time=g.page_cache_time)
if ageweight and feature.is_enabled("scaled_normalized_hot"):
tuples_by_srid = get_hot_tuples(sr_ids, ageweight=ageweight)
else:
tuples_by_srid = sgm(g.cache, sr_ids, miss_fn=get_hot_tuples,
prefix='normalized_hot', time=g.page_cache_time)
if obey_age_limit:
cutoff = datetime.now(g.tz) - timedelta(days=g.HOT_PAGE_AGE)

View File

@@ -1352,6 +1352,10 @@ class DefaultSR(_DefaultSR):
class MultiReddit(FakeSubreddit):
name = 'multi'
header = ""
_defaults = dict(
FakeSubreddit._defaults,
normalized_age_weight=0.0,
)
def __init__(self, path=None, srs=None):
FakeSubreddit.__init__(self)
@@ -1542,6 +1546,9 @@ class LabeledMulti(tdb_cassandra.Thing, MultiReddit):
"date": pycassa.system_manager.DATE_TYPE,
},
}
_float_props = (
"base_normalized_age_weight",
)
_compare_with = tdb_cassandra.UTF8_TYPE
_read_consistency_level = tdb_cassandra.CL.ONE
_write_consistency_level = tdb_cassandra.CL.QUORUM