LabeledMulti: Age-normalized hot

Update normalized_hot to allow certain LabeledMultis
to specify slightly different weighting between subreddits
compared to the current algorithm.

Current merge algorithm always results in an N-subreddit multi
having the first N results being the #1 result from each of
the individual subreddits; this is not always ideal for slow
subreddits (e.g., /r/announcements and /r/blog).

Age-weighting allows a LabeledMulti to scale those older posts
further down the list, and lets them drop off after a number of
days.

The age-weighting will require a change to the use of sgm
in normalized_hot before full deployment, because the calculated
ehot values are no longer global across all users and therefore
can no longer be shared through a single cache entry.
This commit is contained in:
Keith Mitchell
2015-01-06 15:30:23 -08:00
parent 1c8a27121a
commit f58b70fa25
3 changed files with 44 additions and 11 deletions

View File

@@ -417,7 +417,8 @@ class HotController(ListingWithPromos):
sr_ids = Subreddit.user_subreddits(c.user)
return normalized_hot(sr_ids)
elif isinstance(c.site, MultiReddit):
return normalized_hot(c.site.kept_sr_ids, obey_age_limit=False)
return normalized_hot(c.site.kept_sr_ids, obey_age_limit=False,
ageweight=c.site.normalized_age_weight)
else:
if c.site.sticky_fullname:
link_list = [c.site.sticky_fullname]

View File

@@ -26,6 +26,7 @@ from datetime import datetime, timedelta
from pylons import g
from r2.config import feature
from r2.lib.cache import sgm
from r2.lib.db.queries import _get_links, CachedResults
from r2.lib.db.sorts import epoch_seconds
@@ -35,38 +36,62 @@ MAX_PER_SUBREDDIT = 150
MAX_LINKS = 1000
def get_hot_tuples(sr_ids, ageweight=None):
    """Build per-subreddit lists of sortable hot tuples.

    Returns a dict mapping each sr_id to a list of
    ``(-effective_hot, -hot, link_name, timestamp)`` tuples, where
    ``effective_hot`` is the link's hot score divided by the
    subreddit's hot factor (computed from its #1 link — see
    get_hot_factor).  Normalizing by the top link keeps fast and
    slow subreddits comparable when their listings are merged.

    ageweight (optional float) is passed through to get_hot_factor;
    larger values push subreddits with an older #1 post further down
    the merged ranking.
    """
    queries_by_sr_id = {sr_id: _get_links(sr_id, sort='hot', time='all')
                        for sr_id in sr_ids}
    CachedResults.fetch_multi(queries_by_sr_id.values())
    tuples_by_srid = {sr_id: [] for sr_id in sr_ids}
    now_seconds = epoch_seconds(datetime.now(g.tz))

    for sr_id, q in queries_by_sr_id.iteritems():
        if not q.data:
            continue
        # hot factor is derived from the subreddit's current #1 link
        hot_factor = get_hot_factor(q.data[0], now_seconds, ageweight)
        for link_name, hot, timestamp in q.data[:MAX_PER_SUBREDDIT]:
            effective_hot = hot / hot_factor
            # heapq.merge sorts from smallest to largest so we need to flip
            # effective_hot and hot to get the hottest links first
            tuples_by_srid[sr_id].append(
                (-effective_hot, -hot, link_name, timestamp)
            )

    return tuples_by_srid
def normalized_hot(sr_ids, obey_age_limit=True):
def get_hot_factor(qdata, now, ageweight):
    """Return the normalization divisor for a subreddit's hot tuples.

    qdata is the subreddit's #1 hot tuple, ``(link_name, hot,
    timestamp)``.  The base factor is the link's hot score; when
    ageweight is truthy, the link's age in seconds (``now -
    timestamp``) is scaled by ageweight / 45000.0 and added on top,
    so subreddits whose top post is old get a larger divisor and
    their links a smaller effective_hot in get_hot_tuples.

    ageweight should be a float from 0.0 - 1.0.  Smaller values favor
    older #1 posts in multireddits; larger values drop older posts
    further in the ranking (or possibly off the ranking entirely).

    The result is clamped to at least 1.0 so division never inflates
    scores (or divides by zero for hot scores of 0).
    """
    ageweight = float(ageweight or 0.0)
    # the link name in the tuple is irrelevant to the factor
    _, hot, timestamp = qdata
    # NOTE(review): 45000.0 presumably matches the time divisor of the
    # hot-ranking formula — confirm against r2.lib.db.sorts
    return max(hot + ((now - timestamp) * ageweight) / 45000.0, 1.0)
def normalized_hot(sr_ids, obey_age_limit=True, ageweight=None):
timer = g.stats.get_timer("normalized_hot")
timer.start()
if not sr_ids:
return []
tuples_by_srid = sgm(g.cache, sr_ids, miss_fn=get_hot_tuples,
prefix='normalized_hot', time=g.page_cache_time)
if ageweight and feature.is_enabled("scaled_normalized_hot"):
tuples_by_srid = get_hot_tuples(sr_ids, ageweight=ageweight)
else:
tuples_by_srid = sgm(g.cache, sr_ids, miss_fn=get_hot_tuples,
prefix='normalized_hot', time=g.page_cache_time)
if obey_age_limit:
cutoff = datetime.now(g.tz) - timedelta(days=g.HOT_PAGE_AGE)

View File

@@ -1352,6 +1352,10 @@ class DefaultSR(_DefaultSR):
class MultiReddit(FakeSubreddit):
name = 'multi'
header = ""
_defaults = dict(
FakeSubreddit._defaults,
normalized_age_weight=0.0,
)
def __init__(self, path=None, srs=None):
FakeSubreddit.__init__(self)
@@ -1542,6 +1546,9 @@ class LabeledMulti(tdb_cassandra.Thing, MultiReddit):
"date": pycassa.system_manager.DATE_TYPE,
},
}
_float_props = (
"base_normalized_age_weight",
)
_compare_with = tdb_cassandra.UTF8_TYPE
_read_consistency_level = tdb_cassandra.CL.ONE
_write_consistency_level = tdb_cassandra.CL.QUORUM