Add promoted_link tools.
scripts/promoted_links.py (new file, 269 lines)
@@ -0,0 +1,269 @@
#!/usr/bin/python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2012 reddit
# Inc. All Rights Reserved.
###############################################################################

"""Tools for evaluating promoted link distribution."""

from collections import defaultdict
from copy import copy  # used by select_subset below
import datetime
from math import sqrt
import random  # used by select_subset and pick below

from pylons import g
from sqlalchemy.sql.functions import sum as sa_sum

from r2.lib import promote
from r2.lib.db.operators import and_, or_
from r2.lib.utils import to36, weighted_lottery
from r2.models.traffic import (
    Session,
    TargetedImpressionsByCodename,
    PageviewsBySubredditAndPath,
)
from r2.models.bidding import PromotionWeights
from r2.models import (
    Link,
    PromoCampaign,
    DefaultSR,
)

LINK_PREFIX = Link._type_prefix + str(Link._type_id)
PC_PREFIX = PromoCampaign._type_prefix + str(PromoCampaign._type_id)


def error_statistics(errors):
    mean_error = sum(errors) / len(errors)
    min_error = min([abs(i) for i in errors])
    max_error = max([abs(i) for i in errors])
    stdev_error = sqrt(
        (sum([i ** 2 for i in errors]) / len(errors))
        - mean_error ** 2)
    return (mean_error, min_error, max_error, stdev_error)
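
# A quick sanity check of error_statistics with hypothetical numbers: for
# errors [0.1, -0.2, 0.3] the mean is ~0.0667, the smallest and largest
# absolute errors are 0.1 and 0.3, and the population standard deviation is
# ~0.2055:
#
#     >>> error_statistics([0.1, -0.2, 0.3])
#     (0.0666..., 0.1, 0.3, 0.2055...)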


def get_scheduled(date, sr_name=''):
    all_promotions = PromotionWeights.get_campaigns(date)
    fp_promotions = [p for p in all_promotions if p.sr_name == sr_name]
    campaigns = PromoCampaign._byID([i.promo_idx for i in fp_promotions],
                                    return_dict=False, data=True)
    links = Link._by_fullname([i.thing_name for i in fp_promotions],
                              return_dict=False, data=True)
    links = {l._id: l for l in links}
    kept = []
    for camp in campaigns:
        # trans_id == 0 means the campaign was never paid for
        if camp.trans_id == 0:
            continue

        link = links[camp.link_id]
        if link._spam or not promote.is_accepted(link):
            continue

        kept.append(camp._id)

    return [('%s_%s' % (PC_PREFIX, to36(p.promo_idx)), p.thing_name, p.bid)
            for p in fp_promotions if p.promo_idx in kept]
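
# get_scheduled returns (campaign fullname, link fullname, bid) tuples; the
# campaign fullname is rebuilt from promo_idx so it matches the codenames
# stored in the traffic tables, e.g. (hypothetical values):
#
#     [('t8_1x', 't3_ab', 100.0), ('t8_2y', 't3_cd', 250.0)]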


def get_campaign_pageviews(date, sr_name=''):
    # ads go live at hour=5
    start = datetime.datetime(date.year, date.month, date.day, 5, 0)
    hours = [start + datetime.timedelta(hours=i) for i in xrange(24)]

    traffic_cls = TargetedImpressionsByCodename
    codename_string = PC_PREFIX + '_%'
    q = (Session.query(traffic_cls.codename,
                       sa_sum(traffic_cls.pageview_count).label('daily'))
         .filter(traffic_cls.subreddit == sr_name)
         .filter(traffic_cls.codename.like(codename_string))
         .filter(traffic_cls.interval == 'hour')
         .filter(traffic_cls.date.in_(hours))
         .group_by(traffic_cls.codename))

    pageviews = dict(q)
    return pageviews
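
# The result maps campaign fullnames to their summed hourly impression
# counts for the day, e.g. (hypothetical values):
#
#     {'t8_1x': 52431, 't8_2y': 48102}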


def filter_campaigns(date, fullnames):
    campaigns = PromoCampaign._by_fullname(fullnames, data=True,
                                           return_dict=False)

    # filter out campaigns that shouldn't be live
    pc_date = datetime.datetime(date.year, date.month, date.day, 0, 0,
                                tzinfo=g.tz)

    campaigns = [camp for camp in campaigns
                 if camp.start_date <= pc_date <= camp.end_date]

    # check for links with targeted campaigns - we can't handle them now
    has_targeted = [camp.link_id for camp in campaigns if camp.sr_name != '']
    return [camp for camp in campaigns if camp.link_id not in has_targeted]


def get_frontpage_pageviews(date):
    sr_name = DefaultSR.name
    traffic_cls = PageviewsBySubredditAndPath
    q = (Session.query(traffic_cls.srpath, traffic_cls.pageview_count)
         .filter(traffic_cls.interval == 'day')
         .filter(traffic_cls.date == date)
         .filter(traffic_cls.srpath == '%s-GET_listing' % sr_name))
    r = list(q)
    return r[0][1]
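
# Note: this assumes exactly one matching row; srpath values look like
# 'reddit.com-GET_listing' if DefaultSR.name is 'reddit.com' (an assumption
# about the configured frontpage name).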


def compare_pageviews(daysago=0, verbose=False):
    """Evaluate past delivery for promoted links.

    Compare the actual delivery of frontpage promoted links against the
    delivery expected from their share of the total bid.

    """

    date = (datetime.datetime.now(g.tz) -
            datetime.timedelta(days=daysago)).date()

    scheduled = get_scheduled(date)
    pageviews_by_camp = get_campaign_pageviews(date)
    campaigns = filter_campaigns(date, pageviews_by_camp.keys())
    actual = []
    for camp in campaigns:
        link_fullname = '%s_%s' % (LINK_PREFIX, to36(camp.link_id))
        i = (camp._fullname, link_fullname, pageviews_by_camp[camp._fullname])
        actual.append(i)

    scheduled_links = {link for camp, link, pageviews in scheduled}
    actual_links = {link for camp, link, pageviews in actual}

    bid_by_link = defaultdict(int)
    total_bid = 0

    pageviews_by_link = defaultdict(int)
    total_pageviews = 0

    for camp, link, bid in scheduled:
        if link not in actual_links:
            if verbose:
                print '%s not found in actual, skipping' % link
            continue

        bid_by_link[link] += bid
        total_bid += bid

    for camp, link, pageviews in actual:
        # not ideal: links shouldn't be here
        if link not in scheduled_links:
            if verbose:
                print '%s not found in schedule, skipping' % link
            continue

        pageviews_by_link[link] += pageviews
        total_pageviews += pageviews

    errors = []
    for link, bid in sorted(bid_by_link.items(), key=lambda t: t[1]):
        pageviews = pageviews_by_link.get(link, 0)
        expected = bid / total_bid
        realized = float(pageviews) / total_pageviews
        difference = (realized - expected) / expected
        errors.append(difference)
        if verbose:
            print '%s - %s - %s - %s' % (link, expected, realized, difference)

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)

    print '%s' % date
    print ('error %s max, %s min, %s +- %s' %
           (max_error, min_error, mean_error, stdev_error))
    print 'total bid %s' % total_bid
    print ('pageviews for promoted links targeted only to frontpage %s' %
           total_pageviews)
    print ('frontpage pageviews for all promoted links %s' %
           sum(pageviews_by_camp.values()))
    print 'promoted eligible pageviews %s' % get_frontpage_pageviews(date)
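
# Typical invocation (an assumption; the commit doesn't document usage) is
# from an interactive shell with the reddit environment loaded:
#
#     >>> compare_pageviews(daysago=1, verbose=True)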


# 100 fake promos with weights 1..100, used by the benchmark below
PROMOS = [('promo_%s' % i, i + 1) for i in xrange(100)]


def select_subset(n, weighted=False):
    promos = copy(PROMOS)
    selected = []

    if weighted:
        # weighted sampling without replacement: repeatedly draw a weighted
        # lottery and remove the winner from the pool
        d = {(name, weight): weight for name, weight in promos}
        while len(selected) < n and d:
            i = weighted_lottery(d)
            del d[i]
            selected.append(i)
    else:
        # uniform sampling without replacement
        if n > len(promos):
            return promos
        else:
            return random.sample(promos, n)
    return selected


def pick(subset, weighted=False):
    if weighted:
        d = {(name, weight): weight for name, weight in subset}
        picked = weighted_lottery(d)
    else:
        picked = random.choice(subset)
    return picked
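
# e.g. (hypothetical): pick([('promo_1', 2), ('promo_2', 3)], weighted=True)
# returns ('promo_2', 3) about 60% of the time (3 / (2 + 3)).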


def benchmark(subsets=1440, picks=6945, weighted_subset=False,
              weighted_pick=True, subset_size=10, verbose=False):
    """Test two-stage randomization.

    The first stage picks a subset of promoted links, the second stage picks
    a single promoted link. This simulates the server-side subset selection
    plus client-side randomization of promoted link display.

    """

    counts = {(name, weight): 0 for name, weight in PROMOS}

    for i in xrange(subsets):
        subset = select_subset(subset_size, weighted=weighted_subset)

        for j in xrange(picks):
            name, weight = pick(subset, weighted=weighted_pick)
            counts[(name, weight)] += 1

    # the expected share of picks for each promo is its weight's share of
    # the total weight
    total_weight = sum(weight for name, weight in counts)
    errors = []
    for name, weight in sorted(counts.keys(), key=lambda t: t[1]):
        count = counts[(name, weight)]
        actual = float(count) / (subsets * picks)
        expected = float(weight) / total_weight
        error = (actual - expected) / expected
        errors.append(error)
        if verbose:
            print ('%s - expected: %s - actual: %s - error %s' %
                   (name, expected, actual, error))

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)

    if verbose:
        print ('Error %s max, %s min, %s +- %s' %
               (max_error, min_error, mean_error, stdev_error))

    return (max_error, min_error, mean_error, stdev_error)
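

if __name__ == '__main__':
    # A minimal demonstration (an addition, not part of the original
    # commit): compare a weighted second-stage pick against a uniform one,
    # with small loop counts so it runs quickly.
    for weighted_pick in (True, False):
        max_e, min_e, mean_e, stdev_e = benchmark(
            subsets=100, picks=100, weighted_pick=weighted_pick)
        print ('weighted_pick=%s: error %s max, %s min, %s +- %s' %
               (weighted_pick, max_e, min_e, mean_e, stdev_e))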