reddit/scripts/promoted_links.py

#!/usr/bin/python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer.  The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2014 reddit
# Inc. All Rights Reserved.
###############################################################################
"""Tools for evaluating promoted link distribution."""

from collections import defaultdict
import datetime
from math import sqrt

from pylons import g
from sqlalchemy.sql.functions import sum as sa_sum

from r2.lib import promote
from r2.lib.db.operators import and_, or_
from r2.lib.utils import to36, weighted_lottery
from r2.models.traffic import (
    Session,
    TargetedImpressionsByCodename,
    PageviewsBySubredditAndPath,
)
from r2.models.bidding import PromotionWeights
from r2.models import (
    Link,
    PromoCampaign,
    DefaultSR,
)

LINK_PREFIX = Link._type_prefix + str(Link._type_id)
PC_PREFIX = PromoCampaign._type_prefix + str(PromoCampaign._type_id)


def error_statistics(errors):
    mean_error = sum(errors) / len(errors)
    min_error = min([abs(i) for i in errors])
    max_error = max([abs(i) for i in errors])
    stdev_error = sqrt(
        (sum([i ** 2 for i in errors]) / len(errors))
        - mean_error ** 2)
    return (mean_error, min_error, max_error, stdev_error)


def get_scheduled(date, sr_name=''):
    all_promotions = PromotionWeights.get_campaigns(date)
    fp_promotions = [p for p in all_promotions if p.sr_name == sr_name]
    campaigns = PromoCampaign._byID([i.promo_idx for i in fp_promotions],
                                    return_dict=False, data=True)
    links = Link._by_fullname([i.thing_name for i in fp_promotions],
                              return_dict=False, data=True)
    links = {l._id: l for l in links}
    kept = []
    for camp in campaigns:
        if camp.trans_id == 0:
            continue

        link = links[camp.link_id]
        if link._spam or not promote.is_accepted(link):
            continue

        kept.append(camp._id)

    return [('%s_%s' % (PC_PREFIX, to36(p.promo_idx)), p.thing_name, p.bid)
            for p in fp_promotions if p.promo_idx in kept]


def get_campaign_pageviews(date, sr_name=''):
    # ads go live at hour=5
    start = datetime.datetime(date.year, date.month, date.day, 5, 0)
    hours = [start + datetime.timedelta(hours=i) for i in xrange(24)]

    traffic_cls = TargetedImpressionsByCodename
    codename_string = PC_PREFIX + '_%'
    q = (Session.query(traffic_cls.codename,
                       sa_sum(traffic_cls.pageview_count).label('daily'))
            .filter(traffic_cls.subreddit == sr_name)
            .filter(traffic_cls.codename.like(codename_string))
            .filter(traffic_cls.interval == 'hour')
            .filter(traffic_cls.date.in_(hours))
            .group_by(traffic_cls.codename))

    pageviews = dict(q)
    return pageviews


def filter_campaigns(date, fullnames):
    campaigns = PromoCampaign._by_fullname(fullnames, data=True,
                                           return_dict=False)

    # filter out campaigns that shouldn't be live
    pc_date = datetime.datetime(date.year, date.month, date.day, 0, 0,
                                tzinfo=g.tz)

    campaigns = [camp for camp in campaigns
                 if camp.start_date <= pc_date <= camp.end_date]

    # check for links with targeted campaigns - we can't handle them now
    has_targeted = [camp.link_id for camp in campaigns if camp.sr_name != '']
    return [camp for camp in campaigns if camp.link_id not in has_targeted]


def get_frontpage_pageviews(date):
    sr_name = DefaultSR.name
    traffic_cls = PageviewsBySubredditAndPath
    q = (Session.query(traffic_cls.srpath, traffic_cls.pageview_count)
           .filter(traffic_cls.interval == 'day')
           .filter(traffic_cls.date == date)
           .filter(traffic_cls.srpath == '%s-GET_listing' % sr_name))
    r = list(q)
    return r[0][1]


def compare_pageviews(daysago=0, verbose=False):
    """Evaluate past delivery for promoted links.

    Check frontpage promoted links for their actual delivery compared to what
    would be expected based on their bids.

    """

    date = (datetime.datetime.now(g.tz) -
            datetime.timedelta(days=daysago)).date()

    scheduled = get_scheduled(date)
    pageviews_by_camp = get_campaign_pageviews(date)
    campaigns = filter_campaigns(date, pageviews_by_camp.keys())
    actual = []
    for camp in campaigns:
        link_fullname = '%s_%s' % (LINK_PREFIX, to36(camp.link_id))
        i = (camp._fullname, link_fullname, pageviews_by_camp[camp._fullname])
        actual.append(i)

    scheduled_links = {link for camp, link, pageviews in scheduled}
    actual_links = {link for camp, link, pageviews in actual}

    bid_by_link = defaultdict(int)
    total_bid = 0

    pageviews_by_link = defaultdict(int)
    total_pageviews = 0

    for camp, link, bid in scheduled:
        if link not in actual_links:
            if verbose:
                print '%s not found in actual, skipping' % link
            continue

        bid_by_link[link] += bid
        total_bid += bid

    for camp, link, pageviews in actual:
        # not ideal: links shouldn't be here
        if link not in scheduled_links:
            if verbose:
                print '%s not found in schedule, skipping' % link
            continue

        pageviews_by_link[link] += pageviews
        total_pageviews += pageviews

    errors = []
    for link, bid in sorted(bid_by_link.items(), key=lambda t: t[1]):
        pageviews = pageviews_by_link.get(link, 0)
        expected = bid / total_bid
        realized = float(pageviews) / total_pageviews
        difference = (realized - expected) / expected
        errors.append(difference)
        if verbose:
            print '%s - %s - %s - %s' % (link, expected, realized, difference)

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)

    print '%s' % date
    print ('error %s max, %s min, %s +- %s' %
           (max_error, min_error, mean_error, stdev_error))
    print 'total bid %s' % total_bid
    print ('pageviews for promoted links targeted only to frontpage %s' %
           total_pageviews)
    print ('frontpage pageviews for all promoted links %s' %
           sum(pageviews_by_camp.values()))
    print 'promoted eligible pageviews %s' % get_frontpage_pageviews(date)


PROMOS = [('promo_%s' % i, i + 1) for i in xrange(100)]


def select_subset(n, weighted=False):
    promos = copy(PROMOS)
    selected = []

    if weighted:
        d = {(name, weight): weight for name, weight in promos}
        while len(selected) < n and d:
            i = weighted_lottery(d)
            del d[i]
            selected.append(i)
    else:
        # Sample without replacement
        if n > len(promos):
            return promos
        else:
            return random.sample(promos, n)
    return selected


def pick(subset, weighted=False):
    if weighted:
        d = {(name, weight): weight for name, weight in subset}
        picked = weighted_lottery(d)
    else:
        picked = random.choice(subset)
    return picked


def benchmark(subsets=1440, picks=6945, weighted_subset=False,
              weighted_pick=True, subset_size=10, verbose=False):
    """Test 2 stage randomization.

    First stage picks a subset of promoted links, second stage picks a single
    promoted link. This is to simulate the server side subset plus client side
    randomization of promoted link display.

    """

    counts = {(name, weight): 0 for name, weight in PROMOS}

    for i in xrange(subsets):
        subset = select_subset(subset_size, weighted=weighted_subset)

        for j in xrange(picks):
            name, weight = pick(subset, weighted=weighted_pick)
            counts[(name, weight)] += 1

    total_weight = sum(counts.values())
    errors = []
    for name, weight in sorted(counts.keys(), key=lambda t: t[1]):
        count = counts[(name, weight)]
        actual = float(count) / (subsets * picks)
        expected = float(weight) / total_weight
        error = (actual - expected) / expected
        errors.append(error)
        if verbose:
            print ('%s - expected: %s - actual: %s - error %s' %
                   (name, expected, actual, error))

    mean_error, min_error, max_error, stdev_error = error_statistics(errors)

    if verbose:
        print ('Error %s max, %s min, %s +- %s' %
               (max_error, min_error, mean_error, stdev_error))

    return (max_error, min_error, mean_error, stdev_error)