reddit/scripts/inject_test_data.py

# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer.  The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################

from __future__ import division

import collections
import HTMLParser
import itertools
import random
import string
import time

import requests

from pylons import g

from r2.lib.db import queries
from r2.lib import amqp
from r2.lib.utils import weighted_lottery
from r2.models import Account, NotFound, register, Subreddit, Link, Comment


unescape_htmlentities = HTMLParser.HTMLParser().unescape


class TextGenerator(object):
    """A Markov Chain based text mimicker."""

    def __init__(self, order=8):
        self.order = order
        self.starts = collections.Counter()
        self.start_lengths = collections.defaultdict(collections.Counter)
        self.models = [
            collections.defaultdict(collections.Counter)
            for i in xrange(self.order)]

    @staticmethod
    def _in_groups(input_iterable, n):
        iterables = itertools.tee(input_iterable, n)
        for offset, iterable in enumerate(iterables):
            for _ in xrange(offset):
                next(iterable, None)
        return itertools.izip(*iterables)

    def add_sample(self, sample):
        """Add a sample to the model of text for this generator."""

        if len(sample) <= self.order:
            return

        start = sample[:self.order]
        self.starts[start] += 1
        self.start_lengths[start][len(sample)] += 1
        for order, model in enumerate(self.models, 1):
            for chars in self._in_groups(sample, order+1):
                prefix = "".join(chars[:-1])
                next_char = chars[-1]
                model[prefix][next_char] += 1

    def generate(self):
        """Generate a string similar to samples previously fed in."""

        start = weighted_lottery(self.starts)
        desired_length = weighted_lottery(self.start_lengths[start])
        desired_length = max(desired_length, self.order)

        generated = []
        generated.extend(start)
        while len(generated) < desired_length:
            # try each model, from highest order down, til we find
            # something
            for order, model in reversed(list(enumerate(self.models, 1))):
                current_prefix = "".join(generated[-order:])
                frequencies = model[current_prefix]
                if frequencies:
                    generated.append(weighted_lottery(frequencies))
                    break
            else:
                generated.append(random.choice(string.lowercase))

        return "".join(generated)


def fetch_listing(path, limit=1000, batch_size=100):
    """Fetch a reddit listing from reddit.com."""

    session = requests.Session()
    session.headers.update({
        "User-Agent": "reddit-test-data-generator/1.0",
    })

    base_url = "https://api.reddit.com" + path

    after = None
    count = 0
    while count < limit:
        params = {"limit": batch_size, "count": count}
        if after:
            params["after"] = after

        print "> {}-{}".format(count, count+batch_size)
        response = session.get(base_url, params=params)
        response.raise_for_status()

        listing = response.json["data"]
        for child in listing["children"]:
            yield child["data"]
            count += 1

        after = listing["after"]
        if not after:
            break

        # obey reddit.com's ratelimits
        # see: https://github.com/reddit/reddit/wiki/API#rules
        time.sleep(2)


class Modeler(object):
    def __init__(self):
        self.usernames = TextGenerator(order=2)

    def model_subreddit(self, subreddit_name):
        """Return a model of links and comments in a given subreddit."""

        subreddit_path = "/r/{}".format(subreddit_name)
        print ">>>", subreddit_path

        print ">> Links"
        titles = TextGenerator(order=5)
        selfposts = TextGenerator(order=8)
        link_count = self_count = 0
        urls = set()
        for link in fetch_listing(subreddit_path, limit=500):
            self.usernames.add_sample(link["author"])
            titles.add_sample(unescape_htmlentities(link["title"]))
            if link["is_self"]:
                self_count += 1
                selfposts.add_sample(unescape_htmlentities(link["selftext"]))
            else:
                urls.add(link["url"])
            link_count += 1
        self_frequency = self_count / link_count

        print ">> Comments"
        comments = TextGenerator(order=8)
        for comment in fetch_listing(subreddit_path + "/comments"):
            self.usernames.add_sample(comment["author"])
            comments.add_sample(unescape_htmlentities(comment["body"]))

        return SubredditModel(
            subreddit_name, titles, selfposts, urls, comments, self_frequency)

    def generate_username(self):
        """Generate and return a username like those seen on reddit.com."""
        return self.usernames.generate()


class SubredditModel(object):
    """A snapshot of a subreddit's links and comments."""

    def __init__(self, name, titles, selfposts, urls, comments, self_frequency):
        self.name = name
        self.titles = titles
        self.selfposts = selfposts
        self.urls = list(urls)
        self.comments = comments
        self.selfpost_frequency = self_frequency

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        return self.titles.generate()

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self" indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        if random.random() < self.selfpost_frequency:
            return "self"
        else:
            return random.choice(self.urls)

    def generate_selfpost_body(self):
        """Generate and return a self-post body like seen in the subreddit."""
        return self.selfposts.generate()

    def generate_comment_body(self):
        """Generate and return a comment body like seen in the subreddit."""
        return self.comments.generate()


def fuzz_number(number):
    return int(random.betavariate(2, 8) * 5 * number)


def ensure_account(name):
    """Look up or register an account and return it."""
    try:
        account = Account._by_name(name)
        print ">> found /u/{}".format(name)
        return account
    except NotFound:
        print ">> registering /u/{}".format(name)
        return register(name, "password", "127.0.0.1")


def ensure_subreddit(name, author):
    """Look up or create a subreddit and return it."""
    try:
        sr = Subreddit._by_name(name)
        print ">> found /r/{}".format(name)
        return sr
    except NotFound:
        print ">> creating /r/{}".format(name)
        sr = Subreddit._new(
            name=name,
            title="/r/{}".format(name),
            author_id=author._id,
            lang="en",
            ip="127.0.0.1",
        )
        sr._commit()
        return sr


def inject_test_data(num_links=25, num_comments=25, num_votes=5):
    """Flood your reddit install with test data based on reddit.com."""

    print ">>>> Ensuring configured objects exist"
    system_user = ensure_account(g.system_user)
    ensure_subreddit(g.default_sr, system_user)
    ensure_subreddit(g.takedown_sr, system_user)

    print
    print

    print ">>>> Fetching real data from reddit.com"
    modeler = Modeler()
    subreddits = [
        modeler.model_subreddit("pics"),
        modeler.model_subreddit("videos"),
        modeler.model_subreddit("askhistorians"),
    ]
    extra_settings = {
        "pics": {
            "show_media": True,
        },
        "videos": {
            "show_media": True,
        },
    }

    print
    print

    print ">>>> Generating test data"
    print ">>> Accounts"
    account_query = Account._query(sort="_date", limit=500, data=True)
    accounts = [a for a in account_query if a.name != g.system_user]
    accounts.extend(
        ensure_account(modeler.generate_username())
        for i in xrange(50 - len(accounts)))

    print ">>> Content"
    things = []
    for sr_model in subreddits:
        sr_author = random.choice(accounts)
        sr = ensure_subreddit(sr_model.name, sr_author)

        # make the system user subscribed for easier testing
        if sr.add_subscriber(system_user):
            sr._incr("_ups", 1)

        # apply any custom config we need for this sr
        for setting, value in extra_settings.get(sr.name, {}).iteritems():
            setattr(sr, setting, value)
        sr._commit()

        for i in xrange(num_links):
            link_author = random.choice(accounts)

            link = Link._submit(
                title=sr_model.generate_link_title(),
                url=sr_model.generate_link_url(),
                author=link_author,
                sr=sr,
                ip="127.0.0.1",
            )
            if link.url == "self":
                link.url = link.make_permalink(sr)
                link.is_self = True
                link.selftext = sr_model.generate_selfpost_body()
                link._commit()
            queries.queue_vote(link_author, link, dir=True, ip="127.0.0.1")
            queries.new_link(link)
            things.append(link)

            comments = [None]
            for i in xrange(fuzz_number(num_comments)):
                comment_author = random.choice(accounts)
                comment, inbox_rel = Comment._new(
                    comment_author,
                    link,
                    parent=random.choice(comments),
                    body=sr_model.generate_comment_body(),
                    ip="127.0.0.1",
                )
                queries.queue_vote(
                    comment_author, comment, dir=True, ip="127.0.0.1")
                queries.new_comment(comment, inbox_rel)
                comments.append(comment)
                things.append(comment)

    for thing in things:
        for i in xrange(fuzz_number(num_votes)):
            direction = random.choice([True, None, False])
            voter = random.choice(accounts)
            queries.queue_vote(voter, thing, dir=direction, ip="127.0.0.1")

    amqp.worker.join()