# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
from __future__ import division

import collections
import HTMLParser
import itertools
import random
import string
import time

import requests

from pylons import app_globals as g

from r2.lib.db import queries
from r2.lib import amqp
from r2.lib.utils import weighted_lottery, get_requests_resp_json
from r2.lib.voting import cast_vote
from r2.models import (
    Account,
    Comment,
    Link,
    LocalizedDefaultSubreddits,
    NotFound,
    register,
    Subreddit,
    Vote,
)


unescape_htmlentities = HTMLParser.HTMLParser().unescape


class TextGenerator(object):
    """A Markov Chain based text mimicker."""

    def __init__(self, order=8):
        self.order = order
        self.starts = collections.Counter()
        self.start_lengths = collections.defaultdict(collections.Counter)
        self.models = [
            collections.defaultdict(collections.Counter)
            for i in xrange(self.order)]

    @staticmethod
    def _in_groups(input_iterable, n):
        """Yield successive overlapping n-length windows of the input."""
        iterables = itertools.tee(input_iterable, n)
        for offset, iterable in enumerate(iterables):
            for _ in xrange(offset):
                next(iterable, None)
        return itertools.izip(*iterables)

    def add_sample(self, sample):
        """Add a sample to the model of text for this generator."""

        if len(sample) <= self.order:
            return

        start = sample[:self.order]
        self.starts[start] += 1
        self.start_lengths[start][len(sample)] += 1
        for order, model in enumerate(self.models, 1):
            for chars in self._in_groups(sample, order+1):
                prefix = "".join(chars[:-1])
                next_char = chars[-1]
                model[prefix][next_char] += 1

    def generate(self):
        """Generate a string similar to samples previously fed in."""

        start = weighted_lottery(self.starts)
        desired_length = weighted_lottery(self.start_lengths[start])
        desired_length = max(desired_length, self.order)

        generated = []
        generated.extend(start)
        while len(generated) < desired_length:
            # try each model, from highest order down, until we find something
            for order, model in reversed(list(enumerate(self.models, 1))):
                current_prefix = "".join(generated[-order:])
                frequencies = model[current_prefix]
                if frequencies:
                    generated.append(weighted_lottery(frequencies))
                    break
            else:
                # no model has seen this prefix; fall back to a random letter
                generated.append(random.choice(string.lowercase))

        return "".join(generated)
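

# A minimal usage sketch of TextGenerator (hypothetical sample data;
# weighted_lottery picks a key with probability proportional to its count):
#
#     gen = TextGenerator(order=2)
#     for name in ("spez", "kn0thing", "ketralnis"):
#         gen.add_sample(name)
#     gen.generate()  # => a short string blending the samples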


def fetch_listing(path, limit=1000, batch_size=100):
    """Fetch a reddit listing from reddit.com."""

    session = requests.Session()
    session.headers.update({
        "User-Agent": "reddit-test-data-generator/1.0",
    })

    base_url = "https://api.reddit.com" + path

    after = None
    count = 0
    while count < limit:
        params = {"limit": batch_size, "count": count}
        if after:
            params["after"] = after

        print "> {}-{}".format(count, count+batch_size)
        response = session.get(base_url, params=params)
        response.raise_for_status()

        listing = get_requests_resp_json(response)["data"]
        for child in listing["children"]:
            yield child["data"]
            count += 1

        after = listing["after"]
        if not after:
            break

        # obey reddit.com's ratelimits
        # see: https://github.com/reddit/reddit/wiki/API#rules
        time.sleep(2)
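

# Hypothetical usage sketch (the subreddit is arbitrary; requires network
# access to api.reddit.com, and yields one dict per listing item):
#
#     for link_data in fetch_listing("/r/programming", limit=200):
#         print link_data["title"]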


class Modeler(object):
    """Build text models of real subreddits fetched from reddit.com."""

    def __init__(self):
        self.usernames = TextGenerator(order=2)

    def model_subreddit(self, subreddit_name):
        """Return a model of links and comments in a given subreddit."""

        subreddit_path = "/r/{}".format(subreddit_name)
        print ">>>", subreddit_path

        print ">> Links"
        titles = TextGenerator(order=5)
        selfposts = TextGenerator(order=8)
        link_count = self_count = 0
        urls = set()
        for link in fetch_listing(subreddit_path, limit=500):
            self.usernames.add_sample(link["author"])
            titles.add_sample(unescape_htmlentities(link["title"]))
            if link["is_self"]:
                self_count += 1
                selfposts.add_sample(unescape_htmlentities(link["selftext"]))
            else:
                urls.add(link["url"])
            link_count += 1
        self_frequency = self_count / link_count

        print ">> Comments"
        comments = TextGenerator(order=8)
        for comment in fetch_listing(subreddit_path + "/comments"):
            self.usernames.add_sample(comment["author"])
            comments.add_sample(unescape_htmlentities(comment["body"]))

        return SubredditModel(
            subreddit_name, titles, selfposts, urls, comments, self_frequency)

    def generate_username(self):
        """Generate and return a username like those seen on reddit.com."""
        return self.usernames.generate()


class SubredditModel(object):
    """A snapshot of a subreddit's links and comments."""

    def __init__(self, name, titles, selfposts, urls, comments, self_frequency):
        self.name = name
        self.titles = titles
        self.selfposts = selfposts
        self.urls = list(urls)
        self.comments = comments
        self.selfpost_frequency = self_frequency

    def generate_link_title(self):
        """Generate and return a title like those seen in the subreddit."""
        return self.titles.generate()

    def generate_link_url(self):
        """Generate and return a URL from one seen in the subreddit.

        The URL returned may be "self", indicating a self post. This should
        happen with the same frequency it is seen in the modeled subreddit.

        """
        if random.random() < self.selfpost_frequency:
            return "self"
        else:
            return random.choice(self.urls)

    def generate_selfpost_body(self):
        """Generate and return a self-post body like those in the subreddit."""
        return self.selfposts.generate()

    def generate_comment_body(self):
        """Generate and return a comment body like those in the subreddit."""
        return self.comments.generate()
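

# Minimal end-to-end sketch (hypothetical subreddit choice; requires network
# access and the full r2 environment for the imports above):
#
#     modeler = Modeler()
#     pics = modeler.model_subreddit("pics")
#     print modeler.generate_username()
#     print pics.generate_link_title()
#     print pics.generate_link_url()  # "self" at the modeled frequency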


def fuzz_number(number):
    """Return a randomly perturbed value with mean roughly `number`.

    betavariate(2, 8) draws from [0, 1) with mean 0.2, so scaling by
    5 * number yields `number` on average, skewed low with a long tail
    up to (but never reaching) 5 * number.
    """
    return int(random.betavariate(2, 8) * 5 * number)


def ensure_account(name):
    """Look up or register an account and return it."""
    try:
        account = Account._by_name(name)
        print ">> found /u/{}".format(name)
        return account
    except NotFound:
        print ">> registering /u/{}".format(name)
        return register(name, "password", "127.0.0.1")


def ensure_subreddit(name, author):
    """Look up or create a subreddit and return it."""
    try:
        sr = Subreddit._by_name(name)
        print ">> found /r/{}".format(name)
        return sr
    except NotFound:
        print ">> creating /r/{}".format(name)
        sr = Subreddit._new(
            name=name,
            title="/r/{}".format(name),
            author_id=author._id,
            lang="en",
            ip="127.0.0.1",
        )
        sr._commit()
        return sr
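

# Hypothetical get-or-create usage (both helpers are idempotent, so they
# are safe to re-run; the names here are made up):
#
#     admin = ensure_account("test_admin")
#     sr = ensure_subreddit("test_sr", admin)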


def inject_test_data(num_links=25, num_comments=25, num_votes=5):
    """Flood your reddit install with test data based on reddit.com."""

    print ">>>> Ensuring configured objects exist"
    system_user = ensure_account(g.system_user)
    ensure_account(g.automoderator_account)
    ensure_subreddit(g.default_sr, system_user)
    ensure_subreddit(g.takedown_sr, system_user)
    ensure_subreddit(g.beta_sr, system_user)
    ensure_subreddit(g.promo_sr_name, system_user)

    print
    print

    print ">>>> Fetching real data from reddit.com"
    modeler = Modeler()
    subreddits = [
        modeler.model_subreddit("pics"),
        modeler.model_subreddit("videos"),
        modeler.model_subreddit("askhistorians"),
    ]
    extra_settings = {
        "pics": {
            "show_media": True,
        },
        "videos": {
            "show_media": True,
        },
    }

    print
    print

    print ">>>> Generating test data"
    print ">>> Accounts"
    account_query = Account._query(sort="_date", limit=500, data=True)
    accounts = [a for a in account_query if a.name != g.system_user]
    accounts.extend(
        ensure_account(modeler.generate_username())
        for i in xrange(50 - len(accounts)))

    print ">>> Content"
    things = []
    for sr_model in subreddits:
        sr_author = random.choice(accounts)
        sr = ensure_subreddit(sr_model.name, sr_author)

        # make the system user subscribed for easier testing
        if sr.add_subscriber(system_user):
            sr._incr("_ups", 1)

        # apply any custom config we need for this sr
        for setting, value in extra_settings.get(sr.name, {}).iteritems():
            setattr(sr, setting, value)
        sr._commit()

        for i in xrange(num_links):
            link_author = random.choice(accounts)
            url = sr_model.generate_link_url()
            is_self = (url == "self")
            content = sr_model.generate_selfpost_body() if is_self else url
            link = Link._submit(
                is_self=is_self,
                title=sr_model.generate_link_title(),
                content=content,
                author=link_author,
                sr=sr,
                ip="127.0.0.1",
            )
            queries.new_link(link)
            things.append(link)

            comments = [None]
            for i in xrange(fuzz_number(num_comments)):
                comment_author = random.choice(accounts)
                comment, inbox_rel = Comment._new(
                    comment_author,
                    link,
                    parent=random.choice(comments),
                    body=sr_model.generate_comment_body(),
                    ip="127.0.0.1",
                )
                queries.new_comment(comment, inbox_rel)
                comments.append(comment)
                things.append(comment)

    for thing in things:
        for i in xrange(fuzz_number(num_votes)):
            direction = random.choice([
                Vote.DIRECTIONS.up,
                Vote.DIRECTIONS.unvote,
                Vote.DIRECTIONS.down,
            ])
            voter = random.choice(accounts)

            cast_vote(voter, thing, direction)

    amqp.worker.join()

    srs = [Subreddit._by_name(n) for n in ("pics", "videos", "askhistorians")]
    LocalizedDefaultSubreddits.set_global_srs(srs)
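

# Hypothetical invocation sketch: run inside an interactive shell with the
# reddit app context loaded (e.g. a paster shell against your install's ini
# file; the exact entry point varies by deployment):
#
#     inject_test_data(num_links=10, num_comments=10, num_votes=3)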