From 7edd1ec8cd147ba0f71fa9ceaf45706fc869a3ca Mon Sep 17 00:00:00 2001 From: Shawn Krisman Date: Mon, 13 Jun 2016 14:12:54 -0700 Subject: [PATCH] Use s3 to read and store subreddit sitemaps. --- r2/example.ini | 11 +++ r2/r2/config/routing.py | 3 - r2/r2/controllers/__init__.py | 1 - r2/r2/controllers/sitemap.py | 42 -------- r2/r2/lib/hadoop_decompress.py | 120 ++++++++++++++++++++++ r2/r2/lib/pages/pages.py | 4 +- r2/r2/lib/s3_helpers.py | 13 ++- r2/r2/lib/sitemaps/__init__.py | 11 +-- r2/r2/lib/sitemaps/data.py | 82 ++++++++------- r2/r2/lib/sitemaps/generate.py | 28 +++--- r2/r2/lib/sitemaps/store.py | 80 +++++++++++++-- r2/r2/lib/sitemaps/watcher.py | 101 +++++++++++++++++++ r2/r2/models/sitemap.py | 126 ------------------------ r2/r2/templates/robots.txt | 2 +- upstart/reddit-consumer-sitemaps_q.conf | 13 +++ 15 files changed, 394 insertions(+), 243 deletions(-) delete mode 100644 r2/r2/controllers/sitemap.py create mode 100644 r2/r2/lib/hadoop_decompress.py create mode 100644 r2/r2/lib/sitemaps/watcher.py delete mode 100644 r2/r2/models/sitemap.py create mode 100644 upstart/reddit-consumer-sitemaps_q.conf diff --git a/r2/example.ini b/r2/example.ini index afe08a38e..83002e411 100644 --- a/r2/example.ini +++ b/r2/example.ini @@ -300,6 +300,17 @@ modmail_sender_email = modmail_system_email = modmail_email_domain = +############################################ SITEMAPS +# path of the subreddit sitemap index +sitemap_subreddit_keyname = +# s3 information related to where the sitemaps are statically stored +sitemap_upload_s3_bucket = +sitemap_s3_static_host = +sitemap_subreddit_static_url = +# the SQS queue that we use to receive messages to modify our sitemaps +sitemap_sqs_queue = + + ############################################ MEDIA STORAGE # which backend provider to use for media (thumbnails, subreddit stylesheets, # subreddit images, app icons). options are: diff --git a/r2/r2/config/routing.py b/r2/r2/config/routing.py index ea9b12a6c..8609b1680 100644 --- a/r2/r2/config/routing.py +++ b/r2/r2/config/routing.py @@ -61,9 +61,6 @@ def make_map(config): mc('/robots.txt', controller='robots', action='robots') mc('/crossdomain', controller='robots', action='crossdomain') - mc('/sitemap', controller='sitemap', action='index') - mc('/subreddit_sitemap', controller='sitemap', action='subreddits') - mc('/login', controller='forms', action='login') mc('/register', controller='forms', action='register') mc('/logout', controller='forms', action='logout') diff --git a/r2/r2/controllers/__init__.py b/r2/r2/controllers/__init__.py index ad54adb20..5d359393a 100644 --- a/r2/r2/controllers/__init__.py +++ b/r2/r2/controllers/__init__.py @@ -96,7 +96,6 @@ def load_controllers(): from oauth2 import OAuth2AccessController from redirect import RedirectController from robots import RobotsController - from sitemap import SitemapController from ipn import IpnController from ipn import StripeController from ipn import CoinbaseController diff --git a/r2/r2/controllers/sitemap.py b/r2/r2/controllers/sitemap.py deleted file mode 100644 index 087099a93..000000000 --- a/r2/r2/controllers/sitemap.py +++ /dev/null @@ -1,42 +0,0 @@ -# The contents of this file are subject to the Common Public Attribution -# License Version 1.0. (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http://code.reddit.com/LICENSE. 
The License is based on the Mozilla Public -# License Version 1.1, but Sections 14 and 15 have been added to cover use of -# software over a computer network and provide for limited attribution for the -# Original Developer. In addition, Exhibit A has been modified to be consistent -# with Exhibit B. -# -# Software distributed under the License is distributed on an "AS IS" basis, -# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for -# the specific language governing rights and limitations under the License. -# -# The Original Code is reddit. -# -# The Original Developer is the Initial Developer. The Initial Developer of -# the Original Code is reddit Inc. -# -# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit -# Inc. All Rights Reserved. -############################################################################### - -from pylons import response - -from r2.controllers.reddit_base import MinimalController -from r2.lib.db import tdb_cassandra -from r2.lib.validator import validate, VInt -from r2.models.sitemap import Sitemap - -class SitemapController(MinimalController): - - def GET_index(self): - response.content_type = 'application/xml' - return Sitemap.sitemap_index() - - @validate(index=VInt('index', 0, 50000)) - def GET_subreddits(self, index): - response.content_type = 'application/xml' - try: - return Sitemap.subreddit_sitemap(index) - except tdb_cassandra.NotFound: - return self.abort404() diff --git a/r2/r2/lib/hadoop_decompress.py b/r2/r2/lib/hadoop_decompress.py new file mode 100644 index 000000000..0cf0a46ee --- /dev/null +++ b/r2/r2/lib/hadoop_decompress.py @@ -0,0 +1,120 @@ +# The contents of this file are subject to the Common Public Attribution +# License Version 1.0. (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public +# License Version 1.1, but Sections 14 and 15 have been added to cover use of +# software over a computer network and provide for limited attribution for the +# Original Developer. In addition, Exhibit A has been modified to be consistent +# with Exhibit B. +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for +# the specific language governing rights and limitations under the License. +# +# The Original Code is reddit. +# +# The Original Developer is the Initial Developer. The Initial Developer of +# the Original Code is reddit Inc. +# +# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit +# Inc. All Rights Reserved. +############################################################################### + + +import snappy +import struct + + +class HadoopStreamDecompressor(object): + """This class implements the decompressor-side of the hadoop framing + format. + + Hadoop fraiming format consists of one or more blocks, each of which is + composed of one or more compressed subblocks. The block size is the + uncompressed size, while the subblock size is the size of the compressed + data. 
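+
+    A minimal usage sketch (illustrative only; the input file name below is
+    hypothetical):
+
+        decompressor = HadoopStreamDecompressor()
+        with open("part-00000.snappy", "rb") as src:
+            text = decompressor.decompress(src.read())
+        decompressor.flush()  # raises UncompressError on a truncated chunk
+
+    For stream-to-stream use, see the hadoop_decompress() helper at the
+    bottom of this module.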
+ + https://github.com/andrix/python-snappy/pull/35/files + """ + + __slots__ = ["_buf", "_block_size", "_block_read", "_subblock_size"] + + def __init__(self): + self._buf = b"" + self._block_size = None + self._block_read = 0 + self._subblock_size = None + + def decompress(self, data): + self._buf += data + output = b"" + while True: + # decompress block will attempt to decompress any subblocks if it + # has already read the block size and subblock size. + buf = self._decompress_block() + if len(buf) > 0: + output += buf + else: + break + return output + + def _decompress_block(self): + if self._block_size is None: + if len(self._buf) <= 4: + return b"" + self._block_size = struct.unpack(">i", self._buf[:4])[0] + self._buf = self._buf[4:] + output = b"" + while self._block_read < self._block_size: + buf = self._decompress_subblock() + if len(buf) > 0: + output += buf + else: + # Buffer doesn't contain full subblock + break + if self._block_read == self._block_size: + # We finished reading this block, so reinitialize. + self._block_read = 0 + self._block_size = None + return output + + def _decompress_subblock(self): + if self._subblock_size is None: + if len(self._buf) <= 4: + return b"" + self._subblock_size = struct.unpack(">i", self._buf[:4])[0] + self._buf = self._buf[4:] + # Only attempt to decompress complete subblocks. + if len(self._buf) < self._subblock_size: + return b"" + compressed = self._buf[:self._subblock_size] + self._buf = self._buf[self._subblock_size:] + uncompressed = snappy.uncompress(compressed) + self._block_read += len(uncompressed) + self._subblock_size = None + return uncompressed + + def flush(self): + if self._buf != b"": + raise snappy.UncompressError("chunk truncated") + return b"" + + def copy(self): + copy = HadoopStreamDecompressor() + copy._buf = self._buf + copy._block_size = self._block_size + copy._block_read = self._block_read + copy._subblock_size = self._subblock_size + return copy + + +def hadoop_decompress(src, dst, blocksize=snappy._STREAM_TO_STREAM_BLOCK_SIZE): + decompressor = HadoopStreamDecompressor() + while True: + buf = src.read(blocksize) + if not buf: + break + buf = decompressor.decompress(buf) + if buf: + dst.write(buf) + decompressor.flush() # makes sure the stream ended well diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py index fa5432b24..4ed56f478 100644 --- a/r2/r2/lib/pages/pages.py +++ b/r2/r2/lib/pages/pages.py @@ -203,8 +203,10 @@ def responsive(res, space_compress=None): class Robots(Templated): - pass + def __init__(self, **context): + Templated.__init__(self, **context) + self.subreddit_sitemap = g.sitemap_subreddit_static_url class CrossDomain(Templated): pass diff --git a/r2/r2/lib/s3_helpers.py b/r2/r2/lib/s3_helpers.py index 8d4c7abd0..4bf959bb7 100644 --- a/r2/r2/lib/s3_helpers.py +++ b/r2/r2/lib/s3_helpers.py @@ -30,6 +30,7 @@ import sys import time import datetime import pytz +from collections import namedtuple from pylons import app_globals as g @@ -69,6 +70,17 @@ def _from_path(path): return bucket, key +S3Path = namedtuple('S3Path', ['bucket', 'key']) + + +def parse_s3_path(path): + return S3Path(*_from_path(path)) + + +def format_expires(expires): + return expires.strftime(EXPIRES_DATE_FORMAT) + + def get_text_from_s3(s3_connection, path): """Read a file from S3 and return it as text.""" bucket_name, key_name = _from_path(path) @@ -363,4 +375,3 @@ def get_post_args( "action": "//%s.%s" % (bucket, g.s3_media_domain), "fields": fields, } - diff --git a/r2/r2/lib/sitemaps/__init__.py 
b/r2/r2/lib/sitemaps/__init__.py index 9943b1378..ac13af3ca 100644 --- a/r2/r2/lib/sitemaps/__init__.py +++ b/r2/r2/lib/sitemaps/__init__.py @@ -34,8 +34,7 @@ Reddit contains tons and tons of links. Generating them on the fly is simply impractical. The solution this module implements is a very slow batch that goes through every subreddit and every link and creates crawlable permalinks from them. These links are then put into sitemaps and stored in -r2.lib.sitemap.Sitemap. The sitemap controllers then simply serve up the xml -files that they find in the Sitemap Thing. +s3. We then upload those sitemaps as static files to s3 where we host them. The Sitemap protocol specifies a hard limit of 50000 links. Since we have significantly more links than that, we have to define a Sitemap Index @@ -57,11 +56,11 @@ This module is split into 3 parts. r2.lib.sitemaps.data - Loads up the raw Subreddit and Link Things. r2.lib.sitemaps.generate - Transforms the Things into sitemap xml strings. - r2.lib.sitemaps.store - Glue code that makes and stores the sitemaps. + r2.lib.sitemaps.store - Stores the sitemaps on s3. + r2.lib.sitemaps.watcher - Reads from the SQS queue and starts a new upload The only function that's supposed to be used outside of this module is -r2.lib.sitemaps.store.store_sitemaps which dumps the new -sitemaps into r2.models.sitemap.Sitemap. This is designed to be used -through a daily cron job given the slow speed of the operation. +r2.lib.sitemaps.watcher.watcher. This is designed to be used as a constantly +running daemon. """ diff --git a/r2/r2/lib/sitemaps/data.py b/r2/r2/lib/sitemaps/data.py index 3b8bad127..9d467a6bf 100644 --- a/r2/r2/lib/sitemaps/data.py +++ b/r2/r2/lib/sitemaps/data.py @@ -25,53 +25,49 @@ Currently only supports subreddit links but will soon support comment links. """ -import hashlib -import itertools +import tempfile +from boto.s3.connection import S3Connection from pylons import app_globals as g -from r2.lib.db.operators import asc -from r2.lib.utils import fetch_things2, rate_limited_generator -from r2.models.subreddit import Subreddit +from r2.lib.hadoop_decompress import hadoop_decompress -DB_CHUNK_SIZE = 50000 -DB_RATE_LIMIT = DB_CHUNK_SIZE -EXPERIMENT_SUBREDDIT_SITEMAP = 'experiment-subreddit-sitemap' -# number of possible ways a subreddit can be partitioned for the experiment. -EXPERIMENT_BUCKET_COUNT = 20 +def _read_subreddit_etl_from_s3(s3path): + s3conn = S3Connection() + bucket = s3conn.get_bucket(s3path.bucket, validate=False) + s3keys = bucket.list(s3path.key) + + key_count = 0 + for s3key in s3keys: + g.log.info("Importing key %r", s3key) + + with tempfile.TemporaryFile(mode='rw+b') as ntf_download: + with tempfile.TemporaryFile(mode='rw+b') as ntf_decompress: + + # download it + g.log.debug("Downloading %r", s3key) + s3key.get_contents_to_file(ntf_download) + + # decompress it + ntf_download.flush() + ntf_download.seek(0) + g.log.debug("Decompressing %r", s3key) + hadoop_decompress(ntf_download, ntf_decompress) + ntf_decompress.flush() + ntf_decompress.seek(0) + + # import it + g.log.debug("Starting import of %r", s3key) + for line in ntf_decompress: + yield line + key_count += 1 + + if key_count == 0: + raise ValueError('{0} contains no readable keys.'.format(s3path)) -def rate_limit_query(query): - return rate_limited_generator( - DB_RATE_LIMIT, - fetch_things2(query, DB_CHUNK_SIZE), - ) - -def is_part_of_experiment(subreddit): - """Decide that this subreddit is part of the seo traffic experiment. 
- - At the moment the features system (r2/config/feature/README.md) - is designed to be bucketed on a per user basis. We would like an - experiment that is bucketed by subreddits instead. To do this we - are going completely around the features system and instead - bucketing the code here and communicating our hashing method with - the data team. - - Much of this logic is borrowed from FeatureState. - """ - key = '_'.join((EXPERIMENT_SUBREDDIT_SITEMAP, subreddit.name)) - hashed = hashlib.sha1(key) - bucket = long(hashed.hexdigest(), 16) % EXPERIMENT_BUCKET_COUNT - return bucket == 0 - -def is_subreddit_to_crawl(subreddit): - return (subreddit.quarantine == False and - subreddit.over_18 == False and - is_part_of_experiment(subreddit)) - -def find_all_subreddits(): - iterator = rate_limit_query(Subreddit._query( - *[Subreddit.c.type != type_ for type_ in Subreddit.private_types], - sort=asc('_date'))) - return itertools.ifilter(is_subreddit_to_crawl, iterator) +def find_all_subreddits(s3path): + for line in _read_subreddit_etl_from_s3(s3path): + _, subreddit, __ = line.split('\x01') + yield subreddit diff --git a/r2/r2/lib/sitemaps/generate.py b/r2/r2/lib/sitemaps/generate.py index 3e978dab3..7b43cd5c8 100644 --- a/r2/r2/lib/sitemaps/generate.py +++ b/r2/r2/lib/sitemaps/generate.py @@ -65,6 +65,7 @@ Each sitemap and sitemap index will have 50000 links or fewer. from lxml import etree from pylons import app_globals as g + from r2.lib.template_helpers import add_sr from r2.lib.utils import in_chunks @@ -72,11 +73,11 @@ SITEMAP_NAMESPACE = "http://www.sitemaps.org/schemas/sitemap/0.9" LINKS_PER_SITEMAP = 50000 -def absolute_url(path): +def _absolute_url(path): return add_sr(path, force_https=True, sr_path=False) -def stringify_xml(root_element): +def _stringify_xml(root_element): return etree.tostring( root_element, pretty_print=g.debug, @@ -85,18 +86,19 @@ def stringify_xml(root_element): ) -def subreddit_links(subreddits): +def _subreddit_links(subreddits): for subreddit in subreddits: - yield absolute_url(subreddit.path) + path = '/r/{0}/'.format(subreddit) + yield _absolute_url(path) -def subreddit_sitemap(subreddits): - urlset = etree.Element('urlset', xmlns=SITEMAP_NAMESPACE) - for link in subreddit_links(subreddits): +def _subreddit_sitemap(subreddits): + urlset = etree.Element('urlset', xmlns=SITEMAP_NAMESPACE) + for link in _subreddit_links(subreddits): url_elem = etree.SubElement(urlset, 'url') loc_elem = etree.SubElement(url_elem, 'loc') loc_elem.text = link - return stringify_xml(urlset) + return _stringify_xml(urlset) def subreddit_sitemaps(subreddits): @@ -106,13 +108,15 @@ def subreddit_sitemaps(subreddits): links according to the sitemap standard. 
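+
+    For example, each sitemap is an <urlset> of <url> entries, roughly of
+    the form (the domain shown assumes the canonical https reddit host):
+
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url>
+            <loc>https://www.reddit.com/r/pics/</loc>
+          </url>
+        </urlset>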
""" for subreddit_chunks in in_chunks(subreddits, LINKS_PER_SITEMAP): - yield subreddit_sitemap(subreddit_chunks) + yield _subreddit_sitemap(subreddit_chunks) def sitemap_index(count): - sm_elem = etree.Element('sitemapindex', xmlns=SITEMAP_NAMESPACE) + sm_elem = etree.Element('sitemapindex', xmlns=SITEMAP_NAMESPACE) for i in xrange(count): sitemap_elem = etree.SubElement(sm_elem, 'sitemap') loc_elem = etree.SubElement(sitemap_elem, 'loc') - loc_elem.text = absolute_url('/subreddit_sitemap?index={0}'.format(i)) - return stringify_xml(sm_elem) + url = '{0}/subreddit_sitemap/{1}.xml'.format( + g.sitemap_s3_static_host, i) + loc_elem.text = url + return _stringify_xml(sm_elem) diff --git a/r2/r2/lib/sitemaps/store.py b/r2/r2/lib/sitemaps/store.py index b0502e3f3..55ffcdf4c 100644 --- a/r2/r2/lib/sitemaps/store.py +++ b/r2/r2/lib/sitemaps/store.py @@ -20,12 +20,78 @@ # Inc. All Rights Reserved. ############################################################################### -from r2.models.sitemap import SitemapUpdater -from r2.lib.sitemaps.data import find_all_subreddits +"""Store sitemaps in s3. + +This module is uploads all subreddit sitemaps as well as the sitemap index +to s3. The basic idea is that amazon will be serving the static sitemaps for +us. + +The binary data we send to s3 is a gzipped xml file. In addition we also +send the appropriate type and encoding headers so this is understood +correctly by the browser. + +The only file expected to be used outside this module is: + +store_sitemaps_in_s3(subreddits) + +Even though the subreddits are expected to be generated and passed into this +function, the sitemap index is created here. The reasoning is that in order +to create the sitemap index we need to know how many sitemaps we have. +If we simply queried the subreddit iterator for it's length then we would +have to load all of the subreddits into memory, which would be ... bad. 
+""" + + +import gzip +from StringIO import StringIO + +from boto.s3.connection import S3Connection +from boto.s3.key import Key +from pylons import app_globals as g + from r2.lib.sitemaps.generate import subreddit_sitemaps, sitemap_index -def store_sitemaps(): - sitemap_updater = SitemapUpdater() - for subreddit_sitemap in subreddit_sitemaps(find_all_subreddits()): - sitemap_updater.add_subreddit_sitemap(subreddit_sitemap) - sitemap_updater.add_sitemap_index(sitemap_index(sitemap_updater.count)) + +HEADERS = { + 'Content-Type': 'text/xml', + 'Content-Encoding': 'gzip', +} + + +def zip_string(string): + zipbuffer = StringIO() + with gzip.GzipFile(mode='w', fileobj=zipbuffer) as f: + f.write(string) + return zipbuffer.getvalue() + + +def upload_sitemap(key, sitemap): + key.set_contents_from_string(zip_string(sitemap), headers=HEADERS) + + +def store_subreddit_sitemap(bucket, index, sitemap): + key = Key(bucket) + key.key = 'subreddit_sitemap/{0}.xml'.format(index) + g.log.debug("Uploading %r", key) + + upload_sitemap(key, sitemap) + + +def store_sitemap_index(bucket, count): + key = Key(bucket) + key.key = g.sitemap_subreddit_keyname + g.log.debug("Uploading %r", key) + + upload_sitemap(key, sitemap_index(count)) + + +def store_sitemaps_in_s3(subreddits): + s3conn = S3Connection() + bucket = s3conn.get_bucket(g.sitemap_upload_s3_bucket, validate=False) + + sitemap_count = 0 + for i, sitemap in enumerate(subreddit_sitemaps(subreddits)): + store_subreddit_sitemap(bucket, i, sitemap) + sitemap_count += 1 + + store_sitemap_index(bucket, sitemap_count) diff --git a/r2/r2/lib/sitemaps/watcher.py b/r2/r2/lib/sitemaps/watcher.py new file mode 100644 index 000000000..2fc3dc45e --- /dev/null +++ b/r2/r2/lib/sitemaps/watcher.py @@ -0,0 +1,101 @@ +import datetime +import dateutil +import json +import pytz +import time + +from boto.s3.connection import S3Connection +from boto.sqs.connection import SQSConnection +from pylons import app_globals as g + +from r2.lib.s3_helpers import parse_s3_path +from r2.lib.sitemaps.store import store_sitemaps_in_s3 +from r2.lib.sitemaps.data import find_all_subreddits + +"""Watch for SQS messages informing us to read, generate, and store sitemaps. + +There is only function that should be used outside this module + +watcher() + +It is designed to be used in a daemon process. +""" + + +def watcher(): + """Poll for new sitemap data and process it as necessary.""" + while True: + _process_message() + + +def _subreddit_sitemap_key(): + conn = S3Connection() + bucket = conn.get_bucket(g.sitemap_upload_s3_bucket, validate=False) + return bucket.get_key(g.sitemap_subreddit_keyname) + + +def _datetime_from_timestamp(timestamp): + return datetime.datetime.fromtimestamp(timestamp / 1000, pytz.utc) + + +def _before_last_sitemap(timestamp): + sitemap_key = _subreddit_sitemap_key() + if sitemap_key is None: + return False + + sitemap_datetime = dateutil.parser.parse(sitemap_key.last_modified) + compare_datetime = _datetime_from_timestamp(timestamp) + return compare_datetime < sitemap_datetime + + +def _process_message(): + if not g.sitemap_sqs_queue: + return + + sqs = SQSConnection() + sqs_q = sqs.get_queue(g.sitemap_sqs_queue) + + messages = sqs.receive_message(sqs_q, number_messages=1) + + if not messages: + return + + message, = messages + + js = json.loads(message.get_body()) + s3path = parse_s3_path(js['location']) + + # There are some error cases that allow us to get messages + # for sitemap creation that are now out of date. 
+ timestamp = js.get('timestamp') + if timestamp is not None and _before_last_sitemap(timestamp): + sqs_q.delete_message(message) + return + + g.log.info("Got import job %r", js) + + subreddits = find_all_subreddits(s3path) + store_sitemaps_in_s3(subreddits) + + sqs_q.delete_message(message) + + +def _current_timestamp(): + return time.time() * 1000 + + +def _create_test_message(): + """A dev only function that drops a new message on the sqs queue.""" + sqs = SQSConnection() + sqs_q = sqs.get_queue(g.sitemap_sqs_queue) + + # it returns None on failure + assert sqs_q, "failed to connect to queue" + + message = sqs_q.new_message(body=json.dumps({ + 'job_name': 'daily-sr-sitemap-reporting', + 'location': ('s3://reddit-data-analysis/big-data/r2/prod/' + + 'daily_sr_sitemap_reporting/dt=2016-06-14'), + 'timestamp': _current_timestamp(), + })) + sqs_q.write(message) diff --git a/r2/r2/models/sitemap.py b/r2/r2/models/sitemap.py deleted file mode 100644 index 42d6d6b26..000000000 --- a/r2/r2/models/sitemap.py +++ /dev/null @@ -1,126 +0,0 @@ -# The contents of this file are subject to the Common Public Attribution -# License Version 1.0. (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public -# License Version 1.1, but Sections 14 and 15 have been added to cover use of -# software over a computer network and provide for limited attribution for the -# Original Developer. In addition, Exhibit A has been modified to be consistent -# with Exhibit B. -# -# Software distributed under the License is distributed on an "AS IS" basis, -# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for -# the specific language governing rights and limitations under the License. -# -# The Original Code is reddit. -# -# The Original Developer is the Initial Developer. The Initial Developer of -# the Original Code is reddit Inc. -# -# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit -# Inc. All Rights Reserved. -############################################################################### - - -from datetime import datetime, timedelta -import uuid - -from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE, DATE_TYPE - -from r2.lib.db import tdb_cassandra -from r2.lib.db.operators import desc - -TTL = timedelta(days=10) - -class Sitemap(tdb_cassandra.View): - """Sitemaps that store the current state of the reddits. - - See: http://www.sitemaps.org/protocol.html - - We use a method of indirection to store the actual sitemaps. Even - though `Sitemap._retrieve_sitemap(Sitemap.INDEX_KEY)` can be used to get - the sitemap index, it's not the same as `Sitemap._byID(Sitemap.INDEX_KEY)`. - - Instead for every batch update we create a unique subkey. - `Sitemap._byID(Sitemap.INDEX_KEY)`. Then returns the INDEX_KEY appended - to the current subkey. We then use that to retrieve the actual sitemap. 
- """ - - _use_db = True - _ttl = TTL - - INDEX_KEY = 'key' - SUBREDDIT_KEY = 'subreddit_{0}' - - @classmethod - def sitemap_index(cls): - """Find the current sitemap index.""" - return cls._retrieve_sitemap(cls.INDEX_KEY) - - @classmethod - def subreddit_sitemap(cls, index): - """Find one of the sitemaps dedicated to subreddit links.""" - return cls._retrieve_sitemap(cls._subreddit_key(index)) - - @classmethod - def add_subreddit_sitemap(cls, sitemap, index, subkey): - key = cls._subreddit_key(index) - joined_key = cls._joined_key(key, subkey) - cls._set_values(joined_key, {'sitemap': sitemap}) - cls._set_values(key, {'latest': joined_key}) - - @classmethod - def add_sitemap_index(cls, sitemap_index, subkey): - joined_key = cls._joined_key(cls.INDEX_KEY, subkey) - cls._set_values(joined_key, {'sitemap': sitemap_index}) - cls._set_values(cls.INDEX_KEY, {'latest': joined_key}) - - @staticmethod - def generate_subkey(): - return datetime.now().strftime('%y%m%d%H%M%S') - - @classmethod - def _retrieve_sitemap(cls, sitemap_key): - joined_key = cls._byID(sitemap_key).latest - return cls._byID(joined_key).sitemap - - @classmethod - def _subreddit_key(cls, index): - return cls.SUBREDDIT_KEY.format(index) - - @staticmethod - def _joined_key(key, subkey): - return '_'.join((key, subkey)) - - -class SitemapUpdater(object): - """A class that facilitates the saving of many sitemaps. - - This minimal helper class maintains the state of the subkey as well as the - indices of the various sitemap types. - - Usage: - - >>> su = SitemapUpdater() - >>> su.add_subreddit_sitemap(subreddit_sitemap_1) - >>> su.add_subreddit_sitemap(subreddit_sitemap_2) - >>> su.add_comment_page_sitemap(comment_page_sitemap) - >>> su.add_sitemap_index(create_sitemap_index(su.count)) - >>> su.count # evaluates to 3 - """ - - def __init__(self): - self._subkey = Sitemap.generate_subkey() - self._subreddit_count = 0 - - def add_subreddit_sitemap(self, sitemap): - Sitemap.add_subreddit_sitemap( - sitemap, self._subreddit_count, self._subkey) - self._subreddit_count += 1 - - def add_sitemap_index(self, sitemap_index): - Sitemap.add_sitemap_index(sitemap_index, self._subkey) - - @property - def count(self): - # note that sitemap indices don't count towards this count. - return self._subreddit_count diff --git a/r2/r2/templates/robots.txt b/r2/r2/templates/robots.txt index 4a64f6b81..d2e8c92cb 100644 --- a/r2/r2/templates/robots.txt +++ b/r2/r2/templates/robots.txt @@ -62,4 +62,4 @@ Disallow: /search Disallow: /r/*/search Allow: / -Sitemap: ${add_sr('sitemap', sr_path=False, force_https=True)} +Sitemap: ${thing.subreddit_sitemap} diff --git a/upstart/reddit-consumer-sitemaps_q.conf b/upstart/reddit-consumer-sitemaps_q.conf new file mode 100644 index 000000000..4926998f1 --- /dev/null +++ b/upstart/reddit-consumer-sitemaps_q.conf @@ -0,0 +1,13 @@ +description "build sitemaps for almost every link on reddit" + +instance $x + +stop on reddit-stop or runlevel [016] + +respawn + +nice 10 +script + . /etc/default/reddit + wrap-job paster run --proctitle sitemaps_q$x $REDDIT_INI $REDDIT_ROOT/r2/lib/sitemaps/watcher.py -c 'watcher()' +end script