Add a cache by URL to the media scraper

Pull media embeds and thumbnails from a cache when cache use is requested
and cached media are available.
David Ehrmann
2014-02-21 14:38:55 -08:00
parent 34fd7f89ca
commit 11e739e1c6
2 changed files with 257 additions and 32 deletions
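
In rough terms, a caller opts into the new cache by passing use_cache (and optionally max_cache_age) down to the _scrape_media helper added below. A minimal sketch of the intended flow, assuming an initialized reddit app context and a made-up URL (the import path assumes the modified file is r2/r2/lib/media.py):

from datetime import timedelta

from r2.lib.media import _scrape_media

# First call: nothing is cached yet, so the URL is scraped and the resulting
# Media tuple is stored in MediaByURL.
media = _scrape_media("http://example.com/video", use_cache=True)

# A later call is served from MediaByURL instead of re-scraping, provided the
# cached entry is younger than max_cache_age.
media = _scrape_media("http://example.com/video", use_cache=True,
                      max_cache_age=timedelta(hours=1))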

r2/r2/lib/media.py

@@ -47,6 +47,15 @@ from r2.lib.memoize import memoize
from r2.lib.nymph import optimize_png
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
from r2.models.link import Link
+from r2.models.media_cache import (
+    ERROR_MEDIA,
+    Media,
+    MediaByURL,
+)
+from urllib2 import (
+    HTTPError,
+    URLError,
+)
MEDIA_FILENAME_LENGTH = 12
@@ -238,7 +247,59 @@ def upload_stylesheet(content):
    return g.media_provider.put(file_name, content)
-def _set_media(link, force=False):
+def _scrape_media(url, autoplay=False, force=False, use_cache=False,
+                  max_cache_age=None):
+    media = None
+
+    # Use media from the cache (if available)
+    if not force and use_cache:
+        mediaByURL = MediaByURL.get(url, autoplay=bool(autoplay),
+                                    max_cache_age=max_cache_age)
+        if mediaByURL:
+            media = mediaByURL.media
+
+    # Otherwise, scrape it
+    if not media:
+        media_object = secure_media_object = None
+        thumbnail_image = thumbnail_url = thumbnail_size = None
+
+        scraper = Scraper.for_url(url, autoplay=autoplay)
+        try:
+            thumbnail_image, media_object, secure_media_object = (
+                scraper.scrape())
+        except (HTTPError, URLError) as e:
+            if use_cache:
+                MediaByURL.add_error(url, str(e),
+                                     autoplay=bool(autoplay))
+            return None
+
+        # the scraper should be able to make a media embed out of the
+        # media object it just gave us. if not, null out the media object
+        # to protect downstream code
+        if media_object and not scraper.media_embed(media_object):
+            print "%s made a bad media obj for url %s" % (scraper, url)
+            media_object = None
+
+        if (secure_media_object and
+                not scraper.media_embed(secure_media_object)):
+            print "%s made a bad secure media obj for url %s" % (scraper, url)
+            secure_media_object = None
+
+        if thumbnail_image:
+            thumbnail_size = thumbnail_image.size
+            thumbnail_url = upload_media(thumbnail_image)
+
+        media = Media(media_object, secure_media_object,
+                      thumbnail_url, thumbnail_size)
+
+    # Store the media in the cache (if requested), possibly extending the ttl
+    if use_cache and media is not ERROR_MEDIA:
+        MediaByURL.add(url, media, autoplay=bool(autoplay))
+
+    return media
+
+
+def _set_media(link, force=False, **kwargs):
    if link.is_self:
        return
    if not force and link.promoted:
@@ -246,34 +307,16 @@ def _set_media(link, force=False):
    elif not force and (link.has_thumbnail or link.media_object):
        return
-    scraper = Scraper.for_url(link.url)
-    thumbnail, media_object, secure_media_object = scraper.scrape()
+    media = _scrape_media(link.url, force=force, **kwargs)
-    if media_object:
-        # the scraper should be able to make a media embed out of the
-        # media object it just gave us. if not, null out the media object
-        # to protect downstream code
-        res = scraper.media_embed(media_object)
+    if media and not link.promoted:
+        link.thumbnail_url = media.thumbnail_url
+        link.thumbnail_size = media.thumbnail_size
-        if not res:
-            print "%s made a bad media obj for link %s" % (scraper, link._id36)
-            media_object = None
+        link.set_media_object(media.media_object)
+        link.set_secure_media_object(media.secure_media_object)
-    if secure_media_object:
-        res = scraper.media_embed(secure_media_object)
-        if not res:
-            print "%s made a bad secure media obj for link %s" % (scraper,
-                                                                  link._id36)
-            secure_media_object = None
-    if thumbnail:
-        link.thumbnail_url = upload_media(thumbnail)
-        link.thumbnail_size = thumbnail.size
-    link.set_media_object(media_object)
-    link.set_secure_media_object(secure_media_object)
-    link._commit()
    link._commit()
def force_thumbnail(link, image_data, file_type=".jpg"):
@@ -344,7 +387,7 @@ def _make_thumbnail_from_url(thumbnail_url, referer):
class Scraper(object):
    @classmethod
-    def for_url(cls, url):
+    def for_url(cls, url, autoplay=False):
        scraper = hooks.get_hook("scraper.factory").call_until_return(url=url)
        if scraper:
            return scraper
@@ -352,7 +395,7 @@ class Scraper(object):
        embedly_services = _fetch_embedly_services()
        for service_re, service_secure in embedly_services:
            if service_re.match(url):
-                return _EmbedlyScraper(url, service_secure)
+                return _EmbedlyScraper(url, service_secure, autoplay=autoplay)
        return _ThumbnailOnlyScraper(url)
@@ -438,18 +481,25 @@ class _ThumbnailOnlyScraper(Scraper):
class _EmbedlyScraper(Scraper):
    EMBEDLY_API_URL = "https://api.embed.ly/1/oembed"
-    def __init__(self, url, can_embed_securely):
+    def __init__(self, url, can_embed_securely, autoplay=False):
        self.url = url
        self.can_embed_securely = can_embed_securely
+        self.embedly_params = {}
+        if autoplay:
+            self.embedly_params["autoplay"] = "true"
    def _fetch_from_embedly(self, secure):
-        params = urllib.urlencode({
+        param_dict = {
            "url": self.url,
            "format": "json",
            "maxwidth": 600,
            "key": g.embedly_api_key,
            "secure": "true" if secure else "false",
-        })
+        }
+        param_dict.update(self.embedly_params)
+        params = urllib.urlencode(param_dict)
        content = requests.get(self.EMBEDLY_API_URL + "?" + params).content
        return json.loads(content)
@@ -527,7 +577,7 @@ def run():
        link = Link._by_fullname(msg.body, data=True)
        try:
-            TimeoutFunction(_set_media, 30)(link)
+            TimeoutFunction(_set_media, 30)(link, use_cache=True)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:

r2/r2/models/media_cache.py (new executable file, 175 lines)

@@ -0,0 +1,175 @@
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2013-2014 reddit
# Inc. All Rights Reserved.
###############################################################################
import collections
import json
from datetime import (
    datetime,
    timedelta,
)

from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE

from r2.lib.db import tdb_cassandra
Media = collections.namedtuple('_Media', ("media_object",
                                          "secure_media_object",
                                          "thumbnail_url",
                                          "thumbnail_size"))

ERROR_MEDIA = Media(None, None, None, None)
class MediaByURL(tdb_cassandra.View):
    _use_db = True
    _connection_pool = 'main'
    _ttl = timedelta(minutes=720)
    _read_consistency_level = tdb_cassandra.CL.QUORUM
    _write_consistency_level = tdb_cassandra.CL.QUORUM
    _int_props = {"thumbnail_width", "thumbnail_height"}
    _date_props = {"last_modified"}
    _extra_schema_creation_args = {
        "key_validation_class": ASCII_TYPE,
        "column_name_class": UTF8_TYPE,
    }
    _defaults = {
        "state": "enqueued",
        "error": "",
        "thumbnail_url": "",
        "thumbnail_width": 0,
        "thumbnail_height": 0,
        "media_object": "",
        "secure_media_object": "",
        "last_modified": datetime.utcfromtimestamp(0),
    }
    @classmethod
    def _rowkey(cls, url, **kwargs):
        return (
            url +
            # pipe is not allowed in URLs, so use it as a delimiter
            "|" +
            # append the extra cache keys in kwargs as a canonical JSON string
            json.dumps(
                kwargs,
                ensure_ascii=True,
                encoding="ascii",
                indent=None,
                separators=(",", ":"),
                sort_keys=True,
            )
        )
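
    # Illustration (not part of the original file): _rowkey("http://example.com/a",
    # autoplay=True) would return 'http://example.com/a|{"autoplay":true}'.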
    @classmethod
    def add_placeholder(cls, url, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        cls._set_values(rowkey, {
            "state": "enqueued",
            "error": "",
            "last_modified": datetime.utcnow(),
        })
    @classmethod
    def add(cls, url, media, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        columns = cls._defaults.copy()
        columns.update({
            "state": "processed",
            "error": "",
            "last_modified": datetime.utcnow(),
        })

        if media.thumbnail_url and media.thumbnail_size:
            columns.update({
                "thumbnail_url": media.thumbnail_url,
                "thumbnail_width": media.thumbnail_size[0],
                "thumbnail_height": media.thumbnail_size[1],
            })

        if media.media_object:
            columns.update({
                "media_object": json.dumps(media.media_object),
            })

        if media.secure_media_object:
            columns.update({
                "secure_media_object": json.dumps(media.secure_media_object),
            })

        cls._set_values(rowkey, columns)
    @classmethod
    def add_error(cls, url, error, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        columns = {
            "error": error,
            "state": "processed",
            "last_modified": datetime.utcnow(),
        }
        cls._set_values(rowkey, columns)
    @classmethod
    def get(cls, url, max_cache_age=None, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        try:
            temp = cls._byID(rowkey)

            # Return None if this cache entry is too old
            if (max_cache_age is not None and
                    datetime.utcnow() - temp.last_modified > max_cache_age):
                return None
            else:
                return temp
        except tdb_cassandra.NotFound:
            return None
    @property
    def media(self):
        if self.state == "processed":
            if not self.error:
                media_object = secure_media_object = None
                thumbnail_url = thumbnail_size = None

                if (self.thumbnail_width and self.thumbnail_height and
                        self.thumbnail_url):
                    thumbnail_url = self.thumbnail_url
                    thumbnail_size = (self.thumbnail_width,
                                      self.thumbnail_height)

                if self.media_object:
                    media_object = json.loads(self.media_object)
                if self.secure_media_object:
                    secure_media_object = json.loads(self.secure_media_object)

                return Media(media_object, secure_media_object,
                             thumbnail_url, thumbnail_size)
            else:
                return ERROR_MEDIA
        else:
            return None
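
Taken together, the new model's round trip is roughly the following; this is a sketch only (the URL, media object, and thumbnail values are made up, and a live Cassandra connection through tdb_cassandra is assumed):

from datetime import timedelta

from r2.models.media_cache import ERROR_MEDIA, Media, MediaByURL

stored = Media(media_object={"type": "video"},
               secure_media_object=None,
               thumbnail_url="http://thumbs.example.com/a.jpg",
               thumbnail_size=(70, 70))

# Write one row keyed on the URL plus the canonical JSON of the kwargs.
MediaByURL.add("http://example.com/video", stored, autoplay=False)

# Read it back.  get() returns None if there is no row or the row is older
# than max_cache_age; .media rebuilds the Media namedtuple, returning
# ERROR_MEDIA for a failed scrape and None while the URL is still enqueued.
cached = MediaByURL.get("http://example.com/video", autoplay=False,
                        max_cache_age=timedelta(hours=1))
if cached is not None and cached.media not in (None, ERROR_MEDIA):
    print cached.media.thumbnail_url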