diff --git a/r2/r2/lib/media.py b/r2/r2/lib/media.py index d35360fda..a37f8721e 100644 --- a/r2/r2/lib/media.py +++ b/r2/r2/lib/media.py @@ -47,6 +47,15 @@ from r2.lib.memoize import memoize from r2.lib.nymph import optimize_png from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain from r2.models.link import Link +from r2.models.media_cache import ( + ERROR_MEDIA, + Media, + MediaByURL, +) +from urllib2 import ( + HTTPError, + URLError, +) MEDIA_FILENAME_LENGTH = 12 @@ -238,7 +247,59 @@ def upload_stylesheet(content): return g.media_provider.put(file_name, content) -def _set_media(link, force=False): +def _scrape_media(url, autoplay=False, force=False, use_cache=False, + max_cache_age=None): + media = None + + # Use media from the cache (if available) + if not force and use_cache: + mediaByURL = MediaByURL.get(url, autoplay=bool(autoplay), + max_cache_age=max_cache_age) + if mediaByURL: + media = mediaByURL.media + + # Otherwise, scrape it + if not media: + media_object = secure_media_object = None + thumbnail_image = thumbnail_url = thumbnail_size = None + + scraper = Scraper.for_url(url, autoplay=autoplay) + try: + thumbnail_image, media_object, secure_media_object = ( + scraper.scrape()) + except (HTTPError, URLError) as e: + if use_cache: + MediaByURL.add_error(url, str(e), + autoplay=bool(autoplay)) + return None + + # the scraper should be able to make a media embed out of the + # media object it just gave us. 
if not, null out the media object + # to protect downstream code + if media_object and not scraper.media_embed(media_object): + print "%s made a bad media obj for url %s" % (scraper, url) + media_object = None + + if (secure_media_object and + not scraper.media_embed(secure_media_object)): + print "%s made a bad secure media obj for url %s" % (scraper, url) + secure_media_object = None + + if thumbnail_image: + thumbnail_size = thumbnail_image.size + thumbnail_url = upload_media(thumbnail_image) + + media = Media(media_object, secure_media_object, + thumbnail_url, thumbnail_size) + + # Store the media in the cache (if requested), possibly extending the ttl + if use_cache and media is not ERROR_MEDIA: + MediaByURL.add(url, media, autoplay=bool(autoplay)) + + return media + + +def _set_media(link, force=False, **kwargs): if link.is_self: return if not force and link.promoted: @@ -246,34 +307,16 @@ def _set_media(link, force=False): elif not force and (link.has_thumbnail or link.media_object): return - scraper = Scraper.for_url(link.url) - thumbnail, media_object, secure_media_object = scraper.scrape() + media = _scrape_media(link.url, force=force, **kwargs) - if media_object: - # the scraper should be able to make a media embed out of the - # media object it just gave us. 
if not, null out the media object - # to protect downstream code - res = scraper.media_embed(media_object) + if media and not link.promoted: + link.thumbnail_url = media.thumbnail_url + link.thumbnail_size = media.thumbnail_size - if not res: - print "%s made a bad media obj for link %s" % (scraper, link._id36) - media_object = None + link.set_media_object(media.media_object) + link.set_secure_media_object(media.secure_media_object) - if secure_media_object: - res = scraper.media_embed(secure_media_object) - - if not res: - print "%s made a bad secure media obj for link %s" % (scraper, - link._id36) - secure_media_object = None - - if thumbnail: - link.thumbnail_url = upload_media(thumbnail) - link.thumbnail_size = thumbnail.size - - link.set_media_object(media_object) - link.set_secure_media_object(secure_media_object) - link._commit() + link._commit() def force_thumbnail(link, image_data, file_type=".jpg"): @@ -344,7 +387,7 @@ def _make_thumbnail_from_url(thumbnail_url, referer): class Scraper(object): @classmethod - def for_url(cls, url): + def for_url(cls, url, autoplay=False): scraper = hooks.get_hook("scraper.factory").call_until_return(url=url) if scraper: return scraper @@ -352,7 +395,7 @@ class Scraper(object): embedly_services = _fetch_embedly_services() for service_re, service_secure in embedly_services: if service_re.match(url): - return _EmbedlyScraper(url, service_secure) + return _EmbedlyScraper(url, service_secure, autoplay=autoplay) return _ThumbnailOnlyScraper(url) @@ -438,18 +481,25 @@ class _ThumbnailOnlyScraper(Scraper): class _EmbedlyScraper(Scraper): EMBEDLY_API_URL = "https://api.embed.ly/1/oembed" - def __init__(self, url, can_embed_securely): + def __init__(self, url, can_embed_securely, autoplay=False): self.url = url self.can_embed_securely = can_embed_securely + self.embedly_params = {} + + if autoplay: + self.embedly_params["autoplay"] = "true" def _fetch_from_embedly(self, secure): - params = urllib.urlencode({ + param_dict = { "url": 
self.url, "format": "json", "maxwidth": 600, "key": g.embedly_api_key, "secure": "true" if secure else "false", - }) + } + + param_dict.update(self.embedly_params) + params = urllib.urlencode(param_dict) content = requests.get(self.EMBEDLY_API_URL + "?" + params).content return json.loads(content) @@ -527,7 +577,7 @@ def run(): link = Link._by_fullname(msg.body, data=True) try: - TimeoutFunction(_set_media, 30)(link) + TimeoutFunction(_set_media, 30)(link, use_cache=True) except TimeoutFunctionException: print "Timed out on %s" % fname except KeyboardInterrupt: diff --git a/r2/r2/models/media_cache.py b/r2/r2/models/media_cache.py new file mode 100755 index 000000000..907d40d28 --- /dev/null +++ b/r2/r2/models/media_cache.py @@ -0,0 +1,175 @@ +# The contents of this file are subject to the Common Public Attribution +# License Version 1.0. (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public +# License Version 1.1, but Sections 14 and 15 have been added to cover use of +# software over a computer network and provide for limited attribution for the +# Original Developer. In addition, Exhibit A has been modified to be consistent +# with Exhibit B. +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for +# the specific language governing rights and limitations under the License. +# +# The Original Code is reddit. +# +# The Original Developer is the Initial Developer. The Initial Developer of +# the Original Code is reddit Inc. +# +# All portions of the code written by reddit are Copyright (c) 2013-2014 reddit +# Inc. All Rights Reserved. 
+############################################################################### + +import collections +import json + +from datetime import ( + datetime, + timedelta, +) +from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE +from r2.lib.db import tdb_cassandra + + +Media = collections.namedtuple('_Media', ("media_object", + "secure_media_object", + "thumbnail_url", + "thumbnail_size")) + +ERROR_MEDIA = Media(None, None, None, None) + + +class MediaByURL(tdb_cassandra.View): + _use_db = True + _connection_pool = 'main' + _ttl = timedelta(minutes=720) + + _read_consistency_level = tdb_cassandra.CL.QUORUM + _write_consistency_level = tdb_cassandra.CL.QUORUM + _int_props = {"thumbnail_width", "thumbnail_height"} + _date_props = {"last_modified"} + _extra_schema_creation_args = { + "key_validation_class": ASCII_TYPE, + "column_name_class": UTF8_TYPE, + } + + _defaults = { + "state": "enqueued", + "error": "", + "thumbnail_url": "", + "thumbnail_width": 0, + "thumbnail_height": 0, + "media_object": "", + "secure_media_object": "", + "last_modified": datetime.utcfromtimestamp(0), + } + + @classmethod + def _rowkey(cls, url, **kwargs): + return ( + url + + # pipe is not allowed in URLs, so use it as a delimiter + "|" + + + # append the extra cache keys in kwargs as a canonical JSON string + json.dumps( + kwargs, + ensure_ascii=True, + encoding="ascii", + indent=None, + separators=(",", ":"), + sort_keys=True, + ) + ) + + @classmethod + def add_placeholder(cls, url, **kwargs): + rowkey = cls._rowkey(url, **kwargs) + cls._set_values(rowkey, { + "state": "enqueued", + "error": "", + "last_modified": datetime.utcnow(), + }) + + @classmethod + def add(cls, url, media, **kwargs): + rowkey = cls._rowkey(url, **kwargs) + columns = cls._defaults.copy() + + columns.update({ + "state": "processed", + "error": "", + "last_modified": datetime.utcnow(), + }) + + if media.thumbnail_url and media.thumbnail_size: + columns.update({ + "thumbnail_url": media.thumbnail_url, + 
"thumbnail_width": media.thumbnail_size[0], + "thumbnail_height": media.thumbnail_size[1], + }) + + if media.media_object: + columns.update({ + "media_object": json.dumps(media.media_object), + }) + + if media.secure_media_object: + columns.update({ + "secure_media_object": (json. + dumps(media.secure_media_object)), + }) + + cls._set_values(rowkey, columns) + + @classmethod + def add_error(cls, url, error, **kwargs): + rowkey = cls._rowkey(url, **kwargs) + columns = { + "error": error, + "state": "processed", + "last_modified": datetime.utcnow(), + } + cls._set_values(rowkey, columns) + + @classmethod + def get(cls, url, max_cache_age=None, **kwargs): + rowkey = cls._rowkey(url, **kwargs) + try: + temp = cls._byID(rowkey) + + # Return None if this cache entry is too old + if (max_cache_age is not None and + datetime.utcnow() - temp.last_modified > + max_cache_age): + return None + else: + return temp + except tdb_cassandra.NotFound: + return None + + @property + def media(self): + if self.state == "processed": + if not self.error: + media_object = secure_media_object = None + thumbnail_url = thumbnail_size = None + + if (self.thumbnail_width and self.thumbnail_height and + self.thumbnail_url): + thumbnail_url = self.thumbnail_url + thumbnail_size = (self.thumbnail_width, + self.thumbnail_height) + + if self.media_object: + media_object = json.loads(self.media_object) + + if self.secure_media_object: + secure_media_object = json.loads(self.secure_media_object) + + return Media(media_object, secure_media_object, + thumbnail_url, thumbnail_size) + else: + return ERROR_MEDIA + else: + return None