Add a cache by URL to the media scraper

Pull media embeds and thumbnails from a cache when cache use is requested
and cached media are available.
David Ehrmann
2014-02-21 14:38:55 -08:00
parent 34fd7f89ca
commit 11e739e1c6
2 changed files with 257 additions and 32 deletions
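
In rough terms, a caller opts into the new cache by passing use_cache (and optionally max_cache_age) down to the _scrape_media helper added below. A minimal sketch of the intended flow, assuming an initialized reddit app context and a made-up URL (the import path assumes the modified file is r2/r2/lib/media.py):

from datetime import timedelta

from r2.lib.media import _scrape_media

# First call: nothing is cached yet, so the URL is scraped and the resulting
# Media tuple is stored in MediaByURL.
media = _scrape_media("http://example.com/video", use_cache=True)

# A later call is served from MediaByURL instead of re-scraping, provided the
# cached entry is younger than max_cache_age.
media = _scrape_media("http://example.com/video", use_cache=True,
                      max_cache_age=timedelta(hours=1))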

r2/r2/lib/media.py

@@ -47,6 +47,15 @@ from r2.lib.memoize import memoize
from r2.lib.nymph import optimize_png
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
from r2.models.link import Link
+from r2.models.media_cache import (
+    ERROR_MEDIA,
+    Media,
+    MediaByURL,
+)
+from urllib2 import (
+    HTTPError,
+    URLError,
+)
MEDIA_FILENAME_LENGTH = 12
@@ -238,7 +247,59 @@ def upload_stylesheet(content):
    return g.media_provider.put(file_name, content)
-def _set_media(link, force=False):
+def _scrape_media(url, autoplay=False, force=False, use_cache=False,
+                  max_cache_age=None):
+    media = None
+
+    # Use media from the cache (if available)
+    if not force and use_cache:
+        mediaByURL = MediaByURL.get(url, autoplay=bool(autoplay),
+                                    max_cache_age=max_cache_age)
+        if mediaByURL:
+            media = mediaByURL.media
+
+    # Otherwise, scrape it
+    if not media:
+        media_object = secure_media_object = None
+        thumbnail_image = thumbnail_url = thumbnail_size = None
+
+        scraper = Scraper.for_url(url, autoplay=autoplay)
+        try:
+            thumbnail_image, media_object, secure_media_object = (
+                scraper.scrape())
+        except (HTTPError, URLError) as e:
+            if use_cache:
+                MediaByURL.add_error(url, str(e),
+                                     autoplay=bool(autoplay))
+            return None
+
+        # the scraper should be able to make a media embed out of the
+        # media object it just gave us. if not, null out the media object
+        # to protect downstream code
+        if media_object and not scraper.media_embed(media_object):
+            print "%s made a bad media obj for url %s" % (scraper, url)
+            media_object = None
+
+        if (secure_media_object and
+                not scraper.media_embed(secure_media_object)):
+            print "%s made a bad secure media obj for url %s" % (scraper, url)
+            secure_media_object = None
+
+        if thumbnail_image:
+            thumbnail_size = thumbnail_image.size
+            thumbnail_url = upload_media(thumbnail_image)
+
+        media = Media(media_object, secure_media_object,
+                      thumbnail_url, thumbnail_size)
+
+    # Store the media in the cache (if requested), possibly extending the ttl
+    if use_cache and media is not ERROR_MEDIA:
+        MediaByURL.add(url, media, autoplay=bool(autoplay))
+
+    return media
+
+
+def _set_media(link, force=False, **kwargs):
    if link.is_self:
        return
    if not force and link.promoted:
@@ -246,34 +307,16 @@ def _set_media(link, force=False):
    elif not force and (link.has_thumbnail or link.media_object):
        return
-    scraper = Scraper.for_url(link.url)
-    thumbnail, media_object, secure_media_object = scraper.scrape()
+    media = _scrape_media(link.url, force=force, **kwargs)
-    if media_object:
-        # the scraper should be able to make a media embed out of the
-        # media object it just gave us. if not, null out the media object
-        # to protect downstream code
-        res = scraper.media_embed(media_object)
+    if media and not link.promoted:
+        link.thumbnail_url = media.thumbnail_url
+        link.thumbnail_size = media.thumbnail_size
-        if not res:
-            print "%s made a bad media obj for link %s" % (scraper, link._id36)
-            media_object = None
+        link.set_media_object(media.media_object)
+        link.set_secure_media_object(media.secure_media_object)
-    if secure_media_object:
-        res = scraper.media_embed(secure_media_object)
-        if not res:
-            print "%s made a bad secure media obj for link %s" % (scraper,
-                                                                  link._id36)
-            secure_media_object = None
-    if thumbnail:
-        link.thumbnail_url = upload_media(thumbnail)
-        link.thumbnail_size = thumbnail.size
-    link.set_media_object(media_object)
-    link.set_secure_media_object(secure_media_object)
-    link._commit()
    link._commit()
def force_thumbnail(link, image_data, file_type=".jpg"):
@@ -344,7 +387,7 @@ def _make_thumbnail_from_url(thumbnail_url, referer):
class Scraper(object):
    @classmethod
-    def for_url(cls, url):
+    def for_url(cls, url, autoplay=False):
        scraper = hooks.get_hook("scraper.factory").call_until_return(url=url)
        if scraper:
            return scraper
@@ -352,7 +395,7 @@ class Scraper(object):
        embedly_services = _fetch_embedly_services()
        for service_re, service_secure in embedly_services:
            if service_re.match(url):
-                return _EmbedlyScraper(url, service_secure)
+                return _EmbedlyScraper(url, service_secure, autoplay=autoplay)
        return _ThumbnailOnlyScraper(url)
@@ -438,18 +481,25 @@ class _ThumbnailOnlyScraper(Scraper):
class _EmbedlyScraper(Scraper):
    EMBEDLY_API_URL = "https://api.embed.ly/1/oembed"
-    def __init__(self, url, can_embed_securely):
+    def __init__(self, url, can_embed_securely, autoplay=False):
        self.url = url
        self.can_embed_securely = can_embed_securely
+        self.embedly_params = {}
+        if autoplay:
+            self.embedly_params["autoplay"] = "true"
    def _fetch_from_embedly(self, secure):
-        params = urllib.urlencode({
+        param_dict = {
            "url": self.url,
            "format": "json",
            "maxwidth": 600,
            "key": g.embedly_api_key,
            "secure": "true" if secure else "false",
-        })
+        }
+        param_dict.update(self.embedly_params)
+        params = urllib.urlencode(param_dict)
        content = requests.get(self.EMBEDLY_API_URL + "?" + params).content
        return json.loads(content)
@@ -527,7 +577,7 @@ def run():
        link = Link._by_fullname(msg.body, data=True)
        try:
-            TimeoutFunction(_set_media, 30)(link)
+            TimeoutFunction(_set_media, 30)(link, use_cache=True)
        except TimeoutFunctionException:
            print "Timed out on %s" % fname
        except KeyboardInterrupt:

r2/r2/models/media_cache.py (new executable file, 175 lines)

@@ -0,0 +1,175 @@
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2013-2014 reddit
# Inc. All Rights Reserved.
###############################################################################
import collections
import json
from datetime import (
    datetime,
    timedelta,
)

from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE

from r2.lib.db import tdb_cassandra
Media = collections.namedtuple('_Media', ("media_object",
                                          "secure_media_object",
                                          "thumbnail_url",
                                          "thumbnail_size"))

ERROR_MEDIA = Media(None, None, None, None)
class MediaByURL(tdb_cassandra.View):
    _use_db = True
    _connection_pool = 'main'
    _ttl = timedelta(minutes=720)
    _read_consistency_level = tdb_cassandra.CL.QUORUM
    _write_consistency_level = tdb_cassandra.CL.QUORUM
    _int_props = {"thumbnail_width", "thumbnail_height"}
    _date_props = {"last_modified"}
    _extra_schema_creation_args = {
        "key_validation_class": ASCII_TYPE,
        "column_name_class": UTF8_TYPE,
    }
    _defaults = {
        "state": "enqueued",
        "error": "",
        "thumbnail_url": "",
        "thumbnail_width": 0,
        "thumbnail_height": 0,
        "media_object": "",
        "secure_media_object": "",
        "last_modified": datetime.utcfromtimestamp(0),
    }
    @classmethod
    def _rowkey(cls, url, **kwargs):
        return (
            url +
            # pipe is not allowed in URLs, so use it as a delimiter
            "|" +
            # append the extra cache keys in kwargs as a canonical JSON string
            json.dumps(
                kwargs,
                ensure_ascii=True,
                encoding="ascii",
                indent=None,
                separators=(",", ":"),
                sort_keys=True,
            )
        )
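
    # Illustration (not part of the original file): _rowkey("http://example.com/a",
    # autoplay=True) would return 'http://example.com/a|{"autoplay":true}'.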
    @classmethod
    def add_placeholder(cls, url, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        cls._set_values(rowkey, {
            "state": "enqueued",
            "error": "",
            "last_modified": datetime.utcnow(),
        })
    @classmethod
    def add(cls, url, media, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        columns = cls._defaults.copy()
        columns.update({
            "state": "processed",
            "error": "",
            "last_modified": datetime.utcnow(),
        })

        if media.thumbnail_url and media.thumbnail_size:
            columns.update({
                "thumbnail_url": media.thumbnail_url,
                "thumbnail_width": media.thumbnail_size[0],
                "thumbnail_height": media.thumbnail_size[1],
            })

        if media.media_object:
            columns.update({
                "media_object": json.dumps(media.media_object),
            })

        if media.secure_media_object:
            columns.update({
                "secure_media_object": json.dumps(media.secure_media_object),
            })

        cls._set_values(rowkey, columns)
    @classmethod
    def add_error(cls, url, error, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        columns = {
            "error": error,
            "state": "processed",
            "last_modified": datetime.utcnow(),
        }
        cls._set_values(rowkey, columns)
    @classmethod
    def get(cls, url, max_cache_age=None, **kwargs):
        rowkey = cls._rowkey(url, **kwargs)
        try:
            temp = cls._byID(rowkey)

            # Return None if this cache entry is too old
            if (max_cache_age is not None and
                    datetime.utcnow() - temp.last_modified > max_cache_age):
                return None
            else:
                return temp
        except tdb_cassandra.NotFound:
            return None
    @property
    def media(self):
        if self.state == "processed":
            if not self.error:
                media_object = secure_media_object = None
                thumbnail_url = thumbnail_size = None

                if (self.thumbnail_width and self.thumbnail_height and
                        self.thumbnail_url):
                    thumbnail_url = self.thumbnail_url
                    thumbnail_size = (self.thumbnail_width,
                                      self.thumbnail_height)

                if self.media_object:
                    media_object = json.loads(self.media_object)
                if self.secure_media_object:
                    secure_media_object = json.loads(self.secure_media_object)

                return Media(media_object, secure_media_object,
                             thumbnail_url, thumbnail_size)
            else:
                return ERROR_MEDIA
        else:
            return None
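
Taken together, the new model's round trip is roughly the following; this is a sketch only (the URL, media object, and thumbnail values are made up, and a live Cassandra connection through tdb_cassandra is assumed):

from datetime import timedelta

from r2.models.media_cache import ERROR_MEDIA, Media, MediaByURL

stored = Media(media_object={"type": "video"},
               secure_media_object=None,
               thumbnail_url="http://thumbs.example.com/a.jpg",
               thumbnail_size=(70, 70))

# Write one row keyed on the URL plus the canonical JSON of the kwargs.
MediaByURL.add("http://example.com/video", stored, autoplay=False)

# Read it back.  get() returns None if there is no row or the row is older
# than max_cache_age; .media rebuilds the Media namedtuple, returning
# ERROR_MEDIA for a failed scrape and None while the URL is still enqueued.
cached = MediaByURL.get("http://example.com/video", autoplay=False,
                        max_cache_age=timedelta(hours=1))
if cached is not None and cached.media not in (None, ERROR_MEDIA):
    print cached.media.thumbnail_url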