mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-04-27 03:00:12 -04:00
Add a cache by URL to the media scraper
Pull media embeds and thumbnails from a cache when cache use was requested and cached media are available.
This commit is contained in:
@@ -47,6 +47,15 @@ from r2.lib.memoize import memoize
|
||||
from r2.lib.nymph import optimize_png
|
||||
from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
|
||||
from r2.models.link import Link
|
||||
from r2.models.media_cache import (
|
||||
ERROR_MEDIA,
|
||||
Media,
|
||||
MediaByURL,
|
||||
)
|
||||
from urllib2 import (
|
||||
HTTPError,
|
||||
URLError,
|
||||
)
|
||||
|
||||
|
||||
MEDIA_FILENAME_LENGTH = 12
|
||||
@@ -238,7 +247,59 @@ def upload_stylesheet(content):
|
||||
return g.media_provider.put(file_name, content)
|
||||
|
||||
|
||||
def _set_media(link, force=False):
|
||||
def _scrape_media(url, autoplay=False, force=False, use_cache=False,
|
||||
max_cache_age=None):
|
||||
media = None
|
||||
|
||||
# Use media from the cache (if available)
|
||||
if not force and use_cache:
|
||||
mediaByURL = MediaByURL.get(url, autoplay=bool(autoplay),
|
||||
max_cache_age=max_cache_age)
|
||||
if mediaByURL:
|
||||
media = mediaByURL.media
|
||||
|
||||
# Otherwise, scrape it
|
||||
if not media:
|
||||
media_object = secure_media_object = None
|
||||
thumbnail_image = thumbnail_url = thumbnail_size = None
|
||||
|
||||
scraper = Scraper.for_url(url, autoplay=autoplay)
|
||||
try:
|
||||
thumbnail_image, media_object, secure_media_object = (
|
||||
scraper.scrape())
|
||||
except (HTTPError, URLError) as e:
|
||||
if use_cache:
|
||||
MediaByURL.add_error(url, str(e),
|
||||
autoplay=bool(autoplay))
|
||||
return None
|
||||
|
||||
# the scraper should be able to make a media embed out of the
|
||||
# media object it just gave us. if not, null out the media object
|
||||
# to protect downstream code
|
||||
if media_object and not scraper.media_embed(media_object):
|
||||
print "%s made a bad media obj for url %s" % (scraper, url)
|
||||
media_object = None
|
||||
|
||||
if (secure_media_object and
|
||||
not scraper.media_embed(secure_media_object)):
|
||||
print "%s made a bad secure media obj for url %s" % (scraper, url)
|
||||
secure_media_object = None
|
||||
|
||||
if thumbnail_image:
|
||||
thumbnail_size = thumbnail_image.size
|
||||
thumbnail_url = upload_media(thumbnail_image)
|
||||
|
||||
media = Media(media_object, secure_media_object,
|
||||
thumbnail_url, thumbnail_size)
|
||||
|
||||
# Store the media in the cache (if requested), possibly extending the ttl
|
||||
if use_cache and media is not ERROR_MEDIA:
|
||||
MediaByURL.add(url, media, autoplay=bool(autoplay))
|
||||
|
||||
return media
|
||||
|
||||
|
||||
def _set_media(link, force=False, **kwargs):
|
||||
if link.is_self:
|
||||
return
|
||||
if not force and link.promoted:
|
||||
@@ -246,34 +307,16 @@ def _set_media(link, force=False):
|
||||
elif not force and (link.has_thumbnail or link.media_object):
|
||||
return
|
||||
|
||||
scraper = Scraper.for_url(link.url)
|
||||
thumbnail, media_object, secure_media_object = scraper.scrape()
|
||||
media = _scrape_media(link.url, force=force, **kwargs)
|
||||
|
||||
if media_object:
|
||||
# the scraper should be able to make a media embed out of the
|
||||
# media object it just gave us. if not, null out the media object
|
||||
# to protect downstream code
|
||||
res = scraper.media_embed(media_object)
|
||||
if media and not link.promoted:
|
||||
link.thumbnail_url = media.thumbnail_url
|
||||
link.thumbnail_size = media.thumbnail_size
|
||||
|
||||
if not res:
|
||||
print "%s made a bad media obj for link %s" % (scraper, link._id36)
|
||||
media_object = None
|
||||
link.set_media_object(media.media_object)
|
||||
link.set_secure_media_object(media.secure_media_object)
|
||||
|
||||
if secure_media_object:
|
||||
res = scraper.media_embed(secure_media_object)
|
||||
|
||||
if not res:
|
||||
print "%s made a bad secure media obj for link %s" % (scraper,
|
||||
link._id36)
|
||||
secure_media_object = None
|
||||
|
||||
if thumbnail:
|
||||
link.thumbnail_url = upload_media(thumbnail)
|
||||
link.thumbnail_size = thumbnail.size
|
||||
|
||||
link.set_media_object(media_object)
|
||||
link.set_secure_media_object(secure_media_object)
|
||||
link._commit()
|
||||
link._commit()
|
||||
|
||||
|
||||
def force_thumbnail(link, image_data, file_type=".jpg"):
|
||||
@@ -344,7 +387,7 @@ def _make_thumbnail_from_url(thumbnail_url, referer):
|
||||
|
||||
class Scraper(object):
|
||||
@classmethod
|
||||
def for_url(cls, url):
|
||||
def for_url(cls, url, autoplay=False):
|
||||
scraper = hooks.get_hook("scraper.factory").call_until_return(url=url)
|
||||
if scraper:
|
||||
return scraper
|
||||
@@ -352,7 +395,7 @@ class Scraper(object):
|
||||
embedly_services = _fetch_embedly_services()
|
||||
for service_re, service_secure in embedly_services:
|
||||
if service_re.match(url):
|
||||
return _EmbedlyScraper(url, service_secure)
|
||||
return _EmbedlyScraper(url, service_secure, autoplay=autoplay)
|
||||
|
||||
return _ThumbnailOnlyScraper(url)
|
||||
|
||||
@@ -438,18 +481,25 @@ class _ThumbnailOnlyScraper(Scraper):
|
||||
class _EmbedlyScraper(Scraper):
|
||||
EMBEDLY_API_URL = "https://api.embed.ly/1/oembed"
|
||||
|
||||
def __init__(self, url, can_embed_securely):
|
||||
def __init__(self, url, can_embed_securely, autoplay=False):
|
||||
self.url = url
|
||||
self.can_embed_securely = can_embed_securely
|
||||
self.embedly_params = {}
|
||||
|
||||
if autoplay:
|
||||
self.embedly_params["autoplay"] = "true"
|
||||
|
||||
def _fetch_from_embedly(self, secure):
|
||||
params = urllib.urlencode({
|
||||
param_dict = {
|
||||
"url": self.url,
|
||||
"format": "json",
|
||||
"maxwidth": 600,
|
||||
"key": g.embedly_api_key,
|
||||
"secure": "true" if secure else "false",
|
||||
})
|
||||
}
|
||||
|
||||
param_dict.update(self.embedly_params)
|
||||
params = urllib.urlencode(param_dict)
|
||||
content = requests.get(self.EMBEDLY_API_URL + "?" + params).content
|
||||
return json.loads(content)
|
||||
|
||||
@@ -527,7 +577,7 @@ def run():
|
||||
link = Link._by_fullname(msg.body, data=True)
|
||||
|
||||
try:
|
||||
TimeoutFunction(_set_media, 30)(link)
|
||||
TimeoutFunction(_set_media, 30)(link, use_cache=True)
|
||||
except TimeoutFunctionException:
|
||||
print "Timed out on %s" % fname
|
||||
except KeyboardInterrupt:
|
||||
|
||||
175 lines
r2/r2/models/media_cache.py
Executable file
@@ -0,0 +1,175 @@
|
||||
# The contents of this file are subject to the Common Public Attribution
|
||||
# License Version 1.0. (the "License"); you may not use this file except in
|
||||
# compliance with the License. You may obtain a copy of the License at
|
||||
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
|
||||
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
|
||||
# software over a computer network and provide for limited attribution for the
|
||||
# Original Developer. In addition, Exhibit A has been modified to be consistent
|
||||
# with Exhibit B.
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
|
||||
# the specific language governing rights and limitations under the License.
|
||||
#
|
||||
# The Original Code is reddit.
|
||||
#
|
||||
# The Original Developer is the Initial Developer. The Initial Developer of
|
||||
# the Original Code is reddit Inc.
|
||||
#
|
||||
# All portions of the code written by reddit are Copyright (c) 2013-2014 reddit
|
||||
# Inc. All Rights Reserved.
|
||||
###############################################################################
|
||||
|
||||
import collections
|
||||
import json
|
||||
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
from pycassa.system_manager import ASCII_TYPE, UTF8_TYPE
|
||||
from r2.lib.db import tdb_cassandra
|
||||
|
||||
|
||||
# Everything the media scraper produces for a single URL: the (secure)
# embed objects plus the uploaded thumbnail's URL and (width, height).
Media = collections.namedtuple(
    '_Media',
    ("media_object", "secure_media_object", "thumbnail_url",
     "thumbnail_size"),
)

# Sentinel Media value meaning "we tried to scrape this URL and failed".
ERROR_MEDIA = Media(None, None, None, None)
|
||||
|
||||
|
||||
class MediaByURL(tdb_cassandra.View):
    """Cassandra-backed cache of scraped media info, keyed by URL.

    Each row records the scrape state ("enqueued" or "processed"), any
    scrape error message, thumbnail info, and the media/secure-media
    objects serialized as JSON.  Extra keyword arguments (e.g. autoplay)
    are folded into the row key so different scrape options get separate
    cache entries.
    """

    _use_db = True
    _connection_pool = 'main'
    # rows expire automatically after 12 hours
    _ttl = timedelta(minutes=720)

    _read_consistency_level = tdb_cassandra.CL.QUORUM
    _write_consistency_level = tdb_cassandra.CL.QUORUM
    _int_props = {"thumbnail_width", "thumbnail_height"}
    _date_props = {"last_modified"}
    _extra_schema_creation_args = {
        "key_validation_class": ASCII_TYPE,
        "column_name_class": UTF8_TYPE,
    }

    _defaults = {
        "state": "enqueued",
        "error": "",
        "thumbnail_url": "",
        "thumbnail_width": 0,
        "thumbnail_height": 0,
        "media_object": "",
        "secure_media_object": "",
        "last_modified": datetime.utcfromtimestamp(0),
    }

    @classmethod
    def _rowkey(cls, url, **kwargs):
        """Build the row key: the URL plus a canonical dump of kwargs."""
        return (
            url +

            # pipe is not allowed in URLs, so use it as a delimiter
            "|" +

            # append the extra cache keys in kwargs as a canonical JSON string
            json.dumps(
                kwargs,
                ensure_ascii=True,
                encoding="ascii",
                indent=None,
                separators=(",", ":"),
                sort_keys=True,
            )
        )

    @classmethod
    def add_placeholder(cls, url, **kwargs):
        """Mark *url* as enqueued for scraping, clearing any old error."""
        rowkey = cls._rowkey(url, **kwargs)
        cls._set_values(rowkey, {
            "state": "enqueued",
            "error": "",
            "last_modified": datetime.utcnow(),
        })

    @classmethod
    def add(cls, url, media, **kwargs):
        """Store a successfully scraped Media tuple for *url*.

        Starts from _defaults so a re-scrape that lost its thumbnail or
        media object overwrites any stale columns from a previous scrape.
        """
        rowkey = cls._rowkey(url, **kwargs)
        columns = cls._defaults.copy()

        columns.update({
            "state": "processed",
            "error": "",
            "last_modified": datetime.utcnow(),
        })

        if media.thumbnail_url and media.thumbnail_size:
            columns.update({
                "thumbnail_url": media.thumbnail_url,
                "thumbnail_width": media.thumbnail_size[0],
                "thumbnail_height": media.thumbnail_size[1],
            })

        if media.media_object:
            columns["media_object"] = json.dumps(media.media_object)

        if media.secure_media_object:
            columns["secure_media_object"] = json.dumps(
                media.secure_media_object)

        cls._set_values(rowkey, columns)

    @classmethod
    def add_error(cls, url, error, **kwargs):
        """Record that scraping *url* failed with message *error*."""
        rowkey = cls._rowkey(url, **kwargs)
        columns = {
            "error": error,
            "state": "processed",
            "last_modified": datetime.utcnow(),
        }
        cls._set_values(rowkey, columns)

    @classmethod
    def get(cls, url, max_cache_age=None, **kwargs):
        """Return the cache entry for *url*, or None if absent or too old.

        max_cache_age -- presumably a timedelta compared against the row's
            last_modified timestamp (TODO confirm callers' type).
        """
        rowkey = cls._rowkey(url, **kwargs)
        try:
            entry = cls._byID(rowkey)
        except tdb_cassandra.NotFound:
            return None

        # Return None if this cache entry is too old.
        # BUG FIX: this previously called datetime.datetime.utcnow(), which
        # raises AttributeError because this module does
        # `from datetime import datetime` -- any call with max_cache_age
        # set would have crashed instead of expiring the entry.
        if (max_cache_age is not None and
                datetime.utcnow() - entry.last_modified > max_cache_age):
            return None

        return entry

    @property
    def media(self):
        """Reconstruct the cached Media value.

        Returns None while the entry is still enqueued, ERROR_MEDIA when
        the scrape failed, and a populated Media namedtuple otherwise.
        """
        if self.state != "processed":
            return None
        if self.error:
            return ERROR_MEDIA

        media_object = secure_media_object = None
        thumbnail_url = thumbnail_size = None

        if (self.thumbnail_width and self.thumbnail_height and
                self.thumbnail_url):
            thumbnail_url = self.thumbnail_url
            thumbnail_size = (self.thumbnail_width, self.thumbnail_height)

        if self.media_object:
            media_object = json.loads(self.media_object)

        if self.secure_media_object:
            secure_media_object = json.loads(self.secure_media_object)

        return Media(media_object, secure_media_object,
                     thumbnail_url, thumbnail_size)
|
||||
Reference in New Issue
Block a user