diff --git a/r2/r2/lib/jsontemplates.py b/r2/r2/lib/jsontemplates.py index 190532118..404b516ab 100755 --- a/r2/r2/lib/jsontemplates.py +++ b/r2/r2/lib/jsontemplates.py @@ -402,6 +402,8 @@ class LinkJsonTemplate(ThingJsonTemplate): permalink="permalink", saved="saved", score="score", + secure_media="secure_media_object", + secure_media_embed="secure_media_embed", selftext="selftext", selftext_html="selftext_html", stickied="stickied", @@ -415,16 +417,18 @@ class LinkJsonTemplate(ThingJsonTemplate): def thing_attr(self, thing, attr): from r2.lib.media import get_media_embed - if attr == "media_embed": - if (thing.media_object and - not isinstance(thing.media_object, basestring)): - media_embed = get_media_embed(thing.media_object) - if media_embed: - return dict(scrolling = media_embed.scrolling, - width = media_embed.width, - height = media_embed.height, - content = media_embed.content) - return dict() + if attr in ("media_embed", "secure_media_embed"): + media_object = getattr(thing, attr.replace("_embed", "_object")) + if media_object and not isinstance(media_object, basestring): + media_embed = get_media_embed(media_object) + if media_embed: + return { + "scrolling": media_embed.scrolling, + "width": media_embed.width, + "height": media_embed.height, + "content": media_embed.content, + } + return {} elif attr == "editted" and not isinstance(thing.editted, bool): return (time.mktime(thing.editted.astimezone(pytz.UTC).timetuple()) - time.timezone) diff --git a/r2/r2/lib/media.py b/r2/r2/lib/media.py index a4628bfec..7e0bc3c11 100644 --- a/r2/r2/lib/media.py +++ b/r2/r2/lib/media.py @@ -21,7 +21,6 @@ ############################################################################### import base64 -import collections import cStringIO import hashlib import json @@ -54,6 +53,15 @@ s3_direct_url = "s3.amazonaws.com" MEDIA_FILENAME_LENGTH = 12 thumbnail_size = 70, 70 +# TODO: replace this with data from the embedly service api when available +_SECURE_SERVICES = [ + "youtube", + "vimeo", + "soundcloud", + "wistia", + "slideshare", +] + def _image_to_str(image): s = cStringIO.StringIO() @@ -246,20 +254,6 @@ def upload_media(image, never_expire=True, file_type='.jpg'): return url -def update_link(link, thumbnail, media_object, thumbnail_size=None): - """Sets the link's has_thumbnail and media_object attributes iin the - database.""" - if thumbnail: - link.thumbnail_url = thumbnail - link.thumbnail_size = thumbnail_size - g.log.debug("Updated link with thumbnail: %s" % link.thumbnail_url) - - if media_object: - link.media_object = media_object - - link._commit() - - def _set_media(embedly_services, link, force=False): if link.is_self: return @@ -269,7 +263,7 @@ def _set_media(embedly_services, link, force=False): return scraper = Scraper.for_url(embedly_services, link.url) - thumbnail, media_object = scraper.scrape() + thumbnail, media_object, secure_media_object = scraper.scrape() if media_object: # the scraper should be able to make a media embed out of the @@ -281,16 +275,32 @@ def _set_media(embedly_services, link, force=False): print "%s made a bad media obj for link %s" % (scraper, link._id36) media_object = None - thumbnail_url = upload_media(thumbnail) if thumbnail else None - thumbnail_size = thumbnail.size if thumbnail else None + if secure_media_object: + res = scraper.media_embed(secure_media_object) + + if not res: + print "%s made a bad secure media obj for link %s" % (scraper, + link._id36) + secure_media_object = None + + if thumbnail: + link.thumbnail_url = upload_media(thumbnail) + link.thumbnail_size = thumbnail.size + + link.media_object = media_object + link.secure_media_object = secure_media_object + link._commit() - update_link(link, thumbnail_url, media_object, thumbnail_size=thumbnail_size) def force_thumbnail(link, image_data, never_expire=True, file_type=".jpg"): image = str_to_image(image_data) image = _prepare_image(image) thumb_url = upload_media(image, never_expire=never_expire, file_type=file_type) - update_link(link, thumbnail=thumb_url, media_object=None, thumbnail_size=image.size) + + link.thumbnail_url = thumb_url + link.thumbnail_size = image.size + link._commit() + def upload_icon(file_name, image_data, size): assert g.media_store == 's3' @@ -357,15 +367,13 @@ def _make_thumbnail_from_url(thumbnail_url, referer): class Scraper(object): @classmethod def for_url(cls, embedly_services, url): - url_domain = domain(url) - domain_embedly_regex = embedly_services.get(url_domain, None) - - if domain_embedly_regex and re.match(domain_embedly_regex, url): - return _EmbedlyScraper(url) + for service_re, service_secure in embedly_services: + if service_re.match(url): + return _EmbedlyScraper(url, service_secure) return _ThumbnailOnlyScraper(url) def scrape(self): - # should return a 2-tuple of: thumbnail, media_object + # should return a 3-tuple of: thumbnail, media_object, secure_media_obj raise NotImplementedError @classmethod @@ -381,7 +389,7 @@ class _ThumbnailOnlyScraper(Scraper): def scrape(self): thumbnail_url = self._find_thumbnail_image() thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url) - return thumbnail, None + return thumbnail, None, None def _extract_image_urls(self, soup): for img in soup.findAll("img", src=True): @@ -446,8 +454,9 @@ class _ThumbnailOnlyScraper(Scraper): class _EmbedlyScraper(Scraper): EMBEDLY_API_URL = "https://api.embed.ly/1/oembed" - def __init__(self, url): + def __init__(self, url, can_embed_securely): self.url = url + self.can_embed_securely = can_embed_securely @classmethod def _utf8_encode(cls, input): @@ -463,18 +472,29 @@ class _EmbedlyScraper(Scraper): else: return input - def scrape(self): + def _fetch_from_embedly(self, secure): params = urllib.urlencode({ "url": self.url, "format": "json", "maxwidth": 600, "key": g.embedly_api_key, + "secure": "true" if secure else "false", }) content = requests.get(self.EMBEDLY_API_URL + "?" + params).content - oembed = json.loads(content, object_hook=self._utf8_encode) + return json.loads(content, object_hook=self._utf8_encode) + def _make_media_object(self, oembed): + if oembed.get("type") in ("video", "rich"): + return { + "type": domain(self.url), + "oembed": oembed, + } + return None + + def scrape(self): + oembed = self._fetch_from_embedly(secure=False) if not oembed: - return None, None + return None, None, None if oembed.get("type") == "photo": thumbnail_url = oembed.get("url") @@ -482,14 +502,15 @@ class _EmbedlyScraper(Scraper): thumbnail_url = oembed.get("thumbnail_url") thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url) - embed = {} - if oembed.get("type") in ("video", "rich"): - embed = { - "type": domain(self.url), - "oembed": oembed, - } + secure_oembed = {} + if self.can_embed_securely: + secure_oembed = self._fetch_from_embedly(secure=True) - return thumbnail, embed + return ( + thumbnail, + self._make_media_object(oembed), + self._make_media_object(secure_oembed), + ) @classmethod def media_embed(cls, media_object): @@ -508,17 +529,21 @@ class _EmbedlyScraper(Scraper): ) -@memoize("media.embedly_services", time=3600) +@memoize("media.embedly_services2", time=3600) +def _fetch_embedly_service_data(): + return requests.get("https://api.embed.ly/1/services/python").json + + def _fetch_embedly_services(): - service_data = requests.get("https://api.embed.ly/1/services/python").json + service_data = _fetch_embedly_service_data() - patterns_by_domain = collections.defaultdict(set) + services = [] for service in service_data: - for domain in [service["domain"]] + service["subdomains"]: - patterns_by_domain[domain].update(service["regex"]) - - return {domain: "(?:%s)" % "|".join(patterns) - for domain, patterns in patterns_by_domain.iteritems()} + services.append(( + re.compile("(?:%s)" % "|".join(service["regex"])), + service["name"] in _SECURE_SERVICES, + )) + return services def run(): diff --git a/r2/r2/models/link.py b/r2/r2/models/link.py index 2e88b6646..56c4d8254 100755 --- a/r2/r2/models/link.py +++ b/r2/r2/models/link.py @@ -71,6 +71,7 @@ class Link(Thing, Printable): moderator_banned=False, banned_before_moderator=False, media_object=None, + secure_media_object=None, promoted=None, pending=False, disable_comments=False,