diff --git a/r2/example.ini b/r2/example.ini
index de09834ce..753fd3939 100644
--- a/r2/example.ini
+++ b/r2/example.ini
@@ -91,6 +91,7 @@ max_sr_images = 20
 login_cookie = reddit_session
 domain = localhost
 domain_prefix =
+media_domain = localhost
 default_sr = localhost
 admins =
 sponsors =
diff --git a/r2/r2/config/routing.py b/r2/r2/config/routing.py
index 2fe26f742..eef0fcce7 100644
--- a/r2/r2/config/routing.py
+++ b/r2/r2/config/routing.py
@@ -168,6 +168,8 @@ def make_map(global_conf={}, app_conf={}):
 
     mc('/captcha/:iden', controller='captcha', action='captchaimg')
 
+    mc('/mediaembed/:link', controller="mediaembed", action="mediaembed")
+
     mc('/doquery', controller='query', action='doquery')
 
     mc('/store', controller='redirect', action='redirect',
diff --git a/r2/r2/controllers/__init__.py b/r2/r2/controllers/__init__.py
index 0ce49dc57..fd024f035 100644
--- a/r2/r2/controllers/__init__.py
+++ b/r2/r2/controllers/__init__.py
@@ -45,6 +45,7 @@ from post import PostController
 from toolbar import ToolbarController
 from i18n import I18nController
 from promotecontroller import PromoteController
+from mediaembed import MediaembedController
 from querycontroller import QueryController
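Taken together, these three hunks expose a new endpoint on the media domain, keyed by a link's id36. A minimal sketch of the URL shape this route serves, assuming a hypothetical deployment where media_domain is media.example.com and the link id36 is "c3xoz" (both invented; only the route itself is from the patch):

    # Illustrative sketch, not part of the patch. Shows the URL that the
    # mc('/mediaembed/:link', ...) route above answers; the domain and
    # id36 are invented examples.
    def media_embed_url(media_domain, id36):
        return "http://%s/mediaembed/%s" % (media_domain, id36)

    print media_embed_url("media.example.com", "c3xoz")
    # http://media.example.com/mediaembed/c3xoz

The controller registered here is defined in the new file below.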
diff --git a/r2/r2/controllers/mediaembed.py b/r2/r2/controllers/mediaembed.py
new file mode 100644
index 000000000..83e80bf46
--- /dev/null
+++ b/r2/r2/controllers/mediaembed.py
@@ -0,0 +1,56 @@
+# The contents of this file are subject to the Common Public Attribution
+# License Version 1.0. (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+# License Version 1.1, but Sections 14 and 15 have been added to cover use of
+# software over a computer network and provide for limited attribution for the
+# Original Developer. In addition, Exhibit A has been modified to be consistent
+# with Exhibit B.
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+# the specific language governing rights and limitations under the License.
+#
+# The Original Code is Reddit.
+#
+# The Original Developer is the Initial Developer. The Initial Developer of the
+# Original Code is CondeNet, Inc.
+#
+# All portions of the code written by CondeNet are Copyright (c) 2006-2009
+# CondeNet, Inc. All Rights Reserved.
+################################################################################
+from validator import *
+from reddit_base import RedditController
+
+from r2.lib.scraper import scrapers
+from r2.lib.pages import MediaEmbedBody
+
+from pylons import g, request
+
+class MediaembedController(RedditController):
+    @validate(link = VLink('link'))
+    def GET_mediaembed(self, link):
+        if request.host != g.media_domain:
+            # don't serve up untrusted content except on our
+            # specifically untrusted domain
+            return self.abort404()
+
+        if not link or not link.media_object:
+            return self.abort404()
+
+        if isinstance(link.media_object, basestring):
+            # it's an old-style media object: a raw HTML string
+            content = link.media_object
+
+        elif isinstance(link.media_object, dict):
+            # otherwise it's the new style: a dict(type=type, **args)
+            media_object_type = link.media_object['type']
+            scraper = scrapers[media_object_type]
+            media_embed = scraper.media_embed(**link.media_object)
+            content = media_embed.content
+
+        else:
+            # unknown media-object format
+            return self.abort404()
+
+        return MediaEmbedBody(body = content).render()
diff --git a/r2/r2/controllers/validator/validator.py b/r2/r2/controllers/validator/validator.py
index 58cea9fd7..93c9ee9ba 100644
--- a/r2/r2/controllers/validator/validator.py
+++ b/r2/r2/controllers/validator/validator.py
@@ -865,7 +865,8 @@ class VCnameDomain(Validator):
     def run(self, domain):
         if (domain
             and (not self.domain_re.match(domain)
-                 or domain.endswith('.reddit.com')
+                 or domain.endswith('.' + g.domain)
+                 or domain.endswith('.' + g.media_domain)
                  or len(domain) > 300)):
             self.set_error(errors.BAD_CNAME)
         elif domain:
diff --git a/r2/r2/lib/app_globals.py b/r2/r2/lib/app_globals.py
index 2c01d290a..4583bee85 100644
--- a/r2/r2/lib/app_globals.py
+++ b/r2/r2/lib/app_globals.py
@@ -174,6 +174,11 @@ class Globals(object):
         if self.debug:
             self.log.setLevel(logging.DEBUG)
 
+        if not self.media_domain:
+            self.media_domain = self.domain
+        if self.media_domain == self.domain:
+            print "Warning: g.media_domain == g.domain. This may give untrusted content access to user cookies"
+
         #read in our CSS so that it can become a default for subreddit
         #stylesheets
         stylesheet_path = os.path.join(paths.get('static_files'),
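The Globals fallback above makes media_domain optional: if it is unset, it collapses to the main domain and the cookie isolation is lost, hence the warning. A hedged sketch of the intended two-domain setup, with invented hostnames (only media_domain = localhost in example.ini comes from the patch):

    # Illustrative sketch, not part of the patch. Cookie isolation relies on
    # the two hostnames sharing no cookie scope:
    #
    #   domain       = example.com       # trusted: pages, reddit_session cookie
    #   media_domain = examplemedia.com  # untrusted: only /mediaembed iframes
    #
    # The fallback logic mirrors Globals.__init__ above:
    domain, media_domain = "example.com", ""
    if not media_domain:
        media_domain = domain  # embeds now share the cookie origin...
    if media_domain == domain:
        print "Warning: g.media_domain == g.domain."  # ...hence the warning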
diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py
index 5420ed4b3..8b108c890 100644
--- a/r2/r2/lib/pages/pages.py
+++ b/r2/r2/lib/pages/pages.py
@@ -42,6 +42,7 @@ from r2.lib.utils import title_to_url, query_string, UrlParser, to_js, vote_hash
 from r2.lib.utils import link_duplicates
 from r2.lib.template_helpers import add_sr, get_domain
 from r2.lib.subreddit_search import popular_searches
+from r2.lib.scraper import scrapers
 
 import sys, random, datetime, locale, calendar, simplejson, re
 import graph
@@ -1482,9 +1483,25 @@ class LinkChild(object):
         return ''
 
 class MediaChild(LinkChild):
+    """renders when the user hits the expando button to expand media
+       objects, like embedded videos"""
     css_style = "video"
     def content(self):
-        return self.link.media_object
+        if isinstance(self.link.media_object, basestring):
+            return self.link.media_object
+
+        media_object_type = self.link.media_object['type']
+        if media_object_type in scrapers:
+            scraper = scrapers[media_object_type]
+            media_embed = scraper.media_embed(**self.link.media_object)
+            return MediaEmbed(media_domain = g.media_domain,
+                              height = media_embed.height + 10,
+                              width = media_embed.width + 10,
+                              id36 = self.link._id36).render()
+
+class MediaEmbed(Templated):
+    """The actual rendered iframe for a media child"""
+    pass
 
 class SelfTextChild(LinkChild):
     css_style = "selftext"
@@ -1494,10 +1511,6 @@ class SelfTextChild(LinkChild):
                      nofollow = self.nofollow)
         return u.render()
 
-class SelfText(Templated):
-    def __init__(self, link):
-        Templated.__init__(self, link = link)
-
 class UserText(CachedTemplate):
     def __init__(self, item,
@@ -1531,6 +1544,10 @@ class UserText(CachedTemplate):
                                 cloneable = cloneable,
                                 css_class = css_class)
 
+class MediaEmbedBody(CachedTemplate):
+    """What's rendered inside the iframe that contains media objects"""
+    pass
+
 class Traffic(Templated):
     @staticmethod
     def slice_traffic(traffic, *indices):
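MediaChild above is the read side of a storage-format change: old links keep raw embed HTML on link.media_object, new links keep a dict naming a scraper type plus its arguments. A hedged sketch of that dispatch, with an invented dict (the real keys come from MediaScraper.media_object in the next file):

    # Illustrative sketch, not part of the patch.
    old_style = '<object>...</object>'        # legacy: raw embed HTML, used as-is
    new_style = {'type': 'youtube.com',       # 'type' keys into the scrapers dict
                 'video_id': 'Yu_moia-oVI'}   # invented example id

    def render_media_object(media_object, scrapers):
        if isinstance(media_object, basestring):
            return media_object               # old style
        scraper = scrapers[media_object['type']]
        return scraper.media_embed(**media_object).content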
diff --git a/r2/r2/lib/scraper.py b/r2/r2/lib/scraper.py
index a2ddd1a23..a89b392fa 100644
--- a/r2/r2/lib/scraper.py
+++ b/r2/r2/lib/scraper.py
@@ -151,6 +151,16 @@ def fetch_url(url, referer = None, retries = 1, dimension = False):
 def fetch_size(url, referer = None, retries = 1):
     return fetch_url(url, referer, retries, dimension = True)
 
+class MediaEmbed(object):
+    width = None
+    height = None
+    content = None
+
+    def __init__(self, height, width, content):
+        self.height = height
+        self.width = width
+        self.content = content
+
 class Scraper:
     def __init__(self, url):
         self.url = url
@@ -158,6 +168,9 @@ class Scraper:
         self.content_type = None
         self.soup = None
 
+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.url)
+
     def download(self):
         self.content_type, self.content = fetch_url(self.url)
         if self.content_type and 'html' in self.content_type and self.content:
@@ -184,6 +197,12 @@ class Scraper:
         max_area = 0
         max_url = None
 
+        if self.soup:
+            thumbnail_spec = self.soup.find('link', rel = 'image_src')
+            if thumbnail_spec and thumbnail_spec['href']:
+                log.debug("Using image_src")
+                return thumbnail_spec['href']
+
         for image_url in self.image_urls():
             size = fetch_size(image_url, referer = self.url)
             if not size:
@@ -228,28 +247,60 @@ class Scraper:
         return image
 
     def media_object(self):
-        return None
+        for deepscraper in deepscrapers:
+            ds = deepscraper()
+            found = ds.find_media_object(self)
+            if found:
+                return found
+
+    @classmethod
+    def media_embed(cls):
+        raise NotImplementedError
 
 class MediaScraper(Scraper):
     media_template = ""
     thumbnail_template = ""
+    video_id = None
     video_id_rx = None
-
+
     def __init__(self, url):
-        m = self.video_id_rx.match(url)
-        if m:
-            self.video_id = m.groups()[0]
-        else:
-            #if we can't find the id just treat it like a normal page
-            log.debug('reverting to regular scraper: %s' % url)
-            self.__class__ = Scraper
         Scraper.__init__(self, url)
+
+        # first try the simple regex against the URL. If that fails,
+        # see if the MediaScraper subclass has its own extraction
+        # function
+        if self.video_id_rx:
+            m = self.video_id_rx.match(url)
+            if m:
+                self.video_id = m.groups()[0]
+        if not self.video_id:
+            video_id = self.video_id_extract()
+            if video_id:
+                self.video_id = video_id
+        if not self.video_id:
+            #if we still can't find the id just treat it like a normal page
+            log.debug('reverting to regular scraper: %s' % url)
+            self.__class__ = Scraper
+
+    def video_id_extract(self):
+        return None
 
     def largest_image_url(self):
-        return self.thumbnail_template.replace('$video_id', self.video_id)
+        if self.thumbnail_template:
+            return self.thumbnail_template.replace('$video_id', self.video_id)
+        else:
+            return Scraper.largest_image_url(self)
 
     def media_object(self):
-        return self.media_template.replace('$video_id', self.video_id)
+        return dict(video_id = self.video_id,
+                    type = self.domains[0])
+
+    @classmethod
+    def media_embed(cls, video_id = None, height = None, width = None, **kw):
+        content = cls.media_template.replace('$video_id', video_id)
+        return MediaEmbed(height = height or cls.height,
+                          width = width or cls.width,
+                          content = content)
 
 def youtube_in_google(google_url):
     h = Scraper(google_url)
@@ -276,17 +327,20 @@ def make_scraper(url):
             return make_scraper(youtube_url)
     return scraper(url)
 
-
 ########## site-specific video scrapers ##########
-#Youtube
 class YoutubeScraper(MediaScraper):
-    media_template = ''
+    domains = ['youtube.com']
+    height = 295
+    width = 480
+    media_template = ''
     thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
     video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')
 
-#Metacage
 class MetacafeScraper(MediaScraper):
+    domains = ['metacafe.com']
+    height = 345
+    width = 400
     media_template = ''
     video_id_rx = re.compile('.*/watch/([^/]+)/.*')
 
@@ -296,20 +350,16 @@ class MetacafeScraper(MediaScraper):
     def media_object(self):
         if not self.soup:
             self.download()
 
         if self.soup:
             video_url = self.soup.find('link', rel = 'video_src')['href']
-            return self.media_template.replace('$video_id', video_url)
+            return dict(video_id = video_url,
+                        type = self.domains[0])
 
-    def largest_image_url(self):
-        if not self.soup:
-            self.download()
-
-        if self.soup:
-            return self.soup.find('link', rel = 'image_src')['href']
-
-#Google Video
-gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
 class GootubeScraper(MediaScraper):
+    domains = ['video.google.com']
+    height = 326
+    width = 400
     media_template = ''
-    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
+    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
+    gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
 
     def largest_image_url(self):
         if not self.content:
@@ -318,40 +368,353 @@ class GootubeScraper(MediaScraper):
             self.download()
 
         if not self.content:
             return None
 
-        m = gootube_thumb_rx.match(self.content)
+        m = self.gootube_thumb_rx.match(self.content)
         if m:
             image_url = m.groups()[0]
             image_url = utils.safe_eval_str(image_url)
             return image_url
 
-scrapers = {'youtube.com': YoutubeScraper,
-            'video.google.com': GootubeScraper,
-            'metacafe.com': MetacafeScraper}
+class VimeoScraper(MediaScraper):
+    domains = ['vimeo.com']
+    height = 448
+    width = 520
+    media_template = ''
+    video_id_rx = re.compile('.*/(.*)')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url = self.soup.find('link', rel = 'video_src')['href']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class BreakScraper(MediaScraper):
+    domains = ['break.com']
+    height = 421
+    width = 520
+    media_template = ''
+    video_id_rx = re.compile('.*/index/([^/]+).*')
+
+    def video_id_extract(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_src = self.soup.find('link', rel = 'video_src')
+            if video_src and video_src['href']:
+                return video_src['href']
+
+class TheOnionScraper(MediaScraper):
+    domains = ['theonion.com']
+    height = 430
+    width = 480
+    media_template = """"""
+    video_id_rx = re.compile('.*/video/([^/?#]+).*')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url = self.soup.find('meta', attrs={'name': 'nid'})['content']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class CollegeHumorScraper(MediaScraper):
+    domains = ['collegehumor.com']
+    height = 390
+    width = 520
+    media_template = ''
+    video_id_rx = re.compile('.*video:(\d+).*')
+
+class FunnyOrDieScraper(MediaScraper):
+    domains = ['funnyordie.com']
+    height = 438
+    width = 464
+    media_template = ''
+    thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
+    video_id_rx = re.compile('.*/videos/([^/]+)/.*')
+
+class ComedyCentralScraper(MediaScraper):
+    domains = ['comedycentral.com', 'thedailyshow.com']
+    height = 316
+    width = 332
+    media_template = ''
+    video_id_rx = re.compile('.*videoId=(\d+).*')
+
+class ColbertNationScraper(ComedyCentralScraper):
+    domains = ['colbertnation.com']
+    video_id_rx = re.compile('.*videos/(\d+)/.*')
+
+class LiveLeakScraper(MediaScraper):
+    domains = ['liveleak.com']
+    height = 370
+    width = 450
+    media_template = ''
+    video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')
+
+    def largest_image_url(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            return self.soup.find('link', rel = 'videothumbnail')['href']
+
+class DailyMotionScraper(MediaScraper):
+    domains = ['dailymotion.com']
+    height = 381
+    width = 480
+    media_template = ''
+    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url = self.soup.find('link', rel = 'video_src')['href']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class RevverScraper(MediaScraper):
+    domains = ['revver.com']
+    height = 392
+    width = 480
+    media_template = ''
+    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
+
+class EscapistScraper(MediaScraper):
+    domains = ['escapistmagazine.com']
+    height = 294
+    width = 480
+    media_template = """"""
+    video_id_rx = re.compile('.*/videos/view/[A-Za-z0-9-]+/([0-9]+).*')
+class JustintvScraper(MediaScraper):
+    """Can grab streams from justin.tv, but not clips"""
+    domains = ['justin.tv']
+    height = 295
+    width = 353
+    stream_media_template = """"""
+    video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
+
+    @classmethod
+    def media_embed(cls, video_id, **kw):
+        content = cls.stream_media_template.replace('$video_id', video_id)
+        return MediaEmbed(height = cls.height,
+                          width = cls.width,
+                          content = content)
+
+class SoundcloudScraper(MediaScraper):
+    """soundcloud.com"""
+    domains = ['soundcloud.com']
+    height = 81
+    width = 400
+    media_template = """"""
+    video_id_rx = re.compile('^http://(?:www\.)?soundcloud\.com/([a-zA-Z0-9_/-]+)')
+
+# every scraper registers itself under each domain it handles; these keys
+# are also what media_object dicts store in their 'type' field
+scrapers = {}
+for scraper_cls in [ YoutubeScraper,
+                     MetacafeScraper,
+                     GootubeScraper,
+                     VimeoScraper,
+                     BreakScraper,
+                     TheOnionScraper,
+                     CollegeHumorScraper,
+                     FunnyOrDieScraper,
+                     ComedyCentralScraper,
+                     ColbertNationScraper,
+                     LiveLeakScraper,
+                     DailyMotionScraper,
+                     RevverScraper,
+                     EscapistScraper,
+                     JustintvScraper,
+                     SoundcloudScraper ]:
+    for domain in scraper_cls.domains:
+        scrapers[domain] = scraper_cls
+
+class DeepScraper(object):
+    """Subclasses of DeepScraper attempt to dive into generic pages
+       to find media objects"""
+
+    def find_media_object(self, scraper):
+        return None
+
+class YoutubeEmbedDeepScraper(DeepScraper):
+    youtube_url_re = re.compile('^http://www.youtube.com/v/([_a-zA-Z0-9-]+).*')
+
+    def find_media_object(self, scraper):
+        # try to find very simple youtube embeds
+        if not scraper.soup:
+            scraper.download()
+
+        if scraper.soup:
+            movie_embed = scraper.soup.find('embed',
+                                            attrs = {'src': self.youtube_url_re})
+            if movie_embed:
+                youtube_id = self.youtube_url_re.match(movie_embed['src']).group(1)
+                return dict(video_id = youtube_id,
+                            type = 'youtube.com')
+
+deepscrapers = [ YoutubeEmbedDeepScraper ]
+
+def test(urls):
+    """Take some example URLs and print an HTML table of the thumbnails
+       and media objects the scrapers extract from them"""
+    from r2.lib.filters import websafe
+
+    print "<html><body><table border=\"1\">"
+
+    for url in urls:
+        h = make_scraper(url)
+
+        print "<tr>"
+
+        print "<td>", websafe(url), "</td>"
+        print "<td>"
+        print websafe(repr(h))
+        img = h.largest_image_url()
+        if img:
+            print "<td><img src=\"%s\" /></td>" % img
+        else:
+            print "<td>(no image)</td>"
+        mo = h.media_object()
+        print "<td>"
+        if mo:
+            s = scrapers[mo['type']]
+            print websafe(repr(mo))
+            print "<br />"
+            print s.media_embed(**mo).content
+        else:
+            print "None"
+        print "</td>"
+
+        print "</tr>"
+
+    print "</table></body></html>"
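End to end, the pipeline this patch introduces is: scrape once at submit time, store the small dict on the link, and re-render the embed on the untrusted domain at view time. A hedged round-trip sketch (the URL is an invented example, and it assumes the r2 environment is importable):

    # Illustrative sketch, not part of the patch.
    from r2.lib.scraper import make_scraper, scrapers

    url = 'http://www.youtube.com/watch?v=Yu_moia-oVI'   # invented example
    s = make_scraper(url)            # -> YoutubeScraper(url)
    mo = s.media_object()            # -> {'video_id': 'Yu_moia-oVI',
                                     #     'type': 'youtube.com'}; stored on the Link

    embed = scrapers[mo['type']].media_embed(**mo)
    print embed.width, embed.height  # the iframe gets +10px in MediaChild.content
    print embed.content              # HTML served at /mediaembed/<id36>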