* add some new scrapers

* start showing embedded videos inside of an <iframe>. This allows us to embed videos from external sites without having to trust their DOM security. Note that the media_object property on new links is now a dictionary, but old ones will still be strings.
2026-01-26 23:39:11 -05:00 · 2009-07-21 14:31:11 -07:00
parent b02ae5a8e1
commit 2cf3490329
12 changed files with 561 additions and 61 deletions
--- a/r2/example.ini
+++ b/r2/example.ini
@@ -91,6 +91,7 @@ max_sr_images = 20
 login_cookie = reddit_session
 domain = localhost
 domain_prefix = 
+media_domain = localhost
 default_sr = localhost
 admins = 
 sponsors = 
--- a/r2/r2/config/routing.py
+++ b/r2/r2/config/routing.py
@@ -168,6 +168,8 @@ def make_map(global_conf={}, app_conf={}):
    
    mc('/captcha/:iden', controller='captcha', action='captchaimg')

+    mc('/mediaembed/:link', controller="mediaembed", action="mediaembed")
+
    mc('/doquery', controller='query', action='doquery')

    mc('/store', controller='redirect', action='redirect',
--- a/r2/r2/controllers/init.py
+++ b/r2/r2/controllers/init.py
@@ -45,6 +45,7 @@ from post import PostController
 from toolbar import ToolbarController
 from i18n import I18nController
 from promotecontroller import PromoteController
+from mediaembed import MediaembedController

 from querycontroller import QueryController

--- a/r2/r2/controllers/mediaembed.py
+++ b/r2/r2/controllers/mediaembed.py
@@ -0,0 +1,52 @@
+# The contents of this file are subject to the Common Public Attribution
+# License Version 1.0. (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+# License Version 1.1, but Sections 14 and 15 have been added to cover use of
+# software over a computer network and provide for limited attribution for the
+# Original Developer. In addition, Exhibit A has been modified to be consistent
+# with Exhibit B.
+# 
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+# the specific language governing rights and limitations under the License.
+# 
+# The Original Code is Reddit.
+# 
+# The Original Developer is the Initial Developer.  The Initial Developer of the
+# Original Code is CondeNet, Inc.
+# 
+# All portions of the code written by CondeNet are Copyright (c) 2006-2009
+# CondeNet, Inc. All Rights Reserved.
+################################################################################
+from validator import *
+from reddit_base import RedditController
+
+from r2.lib.scraper import scrapers
+from r2.lib.pages import MediaEmbedBody
+
+from pylons import request
+
+class MediaembedController(RedditController):
+    """Serves the untrusted embed markup that the media iframe points at."""
+    @validate(link = VLink('link'))
+    def GET_mediaembed(self, link):
+        if request.host != g.media_domain:
+            # don't serve up untrusted content except on our
+            # specifically untrusted domain
+            return self.abort404()
+
+        media_object = link.media_object if link else None
+        if isinstance(media_object, basestring) and media_object:
+            # it's an old-style string
+            content = media_object
+        elif isinstance(media_object, dict) and media_object.get('type') in scrapers:
+            # new style: dict(type=type, **args), rendered by its scraper
+            scraper = scrapers[media_object['type']]
+            content = scraper.media_embed(**media_object).content
+        else:
+            # missing link, empty media object, or a type we have no
+            # scraper for: 404 rather than hit an unbound `content`
+            return self.abort404()
+
+        return MediaEmbedBody(body = content).render()
--- a/r2/r2/controllers/validator/validator.py
+++ b/r2/r2/controllers/validator/validator.py
@@ -865,7 +865,8 @@ class VCnameDomain(Validator):
    def run(self, domain):
        if (domain
            and (not self.domain_re.match(domain)
-                 or domain.endswith('.reddit.com')
+                 or domain.endswith('.' + g.domain)
+                 or domain.endswith('.' + g.media_domain)
                 or len(domain) > 300)):
            self.set_error(errors.BAD_CNAME)
        elif domain:
--- a/r2/r2/lib/app_globals.py
+++ b/r2/r2/lib/app_globals.py
@@ -174,6 +174,11 @@ class Globals(object):
        if self.debug:
            self.log.setLevel(logging.DEBUG)

+        if not self.media_domain:
+            self.media_domain = self.domain
+        if self.media_domain == self.domain:
+            print "Warning: g.media_domain == g.domain. This may give untrusted content access to user cookies"
+
        #read in our CSS so that it can become a default for subreddit
        #stylesheets
        stylesheet_path = os.path.join(paths.get('static_files'),
--- a/r2/r2/lib/pages/pages.py
+++ b/r2/r2/lib/pages/pages.py
@@ -42,6 +42,7 @@ from r2.lib.utils import title_to_url, query_string, UrlParser, to_js, vote_hash
 from r2.lib.utils import link_duplicates
 from r2.lib.template_helpers import add_sr, get_domain
 from r2.lib.subreddit_search import popular_searches
+from r2.lib.scraper import scrapers

 import sys, random, datetime, locale, calendar, simplejson, re
 import graph
@@ -1482,9 +1483,25 @@ class LinkChild(object):
        return ''

 class MediaChild(LinkChild):
+    """renders when the user hits the expando button to expand media
+       objects, like embedded videos"""
    css_style = "video"
    def content(self):
-        return self.link.media_object
+        if isinstance(self.link.media_object, basestring):
+            return self.link.media_object
+
+        media_object_type = self.link.media_object['type']
+        if media_object_type in scrapers:
+            scraper = scrapers[media_object_type]
+            media_embed = scraper.media_embed(**self.link.media_object)
+            return MediaEmbed(media_domain = g.media_domain,
+                              height = media_embed.height+10,
+                              width = media_embed.width+10,
+                              id36 = self.link._id36).render()
+
+class MediaEmbed(Templated):
+    """The actual rendered iframe for a media child"""
+    pass

 class SelfTextChild(LinkChild):
    css_style = "selftext"
@@ -1494,10 +1511,6 @@ class SelfTextChild(LinkChild):
                     nofollow = self.nofollow)
        return u.render()

-class SelfText(Templated):
-    def __init__(self, link):
-        Templated.__init__(self, link = link)
-
 class UserText(CachedTemplate):
    def __init__(self,
                 item,
@@ -1531,6 +1544,10 @@ class UserText(CachedTemplate):
                                cloneable = cloneable,
                                css_class = css_class)

+class MediaEmbedBody(CachedTemplate):
+    """What's rendered inside the iframe that contains media objects"""
+    pass
+
 class Traffic(Templated):
    @staticmethod
    def slice_traffic(traffic, *indices):
--- a/r2/r2/lib/scraper.py
+++ b/r2/r2/lib/scraper.py
@@ -151,6 +151,16 @@ def fetch_url(url, referer = None, retries = 1, dimension = False):
 def fetch_size(url, referer = None, retries = 1):
    return fetch_url(url, referer, retries, dimension = True)

+class MediaEmbed(object):
+    width   = None
+    height  = None
+    content = None
+
+    def __init__(self, height, width, content):
+        self.height  = height
+        self.width   = width
+        self.content = content
+
 class Scraper:
    def __init__(self, url):
        self.url = url
@@ -158,6 +168,9 @@ class Scraper:
        self.content_type = None
        self.soup = None

+    def __repr__(self):
+        return "%s(%r)" % (self.__class__.__name__, self.url)
+
    def download(self):
        self.content_type, self.content = fetch_url(self.url)
        if self.content_type and 'html' in self.content_type and self.content:
@@ -184,6 +197,12 @@ class Scraper:
        max_area = 0
        max_url = None

+        if self.soup:
+            thumbnail_spec = self.soup.find('link', rel = 'image_src')
+            if thumbnail_spec and thumbnail_spec['href']:
+                log.debug("Using image_src")
+                return thumbnail_spec['href']
+
        for image_url in self.image_urls():
            size = fetch_size(image_url, referer = self.url)
            if not size:
@@ -228,28 +247,60 @@ class Scraper:
                return image

    def media_object(self):
-        return None
+        for deepscraper in deepscrapers:
+            ds = deepscraper()
+            found = ds.find_media_object(self)
+            if found:
+                return found
+
+    @classmethod
+    def media_embed(cls):
+        raise NotImplementedError

 class MediaScraper(Scraper):
    media_template = ""
    thumbnail_template = ""
+    video_id = None
    video_id_rx = None
-    
+
    def __init__(self, url):
-        m = self.video_id_rx.match(url)
-        if m:
-            self.video_id = m.groups()[0]
-        else:
-            #if we can't find the id just treat it like a normal page
-            log.debug('reverting to regular scraper: %s' % url)
-            self.__class__ = Scraper
        Scraper.__init__(self, url)

+        # first try the simple regex against the URL. If that fails,
+        # see if the MediaScraper subclass has its own extraction
+        # function
+        if self.video_id_rx:
+            m = self.video_id_rx.match(url)
+            if m:
+                self.video_id = m.groups()[0]
+        if not self.video_id:
+            video_id = self.video_id_extract()
+            if video_id:
+                self.video_id = video_id
+        if not self.video_id:
+            #if we still can't find the id just treat it like a normal page
+            log.debug('reverting to regular scraper: %s' % url)
+            self.__class__ = Scraper
+
+    def video_id_extract(self):
+        return None
+
    def largest_image_url(self):
-        return self.thumbnail_template.replace('$video_id', self.video_id)
+        if self.thumbnail_template:
+            return self.thumbnail_template.replace('$video_id', self.video_id)
+        else:
+            return Scraper.largest_image_url(self)

    def media_object(self):
-        return self.media_template.replace('$video_id', self.video_id)
+        return dict(video_id = self.video_id,
+                    type = self.domains[0])
+
+    @classmethod
+    def media_embed(cls, video_id = None, height = None, width = None, **kw):
+        content = cls.media_template.replace('$video_id', video_id)
+        return MediaEmbed(height = height or cls.height,
+                          width = width or cls.width,
+                          content = content)
    
 def youtube_in_google(google_url):
    h = Scraper(google_url)
@@ -276,17 +327,20 @@ def make_scraper(url):
            return make_scraper(youtube_url)
    return scraper(url)

-
 ########## site-specific video scrapers ##########

-#Youtube
 class YoutubeScraper(MediaScraper):
-    media_template = '<object width="480" height="295"><param name="movie" value="http://www.youtube-nocookie.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube-nocookie.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="480" height="295"></embed></object>'
+    domains = ['youtube.com']
+    height = 295
+    width = 480
+    media_template = '<object width="490" height="295"><param name="movie" value="http://www.youtube.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="480" height="295"></embed></object>'
    thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
    video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')

-#Metacage
 class MetacafeScraper(MediaScraper):
+    domains = ['metacafe.com']
+    height = 345
+    width  = 400
    media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
    video_id_rx = re.compile('.*/watch/([^/]+)/.*')

@@ -296,20 +350,16 @@ class MetacafeScraper(MediaScraper):

        if self.soup:
            video_url =  self.soup.find('link', rel = 'video_src')['href']
-            return self.media_template.replace('$video_id', video_url)
+            return dict(video_id = video_url,
+                        type = self.domains[0])

-    def largest_image_url(self):
-        if not self.soup:
-            self.download()
-
-        if self.soup:
-            return self.soup.find('link', rel = 'image_src')['href']
-
-#Google Video
-gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
 class GootubeScraper(MediaScraper):
+    domains = ['video.google.com']
+    height = 326
+    width  = 400
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
-    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')    
+    video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
+    gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)

    def largest_image_url(self):
        if not self.content:
@@ -318,40 +368,353 @@ class GootubeScraper(MediaScraper):
        if not self.content:
            return None

-        m = gootube_thumb_rx.match(self.content)
+        m = self.gootube_thumb_rx.match(self.content)
        if m:
            image_url = m.groups()[0]
            image_url = utils.safe_eval_str(image_url)
            return image_url

-scrapers = {'youtube.com': YoutubeScraper,
-            'video.google.com': GootubeScraper,
-            'metacafe.com': MetacafeScraper}
+class VimeoScraper(MediaScraper):
+    domains = ['vimeo.com']
+    height = 448
+    width = 520
+    media_template = '<embed src="$video_id" width="520" height="448" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
+    video_id_rx = re.compile('.*/(.*)')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url =  self.soup.find('link', rel = 'video_src')['href']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class BreakScraper(MediaScraper):
+    domains = ['break.com']
+    height = 421
+    width = 520
+    media_template = '<object width="520" height="421"><param name="movie" value="$video_id"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" allowScriptAccess="always" width="520" height="421"></embed></object>'
+    video_id_rx = re.compile('.*/index/([^/]+).*');
+
+    def video_id_extract(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_src = self.soup.find('link', rel = 'video_src')
+            if video_src and video_src['href']:
+                return video_src['href']
+
+class TheOnionScraper(MediaScraper):
+    domains = ['theonion.com']
+    height = 430
+    width = 480
+    media_template = """<object width="480" height="430">
+                          <param name="allowfullscreen" value="true" />
+                          <param name="allowscriptaccess" value="always" />
+                          <param name="movie" value="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf?&amp;videoid=$video_id" />
+                          <param name="wmode" value="transparent" />
+
+                          <embed src="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf"
+                                 width="480" height="430"
+                                 wmode="transparent"
+                                 pluginspage="http://www.macromedia.com/go/getflashplayer"
+                                 type="application/x-shockwave-flash"
+                                 flashvars="videoid=$video_id" >
+                          </embed>
+                        </object>"""
+    video_id_rx = re.compile('.*/video/([^/?#]+).*')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url = self.soup.find('meta', attrs={'name': 'nid'})['content']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class CollegeHumorScraper(MediaScraper):
+    domains = ['collegehumor.com']
+    height = 390
+    width = 520
+    media_template = '<object type="application/x-shockwave-flash" data="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" width="520" height="390" ><param name="allowfullscreen" value="true" /><param name="AllowScriptAccess" value="true" /><param name="movie" quality="best" value="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" /></object>'
+    video_id_rx = re.compile('.*video:(\d+).*');
+
+class FunnyOrDieScraper(MediaScraper):
+    domains = ['funnyordie.com']
+    height = 438
+    width = 464
+    media_template = '<object width="464" height="438" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" id="fodplayer"><param name="movie" value="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac?key=$video_id" /><param name="flashvars" value="key=$video_id&autostart=true&internal=true" /><param name="allowfullscreen" value="true" /><embed width="464" height="438" flashvars="key=$video_id&autostart=true" allowfullscreen="true" quality="high" src="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac" name="fodplayer" type="application/x-shockwave-flash"></embed></object>'
+    thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
+    video_id_rx = re.compile('.*/videos/([^/]+)/.*')
+
+class ComedyCentralScraper(MediaScraper):
+    domains = ['comedycentral.com', 'thedailyshow.com']
+    height = 316
+    width = 332
+    media_template = '<embed FlashVars="videoId=$video_id" src="http://www.comedycentral.com/sitewide/video_player/view/default/swf.jhtml" quality="high" bgcolor="#cccccc" width="332" height="316" name="comedy_central_player" align="middle" allowScriptAccess="always" allownetworking="external" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>'
+    video_id_rx = re.compile('.*videoId=(\d+).*')
+
+class ColbertNationScraper(ComedyCentralScraper):
+    domains = ['colbertnation.com']
+    video_id_rx = re.compile('.*videos/(\d+)/.*')
+
+class LiveLeakScraper(MediaScraper):
+    domains = ['liveleak.com']
+    height = 370
+    width = 450
+    media_template = '<object width="450" height="370"><param name="movie" value="http://www.liveleak.com/e/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.liveleak.com/e/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="450" height="370"></embed></object>'
+    video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')
+
+    def largest_image_url(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            return self.soup.find('link', rel = 'videothumbnail')['href']
+
+class DailyMotionScraper(MediaScraper):
+    domains = ['dailymotion.com']
+    height = 381
+    width = 480
+    media_template = '<object width="480" height="381"><param name="movie" value="$video_id"></param><param name="allowFullScreen" value="true"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" width="480" height="381" allowFullScreen="true" allowScriptAccess="always"></embed></object>'
+    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')
+
+    def media_object(self):
+        if not self.soup:
+            self.download()
+
+        if self.soup:
+            video_url =  self.soup.find('link', rel = 'video_src')['href']
+            return dict(video_id = video_url,
+                        type = self.domains[0])
+
+class RevverScraper(MediaScraper):
+    domains = ['revver.com']
+    height = 392
+    width = 480
+    media_template = '<script src="http://flash.revver.com/player/1.0/player.js?mediaId:$video_id;width:480;height:392;" type="text/javascript"></script>'
+    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
+
+class EscapistScraper(MediaScraper):
+    domains = ['escapistmagazine.com']
+    height = 294
+    width = 480
+    media_template = """<script src="http://www.escapistmagazine.com/videos/embed/$video_id"></script>"""
+    video_id_rx = re.compile('.*/videos/view/[A-Za-z0-9-]+/([0-9]+).*') # slug segment, then the digits captured as video_id
+
+class JustintvScraper(MediaScraper):
+    """Can grab streams from justin.tv, but not clips"""
+    domains = ['justin.tv']
+    height = 295
+    width = 353
+    stream_media_template = """<object type="application/x-shockwave-flash" height="295" width="353" id="jtv_player_flash" data="http://www.justin.tv/widgets/jtv_player.swf?channel=$video_id" bgcolor="#000000"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="allowNetworking" value="all" /><param name="movie" value="http://www.justin.tv/widgets/jtv_player.swf" /><param name="flashvars" value="channel=$video_id&auto_play=false&start_volume=25" /></object>"""
+    video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
+
+    @classmethod
+    def media_embed(cls, video_id, **kw):
+        content = cls.stream_media_template.replace('$video_id', video_id)
+        return MediaEmbed(height = cls.height,
+                          width = cls.width,
+                          content = content)
+
+class SoundcloudScraper(MediaScraper):
+    """Scraper for soundcloud.com track URLs; embeds the Flash player."""
+    domains = ['soundcloud.com']
+    height = 81
+    width  = 400
+    media_template = """<div style="font-size: 11px;">
+                          <object height="81" width="100%">
+                            <param name="movie"
+                                   value="http://player.soundcloud.com/player.swf?track=$video_id">
+                            </param>
+                            <param name="allowscriptaccess" value="always"></param>
+                            <embed allowscriptaccess="always" height="81"
+                                   src="http://player.soundcloud.com/player.swf?track=$video_id"
+                                   type="application/x-shockwave-flash"
+                                   width="100%">
+                            </embed>
+                          </object></div>"""
+    video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
+    
+
+class DeepScraper(object):
+    """Subclasses of DeepScraper attempt to dive into generic pages
+       for embeds of other types (like YouTube videos on blog
+       sites)."""
+
+    def find_media_object(self, scraper):
+        return None
+
+class YoutubeEmbedDeepScraper(DeepScraper):
+    """Finds very simple YouTube <embed> tags (matched by src) in a page."""
+    youtube_url_re = re.compile('^(http://www.youtube.com/v/([_a-zA-Z0-9-]+)).*')
+
+    def find_media_object(self, scraper):
+        if not scraper.soup:
+            scraper.download()
+
+        if scraper.soup:
+            movie_embed = scraper.soup.find('embed',
+                                            attrs={'src': lambda x: x and self.youtube_url_re.match(x)})
+            if movie_embed:
+                youtube_id = self.youtube_url_re.match(movie_embed['src']).group(2)
+                youtube_url = 'http://www.youtube.com/watch?v=%s' % youtube_id
+                log.debug('found youtube embed %s' % youtube_url)
+                mo = YoutubeScraper(youtube_url).media_object()
+                mo['deep'] = scraper.url
+                return mo
+
+#scrapers =:= dict(domain -> ScraperClass)
+scrapers = {}
+for scraper in [ YoutubeScraper,
+                 MetacafeScraper,
+                 GootubeScraper,
+                 VimeoScraper,
+                 BreakScraper,
+                 TheOnionScraper,
+                 CollegeHumorScraper,
+                 FunnyOrDieScraper,
+                 ComedyCentralScraper,
+                 ColbertNationScraper,
+                 LiveLeakScraper,
+                 DailyMotionScraper,
+                 RevverScraper,
+                 EscapistScraper,
+                 JustintvScraper,
+                 SoundcloudScraper,
+                 ]:
+    for domain in scraper.domains:
+        scrapers[domain] = scraper
+
+deepscrapers = [YoutubeEmbedDeepScraper]
+
+def convert_old_media_objects():
+    q = Link._query(Link.c.media_object is not None,
+                    Link.c._date > whenever,
+                    data = True)
+    for link in utils.fetch_things2(q):
+        if not getattr(link, 'media_object', None):
+            continue
+
+        if 'youtube' in link.media_object:
+            # we can rewrite this one without scraping
+            video_id = YoutubeScraper.video_id_rx.match(link.url)
+            link.media_object = dict(type='youtube.com',
+                                     video_id = video_id.group(1))
+        elif ('video.google.com' in link.media_object
+              or 'metacafe' in link.media_object):
+            scraper = make_scraper(link.url)
+            if not scraper:
+                continue
+            mo = scraper.media_object()
+            if not mo:
+                continue
+
+            link.media_object = mo
+
+        else:
+            print "skipping %s because it confuses me" % link._fullname
+            continue
+
+        link._commit()
+
+test_urls = [
+    'http://www.facebook.com/pages/Rick-Astley/5807213510?sid=c99aaf3888171e73668a38e0749ae12d', # regular thumbnail finder
+    'http://www.flickr.com/photos/septuagesima/317819584/', # thumbnail with image_src
+
+    'http://www.youtube.com/watch?v=Yu_moia-oVI',
+    'http://www.metacafe.com/watch/sy-1473689248/rick_astley_never_gonna_give_you_up_official_music_video/',
+    'http://video.google.com/videoplay?docid=5908758151704698048',
+    'http://vimeo.com/4495451',
+    'http://www.break.com/usercontent/2008/11/Macy-s-Thankgiving-Day-Parade-Rick-Roll-611965.html',
+    'http://www.theonion.com/content/video/sony_releases_new_stupid_piece_of',
+    'http://www.collegehumor.com/video:1823712',
+    'http://www.funnyordie.com/videos/7f2a184755/macys-thanksgiving-day-parade-gets-rick-rolled-from-that-happened',
+    'http://www.comedycentral.com/videos/index.jhtml?videoId=178342&title=ultimate-fighting-vs.-bloggers',
+    'http://www.thedailyshow.com/video/index.jhtml?videoId=175244&title=Photoshop-of-Horrors',
+    'http://www.colbertnation.com/the-colbert-report-videos/63549/may-01-2006/sign-off---spam',
+    'http://www.liveleak.com/view?i=e09_1207983531',
+    'http://www.dailymotion.com/relevance/search/rick+roll/video/x5l8e6_rickroll_fun',
+    'http://revver.com/video/1199591/rick-rolld-at-work/',
+    'http://www.escapistmagazine.com/videos/view/zero-punctuation/10-The-Orange-Box',
+    'http://www.escapistmagazine.com/videos/view/unskippable/736-Lost-Odyssey',
+
+    # justin.tv has two media types that we care about, streams, which
+    # we can scrape, and clips, which we can't
+    'http://www.justin.tv/help', # stream
+    'http://www.justin.tv/clip/c07a333f94e5716b', # clip, which we can't currently scrape, and shouldn't try
+
+    'http://soundcloud.com/kalhonaaho01/never-gonna-stand-you-up-rick-astley-vs-ludacris-album-version',
+    'http://listen.grooveshark.com/#/song/Never_Gonna_Give_You_Up/12616328',
+    'http://tinysong.com/2WOJ', # also Grooveshark
+
+    'http://www.rickrolled.com/videos/video/rickrolld' # test the DeepScraper
+    ]
+
+def submit_all():
+    from r2.models import Subreddit, Account, Link, NotFound
+    from r2.lib.media import set_media
+    from r2.lib.db import queries
+    sr = Subreddit._by_name('testmedia')
+    author = Account._by_name('testmedia')
+    links = []
+    for url in test_urls:
+        try:
+            # delete any existing version of the link
+            l = Link._by_url(url, sr)
+            print "Deleting %s" % l
+            l._deleted = True
+            l._commit()
+        except NotFound:
+            pass
+
+        l = Link._submit(url, url, author, sr, '0.0.0.0')
+
+        try:
+            set_media(l)
+        except Exception, e:
+            print e
+
+        if g.write_query_queue:
+            queries.new_link(l)
+
+        links.append(l)
+
+    return links

 def test():
-    #from r2.lib.pool2 import WorkQueue
-    jobs = []
-    f = open('/tmp/testurls.txt')
-    for url in f:
-        if url.startswith('#'):
-            continue
-        if url.startswith('/info'):
-            continue
-        
-        def make_job(url):
-            def fetch(url):
-                print 'START', url
-                url = url.strip()
-                h = make_scraper(url)
-                image_url = h.largest_image_url()
-                print 'DONE', image_url
-            return lambda: fetch(url)
+    """Take some example URLs and print out a nice pretty HTML table
+       of their extracted thumbnails and media objects"""
+    import sys
+    from r2.lib.filters import websafe

-        jobs.append(make_job(url))
+    print "<html><body><table border=\"1\">"
+    for url in test_urls:
+        sys.stderr.write("%s\n" % url)
+        print "<tr>"
+        h = make_scraper(url)
+        print "<td>"
+        print "<b>", websafe(url), "</b>"
+        print "<br />"
+        print websafe(repr(h))
+        img = h.largest_image_url()
+        if img:
+            print "<td><img src=\"%s\" /></td>" % img
+        else:
+            print "<td>(no image)</td>"
+        mo = h.media_object()
+        print "<td>"
+        if mo:
+            s = scrapers[mo['type']]
+            print websafe(repr(mo))
+            print "<br />"
+            print s.media_embed(**mo).content
+        else:
+            print "None"
+        print "</td>"
+        print "</tr>"
+    print "</table></body></html>"

-    print jobs[0]()
-    #wq = WorkQueue(jobs)
-    #wq.start()            
-
-if __name__ == '__main__':
-    test()
--- a/r2/r2/models/link.py
+++ b/r2/r2/models/link.py
@@ -211,6 +211,7 @@ class Link(Thing, Printable):
             s.append(request.get.has_key('twocolumn'))
        elif style == "xml":
            s.append(request.GET.has_key("nothumbs"))
+        s.append(getattr(wrapped, 'media_object', {}))
        return s

    def make_permalink(self, sr, force_domain = False):
--- a/r2/r2/templates/mediaembed.html
+++ b/r2/r2/templates/mediaembed.html
@@ -0,0 +1,24 @@
+## The contents of this file are subject to the Common Public Attribution
+## License Version 1.0. (the "License"); you may not use this file except in
+## compliance with the License. You may obtain a copy of the License at
+## http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+## License Version 1.1, but Sections 14 and 15 have been added to cover use of
+## software over a computer network and provide for limited attribution for the
+## Original Developer. In addition, Exhibit A has been modified to be consistent
+## with Exhibit B.
+## 
+## Software distributed under the License is distributed on an "AS IS" basis,
+## WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+## the specific language governing rights and limitations under the License.
+## 
+## The Original Code is Reddit.
+## 
+## The Original Developer is the Initial Developer.  The Initial Developer of
+## the Original Code is CondeNet, Inc.
+## 
+## All portions of the code written by CondeNet are Copyright (c) 2006-2009
+## CondeNet, Inc. All Rights Reserved.
+################################################################################
+<iframe src="http://${thing.media_domain}/mediaembed/${thing.id36}"
+        width="${thing.width}" height="${thing.height}" border="0"
+        frameBorder="0" scrolling="no"></iframe>
--- a/r2/r2/templates/mediaembedbody.html
+++ b/r2/r2/templates/mediaembedbody.html
@@ -0,0 +1,33 @@
+## The contents of this file are subject to the Common Public Attribution
+## License Version 1.0. (the "License"); you may not use this file except in
+## compliance with the License. You may obtain a copy of the License at
+## http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
+## License Version 1.1, but Sections 14 and 15 have been added to cover use of
+## software over a computer network and provide for limited attribution for the
+## Original Developer. In addition, Exhibit A has been modified to be consistent
+## with Exhibit B.
+## 
+## Software distributed under the License is distributed on an "AS IS" basis,
+## WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
+## the specific language governing rights and limitations under the License.
+## 
+## The Original Code is Reddit.
+## 
+## The Original Developer is the Initial Developer.  The Initial Developer of
+## the Original Code is CondeNet, Inc.
+## 
+## All portions of the code written by CondeNet are Copyright (c) 2006-2009
+## CondeNet, Inc. All Rights Reserved.
+################################################################################
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+  <head>
+    <style type="text/css">
+        body, object, embed, div, span, p {
+            margin:  0;
+            padding: 0;
+        }
+    </style>
+  </head>
+  <body>${unsafe(thing.body)}</body>
+</html>
--- a/r2/setup.py
+++ b/r2/setup.py
@@ -83,7 +83,7 @@ setup(
                      "flup",
                      "simplejson", 
                      "SQLAlchemy==0.5.3",
-                      "BeautifulSoup >= 3",
+                      "BeautifulSoup == 3.0.7a", # last version to use the good parser
                      "cssutils==0.9.5.1",
                      "chardet",
                      "psycopg2",