* add some new scrapers

* start showing embedded videos inside of an <iframe>. This allows us to
  embed videos from external sites without having to trust their DOM security

Note that the media_object property on new links is now a dictionary,
but old ones will still be strings
This commit is contained in:
ketralnis
2009-07-21 14:31:11 -07:00
parent b02ae5a8e1
commit 2cf3490329
12 changed files with 561 additions and 61 deletions

View File

@@ -91,6 +91,7 @@ max_sr_images = 20
login_cookie = reddit_session
domain = localhost
domain_prefix =
media_domain = localhost
default_sr = localhost
admins =
sponsors =

View File

@@ -168,6 +168,8 @@ def make_map(global_conf={}, app_conf={}):
mc('/captcha/:iden', controller='captcha', action='captchaimg')
mc('/mediaembed/:link', controller="mediaembed", action="mediaembed")
mc('/doquery', controller='query', action='doquery')
mc('/store', controller='redirect', action='redirect',

View File

@@ -45,6 +45,7 @@ from post import PostController
from toolbar import ToolbarController
from i18n import I18nController
from promotecontroller import PromoteController
from mediaembed import MediaembedController
from querycontroller import QueryController

View File

@@ -0,0 +1,52 @@
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is Reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of the
# Original Code is CondeNet, Inc.
#
# All portions of the code written by CondeNet are Copyright (c) 2006-2009
# CondeNet, Inc. All Rights Reserved.
################################################################################
from validator import *
from reddit_base import RedditController
from r2.lib.scraper import scrapers
from r2.lib.pages import MediaEmbedBody
from pylons import request
class MediaembedController(RedditController):
@validate(link = VLink('link'))
def GET_mediaembed(self, link):
if request.host != g.media_domain:
# don't serve up untrusted content except on our
# specifically untrusted domain
return self.abort404()
if not link or not link.media_object:
return self.abort404()
if isinstance(link.media_object, basestring):
# it's an old-style string
content = link.media_object
elif isinstance(link.media_object, dict):
# otherwise it's the new style, which is a dict(type=type, **args)
media_object_type = link.media_object['type']
scraper = scrapers[media_object_type]
media_embed = scraper.media_embed(**link.media_object)
content = media_embed.content
return MediaEmbedBody(body = content).render()

View File

@@ -865,7 +865,8 @@ class VCnameDomain(Validator):
def run(self, domain):
if (domain
and (not self.domain_re.match(domain)
or domain.endswith('.reddit.com')
or domain.endswith('.' + g.domain)
or domain.endswith('.' + g.media_domain)
or len(domain) > 300)):
self.set_error(errors.BAD_CNAME)
elif domain:

View File

@@ -174,6 +174,11 @@ class Globals(object):
if self.debug:
self.log.setLevel(logging.DEBUG)
if not self.media_domain:
self.media_domain = self.domain
if self.media_domain == self.domain:
print "Warning: g.media_domain == g.domain. This may give untrusted content access to user cookies"
#read in our CSS so that it can become a default for subreddit
#stylesheets
stylesheet_path = os.path.join(paths.get('static_files'),

View File

@@ -42,6 +42,7 @@ from r2.lib.utils import title_to_url, query_string, UrlParser, to_js, vote_hash
from r2.lib.utils import link_duplicates
from r2.lib.template_helpers import add_sr, get_domain
from r2.lib.subreddit_search import popular_searches
from r2.lib.scraper import scrapers
import sys, random, datetime, locale, calendar, simplejson, re
import graph
@@ -1482,9 +1483,25 @@ class LinkChild(object):
return ''
class MediaChild(LinkChild):
"""renders when the user hits the expando button to expand media
objects, like embedded videos"""
css_style = "video"
def content(self):
return self.link.media_object
if isinstance(self.link.media_object, basestring):
return self.link.media_object
media_object_type = self.link.media_object['type']
if media_object_type in scrapers:
scraper = scrapers[media_object_type]
media_embed = scraper.media_embed(**self.link.media_object)
return MediaEmbed(media_domain = g.media_domain,
height = media_embed.height+10,
width = media_embed.width+10,
id36 = self.link._id36).render()
class MediaEmbed(Templated):
"""The actual rendered iframe for a media child"""
pass
class SelfTextChild(LinkChild):
css_style = "selftext"
@@ -1494,10 +1511,6 @@ class SelfTextChild(LinkChild):
nofollow = self.nofollow)
return u.render()
class SelfText(Templated):
def __init__(self, link):
Templated.__init__(self, link = link)
class UserText(CachedTemplate):
def __init__(self,
item,
@@ -1531,6 +1544,10 @@ class UserText(CachedTemplate):
cloneable = cloneable,
css_class = css_class)
class MediaEmbedBody(CachedTemplate):
"""What's rendered inside the iframe that contains media objects"""
pass
class Traffic(Templated):
@staticmethod
def slice_traffic(traffic, *indices):

View File

@@ -151,6 +151,16 @@ def fetch_url(url, referer = None, retries = 1, dimension = False):
def fetch_size(url, referer = None, retries = 1):
return fetch_url(url, referer, retries, dimension = True)
class MediaEmbed(object):
width = None
height = None
content = None
def __init__(self, height, width, content):
self.height = height
self.width = width
self.content = content
class Scraper:
def __init__(self, url):
self.url = url
@@ -158,6 +168,9 @@ class Scraper:
self.content_type = None
self.soup = None
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.url)
def download(self):
self.content_type, self.content = fetch_url(self.url)
if self.content_type and 'html' in self.content_type and self.content:
@@ -184,6 +197,12 @@ class Scraper:
max_area = 0
max_url = None
if self.soup:
thumbnail_spec = self.soup.find('link', rel = 'image_src')
if thumbnail_spec and thumbnail_spec['href']:
log.debug("Using image_src")
return thumbnail_spec['href']
for image_url in self.image_urls():
size = fetch_size(image_url, referer = self.url)
if not size:
@@ -228,28 +247,60 @@ class Scraper:
return image
def media_object(self):
return None
for deepscraper in deepscrapers:
ds = deepscraper()
found = ds.find_media_object(self)
if found:
return found
@classmethod
def media_embed(cls):
raise NotImplementedError
class MediaScraper(Scraper):
media_template = ""
thumbnail_template = ""
video_id = None
video_id_rx = None
def __init__(self, url):
m = self.video_id_rx.match(url)
if m:
self.video_id = m.groups()[0]
else:
#if we can't find the id just treat it like a normal page
log.debug('reverting to regular scraper: %s' % url)
self.__class__ = Scraper
Scraper.__init__(self, url)
# first try the simple regex against the URL. If that fails,
# see if the MediaScraper subclass has its own extraction
# function
if self.video_id_rx:
m = self.video_id_rx.match(url)
if m:
self.video_id = m.groups()[0]
if not self.video_id:
video_id = self.video_id_extract()
if video_id:
self.video_id = video_id
if not self.video_id:
#if we still can't find the id just treat it like a normal page
log.debug('reverting to regular scraper: %s' % url)
self.__class__ = Scraper
def video_id_extract(self):
return None
def largest_image_url(self):
return self.thumbnail_template.replace('$video_id', self.video_id)
if self.thumbnail_template:
return self.thumbnail_template.replace('$video_id', self.video_id)
else:
return Scraper.largest_image_url(self)
def media_object(self):
return self.media_template.replace('$video_id', self.video_id)
return dict(video_id = self.video_id,
type = self.domains[0])
@classmethod
def media_embed(cls, video_id = None, height = None, width = None, **kw):
content = cls.media_template.replace('$video_id', video_id)
return MediaEmbed(height = height or cls.height,
width = width or cls.width,
content = content)
def youtube_in_google(google_url):
h = Scraper(google_url)
@@ -276,17 +327,20 @@ def make_scraper(url):
return make_scraper(youtube_url)
return scraper(url)
########## site-specific video scrapers ##########
#Youtube
class YoutubeScraper(MediaScraper):
media_template = '<object width="480" height="295"><param name="movie" value="http://www.youtube-nocookie.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube-nocookie.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="480" height="295"></embed></object>'
domains = ['youtube.com']
height = 295
width = 480
media_template = '<object width="490" height="295"><param name="movie" value="http://www.youtube.com/v/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.youtube.com/v/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="480" height="295"></embed></object>'
thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')
#Metacage
class MetacafeScraper(MediaScraper):
domains = ['metacafe.com']
height = 345
width = 400
media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
video_id_rx = re.compile('.*/watch/([^/]+)/.*')
@@ -296,20 +350,16 @@ class MetacafeScraper(MediaScraper):
if self.soup:
video_url = self.soup.find('link', rel = 'video_src')['href']
return self.media_template.replace('$video_id', video_url)
return dict(video_id = video_url,
type = self.domains[0])
def largest_image_url(self):
if not self.soup:
self.download()
if self.soup:
return self.soup.find('link', rel = 'image_src')['href']
#Google Video
gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
class GootubeScraper(MediaScraper):
domains = ['video.google.com']
height = 326
width = 400
media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
def largest_image_url(self):
if not self.content:
@@ -318,40 +368,353 @@ class GootubeScraper(MediaScraper):
if not self.content:
return None
m = gootube_thumb_rx.match(self.content)
m = self.gootube_thumb_rx.match(self.content)
if m:
image_url = m.groups()[0]
image_url = utils.safe_eval_str(image_url)
return image_url
scrapers = {'youtube.com': YoutubeScraper,
'video.google.com': GootubeScraper,
'metacafe.com': MetacafeScraper}
class VimeoScraper(MediaScraper):
domains = ['vimeo.com']
height = 448
width = 520
media_template = '<embed src="$video_id" width="520" height="448" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
video_id_rx = re.compile('.*/(.*)')
def media_object(self):
if not self.soup:
self.download()
if self.soup:
video_url = self.soup.find('link', rel = 'video_src')['href']
return dict(video_id = video_url,
type = self.domains[0])
class BreakScraper(MediaScraper):
domains = ['break.com']
height = 421
width = 520
media_template = '<object width="520" height="421"><param name="movie" value="$video_id"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" allowScriptAccess="always" width="520" height="421"></embed></object>'
video_id_rx = re.compile('.*/index/([^/]+).*');
def video_id_extract(self):
if not self.soup:
self.download()
if self.soup:
video_src = self.soup.find('link', rel = 'video_src')
if video_src and video_src['href']:
return video_src['href']
class TheOnionScraper(MediaScraper):
domains = ['theonion.com']
height = 430
width = 480
media_template = """<object width="480" height="430">
<param name="allowfullscreen" value="true" />
<param name="allowscriptaccess" value="always" />
<param name="movie" value="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf?&amp;videoid=$video_id" />
<param name="wmode" value="transparent" />
<embed src="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf"
width="480" height="430"
wmode="transparent"
pluginspage="http://www.macromedia.com/go/getflashplayer"
type="application/x-shockwave-flash"
flashvars="videoid=$video_id" >
</embed>
</object>"""
video_id_rx = re.compile('.*/video/([^/?#]+).*')
def media_object(self):
if not self.soup:
self.download()
if self.soup:
video_url = self.soup.find('meta', attrs={'name': 'nid'})['content']
return dict(video_id = video_url,
type = self.domains[0])
class CollegeHumorScraper(MediaScraper):
domains = ['collegehumor.com']
height = 390
width = 520
media_template = '<object type="application/x-shockwave-flash" data="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" width="520" height="390" ><param name="allowfullscreen" value="true" /><param name="AllowScriptAccess" value="true" /><param name="movie" quality="best" value="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" /></object>'
video_id_rx = re.compile('.*video:(\d+).*');
class FunnyOrDieScraper(MediaScraper):
domains = ['funnyordie.com']
height = 438
width = 464
media_template = '<object width="464" height="438" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" id="fodplayer"><param name="movie" value="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac?key=$video_id" /><param name="flashvars" value="key=$video_id&autostart=true&internal=true" /><param name="allowfullscreen" value="true" /><embed width="464" height="438" flashvars="key=$video_id&autostart=true" allowfullscreen="true" quality="high" src="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac" name="fodplayer" type="application/x-shockwave-flash"></embed></object>'
thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
video_id_rx = re.compile('.*/videos/([^/]+)/.*')
class ComedyCentralScraper(MediaScraper):
domains = ['comedycentral.com', 'thedailyshow.com']
height = 316
width = 332
media_template = '<embed FlashVars="videoId=$video_id" src="http://www.comedycentral.com/sitewide/video_player/view/default/swf.jhtml" quality="high" bgcolor="#cccccc" width="332" height="316" name="comedy_central_player" align="middle" allowScriptAccess="always" allownetworking="external" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>'
video_id_rx = re.compile('.*videoId=(\d+).*')
class ColbertNationScraper(ComedyCentralScraper):
domains = ['colbertnation.com']
video_id_rx = re.compile('.*videos/(\d+)/.*')
class LiveLeakScraper(MediaScraper):
domains = ['liveleak.com']
height = 370
width = 450
media_template = '<object width="450" height="370"><param name="movie" value="http://www.liveleak.com/e/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.liveleak.com/e/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="450" height="370"></embed></object>'
video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')
def largest_image_url(self):
if not self.soup:
self.download()
if self.soup:
return self.soup.find('link', rel = 'videothumbnail')['href']
class DailyMotionScraper(MediaScraper):
domains = ['dailymotion.com']
height = 381
width = 480
media_template = '<object width="480" height="381"><param name="movie" value="$video_id"></param><param name="allowFullScreen" value="true"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" width="480" height="381" allowFullScreen="true" allowScriptAccess="always"></embed></object>'
video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')
def media_object(self):
if not self.soup:
self.download()
if self.soup:
video_url = self.soup.find('link', rel = 'video_src')['href']
return dict(video_id = video_url,
type = self.domains[0])
class RevverScraper(MediaScraper):
domains = ['revver.com']
height = 392
width = 480
media_template = '<script src="http://flash.revver.com/player/1.0/player.js?mediaId:$video_id;width:480;height:392;" type="text/javascript"></script>'
video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
class EscapistScraper(MediaScraper):
domains = ['escapistmagazine.com']
height = 294
width = 480
media_template = """<script src="http://www.escapistmagazine.com/videos/embed/$video_id"></script>"""
video_id_rx = re.compile('.*/videos/view/[A-Za-z-9-]+/([0-9]+).*')
class JustintvScraper(MediaScraper):
"""Can grab streams from justin.tv, but not clips"""
domains = ['justin.tv']
height = 295
width = 353
stream_media_template = """<object type="application/x-shockwave-flash" height="295" width="353" id="jtv_player_flash" data="http://www.justin.tv/widgets/jtv_player.swf?channel=$video_id" bgcolor="#000000"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="allowNetworking" value="all" /><param name="movie" value="http://www.justin.tv/widgets/jtv_player.swf" /><param name="flashvars" value="channel=$video_id&auto_play=false&start_volume=25" /></object>"""
video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
@classmethod
def media_embed(cls, video_id, **kw):
content = cls.stream_media_template.replace('$video_id', video_id)
return MediaEmbed(height = cls.height,
width = cls.width,
content = content)
class SoundcloudScraper(MediaScraper):
"""soundcloud.com"""
domains = ['soundcloud.com']
height = 81
width = 400
media_template = """<div style="font-size: 11px;">
<object height="81" width="100%">
<param name="movie"
value="http://player.soundcloud.com/player.swf?track=$video_id">
</param>
<param name="allowscriptaccess" value="always"></param>
<embed allowscriptaccess="always" height="81"
src="http://player.soundcloud.com/player.swf?track=$video_id"
type="application/x-shockwave-flash"
width="100%">
</embed>
</object>"""
video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
class DeepScraper(object):
"""Subclasses of DeepScraper attempt to dive into generic pages
for embeds of other types (like YouTube videos on blog
sites)."""
def find_media_object(self, scraper):
return None
class YoutubeEmbedDeepScraper(DeepScraper):
youtube_url_re = re.compile('^(http://www.youtube.com/v/([_a-zA-Z0-9-]+)).*')
def find_media_object(self, scraper):
# try to find very simple youtube embeds
if not scraper.soup:
scraper.download()
if scraper.soup:
movie_embed = scraper.soup.find('embed',
attrs={'src': lambda x: self.youtube_url_re.match(x)})
if movie_embed:
youtube_id = self.youtube_url_re.match(movie_embed['src']).group(2)
youtube_url = 'http://www.youtube.com/watch?v=%s"' % youtube_id
log.debug('found youtube embed %s' % youtube_url)
mo = YoutubeScraper(youtube_url).media_object()
mo['deep'] = scraper.url
return mo
#scrapers =:= dict(domain -> ScraperClass)
scrapers = {}
for scraper in [ YoutubeScraper,
MetacafeScraper,
GootubeScraper,
VimeoScraper,
BreakScraper,
TheOnionScraper,
CollegeHumorScraper,
FunnyOrDieScraper,
ComedyCentralScraper,
ColbertNationScraper,
LiveLeakScraper,
DailyMotionScraper,
RevverScraper,
EscapistScraper,
JustintvScraper,
SoundcloudScraper,
]:
for domain in scraper.domains:
scrapers[domain] = scraper
deepscrapers = [YoutubeEmbedDeepScraper]
def convert_old_media_objects():
q = Link._query(Link.c.media_object is not None,
Link.c._date > whenever,
data = True)
for link in utils.fetch_things2(q):
if not getattr(link, 'media_object', None):
continue
if 'youtube' in link.media_object:
# we can rewrite this one without scraping
video_id = YoutubeScraper.video_id_rx.match(link.url)
link.media_object = dict(type='youtube.com',
video_id = video_id.group(1))
elif ('video.google.com' in link.media_object
or 'metacafe' in link.media_object):
scraper = make_scraper(link.url)
if not scraper:
continue
mo = scraper.media_object()
if not mo:
continue
link.media_object = mo
else:
print "skipping %s because it confuses me" % link._fullname
continue
link._commit()
test_urls = [
'http://www.facebook.com/pages/Rick-Astley/5807213510?sid=c99aaf3888171e73668a38e0749ae12d', # regular thumbnail finder
'http://www.flickr.com/photos/septuagesima/317819584/', # thumbnail with image_src
'http://www.youtube.com/watch?v=Yu_moia-oVI',
'http://www.metacafe.com/watch/sy-1473689248/rick_astley_never_gonna_give_you_up_official_music_video/',
'http://video.google.com/videoplay?docid=5908758151704698048',
'http://vimeo.com/4495451',
'http://www.break.com/usercontent/2008/11/Macy-s-Thankgiving-Day-Parade-Rick-Roll-611965.html',
'http://www.theonion.com/content/video/sony_releases_new_stupid_piece_of',
'http://www.collegehumor.com/video:1823712',
'http://www.funnyordie.com/videos/7f2a184755/macys-thanksgiving-day-parade-gets-rick-rolled-from-that-happened',
'http://www.comedycentral.com/videos/index.jhtml?videoId=178342&title=ultimate-fighting-vs.-bloggers',
'http://www.thedailyshow.com/video/index.jhtml?videoId=175244&title=Photoshop-of-Horrors',
'http://www.colbertnation.com/the-colbert-report-videos/63549/may-01-2006/sign-off---spam',
'http://www.liveleak.com/view?i=e09_1207983531',
'http://www.dailymotion.com/relevance/search/rick+roll/video/x5l8e6_rickroll_fun',
'http://revver.com/video/1199591/rick-rolld-at-work/',
'http://www.escapistmagazine.com/videos/view/zero-punctuation/10-The-Orange-Box',
'http://www.escapistmagazine.com/videos/view/unskippable/736-Lost-Odyssey',
# justin.tv has two media types that we care about, streams, which
# we can scrape, and clips, which we can't
'http://www.justin.tv/help', # stream
'http://www.justin.tv/clip/c07a333f94e5716b', # clip, which we can't currently scrape, and shouldn't try
'http://soundcloud.com/kalhonaaho01/never-gonna-stand-you-up-rick-astley-vs-ludacris-album-version',
'http://listen.grooveshark.com/#/song/Never_Gonna_Give_You_Up/12616328',
'http://tinysong.com/2WOJ', # also Grooveshark
'http://www.rickrolled.com/videos/video/rickrolld' # test the DeepScraper
]
def submit_all():
from r2.models import Subreddit, Account, Link, NotFound
from r2.lib.media import set_media
from r2.lib.db import queries
sr = Subreddit._by_name('testmedia')
author = Account._by_name('testmedia')
links = []
for url in test_urls:
try:
# delete any existing version of the link
l = Link._by_url(url, sr)
print "Deleting %s" % l
l._deleted = True
l._commit()
except NotFound:
pass
l = Link._submit(url, url, author, sr, '0.0.0.0')
try:
set_media(l)
except Exception, e:
print e
if g.write_query_queue:
queries.new_link(l)
links.append(l)
return links
def test():
#from r2.lib.pool2 import WorkQueue
jobs = []
f = open('/tmp/testurls.txt')
for url in f:
if url.startswith('#'):
continue
if url.startswith('/info'):
continue
def make_job(url):
def fetch(url):
print 'START', url
url = url.strip()
h = make_scraper(url)
image_url = h.largest_image_url()
print 'DONE', image_url
return lambda: fetch(url)
"""Take some example URLs and print out a nice pretty HTML table
of their extracted thubmnails and media objects"""
import sys
from r2.lib.filters import websafe
jobs.append(make_job(url))
print "<html><body><table border=\"1\">"
for url in test_urls:
sys.stderr.write("%s\n" % url)
print "<tr>"
h = make_scraper(url)
print "<td>"
print "<b>", websafe(url), "</b>"
print "<br />"
print websafe(repr(h))
img = h.largest_image_url()
if img:
print "<td><img src=\"%s\" /></td>" % img
else:
print "<td>(no image)</td>"
mo = h.media_object()
print "<td>"
if mo:
s = scrapers[mo['type']]
print websafe(repr(mo))
print "<br />"
print s.media_embed(**mo).content
else:
print "None"
print "</td>"
print "</tr>"
print "</table></body></html>"
print jobs[0]()
#wq = WorkQueue(jobs)
#wq.start()
if __name__ == '__main__':
test()

View File

@@ -211,6 +211,7 @@ class Link(Thing, Printable):
s.append(request.get.has_key('twocolumn'))
elif style == "xml":
s.append(request.GET.has_key("nothumbs"))
s.append(getattr(wrapped, 'media_object', {}))
return s
def make_permalink(self, sr, force_domain = False):

View File

@@ -0,0 +1,24 @@
## The contents of this file are subject to the Common Public Attribution
## License Version 1.0. (the "License"); you may not use this file except in
## compliance with the License. You may obtain a copy of the License at
## http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
## License Version 1.1, but Sections 14 and 15 have been added to cover use of
## software over a computer network and provide for limited attribution for the
## Original Developer. In addition, Exhibit A has been modified to be consistent
## with Exhibit B.
##
## Software distributed under the License is distributed on an "AS IS" basis,
## WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
## the specific language governing rights and limitations under the License.
##
## The Original Code is Reddit.
##
## The Original Developer is the Initial Developer. The Initial Developer of
## the Original Code is CondeNet, Inc.
##
## All portions of the code written by CondeNet are Copyright (c) 2006-2009
## CondeNet, Inc. All Rights Reserved.
################################################################################
<iframe src="http://${thing.media_domain}/mediaembed/${thing.id36}"
width="${thing.width}" height="${thing.height}" border="0"
frameBorder="0" scrolling="no"></iframe>

View File

@@ -0,0 +1,33 @@
## The contents of this file are subject to the Common Public Attribution
## License Version 1.0. (the "License"); you may not use this file except in
## compliance with the License. You may obtain a copy of the License at
## http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
## License Version 1.1, but Sections 14 and 15 have been added to cover use of
## software over a computer network and provide for limited attribution for the
## Original Developer. In addition, Exhibit A has been modified to be consistent
## with Exhibit B.
##
## Software distributed under the License is distributed on an "AS IS" basis,
## WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
## the specific language governing rights and limitations under the License.
##
## The Original Code is Reddit.
##
## The Original Developer is the Initial Developer. The Initial Developer of
## the Original Code is CondeNet, Inc.
##
## All portions of the code written by CondeNet are Copyright (c) 2006-2009
## CondeNet, Inc. All Rights Reserved.
################################################################################
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<style type="text/css">
body, object, embed, div, span, p {
margin: 0;
padding: 0;
}
</style>
</head>
<body>${unsafe(thing.body)}</body>
</html>

View File

@@ -83,7 +83,7 @@ setup(
"flup",
"simplejson",
"SQLAlchemy==0.5.3",
"BeautifulSoup >= 3",
"BeautifulSoup == 3.0.7a", # last version to use the good parser
"cssutils==0.9.5.1",
"chardet",
"psycopg2",