# Files
# reddit/r2/r2/lib/scraper.py
# 2013-08-08 14:53:34 -07:00
#
# 1865 lines
# 89 KiB
# Python

# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2013 reddit
# Inc. All Rights Reserved.
###############################################################################
from pylons import g
from r2.lib import utils
from r2.lib.memoize import memoize
import simplejson as json
from urllib2 import Request, HTTPError, URLError, urlopen
from httplib import InvalidURL
import urlparse, re, urllib, logging, StringIO, logging
import Image, ImageFile, math
from BeautifulSoup import BeautifulSoup
# Logger and outbound User-Agent string, both read from the pylons globals.
log, useragent = g.log, g.useragent

# Number of bytes requested per read() when a response body is streamed.
chunk_size = 1024

# Bounding size handed to Image.thumbnail() by prepare_image().
thumbnail_size = (70, 70)
def image_to_str(image):
    """Return *image* encoded in its own format as a string.

    The image is saved with ``image.format`` into an in-memory buffer and the
    complete buffer contents are returned.
    """
    buf = StringIO.StringIO()
    image.save(buf, image.format)
    return buf.getvalue()
def str_to_image(s):
    """Open the encoded image data in *s* and return the resulting image."""
    # a freshly built StringIO is already positioned at offset 0
    return Image.open(StringIO.StringIO(s))
def prepare_image(image):
    """Square *image* off, then shrink it in place to ``thumbnail_size``.

    Returns the squared, thumbnailed image.
    """
    squared = square_image(image)
    squared.thumbnail(thumbnail_size, Image.ANTIALIAS)
    return squared
def image_entropy(img):
    """calculate the entropy of an image

    The entropy (in bits) is computed from the distribution returned by
    ``img.histogram()``. An image whose histogram sums to zero (for example a
    zero-area crop) has no pixels to measure, so its entropy is reported as
    0.0 instead of dividing by zero.
    """
    hist = img.histogram()
    hist_size = sum(hist)
    if not hist_size:
        return 0.0

    # empty bins contribute nothing and must be skipped: log(0) is undefined
    probabilities = [float(h) / hist_size for h in hist if h]
    return -sum([p * math.log(p, 2) for p in probabilities])
def square_image(img):
    """Crop a portrait image down to a square and return the result.

    While the image is taller than it is wide, a strip of at most 10px is
    removed from either the top or the bottom edge -- whichever strip has the
    lower entropy. Images that are not taller than wide are returned as-is.
    """
    width, height = img.size
    while height > width:
        # never cut more than is needed to reach a square
        strip = min(height - width, 10)

        bottom_strip = img.crop((0, height - strip, width, height))
        top_strip = img.crop((0, 0, width, strip))

        # keep the edge that carries more information
        if image_entropy(bottom_strip) < image_entropy(top_strip):
            keep = (0, 0, width, height - strip)
        else:
            keep = (0, strip, width, height)

        img = img.crop(keep)
        width, height = img.size

    return img
def clean_url(url):
"""url quotes unicode data out of urls"""
s = url
url = url.encode('utf8')
url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url])
return url
def fetch_url(url, referer = None, retries = 1, dimension = False):
    """Download *url*, or just probe the size of the image it points to.

    url       -- address to fetch; only http:// and https:// are attempted
    referer   -- optional value for the Referer request header
    retries   -- total number of attempts made before giving up
    dimension -- when true, return only the image size

    Returns ``(content_type, content)`` normally and ``(None, None)`` on
    failure. With ``dimension`` set, returns the parsed image's ``size`` or
    ``None`` when the response is not an image, cannot be parsed, or the
    fetch failed.
    """
    cur_try = 0
    log.debug('fetching: %s' % url)
    nothing = None if dimension else (None, None)
    url = clean_url(url)
    #just basic urls
    if not url.startswith(('http://', 'https://')):
        return nothing
    while True:
        # reset per attempt so the finally clause only ever closes the
        # handle opened by *this* attempt, never a stale one from a retry
        open_req = None
        try:
            req = Request(url)
            if useragent:
                req.add_header('User-Agent', useragent)
            if referer:
                req.add_header('Referer', referer)

            open_req = urlopen(req)

            #if we only need the dimension of the image, we may not
            #need to download the entire thing
            if dimension:
                content = open_req.read(chunk_size)
            else:
                content = open_req.read()
            content_type = open_req.headers.get('content-type')

            if not content_type:
                return nothing

            if 'image' in content_type:
                # feed the parser chunk by chunk until it has recognised the
                # image or the response is exhausted
                p = ImageFile.Parser()
                new_data = content
                while not p.image and new_data:
                    p.feed(new_data)
                    new_data = open_req.read(chunk_size)
                    content += new_data

                #return the size, or return the data
                if dimension and p.image:
                    return p.image.size
                elif dimension:
                    return nothing
            elif dimension:
                #expected an image, but didn't get one
                return nothing

            return content_type, content

        except (URLError, HTTPError, InvalidURL) as e:
            cur_try += 1
            if cur_try >= retries:
                log.debug('error while fetching: %s referer: %s' % (url, referer))
                log.debug(e)
                return nothing
        finally:
            if open_req is not None:
                open_req.close()
@memoize('media.fetch_size')
def fetch_size(url, referer = None, retries = 1):
    """Memoized wrapper that calls fetch_url() in dimension-only mode."""
    size = fetch_url(url, referer = referer, retries = retries,
                     dimension = True)
    return size
class MediaEmbed(object):
    """Plain holder for an embeddable snippet and its display settings."""

    # class-level fallbacks; __init__ replaces all four on every instance
    content = None
    height = None
    width = None
    scrolling = False

    def __init__(self, height, width, content, scrolling = False):
        # dimensions are coerced with int(), so numeric strings are accepted
        self.height, self.width = int(height), int(width)
        self.content, self.scrolling = content, scrolling
class Scraper:
    """Generic scraper: downloads a url and picks a thumbnail image for it.

    Instance attributes:
      url          -- the address being scraped
      content      -- response body, None until a download succeeds
      content_type -- response content-type header, None until then
      soup         -- BeautifulSoup tree, built only for html responses
    """

    def __init__(self, url):
        self.url = url
        self.content = None
        self.content_type = None
        self.soup = None

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.url)

    def download(self):
        """Fetch self.url and parse the body when it is html."""
        self.content_type, self.content = fetch_url(self.url)
        if self.content_type and 'html' in self.content_type and self.content:
            self.soup = BeautifulSoup(self.content)

    def image_urls(self):
        """Yield candidate image urls: the url itself when it is an image,
        otherwise the absolute src of every <img> found in the page."""
        #if the original url was an image, use that
        #(content_type is still None when nothing has been downloaded)
        if self.content_type and 'image' in self.content_type:
            yield self.url
        elif self.soup:
            for img in self.soup.findAll('img', src = True):
                yield urlparse.urljoin(self.url, img['src'])

    @staticmethod
    def _too_elongated(size):
        """True when the longer side is more than 1.5x the shorter side."""
        shorter = min(size)
        if shorter <= 0:
            return True
        # float() matters: on Python 2, int / int truncates, which silently
        # turned the 1.5 threshold into 2.0
        return float(max(size)) / shorter > 1.5

    def largest_image_url(self):
        """Return the url of the best thumbnail candidate, or None.

        An og:image meta tag or an image_src link, when present, wins
        outright. Otherwise every candidate from image_urls() is measured
        and the one with the largest (possibly penalized) area is chosen.
        """
        if not self.content:
            self.download()

        #if download didn't work
        if not self.content or not self.content_type:
            return None

        if self.soup:
            og_image = (self.soup.find('meta', property='og:image') or
                        self.soup.find('meta', attrs={'name': 'og:image'}))
            # .get() so a tag lacking the attribute is skipped, not a KeyError
            if og_image and og_image.get('content'):
                log.debug("Using og:image")
                return og_image['content']
            thumbnail_spec = self.soup.find('link', rel = 'image_src')
            if thumbnail_spec and thumbnail_spec.get('href'):
                log.debug("Using image_src")
                return thumbnail_spec['href']

        max_area = 0
        max_url = None
        for image_url in self.image_urls():
            size = fetch_size(image_url, referer = self.url)
            if not size:
                continue

            area = size[0] * size[1]

            #ignore little images
            if area < 5000:
                log.debug('ignore little %s' % image_url)
                continue

            #ignore excessively long/wide images
            if self._too_elongated(size):
                log.debug('ignore dimensions %s' % image_url)
                continue

            #penalize images with "sprite" in their name
            if 'sprite' in image_url.lower():
                log.debug('penalizing sprite %s' % image_url)
                area //= 10

            if area > max_area:
                max_area = area
                max_url = image_url

        return max_url

    def thumbnail(self):
        """Download the chosen image and return it squared and thumbnailed.

        Returns None when there is no usable image, or when the image is an
        interlaced PNG that cannot be read.
        """
        image_url = self.largest_image_url()
        if not image_url:
            return None

        content_type, image_str = fetch_url(image_url, referer = self.url)
        if not image_str:
            return None

        image = str_to_image(image_str)
        try:
            image = prepare_image(image)
        except IOError as e:
            #can't read interlaced PNGs, ignore
            if 'interlaced' in str(e):
                return None
            raise
        return image

    def media_object(self):
        """Return the first media object found by any deepscraper, if any."""
        for deepscraper in deepscrapers:
            ds = deepscraper()
            found = ds.find_media_object(self)
            if found:
                return found

    @classmethod
    def media_embed(cls):
        raise NotImplementedError
class MediaScraper(Scraper):
    """Base class for site-specific scrapers that embed media by video id.

    Subclasses supply ``domains``, ``height``, ``width`` and the templates;
    ``$video_id`` inside a template is replaced by the extracted id.
    """
    media_template = ""
    thumbnail_template = ""
    video_id = None
    video_id_rx = None

    def __init__(self, url):
        Scraper.__init__(self, url)

        # cheapest option first: the subclass regex applied to the url
        rx = self.video_id_rx
        found = rx.match(url) if rx else None
        if found:
            self.video_id = found.groups()[0]

        # otherwise give the subclass extraction hook a chance
        if not self.video_id:
            extracted = self.video_id_extract()
            if extracted:
                self.video_id = extracted

        if self.video_id:
            return

        #no id at all: behave like a plain page scraper from here on
        log.debug('reverting to regular scraper: %s' % url)
        self.__class__ = Scraper

    def video_id_extract(self):
        """Hook for subclasses; the base implementation finds nothing."""
        return None

    def largest_image_url(self):
        """Thumbnail url built from the template, else the generic search."""
        if not self.thumbnail_template:
            return Scraper.largest_image_url(self)
        return self.thumbnail_template.replace('$video_id', self.video_id)

    def media_object(self):
        """Describe the media as a dict holding the video id and site type."""
        return {'video_id': self.video_id,
                'type': self.domains[0]}

    @classmethod
    def media_embed(cls, video_id = None, height = None, width = None, **kw):
        """Build a MediaEmbed from the template; class sizes are defaults."""
        embed_height = height or cls.height
        embed_width = width or cls.width
        html = cls.media_template.replace('$video_id', video_id)
        return MediaEmbed(height = embed_height,
                          width = embed_width,
                          content = html)
def youtube_in_google(google_url):
    """Return the url linked after the page's 'original-text' div, or None.

    The page at *google_url* is downloaded and the href of the first <a>
    following <div class="original-text"> is returned. None is returned when
    the page could not be parsed or any part of that structure is missing.
    """
    h = Scraper(google_url)
    h.download()
    try:
        youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
    # The tuple is required: "except AttributeError, KeyError" would catch
    # only AttributeError and bind it to the name KeyError. TypeError covers
    # findNext() returning None.
    except (AttributeError, KeyError, TypeError):
        return None
    log.debug('%s is really %s' % (google_url, youtube_url))
    return youtube_url
def make_scraper(url):
    """Instantiate the scraper class registered for *url*'s domain.

    Every suffix in ``scrapers`` that the domain ends with selects the first
    class registered under it; a later matching suffix overrides an earlier
    one. Plain ``Scraper`` is used when nothing matches.
    """
    domain = utils.domain(url)
    chosen = Scraper
    for suffix, clses in scrapers.iteritems():
        first = next((c for c in clses if domain.endswith(suffix)), None)
        if first is not None:
            chosen = first

    #sometimes youtube scrapers masquerade as google scrapers
    if chosen is GootubeScraper:
        youtube_url = youtube_in_google(url)
        if youtube_url:
            return make_scraper(youtube_url)

    return chosen(url)
########## site-specific video scrapers ##########
class YoutubeScraper(MediaScraper):
    """Scraper for youtube.com watch urls."""
    domains = ['youtube.com']
    height = 295
    width = 480
    # NOTE(review): the <object> width is 490 while <embed> and the class
    # width are 480 -- looks like a typo, left untouched; confirm.
    media_template = '<object width="490" height="295"><param name="movie" value="http://www.youtube.com/v/$video_id&fs=1"></param><param name="wmode" value="transparent"></param><param name="allowFullScreen" value="true"></param><embed src="http://www.youtube.com/v/$video_id&fs=1" type="application/x-shockwave-flash" wmode="transparent" allowFullScreen="true" width="480" height="295"></embed></object>'
    thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
    video_id_rx = re.compile(r'.*v=([A-Za-z0-9-_]+).*')
    video_deeplink_rx = re.compile(r'.*#t=(\d+)m(\d+)s.*')

    def video_id_extract(self):
        """Return the video id, with "&start=<seconds>" appended when the
        url carries a #t=XmYs deeplink, or None when the url has no id.

        NOTE(review): MediaScraper.__init__ only calls this when video_id_rx
        failed to match, so the deeplink branch looks unreachable from
        there -- confirm whether deeplinks are meant to be applied.
        """
        vid = self.video_id_rx.match(self.url)
        if not vid:
            # previously fell through to "return video_id" with the name
            # unbound, raising UnboundLocalError inside the constructor
            return None

        video_id = vid.group(1)
        d = self.video_deeplink_rx.match(self.url)
        if d:
            seconds = int(d.group(1)) * 60 + int(d.group(2))
            video_id += "&start=%d" % seconds
        return video_id

    def largest_image_url(self):
        # Remove the deeplink part from the video id
        return self.thumbnail_template.replace("$video_id",
                                               self.video_id.split("&")[0])
class TedScraper(MediaScraper):
    """Scraper for ted.com talks; the "video id" is the flashvars string
    taken from the talk's embed page."""
    domains = ['ted.com']
    height = 326
    width = 446
    media_template = '<object width="446" height="326"><param name="movie" value="http://video.ted.com/assets/player/swf/EmbedPlayer.swf"></param><param name="allowFullScreen" value="true" /><param name="wmode" value="transparent"></param><param name="bgColor" value="#ffffff"></param> <param name="flashvars" value="$video_id" /><embed src="http://video.ted.com/assets/player/swf/EmbedPlayer.swf" pluginspace="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash" wmode="transparent" bgColor="#ffffff" width="446" height="326" allowFullScreen="true" flashvars="$video_id"></embed></object>'
    flashvars_rx = re.compile('.*flashvars="(.*)".*')

    def video_id_extract(self):
        """Fetch the /talks/embed/ variant of the url and return the value
        of its flashvars attribute, or None when it cannot be found."""
        if "/talks/" not in self.url:
            return None

        content_type, content = fetch_url(self.url.replace("/talks/","/talks/embed/"))
        if not content:
            return None

        # NOTE(review): match() anchors at the start and "." does not cross
        # newlines, so this only finds flashvars on the first line -- confirm
        # that is what the embed page looks like.
        m = self.flashvars_rx.match(content)
        if m:
            return m.group(1)

    def largest_image_url(self):
        """Return the href of the page's image_src link, or None."""
        if not self.soup:
            self.download()

        if self.soup:
            # the link may be absent; subscripting None raised TypeError
            link = self.soup.find('link', rel = 'image_src')
            if link is not None:
                return link.get('href')
class MetacafeScraper(MediaScraper):
    """Scraper for metacafe.com watch pages."""
    domains = ['metacafe.com']
    height = 345
    width = 400
    media_template = '<embed src="$video_id" width="400" height="345" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
    video_id_rx = re.compile('.*/watch/([^/]+)/.*')

    def media_object(self):
        """Return a media dict whose video_id is the page's video_src link,
        or None when the page or the link is unavailable."""
        if not self.soup:
            self.download()

        if self.soup:
            # guard against a missing link/href instead of raising, as
            # BreakScraper does for the same tag
            link = self.soup.find('link', rel = 'video_src')
            video_url = link.get('href') if link is not None else None
            if video_url:
                return dict(video_id = video_url,
                            type = self.domains[0])
class GootubeScraper(MediaScraper):
    """Scraper for video.google.com videoplay pages."""
    domains = ['video.google.com']
    height = 326
    width = 400
    media_template = '<embed style="width:400px; height:326px;" id="VideoPlayback" type="application/x-shockwave-flash" src="http://video.google.com/googleplayer.swf?docId=$video_id&hl=en" flashvars=""> </embed>'
    video_id_rx = re.compile(r'.*videoplay\?docid=([A-Za-z0-9-_]+).*')
    gootube_thumb_rx = re.compile(r".*thumbnail:\s*'(http://[^/]+/ThumbnailServer2[^']+)'.*", re.IGNORECASE | re.S)

    def largest_image_url(self):
        """Pull the quoted ThumbnailServer2 url out of the raw page source;
        returns None when the page or the url is unavailable."""
        if not self.content:
            self.download()
        if not self.content:
            return None

        found = self.gootube_thumb_rx.match(self.content)
        if not found:
            return None

        # the captured text is passed through utils.safe_eval_str before use
        return utils.safe_eval_str(found.groups()[0])
class VimeoScraper(MediaScraper):
    """Scraper for vimeo.com pages."""
    domains = ['vimeo.com']
    height = 448
    width = 520
    media_template = '<embed src="$video_id" width="520" height="448" wmode="transparent" pluginspage="http://www.macromedia.com/go/getflashplayer" type="application/x-shockwave-flash"> </embed>'
    video_id_rx = re.compile('.*/(.*)')

    def media_object(self):
        """Return a media dict whose video_id is the page's video_src link,
        or None when the page or the link is unavailable."""
        if not self.soup:
            self.download()

        if self.soup:
            # guard against a missing link/href instead of raising, as
            # BreakScraper does for the same tag
            link = self.soup.find('link', rel = 'video_src')
            video_url = link.get('href') if link is not None else None
            if video_url:
                return dict(video_id = video_url,
                            type = self.domains[0])
class BreakScraper(MediaScraper):
    """Scraper for break.com pages."""
    domains = ['break.com']
    height = 421
    width = 520
    media_template = '<object width="520" height="421"><param name="movie" value="$video_id"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" allowScriptAccess="always" width="520" height="421"></embed></object>'
    video_id_rx = re.compile('.*/index/([^/]+).*')

    def video_id_extract(self):
        """Use the href of the page's video_src link as the video id."""
        if not self.soup:
            self.download()
        if not self.soup:
            return None

        video_src = self.soup.find('link', rel = 'video_src')
        if not video_src:
            return None

        href = video_src['href']
        if href:
            return href
class TheOnionScraper(MediaScraper):
    """Scraper for theonion.com video pages."""
    domains = ['theonion.com']
    height = 430
    width = 480
    media_template = """<object width="480" height="430">
<param name="allowfullscreen" value="true" />
<param name="allowscriptaccess" value="always" />
<param name="movie" value="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf?&amp;videoid=$video_id" />
<param name="wmode" value="transparent" />
<embed src="http://www.theonion.com/content/themes/common/assets/onn_embed/embedded_player.swf"
width="480" height="430"
wmode="transparent"
pluginspage="http://www.macromedia.com/go/getflashplayer"
type="application/x-shockwave-flash"
flashvars="videoid=$video_id" >
</embed>
</object>"""
    video_id_rx = re.compile('.*/video/([^/?#]+).*')

    def media_object(self):
        """Return a media dict whose video_id is the content of the page's
        <meta name="nid"> tag, or None when it is unavailable."""
        if not self.soup:
            self.download()

        if self.soup:
            # the meta tag may be absent; subscripting None raised TypeError
            meta = self.soup.find('meta', attrs={'name': 'nid'})
            video_url = meta.get('content') if meta is not None else None
            if video_url:
                return dict(video_id = video_url,
                            type = self.domains[0])
class CollegeHumorScraper(MediaScraper):
    """Scraper for collegehumor.com; the id follows "video:" in the url."""
    domains = ['collegehumor.com']
    video_id_rx = re.compile(r'.*video:(\d+).*')
    width = 520
    height = 390
    media_template = '<object type="application/x-shockwave-flash" data="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" width="520" height="390" ><param name="allowfullscreen" value="true" /><param name="AllowScriptAccess" value="true" /><param name="movie" quality="best" value="http://www.collegehumor.com/moogaloop/moogaloop.swf?clip_id=$video_id&fullscreen=1" /></object>'
class FunnyOrDieScraper(MediaScraper):
    """Scraper for funnyordie.com; the id is the path part after /videos/."""
    domains = ['funnyordie.com']
    video_id_rx = re.compile('.*/videos/([^/]+)/.*')
    width = 464
    height = 438
    thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
    media_template = '<object width="464" height="438" classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000" id="fodplayer"><param name="movie" value="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac?key=$video_id" /><param name="flashvars" value="key=$video_id&autostart=true&internal=true" /><param name="allowfullscreen" value="true" /><embed width="464" height="438" flashvars="key=$video_id&autostart=true" allowfullscreen="true" quality="high" src="http://player.ordienetworks.com/flash/fodplayer.swf?c79e63ac" name="fodplayer" type="application/x-shockwave-flash"></embed></object>'
class ComedyCentralScraper(MediaScraper):
    """Scraper for comedycentral.com; the id is the numeric videoId param."""
    domains = ['comedycentral.com']
    video_id_rx = re.compile(r'.*videoId=(\d+).*')
    width = 332
    height = 316
    media_template = '<embed FlashVars="videoId=$video_id" src="http://www.comedycentral.com/sitewide/video_player/view/default/swf.jhtml" quality="high" bgcolor="#cccccc" width="332" height="316" name="comedy_central_player" align="middle" allowScriptAccess="always" allownetworking="external" type="application/x-shockwave-flash" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>'
class TheDailyShowScraper(MediaScraper):
    """Scraper for thedailyshow.com pages."""
    domains = ['thedailyshow.com']
    height = 353
    width = 360
    media_template = """<embed style='display:block' src='http://media.mtvnservices.com/mgid:cms:item:comedycentral.com:$video_id' width='360' height='301' type='application/x-shockwave-flash' wmode='window' allowFullscreen='true' flashvars='autoPlay=false' allowscriptaccess='always' allownetworking='all' bgcolor='#000000'></embed>"""

    def video_id_extract(self):
        """Read the id from the 'videoplayerPromo module' div, whose id
        attribute has the form promo_<video id>. Admittedly a hack."""
        if not self.soup:
            self.download()
        if not self.soup:
            return None

        embed_container = self.soup.find('div', {'class': 'videoplayerPromo module'})
        if not embed_container:
            return None

        container_id = embed_container['id']
        if container_id.startswith('promo_'):
            return container_id.split('_')[1]
class ColbertNationScraper(ComedyCentralScraper):
    """colbertnation.com variant of ComedyCentralScraper: same template and
    dimensions, with the numeric id taken from the videos/<id>/ path."""
    video_id_rx = re.compile(r'.*videos/(\d+)/.*')
    domains = ['colbertnation.com']
class LiveLeakScraper(MediaScraper):
    """Scraper for liveleak.com; the id is the i= url parameter."""
    domains = ['liveleak.com']
    height = 370
    width = 450
    media_template = '<object width="450" height="370"><param name="movie" value="http://www.liveleak.com/e/$video_id"></param><param name="wmode" value="transparent"></param><embed src="http://www.liveleak.com/e/$video_id" type="application/x-shockwave-flash" wmode="transparent" width="450" height="370"></embed></object>'
    video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')

    def largest_image_url(self):
        """Return the href of the page's videothumbnail link, or None."""
        if not self.soup:
            self.download()

        if self.soup:
            # the link may be absent; subscripting None raised TypeError
            link = self.soup.find('link', rel = 'videothumbnail')
            if link is not None:
                return link.get('href')
class DailyMotionScraper(MediaScraper):
    """Scraper for dailymotion.com video pages."""
    domains = ['dailymotion.com']
    height = 381
    width = 480
    media_template = '<object width="480" height="381"><param name="movie" value="$video_id"></param><param name="allowFullScreen" value="true"></param><param name="allowScriptAccess" value="always"></param><embed src="$video_id" type="application/x-shockwave-flash" width="480" height="381" allowFullScreen="true" allowScriptAccess="always"></embed></object>'
    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')

    def media_object(self):
        """Return a media dict whose video_id is the page's video_src link,
        or None when the page or the link is unavailable."""
        if not self.soup:
            self.download()

        if self.soup:
            # guard against a missing link/href instead of raising, as
            # BreakScraper does for the same tag
            link = self.soup.find('link', rel = 'video_src')
            video_url = link.get('href') if link is not None else None
            if video_url:
                return dict(video_id = video_url,
                            type = self.domains[0])
class RevverScraper(MediaScraper):
    """Scraper for revver.com; the id is the path part after /video/."""
    domains = ['revver.com']
    video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
    width = 480
    height = 392
    media_template = '<script src="http://flash.revver.com/player/1.0/player.js?mediaId:$video_id;width:480;height:392;" type="text/javascript"></script>'
class EscapistScraper(MediaScraper):
    """Scraper for escapistmagazine.com; the id is the number following the
    show name in /videos/view/<show>/<id>."""
    domains = ['escapistmagazine.com']
    height = 294
    width = 480
    media_template = """<script src="http://www.escapistmagazine.com/videos/embed/$video_id"></script>"""
    # The show-name class used to read [A-Za-z-9-], which admits letters,
    # "-" and the single digit 9; the missing "0" made any show name holding
    # another digit fail to match.
    video_id_rx = re.compile(r'.*/videos/view/[A-Za-z0-9-]+/([0-9]+).*')
class JustintvScraper(MediaScraper):
    """Scraper for justin.tv channel streams (clips are not handled); the
    "video id" is the channel name taken from the url."""
    domains = ['justin.tv']
    height = 295
    width = 353
    stream_media_template = """<object type="application/x-shockwave-flash" height="295" width="353" id="jtv_player_flash" data="http://www.justin.tv/widgets/jtv_player.swf?channel=$video_id" bgcolor="#000000"><param name="allowFullScreen" value="true" /><param name="allowScriptAccess" value="always" /><param name="allowNetworking" value="all" /><param name="movie" value="http://www.justin.tv/widgets/jtv_player.swf" /><param name="flashvars" value="channel=$video_id&auto_play=false&start_volume=25" /></object>"""
    video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')

    @classmethod
    def media_embed(cls, video_id, **kw):
        """Build the stream embed; always uses the class height and width."""
        html = cls.stream_media_template.replace('$video_id', video_id)
        return MediaEmbed(cls.height, cls.width, html)
class SoundcloudScraper(MediaScraper):
    """Scraper for soundcloud.com; the id is the second path segment of a
    http://soundcloud.com/<first>/<second> url."""
    domains = ['soundcloud.com']
    video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
    width = 400
    height = 81
    media_template = """<div style="font-size: 11px;">
<object height="81" width="100%">
<param name="movie"
value="http://player.soundcloud.com/player.swf?track=$video_id">
</param>
<param name="allowscriptaccess" value="always"></param>
<embed allowscriptaccess="always" height="81"
src="http://player.soundcloud.com/player.swf?track=$video_id"
type="application/x-shockwave-flash"
width="100%">
</embed>
</object>"""
class CraigslistScraper(MediaScraper):
    """Scraper for craigslist.org postings: embeds the posting's userbody
    div itself rather than a video."""
    domains = ['craigslist.org']
    height = 480
    width = 640
    # upper bound, in KiB, on the serialized userbody that will be embedded
    max_size_kb = 50

    def video_id_extract(self):
        """The url itself serves as the id, so every posting qualifies."""
        return self.url

    def media_object(self):
        """Return a media dict holding the serialized userbody div, or None
        when the page, the div, or the size limit rules it out."""
        if not self.soup:
            self.download()
        if not self.soup:
            return None

        ub = self.soup.find('div', {'id': 'userbody'})
        if not ub:
            return None

        markup = str(ub)
        if len(markup) > self.max_size_kb * 1024:
            return None

        return {'content': markup,
                'type': self.domains[0]}

    @classmethod
    def media_embed(cls, content, **kw):
        """Wrap *content* in a scrolling MediaEmbed of the class size."""
        return MediaEmbed(cls.height, cls.width, content, scrolling = True)
########## oembed rich-media scrapers ##########
class OEmbed(Scraper):
    """
    Oembed Scraper
    ==============
    Tries to use the oembed standard to create a media object.

    url_re: Regular Expression to match the incoming url against.
    api_endpoint: Url of the api end point you are using.
    api_params: Default Params to be sent with the outgoing request.

    Subclasses are expected to replace url_re with a compiled pattern and
    to define ``domains``; the placeholders below are not usable as-is.
    """
    url_re = ''
    api_endpoint = ''
    api_params = {}

    def __init__(self, url):
        Scraper.__init__(self, url)
        # parsed oembed response, filled in by download()
        self.oembed = None

        #Fallback to the scraper if the url doesn't match
        if not self.url_re.match(self.url):
            self.__class__ = Scraper

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.url)

    def utf8_encode(self, input):
        """UTF-8 encodes any strings in an object (from json.loads)"""
        if isinstance(input, dict):
            return {self.utf8_encode(key): self.utf8_encode(value)
                    for key, value in input.iteritems()}
        elif isinstance(input, list):
            return [self.utf8_encode(item)
                    for item in input]
        elif isinstance(input, unicode):
            return input.encode('utf-8')
        else:
            return input

    def download(self):
        """Call the oembed endpoint for self.url and store the parsed
        response in self.oembed; on failure it is logged and left as None."""
        # Work on a copy: api_params is a class attribute shared by every
        # instance, so updating it in place leaked one request's url into
        # the defaults seen by all the others.
        params = dict(self.api_params)
        params['url'] = self.url
        query = urllib.urlencode(params)
        api_url = "%s?%s" % (self.api_endpoint, query)

        self.content_type, self.content = fetch_url(api_url)

        #Either a 404 or 500.
        if not self.content:
            log.warning('oEmbed call (%s) failed to return content for %s'
                        %(api_url, self.url))
            return None

        try:
            self.oembed = json.loads(self.content,
                                     object_hook=self.utf8_encode)
        except ValueError:
            log.error('oEmbed call (%s) return invalid json for %s'
                      %(api_url, self.url))
            return None

    def image_urls(self):
        """Yield the single image url the oembed response offers, if any."""
        #if the original url was an image, use that
        if self.oembed and self.oembed.get('type') =='photo':
            yield self.oembed.get('url')
        elif self.oembed and self.oembed.get('thumbnail_url'):
            yield self.oembed.get('thumbnail_url')

    def largest_image_url(self):
        """Return the photo url or thumbnail_url from the oembed response,
        downloading it first when needed; None when neither is present."""
        #Seems to be the default place to check if the download has happened.
        if not self.oembed:
            self.download()

        #if the original url was of the photo type
        if self.oembed and self.oembed.get('type') =='photo':
            return self.oembed.get('url')
        elif self.oembed and self.oembed.get('thumbnail_url'):
            return self.oembed.get('thumbnail_url')

    def media_object(self):
        """Return {type: <domain>, oembed: <response>} for video/rich
        responses whose url contains one of self.domains, else None."""
        #Seems to be the default place to check if the download has happened.
        if not self.oembed:
            self.download()

        if self.oembed and self.oembed.get('type') in ['video', 'rich']:
            for domain in self.domains:
                if domain in self.url:
                    return dict(type=domain, oembed=self.oembed)
        return None

    @classmethod
    def media_embed(cls, video_id = None, height = None, width = None, **kw):
        """Build a MediaEmbed from the ``oembed`` keyword argument.

        Returns None unless the oembed dict carries html, height and width;
        the video_id, height and width arguments are not consulted.
        """
        content = None
        oembed = kw.get('oembed')

        # check if oembed is there and has html
        if oembed and oembed.get('html'):
            content = oembed.get('html')
        if content and oembed.get('height') and oembed.get('width'):
            return MediaEmbed(height = oembed['height'],
                              width = oembed['width'],
                              content = content)
class EmbedlyOEmbed(OEmbed):
"""
Embedly oEmbed Provider
=======================
documentation: http://api.embed.ly
"""
domains = ['23hq.com', '5min.com', '99dollarmusicvideos.com',
'abcnews.go.com', 'achewood.com', 'allthingsd.com', 'amazon.com',
'aniboom.com', 'animoto.com', 'asofterworld.com', 'atom.com',
'audioboo.com', 'bambuser.com', 'bandcamp.com', 'barelydigital.com',
'barelypolitical.com', 'bigthink.com', 'blip.tv', 'bnter.com',
'boston.com', 'brainbird.net', 'bravotv.com', 'break.com',
'brizzly.com', 'cbsnews.com', 'channelfrederator.com', 'chart.ly',
'cl.ly', 'clikthrough.com', 'clipfish.de', 'clipshack.com', 'cnbc.com',
'cnn.com', 'colbertnation.com', 'collegehumor.com', 'color.com',
'comedycentral.com', 'compete.com', 'confreaks.net', 'crackle.com',
'craigslist.org', 'crocodoc.com', 'crunchbase.com', 'dailybooth.com',
'dailymile.com', 'dailymotion.com', 'deviantart.com', 'digg.com',
'dipdive.com', 'discovery.com', 'dotsub.com', 'dribbble.com',
'edition.cnn.com', 'emberapp.com', 'escapistmagazine.com',
'espn.go.com', 'facebook.com', 'fancast.com', 'flickr.com', 'fora.tv',
'formspring.me', 'fotopedia.com', 'freemusicarchive.org',
'funnyordie.com', 'gametrailers.com', 'gist.github.com',
'globalpost.com', 'godtube.com', 'gogoyoko.com', 'google.com',
'graphicly.com', 'grindtv.com', 'grooveshark.com', 'guardian.co.uk',
'hark.com', 'howcast.com', 'huffduffer.com', 'hulu.com',
'hungrynation.tv', 'ifood.tv', 'img.ly', 'imgur.com', 'indenti.ca',
'indymogul.com', 'instagr.am', 'issuu.com', 'itunes.apple.com',
'justin.tv', 'kickstarter.com', 'kinomap.com', 'kiva.org',
'koldcast.tv', 'last.fm', 'lightbox.com', 'liveleak.com',
'livestream.com', 'lockerz.com', 'logotv.com', 'lonelyplanet.com',
'maps.google.com', 'meadd.com', 'mediamatters.org', 'meetup.com',
'metacafe.com', 'metacdn.com', 'mixcloud.com', 'mixergy.com',
'mlkshk.com', 'mobypicture.com', 'money.cnn.com', 'movies.yahoo.com',
'msnbc.com', 'my.opera.com', 'myloc.me', 'myvideo.de',
'nationalgeographic.com', 'nfb.ca', 'npr.org', 'nzonscreen.com',
'overstream.net', 'ow.ly', 'pastebin.com', 'pastie.org',
'phodroid.com', 'photobucket.com', 'photozou.jp',
'picasaweb.google.com', 'picplz.com', 'pikchur.com', 'ping.fm',
'polldaddy.com', 'polleverywhere.com', 'posterous.com', 'prezi.com',
'qik.com', 'quantcast.com', 'questionablecontent.net', 'qwantz.com',
'qwiki.com', 'radionomy.com', 'radioreddit.com', 'rdio.com',
'recordsetter.com','redux.com', 'revision3.com', 'revver.com',
'saynow.com', 'schooltube.com', 'sciencestage.com', 'scrapblog.com',
'screencast.com', 'screenr.com', 'scribd.com', 'sendables.jibjab.com',
'share.ovi.com', 'shitmydadsays.com', 'shopstyle.com', 'skitch.com',
'slideshare.net', 'smugmug.com', 'snotr.com', 'socialcam.com',
'someecards.com', 'soundcloud.com', 'speakerdeck.com', 'spike.com',
'statsheet.com', 'status.net', 'storify.com', 'streetfire.net',
'studivz.net', 'tangle.com', 'teachertube.com', 'techcrunch.tv',
'ted.com', 'thedailyshow.com', 'theonion.com', 'threadbanger.com',
'timetoast.com', 'tinypic.com', 'tmiweekly.com', 'traileraddict.com',
'trailerspy.com', 'trooptube.tv', 'trutv.com', 'tumblr.com',
'twitgoo.com', 'twitlonger.com', 'twitpic.com', 'twitrpix.com',
'twitter.com', 'twitvid.com', 'ultrakawaii.com', 'urtak.com',
'uservoice.com', 'ustream.com', 'viddler.com', 'video.forbes.com',
'video.google.com', 'video.jardenberg.com', 'video.pbs.org',
'video.yahoo.com', 'videos.nymag.com', 'vids.myspace.com', 'vimeo.com',
'vodcars.com', 'washingtonpost.com', 'whitehouse.gov', 'whosay.com',
'wikimedia.org', 'wikipedia.org', 'wistia.com', 'wordpress.tv',
'worldstarhiphop.com', 'xiami.com', 'xkcd.com', 'xtranormal.com',
'yfrog.com', 'youku.com', 'youtu.be', 'youtube.com', 'zapiks.com',
'zero-inch.com']
url_re = re.compile(
'http:\\/\\/.*youtube\\.com\\/watch.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
'https:\\/\\/.*youtube\\.com\\/watch.*|' +
'https:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
'http:\\/\\/youtu\\.be\\/.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/user\\/.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/.*\\#.*\\/.*|' +
'http:\\/\\/m\\.youtube\\.com\\/watch.*|' +
'http:\\/\\/m\\.youtube\\.com\\/index.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/profile.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/view_play_list.*|' +
'http:\\/\\/.*\\.youtube\\.com\\/playlist.*|' +
'http:\\/\\/.*justin\\.tv\\/.*|' +
'http:\\/\\/.*justin\\.tv\\/.*\\/b\\/.*|' +
'http:\\/\\/.*justin\\.tv\\/.*\\/w\\/.*|' +
'http:\\/\\/www\\.ustream\\.tv\\/recorded\\/.*|' +
'http:\\/\\/www\\.ustream\\.tv\\/channel\\/.*|' +
'http:\\/\\/www\\.ustream\\.tv\\/.*|' +
'http:\\/\\/qik\\.com\\/video\\/.*|' +
'http:\\/\\/qik\\.com\\/.*|' +
'http:\\/\\/qik\\.ly\\/.*|' +
'http:\\/\\/.*revision3\\.com\\/.*|' +
'http:\\/\\/.*\\.dailymotion\\.com\\/video\\/.*|' +
'http:\\/\\/.*\\.dailymotion\\.com\\/.*\\/video\\/.*|' +
'http:\\/\\/collegehumor\\.com\\/video:.*|' +
'http:\\/\\/collegehumor\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.collegehumor\\.com\\/video:.*|' +
'http:\\/\\/www\\.collegehumor\\.com\\/video\\/.*|' +
'http:\\/\\/.*twitvid\\.com\\/.*|' +
'http:\\/\\/www\\.break\\.com\\/.*\\/.*|' +
'http:\\/\\/vids\\.myspace\\.com\\/index\\.cfm\\?fuseaction=vids\\.individual&videoid.*|' +
'http:\\/\\/www\\.myspace\\.com\\/index\\.cfm\\?fuseaction=.*&videoid.*|' +
'http:\\/\\/www\\.metacafe\\.com\\/watch\\/.*|' +
'http:\\/\\/www\\.metacafe\\.com\\/w\\/.*|' +
'http:\\/\\/blip\\.tv\\/.*\\/.*|' +
'http:\\/\\/.*\\.blip\\.tv\\/.*\\/.*|' +
'http:\\/\\/video\\.google\\.com\\/videoplay\\?.*|' +
'http:\\/\\/.*revver\\.com\\/video\\/.*|' +
'http:\\/\\/video\\.yahoo\\.com\\/watch\\/.*\\/.*|' +
'http:\\/\\/video\\.yahoo\\.com\\/network\\/.*|' +
'http:\\/\\/.*viddler\\.com\\/explore\\/.*\\/videos\\/.*|' +
'http:\\/\\/liveleak\\.com\\/view\\?.*|' +
'http:\\/\\/www\\.liveleak\\.com\\/view\\?.*|' +
'http:\\/\\/animoto\\.com\\/play\\/.*|' +
'http:\\/\\/dotsub\\.com\\/view\\/.*|' +
'http:\\/\\/www\\.overstream\\.net\\/view\\.php\\?oid=.*|' +
'http:\\/\\/www\\.livestream\\.com\\/.*|' +
'http:\\/\\/www\\.worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
'http:\\/\\/worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
'http:\\/\\/teachertube\\.com\\/viewVideo\\.php.*|' +
'http:\\/\\/www\\.teachertube\\.com\\/viewVideo\\.php.*|' +
'http:\\/\\/www1\\.teachertube\\.com\\/viewVideo\\.php.*|' +
'http:\\/\\/www2\\.teachertube\\.com\\/viewVideo\\.php.*|' +
'http:\\/\\/bambuser\\.com\\/v\\/.*|' +
'http:\\/\\/bambuser\\.com\\/channel\\/.*|' +
'http:\\/\\/bambuser\\.com\\/channel\\/.*\\/broadcast\\/.*|' +
'http:\\/\\/www\\.schooltube\\.com\\/video\\/.*\\/.*|' +
'http:\\/\\/bigthink\\.com\\/ideas\\/.*|' +
'http:\\/\\/bigthink\\.com\\/series\\/.*|' +
'http:\\/\\/sendables\\.jibjab\\.com\\/view\\/.*|' +
'http:\\/\\/sendables\\.jibjab\\.com\\/originals\\/.*|' +
'http:\\/\\/www\\.xtranormal\\.com\\/watch\\/.*|' +
'http:\\/\\/socialcam\\.com\\/v\\/.*|' +
'http:\\/\\/www\\.socialcam\\.com\\/v\\/.*|' +
'http:\\/\\/dipdive\\.com\\/media\\/.*|' +
'http:\\/\\/dipdive\\.com\\/member\\/.*\\/media\\/.*|' +
'http:\\/\\/dipdive\\.com\\/v\\/.*|' +
'http:\\/\\/.*\\.dipdive\\.com\\/media\\/.*|' +
'http:\\/\\/.*\\.dipdive\\.com\\/v\\/.*|' +
'http:\\/\\/v\\.youku\\.com\\/v_show\\/.*\\.html|' +
'http:\\/\\/v\\.youku\\.com\\/v_playlist\\/.*\\.html|' +
'http:\\/\\/www\\.snotr\\.com\\/video\\/.*|' +
'http:\\/\\/snotr\\.com\\/video\\/.*|' +
'http:\\/\\/video\\.jardenberg\\.se\\/.*|' +
'http:\\/\\/www\\.clipfish\\.de\\/.*\\/.*\\/video\\/.*|' +
'http:\\/\\/www\\.myvideo\\.de\\/watch\\/.*|' +
'http:\\/\\/www\\.whitehouse\\.gov\\/photos-and-video\\/video\\/.*|' +
'http:\\/\\/www\\.whitehouse\\.gov\\/video\\/.*|' +
'http:\\/\\/wh\\.gov\\/photos-and-video\\/video\\/.*|' +
'http:\\/\\/wh\\.gov\\/video\\/.*|' +
'http:\\/\\/www\\.hulu\\.com\\/watch.*|' +
'http:\\/\\/www\\.hulu\\.com\\/w\\/.*|' +
'http:\\/\\/hulu\\.com\\/watch.*|' +
'http:\\/\\/hulu\\.com\\/w\\/.*|' +
'http:\\/\\/.*crackle\\.com\\/c\\/.*|' +
'http:\\/\\/www\\.fancast\\.com\\/.*\\/videos|' +
'http:\\/\\/www\\.funnyordie\\.com\\/videos\\/.*|' +
'http:\\/\\/www\\.funnyordie\\.com\\/m\\/.*|' +
'http:\\/\\/funnyordie\\.com\\/videos\\/.*|' +
'http:\\/\\/funnyordie\\.com\\/m\\/.*|' +
'http:\\/\\/www\\.vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
'http:\\/\\/www\\.vimeo\\.com\\/.*|' +
'http:\\/\\/vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
'http:\\/\\/vimeo\\.com\\/.*|' +
'http:\\/\\/vimeo\\.com\\/m\\/\\#\\/.*|' +
'http:\\/\\/www\\.ted\\.com\\/talks\\/.*\\.html.*|' +
'http:\\/\\/www\\.ted\\.com\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/.*\\.html.*|' +
'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
'http:\\/\\/.*nfb\\.ca\\/film\\/.*|' +
'http:\\/\\/www\\.thedailyshow\\.com\\/watch\\/.*|' +
'http:\\/\\/www\\.thedailyshow\\.com\\/full-episodes\\/.*|' +
'http:\\/\\/www\\.thedailyshow\\.com\\/collection\\/.*\\/.*\\/.*|' +
'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video\\/.*|' +
'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/trailer|' +
'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video|' +
'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-collections\\/.*|' +
'http:\\/\\/www\\.colbertnation\\.com\\/full-episodes\\/.*|' +
'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-videos\\/.*|' +
'http:\\/\\/www\\.comedycentral\\.com\\/videos\\/index\\.jhtml\\?.*|' +
'http:\\/\\/www\\.theonion\\.com\\/video\\/.*|' +
'http:\\/\\/theonion\\.com\\/video\\/.*|' +
'http:\\/\\/wordpress\\.tv\\/.*\\/.*\\/.*\\/.*\\/|' +
'http:\\/\\/www\\.traileraddict\\.com\\/trailer\\/.*|' +
'http:\\/\\/www\\.traileraddict\\.com\\/clip\\/.*|' +
'http:\\/\\/www\\.traileraddict\\.com\\/poster\\/.*|' +
'http:\\/\\/www\\.escapistmagazine\\.com\\/videos\\/.*|' +
'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*\\/.*|' +
'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*|' +
'http:\\/\\/www\\.trailerspy\\.com\\/view_video\\.php.*|' +
'http:\\/\\/www\\.atom\\.com\\/.*\\/.*\\/|' +
'http:\\/\\/fora\\.tv\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.spike\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.gametrailers\\.com\\/video\\/.*|' +
'http:\\/\\/gametrailers\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.koldcast\\.tv\\/video\\/.*|' +
'http:\\/\\/www\\.koldcast\\.tv\\/\\#video:.*|' +
'http:\\/\\/techcrunch\\.tv\\/watch.*|' +
'http:\\/\\/techcrunch\\.tv\\/.*\\/watch.*|' +
'http:\\/\\/mixergy\\.com\\/.*|' +
'http:\\/\\/video\\.pbs\\.org\\/video\\/.*|' +
'http:\\/\\/www\\.zapiks\\.com\\/.*|' +
'http:\\/\\/tv\\.digg\\.com\\/diggnation\\/.*|' +
'http:\\/\\/tv\\.digg\\.com\\/diggreel\\/.*|' +
'http:\\/\\/tv\\.digg\\.com\\/diggdialogg\\/.*|' +
'http:\\/\\/www\\.trutv\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.nzonscreen\\.com\\/title\\/.*|' +
'http:\\/\\/nzonscreen\\.com\\/title\\/.*|' +
'http:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
'https:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
'http:\\/\\/hungrynation\\.tv\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.hungrynation\\.tv\\/.*\\/episode\\/.*|' +
'http:\\/\\/hungrynation\\.tv\\/episode\\/.*|' +
'http:\\/\\/www\\.hungrynation\\.tv\\/episode\\/.*|' +
'http:\\/\\/indymogul\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.indymogul\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/indymogul\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.indymogul\\.com\\/episode\\/.*|' +
'http:\\/\\/channelfrederator\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.channelfrederator\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/channelfrederator\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.channelfrederator\\.com\\/episode\\/.*|' +
'http:\\/\\/tmiweekly\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.tmiweekly\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/tmiweekly\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.tmiweekly\\.com\\/episode\\/.*|' +
'http:\\/\\/99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/99dollarmusicvideos\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/episode\\/.*|' +
'http:\\/\\/ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/ultrakawaii\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.ultrakawaii\\.com\\/episode\\/.*|' +
'http:\\/\\/barelypolitical\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.barelypolitical\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/barelypolitical\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.barelypolitical\\.com\\/episode\\/.*|' +
'http:\\/\\/barelydigital\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.barelydigital\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/barelydigital\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.barelydigital\\.com\\/episode\\/.*|' +
'http:\\/\\/threadbanger\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.threadbanger\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/threadbanger\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.threadbanger\\.com\\/episode\\/.*|' +
'http:\\/\\/vodcars\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/www\\.vodcars\\.com\\/.*\\/episode\\/.*|' +
'http:\\/\\/vodcars\\.com\\/episode\\/.*|' +
'http:\\/\\/www\\.vodcars\\.com\\/episode\\/.*|' +
'http:\\/\\/confreaks\\.net\\/videos\\/.*|' +
'http:\\/\\/www\\.confreaks\\.net\\/videos\\/.*|' +
'http:\\/\\/video\\.allthingsd\\.com\\/video\\/.*|' +
'http:\\/\\/videos\\.nymag\\.com\\/.*|' +
'http:\\/\\/aniboom\\.com\\/animation-video\\/.*|' +
'http:\\/\\/www\\.aniboom\\.com\\/animation-video\\/.*|' +
'http:\\/\\/clipshack\\.com\\/Clip\\.aspx\\?.*|' +
'http:\\/\\/www\\.clipshack\\.com\\/Clip\\.aspx\\?.*|' +
'http:\\/\\/grindtv\\.com\\/.*\\/video\\/.*|' +
'http:\\/\\/www\\.grindtv\\.com\\/.*\\/video\\/.*|' +
'http:\\/\\/ifood\\.tv\\/recipe\\/.*|' +
'http:\\/\\/ifood\\.tv\\/video\\/.*|' +
'http:\\/\\/ifood\\.tv\\/channel\\/user\\/.*|' +
'http:\\/\\/www\\.ifood\\.tv\\/recipe\\/.*|' +
'http:\\/\\/www\\.ifood\\.tv\\/video\\/.*|' +
'http:\\/\\/www\\.ifood\\.tv\\/channel\\/user\\/.*|' +
'http:\\/\\/logotv\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.logotv\\.com\\/video\\/.*|' +
'http:\\/\\/lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
'http:\\/\\/www\\.lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
'http:\\/\\/streetfire\\.net\\/video\\/.*\\.htm.*|' +
'http:\\/\\/www\\.streetfire\\.net\\/video\\/.*\\.htm.*|' +
'http:\\/\\/trooptube\\.tv\\/videos\\/.*|' +
'http:\\/\\/www\\.trooptube\\.tv\\/videos\\/.*|' +
'http:\\/\\/sciencestage\\.com\\/v\\/.*\\.html|' +
'http:\\/\\/sciencestage\\.com\\/a\\/.*\\.html|' +
'http:\\/\\/www\\.sciencestage\\.com\\/v\\/.*\\.html|' +
'http:\\/\\/www\\.sciencestage\\.com\\/a\\/.*\\.html|' +
'http:\\/\\/www\\.godtube\\.com\\/featured\\/video\\/.*|' +
'http:\\/\\/godtube\\.com\\/featured\\/video\\/.*|' +
'http:\\/\\/www\\.godtube\\.com\\/watch\\/.*|' +
'http:\\/\\/godtube\\.com\\/watch\\/.*|' +
'http:\\/\\/www\\.tangle\\.com\\/view_video.*|' +
'http:\\/\\/mediamatters\\.org\\/mmtv\\/.*|' +
'http:\\/\\/www\\.clikthrough\\.com\\/theater\\/video\\/.*|' +
'http:\\/\\/gist\\.github\\.com\\/.*|' +
'http:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
'http:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
'http:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
'http:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
'https:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
'https:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
'https:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
'https:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
'http:\\/\\/www\\.crunchbase\\.com\\/.*\\/.*|' +
'http:\\/\\/crunchbase\\.com\\/.*\\/.*|' +
'http:\\/\\/www\\.slideshare\\.net\\/.*\\/.*|' +
'http:\\/\\/www\\.slideshare\\.net\\/mobile\\/.*\\/.*|' +
'http:\\/\\/slidesha\\.re\\/.*|' +
'http:\\/\\/scribd\\.com\\/doc\\/.*|' +
'http:\\/\\/www\\.scribd\\.com\\/doc\\/.*|' +
'http:\\/\\/scribd\\.com\\/mobile\\/documents\\/.*|' +
'http:\\/\\/www\\.scribd\\.com\\/mobile\\/documents\\/.*|' +
'http:\\/\\/screenr\\.com\\/.*|' +
'http:\\/\\/polldaddy\\.com\\/community\\/poll\\/.*|' +
'http:\\/\\/polldaddy\\.com\\/poll\\/.*|' +
'http:\\/\\/answers\\.polldaddy\\.com\\/poll\\/.*|' +
'http:\\/\\/www\\.5min\\.com\\/Video\\/.*|' +
'http:\\/\\/www\\.howcast\\.com\\/videos\\/.*|' +
'http:\\/\\/www\\.screencast\\.com\\/.*\\/media\\/.*|' +
'http:\\/\\/screencast\\.com\\/.*\\/media\\/.*|' +
'http:\\/\\/www\\.screencast\\.com\\/t\\/.*|' +
'http:\\/\\/screencast\\.com\\/t\\/.*|' +
'http:\\/\\/issuu\\.com\\/.*\\/docs\\/.*|' +
'http:\\/\\/www\\.kickstarter\\.com\\/projects\\/.*\\/.*|' +
'http:\\/\\/www\\.scrapblog\\.com\\/viewer\\/viewer\\.aspx.*|' +
'http:\\/\\/ping\\.fm\\/p\\/.*|' +
'http:\\/\\/chart\\.ly\\/symbols\\/.*|' +
'http:\\/\\/chart\\.ly\\/.*|' +
'http:\\/\\/maps\\.google\\.com\\/maps\\?.*|' +
'http:\\/\\/maps\\.google\\.com\\/\\?.*|' +
'http:\\/\\/maps\\.google\\.com\\/maps\\/ms\\?.*|' +
'http:\\/\\/.*\\.craigslist\\.org\\/.*\\/.*|' +
'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/show\\.dml\\?id=.*|' +
'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/showpic\\.dml\\?album=.*&picture=.*|' +
'http:\\/\\/tumblr\\.com\\/.*|' +
'http:\\/\\/.*\\.tumblr\\.com\\/post\\/.*|' +
'http:\\/\\/www\\.polleverywhere\\.com\\/polls\\/.*|' +
'http:\\/\\/www\\.polleverywhere\\.com\\/multiple_choice_polls\\/.*|' +
'http:\\/\\/www\\.polleverywhere\\.com\\/free_text_polls\\/.*|' +
'http:\\/\\/www\\.quantcast\\.com\\/wd:.*|' +
'http:\\/\\/www\\.quantcast\\.com\\/.*|' +
'http:\\/\\/siteanalytics\\.compete\\.com\\/.*|' +
'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/e\\/.*|' +
'http:\\/\\/statsheet\\.com\\/.*\\/teams\\/.*\\/.*|' +
'http:\\/\\/statsheet\\.com\\/tools\\/chartlets\\?chart=.*|' +
'http:\\/\\/.*\\.status\\.net\\/notice\\/.*|' +
'http:\\/\\/identi\\.ca\\/notice\\/.*|' +
'http:\\/\\/brainbird\\.net\\/notice\\/.*|' +
'http:\\/\\/shitmydadsays\\.com\\/notice\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/Profile\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/l\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/Groups\\/Overview\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Info\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Install\\/.*|' +
'http:\\/\\/www\\.studivz\\.net\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/Profile\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/l\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/Groups\\/Overview\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Info\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Install\\/.*|' +
'http:\\/\\/www\\.meinvz\\.net\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/Profile\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/l\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/Groups\\/Overview\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Info\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Install\\/.*|' +
'http:\\/\\/www\\.schuelervz\\.net\\/.*|' +
'http:\\/\\/myloc\\.me\\/.*|' +
'http:\\/\\/pastebin\\.com\\/.*|' +
'http:\\/\\/pastie\\.org\\/.*|' +
'http:\\/\\/www\\.pastie\\.org\\/.*|' +
'http:\\/\\/redux\\.com\\/stream\\/item\\/.*\\/.*|' +
'http:\\/\\/redux\\.com\\/f\\/.*\\/.*|' +
'http:\\/\\/www\\.redux\\.com\\/stream\\/item\\/.*\\/.*|' +
'http:\\/\\/www\\.redux\\.com\\/f\\/.*\\/.*|' +
'http:\\/\\/cl\\.ly\\/.*|' +
'http:\\/\\/cl\\.ly\\/.*\\/content|' +
'http:\\/\\/speakerdeck\\.com\\/u\\/.*\\/p\\/.*|' +
'http:\\/\\/www\\.kiva\\.org\\/lend\\/.*|' +
'http:\\/\\/www\\.timetoast\\.com\\/timelines\\/.*|' +
'http:\\/\\/storify\\.com\\/.*\\/.*|' +
'http:\\/\\/.*meetup\\.com\\/.*|' +
'http:\\/\\/meetu\\.ps\\/.*|' +
'http:\\/\\/www\\.dailymile\\.com\\/people\\/.*\\/entries\\/.*|' +
'http:\\/\\/.*\\.kinomap\\.com\\/.*|' +
'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/content\\/.*|' +
'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/media\\/.*|' +
'http:\\/\\/prezi\\.com\\/.*\\/.*|' +
'http:\\/\\/.*\\.uservoice\\.com\\/.*\\/suggestions\\/.*|' +
'http:\\/\\/formspring\\.me\\/.*|' +
'http:\\/\\/www\\.formspring\\.me\\/.*|' +
'http:\\/\\/formspring\\.me\\/.*\\/q\\/.*|' +
'http:\\/\\/www\\.formspring\\.me\\/.*\\/q\\/.*|' +
'http:\\/\\/twitlonger\\.com\\/show\\/.*|' +
'http:\\/\\/www\\.twitlonger\\.com\\/show\\/.*|' +
'http:\\/\\/tl\\.gd\\/.*|' +
'http:\\/\\/www\\.qwiki\\.com\\/q\\/.*|' +
'http:\\/\\/crocodoc\\.com\\/.*|' +
'http:\\/\\/.*\\.crocodoc\\.com\\/.*|' +
'https:\\/\\/crocodoc\\.com\\/.*|' +
'https:\\/\\/.*\\.crocodoc\\.com\\/.*|' +
'http:\\/\\/www\\.wikipedia\\.org\\/wiki\\/.*|' +
'http:\\/\\/www\\.wikimedia\\.org\\/wiki\\/File.*|' +
'https:\\/\\/urtak\\.com\\/u\\/.*|' +
'https:\\/\\/urtak\\.com\\/clr\\/.*|' +
'http:\\/\\/graphicly\\.com\\/.*\\/.*\\/.*|' +
'http:\\/\\/.*yfrog\\..*\\/.*|' +
'http:\\/\\/www\\.flickr\\.com\\/photos\\/.*|' +
'http:\\/\\/flic\\.kr\\/.*|' +
'http:\\/\\/twitpic\\.com\\/.*|' +
'http:\\/\\/www\\.twitpic\\.com\\/.*|' +
'http:\\/\\/twitpic\\.com\\/photos\\/.*|' +
'http:\\/\\/www\\.twitpic\\.com\\/photos\\/.*|' +
'http:\\/\\/.*imgur\\.com\\/.*|' +
'http:\\/\\/.*\\.posterous\\.com\\/.*|' +
'http:\\/\\/post\\.ly\\/.*|' +
'http:\\/\\/twitgoo\\.com\\/.*|' +
'http:\\/\\/i.*\\.photobucket\\.com\\/albums\\/.*|' +
'http:\\/\\/s.*\\.photobucket\\.com\\/albums\\/.*|' +
'http:\\/\\/phodroid\\.com\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.mobypicture\\.com\\/user\\/.*\\/view\\/.*|' +
'http:\\/\\/moby\\.to\\/.*|' +
'http:\\/\\/xkcd\\.com\\/.*|' +
'http:\\/\\/www\\.xkcd\\.com\\/.*|' +
'http:\\/\\/imgs\\.xkcd\\.com\\/.*|' +
'http:\\/\\/www\\.asofterworld\\.com\\/index\\.php\\?id=.*|' +
'http:\\/\\/www\\.asofterworld\\.com\\/.*\\.jpg|' +
'http:\\/\\/asofterworld\\.com\\/.*\\.jpg|' +
'http:\\/\\/www\\.qwantz\\.com\\/index\\.php\\?comic=.*|' +
'http:\\/\\/23hq\\.com\\/.*\\/photo\\/.*|' +
'http:\\/\\/www\\.23hq\\.com\\/.*\\/photo\\/.*|' +
'http:\\/\\/.*dribbble\\.com\\/shots\\/.*|' +
'http:\\/\\/drbl\\.in\\/.*|' +
'http:\\/\\/.*\\.smugmug\\.com\\/.*|' +
'http:\\/\\/.*\\.smugmug\\.com\\/.*\\#.*|' +
'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*|' +
'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*\\/sizes\\/.*|' +
'http:\\/\\/emberapp\\.com\\/.*\\/collections\\/.*\\/.*|' +
'http:\\/\\/emberapp\\.com\\/.*\\/categories\\/.*\\/.*\\/.*|' +
'http:\\/\\/embr\\.it\\/.*|' +
'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*\\#.*|' +
'http:\\/\\/picasaweb\\.google\\.com.*\\/lh\\/photo\\/.*|' +
'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*|' +
'http:\\/\\/dailybooth\\.com\\/.*\\/.*|' +
'http:\\/\\/brizzly\\.com\\/pic\\/.*|' +
'http:\\/\\/pics\\.brizzly\\.com\\/.*\\.jpg|' +
'http:\\/\\/img\\.ly\\/.*|' +
'http:\\/\\/www\\.tinypic\\.com\\/view\\.php.*|' +
'http:\\/\\/tinypic\\.com\\/view\\.php.*|' +
'http:\\/\\/www\\.tinypic\\.com\\/player\\.php.*|' +
'http:\\/\\/tinypic\\.com\\/player\\.php.*|' +
'http:\\/\\/www\\.tinypic\\.com\\/r\\/.*\\/.*|' +
'http:\\/\\/tinypic\\.com\\/r\\/.*\\/.*|' +
'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.jpg|' +
'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.png|' +
'http:\\/\\/meadd\\.com\\/.*\\/.*|' +
'http:\\/\\/meadd\\.com\\/.*|' +
'http:\\/\\/.*\\.deviantart\\.com\\/art\\/.*|' +
'http:\\/\\/.*\\.deviantart\\.com\\/gallery\\/.*|' +
'http:\\/\\/.*\\.deviantart\\.com\\/\\#\\/.*|' +
'http:\\/\\/fav\\.me\\/.*|' +
'http:\\/\\/.*\\.deviantart\\.com|' +
'http:\\/\\/.*\\.deviantart\\.com\\/gallery|' +
'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.jpg|' +
'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.gif|' +
'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.jpg|' +
'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.gif|' +
'http:\\/\\/www\\.fotopedia\\.com\\/.*\\/.*|' +
'http:\\/\\/fotopedia\\.com\\/.*\\/.*|' +
'http:\\/\\/photozou\\.jp\\/photo\\/show\\/.*\\/.*|' +
'http:\\/\\/photozou\\.jp\\/photo\\/photo_only\\/.*\\/.*|' +
'http:\\/\\/instagr\\.am\\/p\\/.*|' +
'http:\\/\\/instagram\\.com\\/p\\/.*|' +
'http:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' +
'http:\\/\\/img\\.skitch\\.com\\/.*|' +
'https:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' +
'https:\\/\\/img\\.skitch\\.com\\/.*|' +
'http:\\/\\/share\\.ovi\\.com\\/media\\/.*\\/.*|' +
'http:\\/\\/www\\.questionablecontent\\.net\\/|' +
'http:\\/\\/questionablecontent\\.net\\/|' +
'http:\\/\\/www\\.questionablecontent\\.net\\/view\\.php.*|' +
'http:\\/\\/questionablecontent\\.net\\/view\\.php.*|' +
'http:\\/\\/questionablecontent\\.net\\/comics\\/.*\\.png|' +
'http:\\/\\/www\\.questionablecontent\\.net\\/comics\\/.*\\.png|' +
'http:\\/\\/picplz\\.com\\/.*|' +
'http:\\/\\/twitrpix\\.com\\/.*|' +
'http:\\/\\/.*\\.twitrpix\\.com\\/.*|' +
'http:\\/\\/www\\.someecards\\.com\\/.*\\/.*|' +
'http:\\/\\/someecards\\.com\\/.*\\/.*|' +
'http:\\/\\/some\\.ly\\/.*|' +
'http:\\/\\/www\\.some\\.ly\\/.*|' +
'http:\\/\\/pikchur\\.com\\/.*|' +
'http:\\/\\/achewood\\.com\\/.*|' +
'http:\\/\\/www\\.achewood\\.com\\/.*|' +
'http:\\/\\/achewood\\.com\\/index\\.php.*|' +
'http:\\/\\/www\\.achewood\\.com\\/index\\.php.*|' +
'http:\\/\\/www\\.whosay\\.com\\/content\\/.*|' +
'http:\\/\\/www\\.whosay\\.com\\/photos\\/.*|' +
'http:\\/\\/www\\.whosay\\.com\\/videos\\/.*|' +
'http:\\/\\/say\\.ly\\/.*|' +
'http:\\/\\/ow\\.ly\\/i\\/.*|' +
'http:\\/\\/color\\.com\\/s\\/.*|' +
'http:\\/\\/bnter\\.com\\/convo\\/.*|' +
'http:\\/\\/mlkshk\\.com\\/p\\/.*|' +
'http:\\/\\/lockerz\\.com\\/s\\/.*|' +
'http:\\/\\/lightbox\\.com\\/.*|' +
'http:\\/\\/www\\.lightbox\\.com\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/.*\\/dp\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/dp\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/o\\/ASIN\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/gp\\/offer-listing\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/.*\\/ASIN\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/images\\/.*|' +
'http:\\/\\/.*amazon\\..*\\/gp\\/aw\\/d\\/.*|' +
'http:\\/\\/www\\.amzn\\.com\\/.*|' +
'http:\\/\\/amzn\\.com\\/.*|' +
'http:\\/\\/www\\.shopstyle\\.com\\/browse.*|' +
'http:\\/\\/www\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' +
'http:\\/\\/api\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' +
'http:\\/\\/www\\.shopstyle\\.com\\/action\\/viewLook.*|' +
'http:\\/\\/itunes\\.apple\\.com\\/.*|' +
'https:\\/\\/itunes\\.apple\\.com\\/.*|' +
'http:\\/\\/soundcloud\\.com\\/.*|' +
'http:\\/\\/soundcloud\\.com\\/.*\\/.*|' +
'http:\\/\\/soundcloud\\.com\\/.*\\/sets\\/.*|' +
'http:\\/\\/soundcloud\\.com\\/groups\\/.*|' +
'http:\\/\\/snd\\.sc\\/.*|' +
'http:\\/\\/www\\.last\\.fm\\/music\\/.*|' +
'http:\\/\\/www\\.last\\.fm\\/music\\/+videos\\/.*|' +
'http:\\/\\/www\\.last\\.fm\\/music\\/+images\\/.*|' +
'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/_\\/.*|' +
'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/.*|' +
'http:\\/\\/www\\.mixcloud\\.com\\/.*\\/.*\\/|' +
'http:\\/\\/www\\.radionomy\\.com\\/.*\\/radio\\/.*|' +
'http:\\/\\/radionomy\\.com\\/.*\\/radio\\/.*|' +
'http:\\/\\/www\\.hark\\.com\\/clips\\/.*|' +
'http:\\/\\/www\\.rdio\\.com\\/\\#\\/artist\\/.*\\/album\\/.*|' +
'http:\\/\\/www\\.rdio\\.com\\/artist\\/.*\\/album\\/.*|' +
'http:\\/\\/www\\.zero-inch\\.com\\/.*|' +
'http:\\/\\/.*\\.bandcamp\\.com\\/|' +
'http:\\/\\/.*\\.bandcamp\\.com\\/track\\/.*|' +
'http:\\/\\/.*\\.bandcamp\\.com\\/album\\/.*|' +
'http:\\/\\/freemusicarchive\\.org\\/music\\/.*|' +
'http:\\/\\/www\\.freemusicarchive\\.org\\/music\\/.*|' +
'http:\\/\\/freemusicarchive\\.org\\/curator\\/.*|' +
'http:\\/\\/www\\.freemusicarchive\\.org\\/curator\\/.*|' +
'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.npr\\.org\\/templates\\/story\\/story\\.php.*|' +
'http:\\/\\/huffduffer\\.com\\/.*\\/.*|' +
'http:\\/\\/www\\.audioboo\\.fm\\/boos\\/.*|' +
'http:\\/\\/audioboo\\.fm\\/boos\\/.*|' +
'http:\\/\\/boo\\.fm\\/b.*|' +
'http:\\/\\/www\\.xiami\\.com\\/song\\/.*|' +
'http:\\/\\/xiami\\.com\\/song\\/.*|' +
'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' +
'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' +
'http:\\/\\/grooveshark\\.com\\/.*|' +
'http:\\/\\/radioreddit\\.com\\/songs.*|' +
'http:\\/\\/www\\.radioreddit\\.com\\/songs.*|' +
'http:\\/\\/radioreddit\\.com\\/\\?q=songs.*|' +
'http:\\/\\/www\\.radioreddit\\.com\\/\\?q=songs.*|' +
'http:\\/\\/www\\.gogoyoko\\.com\\/song\\/.*|' +
'http:\\/\\/espn\\.go\\.com\\/video\\/clip.*|' +
'http:\\/\\/espn\\.go\\.com\\/.*\\/story.*|' +
'http:\\/\\/abcnews\\.com\\/.*\\/video\\/.*|' +
'http:\\/\\/abcnews\\.com\\/video\\/playerIndex.*|' +
'http:\\/\\/washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.boston\\.com\\/video.*|' +
'http:\\/\\/boston\\.com\\/video.*|' +
'http:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' +
'http:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' +
'http:\\/\\/www\\.facebook\\.com\\/v\\/.*|' +
'https:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' +
'https:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' +
'https:\\/\\/www\\.facebook\\.com\\/v\\/.*|' +
'http:\\/\\/cnbc\\.com\\/id\\/.*\\?.*video.*|' +
'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\?.*video.*|' +
'http:\\/\\/cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' +
'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' +
'http:\\/\\/cbsnews\\.com\\/video\\/watch\\/.*|' +
'http:\\/\\/www\\.google\\.com\\/buzz\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.google\\.com\\/buzz\\/.*|' +
'http:\\/\\/www\\.google\\.com\\/profiles\\/.*|' +
'http:\\/\\/google\\.com\\/buzz\\/.*\\/.*\\/.*|' +
'http:\\/\\/google\\.com\\/buzz\\/.*|' +
'http:\\/\\/google\\.com\\/profiles\\/.*|' +
'http:\\/\\/www\\.cnn\\.com\\/video\\/.*|' +
'http:\\/\\/edition\\.cnn\\.com\\/video\\/.*|' +
'http:\\/\\/money\\.cnn\\.com\\/video\\/.*|' +
'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' +
'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' +
'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' +
'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' +
'http:\\/\\/www\\.globalpost\\.com\\/video\\/.*|' +
'http:\\/\\/www\\.globalpost\\.com\\/dispatch\\/.*|' +
'http:\\/\\/guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/www\\.guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
'http:\\/\\/bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' +
'http:\\/\\/www\\.bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' +
'http:\\/\\/video\\.nationalgeographic\\.com\\/.*\\/.*\\/.*\\.html|' +
'http:\\/\\/dsc\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/animal\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/health\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/investigation\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/military\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/planetgreen\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/science\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/tlc\\.discovery\\.com\\/videos\\/.*|' +
'http:\\/\\/video\\.forbes\\.com\\/fvn\\/.*|' +
'http:\\/\\/recordsetter\\.com\\/*\\/*\\/*'
, re.I
)
api_endpoint = 'http://api.embed.ly/1/oembed'
api_params = {'format':'json', 'maxwidth':600, 'key' : g.embedly_api_key }
class GenericScraper(MediaScraper):
    """Catch-all scraper registered under the wildcard domain '*'.

    It is not tied to any particular site; it exists so that media
    objects can be attached to links by hand, using the supplied
    content as the embed body.
    """
    domains = ['*']

    # default embed dimensions, used when the caller does not give any
    height = 480
    width = 640

    @classmethod
    def media_embed(cls, content, height = None, width = None, scrolling = False, **kw):
        """Wrap `content` in a MediaEmbed.

        A falsy `height` or `width` falls back to the class default.
        Extra keyword arguments are accepted and ignored.
        """
        embed_height = height if height else cls.height
        embed_width = width if width else cls.width
        return MediaEmbed(height = embed_height,
                          width = embed_width,
                          scrolling = scrolling,
                          content = content)
class DeepScraper(object):
    """Base class for scrapers that look inside generic pages.

    Subclasses dig through a page for embeds of some other type
    (for example a YouTube video embedded on a blog) and override
    find_media_object to return what they found.
    """

    def find_media_object(self, scraper):
        """Return a media object found via `scraper`; the base finds none."""
        return None
class YoutubeEmbedDeepScraper(DeepScraper):
    """Deep scraper that looks for a plain YouTube <embed> tag on a page.

    When the page contains an <embed> whose src matches youtube_url_re,
    the corresponding watch URL is passed to make_scraper and the
    resulting media object is returned, tagged with the page it was
    found on.
    """
    # group(1) is the embed URL up to and including the video id,
    # group(2) is the video id on its own
    youtube_url_re = re.compile(r'^(http://www\.youtube\.com/v/([_a-zA-Z0-9-]+)).*')

    def find_media_object(self, scraper):
        """Return a media object for a YouTube embed on the scraped page.

        `scraper` must expose `soup`, `url` and `download()`; download()
        is called when no soup is present yet. Returns None when the page
        could not be parsed, has no matching embed, or no media object
        could be built for the video. On success the media object gets a
        'deep' key holding the URL of the page that was scraped.
        """
        # try to find very simple youtube embeds
        if not scraper.soup:
            scraper.download()

        if not scraper.soup:
            return None

        # the matcher can be handed None for <embed> tags that have no
        # src attribute, so guard before running the regex
        movie_embed = scraper.soup.find('embed',
            attrs={'src': lambda x: x and self.youtube_url_re.match(x)})
        if not movie_embed:
            return None

        youtube_id = self.youtube_url_re.match(movie_embed['src']).group(2)
        youtube_url = 'http://www.youtube.com/watch?v=%s' % youtube_id
        log.debug('found youtube embed %s' % youtube_url)

        # same defensive checks as convert_old_media_objects: either step
        # may come back empty
        youtube_scraper = make_scraper(youtube_url)
        if not youtube_scraper:
            return None
        mo = youtube_scraper.media_object()
        if not mo:
            return None

        mo['deep'] = scraper.url
        return mo
# scrapers maps each domain to the list of scraper classes that handle
# it, in the order the classes are listed below
scrapers = {}
for scraper in (EmbedlyOEmbed,
                YoutubeScraper,
                MetacafeScraper,
                GootubeScraper,
                VimeoScraper,
                BreakScraper,
                TheOnionScraper,
                CollegeHumorScraper,
                FunnyOrDieScraper,
                ComedyCentralScraper,
                ColbertNationScraper,
                TheDailyShowScraper,
                TedScraper,
                LiveLeakScraper,
                DailyMotionScraper,
                RevverScraper,
                EscapistScraper,
                JustintvScraper,
                SoundcloudScraper,
                CraigslistScraper,
                GenericScraper,
                ):
    for domain in scraper.domains:
        if domain not in scrapers:
            scrapers[domain] = []
        scrapers[domain].append(scraper)

# deep scraper classes, kept separately from the per-domain registry
deepscrapers = [YoutubeEmbedDeepScraper]
def get_media_embed(media_object):
    """Build an embed for `media_object`, or return None if nothing fits.

    Every scraper registered under media_object['type'] is tried in
    order and the first truthy result wins. If none produces one and
    the media object carries a 'content' key, GenericScraper is used
    as the fallback.
    """
    candidates = scrapers.get(media_object['type'], [])
    for scraper_cls in candidates:
        embed = scraper_cls.media_embed(**media_object)
        if embed:
            return embed

    if 'content' not in media_object:
        return None
    return GenericScraper.media_embed(**media_object)
def convert_old_media_objects():
q = Link._query(Link.c.media_object is not None,
Link.c._date > whenever,
data = True)
for link in utils.fetch_things2(q):
if not getattr(link, 'media_object', None):
continue
if 'youtube' in link.media_object:
# we can rewrite this one without scraping
video_id = YoutubeScraper.video_id_rx.match(link.url)
link.media_object = dict(type='youtube.com',
video_id = video_id.group(1))
elif ('video.google.com' in link.media_object
or 'metacafe' in link.media_object):
scraper = make_scraper(link.url)
if not scraper:
continue
mo = scraper.media_object()
if not mo:
continue
link.media_object = mo
else:
print "skipping %s because it confuses me" % link._fullname
continue
link._commit()
test_urls = [
'http://www.facebook.com/pages/Rick-Astley/5807213510?sid=c99aaf3888171e73668a38e0749ae12d', # regular thumbnail finder
'http://www.flickr.com/photos/septuagesima/317819584/', # thumbnail with image_src
#'http://www.youtube.com/watch?v=Yu_moia-oVI',
'http://www.metacafe.com/watch/sy-1473689248/rick_astley_never_gonna_give_you_up_official_music_video/',
'http://video.google.com/videoplay?docid=5908758151704698048',
#'http://vimeo.com/4495451',
'http://www.break.com/usercontent/2008/11/Macy-s-Thankgiving-Day-Parade-Rick-Roll-611965.html',
'http://www.theonion.com/content/video/sony_releases_new_stupid_piece_of',
'http://www.collegehumor.com/video:1823712',
'http://www.funnyordie.com/videos/7f2a184755/macys-thanksgiving-day-parade-gets-rick-rolled-from-that-happened',
'http://www.comedycentral.com/videos/index.jhtml?videoId=178342&title=ultimate-fighting-vs.-bloggers',
# old style
'http://www.thedailyshow.com/video/index.jhtml?videoId=175244&title=Photoshop-of-Horrors',
# new style
'http://www.thedailyshow.com/watch/wed-july-22-2009/the-born-identity',
'http://www.colbertnation.com/the-colbert-report-videos/63549/may-01-2006/sign-off---spam',
'http://www.liveleak.com/view?i=e09_1207983531',
'http://www.dailymotion.com/relevance/search/rick+roll/video/x5l8e6_rickroll_fun',
'http://revver.com/video/1199591/rick-rolld-at-work/',
'http://www.escapistmagazine.com/videos/view/zero-punctuation/10-The-Orange-Box',
'http://www.escapistmagazine.com/videos/view/unskippable/736-Lost-Odyssey',
# justin.tv has two media types that we care about, streams, which
# we can scrape, and clips, which we can't
'http://www.justin.tv/help', # stream
'http://www.justin.tv/clip/c07a333f94e5716b', # clip, which we can't currently scrape, and shouldn't try
'http://soundcloud.com/kalhonaaho01/never-gonna-stand-you-up-rick-astley-vs-ludacris-album-version',
'http://www.craigslist.org/about/best/sea/240705630.html',
'http://listen.grooveshark.com/#/song/Never_Gonna_Give_You_Up/12616328',
'http://tinysong.com/2WOJ', # also Grooveshark
'http://www.slideshare.net/doina/happy-easter-from-holland-slideshare',
'http://www.slideshare.net/stinson/easter-1284190',
'http://www.slideshare.net/angelspascual/easter-events',
'http://www.slideshare.net/sirrods/happy-easter-3626014',
'http://www.slideshare.net/sirrods/happy-easter-wide-screen',
'http://www.slideshare.net/carmen_serbanescu/easter-holiday',
'http://www.slideshare.net/Lithuaniabook/easter-1255880',
'http://www.slideshare.net/hues/easter-plants',
'http://www.slideshare.net/Gospelman/passover-week',
'http://www.slideshare.net/angelspascual/easter-around-the-world-1327542',
'http://www.scribd.com/doc/13994900/Easter',
'http://www.scribd.com/doc/27425714/Celebrating-Easter-ideas-for-adults-and-children',
'http://www.scribd.com/doc/28010101/Easter-Foods-No-Name',
'http://www.scribd.com/doc/28452730/Easter-Cards',
'http://www.scribd.com/doc/19026714/The-Easter-Season',
'http://www.scribd.com/doc/29183659/History-of-Easter',
'http://www.scribd.com/doc/15632842/The-Last-Easter',
'http://www.scribd.com/doc/28741860/The-Plain-Truth-About-Easter',
'http://www.scribd.com/doc/23616250/4-27-08-ITS-EASTER-AGAIN-ORTHODOX-EASTER-by-vanderKOK',
'http://screenr.com/t9d',
'http://screenr.com/yLS',
'http://screenr.com/gzS',
'http://screenr.com/IwU',
'http://screenr.com/FM7',
'http://screenr.com/Ejg',
'http://screenr.com/u4h',
'http://screenr.com/QiN',
'http://screenr.com/zts',
'http://www.5min.com/Video/How-to-Decorate-Easter-Eggs-with-Decoupage-142076462',
'http://www.5min.com/Video/How-to-Color-Easter-Eggs-Dye-142076281',
'http://www.5min.com/Video/How-to-Make-an-Easter-Egg-Diorama-142076482',
'http://www.5min.com/Video/How-to-Make-Sequined-Easter-Eggs-142076512',
'http://www.5min.com/Video/How-to-Decorate-Wooden-Easter-Eggs-142076558',
'http://www.5min.com/Video/How-to-Blow-out-an-Easter-Egg-142076367',
'http://www.5min.com/Video/Learn-About-Easter-38363995',
'http://www.howcast.com/videos/368909-Easter-Egg-Dying-How-To-Make-Ukrainian-Easter-Eggs',
'http://www.howcast.com/videos/368911-Easter-Egg-Dying-How-To-Color-Easter-Eggs-With-Food-Dyes',
'http://www.howcast.com/videos/368913-Easter-Egg-Dying-How-To-Make-Homemade-Easter-Egg-Dye',
'http://www.howcast.com/videos/220110-The-Meaning-Of-Easter',
'http://my.opera.com/nirvanka/albums/show.dml?id=519866',
'http://img402.yfrog.com/i/mfe.jpg/',
'http://img20.yfrog.com/i/dy6.jpg/',
'http://img145.yfrog.com/i/4mu.mp4/',
'http://img15.yfrog.com/i/mygreatmovie.mp4/',
'http://img159.yfrog.com/i/500x5000401.jpg/',
'http://tweetphoto.com/14784358',
'http://tweetphoto.com/16044847',
'http://tweetphoto.com/16718883',
'http://tweetphoto.com/16451148',
'http://tweetphoto.com/16133984',
'http://tweetphoto.com/8069529',
'http://tweetphoto.com/16207556',
'http://tweetphoto.com/7448361',
'http://tweetphoto.com/16069325',
'http://tweetphoto.com/4791033',
'http://www.flickr.com/photos/10349896@N08/4490293418/',
'http://www.flickr.com/photos/mneylon/4483279051/',
'http://www.flickr.com/photos/xstartxtodayx/4488996521/',
'http://www.flickr.com/photos/mommyknows/4485313917/',
'http://www.flickr.com/photos/29988430@N06/4487127638/',
'http://www.flickr.com/photos/excomedia/4484159563/',
'http://www.flickr.com/photos/sunnybrook100/4471526636/',
'http://www.flickr.com/photos/jaimewalsh/4489497178/',
'http://www.flickr.com/photos/29988430@N06/4486475549/',
'http://www.flickr.com/photos/22695183@N08/4488681694/',
'http://twitpic.com/1cnsf6',
'http://twitpic.com/1cgtti',
'http://twitpic.com/1coc0n',
'http://twitpic.com/1cm8us',
'http://twitpic.com/1cgks4',
'http://imgur.com/6pLoN',
'http://onegoodpenguin.posterous.com/golden-tee-live-2010-easter-egg',
'http://adland.posterous.com/?tag=royaleastershowauckland',
'http://apartmentliving.posterous.com/biggest-easter-egg-hunts-in-the-dc-area',
'http://twitgoo.com/1as',
'http://twitgoo.com/1p94',
'http://twitgoo.com/4kg2',
'http://twitgoo.com/6c9',
'http://twitgoo.com/1w5',
'http://twitgoo.com/6mu',
'http://twitgoo.com/1w3',
'http://twitgoo.com/1om',
'http://twitgoo.com/1mh',
'http://www.qwantz.com/index.php?comic=1686',
'http://www.qwantz.com/index.php?comic=773',
'http://www.qwantz.com/index.php?comic=1018',
'http://www.qwantz.com/index.php?comic=1019',
'http://www.23hq.com/mhg/photo/5498347',
'http://www.23hq.com/Greetingdesignstudio/photo/5464607',
'http://www.23hq.com/Greetingdesignstudio/photo/5464590',
'http://www.23hq.com/Greetingdesignstudio/photo/5464605',
'http://www.23hq.com/Greetingdesignstudio/photo/5464604',
'http://www.23hq.com/dvilles2/photo/5443192',
'http://www.23hq.com/Greetingdesignstudio/photo/5464606',
'http://www.youtube.com/watch?v=gghKdx558Qg',
'http://www.youtube.com/watch?v=yPid9BLQQcg',
'http://www.youtube.com/watch?v=uEo2vboUYUk',
'http://www.youtube.com/watch?v=geUhtoHbLu4',
'http://www.youtube.com/watch?v=Zk7dDekYej0',
'http://www.youtube.com/watch?v=Q3tgMosx_tI',
'http://www.youtube.com/watch?v=s9P8_vgmLfs',
'http://www.youtube.com/watch?v=1cmtN1meMmk',
'http://www.youtube.com/watch?v=AVzj-U5Ihm0',
'http://www.veoh.com/collection/easycookvideos/watch/v366931kcdgj7Hd',
'http://www.veoh.com/collection/easycookvideos/watch/v366991zjpANrqc',
'http://www.veoh.com/browse/videos/category/educational/watch/v7054535EZGFJqyX',
'http://www.veoh.com/browse/videos/category/lifestyle/watch/v18155013XBBtnYwq',
'http://www.justin.tv/easter7presents',
'http://www.justin.tv/easterfraud',
'http://www.justin.tv/cccog27909',
'http://www.justin.tv/clip/6e8c18f7050',
'http://www.justin.tv/venom24',
'http://qik.com/video/1622287',
'http://qik.com/video/1503735',
'http://qik.com/video/40504',
'http://qik.com/video/1445763',
'http://qik.com/video/743285',
'http://qik.com/video/1445299',
'http://qik.com/video/1443200',
'http://qik.com/video/1445889',
'http://qik.com/video/174242',
'http://qik.com/video/1444897',
'http://revision3.com/hak5/DualCore',
'http://revision3.com/popsiren/charm',
'http://revision3.com/tekzilla/eyefinity',
'http://revision3.com/diggnation/2005-10-06',
'http://revision3.com/hak5/netcat-virtualization-wordpress/',
'http://revision3.com/infected/forsaken',
'http://revision3.com/hak5/purepwnage',
'http://revision3.com/tekzilla/wowheadset',
'http://www.dailymotion.com/video/xcstzd_greek-wallets-tighten-during-easter_news',
'http://www.dailymotion.com/video/xcso4y_exclusive-easter-eggs-easter-basket_lifestyle',
'http://www.dailymotion.com/video/x2sgkt_evil-easter-bunny',
'http://www.dailymotion.com/video/xco7oc_invitation-to-2010-easter-services_news',
'http://www.dailymotion.com/video/xcss6b_big-cat-easter_animals',
'http://www.dailymotion.com/video/xcszw1_easter-bunny-visits-buenos-aires-zo_news',
'http://www.dailymotion.com/video/xcsfvs_forecasters-warn-of-easter-misery_news',
'http://www.collegehumor.com/video:1682246',
'http://www.twitvid.com/D9997',
'http://www.twitvid.com/902B9',
'http://www.twitvid.com/C33F8',
'http://www.twitvid.com/63F73',
'http://www.twitvid.com/BC0BA',
'http://www.twitvid.com/1C33C',
'http://www.twitvid.com/8A8E2',
'http://www.twitvid.com/51035',
'http://www.twitvid.com/5C733',
'http://www.break.com/game-trailers/game/just-cause-2/just-cause-2-lost-easter-egg?res=1',
'http://www.break.com/usercontent/2010/3/10/easter-holiday-2009-slideshow-1775624',
'http://www.break.com/index/a-very-sexy-easter-video.html',
'http://www.break.com/usercontent/2010/3/11/this-video-features-gizzi-erskine-making-easter-cookies-1776089',
'http://www.break.com/usercontent/2007/4/4/happy-easter-265717',
'http://www.break.com/usercontent/2007/4/17/extreme-easter-egg-hunting-276064',
'http://www.break.com/usercontent/2006/11/18/the-evil-easter-bunny-184789',
'http://www.break.com/usercontent/2006/4/16/hoppy-easter-kitty-91040',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104063637',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004674',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103928002',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103999188',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103920940',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103981831',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004673',
'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104046456',
'http://www.metacafe.com/watch/105023/the_easter_bunny/',
'http://www.metacafe.com/watch/4376131/easter_lay/',
'http://www.metacafe.com/watch/2245996/how_to_make_ukraine_easter_eggs/',
'http://www.metacafe.com/watch/4374339/easter_eggs/',
'http://www.metacafe.com/watch/2605860/filled_easter_baskets/',
'http://www.metacafe.com/watch/2372088/easter_eggs/',
'http://www.metacafe.com/watch/3043671/www_goodnews_ws_easter_island/',
'http://www.metacafe.com/watch/1652057/easter_eggs/',
'http://www.metacafe.com/watch/1173632/ultra_kawaii_easter_bunny_party/',
'http://celluloidremix.blip.tv/file/3378272/',
'http://blip.tv/file/449469',
'http://blip.tv/file/199776',
'http://blip.tv/file/766967',
'http://blip.tv/file/770127',
'http://blip.tv/file/854925',
'http://www.blip.tv/file/22695?filename=Uncle_dale-THEEASTERBUNNYHATESYOU395.flv',
'http://iofa.blip.tv/file/3412333/',
'http://blip.tv/file/190393',
'http://blip.tv/file/83152',
'http://video.google.com/videoplay?docid=-5427138374898988918&q=easter+bunny&pl=true',
'http://video.google.com/videoplay?docid=7785441737970480237',
'http://video.google.com/videoplay?docid=2320995867449957036',
'http://video.google.com/videoplay?docid=-2586684490991458032&q=peeps&pl=true',
'http://video.google.com/videoplay?docid=5621139047118918034',
'http://video.google.com/videoplay?docid=4232304376070958848',
'http://video.google.com/videoplay?docid=-6612726032157145299',
'http://video.google.com/videoplay?docid=4478549130377875994&hl=en',
'http://video.google.com/videoplay?docid=9169278170240080877',
'http://video.google.com/videoplay?docid=2551240967354893096',
'http://video.yahoo.com/watch/7268801/18963438',
'http://video.yahoo.com/watch/2224892/7014048',
'http://video.yahoo.com/watch/7244748/18886014',
'http://video.yahoo.com/watch/4656845/12448951',
'http://video.yahoo.com/watch/363942/2249254',
'http://video.yahoo.com/watch/2232968/7046348',
'http://video.yahoo.com/watch/4530253/12135472',
'http://video.yahoo.com/watch/2237137/7062908',
'http://video.yahoo.com/watch/952841/3706424',
'http://www.viddler.com/explore/BigAppleChannel/videos/113/',
'http://www.viddler.com/explore/cheezburger/videos/379/',
'http://www.viddler.com/explore/warnerbros/videos/350/',
'http://www.viddler.com/explore/tvcgroup/videos/169/',
'http://www.viddler.com/explore/thebrickshow/videos/12/',
'http://www.liveleak.com/view?i=e0b_1239827917',
'http://www.liveleak.com/view?i=715_1239490211',
'http://www.liveleak.com/view?i=d30_1206233786&p=1',
'http://www.liveleak.com/view?i=d91_1239548947',
'http://www.liveleak.com/view?i=f58_1190741182',
'http://www.liveleak.com/view?i=44e_1179885621&c=1',
'http://www.liveleak.com/view?i=451_1188059885',
'http://www.liveleak.com/view?i=3f5_1267456341&c=1',
'http://www.hulu.com/watch/67313/howcast-how-to-make-braided-easter-bread',
'http://www.hulu.com/watch/133583/access-hollywood-glees-matthew-morrison-on-touring-and-performing-for-president-obama',
'http://www.hulu.com/watch/66319/saturday-night-live-easter-album',
'http://www.hulu.com/watch/80229/explorer-end-of-easter-island',
'http://www.hulu.com/watch/139020/nbc-today-show-lamb-and-ham-create-easter-feast',
'http://www.hulu.com/watch/84272/rex-the-runt-easter-island',
'http://www.hulu.com/watch/132203/everyday-italian-easter-pie',
'http://www.hulu.com/watch/23349/nova-secrets-of-lost-empires-ii-easter-island',
'http://movieclips.com/watch/dirty_harry_1971/do_you_feel_lucky_punk/',
'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/',
'http://movieclips.com/watch/dumb__dumber_1994/the_toilet_doesnt_flush/',
'http://movieclips.com/watch/jaws_1975/youre_gonna_need_a_bigger_boat/',
'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/61.495/75.413',
'http://movieclips.com/watch/super_troopers_2001/the_cat_game/12.838/93.018',
'http://movieclips.com/watch/this_is_spinal_tap_1984/these_go_to_eleven/79.703/129.713',
'http://crackle.com/c/Originals/What_s_the_deal_with_Easter_candy_/2303243',
'http://crackle.com/c/How_To/Dryer_Lint_Easter_Bunny_Trailer_Park_Craft/2223902',
'http://crackle.com/c/How_To/Pagan_Origin_of_Easter_Easter_Egg_Rabbit_Playb_/2225124',
'http://crackle.com/c/Funny/Happy_Easter/2225363',
'http://crackle.com/c/Funny/Crazy_and_Hilarious_Easter_Egg_Hunt/2225737',
'http://crackle.com/c/How_To/Learn_About_Greek_Orthodox_Easter/2262294',
'http://crackle.com/c/How_To/How_to_Make_Ukraine_Easter_Eggs/2262274',
'http://crackle.com/c/How_To/Symbolism_Of_Ukrainian_Easter_Eggs/2262267',
'http://crackle.com/c/Funny/Easter_Retard/931976',
'http://www.fancast.com/tv/It-s-the-Easter-Beagle,-Charlie-Brown/74789/1078053475/Peanuts:-Specials:-It-s-the-Easter-Beagle,-Charlie-Brown/videos',
'http://www.fancast.com/movies/Easter-Parade/97802/687440525/Easter-Parade/videos',
'http://www.fancast.com/tv/Saturday-Night-Live/10009/1083396482/Easter-Album/videos',
'http://www.fancast.com/movies/The-Proposal/147176/1140660489/The-Proposal:-Easter-Egg-Hunt/videos',
'http://www.funnyordie.com/videos/f6883f54ae/the-unsettling-ritualistic-origin-of-the-easter-bunny',
'http://www.funnyordie.com/videos/3ccb03863e/easter-tail-keaster-bunny',
'http://www.funnyordie.com/videos/17b1d36ad0/easter-bunny-from-leatherfink',
'http://www.funnyordie.com/videos/0c55aa116d/easter-exposed-from-bryan-erwin',
'http://www.funnyordie.com/videos/040dac4eff/easter-eggs',
'http://vimeo.com/10446922',
'http://vimeo.com/10642542',
'http://www.vimeo.com/10664068',
'http://vimeo.com/819176',
'http://www.vimeo.com/10525353',
'http://vimeo.com/10429123',
'http://www.vimeo.com/10652053',
'http://vimeo.com/10572216',
'http://www.ted.com/talks/jared_diamond_on_why_societies_collapse.html',
'http://www.ted.com/talks/nathan_myhrvold_on_archeology_animal_photography_bbq.html',
'http://www.ted.com/talks/johnny_lee_demos_wii_remote_hacks.html',
'http://www.ted.com/talks/robert_ballard_on_exploring_the_oceans.html',
'http://www.omnisio.com/v/Z3QxbTUdjhG/wall-e-collection-of-videos',
'http://www.omnisio.com/v/3ND6LTvdjhG/php-tutorial-4-login-form-updated',
'http://www.thedailyshow.com/watch/thu-december-14-2000/intro---easter',
'http://www.thedailyshow.com/watch/tue-april-18-2006/headlines---easter-charade',
'http://www.thedailyshow.com/watch/tue-april-18-2006/egg-beaters',
'http://www.thedailyshow.com/watch/tue-april-18-2006/moment-of-zen---scuba-diver-hiding-easter-eggs',
'http://www.thedailyshow.com/watch/tue-april-7-2009/easter---passover-highlights',
'http://www.thedailyshow.com/watch/tue-february-29-2000/headlines---leap-impact',
'http://www.thedailyshow.com/watch/thu-march-1-2007/tomb-with-a-jew',
'http://www.thedailyshow.com/watch/mon-april-24-2000/the-meaning-of-passover',
'http://www.colbertnation.com/the-colbert-report-videos/268800/march-31-2010/easter-under-attack---peeps-display-update',
'http://www.colbertnation.com/the-colbert-report-videos/268797/march-31-2010/intro---03-31-10',
'http://www.colbertnation.com/full-episodes/wed-march-31-2010-craig-mullaney',
'http://www.colbertnation.com/the-colbert-report-videos/60902/march-28-2006/the-word---easter-under-attack---marketing',
'http://www.colbertnation.com/the-colbert-report-videos/83362/march-07-2007/easter-under-attack---bunny',
'http://www.colbertnation.com/the-colbert-report-videos/61404/april-06-2006/easter-under-attack---recalled-eggs?videoId=61404',
'http://www.colbertnation.com/the-colbert-report-videos/223957/april-06-2009/colbert-s-easter-parade',
'http://www.colbertnation.com/the-colbert-report-videos/181772/march-28-2006/intro---3-28-06',
'http://www.traileraddict.com/trailer/despicable-me/easter-greeting',
'http://www.traileraddict.com/trailer/easter-parade/trailer',
'http://www.traileraddict.com/clip/the-proposal/easter-egg-hunt',
'http://www.traileraddict.com/trailer/despicable-me/international-teaser-trailer',
'http://www.traileraddict.com/trailer/despicable-me/today-show-minions',
'http://revver.com/video/263817/happy-easter/',
'http://www.revver.com/video/1574939/easter-bunny-house/',
'http://revver.com/video/771140/easter-08/',
]
def submit_all():
from r2.models import Subreddit, Account, Link, NotFound
from r2.lib.media import set_media
from r2.lib.db import queries
sr = Subreddit._by_name('testmedia')
author = Account._by_name('testmedia')
links = []
for url in test_urls:
try:
# delete any existing version of the link
l = Link._by_url(url, sr)
print "Deleting %s" % l
l._deleted = True
l._commit()
except NotFound:
pass
l = Link._submit(url, url, author, sr, '0.0.0.0')
try:
set_media(l)
except Exception, e:
print e
queries.new_link(l)
links.append(l)
return links
def test_real(nlinks):
from r2.models import Link, desc
from r2.lib.utils import fetch_things2
counter = 0
q = Link._query(sort = desc("_date"))
print "<html><body><table border=\"1\">"
for l in fetch_things2(q):
if counter > nlinks:
break
if not l.is_self:
h = make_scraper(l.url)
mo = h.media_object()
print "scraper: %s" % mo
if mo:
print get_media_embed(mo).content
counter +=1
print "</table></body></html>"
def test_url(url):
import sys
from r2.lib.filters import websafe
sys.stderr.write("%s\n" % url)
print "<tr>"
h = make_scraper(url)
print "<td>"
print "<b>", websafe(url), "</b>"
print "<br />"
print websafe(repr(h))
img = h.largest_image_url()
if img:
print "<td><img src=\"%s\" /></td>" % img
else:
print "<td>(no image)</td>"
mo = h.media_object()
print "<td>"
if mo:
print get_media_embed(mo).content
else:
print "None"
print "</td>"
print "</tr>"
def test():
"""Take some example URLs and print out a nice pretty HTML table
of their extracted thubmnails and media objects"""
print "<html><body><table border=\"1\">"
for url in test_urls:
test_url(url)
print "</table></body></html>"