diff --git a/r2/r2/controllers/api.py b/r2/r2/controllers/api.py
index 38c41dd2e..7abed0dcb 100755
--- a/r2/r2/controllers/api.py
+++ b/r2/r2/controllers/api.py
@@ -66,7 +66,7 @@ from r2.lib import tracking, cssfilter, emailer
from r2.lib.subreddit_search import search_reddits
from r2.lib.log import log_text
from r2.lib.filters import safemarkdown
-from r2.lib.scraper import str_to_image
+from r2.lib.media import str_to_image
from r2.controllers.api_docs import api_doc, api_section
from r2.lib.search import SearchQuery
from r2.controllers.oauth2 import OAuth2ResourceController, require_oauth2_scope
diff --git a/r2/r2/controllers/mediaembed.py b/r2/r2/controllers/mediaembed.py
index 756259435..24ea5d35c 100644
--- a/r2/r2/controllers/mediaembed.py
+++ b/r2/r2/controllers/mediaembed.py
@@ -25,7 +25,7 @@ from pylons.controllers.util import abort
from r2.controllers.reddit_base import MinimalController
from r2.lib.pages import MediaEmbedBody
-from r2.lib.scraper import get_media_embed
+from r2.lib.media import get_media_embed
from r2.lib.validator import validate, VLink
@@ -39,11 +39,6 @@ class MediaembedController(MinimalController):
if not link or not link.media_object:
abort(404)
-
- if isinstance(link.media_object, basestring):
- # it's an old-style string
- content = link.media_object
-
elif isinstance(link.media_object, dict):
# otherwise it's the new style, which is a dict(type=type, **args)
media_embed = get_media_embed(link.media_object)
diff --git a/r2/r2/lib/jsontemplates.py b/r2/r2/lib/jsontemplates.py
index b63371fe7..190532118 100755
--- a/r2/r2/lib/jsontemplates.py
+++ b/r2/r2/lib/jsontemplates.py
@@ -414,7 +414,7 @@ class LinkJsonTemplate(ThingJsonTemplate):
)
def thing_attr(self, thing, attr):
- from r2.lib.scraper import get_media_embed
+ from r2.lib.media import get_media_embed
if attr == "media_embed":
if (thing.media_object and
not isinstance(thing.media_object, basestring)):
diff --git a/r2/r2/lib/media.py b/r2/r2/lib/media.py
index 5ee7d7102..296ae5398 100644
--- a/r2/r2/lib/media.py
+++ b/r2/r2/lib/media.py
@@ -20,36 +20,144 @@
# Inc. All Rights Reserved.
###############################################################################
-import subprocess
-
-from pylons import g, config
-
-from r2.models.link import Link
-from r2.lib import s3cp
-from r2.lib.utils import timeago, fetch_things2
-from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
-from r2.lib.db.operators import desc
-from r2.lib.scraper import make_scraper, str_to_image, image_to_str, prepare_image
-from r2.lib import amqp
-from r2.lib.nymph import optimize_png
-
-import Image
-
+import base64
+import collections
+import cStringIO
+import hashlib
+import json
+import math
+import mimetypes
import os
+import re
+import subprocess
import tempfile
import traceback
+import urllib
+import urllib2
+import urlparse
-import base64
-import hashlib
+import BeautifulSoup
+import Image
+import ImageFile
+
+from pylons import g
+
+from r2.lib import amqp, s3cp
+from r2.lib.memoize import memoize
+from r2.lib.nymph import optimize_png
+from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
+from r2.models.link import Link
-import mimetypes
s3_direct_url = "s3.amazonaws.com"
-
-threads = 20
-log = g.log
-
MEDIA_FILENAME_LENGTH = 12
+thumbnail_size = 70, 70
+
+
+def _image_to_str(image):
+ s = cStringIO.StringIO()
+ image.save(s, image.format)
+ return s.getvalue()
+
+
+def str_to_image(s):
+ s = cStringIO.StringIO(s)
+ image = Image.open(s)
+ return image
+
+
+def _image_entropy(img):
+ """calculate the entropy of an image"""
+ hist = img.histogram()
+ hist_size = sum(hist)
+ hist = [float(h) / hist_size for h in hist]
+
+ return -sum(p * math.log(p, 2) for p in hist if p != 0)
+
+
+def _square_image(img):
+ """if the image is taller than it is wide, square it off. determine
+ which pieces to cut off based on the entropy pieces."""
+ x,y = img.size
+ while y > x:
+ #slice 10px at a time until square
+ slice_height = min(y - x, 10)
+
+ bottom = img.crop((0, y - slice_height, x, y))
+ top = img.crop((0, 0, x, slice_height))
+
+ #remove the slice with the least entropy
+ if _image_entropy(bottom) < _image_entropy(top):
+ img = img.crop((0, 0, x, y - slice_height))
+ else:
+ img = img.crop((0, slice_height, x, y))
+
+ x,y = img.size
+
+ return img
+
+
+def _prepare_image(image):
+ image = _square_image(image)
+ image.thumbnail(thumbnail_size, Image.ANTIALIAS)
+ return image
+
+
+def _clean_url(url):
+ """url quotes unicode data out of urls"""
+ url = url.encode('utf8')
+ url = ''.join(urllib.quote(c) if ord(c) >= 127 else c for c in url)
+ return url
+
+
+def _initialize_request(url, referer):
+ url = _clean_url(url)
+
+ if not url.startswith(("http://", "https://")):
+ return
+
+ req = urllib2.Request(url)
+ if g.useragent:
+ req.add_header('User-Agent', g.useragent)
+ if referer:
+ req.add_header('Referer', referer)
+ return req
+
+
+def _fetch_url(url, referer=None):
+ request = _initialize_request(url, referer=referer)
+ if not request:
+ return None, None
+ response = urllib2.urlopen(request)
+ return response.headers.get("Content-Type"), response.read()
+
+
+@memoize('media.fetch_size', time=3600)
+def _fetch_image_size(url, referer):
+ """Return the size of an image by URL downloading as little as possible."""
+
+ request = _initialize_request(url, referer)
+ if not request:
+ return None
+
+ parser = ImageFile.Parser()
+ response = None
+ try:
+ response = urllib2.urlopen(request)
+
+ while True:
+ chunk = response.read(1024)
+ if not chunk:
+ break
+
+ parser.feed(chunk)
+ if parser.image:
+ return parser.image.size
+ except urllib2.URLError:
+ return None
+ finally:
+ if response:
+ response.close()
def optimize_jpeg(filename, optimizer):
@@ -151,29 +259,27 @@ def update_link(link, thumbnail, media_object, thumbnail_size=None):
link._commit()
-def set_media(link, force = False):
+def _set_media(embedly_services, link, force=False):
if link.is_self:
return
if not force and link.promoted:
return
elif not force and (link.has_thumbnail or link.media_object):
return
-
- scraper = make_scraper(link.url)
- thumbnail = scraper.thumbnail()
- media_object = scraper.media_object()
+ scraper = Scraper.for_url(embedly_services, link.url)
+ thumbnail, media_object = scraper.scrape()
if media_object:
# the scraper should be able to make a media embed out of the
# media object it just gave us. if not, null out the media object
# to protect downstream code
- res = scraper.media_embed(**media_object)
+ res = scraper.media_embed(media_object)
if not res:
print "%s made a bad media obj for link %s" % (scraper, link._id36)
media_object = None
-
+
thumbnail_url = upload_media(thumbnail) if thumbnail else None
thumbnail_size = thumbnail.size if thumbnail else None
@@ -181,7 +287,7 @@ def set_media(link, force = False):
def force_thumbnail(link, image_data, never_expire=True, file_type=".jpg"):
image = str_to_image(image_data)
- image = prepare_image(image)
+ image = _prepare_image(image)
thumb_url = upload_media(image, never_expire=never_expire, file_type=file_type)
update_link(link, thumbnail=thumb_url, media_object=None, thumbnail_size=image.size)
@@ -190,7 +296,7 @@ def upload_icon(file_name, image_data, size):
image = str_to_image(image_data)
image.format = 'PNG'
image.thumbnail(size, Image.ANTIALIAS)
- icon_data = image_to_str(image)
+ icon_data = _image_to_str(image)
return s3_upload_media(icon_data,
file_name=file_name,
mime_type='image/png',
@@ -201,16 +307,218 @@ def upload_icon(file_name, image_data, size):
def can_upload_icon():
return g.media_store == 's3'
+
+def get_media_embed(media_object):
+ if not isinstance(media_object, dict):
+ return
+
+ if "oembed" not in media_object:
+ return
+
+ return _EmbedlyScraper.media_embed(media_object)
+
+
+class MediaEmbed(object):
+ width = None
+ height = None
+ content = None
+ scrolling = False
+
+ def __init__(self, height, width, content, scrolling=False):
+ self.height = int(height)
+ self.width = int(width)
+ self.content = content
+ self.scrolling = scrolling
+
+
+def _make_thumbnail_from_url(thumbnail_url, referer):
+ if not thumbnail_url:
+ return
+ content_type, content = _fetch_url(thumbnail_url, referer=referer)
+ if not content:
+ return
+ image = str_to_image(content)
+ return _prepare_image(image)
+
+
+class Scraper(object):
+ @classmethod
+ def for_url(cls, embedly_services, url):
+ url_domain = domain(url)
+ domain_embedly_regex = embedly_services.get(url_domain, None)
+
+ if domain_embedly_regex and re.match(domain_embedly_regex, url):
+ return _EmbedlyScraper(url)
+ return _ThumbnailOnlyScraper(url)
+
+ def scrape(self):
+ # should return a 2-tuple of: thumbnail, media_object
+ raise NotImplementedError
+
+ @classmethod
+ def media_embed(cls, media_object):
+ # should take a media object and return an appropriate MediaEmbed
+ raise NotImplementedError
+
+
+class _ThumbnailOnlyScraper(Scraper):
+ def __init__(self, url):
+ self.url = url
+
+ def scrape(self):
+ thumbnail_url = self._find_thumbnail_image()
+ thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url)
+ return thumbnail, None
+
+ def _extract_image_urls(self, soup):
+ for img in soup.findAll("img", src=True):
+ yield urlparse.urljoin(self.url, img["src"])
+
+ def _find_thumbnail_image(self):
+ content_type, content = _fetch_url(self.url)
+
+ # if it's an image. it's pretty easy to guess what we should thumbnail.
+ if "image" in content_type:
+ return self.url
+
+ if content_type and "html" in content_type and content:
+ soup = BeautifulSoup.BeautifulSoup(content)
+ else:
+ return None
+
+ # allow the content author to specify the thumbnail:
+ #
+ og_image = (soup.find('meta', property='og:image') or
+ soup.find('meta', attrs={'name': 'og:image'}))
+ if og_image and og_image['content']:
+ return og_image['content']
+
+ #
+ thumbnail_spec = soup.find('link', rel='image_src')
+ if thumbnail_spec and thumbnail_spec['href']:
+ return thumbnail_spec['href']
+
+ # ok, we have no guidance from the author. look for the largest
+ # image on the page with a few caveats. (see below)
+ max_area = 0
+ max_url = None
+ for image_url in self._extract_image_urls(soup):
+ size = _fetch_image_size(image_url, referer=self.url)
+ if not size:
+ continue
+
+ area = size[0] * size[1]
+
+ # ignore little images
+ if area < 5000:
+ g.log.debug('ignore little %s' % image_url)
+ continue
+
+ # ignore excessively long/wide images
+ if max(size) / min(size) > 1.5:
+ g.log.debug('ignore dimensions %s' % image_url)
+ continue
+
+ # penalize images with "sprite" in their name
+ if 'sprite' in image_url.lower():
+ g.log.debug('penalizing sprite %s' % image_url)
+ area /= 10
+
+ if area > max_area:
+ max_area = area
+ max_url = image_url
+ return max_url
+
+
+class _EmbedlyScraper(Scraper):
+ EMBEDLY_API_URL = "http://api.embed.ly/1/oembed"
+
+ def __init__(self, url):
+ self.url = url
+
+ @classmethod
+ def _utf8_encode(cls, input):
+ """UTF-8 encodes any strings in an object (from json.loads)"""
+ if isinstance(input, dict):
+ return {cls._utf8_encode(key): cls._utf8_encode(value)
+ for key, value in input.iteritems()}
+ elif isinstance(input, list):
+ return [cls._utf8_encode(item)
+ for item in input]
+ elif isinstance(input, unicode):
+ return input.encode('utf-8')
+ else:
+ return input
+
+ def scrape(self):
+ params = urllib.urlencode({
+ "url": self.url,
+ "format": "json",
+ "maxwidth": 600,
+ "key": g.embedly_api_key,
+ })
+ response = urllib2.urlopen(self.EMBEDLY_API_URL + "?" + params)
+ oembed = json.load(response, object_hook=self._utf8_encode)
+
+ if not oembed:
+ return None, None
+
+ if oembed.get("type") == "photo":
+ thumbnail_url = oembed.get("url")
+ else:
+ thumbnail_url = oembed.get("thumbnail_url")
+ thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url)
+
+ embed = {}
+ if oembed.get("type") in ("video", "rich"):
+ embed = {
+ "type": domain(self.url),
+ "oembed": oembed,
+ }
+
+ return thumbnail, embed
+
+ @classmethod
+ def media_embed(cls, media_object):
+ oembed = media_object["oembed"]
+
+ html = oembed.get("html")
+ width = oembed.get("width")
+ height = oembed.get("height")
+ if not (html and width and height):
+ return
+
+ return MediaEmbed(
+ width=width,
+ height=height,
+ content=html,
+ )
+
+
+@memoize("media.embedly_services", time=3600)
+def _fetch_embedly_services():
+ response = urllib2.urlopen("http://api.embed.ly/1/services/python")
+ service_data = json.load(response)
+
+ patterns_by_domain = collections.defaultdict(set)
+ for service in service_data:
+ for domain in [service["domain"]] + service["subdomains"]:
+ patterns_by_domain[domain].update(service["regex"])
+
+ return {domain: "(?:%s)" % "|".join(patterns)
+ for domain, patterns in patterns_by_domain.iteritems()}
+
+
def run():
+ embedly_services = _fetch_embedly_services()
+
@g.stats.amqp_processor('scraper_q')
def process_link(msg):
- def _process_link(fname):
- link = Link._by_fullname(fname, data=True)
- set_media(link)
-
fname = msg.body
+ link = Link._by_fullname(msg.body, data=True)
+
try:
- TimeoutFunction(_process_link, 30)(fname)
+ TimeoutFunction(_set_media, 30)(embedly_services, link)
except TimeoutFunctionException:
print "Timed out on %s" % fname
except KeyboardInterrupt:
diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py
index 843f5989e..703619ab2 100755
--- a/r2/r2/lib/pages/pages.py
+++ b/r2/r2/lib/pages/pages.py
@@ -67,7 +67,6 @@ from r2.lib.utils import url_links_builder, make_offset_date, median, to36
from r2.lib.utils import trunc_time, timesince, timeuntil, weighted_lottery
from r2.lib.template_helpers import add_sr, get_domain, format_number
from r2.lib.subreddit_search import popular_searches
-from r2.lib.scraper import get_media_embed
from r2.lib.log import log_text
from r2.lib.memoize import memoize
from r2.lib.utils import trunc_string as _truncate, to_date
@@ -3454,7 +3453,7 @@ def make_link_child(item):
media_embed = item.media_object
else:
try:
- media_embed = get_media_embed(item.media_object)
+ media_embed = media.get_media_embed(item.media_object)
except TypeError:
g.log.warning("link %s has a bad media object" % item)
media_embed = None
diff --git a/r2/r2/lib/scraper.py b/r2/r2/lib/scraper.py
deleted file mode 100644
index c1285f953..000000000
--- a/r2/r2/lib/scraper.py
+++ /dev/null
@@ -1,1864 +0,0 @@
-# The contents of this file are subject to the Common Public Attribution
-# License Version 1.0. (the "License"); you may not use this file except in
-# compliance with the License. You may obtain a copy of the License at
-# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
-# License Version 1.1, but Sections 14 and 15 have been added to cover use of
-# software over a computer network and provide for limited attribution for the
-# Original Developer. In addition, Exhibit A has been modified to be consistent
-# with Exhibit B.
-#
-# Software distributed under the License is distributed on an "AS IS" basis,
-# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
-# the specific language governing rights and limitations under the License.
-#
-# The Original Code is reddit.
-#
-# The Original Developer is the Initial Developer. The Initial Developer of
-# the Original Code is reddit Inc.
-#
-# All portions of the code written by reddit are Copyright (c) 2006-2013 reddit
-# Inc. All Rights Reserved.
-###############################################################################
-
-from pylons import g
-from r2.lib import utils
-from r2.lib.memoize import memoize
-import simplejson as json
-
-from urllib2 import Request, HTTPError, URLError, urlopen
-from httplib import InvalidURL
-import urlparse, re, urllib, logging, StringIO, logging
-import Image, ImageFile, math
-from BeautifulSoup import BeautifulSoup
-
-log = g.log
-useragent = g.useragent
-
-chunk_size = 1024
-thumbnail_size = 70, 70
-
-def image_to_str(image):
- s = StringIO.StringIO()
- image.save(s, image.format)
- s.seek(0)
- return s.read()
-
-def str_to_image(s):
- s = StringIO.StringIO(s)
- s.seek(0)
- image = Image.open(s)
- return image
-
-def prepare_image(image):
- image = square_image(image)
- image.thumbnail(thumbnail_size, Image.ANTIALIAS)
- return image
-
-def image_entropy(img):
- """calculate the entropy of an image"""
- hist = img.histogram()
- hist_size = sum(hist)
- hist = [float(h) / hist_size for h in hist]
-
- return -sum([p * math.log(p, 2) for p in hist if p != 0])
-
-def square_image(img):
- """if the image is taller than it is wide, square it off. determine
- which pieces to cut off based on the entropy pieces."""
- x,y = img.size
- while y > x:
- #slice 10px at a time until square
- slice_height = min(y - x, 10)
-
- bottom = img.crop((0, y - slice_height, x, y))
- top = img.crop((0, 0, x, slice_height))
-
- #remove the slice with the least entropy
- if image_entropy(bottom) < image_entropy(top):
- img = img.crop((0, 0, x, y - slice_height))
- else:
- img = img.crop((0, slice_height, x, y))
-
- x,y = img.size
-
- return img
-
-def clean_url(url):
- """url quotes unicode data out of urls"""
- s = url
- url = url.encode('utf8')
- url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url])
- return url
-
-def fetch_url(url, referer = None, retries = 1, dimension = False):
- cur_try = 0
- log.debug('fetching: %s' % url)
- nothing = None if dimension else (None, None)
- url = clean_url(url)
- #just basic urls
- if not url.startswith(('http://', 'https://')):
- return nothing
- while True:
- try:
- req = Request(url)
- if useragent:
- req.add_header('User-Agent', useragent)
- if referer:
- req.add_header('Referer', referer)
-
- open_req = urlopen(req)
-
- #if we only need the dimension of the image, we may not
- #need to download the entire thing
- if dimension:
- content = open_req.read(chunk_size)
- else:
- content = open_req.read()
- content_type = open_req.headers.get('content-type')
-
- if not content_type:
- return nothing
-
- if 'image' in content_type:
- p = ImageFile.Parser()
- new_data = content
- while not p.image and new_data:
- p.feed(new_data)
- new_data = open_req.read(chunk_size)
- content += new_data
-
- #return the size, or return the data
- if dimension and p.image:
- return p.image.size
- elif dimension:
- return nothing
- elif dimension:
- #expected an image, but didn't get one
- return nothing
-
- return content_type, content
-
- except (URLError, HTTPError, InvalidURL), e:
- cur_try += 1
- if cur_try >= retries:
- log.debug('error while fetching: %s referer: %s' % (url, referer))
- log.debug(e)
- return nothing
- finally:
- if 'open_req' in locals():
- open_req.close()
-
-@memoize('media.fetch_size')
-def fetch_size(url, referer = None, retries = 1):
- return fetch_url(url, referer, retries, dimension = True)
-
-class MediaEmbed(object):
- width = None
- height = None
- content = None
- scrolling = False
-
- def __init__(self, height, width, content, scrolling = False):
- self.height = int(height)
- self.width = int(width)
- self.content = content
- self.scrolling = scrolling
-
-class Scraper:
- def __init__(self, url):
- self.url = url
- self.content = None
- self.content_type = None
- self.soup = None
-
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.url)
-
- def download(self):
- self.content_type, self.content = fetch_url(self.url)
- if self.content_type and 'html' in self.content_type and self.content:
- self.soup = BeautifulSoup(self.content)
-
- def image_urls(self):
- #if the original url was an image, use that
- if 'image' in self.content_type:
- yield self.url
- elif self.soup:
- images = self.soup.findAll('img', src = True)
- for i in images:
- image_url = urlparse.urljoin(self.url, i['src'])
- yield image_url
-
- def largest_image_url(self):
- if not self.content:
- self.download()
-
- #if download didn't work
- if not self.content or not self.content_type:
- return None
-
- max_area = 0
- max_url = None
-
- if self.soup:
- og_image = (self.soup.find('meta', property='og:image') or
- self.soup.find('meta', attrs={'name': 'og:image'}))
- if og_image and og_image['content']:
- log.debug("Using og:image")
- return og_image['content']
- thumbnail_spec = self.soup.find('link', rel = 'image_src')
- if thumbnail_spec and thumbnail_spec['href']:
- log.debug("Using image_src")
- return thumbnail_spec['href']
-
- for image_url in self.image_urls():
- size = fetch_size(image_url, referer = self.url)
- if not size:
- continue
-
- area = size[0] * size[1]
-
- #ignore little images
- if area < 5000:
- log.debug('ignore little %s' % image_url)
- continue
-
- #ignore excessively long/wide images
- if max(size) / min(size) > 1.5:
- log.debug('ignore dimensions %s' % image_url)
- continue
-
- #penalize images with "sprite" in their name
- if 'sprite' in image_url.lower():
- log.debug('penalizing sprite %s' % image_url)
- area /= 10
-
- if area > max_area:
- max_area = area
- max_url = image_url
-
- return max_url
-
- def thumbnail(self):
- image_url = self.largest_image_url()
- if image_url:
- content_type, image_str = fetch_url(image_url, referer = self.url)
- if image_str:
- image = str_to_image(image_str)
- try:
- image = prepare_image(image)
- except IOError, e:
- #can't read interlaced PNGs, ignore
- if 'interlaced' in e.message:
- return
- raise
- return image
-
- def media_object(self):
- for deepscraper in deepscrapers:
- ds = deepscraper()
- found = ds.find_media_object(self)
- if found:
- return found
-
- @classmethod
- def media_embed(cls):
- raise NotImplementedError
-
-class MediaScraper(Scraper):
- media_template = ""
- thumbnail_template = ""
- video_id = None
- video_id_rx = None
-
- def __init__(self, url):
- Scraper.__init__(self, url)
-
- # first try the simple regex against the URL. If that fails,
- # see if the MediaScraper subclass has its own extraction
- # function
- if self.video_id_rx:
- m = self.video_id_rx.match(url)
- if m:
- self.video_id = m.groups()[0]
- if not self.video_id:
- video_id = self.video_id_extract()
- if video_id:
- self.video_id = video_id
- if not self.video_id:
- #if we still can't find the id just treat it like a normal page
- log.debug('reverting to regular scraper: %s' % url)
- self.__class__ = Scraper
-
- def video_id_extract(self):
- return None
-
- def largest_image_url(self):
- if self.thumbnail_template:
- return self.thumbnail_template.replace('$video_id', self.video_id)
- else:
- return Scraper.largest_image_url(self)
-
- def media_object(self):
- return dict(video_id = self.video_id,
- type = self.domains[0])
-
- @classmethod
- def media_embed(cls, video_id = None, height = None, width = None, **kw):
- content = cls.media_template.replace('$video_id', video_id)
- return MediaEmbed(height = height or cls.height,
- width = width or cls.width,
- content = content)
-
-def youtube_in_google(google_url):
- h = Scraper(google_url)
- h.download()
- try:
- youtube_url = h.soup.find('div', 'original-text').findNext('a')['href']
- log.debug('%s is really %s' % (google_url, youtube_url))
- return youtube_url
- except AttributeError, KeyError:
- pass
-
-def make_scraper(url):
- domain = utils.domain(url)
- scraper = Scraper
- for suffix, clses in scrapers.iteritems():
- for cls in clses:
- if domain.endswith(suffix):
- scraper = cls
- break
-
- #sometimes youtube scrapers masquerade as google scrapers
- if scraper == GootubeScraper:
- youtube_url = youtube_in_google(url)
- if youtube_url:
- return make_scraper(youtube_url)
- return scraper(url)
-
-########## site-specific video scrapers ##########
-
-class YoutubeScraper(MediaScraper):
- domains = ['youtube.com']
- height = 295
- width = 480
- media_template = ' '
- thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg'
- video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*')
- video_deeplink_rx = re.compile('.*#t=(\d+)m(\d+)s.*')
-
- def video_id_extract(self):
- vid = self.video_id_rx.match(self.url)
- if(vid):
- video_id = vid.groups()[0]
- d = self.video_deeplink_rx.match(self.url)
- if(d):
- seconds = int(d.groups()[0])*60 + int(d.groups()[1])
- video_id += "&start=%d" % seconds
- return video_id
-
- def largest_image_url(self):
- # Remove the deeplink part from the video id
- return self.thumbnail_template.replace("$video_id",
- self.video_id.split("&")[0])
-
-class TedScraper(MediaScraper):
- domains = ['ted.com']
- height = 326
- width = 446
- media_template = ' '
- flashvars_rx = re.compile('.*flashvars="(.*)".*')
-
- def video_id_extract(self):
- if "/talks/" in self.url:
- content_type, content = fetch_url(self.url.replace("/talks/","/talks/embed/"))
- if content:
- m = self.flashvars_rx.match(content)
- if m:
- return m.groups()[0]
- def largest_image_url(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- return self.soup.find('link', rel = 'image_src')['href']
-
-
-class MetacafeScraper(MediaScraper):
- domains = ['metacafe.com']
- height = 345
- width = 400
- media_template = ' '
- video_id_rx = re.compile('.*/watch/([^/]+)/.*')
-
- def media_object(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
-
-class GootubeScraper(MediaScraper):
- domains = ['video.google.com']
- height = 326
- width = 400
- media_template = ' '
- video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*')
- gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S)
-
- def largest_image_url(self):
- if not self.content:
- self.download()
-
- if not self.content:
- return None
-
- m = self.gootube_thumb_rx.match(self.content)
- if m:
- image_url = m.groups()[0]
- image_url = utils.safe_eval_str(image_url)
- return image_url
-
-class VimeoScraper(MediaScraper):
- domains = ['vimeo.com']
- height = 448
- width = 520
- media_template = ' '
- video_id_rx = re.compile('.*/(.*)')
-
- def media_object(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
-
-class BreakScraper(MediaScraper):
- domains = ['break.com']
- height = 421
- width = 520
- media_template = ' '
- video_id_rx = re.compile('.*/index/([^/]+).*');
-
- def video_id_extract(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- video_src = self.soup.find('link', rel = 'video_src')
- if video_src and video_src['href']:
- return video_src['href']
-
-class TheOnionScraper(MediaScraper):
- domains = ['theonion.com']
- height = 430
- width = 480
- media_template = """
-
-
-
-
-
-
-
- """
- video_id_rx = re.compile('.*/video/([^/?#]+).*')
-
- def media_object(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- video_url = self.soup.find('meta', attrs={'name': 'nid'})['content']
- return dict(video_id = video_url,
- type = self.domains[0])
-
-class CollegeHumorScraper(MediaScraper):
- domains = ['collegehumor.com']
- height = 390
- width = 520
- media_template = ' '
- video_id_rx = re.compile('.*video:(\d+).*');
-
-class FunnyOrDieScraper(MediaScraper):
- domains = ['funnyordie.com']
- height = 438
- width = 464
- media_template = ' '
- thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac'
- video_id_rx = re.compile('.*/videos/([^/]+)/.*')
-
-class ComedyCentralScraper(MediaScraper):
- domains = ['comedycentral.com']
- height = 316
- width = 332
- media_template = ' '
- video_id_rx = re.compile('.*videoId=(\d+).*')
-
-class TheDailyShowScraper(MediaScraper):
- domains = ['thedailyshow.com']
- height = 353
- width = 360
- media_template = """ """
-
- def video_id_extract(self):
- "This is a bit of a hack"
- if not self.soup:
- self.download()
-
- if self.soup:
- embed_container = self.soup.find('div', {'class': 'videoplayerPromo module'})
- if embed_container:
- if embed_container['id'].startswith('promo_'):
- video_id = embed_container['id'].split('_')[1]
- return video_id
-
-class ColbertNationScraper(ComedyCentralScraper):
- domains = ['colbertnation.com']
- video_id_rx = re.compile('.*videos/(\d+)/.*')
-
-class LiveLeakScraper(MediaScraper):
- domains = ['liveleak.com']
- height = 370
- width = 450
- media_template = ' '
- video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*')
-
- def largest_image_url(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- return self.soup.find('link', rel = 'videothumbnail')['href']
-
-class DailyMotionScraper(MediaScraper):
- domains = ['dailymotion.com']
- height = 381
- width = 480
- media_template = ' '
- video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*')
-
- def media_object(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- video_url = self.soup.find('link', rel = 'video_src')['href']
- return dict(video_id = video_url,
- type = self.domains[0])
-
-class RevverScraper(MediaScraper):
- domains = ['revver.com']
- height = 392
- width = 480
- media_template = ''
- video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*')
-
-class EscapistScraper(MediaScraper):
- domains = ['escapistmagazine.com']
- height = 294
- width = 480
- media_template = """"""
- video_id_rx = re.compile('.*/videos/view/[A-Za-z-9-]+/([0-9]+).*')
-
-class JustintvScraper(MediaScraper):
- """Can grab streams from justin.tv, but not clips"""
- domains = ['justin.tv']
- height = 295
- width = 353
- stream_media_template = """ """
- video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$')
-
- @classmethod
- def media_embed(cls, video_id, **kw):
- content = cls.stream_media_template.replace('$video_id', video_id)
- return MediaEmbed(height = cls.height,
- width = cls.width,
- content = content)
-
-class SoundcloudScraper(MediaScraper):
- """soundcloud.com"""
- domains = ['soundcloud.com']
- height = 81
- width = 400
- media_template = """
-
-
-
-
-
-
- """
- video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)')
-
-class CraigslistScraper(MediaScraper):
- domains = ['craigslist.org']
- height = 480
- width = 640
- max_size_kb = 50
-
- def video_id_extract(self):
- return self.url
-
- def media_object(self):
- if not self.soup:
- self.download()
-
- if self.soup:
- ub = self.soup.find('div', {'id': 'userbody'})
- if ub:
- ub = str(ub)
- if len(ub) <= self.max_size_kb * 1024:
- return dict(content = ub,
- type = self.domains[0])
-
- @classmethod
- def media_embed(cls, content, **kw):
- return MediaEmbed(height = cls.height,
- width = cls.width,
- content = content,
- scrolling = True)
-
-
-########## oembed rich-media scrapers ##########
-
-class OEmbed(Scraper):
- """
- Oembed Scraper
- ==============
- Tries to use the oembed standard to create a media object.
-
- url_re: Regular Expression to match the incoming url against.
- api_endpoint: Url of the api end point you are using.
- api_params: Default Params to be sent with the outgoing request.
- """
- url_re = ''
- api_endpoint = ''
- api_params = {}
-
- def __init__(self, url):
- Scraper.__init__(self, url)
- self.oembed = None
-
- #Fallback to the scraper if the url doesn't match
- if not self.url_re.match(self.url):
- self.__class__ = Scraper
-
- def __repr__(self):
- return "%s(%r)" % (self.__class__.__name__, self.url)
-
- def utf8_encode(self, input):
- """UTF-8 encodes any strings in an object (from json.loads)"""
- if isinstance(input, dict):
- return {self.utf8_encode(key): self.utf8_encode(value)
- for key, value in input.iteritems()}
- elif isinstance(input, list):
- return [self.utf8_encode(item)
- for item in input]
- elif isinstance(input, unicode):
- return input.encode('utf-8')
- else:
- return input
-
- def download(self):
- self.api_params.update( { 'url':self.url})
- query = urllib.urlencode(self.api_params)
- api_url = "%s?%s" % (self.api_endpoint, query)
-
- self.content_type, self.content = fetch_url(api_url)
-
- #Either a 404 or 500.
- if not self.content:
- #raise ValueError('ISSUE CALLING %s' %api_url)
- log.warning('oEmbed call (%s) failed to return content for %s'
- %(api_url, self.url))
- return None
-
- try:
- self.oembed = json.loads(self.content,
- object_hook=self.utf8_encode)
- except ValueError, e:
- log.error('oEmbed call (%s) return invalid json for %s'
- %(api_url, self.url))
- return None
-
- def image_urls(self):
- #if the original url was an image, use that
- if self.oembed and self.oembed.get('type') =='photo':
- yield self.oembed.get('url')
- elif self.oembed and self.oembed.get('thumbnail_url'):
- yield self.oembed.get('thumbnail_url')
-
- def largest_image_url(self):
- #Seems to be the default place to check if the download has happened.
- if not self.oembed:
- self.download()
-
- #if the original url was of the photo type
- if self.oembed and self.oembed.get('type') =='photo':
- return self.oembed.get('url')
- elif self.oembed and self.oembed.get('thumbnail_url'):
- return self.oembed.get('thumbnail_url')
-
- def media_object(self):
- #Seems to be the default place to check if the download has happened.
- if not self.oembed:
- self.download()
-
- if self.oembed and self.oembed.get('type') in ['video', 'rich']:
- for domain in self.domains:
- if self.url.find(domain) > -1:
- return dict(type=domain, oembed=self.oembed)
- return None
-
- @classmethod
- def media_embed(cls, video_id = None, height = None, width = None, **kw):
- content = None
- oembed = kw.get('oembed')
-
- # check if oembed is there and has html
- if oembed and oembed.get('html'):
- content = oembed.get('html')
- if content and oembed.get('height') and oembed.get('width'):
- return MediaEmbed(height = oembed['height'],
- width = oembed['width'],
- content = content)
-
-class EmbedlyOEmbed(OEmbed):
- """
- Embedly oEmbed Provider
- =======================
- documentation: http://api.embed.ly
- """
- domains = ['23hq.com', '5min.com', '99dollarmusicvideos.com',
- 'abcnews.go.com', 'achewood.com', 'allthingsd.com', 'amazon.com',
- 'aniboom.com', 'animoto.com', 'asofterworld.com', 'atom.com',
- 'audioboo.com', 'bambuser.com', 'bandcamp.com', 'barelydigital.com',
- 'barelypolitical.com', 'bigthink.com', 'blip.tv', 'bnter.com',
- 'boston.com', 'brainbird.net', 'bravotv.com', 'break.com',
- 'brizzly.com', 'cbsnews.com', 'channelfrederator.com', 'chart.ly',
- 'cl.ly', 'clikthrough.com', 'clipfish.de', 'clipshack.com', 'cnbc.com',
- 'cnn.com', 'colbertnation.com', 'collegehumor.com', 'color.com',
- 'comedycentral.com', 'compete.com', 'confreaks.net', 'crackle.com',
- 'craigslist.org', 'crocodoc.com', 'crunchbase.com', 'dailybooth.com',
- 'dailymile.com', 'dailymotion.com', 'deviantart.com', 'digg.com',
- 'dipdive.com', 'discovery.com', 'dotsub.com', 'dribbble.com',
- 'edition.cnn.com', 'emberapp.com', 'escapistmagazine.com',
- 'espn.go.com', 'facebook.com', 'fancast.com', 'flickr.com', 'fora.tv',
- 'formspring.me', 'fotopedia.com', 'freemusicarchive.org',
- 'funnyordie.com', 'gametrailers.com', 'gist.github.com',
- 'globalpost.com', 'godtube.com', 'gogoyoko.com', 'google.com',
- 'graphicly.com', 'grindtv.com', 'grooveshark.com', 'guardian.co.uk',
- 'hark.com', 'howcast.com', 'huffduffer.com', 'hulu.com',
- 'hungrynation.tv', 'ifood.tv', 'img.ly', 'imgur.com', 'indenti.ca',
- 'indymogul.com', 'instagr.am', 'issuu.com', 'itunes.apple.com',
- 'justin.tv', 'kickstarter.com', 'kinomap.com', 'kiva.org',
- 'koldcast.tv', 'last.fm', 'lightbox.com', 'liveleak.com',
- 'livestream.com', 'lockerz.com', 'logotv.com', 'lonelyplanet.com',
- 'maps.google.com', 'meadd.com', 'mediamatters.org', 'meetup.com',
- 'metacafe.com', 'metacdn.com', 'mixcloud.com', 'mixergy.com',
- 'mlkshk.com', 'mobypicture.com', 'money.cnn.com', 'movies.yahoo.com',
- 'msnbc.com', 'my.opera.com', 'myloc.me', 'myvideo.de',
- 'nationalgeographic.com', 'nfb.ca', 'npr.org', 'nzonscreen.com',
- 'overstream.net', 'ow.ly', 'pastebin.com', 'pastie.org',
- 'phodroid.com', 'photobucket.com', 'photozou.jp',
- 'picasaweb.google.com', 'picplz.com', 'pikchur.com', 'ping.fm',
- 'polldaddy.com', 'polleverywhere.com', 'posterous.com', 'prezi.com',
- 'qik.com', 'quantcast.com', 'questionablecontent.net', 'qwantz.com',
- 'qwiki.com', 'radionomy.com', 'radioreddit.com', 'rdio.com',
- 'recordsetter.com','redux.com', 'revision3.com', 'revver.com',
- 'saynow.com', 'schooltube.com', 'sciencestage.com', 'scrapblog.com',
- 'screencast.com', 'screenr.com', 'scribd.com', 'sendables.jibjab.com',
- 'share.ovi.com', 'shitmydadsays.com', 'shopstyle.com', 'skitch.com',
- 'slideshare.net', 'smugmug.com', 'snotr.com', 'socialcam.com',
- 'someecards.com', 'soundcloud.com', 'speakerdeck.com', 'spike.com',
- 'statsheet.com', 'status.net', 'storify.com', 'streetfire.net',
- 'studivz.net', 'tangle.com', 'teachertube.com', 'techcrunch.tv',
- 'ted.com', 'thedailyshow.com', 'theonion.com', 'threadbanger.com',
- 'timetoast.com', 'tinypic.com', 'tmiweekly.com', 'traileraddict.com',
- 'trailerspy.com', 'trooptube.tv', 'trutv.com', 'tumblr.com',
- 'twitgoo.com', 'twitlonger.com', 'twitpic.com', 'twitrpix.com',
- 'twitter.com', 'twitvid.com', 'ultrakawaii.com', 'urtak.com',
- 'uservoice.com', 'ustream.com', 'viddler.com', 'video.forbes.com',
- 'video.google.com', 'video.jardenberg.com', 'video.pbs.org',
- 'video.yahoo.com', 'videos.nymag.com', 'vids.myspace.com', 'vimeo.com',
- 'vodcars.com', 'washingtonpost.com', 'whitehouse.gov', 'whosay.com',
- 'wikimedia.org', 'wikipedia.org', 'wistia.com', 'wordpress.tv',
- 'worldstarhiphop.com', 'xiami.com', 'xkcd.com', 'xtranormal.com',
- 'yfrog.com', 'youku.com', 'youtu.be', 'youtube.com', 'zapiks.com',
- 'zero-inch.com']
-
- url_re = re.compile(
- 'http:\\/\\/.*youtube\\.com\\/watch.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
- 'https:\\/\\/.*youtube\\.com\\/watch.*|' +
- 'https:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' +
- 'http:\\/\\/youtu\\.be\\/.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/user\\/.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/.*\\#.*\\/.*|' +
- 'http:\\/\\/m\\.youtube\\.com\\/watch.*|' +
- 'http:\\/\\/m\\.youtube\\.com\\/index.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/profile.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/view_play_list.*|' +
- 'http:\\/\\/.*\\.youtube\\.com\\/playlist.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*\\/b\\/.*|' +
- 'http:\\/\\/.*justin\\.tv\\/.*\\/w\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/recorded\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/channel\\/.*|' +
- 'http:\\/\\/www\\.ustream\\.tv\\/.*|' +
- 'http:\\/\\/qik\\.com\\/video\\/.*|' +
- 'http:\\/\\/qik\\.com\\/.*|' +
- 'http:\\/\\/qik\\.ly\\/.*|' +
- 'http:\\/\\/.*revision3\\.com\\/.*|' +
- 'http:\\/\\/.*\\.dailymotion\\.com\\/video\\/.*|' +
- 'http:\\/\\/.*\\.dailymotion\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/collegehumor\\.com\\/video:.*|' +
- 'http:\\/\\/collegehumor\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.collegehumor\\.com\\/video:.*|' +
- 'http:\\/\\/www\\.collegehumor\\.com\\/video\\/.*|' +
- 'http:\\/\\/.*twitvid\\.com\\/.*|' +
- 'http:\\/\\/www\\.break\\.com\\/.*\\/.*|' +
- 'http:\\/\\/vids\\.myspace\\.com\\/index\\.cfm\\?fuseaction=vids\\.individual&videoid.*|' +
- 'http:\\/\\/www\\.myspace\\.com\\/index\\.cfm\\?fuseaction=.*&videoid.*|' +
- 'http:\\/\\/www\\.metacafe\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.metacafe\\.com\\/w\\/.*|' +
- 'http:\\/\\/blip\\.tv\\/.*\\/.*|' +
- 'http:\\/\\/.*\\.blip\\.tv\\/.*\\/.*|' +
- 'http:\\/\\/video\\.google\\.com\\/videoplay\\?.*|' +
- 'http:\\/\\/.*revver\\.com\\/video\\/.*|' +
- 'http:\\/\\/video\\.yahoo\\.com\\/watch\\/.*\\/.*|' +
- 'http:\\/\\/video\\.yahoo\\.com\\/network\\/.*|' +
- 'http:\\/\\/.*viddler\\.com\\/explore\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/liveleak\\.com\\/view\\?.*|' +
- 'http:\\/\\/www\\.liveleak\\.com\\/view\\?.*|' +
- 'http:\\/\\/animoto\\.com\\/play\\/.*|' +
- 'http:\\/\\/dotsub\\.com\\/view\\/.*|' +
- 'http:\\/\\/www\\.overstream\\.net\\/view\\.php\\?oid=.*|' +
- 'http:\\/\\/www\\.livestream\\.com\\/.*|' +
- 'http:\\/\\/www\\.worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
- 'http:\\/\\/worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' +
- 'http:\\/\\/teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www1\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/www2\\.teachertube\\.com\\/viewVideo\\.php.*|' +
- 'http:\\/\\/bambuser\\.com\\/v\\/.*|' +
- 'http:\\/\\/bambuser\\.com\\/channel\\/.*|' +
- 'http:\\/\\/bambuser\\.com\\/channel\\/.*\\/broadcast\\/.*|' +
- 'http:\\/\\/www\\.schooltube\\.com\\/video\\/.*\\/.*|' +
- 'http:\\/\\/bigthink\\.com\\/ideas\\/.*|' +
- 'http:\\/\\/bigthink\\.com\\/series\\/.*|' +
- 'http:\\/\\/sendables\\.jibjab\\.com\\/view\\/.*|' +
- 'http:\\/\\/sendables\\.jibjab\\.com\\/originals\\/.*|' +
- 'http:\\/\\/www\\.xtranormal\\.com\\/watch\\/.*|' +
- 'http:\\/\\/socialcam\\.com\\/v\\/.*|' +
- 'http:\\/\\/www\\.socialcam\\.com\\/v\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/media\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/member\\/.*\\/media\\/.*|' +
- 'http:\\/\\/dipdive\\.com\\/v\\/.*|' +
- 'http:\\/\\/.*\\.dipdive\\.com\\/media\\/.*|' +
- 'http:\\/\\/.*\\.dipdive\\.com\\/v\\/.*|' +
- 'http:\\/\\/v\\.youku\\.com\\/v_show\\/.*\\.html|' +
- 'http:\\/\\/v\\.youku\\.com\\/v_playlist\\/.*\\.html|' +
- 'http:\\/\\/www\\.snotr\\.com\\/video\\/.*|' +
- 'http:\\/\\/snotr\\.com\\/video\\/.*|' +
- 'http:\\/\\/video\\.jardenberg\\.se\\/.*|' +
- 'http:\\/\\/www\\.clipfish\\.de\\/.*\\/.*\\/video\\/.*|' +
- 'http:\\/\\/www\\.myvideo\\.de\\/watch\\/.*|' +
- 'http:\\/\\/www\\.whitehouse\\.gov\\/photos-and-video\\/video\\/.*|' +
- 'http:\\/\\/www\\.whitehouse\\.gov\\/video\\/.*|' +
- 'http:\\/\\/wh\\.gov\\/photos-and-video\\/video\\/.*|' +
- 'http:\\/\\/wh\\.gov\\/video\\/.*|' +
- 'http:\\/\\/www\\.hulu\\.com\\/watch.*|' +
- 'http:\\/\\/www\\.hulu\\.com\\/w\\/.*|' +
- 'http:\\/\\/hulu\\.com\\/watch.*|' +
- 'http:\\/\\/hulu\\.com\\/w\\/.*|' +
- 'http:\\/\\/.*crackle\\.com\\/c\\/.*|' +
- 'http:\\/\\/www\\.fancast\\.com\\/.*\\/videos|' +
- 'http:\\/\\/www\\.funnyordie\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.funnyordie\\.com\\/m\\/.*|' +
- 'http:\\/\\/funnyordie\\.com\\/videos\\/.*|' +
- 'http:\\/\\/funnyordie\\.com\\/m\\/.*|' +
- 'http:\\/\\/www\\.vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/www\\.vimeo\\.com\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/.*|' +
- 'http:\\/\\/vimeo\\.com\\/m\\/\\#\\/.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/talks\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/.*\\.html.*|' +
- 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/lang\\/.*\\/.*\\.html.*|' +
- 'http:\\/\\/.*nfb\\.ca\\/film\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/full-episodes\\/.*|' +
- 'http:\\/\\/www\\.thedailyshow\\.com\\/collection\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video\\/.*|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/trailer|' +
- 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-collections\\/.*|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/full-episodes\\/.*|' +
- 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-videos\\/.*|' +
- 'http:\\/\\/www\\.comedycentral\\.com\\/videos\\/index\\.jhtml\\?.*|' +
- 'http:\\/\\/www\\.theonion\\.com\\/video\\/.*|' +
- 'http:\\/\\/theonion\\.com\\/video\\/.*|' +
- 'http:\\/\\/wordpress\\.tv\\/.*\\/.*\\/.*\\/.*\\/|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/trailer\\/.*|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/clip\\/.*|' +
- 'http:\\/\\/www\\.traileraddict\\.com\\/poster\\/.*|' +
- 'http:\\/\\/www\\.escapistmagazine\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*|' +
- 'http:\\/\\/www\\.trailerspy\\.com\\/view_video\\.php.*|' +
- 'http:\\/\\/www\\.atom\\.com\\/.*\\/.*\\/|' +
- 'http:\\/\\/fora\\.tv\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.spike\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.gametrailers\\.com\\/video\\/.*|' +
- 'http:\\/\\/gametrailers\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.koldcast\\.tv\\/video\\/.*|' +
- 'http:\\/\\/www\\.koldcast\\.tv\\/\\#video:.*|' +
- 'http:\\/\\/techcrunch\\.tv\\/watch.*|' +
- 'http:\\/\\/techcrunch\\.tv\\/.*\\/watch.*|' +
- 'http:\\/\\/mixergy\\.com\\/.*|' +
- 'http:\\/\\/video\\.pbs\\.org\\/video\\/.*|' +
- 'http:\\/\\/www\\.zapiks\\.com\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggnation\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggreel\\/.*|' +
- 'http:\\/\\/tv\\.digg\\.com\\/diggdialogg\\/.*|' +
- 'http:\\/\\/www\\.trutv\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.nzonscreen\\.com\\/title\\/.*|' +
- 'http:\\/\\/nzonscreen\\.com\\/title\\/.*|' +
- 'http:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
- 'https:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' +
- 'http:\\/\\/hungrynation\\.tv\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.hungrynation\\.tv\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/hungrynation\\.tv\\/episode\\/.*|' +
- 'http:\\/\\/www\\.hungrynation\\.tv\\/episode\\/.*|' +
- 'http:\\/\\/indymogul\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.indymogul\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/indymogul\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.indymogul\\.com\\/episode\\/.*|' +
- 'http:\\/\\/channelfrederator\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.channelfrederator\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/channelfrederator\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.channelfrederator\\.com\\/episode\\/.*|' +
- 'http:\\/\\/tmiweekly\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.tmiweekly\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/tmiweekly\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.tmiweekly\\.com\\/episode\\/.*|' +
- 'http:\\/\\/99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/99dollarmusicvideos\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/episode\\/.*|' +
- 'http:\\/\\/ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.ultrakawaii\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/ultrakawaii\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.ultrakawaii\\.com\\/episode\\/.*|' +
- 'http:\\/\\/barelypolitical\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelypolitical\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/barelypolitical\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelypolitical\\.com\\/episode\\/.*|' +
- 'http:\\/\\/barelydigital\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelydigital\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/barelydigital\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.barelydigital\\.com\\/episode\\/.*|' +
- 'http:\\/\\/threadbanger\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.threadbanger\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/threadbanger\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.threadbanger\\.com\\/episode\\/.*|' +
- 'http:\\/\\/vodcars\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/www\\.vodcars\\.com\\/.*\\/episode\\/.*|' +
- 'http:\\/\\/vodcars\\.com\\/episode\\/.*|' +
- 'http:\\/\\/www\\.vodcars\\.com\\/episode\\/.*|' +
- 'http:\\/\\/confreaks\\.net\\/videos\\/.*|' +
- 'http:\\/\\/www\\.confreaks\\.net\\/videos\\/.*|' +
- 'http:\\/\\/video\\.allthingsd\\.com\\/video\\/.*|' +
- 'http:\\/\\/videos\\.nymag\\.com\\/.*|' +
- 'http:\\/\\/aniboom\\.com\\/animation-video\\/.*|' +
- 'http:\\/\\/www\\.aniboom\\.com\\/animation-video\\/.*|' +
- 'http:\\/\\/clipshack\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/www\\.clipshack\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/grindtv\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/www\\.grindtv\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/recipe\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/video\\/.*|' +
- 'http:\\/\\/ifood\\.tv\\/channel\\/user\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/recipe\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/video\\/.*|' +
- 'http:\\/\\/www\\.ifood\\.tv\\/channel\\/user\\/.*|' +
- 'http:\\/\\/logotv\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.logotv\\.com\\/video\\/.*|' +
- 'http:\\/\\/lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/www\\.lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' +
- 'http:\\/\\/streetfire\\.net\\/video\\/.*\\.htm.*|' +
- 'http:\\/\\/www\\.streetfire\\.net\\/video\\/.*\\.htm.*|' +
- 'http:\\/\\/trooptube\\.tv\\/videos\\/.*|' +
- 'http:\\/\\/www\\.trooptube\\.tv\\/videos\\/.*|' +
- 'http:\\/\\/sciencestage\\.com\\/v\\/.*\\.html|' +
- 'http:\\/\\/sciencestage\\.com\\/a\\/.*\\.html|' +
- 'http:\\/\\/www\\.sciencestage\\.com\\/v\\/.*\\.html|' +
- 'http:\\/\\/www\\.sciencestage\\.com\\/a\\/.*\\.html|' +
- 'http:\\/\\/www\\.godtube\\.com\\/featured\\/video\\/.*|' +
- 'http:\\/\\/godtube\\.com\\/featured\\/video\\/.*|' +
- 'http:\\/\\/www\\.godtube\\.com\\/watch\\/.*|' +
- 'http:\\/\\/godtube\\.com\\/watch\\/.*|' +
- 'http:\\/\\/www\\.tangle\\.com\\/view_video.*|' +
- 'http:\\/\\/mediamatters\\.org\\/mmtv\\/.*|' +
- 'http:\\/\\/www\\.clikthrough\\.com\\/theater\\/video\\/.*|' +
- 'http:\\/\\/gist\\.github\\.com\\/.*|' +
- 'http:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' +
- 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' +
- 'http:\\/\\/www\\.crunchbase\\.com\\/.*\\/.*|' +
- 'http:\\/\\/crunchbase\\.com\\/.*\\/.*|' +
- 'http:\\/\\/www\\.slideshare\\.net\\/.*\\/.*|' +
- 'http:\\/\\/www\\.slideshare\\.net\\/mobile\\/.*\\/.*|' +
- 'http:\\/\\/slidesha\\.re\\/.*|' +
- 'http:\\/\\/scribd\\.com\\/doc\\/.*|' +
- 'http:\\/\\/www\\.scribd\\.com\\/doc\\/.*|' +
- 'http:\\/\\/scribd\\.com\\/mobile\\/documents\\/.*|' +
- 'http:\\/\\/www\\.scribd\\.com\\/mobile\\/documents\\/.*|' +
- 'http:\\/\\/screenr\\.com\\/.*|' +
- 'http:\\/\\/polldaddy\\.com\\/community\\/poll\\/.*|' +
- 'http:\\/\\/polldaddy\\.com\\/poll\\/.*|' +
- 'http:\\/\\/answers\\.polldaddy\\.com\\/poll\\/.*|' +
- 'http:\\/\\/www\\.5min\\.com\\/Video\\/.*|' +
- 'http:\\/\\/www\\.howcast\\.com\\/videos\\/.*|' +
- 'http:\\/\\/www\\.screencast\\.com\\/.*\\/media\\/.*|' +
- 'http:\\/\\/screencast\\.com\\/.*\\/media\\/.*|' +
- 'http:\\/\\/www\\.screencast\\.com\\/t\\/.*|' +
- 'http:\\/\\/screencast\\.com\\/t\\/.*|' +
- 'http:\\/\\/issuu\\.com\\/.*\\/docs\\/.*|' +
- 'http:\\/\\/www\\.kickstarter\\.com\\/projects\\/.*\\/.*|' +
- 'http:\\/\\/www\\.scrapblog\\.com\\/viewer\\/viewer\\.aspx.*|' +
- 'http:\\/\\/ping\\.fm\\/p\\/.*|' +
- 'http:\\/\\/chart\\.ly\\/symbols\\/.*|' +
- 'http:\\/\\/chart\\.ly\\/.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/maps\\?.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/\\?.*|' +
- 'http:\\/\\/maps\\.google\\.com\\/maps\\/ms\\?.*|' +
- 'http:\\/\\/.*\\.craigslist\\.org\\/.*\\/.*|' +
- 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/show\\.dml\\?id=.*|' +
- 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/showpic\\.dml\\?album=.*&picture=.*|' +
- 'http:\\/\\/tumblr\\.com\\/.*|' +
- 'http:\\/\\/.*\\.tumblr\\.com\\/post\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/polls\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/multiple_choice_polls\\/.*|' +
- 'http:\\/\\/www\\.polleverywhere\\.com\\/free_text_polls\\/.*|' +
- 'http:\\/\\/www\\.quantcast\\.com\\/wd:.*|' +
- 'http:\\/\\/www\\.quantcast\\.com\\/.*|' +
- 'http:\\/\\/siteanalytics\\.compete\\.com\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/e\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/.*\\/teams\\/.*\\/.*|' +
- 'http:\\/\\/statsheet\\.com\\/tools\\/chartlets\\?chart=.*|' +
- 'http:\\/\\/.*\\.status\\.net\\/notice\\/.*|' +
- 'http:\\/\\/identi\\.ca\\/notice\\/.*|' +
- 'http:\\/\\/brainbird\\.net\\/notice\\/.*|' +
- 'http:\\/\\/shitmydadsays\\.com\\/notice\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Profile\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/l\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Groups\\/Overview\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Info\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Install\\/.*|' +
- 'http:\\/\\/www\\.studivz\\.net\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Profile\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/l\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Groups\\/Overview\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Info\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Install\\/.*|' +
- 'http:\\/\\/www\\.meinvz\\.net\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/Profile\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/l\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/Groups\\/Overview\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Info\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Install\\/.*|' +
- 'http:\\/\\/www\\.schuelervz\\.net\\/.*|' +
- 'http:\\/\\/myloc\\.me\\/.*|' +
- 'http:\\/\\/pastebin\\.com\\/.*|' +
- 'http:\\/\\/pastie\\.org\\/.*|' +
- 'http:\\/\\/www\\.pastie\\.org\\/.*|' +
- 'http:\\/\\/redux\\.com\\/stream\\/item\\/.*\\/.*|' +
- 'http:\\/\\/redux\\.com\\/f\\/.*\\/.*|' +
- 'http:\\/\\/www\\.redux\\.com\\/stream\\/item\\/.*\\/.*|' +
- 'http:\\/\\/www\\.redux\\.com\\/f\\/.*\\/.*|' +
- 'http:\\/\\/cl\\.ly\\/.*|' +
- 'http:\\/\\/cl\\.ly\\/.*\\/content|' +
- 'http:\\/\\/speakerdeck\\.com\\/u\\/.*\\/p\\/.*|' +
- 'http:\\/\\/www\\.kiva\\.org\\/lend\\/.*|' +
- 'http:\\/\\/www\\.timetoast\\.com\\/timelines\\/.*|' +
- 'http:\\/\\/storify\\.com\\/.*\\/.*|' +
- 'http:\\/\\/.*meetup\\.com\\/.*|' +
- 'http:\\/\\/meetu\\.ps\\/.*|' +
- 'http:\\/\\/www\\.dailymile\\.com\\/people\\/.*\\/entries\\/.*|' +
- 'http:\\/\\/.*\\.kinomap\\.com\\/.*|' +
- 'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/content\\/.*|' +
- 'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/media\\/.*|' +
- 'http:\\/\\/prezi\\.com\\/.*\\/.*|' +
- 'http:\\/\\/.*\\.uservoice\\.com\\/.*\\/suggestions\\/.*|' +
- 'http:\\/\\/formspring\\.me\\/.*|' +
- 'http:\\/\\/www\\.formspring\\.me\\/.*|' +
- 'http:\\/\\/formspring\\.me\\/.*\\/q\\/.*|' +
- 'http:\\/\\/www\\.formspring\\.me\\/.*\\/q\\/.*|' +
- 'http:\\/\\/twitlonger\\.com\\/show\\/.*|' +
- 'http:\\/\\/www\\.twitlonger\\.com\\/show\\/.*|' +
- 'http:\\/\\/tl\\.gd\\/.*|' +
- 'http:\\/\\/www\\.qwiki\\.com\\/q\\/.*|' +
- 'http:\\/\\/crocodoc\\.com\\/.*|' +
- 'http:\\/\\/.*\\.crocodoc\\.com\\/.*|' +
- 'https:\\/\\/crocodoc\\.com\\/.*|' +
- 'https:\\/\\/.*\\.crocodoc\\.com\\/.*|' +
- 'http:\\/\\/www\\.wikipedia\\.org\\/wiki\\/.*|' +
- 'http:\\/\\/www\\.wikimedia\\.org\\/wiki\\/File.*|' +
- 'https:\\/\\/urtak\\.com\\/u\\/.*|' +
- 'https:\\/\\/urtak\\.com\\/clr\\/.*|' +
- 'http:\\/\\/graphicly\\.com\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/.*yfrog\\..*\\/.*|' +
- 'http:\\/\\/www\\.flickr\\.com\\/photos\\/.*|' +
- 'http:\\/\\/flic\\.kr\\/.*|' +
- 'http:\\/\\/twitpic\\.com\\/.*|' +
- 'http:\\/\\/www\\.twitpic\\.com\\/.*|' +
- 'http:\\/\\/twitpic\\.com\\/photos\\/.*|' +
- 'http:\\/\\/www\\.twitpic\\.com\\/photos\\/.*|' +
- 'http:\\/\\/.*imgur\\.com\\/.*|' +
- 'http:\\/\\/.*\\.posterous\\.com\\/.*|' +
- 'http:\\/\\/post\\.ly\\/.*|' +
- 'http:\\/\\/twitgoo\\.com\\/.*|' +
- 'http:\\/\\/i.*\\.photobucket\\.com\\/albums\\/.*|' +
- 'http:\\/\\/s.*\\.photobucket\\.com\\/albums\\/.*|' +
- 'http:\\/\\/phodroid\\.com\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.mobypicture\\.com\\/user\\/.*\\/view\\/.*|' +
- 'http:\\/\\/moby\\.to\\/.*|' +
- 'http:\\/\\/xkcd\\.com\\/.*|' +
- 'http:\\/\\/www\\.xkcd\\.com\\/.*|' +
- 'http:\\/\\/imgs\\.xkcd\\.com\\/.*|' +
- 'http:\\/\\/www\\.asofterworld\\.com\\/index\\.php\\?id=.*|' +
- 'http:\\/\\/www\\.asofterworld\\.com\\/.*\\.jpg|' +
- 'http:\\/\\/asofterworld\\.com\\/.*\\.jpg|' +
- 'http:\\/\\/www\\.qwantz\\.com\\/index\\.php\\?comic=.*|' +
- 'http:\\/\\/23hq\\.com\\/.*\\/photo\\/.*|' +
- 'http:\\/\\/www\\.23hq\\.com\\/.*\\/photo\\/.*|' +
- 'http:\\/\\/.*dribbble\\.com\\/shots\\/.*|' +
- 'http:\\/\\/drbl\\.in\\/.*|' +
- 'http:\\/\\/.*\\.smugmug\\.com\\/.*|' +
- 'http:\\/\\/.*\\.smugmug\\.com\\/.*\\#.*|' +
- 'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*|' +
- 'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*\\/sizes\\/.*|' +
- 'http:\\/\\/emberapp\\.com\\/.*\\/collections\\/.*\\/.*|' +
- 'http:\\/\\/emberapp\\.com\\/.*\\/categories\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/embr\\.it\\/.*|' +
- 'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*\\#.*|' +
- 'http:\\/\\/picasaweb\\.google\\.com.*\\/lh\\/photo\\/.*|' +
- 'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*|' +
- 'http:\\/\\/dailybooth\\.com\\/.*\\/.*|' +
- 'http:\\/\\/brizzly\\.com\\/pic\\/.*|' +
- 'http:\\/\\/pics\\.brizzly\\.com\\/.*\\.jpg|' +
- 'http:\\/\\/img\\.ly\\/.*|' +
- 'http:\\/\\/www\\.tinypic\\.com\\/view\\.php.*|' +
- 'http:\\/\\/tinypic\\.com\\/view\\.php.*|' +
- 'http:\\/\\/www\\.tinypic\\.com\\/player\\.php.*|' +
- 'http:\\/\\/tinypic\\.com\\/player\\.php.*|' +
- 'http:\\/\\/www\\.tinypic\\.com\\/r\\/.*\\/.*|' +
- 'http:\\/\\/tinypic\\.com\\/r\\/.*\\/.*|' +
- 'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.jpg|' +
- 'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.png|' +
- 'http:\\/\\/meadd\\.com\\/.*\\/.*|' +
- 'http:\\/\\/meadd\\.com\\/.*|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/art\\/.*|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/gallery\\/.*|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/\\#\\/.*|' +
- 'http:\\/\\/fav\\.me\\/.*|' +
- 'http:\\/\\/.*\\.deviantart\\.com|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/gallery|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.jpg|' +
- 'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.gif|' +
- 'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.jpg|' +
- 'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.gif|' +
- 'http:\\/\\/www\\.fotopedia\\.com\\/.*\\/.*|' +
- 'http:\\/\\/fotopedia\\.com\\/.*\\/.*|' +
- 'http:\\/\\/photozou\\.jp\\/photo\\/show\\/.*\\/.*|' +
- 'http:\\/\\/photozou\\.jp\\/photo\\/photo_only\\/.*\\/.*|' +
- 'http:\\/\\/instagr\\.am\\/p\\/.*|' +
- 'http:\\/\\/instagram\\.com\\/p\\/.*|' +
- 'http:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/img\\.skitch\\.com\\/.*|' +
- 'https:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' +
- 'https:\\/\\/img\\.skitch\\.com\\/.*|' +
- 'http:\\/\\/share\\.ovi\\.com\\/media\\/.*\\/.*|' +
- 'http:\\/\\/www\\.questionablecontent\\.net\\/|' +
- 'http:\\/\\/questionablecontent\\.net\\/|' +
- 'http:\\/\\/www\\.questionablecontent\\.net\\/view\\.php.*|' +
- 'http:\\/\\/questionablecontent\\.net\\/view\\.php.*|' +
- 'http:\\/\\/questionablecontent\\.net\\/comics\\/.*\\.png|' +
- 'http:\\/\\/www\\.questionablecontent\\.net\\/comics\\/.*\\.png|' +
- 'http:\\/\\/picplz\\.com\\/.*|' +
- 'http:\\/\\/twitrpix\\.com\\/.*|' +
- 'http:\\/\\/.*\\.twitrpix\\.com\\/.*|' +
- 'http:\\/\\/www\\.someecards\\.com\\/.*\\/.*|' +
- 'http:\\/\\/someecards\\.com\\/.*\\/.*|' +
- 'http:\\/\\/some\\.ly\\/.*|' +
- 'http:\\/\\/www\\.some\\.ly\\/.*|' +
- 'http:\\/\\/pikchur\\.com\\/.*|' +
- 'http:\\/\\/achewood\\.com\\/.*|' +
- 'http:\\/\\/www\\.achewood\\.com\\/.*|' +
- 'http:\\/\\/achewood\\.com\\/index\\.php.*|' +
- 'http:\\/\\/www\\.achewood\\.com\\/index\\.php.*|' +
- 'http:\\/\\/www\\.whosay\\.com\\/content\\/.*|' +
- 'http:\\/\\/www\\.whosay\\.com\\/photos\\/.*|' +
- 'http:\\/\\/www\\.whosay\\.com\\/videos\\/.*|' +
- 'http:\\/\\/say\\.ly\\/.*|' +
- 'http:\\/\\/ow\\.ly\\/i\\/.*|' +
- 'http:\\/\\/color\\.com\\/s\\/.*|' +
- 'http:\\/\\/bnter\\.com\\/convo\\/.*|' +
- 'http:\\/\\/mlkshk\\.com\\/p\\/.*|' +
- 'http:\\/\\/lockerz\\.com\\/s\\/.*|' +
- 'http:\\/\\/lightbox\\.com\\/.*|' +
- 'http:\\/\\/www\\.lightbox\\.com\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/.*\\/dp\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/dp\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/o\\/ASIN\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/gp\\/offer-listing\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/.*\\/ASIN\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/images\\/.*|' +
- 'http:\\/\\/.*amazon\\..*\\/gp\\/aw\\/d\\/.*|' +
- 'http:\\/\\/www\\.amzn\\.com\\/.*|' +
- 'http:\\/\\/amzn\\.com\\/.*|' +
- 'http:\\/\\/www\\.shopstyle\\.com\\/browse.*|' +
- 'http:\\/\\/www\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' +
- 'http:\\/\\/api\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' +
- 'http:\\/\\/www\\.shopstyle\\.com\\/action\\/viewLook.*|' +
- 'http:\\/\\/itunes\\.apple\\.com\\/.*|' +
- 'https:\\/\\/itunes\\.apple\\.com\\/.*|' +
- 'http:\\/\\/soundcloud\\.com\\/.*|' +
- 'http:\\/\\/soundcloud\\.com\\/.*\\/.*|' +
- 'http:\\/\\/soundcloud\\.com\\/.*\\/sets\\/.*|' +
- 'http:\\/\\/soundcloud\\.com\\/groups\\/.*|' +
- 'http:\\/\\/snd\\.sc\\/.*|' +
- 'http:\\/\\/www\\.last\\.fm\\/music\\/.*|' +
- 'http:\\/\\/www\\.last\\.fm\\/music\\/+videos\\/.*|' +
- 'http:\\/\\/www\\.last\\.fm\\/music\\/+images\\/.*|' +
- 'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/_\\/.*|' +
- 'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/.*|' +
- 'http:\\/\\/www\\.mixcloud\\.com\\/.*\\/.*\\/|' +
- 'http:\\/\\/www\\.radionomy\\.com\\/.*\\/radio\\/.*|' +
- 'http:\\/\\/radionomy\\.com\\/.*\\/radio\\/.*|' +
- 'http:\\/\\/www\\.hark\\.com\\/clips\\/.*|' +
- 'http:\\/\\/www\\.rdio\\.com\\/\\#\\/artist\\/.*\\/album\\/.*|' +
- 'http:\\/\\/www\\.rdio\\.com\\/artist\\/.*\\/album\\/.*|' +
- 'http:\\/\\/www\\.zero-inch\\.com\\/.*|' +
- 'http:\\/\\/.*\\.bandcamp\\.com\\/|' +
- 'http:\\/\\/.*\\.bandcamp\\.com\\/track\\/.*|' +
- 'http:\\/\\/.*\\.bandcamp\\.com\\/album\\/.*|' +
- 'http:\\/\\/freemusicarchive\\.org\\/music\\/.*|' +
- 'http:\\/\\/www\\.freemusicarchive\\.org\\/music\\/.*|' +
- 'http:\\/\\/freemusicarchive\\.org\\/curator\\/.*|' +
- 'http:\\/\\/www\\.freemusicarchive\\.org\\/curator\\/.*|' +
- 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.npr\\.org\\/templates\\/story\\/story\\.php.*|' +
- 'http:\\/\\/huffduffer\\.com\\/.*\\/.*|' +
- 'http:\\/\\/www\\.audioboo\\.fm\\/boos\\/.*|' +
- 'http:\\/\\/audioboo\\.fm\\/boos\\/.*|' +
- 'http:\\/\\/boo\\.fm\\/b.*|' +
- 'http:\\/\\/www\\.xiami\\.com\\/song\\/.*|' +
- 'http:\\/\\/xiami\\.com\\/song\\/.*|' +
- 'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' +
- 'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' +
- 'http:\\/\\/grooveshark\\.com\\/.*|' +
- 'http:\\/\\/radioreddit\\.com\\/songs.*|' +
- 'http:\\/\\/www\\.radioreddit\\.com\\/songs.*|' +
- 'http:\\/\\/radioreddit\\.com\\/\\?q=songs.*|' +
- 'http:\\/\\/www\\.radioreddit\\.com\\/\\?q=songs.*|' +
- 'http:\\/\\/www\\.gogoyoko\\.com\\/song\\/.*|' +
- 'http:\\/\\/espn\\.go\\.com\\/video\\/clip.*|' +
- 'http:\\/\\/espn\\.go\\.com\\/.*\\/story.*|' +
- 'http:\\/\\/abcnews\\.com\\/.*\\/video\\/.*|' +
- 'http:\\/\\/abcnews\\.com\\/video\\/playerIndex.*|' +
- 'http:\\/\\/washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.boston\\.com\\/video.*|' +
- 'http:\\/\\/boston\\.com\\/video.*|' +
- 'http:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' +
- 'http:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' +
- 'http:\\/\\/www\\.facebook\\.com\\/v\\/.*|' +
- 'https:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' +
- 'https:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' +
- 'https:\\/\\/www\\.facebook\\.com\\/v\\/.*|' +
- 'http:\\/\\/cnbc\\.com\\/id\\/.*\\?.*video.*|' +
- 'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\?.*video.*|' +
- 'http:\\/\\/cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' +
- 'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' +
- 'http:\\/\\/cbsnews\\.com\\/video\\/watch\\/.*|' +
- 'http:\\/\\/www\\.google\\.com\\/buzz\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.google\\.com\\/buzz\\/.*|' +
- 'http:\\/\\/www\\.google\\.com\\/profiles\\/.*|' +
- 'http:\\/\\/google\\.com\\/buzz\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/google\\.com\\/buzz\\/.*|' +
- 'http:\\/\\/google\\.com\\/profiles\\/.*|' +
- 'http:\\/\\/www\\.cnn\\.com\\/video\\/.*|' +
- 'http:\\/\\/edition\\.cnn\\.com\\/video\\/.*|' +
- 'http:\\/\\/money\\.cnn\\.com\\/video\\/.*|' +
- 'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' +
- 'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' +
- 'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' +
- 'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' +
- 'http:\\/\\/www\\.globalpost\\.com\\/video\\/.*|' +
- 'http:\\/\\/www\\.globalpost\\.com\\/dispatch\\/.*|' +
- 'http:\\/\\/guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/www\\.guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' +
- 'http:\\/\\/bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/www\\.bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' +
- 'http:\\/\\/video\\.nationalgeographic\\.com\\/.*\\/.*\\/.*\\.html|' +
- 'http:\\/\\/dsc\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/animal\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/health\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/investigation\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/military\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/planetgreen\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/science\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/tlc\\.discovery\\.com\\/videos\\/.*|' +
- 'http:\\/\\/video\\.forbes\\.com\\/fvn\\/.*|' +
- 'http:\\/\\/recordsetter\\.com\\/*\\/*\\/*'
- , re.I
- )
-
- api_endpoint = 'http://api.embed.ly/1/oembed'
- api_params = {'format':'json', 'maxwidth':600, 'key' : g.embedly_api_key }
-
-class GenericScraper(MediaScraper):
- """a special scrapper not associated with any domains, used to
- write media objects to links by hand"""
- domains = ['*']
- height = 480
- width = 640
-
- @classmethod
- def media_embed(cls, content, height = None, width = None, scrolling = False, **kw):
- return MediaEmbed(height = height or cls.height,
- width = width or cls.width,
- scrolling = scrolling,
- content = content)
-
-class DeepScraper(object):
- """Subclasses of DeepScraper attempt to dive into generic pages
- for embeds of other types (like YouTube videos on blog
- sites)."""
-
- def find_media_object(self, scraper):
- return None
-
-class YoutubeEmbedDeepScraper(DeepScraper):
- youtube_url_re = re.compile('^(http://www.youtube.com/v/([_a-zA-Z0-9-]+)).*')
-
- def find_media_object(self, scraper):
- # try to find very simple youtube embeds
- if not scraper.soup:
- scraper.download()
-
- if scraper.soup:
- movie_embed = scraper.soup.find('embed',
- attrs={'src': lambda x: self.youtube_url_re.match(x)})
- if movie_embed:
- youtube_id = self.youtube_url_re.match(movie_embed['src']).group(2)
- youtube_url = 'http://www.youtube.com/watch?v=%s"' % youtube_id
- log.debug('found youtube embed %s' % youtube_url)
- mo = make_scraper(youtube_url).media_object()
- mo['deep'] = scraper.url
- return mo
-
-#scrapers =:= dict(domain -> ScraperClass)
-scrapers = {}
-for scraper in [ EmbedlyOEmbed,
- YoutubeScraper,
- MetacafeScraper,
- GootubeScraper,
- VimeoScraper,
- BreakScraper,
- TheOnionScraper,
- CollegeHumorScraper,
- FunnyOrDieScraper,
- ComedyCentralScraper,
- ColbertNationScraper,
- TheDailyShowScraper,
- TedScraper,
- LiveLeakScraper,
- DailyMotionScraper,
- RevverScraper,
- EscapistScraper,
- JustintvScraper,
- SoundcloudScraper,
- CraigslistScraper,
- GenericScraper,
- ]:
- for domain in scraper.domains:
- scrapers.setdefault(domain, []).append(scraper)
-
-deepscrapers = [YoutubeEmbedDeepScraper]
-
-def get_media_embed(media_object):
- for scraper in scrapers.get(media_object['type'], []):
- res = scraper.media_embed(**media_object)
- if res:
- return res
- if 'content' in media_object:
- return GenericScraper.media_embed(**media_object)
-
-def convert_old_media_objects():
- q = Link._query(Link.c.media_object is not None,
- Link.c._date > whenever,
- data = True)
- for link in utils.fetch_things2(q):
- if not getattr(link, 'media_object', None):
- continue
-
- if 'youtube' in link.media_object:
- # we can rewrite this one without scraping
- video_id = YoutubeScraper.video_id_rx.match(link.url)
- link.media_object = dict(type='youtube.com',
- video_id = video_id.group(1))
- elif ('video.google.com' in link.media_object
- or 'metacafe' in link.media_object):
- scraper = make_scraper(link.url)
- if not scraper:
- continue
- mo = scraper.media_object()
- if not mo:
- continue
-
- link.media_object = mo
-
- else:
- print "skipping %s because it confuses me" % link._fullname
- continue
-
- link._commit()
-
-test_urls = [
- 'http://www.facebook.com/pages/Rick-Astley/5807213510?sid=c99aaf3888171e73668a38e0749ae12d', # regular thumbnail finder
- 'http://www.flickr.com/photos/septuagesima/317819584/', # thumbnail with image_src
-
- #'http://www.youtube.com/watch?v=Yu_moia-oVI',
- 'http://www.metacafe.com/watch/sy-1473689248/rick_astley_never_gonna_give_you_up_official_music_video/',
- 'http://video.google.com/videoplay?docid=5908758151704698048',
- #'http://vimeo.com/4495451',
- 'http://www.break.com/usercontent/2008/11/Macy-s-Thankgiving-Day-Parade-Rick-Roll-611965.html',
- 'http://www.theonion.com/content/video/sony_releases_new_stupid_piece_of',
- 'http://www.collegehumor.com/video:1823712',
- 'http://www.funnyordie.com/videos/7f2a184755/macys-thanksgiving-day-parade-gets-rick-rolled-from-that-happened',
- 'http://www.comedycentral.com/videos/index.jhtml?videoId=178342&title=ultimate-fighting-vs.-bloggers',
-
- # old style
- 'http://www.thedailyshow.com/video/index.jhtml?videoId=175244&title=Photoshop-of-Horrors',
- # new style
- 'http://www.thedailyshow.com/watch/wed-july-22-2009/the-born-identity',
-
- 'http://www.colbertnation.com/the-colbert-report-videos/63549/may-01-2006/sign-off---spam',
- 'http://www.liveleak.com/view?i=e09_1207983531',
- 'http://www.dailymotion.com/relevance/search/rick+roll/video/x5l8e6_rickroll_fun',
- 'http://revver.com/video/1199591/rick-rolld-at-work/',
- 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10-The-Orange-Box',
- 'http://www.escapistmagazine.com/videos/view/unskippable/736-Lost-Odyssey',
-
- # justin.tv has two media types that we care about, streams, which
- # we can scrape, and clips, which we can't
- 'http://www.justin.tv/help', # stream
- 'http://www.justin.tv/clip/c07a333f94e5716b', # clip, which we can't currently scrape, and shouldn't try
-
- 'http://soundcloud.com/kalhonaaho01/never-gonna-stand-you-up-rick-astley-vs-ludacris-album-version',
-
- 'http://www.craigslist.org/about/best/sea/240705630.html',
-
- 'http://listen.grooveshark.com/#/song/Never_Gonna_Give_You_Up/12616328',
- 'http://tinysong.com/2WOJ', # also Grooveshark
- 'http://www.slideshare.net/doina/happy-easter-from-holland-slideshare',
- 'http://www.slideshare.net/stinson/easter-1284190',
- 'http://www.slideshare.net/angelspascual/easter-events',
- 'http://www.slideshare.net/sirrods/happy-easter-3626014',
- 'http://www.slideshare.net/sirrods/happy-easter-wide-screen',
- 'http://www.slideshare.net/carmen_serbanescu/easter-holiday',
- 'http://www.slideshare.net/Lithuaniabook/easter-1255880',
- 'http://www.slideshare.net/hues/easter-plants',
- 'http://www.slideshare.net/Gospelman/passover-week',
- 'http://www.slideshare.net/angelspascual/easter-around-the-world-1327542',
- 'http://www.scribd.com/doc/13994900/Easter',
- 'http://www.scribd.com/doc/27425714/Celebrating-Easter-ideas-for-adults-and-children',
- 'http://www.scribd.com/doc/28010101/Easter-Foods-No-Name',
- 'http://www.scribd.com/doc/28452730/Easter-Cards',
- 'http://www.scribd.com/doc/19026714/The-Easter-Season',
- 'http://www.scribd.com/doc/29183659/History-of-Easter',
- 'http://www.scribd.com/doc/15632842/The-Last-Easter',
- 'http://www.scribd.com/doc/28741860/The-Plain-Truth-About-Easter',
- 'http://www.scribd.com/doc/23616250/4-27-08-ITS-EASTER-AGAIN-ORTHODOX-EASTER-by-vanderKOK',
- 'http://screenr.com/t9d',
- 'http://screenr.com/yLS',
- 'http://screenr.com/gzS',
- 'http://screenr.com/IwU',
- 'http://screenr.com/FM7',
- 'http://screenr.com/Ejg',
- 'http://screenr.com/u4h',
- 'http://screenr.com/QiN',
- 'http://screenr.com/zts',
- 'http://www.5min.com/Video/How-to-Decorate-Easter-Eggs-with-Decoupage-142076462',
- 'http://www.5min.com/Video/How-to-Color-Easter-Eggs-Dye-142076281',
- 'http://www.5min.com/Video/How-to-Make-an-Easter-Egg-Diorama-142076482',
- 'http://www.5min.com/Video/How-to-Make-Sequined-Easter-Eggs-142076512',
- 'http://www.5min.com/Video/How-to-Decorate-Wooden-Easter-Eggs-142076558',
- 'http://www.5min.com/Video/How-to-Blow-out-an-Easter-Egg-142076367',
- 'http://www.5min.com/Video/Learn-About-Easter-38363995',
- 'http://www.howcast.com/videos/368909-Easter-Egg-Dying-How-To-Make-Ukrainian-Easter-Eggs',
- 'http://www.howcast.com/videos/368911-Easter-Egg-Dying-How-To-Color-Easter-Eggs-With-Food-Dyes',
- 'http://www.howcast.com/videos/368913-Easter-Egg-Dying-How-To-Make-Homemade-Easter-Egg-Dye',
- 'http://www.howcast.com/videos/220110-The-Meaning-Of-Easter',
- 'http://my.opera.com/nirvanka/albums/show.dml?id=519866',
- 'http://img402.yfrog.com/i/mfe.jpg/',
- 'http://img20.yfrog.com/i/dy6.jpg/',
- 'http://img145.yfrog.com/i/4mu.mp4/',
- 'http://img15.yfrog.com/i/mygreatmovie.mp4/',
- 'http://img159.yfrog.com/i/500x5000401.jpg/',
- 'http://tweetphoto.com/14784358',
- 'http://tweetphoto.com/16044847',
- 'http://tweetphoto.com/16718883',
- 'http://tweetphoto.com/16451148',
- 'http://tweetphoto.com/16133984',
- 'http://tweetphoto.com/8069529',
- 'http://tweetphoto.com/16207556',
- 'http://tweetphoto.com/7448361',
- 'http://tweetphoto.com/16069325',
- 'http://tweetphoto.com/4791033',
- 'http://www.flickr.com/photos/10349896@N08/4490293418/',
- 'http://www.flickr.com/photos/mneylon/4483279051/',
- 'http://www.flickr.com/photos/xstartxtodayx/4488996521/',
- 'http://www.flickr.com/photos/mommyknows/4485313917/',
- 'http://www.flickr.com/photos/29988430@N06/4487127638/',
- 'http://www.flickr.com/photos/excomedia/4484159563/',
- 'http://www.flickr.com/photos/sunnybrook100/4471526636/',
- 'http://www.flickr.com/photos/jaimewalsh/4489497178/',
- 'http://www.flickr.com/photos/29988430@N06/4486475549/',
- 'http://www.flickr.com/photos/22695183@N08/4488681694/',
- 'http://twitpic.com/1cnsf6',
- 'http://twitpic.com/1cgtti',
- 'http://twitpic.com/1coc0n',
- 'http://twitpic.com/1cm8us',
- 'http://twitpic.com/1cgks4',
- 'http://imgur.com/6pLoN',
- 'http://onegoodpenguin.posterous.com/golden-tee-live-2010-easter-egg',
- 'http://adland.posterous.com/?tag=royaleastershowauckland',
- 'http://apartmentliving.posterous.com/biggest-easter-egg-hunts-in-the-dc-area',
- 'http://twitgoo.com/1as',
- 'http://twitgoo.com/1p94',
- 'http://twitgoo.com/4kg2',
- 'http://twitgoo.com/6c9',
- 'http://twitgoo.com/1w5',
- 'http://twitgoo.com/6mu',
- 'http://twitgoo.com/1w3',
- 'http://twitgoo.com/1om',
- 'http://twitgoo.com/1mh',
- 'http://www.qwantz.com/index.php?comic=1686',
- 'http://www.qwantz.com/index.php?comic=773',
- 'http://www.qwantz.com/index.php?comic=1018',
- 'http://www.qwantz.com/index.php?comic=1019',
- 'http://www.23hq.com/mhg/photo/5498347',
- 'http://www.23hq.com/Greetingdesignstudio/photo/5464607',
- 'http://www.23hq.com/Greetingdesignstudio/photo/5464590',
- 'http://www.23hq.com/Greetingdesignstudio/photo/5464605',
- 'http://www.23hq.com/Greetingdesignstudio/photo/5464604',
- 'http://www.23hq.com/dvilles2/photo/5443192',
- 'http://www.23hq.com/Greetingdesignstudio/photo/5464606',
- 'http://www.youtube.com/watch?v=gghKdx558Qg',
- 'http://www.youtube.com/watch?v=yPid9BLQQcg',
- 'http://www.youtube.com/watch?v=uEo2vboUYUk',
- 'http://www.youtube.com/watch?v=geUhtoHbLu4',
- 'http://www.youtube.com/watch?v=Zk7dDekYej0',
- 'http://www.youtube.com/watch?v=Q3tgMosx_tI',
- 'http://www.youtube.com/watch?v=s9P8_vgmLfs',
- 'http://www.youtube.com/watch?v=1cmtN1meMmk',
- 'http://www.youtube.com/watch?v=AVzj-U5Ihm0',
- 'http://www.veoh.com/collection/easycookvideos/watch/v366931kcdgj7Hd',
- 'http://www.veoh.com/collection/easycookvideos/watch/v366991zjpANrqc',
- 'http://www.veoh.com/browse/videos/category/educational/watch/v7054535EZGFJqyX',
- 'http://www.veoh.com/browse/videos/category/lifestyle/watch/v18155013XBBtnYwq',
- 'http://www.justin.tv/easter7presents',
- 'http://www.justin.tv/easterfraud',
- 'http://www.justin.tv/cccog27909',
- 'http://www.justin.tv/clip/6e8c18f7050',
- 'http://www.justin.tv/venom24',
- 'http://qik.com/video/1622287',
- 'http://qik.com/video/1503735',
- 'http://qik.com/video/40504',
- 'http://qik.com/video/1445763',
- 'http://qik.com/video/743285',
- 'http://qik.com/video/1445299',
- 'http://qik.com/video/1443200',
- 'http://qik.com/video/1445889',
- 'http://qik.com/video/174242',
- 'http://qik.com/video/1444897',
- 'http://revision3.com/hak5/DualCore',
- 'http://revision3.com/popsiren/charm',
- 'http://revision3.com/tekzilla/eyefinity',
- 'http://revision3.com/diggnation/2005-10-06',
- 'http://revision3.com/hak5/netcat-virtualization-wordpress/',
- 'http://revision3.com/infected/forsaken',
- 'http://revision3.com/hak5/purepwnage',
- 'http://revision3.com/tekzilla/wowheadset',
- 'http://www.dailymotion.com/video/xcstzd_greek-wallets-tighten-during-easter_news',
- 'http://www.dailymotion.com/video/xcso4y_exclusive-easter-eggs-easter-basket_lifestyle',
- 'http://www.dailymotion.com/video/x2sgkt_evil-easter-bunny',
- 'http://www.dailymotion.com/video/xco7oc_invitation-to-2010-easter-services_news',
- 'http://www.dailymotion.com/video/xcss6b_big-cat-easter_animals',
- 'http://www.dailymotion.com/video/xcszw1_easter-bunny-visits-buenos-aires-zo_news',
- 'http://www.dailymotion.com/video/xcsfvs_forecasters-warn-of-easter-misery_news',
- 'http://www.collegehumor.com/video:1682246',
- 'http://www.twitvid.com/D9997',
- 'http://www.twitvid.com/902B9',
- 'http://www.twitvid.com/C33F8',
- 'http://www.twitvid.com/63F73',
- 'http://www.twitvid.com/BC0BA',
- 'http://www.twitvid.com/1C33C',
- 'http://www.twitvid.com/8A8E2',
- 'http://www.twitvid.com/51035',
- 'http://www.twitvid.com/5C733',
- 'http://www.break.com/game-trailers/game/just-cause-2/just-cause-2-lost-easter-egg?res=1',
- 'http://www.break.com/usercontent/2010/3/10/easter-holiday-2009-slideshow-1775624',
- 'http://www.break.com/index/a-very-sexy-easter-video.html',
- 'http://www.break.com/usercontent/2010/3/11/this-video-features-gizzi-erskine-making-easter-cookies-1776089',
- 'http://www.break.com/usercontent/2007/4/4/happy-easter-265717',
- 'http://www.break.com/usercontent/2007/4/17/extreme-easter-egg-hunting-276064',
- 'http://www.break.com/usercontent/2006/11/18/the-evil-easter-bunny-184789',
- 'http://www.break.com/usercontent/2006/4/16/hoppy-easter-kitty-91040',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104063637',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004674',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103928002',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103999188',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103920940',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103981831',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004673',
- 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104046456',
- 'http://www.metacafe.com/watch/105023/the_easter_bunny/',
- 'http://www.metacafe.com/watch/4376131/easter_lay/',
- 'http://www.metacafe.com/watch/2245996/how_to_make_ukraine_easter_eggs/',
- 'http://www.metacafe.com/watch/4374339/easter_eggs/',
- 'http://www.metacafe.com/watch/2605860/filled_easter_baskets/',
- 'http://www.metacafe.com/watch/2372088/easter_eggs/',
- 'http://www.metacafe.com/watch/3043671/www_goodnews_ws_easter_island/',
- 'http://www.metacafe.com/watch/1652057/easter_eggs/',
- 'http://www.metacafe.com/watch/1173632/ultra_kawaii_easter_bunny_party/',
- 'http://celluloidremix.blip.tv/file/3378272/',
- 'http://blip.tv/file/449469',
- 'http://blip.tv/file/199776',
- 'http://blip.tv/file/766967',
- 'http://blip.tv/file/770127',
- 'http://blip.tv/file/854925',
- 'http://www.blip.tv/file/22695?filename=Uncle_dale-THEEASTERBUNNYHATESYOU395.flv',
- 'http://iofa.blip.tv/file/3412333/',
- 'http://blip.tv/file/190393',
- 'http://blip.tv/file/83152',
- 'http://video.google.com/videoplay?docid=-5427138374898988918&q=easter+bunny&pl=true',
- 'http://video.google.com/videoplay?docid=7785441737970480237',
- 'http://video.google.com/videoplay?docid=2320995867449957036',
- 'http://video.google.com/videoplay?docid=-2586684490991458032&q=peeps&pl=true',
- 'http://video.google.com/videoplay?docid=5621139047118918034',
- 'http://video.google.com/videoplay?docid=4232304376070958848',
- 'http://video.google.com/videoplay?docid=-6612726032157145299',
- 'http://video.google.com/videoplay?docid=4478549130377875994&hl=en',
- 'http://video.google.com/videoplay?docid=9169278170240080877',
- 'http://video.google.com/videoplay?docid=2551240967354893096',
- 'http://video.yahoo.com/watch/7268801/18963438',
- 'http://video.yahoo.com/watch/2224892/7014048',
- 'http://video.yahoo.com/watch/7244748/18886014',
- 'http://video.yahoo.com/watch/4656845/12448951',
- 'http://video.yahoo.com/watch/363942/2249254',
- 'http://video.yahoo.com/watch/2232968/7046348',
- 'http://video.yahoo.com/watch/4530253/12135472',
- 'http://video.yahoo.com/watch/2237137/7062908',
- 'http://video.yahoo.com/watch/952841/3706424',
- 'http://www.viddler.com/explore/BigAppleChannel/videos/113/',
- 'http://www.viddler.com/explore/cheezburger/videos/379/',
- 'http://www.viddler.com/explore/warnerbros/videos/350/',
- 'http://www.viddler.com/explore/tvcgroup/videos/169/',
- 'http://www.viddler.com/explore/thebrickshow/videos/12/',
- 'http://www.liveleak.com/view?i=e0b_1239827917',
- 'http://www.liveleak.com/view?i=715_1239490211',
- 'http://www.liveleak.com/view?i=d30_1206233786&p=1',
- 'http://www.liveleak.com/view?i=d91_1239548947',
- 'http://www.liveleak.com/view?i=f58_1190741182',
- 'http://www.liveleak.com/view?i=44e_1179885621&c=1',
- 'http://www.liveleak.com/view?i=451_1188059885',
- 'http://www.liveleak.com/view?i=3f5_1267456341&c=1',
- 'http://www.hulu.com/watch/67313/howcast-how-to-make-braided-easter-bread',
- 'http://www.hulu.com/watch/133583/access-hollywood-glees-matthew-morrison-on-touring-and-performing-for-president-obama',
- 'http://www.hulu.com/watch/66319/saturday-night-live-easter-album',
- 'http://www.hulu.com/watch/80229/explorer-end-of-easter-island',
- 'http://www.hulu.com/watch/139020/nbc-today-show-lamb-and-ham-create-easter-feast',
- 'http://www.hulu.com/watch/84272/rex-the-runt-easter-island',
- 'http://www.hulu.com/watch/132203/everyday-italian-easter-pie',
- 'http://www.hulu.com/watch/23349/nova-secrets-of-lost-empires-ii-easter-island',
- 'http://movieclips.com/watch/dirty_harry_1971/do_you_feel_lucky_punk/',
- 'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/',
- 'http://movieclips.com/watch/dumb__dumber_1994/the_toilet_doesnt_flush/',
- 'http://movieclips.com/watch/jaws_1975/youre_gonna_need_a_bigger_boat/',
- 'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/61.495/75.413',
- 'http://movieclips.com/watch/super_troopers_2001/the_cat_game/12.838/93.018',
- 'http://movieclips.com/watch/this_is_spinal_tap_1984/these_go_to_eleven/79.703/129.713',
- 'http://crackle.com/c/Originals/What_s_the_deal_with_Easter_candy_/2303243',
- 'http://crackle.com/c/How_To/Dryer_Lint_Easter_Bunny_Trailer_Park_Craft/2223902',
- 'http://crackle.com/c/How_To/Pagan_Origin_of_Easter_Easter_Egg_Rabbit_Playb_/2225124',
- 'http://crackle.com/c/Funny/Happy_Easter/2225363',
- 'http://crackle.com/c/Funny/Crazy_and_Hilarious_Easter_Egg_Hunt/2225737',
- 'http://crackle.com/c/How_To/Learn_About_Greek_Orthodox_Easter/2262294',
- 'http://crackle.com/c/How_To/How_to_Make_Ukraine_Easter_Eggs/2262274',
- 'http://crackle.com/c/How_To/Symbolism_Of_Ukrainian_Easter_Eggs/2262267',
- 'http://crackle.com/c/Funny/Easter_Retard/931976',
- 'http://www.fancast.com/tv/It-s-the-Easter-Beagle,-Charlie-Brown/74789/1078053475/Peanuts:-Specials:-It-s-the-Easter-Beagle,-Charlie-Brown/videos',
- 'http://www.fancast.com/movies/Easter-Parade/97802/687440525/Easter-Parade/videos',
- 'http://www.fancast.com/tv/Saturday-Night-Live/10009/1083396482/Easter-Album/videos',
- 'http://www.fancast.com/movies/The-Proposal/147176/1140660489/The-Proposal:-Easter-Egg-Hunt/videos',
- 'http://www.funnyordie.com/videos/f6883f54ae/the-unsettling-ritualistic-origin-of-the-easter-bunny',
- 'http://www.funnyordie.com/videos/3ccb03863e/easter-tail-keaster-bunny',
- 'http://www.funnyordie.com/videos/17b1d36ad0/easter-bunny-from-leatherfink',
- 'http://www.funnyordie.com/videos/0c55aa116d/easter-exposed-from-bryan-erwin',
- 'http://www.funnyordie.com/videos/040dac4eff/easter-eggs',
- 'http://vimeo.com/10446922',
- 'http://vimeo.com/10642542',
- 'http://www.vimeo.com/10664068',
- 'http://vimeo.com/819176',
- 'http://www.vimeo.com/10525353',
- 'http://vimeo.com/10429123',
- 'http://www.vimeo.com/10652053',
- 'http://vimeo.com/10572216',
- 'http://www.ted.com/talks/jared_diamond_on_why_societies_collapse.html',
- 'http://www.ted.com/talks/nathan_myhrvold_on_archeology_animal_photography_bbq.html',
- 'http://www.ted.com/talks/johnny_lee_demos_wii_remote_hacks.html',
- 'http://www.ted.com/talks/robert_ballard_on_exploring_the_oceans.html',
- 'http://www.omnisio.com/v/Z3QxbTUdjhG/wall-e-collection-of-videos',
- 'http://www.omnisio.com/v/3ND6LTvdjhG/php-tutorial-4-login-form-updated',
- 'http://www.thedailyshow.com/watch/thu-december-14-2000/intro---easter',
- 'http://www.thedailyshow.com/watch/tue-april-18-2006/headlines---easter-charade',
- 'http://www.thedailyshow.com/watch/tue-april-18-2006/egg-beaters',
- 'http://www.thedailyshow.com/watch/tue-april-18-2006/moment-of-zen---scuba-diver-hiding-easter-eggs',
- 'http://www.thedailyshow.com/watch/tue-april-7-2009/easter---passover-highlights',
- 'http://www.thedailyshow.com/watch/tue-february-29-2000/headlines---leap-impact',
- 'http://www.thedailyshow.com/watch/thu-march-1-2007/tomb-with-a-jew',
- 'http://www.thedailyshow.com/watch/mon-april-24-2000/the-meaning-of-passover',
- 'http://www.colbertnation.com/the-colbert-report-videos/268800/march-31-2010/easter-under-attack---peeps-display-update',
- 'http://www.colbertnation.com/the-colbert-report-videos/268797/march-31-2010/intro---03-31-10',
- 'http://www.colbertnation.com/full-episodes/wed-march-31-2010-craig-mullaney',
- 'http://www.colbertnation.com/the-colbert-report-videos/60902/march-28-2006/the-word---easter-under-attack---marketing',
- 'http://www.colbertnation.com/the-colbert-report-videos/83362/march-07-2007/easter-under-attack---bunny',
- 'http://www.colbertnation.com/the-colbert-report-videos/61404/april-06-2006/easter-under-attack---recalled-eggs?videoId=61404',
- 'http://www.colbertnation.com/the-colbert-report-videos/223957/april-06-2009/colbert-s-easter-parade',
- 'http://www.colbertnation.com/the-colbert-report-videos/181772/march-28-2006/intro---3-28-06',
- 'http://www.traileraddict.com/trailer/despicable-me/easter-greeting',
- 'http://www.traileraddict.com/trailer/easter-parade/trailer',
- 'http://www.traileraddict.com/clip/the-proposal/easter-egg-hunt',
- 'http://www.traileraddict.com/trailer/despicable-me/international-teaser-trailer',
- 'http://www.traileraddict.com/trailer/despicable-me/today-show-minions',
- 'http://revver.com/video/263817/happy-easter/',
- 'http://www.revver.com/video/1574939/easter-bunny-house/',
- 'http://revver.com/video/771140/easter-08/',
- ]
-
-def submit_all():
- from r2.models import Subreddit, Account, Link, NotFound
- from r2.lib.media import set_media
- from r2.lib.db import queries
- sr = Subreddit._by_name('testmedia')
- author = Account._by_name('testmedia')
- links = []
- for url in test_urls:
- try:
- # delete any existing version of the link
- l = Link._by_url(url, sr)
- print "Deleting %s" % l
- l._deleted = True
- l._commit()
- except NotFound:
- pass
-
- l = Link._submit(url, url, author, sr, '0.0.0.0')
-
- try:
- set_media(l)
- except Exception, e:
- print e
-
- queries.new_link(l)
-
- links.append(l)
-
- return links
-
-def test_real(nlinks):
- from r2.models import Link, desc
- from r2.lib.utils import fetch_things2
-
- counter = 0
- q = Link._query(sort = desc("_date"))
-
- print "
"
- for l in fetch_things2(q):
- if counter > nlinks:
- break
- if not l.is_self:
- h = make_scraper(l.url)
- mo = h.media_object()
- print "scraper: %s" % mo
- if mo:
- print get_media_embed(mo).content
- counter +=1
- print "
"
-
-def test_url(url):
- import sys
- from r2.lib.filters import websafe
- sys.stderr.write("%s\n" % url)
- print "
"
- h = make_scraper(url)
- print ""
- print "", websafe(url), " "
- print " "
- print websafe(repr(h))
- img = h.largest_image_url()
- if img:
- print " " % img
- else:
- print "(no image) "
- mo = h.media_object()
- print ""
- if mo:
- print get_media_embed(mo).content
- else:
- print "None"
- print " "
- print " "
-
-def test():
- """Take some example URLs and print out a nice pretty HTML table
- of their extracted thubmnails and media objects"""
- print "
"
- for url in test_urls:
- test_url(url)
- print "
"
-