From 252437da170d2505a0c8dab7b40b1c6687f695da Mon Sep 17 00:00:00 2001 From: Neil Williams Date: Thu, 29 Aug 2013 10:50:21 -0700 Subject: [PATCH] Overhaul scraper_q processing. This is primarily about removing the dead old scrapers in favor of a streamlined oEmbed-based system. The new embed.ly scraper uses their Service API to determine which URLs can be scraped through them. This removes the giant list of domains and regexes in the code and makes the scraper_q proc always have the latest list of scrapable domains. --- r2/r2/controllers/api.py | 2 +- r2/r2/controllers/mediaembed.py | 7 +- r2/r2/lib/jsontemplates.py | 2 +- r2/r2/lib/media.py | 380 ++++++- r2/r2/lib/pages/pages.py | 3 +- r2/r2/lib/scraper.py | 1864 ------------------------------- 6 files changed, 348 insertions(+), 1910 deletions(-) delete mode 100644 r2/r2/lib/scraper.py diff --git a/r2/r2/controllers/api.py b/r2/r2/controllers/api.py index 38c41dd2e..7abed0dcb 100755 --- a/r2/r2/controllers/api.py +++ b/r2/r2/controllers/api.py @@ -66,7 +66,7 @@ from r2.lib import tracking, cssfilter, emailer from r2.lib.subreddit_search import search_reddits from r2.lib.log import log_text from r2.lib.filters import safemarkdown -from r2.lib.scraper import str_to_image +from r2.lib.media import str_to_image from r2.controllers.api_docs import api_doc, api_section from r2.lib.search import SearchQuery from r2.controllers.oauth2 import OAuth2ResourceController, require_oauth2_scope diff --git a/r2/r2/controllers/mediaembed.py b/r2/r2/controllers/mediaembed.py index 756259435..24ea5d35c 100644 --- a/r2/r2/controllers/mediaembed.py +++ b/r2/r2/controllers/mediaembed.py @@ -25,7 +25,7 @@ from pylons.controllers.util import abort from r2.controllers.reddit_base import MinimalController from r2.lib.pages import MediaEmbedBody -from r2.lib.scraper import get_media_embed +from r2.lib.media import get_media_embed from r2.lib.validator import validate, VLink @@ -39,11 +39,6 @@ class MediaembedController(MinimalController): if not link or not link.media_object: abort(404) - - if isinstance(link.media_object, basestring): - # it's an old-style string - content = link.media_object - elif isinstance(link.media_object, dict): # otherwise it's the new style, which is a dict(type=type, **args) media_embed = get_media_embed(link.media_object) diff --git a/r2/r2/lib/jsontemplates.py b/r2/r2/lib/jsontemplates.py index b63371fe7..190532118 100755 --- a/r2/r2/lib/jsontemplates.py +++ b/r2/r2/lib/jsontemplates.py @@ -414,7 +414,7 @@ class LinkJsonTemplate(ThingJsonTemplate): ) def thing_attr(self, thing, attr): - from r2.lib.scraper import get_media_embed + from r2.lib.media import get_media_embed if attr == "media_embed": if (thing.media_object and not isinstance(thing.media_object, basestring)): diff --git a/r2/r2/lib/media.py b/r2/r2/lib/media.py index 5ee7d7102..296ae5398 100644 --- a/r2/r2/lib/media.py +++ b/r2/r2/lib/media.py @@ -20,36 +20,144 @@ # Inc. All Rights Reserved. ############################################################################### -import subprocess - -from pylons import g, config - -from r2.models.link import Link -from r2.lib import s3cp -from r2.lib.utils import timeago, fetch_things2 -from r2.lib.utils import TimeoutFunction, TimeoutFunctionException -from r2.lib.db.operators import desc -from r2.lib.scraper import make_scraper, str_to_image, image_to_str, prepare_image -from r2.lib import amqp -from r2.lib.nymph import optimize_png - -import Image - +import base64 +import collections +import cStringIO +import hashlib +import json +import math +import mimetypes import os +import re +import subprocess import tempfile import traceback +import urllib +import urllib2 +import urlparse -import base64 -import hashlib +import BeautifulSoup +import Image +import ImageFile + +from pylons import g + +from r2.lib import amqp, s3cp +from r2.lib.memoize import memoize +from r2.lib.nymph import optimize_png +from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain +from r2.models.link import Link -import mimetypes s3_direct_url = "s3.amazonaws.com" - -threads = 20 -log = g.log - MEDIA_FILENAME_LENGTH = 12 +thumbnail_size = 70, 70 + + +def _image_to_str(image): + s = cStringIO.StringIO() + image.save(s, image.format) + return s.getvalue() + + +def str_to_image(s): + s = cStringIO.StringIO(s) + image = Image.open(s) + return image + + +def _image_entropy(img): + """calculate the entropy of an image""" + hist = img.histogram() + hist_size = sum(hist) + hist = [float(h) / hist_size for h in hist] + + return -sum(p * math.log(p, 2) for p in hist if p != 0) + + +def _square_image(img): + """if the image is taller than it is wide, square it off. determine + which pieces to cut off based on the entropy pieces.""" + x,y = img.size + while y > x: + #slice 10px at a time until square + slice_height = min(y - x, 10) + + bottom = img.crop((0, y - slice_height, x, y)) + top = img.crop((0, 0, x, slice_height)) + + #remove the slice with the least entropy + if _image_entropy(bottom) < _image_entropy(top): + img = img.crop((0, 0, x, y - slice_height)) + else: + img = img.crop((0, slice_height, x, y)) + + x,y = img.size + + return img + + +def _prepare_image(image): + image = _square_image(image) + image.thumbnail(thumbnail_size, Image.ANTIALIAS) + return image + + +def _clean_url(url): + """url quotes unicode data out of urls""" + url = url.encode('utf8') + url = ''.join(urllib.quote(c) if ord(c) >= 127 else c for c in url) + return url + + +def _initialize_request(url, referer): + url = _clean_url(url) + + if not url.startswith(("http://", "https://")): + return + + req = urllib2.Request(url) + if g.useragent: + req.add_header('User-Agent', g.useragent) + if referer: + req.add_header('Referer', referer) + return req + + +def _fetch_url(url, referer=None): + request = _initialize_request(url, referer=referer) + if not request: + return None, None + response = urllib2.urlopen(request) + return response.headers.get("Content-Type"), response.read() + + +@memoize('media.fetch_size', time=3600) +def _fetch_image_size(url, referer): + """Return the size of an image by URL downloading as little as possible.""" + + request = _initialize_request(url, referer) + if not request: + return None + + parser = ImageFile.Parser() + response = None + try: + response = urllib2.urlopen(request) + + while True: + chunk = response.read(1024) + if not chunk: + break + + parser.feed(chunk) + if parser.image: + return parser.image.size + except urllib2.URLError: + return None + finally: + if response: + response.close() def optimize_jpeg(filename, optimizer): @@ -151,29 +259,27 @@ def update_link(link, thumbnail, media_object, thumbnail_size=None): link._commit() -def set_media(link, force = False): +def _set_media(embedly_services, link, force=False): if link.is_self: return if not force and link.promoted: return elif not force and (link.has_thumbnail or link.media_object): return - - scraper = make_scraper(link.url) - thumbnail = scraper.thumbnail() - media_object = scraper.media_object() + scraper = Scraper.for_url(embedly_services, link.url) + thumbnail, media_object = scraper.scrape() if media_object: # the scraper should be able to make a media embed out of the # media object it just gave us. if not, null out the media object # to protect downstream code - res = scraper.media_embed(**media_object) + res = scraper.media_embed(media_object) if not res: print "%s made a bad media obj for link %s" % (scraper, link._id36) media_object = None - + thumbnail_url = upload_media(thumbnail) if thumbnail else None thumbnail_size = thumbnail.size if thumbnail else None @@ -181,7 +287,7 @@ def set_media(link, force = False): def force_thumbnail(link, image_data, never_expire=True, file_type=".jpg"): image = str_to_image(image_data) - image = prepare_image(image) + image = _prepare_image(image) thumb_url = upload_media(image, never_expire=never_expire, file_type=file_type) update_link(link, thumbnail=thumb_url, media_object=None, thumbnail_size=image.size) @@ -190,7 +296,7 @@ def upload_icon(file_name, image_data, size): image = str_to_image(image_data) image.format = 'PNG' image.thumbnail(size, Image.ANTIALIAS) - icon_data = image_to_str(image) + icon_data = _image_to_str(image) return s3_upload_media(icon_data, file_name=file_name, mime_type='image/png', @@ -201,16 +307,218 @@ def upload_icon(file_name, image_data, size): def can_upload_icon(): return g.media_store == 's3' + +def get_media_embed(media_object): + if not isinstance(media_object, dict): + return + + if "oembed" not in media_object: + return + + return _EmbedlyScraper.media_embed(media_object) + + +class MediaEmbed(object): + width = None + height = None + content = None + scrolling = False + + def __init__(self, height, width, content, scrolling=False): + self.height = int(height) + self.width = int(width) + self.content = content + self.scrolling = scrolling + + +def _make_thumbnail_from_url(thumbnail_url, referer): + if not thumbnail_url: + return + content_type, content = _fetch_url(thumbnail_url, referer=referer) + if not content: + return + image = str_to_image(content) + return _prepare_image(image) + + +class Scraper(object): + @classmethod + def for_url(cls, embedly_services, url): + url_domain = domain(url) + domain_embedly_regex = embedly_services.get(url_domain, None) + + if domain_embedly_regex and re.match(domain_embedly_regex, url): + return _EmbedlyScraper(url) + return _ThumbnailOnlyScraper(url) + + def scrape(self): + # should return a 2-tuple of: thumbnail, media_object + raise NotImplementedError + + @classmethod + def media_embed(cls, media_object): + # should take a media object and return an appropriate MediaEmbed + raise NotImplementedError + + +class _ThumbnailOnlyScraper(Scraper): + def __init__(self, url): + self.url = url + + def scrape(self): + thumbnail_url = self._find_thumbnail_image() + thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url) + return thumbnail, None + + def _extract_image_urls(self, soup): + for img in soup.findAll("img", src=True): + yield urlparse.urljoin(self.url, img["src"]) + + def _find_thumbnail_image(self): + content_type, content = _fetch_url(self.url) + + # if it's an image. it's pretty easy to guess what we should thumbnail. + if "image" in content_type: + return self.url + + if content_type and "html" in content_type and content: + soup = BeautifulSoup.BeautifulSoup(content) + else: + return None + + # allow the content author to specify the thumbnail: + # + og_image = (soup.find('meta', property='og:image') or + soup.find('meta', attrs={'name': 'og:image'})) + if og_image and og_image['content']: + return og_image['content'] + + # + thumbnail_spec = soup.find('link', rel='image_src') + if thumbnail_spec and thumbnail_spec['href']: + return thumbnail_spec['href'] + + # ok, we have no guidance from the author. look for the largest + # image on the page with a few caveats. (see below) + max_area = 0 + max_url = None + for image_url in self._extract_image_urls(soup): + size = _fetch_image_size(image_url, referer=self.url) + if not size: + continue + + area = size[0] * size[1] + + # ignore little images + if area < 5000: + g.log.debug('ignore little %s' % image_url) + continue + + # ignore excessively long/wide images + if max(size) / min(size) > 1.5: + g.log.debug('ignore dimensions %s' % image_url) + continue + + # penalize images with "sprite" in their name + if 'sprite' in image_url.lower(): + g.log.debug('penalizing sprite %s' % image_url) + area /= 10 + + if area > max_area: + max_area = area + max_url = image_url + return max_url + + +class _EmbedlyScraper(Scraper): + EMBEDLY_API_URL = "http://api.embed.ly/1/oembed" + + def __init__(self, url): + self.url = url + + @classmethod + def _utf8_encode(cls, input): + """UTF-8 encodes any strings in an object (from json.loads)""" + if isinstance(input, dict): + return {cls._utf8_encode(key): cls._utf8_encode(value) + for key, value in input.iteritems()} + elif isinstance(input, list): + return [cls._utf8_encode(item) + for item in input] + elif isinstance(input, unicode): + return input.encode('utf-8') + else: + return input + + def scrape(self): + params = urllib.urlencode({ + "url": self.url, + "format": "json", + "maxwidth": 600, + "key": g.embedly_api_key, + }) + response = urllib2.urlopen(self.EMBEDLY_API_URL + "?" + params) + oembed = json.load(response, object_hook=self._utf8_encode) + + if not oembed: + return None, None + + if oembed.get("type") == "photo": + thumbnail_url = oembed.get("url") + else: + thumbnail_url = oembed.get("thumbnail_url") + thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url) + + embed = {} + if oembed.get("type") in ("video", "rich"): + embed = { + "type": domain(self.url), + "oembed": oembed, + } + + return thumbnail, embed + + @classmethod + def media_embed(cls, media_object): + oembed = media_object["oembed"] + + html = oembed.get("html") + width = oembed.get("width") + height = oembed.get("height") + if not (html and width and height): + return + + return MediaEmbed( + width=width, + height=height, + content=html, + ) + + +@memoize("media.embedly_services", time=3600) +def _fetch_embedly_services(): + response = urllib2.urlopen("http://api.embed.ly/1/services/python") + service_data = json.load(response) + + patterns_by_domain = collections.defaultdict(set) + for service in service_data: + for domain in [service["domain"]] + service["subdomains"]: + patterns_by_domain[domain].update(service["regex"]) + + return {domain: "(?:%s)" % "|".join(patterns) + for domain, patterns in patterns_by_domain.iteritems()} + + def run(): + embedly_services = _fetch_embedly_services() + @g.stats.amqp_processor('scraper_q') def process_link(msg): - def _process_link(fname): - link = Link._by_fullname(fname, data=True) - set_media(link) - fname = msg.body + link = Link._by_fullname(msg.body, data=True) + try: - TimeoutFunction(_process_link, 30)(fname) + TimeoutFunction(_set_media, 30)(embedly_services, link) except TimeoutFunctionException: print "Timed out on %s" % fname except KeyboardInterrupt: diff --git a/r2/r2/lib/pages/pages.py b/r2/r2/lib/pages/pages.py index 843f5989e..703619ab2 100755 --- a/r2/r2/lib/pages/pages.py +++ b/r2/r2/lib/pages/pages.py @@ -67,7 +67,6 @@ from r2.lib.utils import url_links_builder, make_offset_date, median, to36 from r2.lib.utils import trunc_time, timesince, timeuntil, weighted_lottery from r2.lib.template_helpers import add_sr, get_domain, format_number from r2.lib.subreddit_search import popular_searches -from r2.lib.scraper import get_media_embed from r2.lib.log import log_text from r2.lib.memoize import memoize from r2.lib.utils import trunc_string as _truncate, to_date @@ -3454,7 +3453,7 @@ def make_link_child(item): media_embed = item.media_object else: try: - media_embed = get_media_embed(item.media_object) + media_embed = media.get_media_embed(item.media_object) except TypeError: g.log.warning("link %s has a bad media object" % item) media_embed = None diff --git a/r2/r2/lib/scraper.py b/r2/r2/lib/scraper.py deleted file mode 100644 index c1285f953..000000000 --- a/r2/r2/lib/scraper.py +++ /dev/null @@ -1,1864 +0,0 @@ -# The contents of this file are subject to the Common Public Attribution -# License Version 1.0. (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public -# License Version 1.1, but Sections 14 and 15 have been added to cover use of -# software over a computer network and provide for limited attribution for the -# Original Developer. In addition, Exhibit A has been modified to be consistent -# with Exhibit B. -# -# Software distributed under the License is distributed on an "AS IS" basis, -# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for -# the specific language governing rights and limitations under the License. -# -# The Original Code is reddit. -# -# The Original Developer is the Initial Developer. The Initial Developer of -# the Original Code is reddit Inc. -# -# All portions of the code written by reddit are Copyright (c) 2006-2013 reddit -# Inc. All Rights Reserved. -############################################################################### - -from pylons import g -from r2.lib import utils -from r2.lib.memoize import memoize -import simplejson as json - -from urllib2 import Request, HTTPError, URLError, urlopen -from httplib import InvalidURL -import urlparse, re, urllib, logging, StringIO, logging -import Image, ImageFile, math -from BeautifulSoup import BeautifulSoup - -log = g.log -useragent = g.useragent - -chunk_size = 1024 -thumbnail_size = 70, 70 - -def image_to_str(image): - s = StringIO.StringIO() - image.save(s, image.format) - s.seek(0) - return s.read() - -def str_to_image(s): - s = StringIO.StringIO(s) - s.seek(0) - image = Image.open(s) - return image - -def prepare_image(image): - image = square_image(image) - image.thumbnail(thumbnail_size, Image.ANTIALIAS) - return image - -def image_entropy(img): - """calculate the entropy of an image""" - hist = img.histogram() - hist_size = sum(hist) - hist = [float(h) / hist_size for h in hist] - - return -sum([p * math.log(p, 2) for p in hist if p != 0]) - -def square_image(img): - """if the image is taller than it is wide, square it off. determine - which pieces to cut off based on the entropy pieces.""" - x,y = img.size - while y > x: - #slice 10px at a time until square - slice_height = min(y - x, 10) - - bottom = img.crop((0, y - slice_height, x, y)) - top = img.crop((0, 0, x, slice_height)) - - #remove the slice with the least entropy - if image_entropy(bottom) < image_entropy(top): - img = img.crop((0, 0, x, y - slice_height)) - else: - img = img.crop((0, slice_height, x, y)) - - x,y = img.size - - return img - -def clean_url(url): - """url quotes unicode data out of urls""" - s = url - url = url.encode('utf8') - url = ''.join([urllib.quote(c) if ord(c) >= 127 else c for c in url]) - return url - -def fetch_url(url, referer = None, retries = 1, dimension = False): - cur_try = 0 - log.debug('fetching: %s' % url) - nothing = None if dimension else (None, None) - url = clean_url(url) - #just basic urls - if not url.startswith(('http://', 'https://')): - return nothing - while True: - try: - req = Request(url) - if useragent: - req.add_header('User-Agent', useragent) - if referer: - req.add_header('Referer', referer) - - open_req = urlopen(req) - - #if we only need the dimension of the image, we may not - #need to download the entire thing - if dimension: - content = open_req.read(chunk_size) - else: - content = open_req.read() - content_type = open_req.headers.get('content-type') - - if not content_type: - return nothing - - if 'image' in content_type: - p = ImageFile.Parser() - new_data = content - while not p.image and new_data: - p.feed(new_data) - new_data = open_req.read(chunk_size) - content += new_data - - #return the size, or return the data - if dimension and p.image: - return p.image.size - elif dimension: - return nothing - elif dimension: - #expected an image, but didn't get one - return nothing - - return content_type, content - - except (URLError, HTTPError, InvalidURL), e: - cur_try += 1 - if cur_try >= retries: - log.debug('error while fetching: %s referer: %s' % (url, referer)) - log.debug(e) - return nothing - finally: - if 'open_req' in locals(): - open_req.close() - -@memoize('media.fetch_size') -def fetch_size(url, referer = None, retries = 1): - return fetch_url(url, referer, retries, dimension = True) - -class MediaEmbed(object): - width = None - height = None - content = None - scrolling = False - - def __init__(self, height, width, content, scrolling = False): - self.height = int(height) - self.width = int(width) - self.content = content - self.scrolling = scrolling - -class Scraper: - def __init__(self, url): - self.url = url - self.content = None - self.content_type = None - self.soup = None - - def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.url) - - def download(self): - self.content_type, self.content = fetch_url(self.url) - if self.content_type and 'html' in self.content_type and self.content: - self.soup = BeautifulSoup(self.content) - - def image_urls(self): - #if the original url was an image, use that - if 'image' in self.content_type: - yield self.url - elif self.soup: - images = self.soup.findAll('img', src = True) - for i in images: - image_url = urlparse.urljoin(self.url, i['src']) - yield image_url - - def largest_image_url(self): - if not self.content: - self.download() - - #if download didn't work - if not self.content or not self.content_type: - return None - - max_area = 0 - max_url = None - - if self.soup: - og_image = (self.soup.find('meta', property='og:image') or - self.soup.find('meta', attrs={'name': 'og:image'})) - if og_image and og_image['content']: - log.debug("Using og:image") - return og_image['content'] - thumbnail_spec = self.soup.find('link', rel = 'image_src') - if thumbnail_spec and thumbnail_spec['href']: - log.debug("Using image_src") - return thumbnail_spec['href'] - - for image_url in self.image_urls(): - size = fetch_size(image_url, referer = self.url) - if not size: - continue - - area = size[0] * size[1] - - #ignore little images - if area < 5000: - log.debug('ignore little %s' % image_url) - continue - - #ignore excessively long/wide images - if max(size) / min(size) > 1.5: - log.debug('ignore dimensions %s' % image_url) - continue - - #penalize images with "sprite" in their name - if 'sprite' in image_url.lower(): - log.debug('penalizing sprite %s' % image_url) - area /= 10 - - if area > max_area: - max_area = area - max_url = image_url - - return max_url - - def thumbnail(self): - image_url = self.largest_image_url() - if image_url: - content_type, image_str = fetch_url(image_url, referer = self.url) - if image_str: - image = str_to_image(image_str) - try: - image = prepare_image(image) - except IOError, e: - #can't read interlaced PNGs, ignore - if 'interlaced' in e.message: - return - raise - return image - - def media_object(self): - for deepscraper in deepscrapers: - ds = deepscraper() - found = ds.find_media_object(self) - if found: - return found - - @classmethod - def media_embed(cls): - raise NotImplementedError - -class MediaScraper(Scraper): - media_template = "" - thumbnail_template = "" - video_id = None - video_id_rx = None - - def __init__(self, url): - Scraper.__init__(self, url) - - # first try the simple regex against the URL. If that fails, - # see if the MediaScraper subclass has its own extraction - # function - if self.video_id_rx: - m = self.video_id_rx.match(url) - if m: - self.video_id = m.groups()[0] - if not self.video_id: - video_id = self.video_id_extract() - if video_id: - self.video_id = video_id - if not self.video_id: - #if we still can't find the id just treat it like a normal page - log.debug('reverting to regular scraper: %s' % url) - self.__class__ = Scraper - - def video_id_extract(self): - return None - - def largest_image_url(self): - if self.thumbnail_template: - return self.thumbnail_template.replace('$video_id', self.video_id) - else: - return Scraper.largest_image_url(self) - - def media_object(self): - return dict(video_id = self.video_id, - type = self.domains[0]) - - @classmethod - def media_embed(cls, video_id = None, height = None, width = None, **kw): - content = cls.media_template.replace('$video_id', video_id) - return MediaEmbed(height = height or cls.height, - width = width or cls.width, - content = content) - -def youtube_in_google(google_url): - h = Scraper(google_url) - h.download() - try: - youtube_url = h.soup.find('div', 'original-text').findNext('a')['href'] - log.debug('%s is really %s' % (google_url, youtube_url)) - return youtube_url - except AttributeError, KeyError: - pass - -def make_scraper(url): - domain = utils.domain(url) - scraper = Scraper - for suffix, clses in scrapers.iteritems(): - for cls in clses: - if domain.endswith(suffix): - scraper = cls - break - - #sometimes youtube scrapers masquerade as google scrapers - if scraper == GootubeScraper: - youtube_url = youtube_in_google(url) - if youtube_url: - return make_scraper(youtube_url) - return scraper(url) - -########## site-specific video scrapers ########## - -class YoutubeScraper(MediaScraper): - domains = ['youtube.com'] - height = 295 - width = 480 - media_template = '' - thumbnail_template = 'http://img.youtube.com/vi/$video_id/default.jpg' - video_id_rx = re.compile('.*v=([A-Za-z0-9-_]+).*') - video_deeplink_rx = re.compile('.*#t=(\d+)m(\d+)s.*') - - def video_id_extract(self): - vid = self.video_id_rx.match(self.url) - if(vid): - video_id = vid.groups()[0] - d = self.video_deeplink_rx.match(self.url) - if(d): - seconds = int(d.groups()[0])*60 + int(d.groups()[1]) - video_id += "&start=%d" % seconds - return video_id - - def largest_image_url(self): - # Remove the deeplink part from the video id - return self.thumbnail_template.replace("$video_id", - self.video_id.split("&")[0]) - -class TedScraper(MediaScraper): - domains = ['ted.com'] - height = 326 - width = 446 - media_template = ' ' - flashvars_rx = re.compile('.*flashvars="(.*)".*') - - def video_id_extract(self): - if "/talks/" in self.url: - content_type, content = fetch_url(self.url.replace("/talks/","/talks/embed/")) - if content: - m = self.flashvars_rx.match(content) - if m: - return m.groups()[0] - def largest_image_url(self): - if not self.soup: - self.download() - - if self.soup: - return self.soup.find('link', rel = 'image_src')['href'] - - -class MetacafeScraper(MediaScraper): - domains = ['metacafe.com'] - height = 345 - width = 400 - media_template = ' ' - video_id_rx = re.compile('.*/watch/([^/]+)/.*') - - def media_object(self): - if not self.soup: - self.download() - - if self.soup: - video_url = self.soup.find('link', rel = 'video_src')['href'] - return dict(video_id = video_url, - type = self.domains[0]) - -class GootubeScraper(MediaScraper): - domains = ['video.google.com'] - height = 326 - width = 400 - media_template = ' ' - video_id_rx = re.compile('.*videoplay\?docid=([A-Za-z0-9-_]+).*') - gootube_thumb_rx = re.compile(".*thumbnail:\s*\'(http://[^/]+/ThumbnailServer2[^\']+)\'.*", re.IGNORECASE | re.S) - - def largest_image_url(self): - if not self.content: - self.download() - - if not self.content: - return None - - m = self.gootube_thumb_rx.match(self.content) - if m: - image_url = m.groups()[0] - image_url = utils.safe_eval_str(image_url) - return image_url - -class VimeoScraper(MediaScraper): - domains = ['vimeo.com'] - height = 448 - width = 520 - media_template = ' ' - video_id_rx = re.compile('.*/(.*)') - - def media_object(self): - if not self.soup: - self.download() - - if self.soup: - video_url = self.soup.find('link', rel = 'video_src')['href'] - return dict(video_id = video_url, - type = self.domains[0]) - -class BreakScraper(MediaScraper): - domains = ['break.com'] - height = 421 - width = 520 - media_template = '' - video_id_rx = re.compile('.*/index/([^/]+).*'); - - def video_id_extract(self): - if not self.soup: - self.download() - - if self.soup: - video_src = self.soup.find('link', rel = 'video_src') - if video_src and video_src['href']: - return video_src['href'] - -class TheOnionScraper(MediaScraper): - domains = ['theonion.com'] - height = 430 - width = 480 - media_template = """ - - - - - - - - """ - video_id_rx = re.compile('.*/video/([^/?#]+).*') - - def media_object(self): - if not self.soup: - self.download() - - if self.soup: - video_url = self.soup.find('meta', attrs={'name': 'nid'})['content'] - return dict(video_id = video_url, - type = self.domains[0]) - -class CollegeHumorScraper(MediaScraper): - domains = ['collegehumor.com'] - height = 390 - width = 520 - media_template = '' - video_id_rx = re.compile('.*video:(\d+).*'); - -class FunnyOrDieScraper(MediaScraper): - domains = ['funnyordie.com'] - height = 438 - width = 464 - media_template = '' - thumbnail_template = 'http://assets1.ordienetworks.com/tmbs/$video_id/medium_2.jpg?c79e63ac' - video_id_rx = re.compile('.*/videos/([^/]+)/.*') - -class ComedyCentralScraper(MediaScraper): - domains = ['comedycentral.com'] - height = 316 - width = 332 - media_template = '' - video_id_rx = re.compile('.*videoId=(\d+).*') - -class TheDailyShowScraper(MediaScraper): - domains = ['thedailyshow.com'] - height = 353 - width = 360 - media_template = """""" - - def video_id_extract(self): - "This is a bit of a hack" - if not self.soup: - self.download() - - if self.soup: - embed_container = self.soup.find('div', {'class': 'videoplayerPromo module'}) - if embed_container: - if embed_container['id'].startswith('promo_'): - video_id = embed_container['id'].split('_')[1] - return video_id - -class ColbertNationScraper(ComedyCentralScraper): - domains = ['colbertnation.com'] - video_id_rx = re.compile('.*videos/(\d+)/.*') - -class LiveLeakScraper(MediaScraper): - domains = ['liveleak.com'] - height = 370 - width = 450 - media_template = '' - video_id_rx = re.compile('.*i=([a-zA-Z0-9_]+).*') - - def largest_image_url(self): - if not self.soup: - self.download() - - if self.soup: - return self.soup.find('link', rel = 'videothumbnail')['href'] - -class DailyMotionScraper(MediaScraper): - domains = ['dailymotion.com'] - height = 381 - width = 480 - media_template = '' - video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)_.*') - - def media_object(self): - if not self.soup: - self.download() - - if self.soup: - video_url = self.soup.find('link', rel = 'video_src')['href'] - return dict(video_id = video_url, - type = self.domains[0]) - -class RevverScraper(MediaScraper): - domains = ['revver.com'] - height = 392 - width = 480 - media_template = '' - video_id_rx = re.compile('.*/video/([a-zA-Z0-9]+)/.*') - -class EscapistScraper(MediaScraper): - domains = ['escapistmagazine.com'] - height = 294 - width = 480 - media_template = """""" - video_id_rx = re.compile('.*/videos/view/[A-Za-z-9-]+/([0-9]+).*') - -class JustintvScraper(MediaScraper): - """Can grab streams from justin.tv, but not clips""" - domains = ['justin.tv'] - height = 295 - width = 353 - stream_media_template = """""" - video_id_rx = re.compile('^http://www.justin.tv/([a-zA-Z0-9_]+)[^/]*$') - - @classmethod - def media_embed(cls, video_id, **kw): - content = cls.stream_media_template.replace('$video_id', video_id) - return MediaEmbed(height = cls.height, - width = cls.width, - content = content) - -class SoundcloudScraper(MediaScraper): - """soundcloud.com""" - domains = ['soundcloud.com'] - height = 81 - width = 400 - media_template = """
- - - - - - - """ - video_id_rx = re.compile('^http://soundcloud.com/[a-zA-Z0-9_-]+/([a-zA-Z0-9_-]+)') - -class CraigslistScraper(MediaScraper): - domains = ['craigslist.org'] - height = 480 - width = 640 - max_size_kb = 50 - - def video_id_extract(self): - return self.url - - def media_object(self): - if not self.soup: - self.download() - - if self.soup: - ub = self.soup.find('div', {'id': 'userbody'}) - if ub: - ub = str(ub) - if len(ub) <= self.max_size_kb * 1024: - return dict(content = ub, - type = self.domains[0]) - - @classmethod - def media_embed(cls, content, **kw): - return MediaEmbed(height = cls.height, - width = cls.width, - content = content, - scrolling = True) - - -########## oembed rich-media scrapers ########## - -class OEmbed(Scraper): - """ - Oembed Scraper - ============== - Tries to use the oembed standard to create a media object. - - url_re: Regular Expression to match the incoming url against. - api_endpoint: Url of the api end point you are using. - api_params: Default Params to be sent with the outgoing request. - """ - url_re = '' - api_endpoint = '' - api_params = {} - - def __init__(self, url): - Scraper.__init__(self, url) - self.oembed = None - - #Fallback to the scraper if the url doesn't match - if not self.url_re.match(self.url): - self.__class__ = Scraper - - def __repr__(self): - return "%s(%r)" % (self.__class__.__name__, self.url) - - def utf8_encode(self, input): - """UTF-8 encodes any strings in an object (from json.loads)""" - if isinstance(input, dict): - return {self.utf8_encode(key): self.utf8_encode(value) - for key, value in input.iteritems()} - elif isinstance(input, list): - return [self.utf8_encode(item) - for item in input] - elif isinstance(input, unicode): - return input.encode('utf-8') - else: - return input - - def download(self): - self.api_params.update( { 'url':self.url}) - query = urllib.urlencode(self.api_params) - api_url = "%s?%s" % (self.api_endpoint, query) - - self.content_type, self.content = fetch_url(api_url) - - #Either a 404 or 500. - if not self.content: - #raise ValueError('ISSUE CALLING %s' %api_url) - log.warning('oEmbed call (%s) failed to return content for %s' - %(api_url, self.url)) - return None - - try: - self.oembed = json.loads(self.content, - object_hook=self.utf8_encode) - except ValueError, e: - log.error('oEmbed call (%s) return invalid json for %s' - %(api_url, self.url)) - return None - - def image_urls(self): - #if the original url was an image, use that - if self.oembed and self.oembed.get('type') =='photo': - yield self.oembed.get('url') - elif self.oembed and self.oembed.get('thumbnail_url'): - yield self.oembed.get('thumbnail_url') - - def largest_image_url(self): - #Seems to be the default place to check if the download has happened. - if not self.oembed: - self.download() - - #if the original url was of the photo type - if self.oembed and self.oembed.get('type') =='photo': - return self.oembed.get('url') - elif self.oembed and self.oembed.get('thumbnail_url'): - return self.oembed.get('thumbnail_url') - - def media_object(self): - #Seems to be the default place to check if the download has happened. - if not self.oembed: - self.download() - - if self.oembed and self.oembed.get('type') in ['video', 'rich']: - for domain in self.domains: - if self.url.find(domain) > -1: - return dict(type=domain, oembed=self.oembed) - return None - - @classmethod - def media_embed(cls, video_id = None, height = None, width = None, **kw): - content = None - oembed = kw.get('oembed') - - # check if oembed is there and has html - if oembed and oembed.get('html'): - content = oembed.get('html') - if content and oembed.get('height') and oembed.get('width'): - return MediaEmbed(height = oembed['height'], - width = oembed['width'], - content = content) - -class EmbedlyOEmbed(OEmbed): - """ - Embedly oEmbed Provider - ======================= - documentation: http://api.embed.ly - """ - domains = ['23hq.com', '5min.com', '99dollarmusicvideos.com', - 'abcnews.go.com', 'achewood.com', 'allthingsd.com', 'amazon.com', - 'aniboom.com', 'animoto.com', 'asofterworld.com', 'atom.com', - 'audioboo.com', 'bambuser.com', 'bandcamp.com', 'barelydigital.com', - 'barelypolitical.com', 'bigthink.com', 'blip.tv', 'bnter.com', - 'boston.com', 'brainbird.net', 'bravotv.com', 'break.com', - 'brizzly.com', 'cbsnews.com', 'channelfrederator.com', 'chart.ly', - 'cl.ly', 'clikthrough.com', 'clipfish.de', 'clipshack.com', 'cnbc.com', - 'cnn.com', 'colbertnation.com', 'collegehumor.com', 'color.com', - 'comedycentral.com', 'compete.com', 'confreaks.net', 'crackle.com', - 'craigslist.org', 'crocodoc.com', 'crunchbase.com', 'dailybooth.com', - 'dailymile.com', 'dailymotion.com', 'deviantart.com', 'digg.com', - 'dipdive.com', 'discovery.com', 'dotsub.com', 'dribbble.com', - 'edition.cnn.com', 'emberapp.com', 'escapistmagazine.com', - 'espn.go.com', 'facebook.com', 'fancast.com', 'flickr.com', 'fora.tv', - 'formspring.me', 'fotopedia.com', 'freemusicarchive.org', - 'funnyordie.com', 'gametrailers.com', 'gist.github.com', - 'globalpost.com', 'godtube.com', 'gogoyoko.com', 'google.com', - 'graphicly.com', 'grindtv.com', 'grooveshark.com', 'guardian.co.uk', - 'hark.com', 'howcast.com', 'huffduffer.com', 'hulu.com', - 'hungrynation.tv', 'ifood.tv', 'img.ly', 'imgur.com', 'indenti.ca', - 'indymogul.com', 'instagr.am', 'issuu.com', 'itunes.apple.com', - 'justin.tv', 'kickstarter.com', 'kinomap.com', 'kiva.org', - 'koldcast.tv', 'last.fm', 'lightbox.com', 'liveleak.com', - 'livestream.com', 'lockerz.com', 'logotv.com', 'lonelyplanet.com', - 'maps.google.com', 'meadd.com', 'mediamatters.org', 'meetup.com', - 'metacafe.com', 'metacdn.com', 'mixcloud.com', 'mixergy.com', - 'mlkshk.com', 'mobypicture.com', 'money.cnn.com', 'movies.yahoo.com', - 'msnbc.com', 'my.opera.com', 'myloc.me', 'myvideo.de', - 'nationalgeographic.com', 'nfb.ca', 'npr.org', 'nzonscreen.com', - 'overstream.net', 'ow.ly', 'pastebin.com', 'pastie.org', - 'phodroid.com', 'photobucket.com', 'photozou.jp', - 'picasaweb.google.com', 'picplz.com', 'pikchur.com', 'ping.fm', - 'polldaddy.com', 'polleverywhere.com', 'posterous.com', 'prezi.com', - 'qik.com', 'quantcast.com', 'questionablecontent.net', 'qwantz.com', - 'qwiki.com', 'radionomy.com', 'radioreddit.com', 'rdio.com', - 'recordsetter.com','redux.com', 'revision3.com', 'revver.com', - 'saynow.com', 'schooltube.com', 'sciencestage.com', 'scrapblog.com', - 'screencast.com', 'screenr.com', 'scribd.com', 'sendables.jibjab.com', - 'share.ovi.com', 'shitmydadsays.com', 'shopstyle.com', 'skitch.com', - 'slideshare.net', 'smugmug.com', 'snotr.com', 'socialcam.com', - 'someecards.com', 'soundcloud.com', 'speakerdeck.com', 'spike.com', - 'statsheet.com', 'status.net', 'storify.com', 'streetfire.net', - 'studivz.net', 'tangle.com', 'teachertube.com', 'techcrunch.tv', - 'ted.com', 'thedailyshow.com', 'theonion.com', 'threadbanger.com', - 'timetoast.com', 'tinypic.com', 'tmiweekly.com', 'traileraddict.com', - 'trailerspy.com', 'trooptube.tv', 'trutv.com', 'tumblr.com', - 'twitgoo.com', 'twitlonger.com', 'twitpic.com', 'twitrpix.com', - 'twitter.com', 'twitvid.com', 'ultrakawaii.com', 'urtak.com', - 'uservoice.com', 'ustream.com', 'viddler.com', 'video.forbes.com', - 'video.google.com', 'video.jardenberg.com', 'video.pbs.org', - 'video.yahoo.com', 'videos.nymag.com', 'vids.myspace.com', 'vimeo.com', - 'vodcars.com', 'washingtonpost.com', 'whitehouse.gov', 'whosay.com', - 'wikimedia.org', 'wikipedia.org', 'wistia.com', 'wordpress.tv', - 'worldstarhiphop.com', 'xiami.com', 'xkcd.com', 'xtranormal.com', - 'yfrog.com', 'youku.com', 'youtu.be', 'youtube.com', 'zapiks.com', - 'zero-inch.com'] - - url_re = re.compile( - 'http:\\/\\/.*youtube\\.com\\/watch.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' + - 'https:\\/\\/.*youtube\\.com\\/watch.*|' + - 'https:\\/\\/.*\\.youtube\\.com\\/v\\/.*|' + - 'http:\\/\\/youtu\\.be\\/.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/user\\/.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/.*\\#.*\\/.*|' + - 'http:\\/\\/m\\.youtube\\.com\\/watch.*|' + - 'http:\\/\\/m\\.youtube\\.com\\/index.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/profile.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/view_play_list.*|' + - 'http:\\/\\/.*\\.youtube\\.com\\/playlist.*|' + - 'http:\\/\\/.*justin\\.tv\\/.*|' + - 'http:\\/\\/.*justin\\.tv\\/.*\\/b\\/.*|' + - 'http:\\/\\/.*justin\\.tv\\/.*\\/w\\/.*|' + - 'http:\\/\\/www\\.ustream\\.tv\\/recorded\\/.*|' + - 'http:\\/\\/www\\.ustream\\.tv\\/channel\\/.*|' + - 'http:\\/\\/www\\.ustream\\.tv\\/.*|' + - 'http:\\/\\/qik\\.com\\/video\\/.*|' + - 'http:\\/\\/qik\\.com\\/.*|' + - 'http:\\/\\/qik\\.ly\\/.*|' + - 'http:\\/\\/.*revision3\\.com\\/.*|' + - 'http:\\/\\/.*\\.dailymotion\\.com\\/video\\/.*|' + - 'http:\\/\\/.*\\.dailymotion\\.com\\/.*\\/video\\/.*|' + - 'http:\\/\\/collegehumor\\.com\\/video:.*|' + - 'http:\\/\\/collegehumor\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.collegehumor\\.com\\/video:.*|' + - 'http:\\/\\/www\\.collegehumor\\.com\\/video\\/.*|' + - 'http:\\/\\/.*twitvid\\.com\\/.*|' + - 'http:\\/\\/www\\.break\\.com\\/.*\\/.*|' + - 'http:\\/\\/vids\\.myspace\\.com\\/index\\.cfm\\?fuseaction=vids\\.individual&videoid.*|' + - 'http:\\/\\/www\\.myspace\\.com\\/index\\.cfm\\?fuseaction=.*&videoid.*|' + - 'http:\\/\\/www\\.metacafe\\.com\\/watch\\/.*|' + - 'http:\\/\\/www\\.metacafe\\.com\\/w\\/.*|' + - 'http:\\/\\/blip\\.tv\\/.*\\/.*|' + - 'http:\\/\\/.*\\.blip\\.tv\\/.*\\/.*|' + - 'http:\\/\\/video\\.google\\.com\\/videoplay\\?.*|' + - 'http:\\/\\/.*revver\\.com\\/video\\/.*|' + - 'http:\\/\\/video\\.yahoo\\.com\\/watch\\/.*\\/.*|' + - 'http:\\/\\/video\\.yahoo\\.com\\/network\\/.*|' + - 'http:\\/\\/.*viddler\\.com\\/explore\\/.*\\/videos\\/.*|' + - 'http:\\/\\/liveleak\\.com\\/view\\?.*|' + - 'http:\\/\\/www\\.liveleak\\.com\\/view\\?.*|' + - 'http:\\/\\/animoto\\.com\\/play\\/.*|' + - 'http:\\/\\/dotsub\\.com\\/view\\/.*|' + - 'http:\\/\\/www\\.overstream\\.net\\/view\\.php\\?oid=.*|' + - 'http:\\/\\/www\\.livestream\\.com\\/.*|' + - 'http:\\/\\/www\\.worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' + - 'http:\\/\\/worldstarhiphop\\.com\\/videos\\/video.*\\.php\\?v=.*|' + - 'http:\\/\\/teachertube\\.com\\/viewVideo\\.php.*|' + - 'http:\\/\\/www\\.teachertube\\.com\\/viewVideo\\.php.*|' + - 'http:\\/\\/www1\\.teachertube\\.com\\/viewVideo\\.php.*|' + - 'http:\\/\\/www2\\.teachertube\\.com\\/viewVideo\\.php.*|' + - 'http:\\/\\/bambuser\\.com\\/v\\/.*|' + - 'http:\\/\\/bambuser\\.com\\/channel\\/.*|' + - 'http:\\/\\/bambuser\\.com\\/channel\\/.*\\/broadcast\\/.*|' + - 'http:\\/\\/www\\.schooltube\\.com\\/video\\/.*\\/.*|' + - 'http:\\/\\/bigthink\\.com\\/ideas\\/.*|' + - 'http:\\/\\/bigthink\\.com\\/series\\/.*|' + - 'http:\\/\\/sendables\\.jibjab\\.com\\/view\\/.*|' + - 'http:\\/\\/sendables\\.jibjab\\.com\\/originals\\/.*|' + - 'http:\\/\\/www\\.xtranormal\\.com\\/watch\\/.*|' + - 'http:\\/\\/socialcam\\.com\\/v\\/.*|' + - 'http:\\/\\/www\\.socialcam\\.com\\/v\\/.*|' + - 'http:\\/\\/dipdive\\.com\\/media\\/.*|' + - 'http:\\/\\/dipdive\\.com\\/member\\/.*\\/media\\/.*|' + - 'http:\\/\\/dipdive\\.com\\/v\\/.*|' + - 'http:\\/\\/.*\\.dipdive\\.com\\/media\\/.*|' + - 'http:\\/\\/.*\\.dipdive\\.com\\/v\\/.*|' + - 'http:\\/\\/v\\.youku\\.com\\/v_show\\/.*\\.html|' + - 'http:\\/\\/v\\.youku\\.com\\/v_playlist\\/.*\\.html|' + - 'http:\\/\\/www\\.snotr\\.com\\/video\\/.*|' + - 'http:\\/\\/snotr\\.com\\/video\\/.*|' + - 'http:\\/\\/video\\.jardenberg\\.se\\/.*|' + - 'http:\\/\\/www\\.clipfish\\.de\\/.*\\/.*\\/video\\/.*|' + - 'http:\\/\\/www\\.myvideo\\.de\\/watch\\/.*|' + - 'http:\\/\\/www\\.whitehouse\\.gov\\/photos-and-video\\/video\\/.*|' + - 'http:\\/\\/www\\.whitehouse\\.gov\\/video\\/.*|' + - 'http:\\/\\/wh\\.gov\\/photos-and-video\\/video\\/.*|' + - 'http:\\/\\/wh\\.gov\\/video\\/.*|' + - 'http:\\/\\/www\\.hulu\\.com\\/watch.*|' + - 'http:\\/\\/www\\.hulu\\.com\\/w\\/.*|' + - 'http:\\/\\/hulu\\.com\\/watch.*|' + - 'http:\\/\\/hulu\\.com\\/w\\/.*|' + - 'http:\\/\\/.*crackle\\.com\\/c\\/.*|' + - 'http:\\/\\/www\\.fancast\\.com\\/.*\\/videos|' + - 'http:\\/\\/www\\.funnyordie\\.com\\/videos\\/.*|' + - 'http:\\/\\/www\\.funnyordie\\.com\\/m\\/.*|' + - 'http:\\/\\/funnyordie\\.com\\/videos\\/.*|' + - 'http:\\/\\/funnyordie\\.com\\/m\\/.*|' + - 'http:\\/\\/www\\.vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' + - 'http:\\/\\/www\\.vimeo\\.com\\/.*|' + - 'http:\\/\\/vimeo\\.com\\/groups\\/.*\\/videos\\/.*|' + - 'http:\\/\\/vimeo\\.com\\/.*|' + - 'http:\\/\\/vimeo\\.com\\/m\\/\\#\\/.*|' + - 'http:\\/\\/www\\.ted\\.com\\/talks\\/.*\\.html.*|' + - 'http:\\/\\/www\\.ted\\.com\\/talks\\/lang\\/.*\\/.*\\.html.*|' + - 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/.*\\.html.*|' + - 'http:\\/\\/www\\.ted\\.com\\/index\\.php\\/talks\\/lang\\/.*\\/.*\\.html.*|' + - 'http:\\/\\/.*nfb\\.ca\\/film\\/.*|' + - 'http:\\/\\/www\\.thedailyshow\\.com\\/watch\\/.*|' + - 'http:\\/\\/www\\.thedailyshow\\.com\\/full-episodes\\/.*|' + - 'http:\\/\\/www\\.thedailyshow\\.com\\/collection\\/.*\\/.*\\/.*|' + - 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video\\/.*|' + - 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/trailer|' + - 'http:\\/\\/movies\\.yahoo\\.com\\/movie\\/.*\\/video|' + - 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-collections\\/.*|' + - 'http:\\/\\/www\\.colbertnation\\.com\\/full-episodes\\/.*|' + - 'http:\\/\\/www\\.colbertnation\\.com\\/the-colbert-report-videos\\/.*|' + - 'http:\\/\\/www\\.comedycentral\\.com\\/videos\\/index\\.jhtml\\?.*|' + - 'http:\\/\\/www\\.theonion\\.com\\/video\\/.*|' + - 'http:\\/\\/theonion\\.com\\/video\\/.*|' + - 'http:\\/\\/wordpress\\.tv\\/.*\\/.*\\/.*\\/.*\\/|' + - 'http:\\/\\/www\\.traileraddict\\.com\\/trailer\\/.*|' + - 'http:\\/\\/www\\.traileraddict\\.com\\/clip\\/.*|' + - 'http:\\/\\/www\\.traileraddict\\.com\\/poster\\/.*|' + - 'http:\\/\\/www\\.escapistmagazine\\.com\\/videos\\/.*|' + - 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*\\/.*|' + - 'http:\\/\\/www\\.trailerspy\\.com\\/trailer\\/.*|' + - 'http:\\/\\/www\\.trailerspy\\.com\\/view_video\\.php.*|' + - 'http:\\/\\/www\\.atom\\.com\\/.*\\/.*\\/|' + - 'http:\\/\\/fora\\.tv\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.spike\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.gametrailers\\.com\\/video\\/.*|' + - 'http:\\/\\/gametrailers\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.koldcast\\.tv\\/video\\/.*|' + - 'http:\\/\\/www\\.koldcast\\.tv\\/\\#video:.*|' + - 'http:\\/\\/techcrunch\\.tv\\/watch.*|' + - 'http:\\/\\/techcrunch\\.tv\\/.*\\/watch.*|' + - 'http:\\/\\/mixergy\\.com\\/.*|' + - 'http:\\/\\/video\\.pbs\\.org\\/video\\/.*|' + - 'http:\\/\\/www\\.zapiks\\.com\\/.*|' + - 'http:\\/\\/tv\\.digg\\.com\\/diggnation\\/.*|' + - 'http:\\/\\/tv\\.digg\\.com\\/diggreel\\/.*|' + - 'http:\\/\\/tv\\.digg\\.com\\/diggdialogg\\/.*|' + - 'http:\\/\\/www\\.trutv\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.nzonscreen\\.com\\/title\\/.*|' + - 'http:\\/\\/nzonscreen\\.com\\/title\\/.*|' + - 'http:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' + - 'https:\\/\\/app\\.wistia\\.com\\/embed\\/medias\\/.*|' + - 'http:\\/\\/hungrynation\\.tv\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.hungrynation\\.tv\\/.*\\/episode\\/.*|' + - 'http:\\/\\/hungrynation\\.tv\\/episode\\/.*|' + - 'http:\\/\\/www\\.hungrynation\\.tv\\/episode\\/.*|' + - 'http:\\/\\/indymogul\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.indymogul\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/indymogul\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.indymogul\\.com\\/episode\\/.*|' + - 'http:\\/\\/channelfrederator\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.channelfrederator\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/channelfrederator\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.channelfrederator\\.com\\/episode\\/.*|' + - 'http:\\/\\/tmiweekly\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.tmiweekly\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/tmiweekly\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.tmiweekly\\.com\\/episode\\/.*|' + - 'http:\\/\\/99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/99dollarmusicvideos\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.99dollarmusicvideos\\.com\\/episode\\/.*|' + - 'http:\\/\\/ultrakawaii\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.ultrakawaii\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/ultrakawaii\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.ultrakawaii\\.com\\/episode\\/.*|' + - 'http:\\/\\/barelypolitical\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.barelypolitical\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/barelypolitical\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.barelypolitical\\.com\\/episode\\/.*|' + - 'http:\\/\\/barelydigital\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.barelydigital\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/barelydigital\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.barelydigital\\.com\\/episode\\/.*|' + - 'http:\\/\\/threadbanger\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.threadbanger\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/threadbanger\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.threadbanger\\.com\\/episode\\/.*|' + - 'http:\\/\\/vodcars\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/www\\.vodcars\\.com\\/.*\\/episode\\/.*|' + - 'http:\\/\\/vodcars\\.com\\/episode\\/.*|' + - 'http:\\/\\/www\\.vodcars\\.com\\/episode\\/.*|' + - 'http:\\/\\/confreaks\\.net\\/videos\\/.*|' + - 'http:\\/\\/www\\.confreaks\\.net\\/videos\\/.*|' + - 'http:\\/\\/video\\.allthingsd\\.com\\/video\\/.*|' + - 'http:\\/\\/videos\\.nymag\\.com\\/.*|' + - 'http:\\/\\/aniboom\\.com\\/animation-video\\/.*|' + - 'http:\\/\\/www\\.aniboom\\.com\\/animation-video\\/.*|' + - 'http:\\/\\/clipshack\\.com\\/Clip\\.aspx\\?.*|' + - 'http:\\/\\/www\\.clipshack\\.com\\/Clip\\.aspx\\?.*|' + - 'http:\\/\\/grindtv\\.com\\/.*\\/video\\/.*|' + - 'http:\\/\\/www\\.grindtv\\.com\\/.*\\/video\\/.*|' + - 'http:\\/\\/ifood\\.tv\\/recipe\\/.*|' + - 'http:\\/\\/ifood\\.tv\\/video\\/.*|' + - 'http:\\/\\/ifood\\.tv\\/channel\\/user\\/.*|' + - 'http:\\/\\/www\\.ifood\\.tv\\/recipe\\/.*|' + - 'http:\\/\\/www\\.ifood\\.tv\\/video\\/.*|' + - 'http:\\/\\/www\\.ifood\\.tv\\/channel\\/user\\/.*|' + - 'http:\\/\\/logotv\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.logotv\\.com\\/video\\/.*|' + - 'http:\\/\\/lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' + - 'http:\\/\\/www\\.lonelyplanet\\.com\\/Clip\\.aspx\\?.*|' + - 'http:\\/\\/streetfire\\.net\\/video\\/.*\\.htm.*|' + - 'http:\\/\\/www\\.streetfire\\.net\\/video\\/.*\\.htm.*|' + - 'http:\\/\\/trooptube\\.tv\\/videos\\/.*|' + - 'http:\\/\\/www\\.trooptube\\.tv\\/videos\\/.*|' + - 'http:\\/\\/sciencestage\\.com\\/v\\/.*\\.html|' + - 'http:\\/\\/sciencestage\\.com\\/a\\/.*\\.html|' + - 'http:\\/\\/www\\.sciencestage\\.com\\/v\\/.*\\.html|' + - 'http:\\/\\/www\\.sciencestage\\.com\\/a\\/.*\\.html|' + - 'http:\\/\\/www\\.godtube\\.com\\/featured\\/video\\/.*|' + - 'http:\\/\\/godtube\\.com\\/featured\\/video\\/.*|' + - 'http:\\/\\/www\\.godtube\\.com\\/watch\\/.*|' + - 'http:\\/\\/godtube\\.com\\/watch\\/.*|' + - 'http:\\/\\/www\\.tangle\\.com\\/view_video.*|' + - 'http:\\/\\/mediamatters\\.org\\/mmtv\\/.*|' + - 'http:\\/\\/www\\.clikthrough\\.com\\/theater\\/video\\/.*|' + - 'http:\\/\\/gist\\.github\\.com\\/.*|' + - 'http:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' + - 'http:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' + - 'http:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' + - 'http:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' + - 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' + - 'http:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' + - 'https:\\/\\/twitter\\.com\\/.*\\/status\\/.*|' + - 'https:\\/\\/twitter\\.com\\/.*\\/statuses\\/.*|' + - 'https:\\/\\/www\\.twitter\\.com\\/.*\\/status\\/.*|' + - 'https:\\/\\/www\\.twitter\\.com\\/.*\\/statuses\\/.*|' + - 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/status\\/.*|' + - 'https:\\/\\/mobile\\.twitter\\.com\\/.*\\/statuses\\/.*|' + - 'http:\\/\\/www\\.crunchbase\\.com\\/.*\\/.*|' + - 'http:\\/\\/crunchbase\\.com\\/.*\\/.*|' + - 'http:\\/\\/www\\.slideshare\\.net\\/.*\\/.*|' + - 'http:\\/\\/www\\.slideshare\\.net\\/mobile\\/.*\\/.*|' + - 'http:\\/\\/slidesha\\.re\\/.*|' + - 'http:\\/\\/scribd\\.com\\/doc\\/.*|' + - 'http:\\/\\/www\\.scribd\\.com\\/doc\\/.*|' + - 'http:\\/\\/scribd\\.com\\/mobile\\/documents\\/.*|' + - 'http:\\/\\/www\\.scribd\\.com\\/mobile\\/documents\\/.*|' + - 'http:\\/\\/screenr\\.com\\/.*|' + - 'http:\\/\\/polldaddy\\.com\\/community\\/poll\\/.*|' + - 'http:\\/\\/polldaddy\\.com\\/poll\\/.*|' + - 'http:\\/\\/answers\\.polldaddy\\.com\\/poll\\/.*|' + - 'http:\\/\\/www\\.5min\\.com\\/Video\\/.*|' + - 'http:\\/\\/www\\.howcast\\.com\\/videos\\/.*|' + - 'http:\\/\\/www\\.screencast\\.com\\/.*\\/media\\/.*|' + - 'http:\\/\\/screencast\\.com\\/.*\\/media\\/.*|' + - 'http:\\/\\/www\\.screencast\\.com\\/t\\/.*|' + - 'http:\\/\\/screencast\\.com\\/t\\/.*|' + - 'http:\\/\\/issuu\\.com\\/.*\\/docs\\/.*|' + - 'http:\\/\\/www\\.kickstarter\\.com\\/projects\\/.*\\/.*|' + - 'http:\\/\\/www\\.scrapblog\\.com\\/viewer\\/viewer\\.aspx.*|' + - 'http:\\/\\/ping\\.fm\\/p\\/.*|' + - 'http:\\/\\/chart\\.ly\\/symbols\\/.*|' + - 'http:\\/\\/chart\\.ly\\/.*|' + - 'http:\\/\\/maps\\.google\\.com\\/maps\\?.*|' + - 'http:\\/\\/maps\\.google\\.com\\/\\?.*|' + - 'http:\\/\\/maps\\.google\\.com\\/maps\\/ms\\?.*|' + - 'http:\\/\\/.*\\.craigslist\\.org\\/.*\\/.*|' + - 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/show\\.dml\\?id=.*|' + - 'http:\\/\\/my\\.opera\\.com\\/.*\\/albums\\/showpic\\.dml\\?album=.*&picture=.*|' + - 'http:\\/\\/tumblr\\.com\\/.*|' + - 'http:\\/\\/.*\\.tumblr\\.com\\/post\\/.*|' + - 'http:\\/\\/www\\.polleverywhere\\.com\\/polls\\/.*|' + - 'http:\\/\\/www\\.polleverywhere\\.com\\/multiple_choice_polls\\/.*|' + - 'http:\\/\\/www\\.polleverywhere\\.com\\/free_text_polls\\/.*|' + - 'http:\\/\\/www\\.quantcast\\.com\\/wd:.*|' + - 'http:\\/\\/www\\.quantcast\\.com\\/.*|' + - 'http:\\/\\/siteanalytics\\.compete\\.com\\/.*|' + - 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/statsheet\\.com\\/statplot\\/charts\\/e\\/.*|' + - 'http:\\/\\/statsheet\\.com\\/.*\\/teams\\/.*\\/.*|' + - 'http:\\/\\/statsheet\\.com\\/tools\\/chartlets\\?chart=.*|' + - 'http:\\/\\/.*\\.status\\.net\\/notice\\/.*|' + - 'http:\\/\\/identi\\.ca\\/notice\\/.*|' + - 'http:\\/\\/brainbird\\.net\\/notice\\/.*|' + - 'http:\\/\\/shitmydadsays\\.com\\/notice\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/Profile\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/l\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/Groups\\/Overview\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Info\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/Gadgets\\/Install\\/.*|' + - 'http:\\/\\/www\\.studivz\\.net\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/Profile\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/l\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/Groups\\/Overview\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Info\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/Gadgets\\/Install\\/.*|' + - 'http:\\/\\/www\\.meinvz\\.net\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/Profile\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/l\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/Groups\\/Overview\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Info\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/Gadgets\\/Install\\/.*|' + - 'http:\\/\\/www\\.schuelervz\\.net\\/.*|' + - 'http:\\/\\/myloc\\.me\\/.*|' + - 'http:\\/\\/pastebin\\.com\\/.*|' + - 'http:\\/\\/pastie\\.org\\/.*|' + - 'http:\\/\\/www\\.pastie\\.org\\/.*|' + - 'http:\\/\\/redux\\.com\\/stream\\/item\\/.*\\/.*|' + - 'http:\\/\\/redux\\.com\\/f\\/.*\\/.*|' + - 'http:\\/\\/www\\.redux\\.com\\/stream\\/item\\/.*\\/.*|' + - 'http:\\/\\/www\\.redux\\.com\\/f\\/.*\\/.*|' + - 'http:\\/\\/cl\\.ly\\/.*|' + - 'http:\\/\\/cl\\.ly\\/.*\\/content|' + - 'http:\\/\\/speakerdeck\\.com\\/u\\/.*\\/p\\/.*|' + - 'http:\\/\\/www\\.kiva\\.org\\/lend\\/.*|' + - 'http:\\/\\/www\\.timetoast\\.com\\/timelines\\/.*|' + - 'http:\\/\\/storify\\.com\\/.*\\/.*|' + - 'http:\\/\\/.*meetup\\.com\\/.*|' + - 'http:\\/\\/meetu\\.ps\\/.*|' + - 'http:\\/\\/www\\.dailymile\\.com\\/people\\/.*\\/entries\\/.*|' + - 'http:\\/\\/.*\\.kinomap\\.com\\/.*|' + - 'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/content\\/.*|' + - 'http:\\/\\/www\\.metacdn\\.com\\/api\\/users\\/.*\\/media\\/.*|' + - 'http:\\/\\/prezi\\.com\\/.*\\/.*|' + - 'http:\\/\\/.*\\.uservoice\\.com\\/.*\\/suggestions\\/.*|' + - 'http:\\/\\/formspring\\.me\\/.*|' + - 'http:\\/\\/www\\.formspring\\.me\\/.*|' + - 'http:\\/\\/formspring\\.me\\/.*\\/q\\/.*|' + - 'http:\\/\\/www\\.formspring\\.me\\/.*\\/q\\/.*|' + - 'http:\\/\\/twitlonger\\.com\\/show\\/.*|' + - 'http:\\/\\/www\\.twitlonger\\.com\\/show\\/.*|' + - 'http:\\/\\/tl\\.gd\\/.*|' + - 'http:\\/\\/www\\.qwiki\\.com\\/q\\/.*|' + - 'http:\\/\\/crocodoc\\.com\\/.*|' + - 'http:\\/\\/.*\\.crocodoc\\.com\\/.*|' + - 'https:\\/\\/crocodoc\\.com\\/.*|' + - 'https:\\/\\/.*\\.crocodoc\\.com\\/.*|' + - 'http:\\/\\/www\\.wikipedia\\.org\\/wiki\\/.*|' + - 'http:\\/\\/www\\.wikimedia\\.org\\/wiki\\/File.*|' + - 'https:\\/\\/urtak\\.com\\/u\\/.*|' + - 'https:\\/\\/urtak\\.com\\/clr\\/.*|' + - 'http:\\/\\/graphicly\\.com\\/.*\\/.*\\/.*|' + - 'http:\\/\\/.*yfrog\\..*\\/.*|' + - 'http:\\/\\/www\\.flickr\\.com\\/photos\\/.*|' + - 'http:\\/\\/flic\\.kr\\/.*|' + - 'http:\\/\\/twitpic\\.com\\/.*|' + - 'http:\\/\\/www\\.twitpic\\.com\\/.*|' + - 'http:\\/\\/twitpic\\.com\\/photos\\/.*|' + - 'http:\\/\\/www\\.twitpic\\.com\\/photos\\/.*|' + - 'http:\\/\\/.*imgur\\.com\\/.*|' + - 'http:\\/\\/.*\\.posterous\\.com\\/.*|' + - 'http:\\/\\/post\\.ly\\/.*|' + - 'http:\\/\\/twitgoo\\.com\\/.*|' + - 'http:\\/\\/i.*\\.photobucket\\.com\\/albums\\/.*|' + - 'http:\\/\\/s.*\\.photobucket\\.com\\/albums\\/.*|' + - 'http:\\/\\/phodroid\\.com\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.mobypicture\\.com\\/user\\/.*\\/view\\/.*|' + - 'http:\\/\\/moby\\.to\\/.*|' + - 'http:\\/\\/xkcd\\.com\\/.*|' + - 'http:\\/\\/www\\.xkcd\\.com\\/.*|' + - 'http:\\/\\/imgs\\.xkcd\\.com\\/.*|' + - 'http:\\/\\/www\\.asofterworld\\.com\\/index\\.php\\?id=.*|' + - 'http:\\/\\/www\\.asofterworld\\.com\\/.*\\.jpg|' + - 'http:\\/\\/asofterworld\\.com\\/.*\\.jpg|' + - 'http:\\/\\/www\\.qwantz\\.com\\/index\\.php\\?comic=.*|' + - 'http:\\/\\/23hq\\.com\\/.*\\/photo\\/.*|' + - 'http:\\/\\/www\\.23hq\\.com\\/.*\\/photo\\/.*|' + - 'http:\\/\\/.*dribbble\\.com\\/shots\\/.*|' + - 'http:\\/\\/drbl\\.in\\/.*|' + - 'http:\\/\\/.*\\.smugmug\\.com\\/.*|' + - 'http:\\/\\/.*\\.smugmug\\.com\\/.*\\#.*|' + - 'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*|' + - 'http:\\/\\/emberapp\\.com\\/.*\\/images\\/.*\\/sizes\\/.*|' + - 'http:\\/\\/emberapp\\.com\\/.*\\/collections\\/.*\\/.*|' + - 'http:\\/\\/emberapp\\.com\\/.*\\/categories\\/.*\\/.*\\/.*|' + - 'http:\\/\\/embr\\.it\\/.*|' + - 'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*\\#.*|' + - 'http:\\/\\/picasaweb\\.google\\.com.*\\/lh\\/photo\\/.*|' + - 'http:\\/\\/picasaweb\\.google\\.com.*\\/.*\\/.*|' + - 'http:\\/\\/dailybooth\\.com\\/.*\\/.*|' + - 'http:\\/\\/brizzly\\.com\\/pic\\/.*|' + - 'http:\\/\\/pics\\.brizzly\\.com\\/.*\\.jpg|' + - 'http:\\/\\/img\\.ly\\/.*|' + - 'http:\\/\\/www\\.tinypic\\.com\\/view\\.php.*|' + - 'http:\\/\\/tinypic\\.com\\/view\\.php.*|' + - 'http:\\/\\/www\\.tinypic\\.com\\/player\\.php.*|' + - 'http:\\/\\/tinypic\\.com\\/player\\.php.*|' + - 'http:\\/\\/www\\.tinypic\\.com\\/r\\/.*\\/.*|' + - 'http:\\/\\/tinypic\\.com\\/r\\/.*\\/.*|' + - 'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.jpg|' + - 'http:\\/\\/.*\\.tinypic\\.com\\/.*\\.png|' + - 'http:\\/\\/meadd\\.com\\/.*\\/.*|' + - 'http:\\/\\/meadd\\.com\\/.*|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/art\\/.*|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/gallery\\/.*|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/\\#\\/.*|' + - 'http:\\/\\/fav\\.me\\/.*|' + - 'http:\\/\\/.*\\.deviantart\\.com|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/gallery|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.jpg|' + - 'http:\\/\\/.*\\.deviantart\\.com\\/.*\\/.*\\.gif|' + - 'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.jpg|' + - 'http:\\/\\/.*\\.deviantart\\.net\\/.*\\/.*\\.gif|' + - 'http:\\/\\/www\\.fotopedia\\.com\\/.*\\/.*|' + - 'http:\\/\\/fotopedia\\.com\\/.*\\/.*|' + - 'http:\\/\\/photozou\\.jp\\/photo\\/show\\/.*\\/.*|' + - 'http:\\/\\/photozou\\.jp\\/photo\\/photo_only\\/.*\\/.*|' + - 'http:\\/\\/instagr\\.am\\/p\\/.*|' + - 'http:\\/\\/instagram\\.com\\/p\\/.*|' + - 'http:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' + - 'http:\\/\\/img\\.skitch\\.com\\/.*|' + - 'https:\\/\\/skitch\\.com\\/.*\\/.*\\/.*|' + - 'https:\\/\\/img\\.skitch\\.com\\/.*|' + - 'http:\\/\\/share\\.ovi\\.com\\/media\\/.*\\/.*|' + - 'http:\\/\\/www\\.questionablecontent\\.net\\/|' + - 'http:\\/\\/questionablecontent\\.net\\/|' + - 'http:\\/\\/www\\.questionablecontent\\.net\\/view\\.php.*|' + - 'http:\\/\\/questionablecontent\\.net\\/view\\.php.*|' + - 'http:\\/\\/questionablecontent\\.net\\/comics\\/.*\\.png|' + - 'http:\\/\\/www\\.questionablecontent\\.net\\/comics\\/.*\\.png|' + - 'http:\\/\\/picplz\\.com\\/.*|' + - 'http:\\/\\/twitrpix\\.com\\/.*|' + - 'http:\\/\\/.*\\.twitrpix\\.com\\/.*|' + - 'http:\\/\\/www\\.someecards\\.com\\/.*\\/.*|' + - 'http:\\/\\/someecards\\.com\\/.*\\/.*|' + - 'http:\\/\\/some\\.ly\\/.*|' + - 'http:\\/\\/www\\.some\\.ly\\/.*|' + - 'http:\\/\\/pikchur\\.com\\/.*|' + - 'http:\\/\\/achewood\\.com\\/.*|' + - 'http:\\/\\/www\\.achewood\\.com\\/.*|' + - 'http:\\/\\/achewood\\.com\\/index\\.php.*|' + - 'http:\\/\\/www\\.achewood\\.com\\/index\\.php.*|' + - 'http:\\/\\/www\\.whosay\\.com\\/content\\/.*|' + - 'http:\\/\\/www\\.whosay\\.com\\/photos\\/.*|' + - 'http:\\/\\/www\\.whosay\\.com\\/videos\\/.*|' + - 'http:\\/\\/say\\.ly\\/.*|' + - 'http:\\/\\/ow\\.ly\\/i\\/.*|' + - 'http:\\/\\/color\\.com\\/s\\/.*|' + - 'http:\\/\\/bnter\\.com\\/convo\\/.*|' + - 'http:\\/\\/mlkshk\\.com\\/p\\/.*|' + - 'http:\\/\\/lockerz\\.com\\/s\\/.*|' + - 'http:\\/\\/lightbox\\.com\\/.*|' + - 'http:\\/\\/www\\.lightbox\\.com\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/.*\\/dp\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/dp\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/o\\/ASIN\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/gp\\/offer-listing\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/.*\\/ASIN\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/gp\\/product\\/images\\/.*|' + - 'http:\\/\\/.*amazon\\..*\\/gp\\/aw\\/d\\/.*|' + - 'http:\\/\\/www\\.amzn\\.com\\/.*|' + - 'http:\\/\\/amzn\\.com\\/.*|' + - 'http:\\/\\/www\\.shopstyle\\.com\\/browse.*|' + - 'http:\\/\\/www\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' + - 'http:\\/\\/api\\.shopstyle\\.com\\/action\\/apiVisitRetailer.*|' + - 'http:\\/\\/www\\.shopstyle\\.com\\/action\\/viewLook.*|' + - 'http:\\/\\/itunes\\.apple\\.com\\/.*|' + - 'https:\\/\\/itunes\\.apple\\.com\\/.*|' + - 'http:\\/\\/soundcloud\\.com\\/.*|' + - 'http:\\/\\/soundcloud\\.com\\/.*\\/.*|' + - 'http:\\/\\/soundcloud\\.com\\/.*\\/sets\\/.*|' + - 'http:\\/\\/soundcloud\\.com\\/groups\\/.*|' + - 'http:\\/\\/snd\\.sc\\/.*|' + - 'http:\\/\\/www\\.last\\.fm\\/music\\/.*|' + - 'http:\\/\\/www\\.last\\.fm\\/music\\/+videos\\/.*|' + - 'http:\\/\\/www\\.last\\.fm\\/music\\/+images\\/.*|' + - 'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/_\\/.*|' + - 'http:\\/\\/www\\.last\\.fm\\/music\\/.*\\/.*|' + - 'http:\\/\\/www\\.mixcloud\\.com\\/.*\\/.*\\/|' + - 'http:\\/\\/www\\.radionomy\\.com\\/.*\\/radio\\/.*|' + - 'http:\\/\\/radionomy\\.com\\/.*\\/radio\\/.*|' + - 'http:\\/\\/www\\.hark\\.com\\/clips\\/.*|' + - 'http:\\/\\/www\\.rdio\\.com\\/\\#\\/artist\\/.*\\/album\\/.*|' + - 'http:\\/\\/www\\.rdio\\.com\\/artist\\/.*\\/album\\/.*|' + - 'http:\\/\\/www\\.zero-inch\\.com\\/.*|' + - 'http:\\/\\/.*\\.bandcamp\\.com\\/|' + - 'http:\\/\\/.*\\.bandcamp\\.com\\/track\\/.*|' + - 'http:\\/\\/.*\\.bandcamp\\.com\\/album\\/.*|' + - 'http:\\/\\/freemusicarchive\\.org\\/music\\/.*|' + - 'http:\\/\\/www\\.freemusicarchive\\.org\\/music\\/.*|' + - 'http:\\/\\/freemusicarchive\\.org\\/curator\\/.*|' + - 'http:\\/\\/www\\.freemusicarchive\\.org\\/curator\\/.*|' + - 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.npr\\.org\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.npr\\.org\\/templates\\/story\\/story\\.php.*|' + - 'http:\\/\\/huffduffer\\.com\\/.*\\/.*|' + - 'http:\\/\\/www\\.audioboo\\.fm\\/boos\\/.*|' + - 'http:\\/\\/audioboo\\.fm\\/boos\\/.*|' + - 'http:\\/\\/boo\\.fm\\/b.*|' + - 'http:\\/\\/www\\.xiami\\.com\\/song\\/.*|' + - 'http:\\/\\/xiami\\.com\\/song\\/.*|' + - 'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' + - 'http:\\/\\/www\\.saynow\\.com\\/playMsg\\.html.*|' + - 'http:\\/\\/grooveshark\\.com\\/.*|' + - 'http:\\/\\/radioreddit\\.com\\/songs.*|' + - 'http:\\/\\/www\\.radioreddit\\.com\\/songs.*|' + - 'http:\\/\\/radioreddit\\.com\\/\\?q=songs.*|' + - 'http:\\/\\/www\\.radioreddit\\.com\\/\\?q=songs.*|' + - 'http:\\/\\/www\\.gogoyoko\\.com\\/song\\/.*|' + - 'http:\\/\\/espn\\.go\\.com\\/video\\/clip.*|' + - 'http:\\/\\/espn\\.go\\.com\\/.*\\/story.*|' + - 'http:\\/\\/abcnews\\.com\\/.*\\/video\\/.*|' + - 'http:\\/\\/abcnews\\.com\\/video\\/playerIndex.*|' + - 'http:\\/\\/washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.washingtonpost\\.com\\/wp-dyn\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.boston\\.com\\/video.*|' + - 'http:\\/\\/boston\\.com\\/video.*|' + - 'http:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' + - 'http:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' + - 'http:\\/\\/www\\.facebook\\.com\\/v\\/.*|' + - 'https:\\/\\/www\\.facebook\\.com\\/photo\\.php.*|' + - 'https:\\/\\/www\\.facebook\\.com\\/video\\/video\\.php.*|' + - 'https:\\/\\/www\\.facebook\\.com\\/v\\/.*|' + - 'http:\\/\\/cnbc\\.com\\/id\\/.*\\?.*video.*|' + - 'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\?.*video.*|' + - 'http:\\/\\/cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' + - 'http:\\/\\/www\\.cnbc\\.com\\/id\\/.*\\/play\\/1\\/video\\/.*|' + - 'http:\\/\\/cbsnews\\.com\\/video\\/watch\\/.*|' + - 'http:\\/\\/www\\.google\\.com\\/buzz\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.google\\.com\\/buzz\\/.*|' + - 'http:\\/\\/www\\.google\\.com\\/profiles\\/.*|' + - 'http:\\/\\/google\\.com\\/buzz\\/.*\\/.*\\/.*|' + - 'http:\\/\\/google\\.com\\/buzz\\/.*|' + - 'http:\\/\\/google\\.com\\/profiles\\/.*|' + - 'http:\\/\\/www\\.cnn\\.com\\/video\\/.*|' + - 'http:\\/\\/edition\\.cnn\\.com\\/video\\/.*|' + - 'http:\\/\\/money\\.cnn\\.com\\/video\\/.*|' + - 'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' + - 'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/vp\\/.*|' + - 'http:\\/\\/www\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' + - 'http:\\/\\/today\\.msnbc\\.msn\\.com\\/id\\/.*\\/ns\\/.*|' + - 'http:\\/\\/www\\.globalpost\\.com\\/video\\/.*|' + - 'http:\\/\\/www\\.globalpost\\.com\\/dispatch\\/.*|' + - 'http:\\/\\/guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/www\\.guardian\\.co\\.uk\\/.*\\/video\\/.*\\/.*\\/.*\\/.*|' + - 'http:\\/\\/bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' + - 'http:\\/\\/www\\.bravotv\\.com\\/.*\\/.*\\/videos\\/.*|' + - 'http:\\/\\/video\\.nationalgeographic\\.com\\/.*\\/.*\\/.*\\.html|' + - 'http:\\/\\/dsc\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/animal\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/health\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/investigation\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/military\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/planetgreen\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/science\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/tlc\\.discovery\\.com\\/videos\\/.*|' + - 'http:\\/\\/video\\.forbes\\.com\\/fvn\\/.*|' + - 'http:\\/\\/recordsetter\\.com\\/*\\/*\\/*' - , re.I - ) - - api_endpoint = 'http://api.embed.ly/1/oembed' - api_params = {'format':'json', 'maxwidth':600, 'key' : g.embedly_api_key } - -class GenericScraper(MediaScraper): - """a special scrapper not associated with any domains, used to - write media objects to links by hand""" - domains = ['*'] - height = 480 - width = 640 - - @classmethod - def media_embed(cls, content, height = None, width = None, scrolling = False, **kw): - return MediaEmbed(height = height or cls.height, - width = width or cls.width, - scrolling = scrolling, - content = content) - -class DeepScraper(object): - """Subclasses of DeepScraper attempt to dive into generic pages - for embeds of other types (like YouTube videos on blog - sites).""" - - def find_media_object(self, scraper): - return None - -class YoutubeEmbedDeepScraper(DeepScraper): - youtube_url_re = re.compile('^(http://www.youtube.com/v/([_a-zA-Z0-9-]+)).*') - - def find_media_object(self, scraper): - # try to find very simple youtube embeds - if not scraper.soup: - scraper.download() - - if scraper.soup: - movie_embed = scraper.soup.find('embed', - attrs={'src': lambda x: self.youtube_url_re.match(x)}) - if movie_embed: - youtube_id = self.youtube_url_re.match(movie_embed['src']).group(2) - youtube_url = 'http://www.youtube.com/watch?v=%s"' % youtube_id - log.debug('found youtube embed %s' % youtube_url) - mo = make_scraper(youtube_url).media_object() - mo['deep'] = scraper.url - return mo - -#scrapers =:= dict(domain -> ScraperClass) -scrapers = {} -for scraper in [ EmbedlyOEmbed, - YoutubeScraper, - MetacafeScraper, - GootubeScraper, - VimeoScraper, - BreakScraper, - TheOnionScraper, - CollegeHumorScraper, - FunnyOrDieScraper, - ComedyCentralScraper, - ColbertNationScraper, - TheDailyShowScraper, - TedScraper, - LiveLeakScraper, - DailyMotionScraper, - RevverScraper, - EscapistScraper, - JustintvScraper, - SoundcloudScraper, - CraigslistScraper, - GenericScraper, - ]: - for domain in scraper.domains: - scrapers.setdefault(domain, []).append(scraper) - -deepscrapers = [YoutubeEmbedDeepScraper] - -def get_media_embed(media_object): - for scraper in scrapers.get(media_object['type'], []): - res = scraper.media_embed(**media_object) - if res: - return res - if 'content' in media_object: - return GenericScraper.media_embed(**media_object) - -def convert_old_media_objects(): - q = Link._query(Link.c.media_object is not None, - Link.c._date > whenever, - data = True) - for link in utils.fetch_things2(q): - if not getattr(link, 'media_object', None): - continue - - if 'youtube' in link.media_object: - # we can rewrite this one without scraping - video_id = YoutubeScraper.video_id_rx.match(link.url) - link.media_object = dict(type='youtube.com', - video_id = video_id.group(1)) - elif ('video.google.com' in link.media_object - or 'metacafe' in link.media_object): - scraper = make_scraper(link.url) - if not scraper: - continue - mo = scraper.media_object() - if not mo: - continue - - link.media_object = mo - - else: - print "skipping %s because it confuses me" % link._fullname - continue - - link._commit() - -test_urls = [ - 'http://www.facebook.com/pages/Rick-Astley/5807213510?sid=c99aaf3888171e73668a38e0749ae12d', # regular thumbnail finder - 'http://www.flickr.com/photos/septuagesima/317819584/', # thumbnail with image_src - - #'http://www.youtube.com/watch?v=Yu_moia-oVI', - 'http://www.metacafe.com/watch/sy-1473689248/rick_astley_never_gonna_give_you_up_official_music_video/', - 'http://video.google.com/videoplay?docid=5908758151704698048', - #'http://vimeo.com/4495451', - 'http://www.break.com/usercontent/2008/11/Macy-s-Thankgiving-Day-Parade-Rick-Roll-611965.html', - 'http://www.theonion.com/content/video/sony_releases_new_stupid_piece_of', - 'http://www.collegehumor.com/video:1823712', - 'http://www.funnyordie.com/videos/7f2a184755/macys-thanksgiving-day-parade-gets-rick-rolled-from-that-happened', - 'http://www.comedycentral.com/videos/index.jhtml?videoId=178342&title=ultimate-fighting-vs.-bloggers', - - # old style - 'http://www.thedailyshow.com/video/index.jhtml?videoId=175244&title=Photoshop-of-Horrors', - # new style - 'http://www.thedailyshow.com/watch/wed-july-22-2009/the-born-identity', - - 'http://www.colbertnation.com/the-colbert-report-videos/63549/may-01-2006/sign-off---spam', - 'http://www.liveleak.com/view?i=e09_1207983531', - 'http://www.dailymotion.com/relevance/search/rick+roll/video/x5l8e6_rickroll_fun', - 'http://revver.com/video/1199591/rick-rolld-at-work/', - 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10-The-Orange-Box', - 'http://www.escapistmagazine.com/videos/view/unskippable/736-Lost-Odyssey', - - # justin.tv has two media types that we care about, streams, which - # we can scrape, and clips, which we can't - 'http://www.justin.tv/help', # stream - 'http://www.justin.tv/clip/c07a333f94e5716b', # clip, which we can't currently scrape, and shouldn't try - - 'http://soundcloud.com/kalhonaaho01/never-gonna-stand-you-up-rick-astley-vs-ludacris-album-version', - - 'http://www.craigslist.org/about/best/sea/240705630.html', - - 'http://listen.grooveshark.com/#/song/Never_Gonna_Give_You_Up/12616328', - 'http://tinysong.com/2WOJ', # also Grooveshark - 'http://www.slideshare.net/doina/happy-easter-from-holland-slideshare', - 'http://www.slideshare.net/stinson/easter-1284190', - 'http://www.slideshare.net/angelspascual/easter-events', - 'http://www.slideshare.net/sirrods/happy-easter-3626014', - 'http://www.slideshare.net/sirrods/happy-easter-wide-screen', - 'http://www.slideshare.net/carmen_serbanescu/easter-holiday', - 'http://www.slideshare.net/Lithuaniabook/easter-1255880', - 'http://www.slideshare.net/hues/easter-plants', - 'http://www.slideshare.net/Gospelman/passover-week', - 'http://www.slideshare.net/angelspascual/easter-around-the-world-1327542', - 'http://www.scribd.com/doc/13994900/Easter', - 'http://www.scribd.com/doc/27425714/Celebrating-Easter-ideas-for-adults-and-children', - 'http://www.scribd.com/doc/28010101/Easter-Foods-No-Name', - 'http://www.scribd.com/doc/28452730/Easter-Cards', - 'http://www.scribd.com/doc/19026714/The-Easter-Season', - 'http://www.scribd.com/doc/29183659/History-of-Easter', - 'http://www.scribd.com/doc/15632842/The-Last-Easter', - 'http://www.scribd.com/doc/28741860/The-Plain-Truth-About-Easter', - 'http://www.scribd.com/doc/23616250/4-27-08-ITS-EASTER-AGAIN-ORTHODOX-EASTER-by-vanderKOK', - 'http://screenr.com/t9d', - 'http://screenr.com/yLS', - 'http://screenr.com/gzS', - 'http://screenr.com/IwU', - 'http://screenr.com/FM7', - 'http://screenr.com/Ejg', - 'http://screenr.com/u4h', - 'http://screenr.com/QiN', - 'http://screenr.com/zts', - 'http://www.5min.com/Video/How-to-Decorate-Easter-Eggs-with-Decoupage-142076462', - 'http://www.5min.com/Video/How-to-Color-Easter-Eggs-Dye-142076281', - 'http://www.5min.com/Video/How-to-Make-an-Easter-Egg-Diorama-142076482', - 'http://www.5min.com/Video/How-to-Make-Sequined-Easter-Eggs-142076512', - 'http://www.5min.com/Video/How-to-Decorate-Wooden-Easter-Eggs-142076558', - 'http://www.5min.com/Video/How-to-Blow-out-an-Easter-Egg-142076367', - 'http://www.5min.com/Video/Learn-About-Easter-38363995', - 'http://www.howcast.com/videos/368909-Easter-Egg-Dying-How-To-Make-Ukrainian-Easter-Eggs', - 'http://www.howcast.com/videos/368911-Easter-Egg-Dying-How-To-Color-Easter-Eggs-With-Food-Dyes', - 'http://www.howcast.com/videos/368913-Easter-Egg-Dying-How-To-Make-Homemade-Easter-Egg-Dye', - 'http://www.howcast.com/videos/220110-The-Meaning-Of-Easter', - 'http://my.opera.com/nirvanka/albums/show.dml?id=519866', - 'http://img402.yfrog.com/i/mfe.jpg/', - 'http://img20.yfrog.com/i/dy6.jpg/', - 'http://img145.yfrog.com/i/4mu.mp4/', - 'http://img15.yfrog.com/i/mygreatmovie.mp4/', - 'http://img159.yfrog.com/i/500x5000401.jpg/', - 'http://tweetphoto.com/14784358', - 'http://tweetphoto.com/16044847', - 'http://tweetphoto.com/16718883', - 'http://tweetphoto.com/16451148', - 'http://tweetphoto.com/16133984', - 'http://tweetphoto.com/8069529', - 'http://tweetphoto.com/16207556', - 'http://tweetphoto.com/7448361', - 'http://tweetphoto.com/16069325', - 'http://tweetphoto.com/4791033', - 'http://www.flickr.com/photos/10349896@N08/4490293418/', - 'http://www.flickr.com/photos/mneylon/4483279051/', - 'http://www.flickr.com/photos/xstartxtodayx/4488996521/', - 'http://www.flickr.com/photos/mommyknows/4485313917/', - 'http://www.flickr.com/photos/29988430@N06/4487127638/', - 'http://www.flickr.com/photos/excomedia/4484159563/', - 'http://www.flickr.com/photos/sunnybrook100/4471526636/', - 'http://www.flickr.com/photos/jaimewalsh/4489497178/', - 'http://www.flickr.com/photos/29988430@N06/4486475549/', - 'http://www.flickr.com/photos/22695183@N08/4488681694/', - 'http://twitpic.com/1cnsf6', - 'http://twitpic.com/1cgtti', - 'http://twitpic.com/1coc0n', - 'http://twitpic.com/1cm8us', - 'http://twitpic.com/1cgks4', - 'http://imgur.com/6pLoN', - 'http://onegoodpenguin.posterous.com/golden-tee-live-2010-easter-egg', - 'http://adland.posterous.com/?tag=royaleastershowauckland', - 'http://apartmentliving.posterous.com/biggest-easter-egg-hunts-in-the-dc-area', - 'http://twitgoo.com/1as', - 'http://twitgoo.com/1p94', - 'http://twitgoo.com/4kg2', - 'http://twitgoo.com/6c9', - 'http://twitgoo.com/1w5', - 'http://twitgoo.com/6mu', - 'http://twitgoo.com/1w3', - 'http://twitgoo.com/1om', - 'http://twitgoo.com/1mh', - 'http://www.qwantz.com/index.php?comic=1686', - 'http://www.qwantz.com/index.php?comic=773', - 'http://www.qwantz.com/index.php?comic=1018', - 'http://www.qwantz.com/index.php?comic=1019', - 'http://www.23hq.com/mhg/photo/5498347', - 'http://www.23hq.com/Greetingdesignstudio/photo/5464607', - 'http://www.23hq.com/Greetingdesignstudio/photo/5464590', - 'http://www.23hq.com/Greetingdesignstudio/photo/5464605', - 'http://www.23hq.com/Greetingdesignstudio/photo/5464604', - 'http://www.23hq.com/dvilles2/photo/5443192', - 'http://www.23hq.com/Greetingdesignstudio/photo/5464606', - 'http://www.youtube.com/watch?v=gghKdx558Qg', - 'http://www.youtube.com/watch?v=yPid9BLQQcg', - 'http://www.youtube.com/watch?v=uEo2vboUYUk', - 'http://www.youtube.com/watch?v=geUhtoHbLu4', - 'http://www.youtube.com/watch?v=Zk7dDekYej0', - 'http://www.youtube.com/watch?v=Q3tgMosx_tI', - 'http://www.youtube.com/watch?v=s9P8_vgmLfs', - 'http://www.youtube.com/watch?v=1cmtN1meMmk', - 'http://www.youtube.com/watch?v=AVzj-U5Ihm0', - 'http://www.veoh.com/collection/easycookvideos/watch/v366931kcdgj7Hd', - 'http://www.veoh.com/collection/easycookvideos/watch/v366991zjpANrqc', - 'http://www.veoh.com/browse/videos/category/educational/watch/v7054535EZGFJqyX', - 'http://www.veoh.com/browse/videos/category/lifestyle/watch/v18155013XBBtnYwq', - 'http://www.justin.tv/easter7presents', - 'http://www.justin.tv/easterfraud', - 'http://www.justin.tv/cccog27909', - 'http://www.justin.tv/clip/6e8c18f7050', - 'http://www.justin.tv/venom24', - 'http://qik.com/video/1622287', - 'http://qik.com/video/1503735', - 'http://qik.com/video/40504', - 'http://qik.com/video/1445763', - 'http://qik.com/video/743285', - 'http://qik.com/video/1445299', - 'http://qik.com/video/1443200', - 'http://qik.com/video/1445889', - 'http://qik.com/video/174242', - 'http://qik.com/video/1444897', - 'http://revision3.com/hak5/DualCore', - 'http://revision3.com/popsiren/charm', - 'http://revision3.com/tekzilla/eyefinity', - 'http://revision3.com/diggnation/2005-10-06', - 'http://revision3.com/hak5/netcat-virtualization-wordpress/', - 'http://revision3.com/infected/forsaken', - 'http://revision3.com/hak5/purepwnage', - 'http://revision3.com/tekzilla/wowheadset', - 'http://www.dailymotion.com/video/xcstzd_greek-wallets-tighten-during-easter_news', - 'http://www.dailymotion.com/video/xcso4y_exclusive-easter-eggs-easter-basket_lifestyle', - 'http://www.dailymotion.com/video/x2sgkt_evil-easter-bunny', - 'http://www.dailymotion.com/video/xco7oc_invitation-to-2010-easter-services_news', - 'http://www.dailymotion.com/video/xcss6b_big-cat-easter_animals', - 'http://www.dailymotion.com/video/xcszw1_easter-bunny-visits-buenos-aires-zo_news', - 'http://www.dailymotion.com/video/xcsfvs_forecasters-warn-of-easter-misery_news', - 'http://www.collegehumor.com/video:1682246', - 'http://www.twitvid.com/D9997', - 'http://www.twitvid.com/902B9', - 'http://www.twitvid.com/C33F8', - 'http://www.twitvid.com/63F73', - 'http://www.twitvid.com/BC0BA', - 'http://www.twitvid.com/1C33C', - 'http://www.twitvid.com/8A8E2', - 'http://www.twitvid.com/51035', - 'http://www.twitvid.com/5C733', - 'http://www.break.com/game-trailers/game/just-cause-2/just-cause-2-lost-easter-egg?res=1', - 'http://www.break.com/usercontent/2010/3/10/easter-holiday-2009-slideshow-1775624', - 'http://www.break.com/index/a-very-sexy-easter-video.html', - 'http://www.break.com/usercontent/2010/3/11/this-video-features-gizzi-erskine-making-easter-cookies-1776089', - 'http://www.break.com/usercontent/2007/4/4/happy-easter-265717', - 'http://www.break.com/usercontent/2007/4/17/extreme-easter-egg-hunting-276064', - 'http://www.break.com/usercontent/2006/11/18/the-evil-easter-bunny-184789', - 'http://www.break.com/usercontent/2006/4/16/hoppy-easter-kitty-91040', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104063637', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004674', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103928002', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103999188', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103920940', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=103981831', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104004673', - 'http://vids.myspace.com/index.cfm?fuseaction=vids.individual&videoid=104046456', - 'http://www.metacafe.com/watch/105023/the_easter_bunny/', - 'http://www.metacafe.com/watch/4376131/easter_lay/', - 'http://www.metacafe.com/watch/2245996/how_to_make_ukraine_easter_eggs/', - 'http://www.metacafe.com/watch/4374339/easter_eggs/', - 'http://www.metacafe.com/watch/2605860/filled_easter_baskets/', - 'http://www.metacafe.com/watch/2372088/easter_eggs/', - 'http://www.metacafe.com/watch/3043671/www_goodnews_ws_easter_island/', - 'http://www.metacafe.com/watch/1652057/easter_eggs/', - 'http://www.metacafe.com/watch/1173632/ultra_kawaii_easter_bunny_party/', - 'http://celluloidremix.blip.tv/file/3378272/', - 'http://blip.tv/file/449469', - 'http://blip.tv/file/199776', - 'http://blip.tv/file/766967', - 'http://blip.tv/file/770127', - 'http://blip.tv/file/854925', - 'http://www.blip.tv/file/22695?filename=Uncle_dale-THEEASTERBUNNYHATESYOU395.flv', - 'http://iofa.blip.tv/file/3412333/', - 'http://blip.tv/file/190393', - 'http://blip.tv/file/83152', - 'http://video.google.com/videoplay?docid=-5427138374898988918&q=easter+bunny&pl=true', - 'http://video.google.com/videoplay?docid=7785441737970480237', - 'http://video.google.com/videoplay?docid=2320995867449957036', - 'http://video.google.com/videoplay?docid=-2586684490991458032&q=peeps&pl=true', - 'http://video.google.com/videoplay?docid=5621139047118918034', - 'http://video.google.com/videoplay?docid=4232304376070958848', - 'http://video.google.com/videoplay?docid=-6612726032157145299', - 'http://video.google.com/videoplay?docid=4478549130377875994&hl=en', - 'http://video.google.com/videoplay?docid=9169278170240080877', - 'http://video.google.com/videoplay?docid=2551240967354893096', - 'http://video.yahoo.com/watch/7268801/18963438', - 'http://video.yahoo.com/watch/2224892/7014048', - 'http://video.yahoo.com/watch/7244748/18886014', - 'http://video.yahoo.com/watch/4656845/12448951', - 'http://video.yahoo.com/watch/363942/2249254', - 'http://video.yahoo.com/watch/2232968/7046348', - 'http://video.yahoo.com/watch/4530253/12135472', - 'http://video.yahoo.com/watch/2237137/7062908', - 'http://video.yahoo.com/watch/952841/3706424', - 'http://www.viddler.com/explore/BigAppleChannel/videos/113/', - 'http://www.viddler.com/explore/cheezburger/videos/379/', - 'http://www.viddler.com/explore/warnerbros/videos/350/', - 'http://www.viddler.com/explore/tvcgroup/videos/169/', - 'http://www.viddler.com/explore/thebrickshow/videos/12/', - 'http://www.liveleak.com/view?i=e0b_1239827917', - 'http://www.liveleak.com/view?i=715_1239490211', - 'http://www.liveleak.com/view?i=d30_1206233786&p=1', - 'http://www.liveleak.com/view?i=d91_1239548947', - 'http://www.liveleak.com/view?i=f58_1190741182', - 'http://www.liveleak.com/view?i=44e_1179885621&c=1', - 'http://www.liveleak.com/view?i=451_1188059885', - 'http://www.liveleak.com/view?i=3f5_1267456341&c=1', - 'http://www.hulu.com/watch/67313/howcast-how-to-make-braided-easter-bread', - 'http://www.hulu.com/watch/133583/access-hollywood-glees-matthew-morrison-on-touring-and-performing-for-president-obama', - 'http://www.hulu.com/watch/66319/saturday-night-live-easter-album', - 'http://www.hulu.com/watch/80229/explorer-end-of-easter-island', - 'http://www.hulu.com/watch/139020/nbc-today-show-lamb-and-ham-create-easter-feast', - 'http://www.hulu.com/watch/84272/rex-the-runt-easter-island', - 'http://www.hulu.com/watch/132203/everyday-italian-easter-pie', - 'http://www.hulu.com/watch/23349/nova-secrets-of-lost-empires-ii-easter-island', - 'http://movieclips.com/watch/dirty_harry_1971/do_you_feel_lucky_punk/', - 'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/', - 'http://movieclips.com/watch/dumb__dumber_1994/the_toilet_doesnt_flush/', - 'http://movieclips.com/watch/jaws_1975/youre_gonna_need_a_bigger_boat/', - 'http://movieclips.com/watch/napoleon_dynamite_2004/chatting_online_with_babes/61.495/75.413', - 'http://movieclips.com/watch/super_troopers_2001/the_cat_game/12.838/93.018', - 'http://movieclips.com/watch/this_is_spinal_tap_1984/these_go_to_eleven/79.703/129.713', - 'http://crackle.com/c/Originals/What_s_the_deal_with_Easter_candy_/2303243', - 'http://crackle.com/c/How_To/Dryer_Lint_Easter_Bunny_Trailer_Park_Craft/2223902', - 'http://crackle.com/c/How_To/Pagan_Origin_of_Easter_Easter_Egg_Rabbit_Playb_/2225124', - 'http://crackle.com/c/Funny/Happy_Easter/2225363', - 'http://crackle.com/c/Funny/Crazy_and_Hilarious_Easter_Egg_Hunt/2225737', - 'http://crackle.com/c/How_To/Learn_About_Greek_Orthodox_Easter/2262294', - 'http://crackle.com/c/How_To/How_to_Make_Ukraine_Easter_Eggs/2262274', - 'http://crackle.com/c/How_To/Symbolism_Of_Ukrainian_Easter_Eggs/2262267', - 'http://crackle.com/c/Funny/Easter_Retard/931976', - 'http://www.fancast.com/tv/It-s-the-Easter-Beagle,-Charlie-Brown/74789/1078053475/Peanuts:-Specials:-It-s-the-Easter-Beagle,-Charlie-Brown/videos', - 'http://www.fancast.com/movies/Easter-Parade/97802/687440525/Easter-Parade/videos', - 'http://www.fancast.com/tv/Saturday-Night-Live/10009/1083396482/Easter-Album/videos', - 'http://www.fancast.com/movies/The-Proposal/147176/1140660489/The-Proposal:-Easter-Egg-Hunt/videos', - 'http://www.funnyordie.com/videos/f6883f54ae/the-unsettling-ritualistic-origin-of-the-easter-bunny', - 'http://www.funnyordie.com/videos/3ccb03863e/easter-tail-keaster-bunny', - 'http://www.funnyordie.com/videos/17b1d36ad0/easter-bunny-from-leatherfink', - 'http://www.funnyordie.com/videos/0c55aa116d/easter-exposed-from-bryan-erwin', - 'http://www.funnyordie.com/videos/040dac4eff/easter-eggs', - 'http://vimeo.com/10446922', - 'http://vimeo.com/10642542', - 'http://www.vimeo.com/10664068', - 'http://vimeo.com/819176', - 'http://www.vimeo.com/10525353', - 'http://vimeo.com/10429123', - 'http://www.vimeo.com/10652053', - 'http://vimeo.com/10572216', - 'http://www.ted.com/talks/jared_diamond_on_why_societies_collapse.html', - 'http://www.ted.com/talks/nathan_myhrvold_on_archeology_animal_photography_bbq.html', - 'http://www.ted.com/talks/johnny_lee_demos_wii_remote_hacks.html', - 'http://www.ted.com/talks/robert_ballard_on_exploring_the_oceans.html', - 'http://www.omnisio.com/v/Z3QxbTUdjhG/wall-e-collection-of-videos', - 'http://www.omnisio.com/v/3ND6LTvdjhG/php-tutorial-4-login-form-updated', - 'http://www.thedailyshow.com/watch/thu-december-14-2000/intro---easter', - 'http://www.thedailyshow.com/watch/tue-april-18-2006/headlines---easter-charade', - 'http://www.thedailyshow.com/watch/tue-april-18-2006/egg-beaters', - 'http://www.thedailyshow.com/watch/tue-april-18-2006/moment-of-zen---scuba-diver-hiding-easter-eggs', - 'http://www.thedailyshow.com/watch/tue-april-7-2009/easter---passover-highlights', - 'http://www.thedailyshow.com/watch/tue-february-29-2000/headlines---leap-impact', - 'http://www.thedailyshow.com/watch/thu-march-1-2007/tomb-with-a-jew', - 'http://www.thedailyshow.com/watch/mon-april-24-2000/the-meaning-of-passover', - 'http://www.colbertnation.com/the-colbert-report-videos/268800/march-31-2010/easter-under-attack---peeps-display-update', - 'http://www.colbertnation.com/the-colbert-report-videos/268797/march-31-2010/intro---03-31-10', - 'http://www.colbertnation.com/full-episodes/wed-march-31-2010-craig-mullaney', - 'http://www.colbertnation.com/the-colbert-report-videos/60902/march-28-2006/the-word---easter-under-attack---marketing', - 'http://www.colbertnation.com/the-colbert-report-videos/83362/march-07-2007/easter-under-attack---bunny', - 'http://www.colbertnation.com/the-colbert-report-videos/61404/april-06-2006/easter-under-attack---recalled-eggs?videoId=61404', - 'http://www.colbertnation.com/the-colbert-report-videos/223957/april-06-2009/colbert-s-easter-parade', - 'http://www.colbertnation.com/the-colbert-report-videos/181772/march-28-2006/intro---3-28-06', - 'http://www.traileraddict.com/trailer/despicable-me/easter-greeting', - 'http://www.traileraddict.com/trailer/easter-parade/trailer', - 'http://www.traileraddict.com/clip/the-proposal/easter-egg-hunt', - 'http://www.traileraddict.com/trailer/despicable-me/international-teaser-trailer', - 'http://www.traileraddict.com/trailer/despicable-me/today-show-minions', - 'http://revver.com/video/263817/happy-easter/', - 'http://www.revver.com/video/1574939/easter-bunny-house/', - 'http://revver.com/video/771140/easter-08/', - ] - -def submit_all(): - from r2.models import Subreddit, Account, Link, NotFound - from r2.lib.media import set_media - from r2.lib.db import queries - sr = Subreddit._by_name('testmedia') - author = Account._by_name('testmedia') - links = [] - for url in test_urls: - try: - # delete any existing version of the link - l = Link._by_url(url, sr) - print "Deleting %s" % l - l._deleted = True - l._commit() - except NotFound: - pass - - l = Link._submit(url, url, author, sr, '0.0.0.0') - - try: - set_media(l) - except Exception, e: - print e - - queries.new_link(l) - - links.append(l) - - return links - -def test_real(nlinks): - from r2.models import Link, desc - from r2.lib.utils import fetch_things2 - - counter = 0 - q = Link._query(sort = desc("_date")) - - print "" - for l in fetch_things2(q): - if counter > nlinks: - break - if not l.is_self: - h = make_scraper(l.url) - mo = h.media_object() - print "scraper: %s" % mo - if mo: - print get_media_embed(mo).content - counter +=1 - print "
" - -def test_url(url): - import sys - from r2.lib.filters import websafe - sys.stderr.write("%s\n" % url) - print "" - h = make_scraper(url) - print "" - print "", websafe(url), "" - print "
" - print websafe(repr(h)) - img = h.largest_image_url() - if img: - print "" % img - else: - print "(no image)" - mo = h.media_object() - print "" - if mo: - print get_media_embed(mo).content - else: - print "None" - print "" - print "" - -def test(): - """Take some example URLs and print out a nice pretty HTML table - of their extracted thubmnails and media objects""" - print "" - for url in test_urls: - test_url(url) - print "
" -