Overhaul scraper_q processing.

This is primarily about removing the dead old scrapers in favor of a
streamlined oEmbed-based system.  The new embed.ly scraper uses their
Services API to determine which URLs can be scraped through them.  This
removes the giant list of domains and regexes from the code and ensures
the scraper_q processor always has the latest list of scrapable domains.
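
To make the mechanism concrete, here is a minimal standalone sketch of
the new lookup (it mirrors _fetch_embedly_services and Scraper.for_url
in the diff below; the names fetch_embedly_services and
is_embedly_scrapable are illustrative, and urlparse stands in for
r2.lib.utils.domain):

    import collections
    import json
    import re
    import urllib2
    import urlparse

    def fetch_embedly_services():
        # embed.ly's service catalog, flattened to one combined
        # regex per domain/subdomain
        response = urllib2.urlopen("http://api.embed.ly/1/services/python")
        patterns_by_domain = collections.defaultdict(set)
        for service in json.load(response):
            for domain in [service["domain"]] + service["subdomains"]:
                patterns_by_domain[domain].update(service["regex"])
        return {domain: "(?:%s)" % "|".join(patterns)
                for domain, patterns in patterns_by_domain.iteritems()}

    def is_embedly_scrapable(services, url):
        # a URL is routed through embed.ly iff the combined regex for
        # its domain matches; everything else falls back to the
        # generic thumbnail-only scraper
        pattern = services.get(urlparse.urlparse(url).hostname)
        return bool(pattern and re.match(pattern, url))

The services map is memoized for an hour (time=3600 below), which is
what keeps the processor's domain list current without code changes;
for matched URLs, "video" and "rich" oEmbed responses supply the html,
width, and height used to build the media embed.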
Neil Williams
2013-08-29 10:50:21 -07:00
parent ee5f8e188b
commit 252437da17
6 changed files with 348 additions and 1910 deletions

View File

@@ -66,7 +66,7 @@ from r2.lib import tracking, cssfilter, emailer
 from r2.lib.subreddit_search import search_reddits
 from r2.lib.log import log_text
 from r2.lib.filters import safemarkdown
-from r2.lib.scraper import str_to_image
+from r2.lib.media import str_to_image
 from r2.controllers.api_docs import api_doc, api_section
 from r2.lib.search import SearchQuery
 from r2.controllers.oauth2 import OAuth2ResourceController, require_oauth2_scope

View File

@@ -25,7 +25,7 @@ from pylons.controllers.util import abort
 from r2.controllers.reddit_base import MinimalController
 from r2.lib.pages import MediaEmbedBody
-from r2.lib.scraper import get_media_embed
+from r2.lib.media import get_media_embed
 from r2.lib.validator import validate, VLink
@@ -39,11 +39,6 @@ class MediaembedController(MinimalController):
         if not link or not link.media_object:
             abort(404)

-        if isinstance(link.media_object, basestring):
-            # it's an old-style string
-            content = link.media_object
         elif isinstance(link.media_object, dict):
             # otherwise it's the new style, which is a dict(type=type, **args)
             media_embed = get_media_embed(link.media_object)

View File

@@ -414,7 +414,7 @@ class LinkJsonTemplate(ThingJsonTemplate):
         )

     def thing_attr(self, thing, attr):
-        from r2.lib.scraper import get_media_embed
+        from r2.lib.media import get_media_embed
         if attr == "media_embed":
             if (thing.media_object and
                 not isinstance(thing.media_object, basestring)):

View File

@@ -20,36 +20,144 @@
 # Inc. All Rights Reserved.
 ###############################################################################
-import subprocess
-from pylons import g, config
-from r2.models.link import Link
-from r2.lib import s3cp
-from r2.lib.utils import timeago, fetch_things2
-from r2.lib.utils import TimeoutFunction, TimeoutFunctionException
-from r2.lib.db.operators import desc
-from r2.lib.scraper import make_scraper, str_to_image, image_to_str, prepare_image
-from r2.lib import amqp
-from r2.lib.nymph import optimize_png
-import Image
+import base64
+import collections
+import cStringIO
+import hashlib
+import json
+import math
+import mimetypes
 import os
+import re
+import subprocess
 import tempfile
 import traceback
-import base64
-import hashlib
-import mimetypes
+import urllib
+import urllib2
+import urlparse
+
+import BeautifulSoup
+import Image
+import ImageFile
+
+from pylons import g
+
+from r2.lib import amqp, s3cp
+from r2.lib.memoize import memoize
+from r2.lib.nymph import optimize_png
+from r2.lib.utils import TimeoutFunction, TimeoutFunctionException, domain
+from r2.models.link import Link
+
 s3_direct_url = "s3.amazonaws.com"
-threads = 20
-log = g.log
 MEDIA_FILENAME_LENGTH = 12
+thumbnail_size = 70, 70
+
+
+def _image_to_str(image):
+    s = cStringIO.StringIO()
+    image.save(s, image.format)
+    return s.getvalue()
+
+
+def str_to_image(s):
+    s = cStringIO.StringIO(s)
+    image = Image.open(s)
+    return image
+
+
+def _image_entropy(img):
+    """calculate the entropy of an image"""
+    hist = img.histogram()
+    hist_size = sum(hist)
+    hist = [float(h) / hist_size for h in hist]
+
+    return -sum(p * math.log(p, 2) for p in hist if p != 0)
+
+
+def _square_image(img):
+    """if the image is taller than it is wide, square it off. determine
+    which pieces to cut off based on the entropy pieces."""
+    x,y = img.size
+    while y > x:
+        #slice 10px at a time until square
+        slice_height = min(y - x, 10)
+        bottom = img.crop((0, y - slice_height, x, y))
+        top = img.crop((0, 0, x, slice_height))
+
+        #remove the slice with the least entropy
+        if _image_entropy(bottom) < _image_entropy(top):
+            img = img.crop((0, 0, x, y - slice_height))
+        else:
+            img = img.crop((0, slice_height, x, y))
+
+        x,y = img.size
+
+    return img
+
+
+def _prepare_image(image):
+    image = _square_image(image)
+    image.thumbnail(thumbnail_size, Image.ANTIALIAS)
+    return image
+
+
+def _clean_url(url):
+    """url quotes unicode data out of urls"""
+    url = url.encode('utf8')
+    url = ''.join(urllib.quote(c) if ord(c) >= 127 else c for c in url)
+    return url
+
+
+def _initialize_request(url, referer):
+    url = _clean_url(url)
+
+    if not url.startswith(("http://", "https://")):
+        return
+
+    req = urllib2.Request(url)
+    if g.useragent:
+        req.add_header('User-Agent', g.useragent)
+    if referer:
+        req.add_header('Referer', referer)
+    return req
+
+
+def _fetch_url(url, referer=None):
+    request = _initialize_request(url, referer=referer)
+    if not request:
+        return None, None
+    response = urllib2.urlopen(request)
+    return response.headers.get("Content-Type"), response.read()
+
+
+@memoize('media.fetch_size', time=3600)
+def _fetch_image_size(url, referer):
+    """Return the size of an image by URL downloading as little as possible."""
+    request = _initialize_request(url, referer)
+    if not request:
+        return None
+
+    parser = ImageFile.Parser()
+    response = None
+    try:
+        response = urllib2.urlopen(request)
+
+        while True:
+            chunk = response.read(1024)
+            if not chunk:
+                break
+
+            parser.feed(chunk)
+            if parser.image:
+                return parser.image.size
+    except urllib2.URLError:
+        return None
+    finally:
+        if response:
+            response.close()
+
+
 def optimize_jpeg(filename, optimizer):
@@ -151,7 +259,7 @@ def update_link(link, thumbnail, media_object, thumbnail_size=None):
     link._commit()


-def set_media(link, force = False):
+def _set_media(embedly_services, link, force=False):
     if link.is_self:
         return
     if not force and link.promoted:
@@ -159,16 +267,14 @@ def set_media(link, force = False):
     elif not force and (link.has_thumbnail or link.media_object):
         return

-    scraper = make_scraper(link.url)
-    thumbnail = scraper.thumbnail()
-    media_object = scraper.media_object()
+    scraper = Scraper.for_url(embedly_services, link.url)
+    thumbnail, media_object = scraper.scrape()

     if media_object:
         # the scraper should be able to make a media embed out of the
         # media object it just gave us. if not, null out the media object
         # to protect downstream code
-        res = scraper.media_embed(**media_object)
+        res = scraper.media_embed(media_object)
         if not res:
             print "%s made a bad media obj for link %s" % (scraper, link._id36)
@@ -181,7 +287,7 @@ def set_media(link, force = False):
 def force_thumbnail(link, image_data, never_expire=True, file_type=".jpg"):
     image = str_to_image(image_data)
-    image = prepare_image(image)
+    image = _prepare_image(image)
     thumb_url = upload_media(image, never_expire=never_expire, file_type=file_type)
     update_link(link, thumbnail=thumb_url, media_object=None, thumbnail_size=image.size)
@@ -190,7 +296,7 @@ def upload_icon(file_name, image_data, size):
     image = str_to_image(image_data)
     image.format = 'PNG'
     image.thumbnail(size, Image.ANTIALIAS)
-    icon_data = image_to_str(image)
+    icon_data = _image_to_str(image)
     return s3_upload_media(icon_data,
                            file_name=file_name,
                            mime_type='image/png',
@@ -201,16 +307,218 @@ def upload_icon(file_name, image_data, size):
 def can_upload_icon():
     return g.media_store == 's3'
+
+
+def get_media_embed(media_object):
+    if not isinstance(media_object, dict):
+        return
+
+    if "oembed" not in media_object:
+        return
+
+    return _EmbedlyScraper.media_embed(media_object)
+
+
+class MediaEmbed(object):
+    width = None
+    height = None
+    content = None
+    scrolling = False
+
+    def __init__(self, height, width, content, scrolling=False):
+        self.height = int(height)
+        self.width = int(width)
+        self.content = content
+        self.scrolling = scrolling
+
+
+def _make_thumbnail_from_url(thumbnail_url, referer):
+    if not thumbnail_url:
+        return
+
+    content_type, content = _fetch_url(thumbnail_url, referer=referer)
+    if not content:
+        return
+
+    image = str_to_image(content)
+    return _prepare_image(image)
+
+
+class Scraper(object):
+    @classmethod
+    def for_url(cls, embedly_services, url):
+        url_domain = domain(url)
+        domain_embedly_regex = embedly_services.get(url_domain, None)
+        if domain_embedly_regex and re.match(domain_embedly_regex, url):
+            return _EmbedlyScraper(url)
+
+        return _ThumbnailOnlyScraper(url)
+
+    def scrape(self):
+        # should return a 2-tuple of: thumbnail, media_object
+        raise NotImplementedError
+
+    @classmethod
+    def media_embed(cls, media_object):
+        # should take a media object and return an appropriate MediaEmbed
+        raise NotImplementedError
+
+
+class _ThumbnailOnlyScraper(Scraper):
+    def __init__(self, url):
+        self.url = url
+
+    def scrape(self):
+        thumbnail_url = self._find_thumbnail_image()
+        thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url)
+        return thumbnail, None
+
+    def _extract_image_urls(self, soup):
+        for img in soup.findAll("img", src=True):
+            yield urlparse.urljoin(self.url, img["src"])
+
+    def _find_thumbnail_image(self):
+        content_type, content = _fetch_url(self.url)
+
+        # if it's an image, it's pretty easy to guess what we should thumbnail.
+        if "image" in content_type:
+            return self.url
+
+        if content_type and "html" in content_type and content:
+            soup = BeautifulSoup.BeautifulSoup(content)
+        else:
+            return None
+
+        # allow the content author to specify the thumbnail:
+        # <meta property="og:image" content="http://...">
+        og_image = (soup.find('meta', property='og:image') or
+                    soup.find('meta', attrs={'name': 'og:image'}))
+        if og_image and og_image['content']:
+            return og_image['content']
+
+        # <link rel="image_src" href="http://...">
+        thumbnail_spec = soup.find('link', rel='image_src')
+        if thumbnail_spec and thumbnail_spec['href']:
+            return thumbnail_spec['href']
+
+        # ok, we have no guidance from the author. look for the largest
+        # image on the page with a few caveats. (see below)
+        max_area = 0
+        max_url = None
+        for image_url in self._extract_image_urls(soup):
+            size = _fetch_image_size(image_url, referer=self.url)
+            if not size:
+                continue
+
+            area = size[0] * size[1]
+
+            # ignore little images
+            if area < 5000:
+                g.log.debug('ignore little %s' % image_url)
+                continue
+
+            # ignore excessively long/wide images
+            if max(size) / min(size) > 1.5:
+                g.log.debug('ignore dimensions %s' % image_url)
+                continue
+
+            # penalize images with "sprite" in their name
+            if 'sprite' in image_url.lower():
+                g.log.debug('penalizing sprite %s' % image_url)
+                area /= 10
+
+            if area > max_area:
+                max_area = area
+                max_url = image_url
+
+        return max_url
+
+
+class _EmbedlyScraper(Scraper):
+    EMBEDLY_API_URL = "http://api.embed.ly/1/oembed"
+
+    def __init__(self, url):
+        self.url = url
+
+    @classmethod
+    def _utf8_encode(cls, input):
+        """UTF-8 encodes any strings in an object (from json.loads)"""
+        if isinstance(input, dict):
+            return {cls._utf8_encode(key): cls._utf8_encode(value)
+                    for key, value in input.iteritems()}
+        elif isinstance(input, list):
+            return [cls._utf8_encode(item)
+                    for item in input]
+        elif isinstance(input, unicode):
+            return input.encode('utf-8')
+        else:
+            return input
+
+    def scrape(self):
+        params = urllib.urlencode({
+            "url": self.url,
+            "format": "json",
+            "maxwidth": 600,
+            "key": g.embedly_api_key,
+        })
+        response = urllib2.urlopen(self.EMBEDLY_API_URL + "?" + params)
+        oembed = json.load(response, object_hook=self._utf8_encode)
+
+        if not oembed:
+            return None, None
+
+        if oembed.get("type") == "photo":
+            thumbnail_url = oembed.get("url")
+        else:
+            thumbnail_url = oembed.get("thumbnail_url")
+        thumbnail = _make_thumbnail_from_url(thumbnail_url, referer=self.url)
+
+        embed = {}
+        if oembed.get("type") in ("video", "rich"):
+            embed = {
+                "type": domain(self.url),
+                "oembed": oembed,
+            }
+
+        return thumbnail, embed
+
+    @classmethod
+    def media_embed(cls, media_object):
+        oembed = media_object["oembed"]
+
+        html = oembed.get("html")
+        width = oembed.get("width")
+        height = oembed.get("height")
+        if not (html and width and height):
+            return
+
+        return MediaEmbed(
+            width=width,
+            height=height,
+            content=html,
+        )
+
+
+@memoize("media.embedly_services", time=3600)
+def _fetch_embedly_services():
+    response = urllib2.urlopen("http://api.embed.ly/1/services/python")
+    service_data = json.load(response)
+
+    patterns_by_domain = collections.defaultdict(set)
+    for service in service_data:
+        for domain in [service["domain"]] + service["subdomains"]:
+            patterns_by_domain[domain].update(service["regex"])
+
+    return {domain: "(?:%s)" % "|".join(patterns)
+            for domain, patterns in patterns_by_domain.iteritems()}
+
+
 def run():
+    embedly_services = _fetch_embedly_services()
+
     @g.stats.amqp_processor('scraper_q')
     def process_link(msg):
-        def _process_link(fname):
-            link = Link._by_fullname(fname, data=True)
-            set_media(link)
-
         fname = msg.body
+        link = Link._by_fullname(msg.body, data=True)
+
         try:
-            TimeoutFunction(_process_link, 30)(fname)
+            TimeoutFunction(_set_media, 30)(embedly_services, link)
         except TimeoutFunctionException:
             print "Timed out on %s" % fname
         except KeyboardInterrupt:

View File

@@ -67,7 +67,6 @@ from r2.lib.utils import url_links_builder, make_offset_date, median, to36
 from r2.lib.utils import trunc_time, timesince, timeuntil, weighted_lottery
 from r2.lib.template_helpers import add_sr, get_domain, format_number
 from r2.lib.subreddit_search import popular_searches
-from r2.lib.scraper import get_media_embed
 from r2.lib.log import log_text
 from r2.lib.memoize import memoize
 from r2.lib.utils import trunc_string as _truncate, to_date
@@ -3454,7 +3453,7 @@ def make_link_child(item):
             media_embed = item.media_object
         else:
             try:
-                media_embed = get_media_embed(item.media_object)
+                media_embed = media.get_media_embed(item.media_object)
             except TypeError:
                 g.log.warning("link %s has a bad media object" % item)
                 media_embed = None

File diff suppressed because it is too large