From 7774fc8d3fe6436c4e9a8a065037234a8b5058ee Mon Sep 17 00:00:00 2001
From: Ben Newhouse
Date: Tue, 26 Nov 2013 12:24:42 -0800
Subject: [PATCH] scraper_q: Add gzip support.

Helpful because Amazon S3 doesn't support switching between gzipped and
non-gzipped content, so resources are often uploaded pre-gzipped with a
gzip Content-Encoding header. Because all modern browsers support gzip,
S3 always serving gzip is not generally a problem.

Urllib, however, doesn't support gzip, so gzipped resources currently
fail to decode when scraped.
---
 r2/r2/lib/media.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/r2/r2/lib/media.py b/r2/r2/lib/media.py
index b9afbef12..d0e4849b7 100644
--- a/r2/r2/lib/media.py
+++ b/r2/r2/lib/media.py
@@ -33,6 +33,7 @@ import traceback
 import urllib
 import urllib2
 import urlparse
+import gzip
 
 import BeautifulSoup
 import Image
@@ -124,6 +125,7 @@ def _initialize_request(url, referer):
         return
 
     req = urllib2.Request(url)
+    req.add_header('Accept-Encoding', 'gzip')
     if g.useragent:
         req.add_header('User-Agent', g.useragent)
     if referer:
@@ -136,7 +138,13 @@ def _fetch_url(url, referer=None):
     if not request:
         return None, None
     response = urllib2.urlopen(request)
-    return response.headers.get("Content-Type"), response.read()
+    response_data = response.read()
+    content_encoding = response.info().get("Content-Encoding")
+    if content_encoding and content_encoding.lower() in ["gzip", "x-gzip"]:
+        buf = cStringIO.StringIO(response_data)
+        f = gzip.GzipFile(fileobj=buf)
+        response_data = f.read()
+    return response.headers.get("Content-Type"), response_data
 
 @memoize('media.fetch_size', time=3600)
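
For illustration, a minimal standalone sketch of the fetch-and-decode logic
this patch adds to _fetch_url, assuming Python 2 (matching the
urllib2/cStringIO usage above); the fetch() helper and the example.com URL
are hypothetical, not part of the patch:

    # Sketch: fetch a URL and gunzip the body when the server marks it
    # gzip-encoded; mirrors the logic the hunks above add to media.py.
    import cStringIO
    import gzip
    import urllib2

    def fetch(url):
        req = urllib2.Request(url)
        # Advertise gzip support, so servers such as S3 that store
        # pre-gzipped objects can send the compressed body as stored.
        req.add_header('Accept-Encoding', 'gzip')
        response = urllib2.urlopen(req)
        data = response.read()
        encoding = response.info().get('Content-Encoding')
        if encoding and encoding.lower() in ('gzip', 'x-gzip'):
            # urllib2 does not decompress transparently, so gunzip the
            # raw bytes by hand via an in-memory file object.
            data = gzip.GzipFile(fileobj=cStringIO.StringIO(data)).read()
        return response.headers.get('Content-Type'), data

    # Hypothetical usage; example.com stands in for any gzip-served URL.
    content_type, body = fetch('http://example.com/')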