mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-01-24 14:27:58 -05:00
scraper_q: Add gzip support.
Helpful because Amazon S3 doesn't support switching between gzipped and non-gzipped content, so oftentimes resources are uploaded directly gzipped with gzip headers. Because all modern browsers support gzip, S3 always serves gzip and it's not generally a problem. Urllib, however, doesn't support gzip, so gzipped resources currently fail to decode when scraped.
This commit is contained in:
committed by
Neil Williams
parent
4729b88f5e
commit
7774fc8d3f
@@ -33,6 +33,7 @@ import traceback
|
||||
import urllib
|
||||
import urllib2
|
||||
import urlparse
|
||||
import gzip
|
||||
|
||||
import BeautifulSoup
|
||||
import Image
|
||||
@@ -124,6 +125,7 @@ def _initialize_request(url, referer):
|
||||
return
|
||||
|
||||
req = urllib2.Request(url)
|
||||
req.add_header('Accept-Encoding', 'gzip')
|
||||
if g.useragent:
|
||||
req.add_header('User-Agent', g.useragent)
|
||||
if referer:
|
||||
@@ -136,7 +138,13 @@ def _fetch_url(url, referer=None):
|
||||
if not request:
|
||||
return None, None
|
||||
response = urllib2.urlopen(request)
|
||||
return response.headers.get("Content-Type"), response.read()
|
||||
response_data = response.read()
|
||||
content_encoding = response.info().get("Content-Encoding")
|
||||
if content_encoding and content_encoding.lower() in ["gzip", "x-gzip"]:
|
||||
buf = cStringIO.StringIO(response_data)
|
||||
f = gzip.GzipFile(fileobj=buf)
|
||||
response_data = f.read()
|
||||
return response.headers.get("Content-Type"), response_data
|
||||
|
||||
|
||||
@memoize('media.fetch_size', time=3600)
|
||||
|
||||
Reference in New Issue
Block a user