From 04e902b2f2e9fbbb5cadffb3668bcd480291fc7a Mon Sep 17 00:00:00 2001 From: Neil Williams Date: Mon, 21 Jan 2013 15:02:04 -0800 Subject: [PATCH] "suggest title": Respect encoding of response from target URL. --- r2/r2/lib/utils/utils.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py index 39938f381..fe8cf3535 100644 --- a/r2/r2/lib/utils/utils.py +++ b/r2/r2/lib/utils/utils.py @@ -24,6 +24,7 @@ import os import base64 import traceback import ConfigParser +import codecs from urllib import unquote_plus from urllib2 import urlopen @@ -255,18 +256,26 @@ def get_title(url): try: opener = urlopen(url, timeout=15) - - # Attempt to find the title in the first 1kb - data = opener.read(1024) - title = extract_title(data) - - # Title not found in the first kb, try searching an additional 2kb - if not title: - data += opener.read(2048) + + # determine the encoding of the response + for param in opener.info().getplist(): + if param.startswith("charset="): + param_name, sep, charset = param.partition("=") + codec = codecs.getreader(charset) + break + else: + codec = codecs.getreader("utf-8") + + with codec(opener, "ignore") as reader: + # Attempt to find the title in the first 1kb + data = reader.read(1024) title = extract_title(data) - - opener.close() - + + # Title not found in the first kb, try searching an additional 2kb + if not title: + data += reader.read(2048) + title = extract_title(data) + return title except: