From c46309c338d7179b0051f7bbf6f537c85649e650 Mon Sep 17 00:00:00 2001 From: Ricky Ramirez Date: Wed, 11 Sep 2013 16:48:11 -0700 Subject: [PATCH] sanitize_url: Convert unicode domain names to ascii. Previously, unicode domain names were only checked for validity, and the conversion was dropped. This can cause problems down the line where we expect URLs to be ascii. --- r2/r2/lib/utils/utils.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py index e5fc16a62..bb0791c52 100644 --- a/r2/r2/lib/utils/utils.py +++ b/r2/r2/lib/utils/utils.py @@ -349,21 +349,21 @@ def sanitize_url(url, require_scheme = False): return if u.username is not None or u.password is not None: return - labels = u.hostname.split('.') - for label in labels: - try: - #if this succeeds, this portion of the dns is almost - #valid and converted to ascii - label = label.encode('idna') - except TypeError: - print "label sucks: [%r]" % label - raise - except UnicodeError: + + try: + idna_hostname = u.hostname.encode('idna') + except TypeError as e: + g.log.warning("Bad hostname given [%r]: %s", u.hostname, e) + raise + except UnicodeError: + return + + for label in idna_hostname.split('.'): + if not re.match(valid_dns, label): return - else: - #then if this success, this portion of the dns is really valid - if not re.match(valid_dns, label): - return + + if idna_hostname != u.hostname: + url = urlunparse((u[0], idna_hostname, u[2], u[3], u[4], u[5])) return url def trunc_string(text, length):