Fix some issues with unicode in URLs

First, `UrlParser.update_query` didn't like 7-bit unclean (non-ASCII) values. `unicode()` should work everywhere `str()` did. Second, the check for embedded NBSPs in `UrlParser.is_web_safe_url` could be bypassed since `b'\xa0'` couldn't automatically be promoted to unicode (thus `u'\xa0' != b'\xa0'`). The check was fixed to handle the NBSP char in either unicode or byte strings.
2026-04-27 03:00:12 -04:00 · 2015-05-13 15:29:54 -07:00
parent d0108cfe3b
commit d7acfce856
2 changed files with 16 additions and 2 deletions
--- a/r2/r2/lib/utils/utils.py
+++ b/r2/r2/lib/utils/utils.py
@@ -522,7 +522,7 @@ class UrlParser(object):
        # Since in HTTP everything's a string, coercing values to strings now
        # makes equality testing easier.  Python will throw an error if you try
        # to pass in a non-string key, so that's already taken care of for us.
-        updates = {k: str(v) for k, v in updates.iteritems()}
+        updates = {k: _force_unicode(v) for k, v in updates.iteritems()}
        self.query_dict.update(updates)

    @property
@@ -715,7 +715,8 @@ class UrlParser(object):
            # should be safe enough to allow after three slashes. Opera 12's the
            # only browser that trips over them, and it doesn't fall for
            # `http:///foo.com/`.
-            if match.group(0) == '\xa0':
+            # Check both in case unicode promotion fails
+            if match.group(0) in {u'\xa0', '\xa0'}:
                if match.string[0:match.start(0)].count('/') < 3:
                    return False
            else:
--- a/r2/r2/tests/unit/lib/urlparser_test.py
+++ b/r2/r2/tests/unit/lib/urlparser_test.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 # The contents of this file are subject to the Common Public Attribution
 # License Version 1.0. (the "License"); you may not use this file except in
 # compliance with the License. You may obtain a copy of the License at
@@ -138,6 +139,12 @@ class TestIsRedditURL(unittest.TestCase):
        self.assertIsNotSafeRedditUrl("\xa0http://%s/" % g.domain)
        self.assertIsSafeRedditUrl("http://%s/\xa0" % g.domain)
        self.assertIsSafeRedditUrl("/foo/bar/\xa0baz")
+        # Make sure this works if the URL is unicode
+        self.assertIsNotSafeRedditUrl(u"http://\xa0.%s/" % g.domain)
+        self.assertIsNotSafeRedditUrl(u"\xa0http://%s/" % g.domain)
+        self.assertIsSafeRedditUrl(u"http://%s/\xa0" % g.domain)
+        self.assertIsSafeRedditUrl(u"/foo/bar/\xa0baz")
+


 class TestSwitchSubdomainByExtension(unittest.TestCase):
@@ -281,3 +288,9 @@ class TestEquality(unittest.TestCase):
        u2 = UrlParser('http://example.com/')
        u2.update_query(page=1234)
        self.assertEquals(u, u2)
+
+    def test_unicode_query_params(self):
+        u = UrlParser(u'http://example.com/?page=ｕｎｉｃｏｄｅ：（')
+        u2 = UrlParser('http://example.com/')
+        u2.update_query(page=u'ｕｎｉｃｏｄｅ：（')
+        self.assertEquals(u, u2)