Fix several is_reddit_url() bypasses

Thanks to Yassine ABOUKIR (http://www.yassineaboukir.com) for the report!

`////example.com` doesn't survive the round-trip through
`urlunparse(urlparse())` and becomes `//example.com`.

Things like `///\t/foo.com` were also getting by because some UAs
ignore various characters when they occur before the path section.
`UrlParser.is_web_safe_url()` was added to check whether a URL could be
interpreted differently by different URL parsers.
This commit is contained in:
Jordan Milne
2015-02-23 14:33:35 -08:00
parent bb2a5f3a8d
commit 689a9554e6
4 changed files with 230 additions and 14 deletions

View File

@@ -405,6 +405,26 @@ def query_string(dict):
else:
return ''
# Characters that might cause parsing differences in different implementations
# Spaces only seem to cause parsing differences when occurring directly before
# the scheme
# The alternation matches either a single space at the very start of the URL
# (`\A\x20`), or, anywhere in the URL: C0 control characters (\x00-\x19),
# NBSP (\xA0), various Unicode spaces (ogham space mark, Mongolian vowel
# separator, the \u2000-\u2029 space/separator range, medium mathematical
# space, ideographic space), and backslash, which some browsers treat as "/".
# NOTE(review): the control-char range stops at \x19, so \x1A-\x1F
# (SUB through US) are not rejected -- confirm that's intentional.
URL_PROBLEMATIC_RE = re.compile(
    ur'(\A\x20|[\x00-\x19\xA0\u1680\u180E\u2000-\u2029\u205f\u3000\\])',
    re.UNICODE
)
def paranoid_urlparser_method(check):
    """
    Decorator for checks on `UrlParser` instances that need to be paranoid

    Wraps `check` so that it is run via
    `UrlParser.perform_paranoid_check()`, which applies the check to both
    the original `UrlParser` and a re-parsed copy of its `unparse()`d
    form, guarding against parse/unparse round-trip bugs.
    """
    # Local import keeps this module's top-level imports untouched.
    import functools

    # `wraps` preserves the decorated check's `__name__` and `__doc__`
    # for introspection / debugging; the plain wrapper used to report
    # itself as "check_wrapper".
    @functools.wraps(check)
    def check_wrapper(parser, *args, **kwargs):
        return UrlParser.perform_paranoid_check(parser, check, *args, **kwargs)
    return check_wrapper
class UrlParser(object):
"""
Wrapper for urlparse and urlunparse for making changes to urls.
@@ -425,8 +445,8 @@ class UrlParser(object):
"""
__slots__ = ['scheme', 'path', 'params', 'query',
'fragment', 'username', 'password', 'hostname',
'port', '_url_updates', '_orig_url', '_query_dict']
'fragment', 'username', 'password', 'hostname', 'port',
'_url_updates', '_orig_url', '_orig_netloc', '_query_dict']
valid_schemes = ('http', 'https', 'ftp', 'mailto')
cname_get = "cnameframe"
@@ -438,6 +458,7 @@ class UrlParser(object):
setattr(self, s, getattr(u, s))
self._url_updates = {}
self._orig_url = url
self._orig_netloc = getattr(u, 'netloc', '')
self._query_dict = None
def update_query(self, **updates):
@@ -554,7 +575,64 @@ class UrlParser(object):
pass
return None
def is_reddit_url(self, subreddit = None):
def perform_paranoid_check(self, check, *args, **kwargs):
    """
    Perform a check on a URL that needs to account for bugs in `unparse()`

    Runs `check` against both this instance and a `UrlParser` built from
    `self.unparse()`, so a discrepancy introduced by the parse/unparse
    round-trip can't be used to slip past the check.

    If you need to account for quirks in browser URL parsers, you should
    use this along with `is_web_safe_url()`. Trying to parse URLs like
    a browser would just makes things really hairy.
    """
    reparsed = UrlParser(self.unparse())
    # Every variant must pass; failing on either one fails the check.
    for variant in (self, reparsed):
        if not check(variant, *args, **kwargs):
            return False
    return True
@paranoid_urlparser_method
def is_web_safe_url(self):
    """Determine if this URL could cause issues with different parsers

    Returns False for URLs that different URL parsers (browsers,
    urllib2, urlparse) might interpret differently from one another.
    Runs against both the original URL and its `unparse()`d form via
    `paranoid_urlparser_method`.
    """
    # There's no valid reason for a "///" prefix, and it just serves to
    # confuse UAs and urllib2 (urlunparse(urlparse("////foo.com"))
    # collapses to "//foo.com", changing its meaning).
    if self._orig_url.startswith("///"):
        return False
    # Double-checking the above: no hostname but a path starting with
    # "//" would become protocol-relative after a round-trip.
    if not self.hostname and self.path.startswith('//'):
        return False
    # A host-relative link with a scheme like `https:/baz` or `https:?quux`
    if self.scheme and not self.hostname:
        return False
    # Credentials in the netloc? Not on reddit!
    if "@" in self._orig_netloc:
        return False
    # `javascript://www.reddit.com/%0D%Aalert(1)` is not safe, obviously
    if self.scheme and self.scheme.lower() not in self.valid_schemes:
        return False
    # Reject any URLs that contain characters known to cause parsing
    # differences between parser implementations
    for match in re.finditer(URL_PROBLEMATIC_RE, self._orig_url):
        # XXX: Yuck. We have non-breaking spaces in title slugs! They
        # should be safe enough to allow after three slashes. Opera 12's the
        # only browser that trips over them, and it doesn't fall for
        # `http:///foo.com/`.
        # NOTE(review): this compares against the *byte* string '\xa0';
        # under Python 2 a unicode `_orig_url` would make this comparison
        # False and reject NBSPs unconditionally -- confirm callers pass
        # byte strings here.
        if match.group(0) == '\xa0':
            if match.string[0:match.start(0)].count('/') < 3:
                return False
        else:
            # Any other problematic character is an unconditional fail.
            return False
    return True
def is_reddit_url(self, subreddit=None):
"""utility method for seeing if the url is associated with
reddit as we don't necessarily want to mangle non-reddit
domains
@@ -562,22 +640,19 @@ class UrlParser(object):
returns true only if hostname is nonexistant, a subdomain of
g.domain, or a subdomain of the provided subreddit's cname.
"""
from pylons import g
subdomain = (
valid_subdomain = (
not self.hostname or
is_subdomain(self.hostname, g.domain) or
(subreddit and subreddit.domain and
is_subdomain(self.hostname, subreddit.domain))
)
# Handle backslash trickery like /\example.com/ being treated as
# equal to //example.com/ by some browsers
if not self.hostname and not self.scheme and self.path:
if self.path.startswith("/\\"):
return False
if not subdomain or not self.hostname or not g.offsite_subdomains:
return subdomain
if not valid_subdomain or not self.hostname or not g.offsite_subdomains:
return valid_subdomain
return not any(
self.hostname.startswith(subdomain + '.')
is_subdomain(self.hostname, "%s.%s" % (subdomain, g.domain))
for subdomain in g.offsite_subdomains
)

View File

@@ -2211,7 +2211,7 @@ class VDestination(Validator):
if ld.startswith(('/', 'http://', 'https://')):
u = UrlParser(dest)
if u.is_reddit_url(c.site):
if u.is_reddit_url(c.site) and u.is_web_safe_url():
return dest
ip = getattr(request, "ip", "[unknown]")

View File

@@ -540,7 +540,8 @@ class Link(Thing, Printable):
item.domain_str = None
if c.user.pref_domain_details:
urlparser = UrlParser(item.url)
if not item.is_self and urlparser.is_reddit_url():
if (not item.is_self and urlparser.is_reddit_url() and
urlparser.is_web_safe_url()):
url_subreddit = urlparser.get_subreddit()
if (url_subreddit and
not isinstance(url_subreddit, DefaultSR)):

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
import unittest
from r2.lib.utils import UrlParser
from r2.tests import stage_for_paste
from pylons import g
class TestIsRedditURL(unittest.TestCase):
    """Tests for the `is_reddit_url()` + `is_web_safe_url()` combination
    used to vet redirect targets, covering the reported bypasses."""

    @classmethod
    def setUpClass(cls):
        # Stand up enough of the app environment for `g` to be usable,
        # then pretend "blog" is a third-party-hosted subdomain.
        stage_for_paste()
        cls._old_offsite = g.offsite_subdomains
        g.offsite_subdomains = ["blog"]

    @classmethod
    def tearDownClass(cls):
        # Restore the global clobbered in setUpClass.
        g.offsite_subdomains = cls._old_offsite

    def _is_safe_reddit_url(self, url, subreddit=None):
        # A URL is acceptable only if it's both unambiguously parseable
        # *and* points at a reddit-controlled host.
        web_safe = UrlParser(url).is_web_safe_url()
        return web_safe and UrlParser(url).is_reddit_url(subreddit)

    def assertIsSafeRedditUrl(self, url, subreddit=None):
        self.assertTrue(self._is_safe_reddit_url(url, subreddit))

    def assertIsNotSafeRedditUrl(self, url, subreddit=None):
        self.assertFalse(self._is_safe_reddit_url(url, subreddit))

    def test_normal_urls(self):
        # Ordinary absolute, relative, and protocol-relative reddit URLs
        # must still be accepted.
        self.assertIsSafeRedditUrl("https://%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://en.%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://foobar.baz.%s/quux/?a" % g.domain)
        self.assertIsSafeRedditUrl("#anchorage")
        self.assertIsSafeRedditUrl("?path_relative_queries")
        self.assertIsSafeRedditUrl("/")
        self.assertIsSafeRedditUrl("/cats")
        self.assertIsSafeRedditUrl("/cats/")
        self.assertIsSafeRedditUrl("/cats/#maru")
        self.assertIsSafeRedditUrl("//foobaz.%s/aa/baz#quux" % g.domain)
        # XXX: This is technically a legal relative URL, are there any UAs
        # stupid enough to treat this as absolute?
        self.assertIsSafeRedditUrl("path_relative_subpath.com")
        # "blog.reddit.com" is not a reddit URL.
        self.assertIsNotSafeRedditUrl("http://blog.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("http://foo.blog.%s/" % g.domain)

    def test_incorrect_anchoring(self):
        # g.domain appearing mid-hostname must not count as a match.
        self.assertIsNotSafeRedditUrl("http://www.%s.whatever.com/" % g.domain)

    def test_protocol_relative(self):
        # Protocol-relative URLs to foreign hosts are offsite.
        self.assertIsNotSafeRedditUrl("//foobaz.example.com/aa/baz#quux")

    def test_weird_protocols(self):
        # Only whitelisted schemes are allowed, regardless of hostname.
        self.assertIsNotSafeRedditUrl(
            "javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl("hackery:whatever")

    def test_http_auth(self):
        # There's no legitimate reason to include HTTP auth details in the URL,
        # they only serve to confuse everyone involved.
        # For example, this used to be the behaviour of `UrlParser`, oops!
        # > UrlParser("http://everyoneforgets:aboutthese@/baz.com/").unparse()
        # 'http:///baz.com/'
        self.assertIsNotSafeRedditUrl("http://foo:bar@/example.com/")

    def test_browser_quirks(self):
        # Some browsers try to be helpful and ignore characters in URLs that
        # they think might have been accidental (I guess due to things like:
        # `<a href=" http://badathtml.com/ ">`. We need to ignore those when
        # determining if a URL is local.
        self.assertIsNotSafeRedditUrl("/\x00/example.com")
        self.assertIsNotSafeRedditUrl("\x09//example.com")
        self.assertIsNotSafeRedditUrl(" http://example.com/")
        # This makes sure we're not vulnerable to a bug in
        # urlparse / urlunparse.
        # urlunparse(urlparse("////foo.com")) == "//foo.com"! screwy!
        self.assertIsNotSafeRedditUrl("////example.com/")
        self.assertIsNotSafeRedditUrl("//////example.com/")
        # Similar, but with a scheme
        self.assertIsNotSafeRedditUrl(r"http:///example.com/")
        # Webkit and co like to treat backslashes as equivalent to slashes in
        # different places, maybe to make OCD Windows users happy.
        self.assertIsNotSafeRedditUrl(r"/\example.com/")
        # On chrome this goes to example.com, not a subdomain of reddit.com!
        self.assertIsNotSafeRedditUrl(
            r"http://\\example.com\a.%s/foo" % g.domain
        )
        # Combo attacks!
        self.assertIsNotSafeRedditUrl(r"///\example.com/")
        self.assertIsNotSafeRedditUrl(r"\\example.com")
        self.assertIsNotSafeRedditUrl("/\x00//\\example.com/")
        self.assertIsNotSafeRedditUrl(
            "\x09javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl(
            "http://\x09example.com\\%s/foo" % g.domain
        )

    def test_url_mutation(self):
        # is_reddit_url() must reflect hostname changes made after parsing,
        # in both directions.
        u = UrlParser("http://example.com/")
        u.hostname = g.domain
        self.assertTrue(u.is_reddit_url())
        u = UrlParser("http://%s/" % g.domain)
        u.hostname = "example.com"
        self.assertFalse(u.is_reddit_url())

    def test_nbsp_allowances(self):
        # We have to allow nbsps in URLs, let's just allow them where they can't
        # do any damage.
        self.assertIsNotSafeRedditUrl("http://\xa0.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("\xa0http://%s/" % g.domain)
        self.assertIsSafeRedditUrl("http://%s/\xa0" % g.domain)
        self.assertIsSafeRedditUrl("/foo/bar/\xa0baz")