Fix several is_reddit_url() bypasses

Thanks to Yassine ABOUKIR (http://www.yassineaboukir.com) for the report!

`////example.com` doesn't survive the round-trip through
`urlunparse(urlparse())` and becomes `//example.com`.

Things like `///\t/foo.com` were also getting by because some UAs
ignore various characters when they occur before the path section.
`UrlParser.is_web_safe_url()` was added to check whether a URL could be
interpreted differently by different URL parsers.
This commit is contained in:
Jordan Milne
2015-02-23 14:33:35 -08:00
parent bb2a5f3a8d
commit 689a9554e6
4 changed files with 230 additions and 14 deletions

View File

@@ -405,6 +405,26 @@ def query_string(dict):
else:
return ''
# Characters that might cause parsing differences in different implementations
# Spaces only seem to cause parsing differences when occurring directly before
# the scheme
# The alternation matches either a single space at the very start of the URL
# (`\A\x20`), or, anywhere in the URL: C0 control characters (\x00-\x19),
# NBSP (\xA0), various Unicode spaces (ogham space mark, Mongolian vowel
# separator, the \u2000-\u2029 space/separator range, medium mathematical
# space, ideographic space), and backslash, which some browsers treat as "/".
# NOTE(review): the control-char range stops at \x19, so \x1A-\x1F
# (SUB through US) are not rejected -- confirm that's intentional.
URL_PROBLEMATIC_RE = re.compile(
    ur'(\A\x20|[\x00-\x19\xA0\u1680\u180E\u2000-\u2029\u205f\u3000\\])',
    re.UNICODE
)
def paranoid_urlparser_method(check):
    """
    Decorator for checks on `UrlParser` instances that need to be paranoid

    Wraps `check` so that it is run via
    `UrlParser.perform_paranoid_check()`, which applies the check to both
    the original `UrlParser` and a re-parsed copy of its `unparse()`d
    form, guarding against parse/unparse round-trip bugs.
    """
    # Local import keeps this module's top-level imports untouched.
    import functools

    # `wraps` preserves the decorated check's `__name__` and `__doc__`
    # for introspection / debugging; the plain wrapper used to report
    # itself as "check_wrapper".
    @functools.wraps(check)
    def check_wrapper(parser, *args, **kwargs):
        return UrlParser.perform_paranoid_check(parser, check, *args, **kwargs)
    return check_wrapper
class UrlParser(object):
"""
Wrapper for urlparse and urlunparse for making changes to urls.
@@ -425,8 +445,8 @@ class UrlParser(object):
"""
__slots__ = ['scheme', 'path', 'params', 'query',
'fragment', 'username', 'password', 'hostname',
'port', '_url_updates', '_orig_url', '_query_dict']
'fragment', 'username', 'password', 'hostname', 'port',
'_url_updates', '_orig_url', '_orig_netloc', '_query_dict']
valid_schemes = ('http', 'https', 'ftp', 'mailto')
cname_get = "cnameframe"
@@ -438,6 +458,7 @@ class UrlParser(object):
setattr(self, s, getattr(u, s))
self._url_updates = {}
self._orig_url = url
self._orig_netloc = getattr(u, 'netloc', '')
self._query_dict = None
def update_query(self, **updates):
@@ -554,7 +575,64 @@ class UrlParser(object):
pass
return None
def is_reddit_url(self, subreddit = None):
def perform_paranoid_check(self, check, *args, **kwargs):
    """
    Perform a check on a URL that needs to account for bugs in `unparse()`

    Runs `check` against both this instance and a `UrlParser` built from
    `self.unparse()`, so a discrepancy introduced by the parse/unparse
    round-trip can't be used to slip past the check.

    If you need to account for quirks in browser URL parsers, you should
    use this along with `is_web_safe_url()`. Trying to parse URLs like
    a browser would just makes things really hairy.
    """
    reparsed = UrlParser(self.unparse())
    # Every variant must pass; failing on either one fails the check.
    for variant in (self, reparsed):
        if not check(variant, *args, **kwargs):
            return False
    return True
@paranoid_urlparser_method
def is_web_safe_url(self):
    """Determine if this URL could cause issues with different parsers

    Returns False for URLs that different URL parsers (browsers,
    urllib2, urlparse) might interpret differently from one another.
    Runs against both the original URL and its `unparse()`d form via
    `paranoid_urlparser_method`.
    """
    # There's no valid reason for a "///" prefix, and it just serves to
    # confuse UAs and urllib2 (urlunparse(urlparse("////foo.com"))
    # collapses to "//foo.com", changing its meaning).
    if self._orig_url.startswith("///"):
        return False
    # Double-checking the above: no hostname but a path starting with
    # "//" would become protocol-relative after a round-trip.
    if not self.hostname and self.path.startswith('//'):
        return False
    # A host-relative link with a scheme like `https:/baz` or `https:?quux`
    if self.scheme and not self.hostname:
        return False
    # Credentials in the netloc? Not on reddit!
    if "@" in self._orig_netloc:
        return False
    # `javascript://www.reddit.com/%0D%Aalert(1)` is not safe, obviously
    if self.scheme and self.scheme.lower() not in self.valid_schemes:
        return False
    # Reject any URLs that contain characters known to cause parsing
    # differences between parser implementations
    for match in re.finditer(URL_PROBLEMATIC_RE, self._orig_url):
        # XXX: Yuck. We have non-breaking spaces in title slugs! They
        # should be safe enough to allow after three slashes. Opera 12's the
        # only browser that trips over them, and it doesn't fall for
        # `http:///foo.com/`.
        # NOTE(review): this compares against the *byte* string '\xa0';
        # under Python 2 a unicode `_orig_url` would make this comparison
        # False and reject NBSPs unconditionally -- confirm callers pass
        # byte strings here.
        if match.group(0) == '\xa0':
            if match.string[0:match.start(0)].count('/') < 3:
                return False
        else:
            # Any other problematic character is an unconditional fail.
            return False
    return True
def is_reddit_url(self, subreddit=None):
"""utility method for seeing if the url is associated with
reddit as we don't necessarily want to mangle non-reddit
domains
@@ -562,22 +640,19 @@ class UrlParser(object):
returns true only if hostname is nonexistant, a subdomain of
g.domain, or a subdomain of the provided subreddit's cname.
"""
from pylons import g
subdomain = (
valid_subdomain = (
not self.hostname or
is_subdomain(self.hostname, g.domain) or
(subreddit and subreddit.domain and
is_subdomain(self.hostname, subreddit.domain))
)
# Handle backslash trickery like /\example.com/ being treated as
# equal to //example.com/ by some browsers
if not self.hostname and not self.scheme and self.path:
if self.path.startswith("/\\"):
return False
if not subdomain or not self.hostname or not g.offsite_subdomains:
return subdomain
if not valid_subdomain or not self.hostname or not g.offsite_subdomains:
return valid_subdomain
return not any(
self.hostname.startswith(subdomain + '.')
is_subdomain(self.hostname, "%s.%s" % (subdomain, g.domain))
for subdomain in g.offsite_subdomains
)

View File

@@ -2211,7 +2211,7 @@ class VDestination(Validator):
if ld.startswith(('/', 'http://', 'https://')):
u = UrlParser(dest)
if u.is_reddit_url(c.site):
if u.is_reddit_url(c.site) and u.is_web_safe_url():
return dest
ip = getattr(request, "ip", "[unknown]")

View File

@@ -540,7 +540,8 @@ class Link(Thing, Printable):
item.domain_str = None
if c.user.pref_domain_details:
urlparser = UrlParser(item.url)
if not item.is_self and urlparser.is_reddit_url():
if (not item.is_self and urlparser.is_reddit_url() and
urlparser.is_web_safe_url()):
url_subreddit = urlparser.get_subreddit()
if (url_subreddit and
not isinstance(url_subreddit, DefaultSR)):

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python
# The contents of this file are subject to the Common Public Attribution
# License Version 1.0. (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
# software over a computer network and provide for limited attribution for the
# Original Developer. In addition, Exhibit A has been modified to be consistent
# with Exhibit B.
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
#
# The Original Code is reddit.
#
# The Original Developer is the Initial Developer. The Initial Developer of
# the Original Code is reddit Inc.
#
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
# Inc. All Rights Reserved.
###############################################################################
import unittest
from r2.lib.utils import UrlParser
from r2.tests import stage_for_paste
from pylons import g
class TestIsRedditURL(unittest.TestCase):
    """Tests for the `is_reddit_url()` + `is_web_safe_url()` combination
    used to vet redirect targets, covering the reported bypasses."""

    @classmethod
    def setUpClass(cls):
        # Stand up enough of the app environment for `g` to be usable,
        # then pretend "blog" is a third-party-hosted subdomain.
        stage_for_paste()
        cls._old_offsite = g.offsite_subdomains
        g.offsite_subdomains = ["blog"]

    @classmethod
    def tearDownClass(cls):
        # Restore the global clobbered in setUpClass.
        g.offsite_subdomains = cls._old_offsite

    def _is_safe_reddit_url(self, url, subreddit=None):
        # A URL is acceptable only if it's both unambiguously parseable
        # *and* points at a reddit-controlled host.
        web_safe = UrlParser(url).is_web_safe_url()
        return web_safe and UrlParser(url).is_reddit_url(subreddit)

    def assertIsSafeRedditUrl(self, url, subreddit=None):
        self.assertTrue(self._is_safe_reddit_url(url, subreddit))

    def assertIsNotSafeRedditUrl(self, url, subreddit=None):
        self.assertFalse(self._is_safe_reddit_url(url, subreddit))

    def test_normal_urls(self):
        # Ordinary absolute, relative, and protocol-relative reddit URLs
        # must still be accepted.
        self.assertIsSafeRedditUrl("https://%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://en.%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://foobar.baz.%s/quux/?a" % g.domain)
        self.assertIsSafeRedditUrl("#anchorage")
        self.assertIsSafeRedditUrl("?path_relative_queries")
        self.assertIsSafeRedditUrl("/")
        self.assertIsSafeRedditUrl("/cats")
        self.assertIsSafeRedditUrl("/cats/")
        self.assertIsSafeRedditUrl("/cats/#maru")
        self.assertIsSafeRedditUrl("//foobaz.%s/aa/baz#quux" % g.domain)
        # XXX: This is technically a legal relative URL, are there any UAs
        # stupid enough to treat this as absolute?
        self.assertIsSafeRedditUrl("path_relative_subpath.com")
        # "blog.reddit.com" is not a reddit URL.
        self.assertIsNotSafeRedditUrl("http://blog.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("http://foo.blog.%s/" % g.domain)

    def test_incorrect_anchoring(self):
        # g.domain appearing mid-hostname must not count as a match.
        self.assertIsNotSafeRedditUrl("http://www.%s.whatever.com/" % g.domain)

    def test_protocol_relative(self):
        # Protocol-relative URLs to foreign hosts are offsite.
        self.assertIsNotSafeRedditUrl("//foobaz.example.com/aa/baz#quux")

    def test_weird_protocols(self):
        # Only whitelisted schemes are allowed, regardless of hostname.
        self.assertIsNotSafeRedditUrl(
            "javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl("hackery:whatever")

    def test_http_auth(self):
        # There's no legitimate reason to include HTTP auth details in the URL,
        # they only serve to confuse everyone involved.
        # For example, this used to be the behaviour of `UrlParser`, oops!
        # > UrlParser("http://everyoneforgets:aboutthese@/baz.com/").unparse()
        # 'http:///baz.com/'
        self.assertIsNotSafeRedditUrl("http://foo:bar@/example.com/")

    def test_browser_quirks(self):
        # Some browsers try to be helpful and ignore characters in URLs that
        # they think might have been accidental (I guess due to things like:
        # `<a href=" http://badathtml.com/ ">`. We need to ignore those when
        # determining if a URL is local.
        self.assertIsNotSafeRedditUrl("/\x00/example.com")
        self.assertIsNotSafeRedditUrl("\x09//example.com")
        self.assertIsNotSafeRedditUrl(" http://example.com/")
        # This makes sure we're not vulnerable to a bug in
        # urlparse / urlunparse.
        # urlunparse(urlparse("////foo.com")) == "//foo.com"! screwy!
        self.assertIsNotSafeRedditUrl("////example.com/")
        self.assertIsNotSafeRedditUrl("//////example.com/")
        # Similar, but with a scheme
        self.assertIsNotSafeRedditUrl(r"http:///example.com/")
        # Webkit and co like to treat backslashes as equivalent to slashes in
        # different places, maybe to make OCD Windows users happy.
        self.assertIsNotSafeRedditUrl(r"/\example.com/")
        # On chrome this goes to example.com, not a subdomain of reddit.com!
        self.assertIsNotSafeRedditUrl(
            r"http://\\example.com\a.%s/foo" % g.domain
        )
        # Combo attacks!
        self.assertIsNotSafeRedditUrl(r"///\example.com/")
        self.assertIsNotSafeRedditUrl(r"\\example.com")
        self.assertIsNotSafeRedditUrl("/\x00//\\example.com/")
        self.assertIsNotSafeRedditUrl(
            "\x09javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl(
            "http://\x09example.com\\%s/foo" % g.domain
        )

    def test_url_mutation(self):
        # is_reddit_url() must reflect hostname changes made after parsing,
        # in both directions.
        u = UrlParser("http://example.com/")
        u.hostname = g.domain
        self.assertTrue(u.is_reddit_url())
        u = UrlParser("http://%s/" % g.domain)
        u.hostname = "example.com"
        self.assertFalse(u.is_reddit_url())

    def test_nbsp_allowances(self):
        # We have to allow nbsps in URLs, let's just allow them where they can't
        # do any damage.
        self.assertIsNotSafeRedditUrl("http://\xa0.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("\xa0http://%s/" % g.domain)
        self.assertIsSafeRedditUrl("http://%s/\xa0" % g.domain)
        self.assertIsSafeRedditUrl("/foo/bar/\xa0baz")