mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-04-27 03:00:12 -04:00
Fix several is_reddit_url() bypasses
Thanks to Yassine ABOUKIR (http://www.yassineaboukir.com) for the report! `////example.com` doesn't survive the round-trip through `urlunparse(urlparse())` and becomes `//example.com`. Things like `///\t/foo.com` were also getting by because some UAs ignore various characters when they occur before the path section. `UrlParser.is_web_safe_url()` was added to check whether a URL could be interpreted differently by different parsers.
This commit is contained in:
@@ -405,6 +405,26 @@ def query_string(dict):
|
||||
else:
|
||||
return ''
|
||||
|
||||
|
||||
# Characters that might cause parsing differences in different implementations
# Spaces only seem to cause parsing differences when occurring directly before
# the scheme
# The pattern matches: a single space at the very start of the URL
# (`\A\x20`), C0 control characters (`\x00-\x19`), NBSP (`\xA0`), various
# Unicode space characters, and backslash -- characters that some UAs strip
# or reinterpret while Python's urlparse treats them literally (see the
# commit message; backslashes are treated as slashes by some browsers).
URL_PROBLEMATIC_RE = re.compile(
    ur'(\A\x20|[\x00-\x19\xA0\u1680\u180E\u2000-\u2029\u205f\u3000\\])',
    re.UNICODE
)
def paranoid_urlparser_method(check):
    """
    Decorator for checks on `UrlParser` instances that need to be paranoid

    The decorated check is routed through
    `UrlParser.perform_paranoid_check()`, which runs it against both the
    original instance and a re-parsed `unparse()` round-trip, so bugs in
    `unparse()` (e.g. `////foo.com` collapsing to `//foo.com`) can't be
    used to sneak a URL past the check.
    """
    def check_wrapper(parser, *args, **kwargs):
        # `parser` is the `UrlParser` instance the wrapped method was
        # invoked on; all other arguments are forwarded unchanged.
        return UrlParser.perform_paranoid_check(parser, check, *args, **kwargs)

    return check_wrapper
||||
class UrlParser(object):
|
||||
"""
|
||||
Wrapper for urlparse and urlunparse for making changes to urls.
|
||||
@@ -425,8 +445,8 @@ class UrlParser(object):
|
||||
"""
|
||||
|
||||
__slots__ = ['scheme', 'path', 'params', 'query',
|
||||
'fragment', 'username', 'password', 'hostname',
|
||||
'port', '_url_updates', '_orig_url', '_query_dict']
|
||||
'fragment', 'username', 'password', 'hostname', 'port',
|
||||
'_url_updates', '_orig_url', '_orig_netloc', '_query_dict']
|
||||
|
||||
valid_schemes = ('http', 'https', 'ftp', 'mailto')
|
||||
cname_get = "cnameframe"
|
||||
@@ -438,6 +458,7 @@ class UrlParser(object):
|
||||
setattr(self, s, getattr(u, s))
|
||||
self._url_updates = {}
|
||||
self._orig_url = url
|
||||
self._orig_netloc = getattr(u, 'netloc', '')
|
||||
self._query_dict = None
|
||||
|
||||
def update_query(self, **updates):
|
||||
@@ -554,7 +575,64 @@ class UrlParser(object):
|
||||
pass
|
||||
return None
|
||||
|
||||
def is_reddit_url(self, subreddit = None):
|
||||
    def perform_paranoid_check(self, check, *args, **kwargs):
        """
        Perform a check on a URL that needs to account for bugs in `unparse()`

        If you need to account for quirks in browser URL parsers, you should
        use this along with `is_web_safe_url()`. Trying to parse URLs like
        a browser would just makes things really hairy.

        `check` is called once per variant with the variant as its first
        argument; `*args` / `**kwargs` are forwarded to every call.
        Returns True only when every variant passes the check.
        """
        # Check both this instance and the result of re-parsing its
        # serialized form: urlunparse/urlparse round-trips are lossy for
        # some pathological URLs, so the two may disagree.
        variants_to_check = (
            self,
            UrlParser(self.unparse())
        )
        # If the check doesn't pass on *every* variant, it's a fail.
        return all(
            check(variant, *args, **kwargs) for variant in variants_to_check
        )
||||
    @paranoid_urlparser_method
    def is_web_safe_url(self):
        """Determine if this URL could cause issues with different parsers

        Returns False for URLs that browsers and urllib-family parsers may
        interpret differently from each other (scheme-less `///` prefixes,
        credentials in the netloc, non-whitelisted schemes, and characters
        matched by `URL_PROBLEMATIC_RE`).  Decorated with
        `paranoid_urlparser_method`, so the checks also run against the
        `unparse()` round-trip of this URL.
        """

        # There's no valid reason for this, and just serves to confuse UAs.
        # and urllib2.
        if self._orig_url.startswith("///"):
            return False

        # Double-checking the above: after parsing, a `///...` URL shows up
        # as an empty hostname with a path starting in `//`.
        if not self.hostname and self.path.startswith('//'):
            return False

        # A host-relative link with a scheme like `https:/baz` or `https:?quux`
        if self.scheme and not self.hostname:
            return False

        # Credentials in the netloc? Not on reddit!
        # (Checked on the raw netloc because `user:pass@` can hide an empty
        # hostname from the parsed attributes.)
        if "@" in self._orig_netloc:
            return False

        # `javascript://www.reddit.com/%0D%Aalert(1)` is not safe, obviously
        if self.scheme and self.scheme.lower() not in self.valid_schemes:
            return False

        # Reject any URLs that contain characters known to cause parsing
        # differences between parser implementations
        for match in re.finditer(URL_PROBLEMATIC_RE, self._orig_url):
            # XXX: Yuck. We have non-breaking spaces in title slugs! They
            # should be safe enough to allow after three slashes. Opera 12's the
            # only browser that trips over them, and it doesn't fall for
            # `http:///foo.com/`.
            if match.group(0) == '\xa0':
                # NBSP before the third slash (i.e. before the path section)
                # could still change which host a UA resolves.
                if match.string[0:match.start(0)].count('/') < 3:
                    return False
            else:
                return False

        return True
||||
    def is_reddit_url(self, subreddit=None):
        """utility method for seeing if the url is associated with
        reddit as we don't necessarily want to mangle non-reddit
        domains

        returns true only if hostname is nonexistent, a subdomain of
        g.domain, or a subdomain of the provided subreddit's cname.

        NOTE(review): this only checks domain ownership; callers that hand
        the URL to a browser should pair it with `is_web_safe_url()`.
        """
        from pylons import g
        valid_subdomain = (
            not self.hostname or
            is_subdomain(self.hostname, g.domain) or
            (subreddit and subreddit.domain and
             is_subdomain(self.hostname, subreddit.domain))
        )
        if not valid_subdomain or not self.hostname or not g.offsite_subdomains:
            return valid_subdomain
        # Subdomains like "blog.<g.domain>" are hosted off-site and must not
        # count as reddit URLs, nor may anything nested beneath them.
        return not any(
            is_subdomain(self.hostname, "%s.%s" % (subdomain, g.domain))
            for subdomain in g.offsite_subdomains
        )
||||
|
||||
@@ -2211,7 +2211,7 @@ class VDestination(Validator):
|
||||
if ld.startswith(('/', 'http://', 'https://')):
|
||||
u = UrlParser(dest)
|
||||
|
||||
if u.is_reddit_url(c.site):
|
||||
if u.is_reddit_url(c.site) and u.is_web_safe_url():
|
||||
return dest
|
||||
|
||||
ip = getattr(request, "ip", "[unknown]")
|
||||
|
||||
@@ -540,7 +540,8 @@ class Link(Thing, Printable):
|
||||
item.domain_str = None
|
||||
if c.user.pref_domain_details:
|
||||
urlparser = UrlParser(item.url)
|
||||
if not item.is_self and urlparser.is_reddit_url():
|
||||
if (not item.is_self and urlparser.is_reddit_url() and
|
||||
urlparser.is_web_safe_url()):
|
||||
url_subreddit = urlparser.get_subreddit()
|
||||
if (url_subreddit and
|
||||
not isinstance(url_subreddit, DefaultSR)):
|
||||
|
||||
140
r2/r2/tests/unit/lib/urlparser_test.py
Normal file
140
r2/r2/tests/unit/lib/urlparser_test.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python
|
||||
# The contents of this file are subject to the Common Public Attribution
|
||||
# License Version 1.0. (the "License"); you may not use this file except in
|
||||
# compliance with the License. You may obtain a copy of the License at
|
||||
# http://code.reddit.com/LICENSE. The License is based on the Mozilla Public
|
||||
# License Version 1.1, but Sections 14 and 15 have been added to cover use of
|
||||
# software over a computer network and provide for limited attribution for the
|
||||
# Original Developer. In addition, Exhibit A has been modified to be consistent
|
||||
# with Exhibit B.
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS IS" basis,
|
||||
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
|
||||
# the specific language governing rights and limitations under the License.
|
||||
#
|
||||
# The Original Code is reddit.
|
||||
#
|
||||
# The Original Developer is the Initial Developer. The Initial Developer of
|
||||
# the Original Code is reddit Inc.
|
||||
#
|
||||
# All portions of the code written by reddit are Copyright (c) 2006-2015 reddit
|
||||
# Inc. All Rights Reserved.
|
||||
###############################################################################
|
||||
import unittest
|
||||
|
||||
from r2.lib.utils import UrlParser
|
||||
from r2.tests import stage_for_paste
|
||||
from pylons import g
|
||||
|
||||
|
||||
class TestIsRedditURL(unittest.TestCase):
    """Tests for `is_reddit_url()` bypasses and `is_web_safe_url()`."""

    @classmethod
    def setUpClass(cls):
        stage_for_paste()
        # Pretend "blog.<g.domain>" is hosted off-site for these tests;
        # restored in tearDownClass.
        cls._old_offsite = g.offsite_subdomains
        g.offsite_subdomains = ["blog"]

    @classmethod
    def tearDownClass(cls):
        g.offsite_subdomains = cls._old_offsite

    def _is_safe_reddit_url(self, url, subreddit=None):
        # A URL is only "safe" if it passes BOTH checks, mirroring how
        # production callers combine them.
        web_safe = UrlParser(url).is_web_safe_url()
        return web_safe and UrlParser(url).is_reddit_url(subreddit)

    def assertIsSafeRedditUrl(self, url, subreddit=None):
        self.assertTrue(self._is_safe_reddit_url(url, subreddit))

    def assertIsNotSafeRedditUrl(self, url, subreddit=None):
        self.assertFalse(self._is_safe_reddit_url(url, subreddit))

    def test_normal_urls(self):
        self.assertIsSafeRedditUrl("https://%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://en.%s/" % g.domain)
        self.assertIsSafeRedditUrl("https://foobar.baz.%s/quux/?a" % g.domain)
        self.assertIsSafeRedditUrl("#anchorage")
        self.assertIsSafeRedditUrl("?path_relative_queries")
        self.assertIsSafeRedditUrl("/")
        self.assertIsSafeRedditUrl("/cats")
        self.assertIsSafeRedditUrl("/cats/")
        self.assertIsSafeRedditUrl("/cats/#maru")
        self.assertIsSafeRedditUrl("//foobaz.%s/aa/baz#quux" % g.domain)
        # XXX: This is technically a legal relative URL, are there any UAs
        # stupid enough to treat this as absolute?
        self.assertIsSafeRedditUrl("path_relative_subpath.com")
        # "blog.reddit.com" is not a reddit URL.
        self.assertIsNotSafeRedditUrl("http://blog.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("http://foo.blog.%s/" % g.domain)

    def test_incorrect_anchoring(self):
        # g.domain appearing mid-hostname must not fool the subdomain check.
        self.assertIsNotSafeRedditUrl("http://www.%s.whatever.com/" % g.domain)

    def test_protocol_relative(self):
        self.assertIsNotSafeRedditUrl("//foobaz.example.com/aa/baz#quux")

    def test_weird_protocols(self):
        self.assertIsNotSafeRedditUrl(
            "javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl("hackery:whatever")

    def test_http_auth(self):
        # There's no legitimate reason to include HTTP auth details in the URL,
        # they only serve to confuse everyone involved.
        # For example, this used to be the behaviour of `UrlParser`, oops!
        # > UrlParser("http://everyoneforgets:aboutthese@/baz.com/").unparse()
        # 'http:///baz.com/'
        self.assertIsNotSafeRedditUrl("http://foo:bar@/example.com/")

    def test_browser_quirks(self):
        # Some browsers try to be helpful and ignore characters in URLs that
        # they think might have been accidental (I guess due to things like:
        # `<a href=" http://badathtml.com/ ">`. We need to ignore those when
        # determining if a URL is local.
        self.assertIsNotSafeRedditUrl("/\x00/example.com")
        self.assertIsNotSafeRedditUrl("\x09//example.com")
        self.assertIsNotSafeRedditUrl(" http://example.com/")

        # This makes sure we're not vulnerable to a bug in
        # urlparse / urlunparse.
        # urlunparse(urlparse("////foo.com")) == "//foo.com"! screwy!
        self.assertIsNotSafeRedditUrl("////example.com/")
        self.assertIsNotSafeRedditUrl("//////example.com/")
        # Similar, but with a scheme
        self.assertIsNotSafeRedditUrl(r"http:///example.com/")
        # Webkit and co like to treat backslashes as equivalent to slashes in
        # different places, maybe to make OCD Windows users happy.
        self.assertIsNotSafeRedditUrl(r"/\example.com/")
        # On chrome this goes to example.com, not a subdomain of reddit.com!
        self.assertIsNotSafeRedditUrl(
            r"http://\\example.com\a.%s/foo" % g.domain
        )

        # Combo attacks!
        self.assertIsNotSafeRedditUrl(r"///\example.com/")
        self.assertIsNotSafeRedditUrl(r"\\example.com")
        self.assertIsNotSafeRedditUrl("/\x00//\\example.com/")
        self.assertIsNotSafeRedditUrl(
            "\x09javascript://%s/%%0d%%0aalert(1)" % g.domain
        )
        self.assertIsNotSafeRedditUrl(
            "http://\x09example.com\\%s/foo" % g.domain
        )

    def test_url_mutation(self):
        # `is_reddit_url()` must reflect the current attributes, not just
        # whatever was parsed from the original URL string.
        u = UrlParser("http://example.com/")
        u.hostname = g.domain
        self.assertTrue(u.is_reddit_url())

        u = UrlParser("http://%s/" % g.domain)
        u.hostname = "example.com"
        self.assertFalse(u.is_reddit_url())

    def test_nbsp_allowances(self):
        # We have to allow nbsps in URLs, let's just allow them where they can't
        # do any damage.
        self.assertIsNotSafeRedditUrl("http://\xa0.%s/" % g.domain)
        self.assertIsNotSafeRedditUrl("\xa0http://%s/" % g.domain)
        self.assertIsSafeRedditUrl("http://%s/\xa0" % g.domain)
        self.assertIsSafeRedditUrl("/foo/bar/\xa0baz")
|
||||
Reference in New Issue
Block a user