From d7c2161c66dfd5dcfcfe2ff8994db6e69f85155a Mon Sep 17 00:00:00 2001 From: Max Goodman Date: Tue, 30 Aug 2011 11:29:46 -0700 Subject: [PATCH] Verify url scheme and domain properly in CSS properties. utils.domain will find a domain at any position within a URL, so it was unsuitable for validating CSS URLs. --- r2/r2/lib/cssfilter.py | 22 +++++++++++++++------- r2/r2/lib/utils/utils.py | 10 +++++++--- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/r2/r2/lib/cssfilter.py b/r2/r2/lib/cssfilter.py index eb147c56a..68f249477 100644 --- a/r2/r2/lib/cssfilter.py +++ b/r2/r2/lib/cssfilter.py @@ -22,7 +22,7 @@ from __future__ import with_statement from r2.models import * -from r2.lib.utils import sanitize_url, domain, randstr +from r2.lib.utils import sanitize_url, strip_www, randstr from r2.lib.strings import string_dict from r2.lib.pages.things import wrap_links @@ -37,6 +37,7 @@ from md5 import md5 from r2.lib.contrib.nymph import optimize_png import re +from urlparse import urlparse import cssutils from cssutils import CSSParser @@ -177,6 +178,7 @@ class ValidationError(Exception): local_urls = re.compile(r'\A/static/[a-z./-]+\Z') # substitutable urls will be css-valid labels surrounded by "%%" custom_img_urls = re.compile(r'%%([a-zA-Z0-9\-]+)%%') +valid_url_schemes = ('http', 'https') def valid_url(prop,value,report): """ checks url(...) arguments in CSS, ensuring that the contents are @@ -214,13 +216,19 @@ def valid_url(prop,value,report): report.append(ValidationError(msgs['broken_url'] % dict(brokenurl = value.cssText), value)) - # allowed domains are ok - elif domain(url) in g.allowed_css_linked_domains: - pass else: - report.append(ValidationError(msgs['broken_url'] - % dict(brokenurl = value.cssText), - value)) + try: + u = urlparse(url) + valid_scheme = u.scheme and u.scheme in valid_url_schemes + valid_domain = strip_www(u.netloc) in g.allowed_css_linked_domains + except ValueError: + u = False + + # allowed domains are ok + if not (u and valid_scheme and valid_domain): + report.append(ValidationError(msgs['broken_url'] + % dict(brokenurl = value.cssText), + value)) #elif sanitize_url(url) != url: # report.append(ValidationError(msgs['broken_url'] # % dict(brokenurl = value.cssText), diff --git a/r2/r2/lib/utils/utils.py b/r2/r2/lib/utils/utils.py index 828f7685f..2b2236ec1 100644 --- a/r2/r2/lib/utils/utils.py +++ b/r2/r2/lib/utils/utils.py @@ -206,6 +206,12 @@ class Results(): else: raise StopIteration +def strip_www(domain): + if domain.count('.') >= 2 and domain.startswith("www."): + return domain[4:] + else: + return domain + r_base_url = re.compile("(?i)(?:.+?://)?(?:www[\d]*\.)?([^#]*[^#/])/?") def base_url(url): res = r_base_url.findall(url) @@ -611,9 +617,7 @@ class UrlParser(object): u = cls(url) # strip off any www and lowercase the hostname: - netloc = u.netloc.lower() - if len(netloc.split('.')) > 2 and netloc.startswith("www."): - netloc = netloc[4:] + netloc = strip_www(u.netloc.lower()) # http://code.google.com/web/ajaxcrawling/docs/specification.html fragment = u.fragment if u.fragment.startswith("!") else ""