Verify URL scheme and domain properly in CSS properties.

utils.domain finds a domain at any position within a URL, so it is
unsuitable for validating CSS URLs.
Author: Max Goodman
Date: 2011-08-30 11:29:46 -07:00
Committed by: Neil Williams
Parent: 0828095f73
Commit: d7c2161c66
2 changed files with 22 additions and 10 deletions
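
To make the motivation concrete, here is a small Python sketch of the failure mode. naive_domain and r_domainish are hypothetical stand-ins, not the real utils.domain (whose source is not shown here); like utils.domain, the stand-in accepts a hostname-shaped substring anywhere in the URL, so a URL with a disallowed scheme can still name an allowed domain:

    import re

    # Hypothetical stand-in for utils.domain: it matches the first
    # hostname-shaped substring *anywhere* in the URL, not just in the
    # authority component.
    r_domainish = re.compile(r"([a-z0-9-]+(?:\.[a-z0-9-]+)+)", re.I)

    def naive_domain(url):
        m = r_domainish.search(url)
        return m.group(1).lower() if m else url

    allowed = set(["static.example.com"])  # stand-in for g.allowed_css_linked_domains

    # The intended case passes:
    print naive_domain("http://static.example.com/bg.png") in allowed            # True

    # But so does a javascript: URL that merely mentions the allowed domain:
    print naive_domain("javascript://static.example.com/%0aalert(1)") in allowed  # True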


@@ -22,7 +22,7 @@
 from __future__ import with_statement
 
 from r2.models import *
-from r2.lib.utils import sanitize_url, domain, randstr
+from r2.lib.utils import sanitize_url, strip_www, randstr
 from r2.lib.strings import string_dict
 
 from r2.lib.pages.things import wrap_links
@@ -37,6 +37,7 @@ from md5 import md5
 from r2.lib.contrib.nymph import optimize_png
 
 import re
+from urlparse import urlparse
 
 import cssutils
 from cssutils import CSSParser
@@ -177,6 +178,7 @@ class ValidationError(Exception):
 local_urls = re.compile(r'\A/static/[a-z./-]+\Z')
 # substitutable urls will be css-valid labels surrounded by "%%"
 custom_img_urls = re.compile(r'%%([a-zA-Z0-9\-]+)%%')
+valid_url_schemes = ('http', 'https')
 def valid_url(prop,value,report):
     """
     checks url(...) arguments in CSS, ensuring that the contents are
@@ -214,13 +216,19 @@ def valid_url(prop,value,report):
             report.append(ValidationError(msgs['broken_url']
                                           % dict(brokenurl = value.cssText),
                                           value))
-    # allowed domains are ok
-    elif domain(url) in g.allowed_css_linked_domains:
-        pass
     else:
-        report.append(ValidationError(msgs['broken_url']
-                                      % dict(brokenurl = value.cssText),
-                                      value))
+        try:
+            u = urlparse(url)
+            valid_scheme = u.scheme and u.scheme in valid_url_schemes
+            valid_domain = strip_www(u.netloc) in g.allowed_css_linked_domains
+        except ValueError:
+            u = False
+
+        # allowed domains are ok
+        if not (u and valid_scheme and valid_domain):
+            report.append(ValidationError(msgs['broken_url']
+                                          % dict(brokenurl = value.cssText),
+                                          value))
     #elif sanitize_url(url) != url:
     #    report.append(ValidationError(msgs['broken_url']
     #                                  % dict(brokenurl = value.cssText),
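
Extracted from the diff, the new acceptance test amounts to the following self-contained sketch (Python 2, matching the urlparse import above). is_allowed_css_url and the allowed_css_linked_domains set are local stand-ins for illustration, not names from the codebase; strip_www is the helper added in the second changed file below:

    from urlparse import urlparse

    valid_url_schemes = ('http', 'https')
    allowed_css_linked_domains = set(['static.example.com'])  # stand-in for g.allowed_css_linked_domains

    def strip_www(domain):
        if domain.count('.') >= 2 and domain.startswith("www."):
            return domain[4:]
        else:
            return domain

    def is_allowed_css_url(url):
        # Mirrors the patched valid_url: parse once, then require both an
        # explicit http/https scheme and a whitelisted (www-stripped) host.
        try:
            u = urlparse(url)
        except ValueError:
            return False
        valid_scheme = u.scheme and u.scheme in valid_url_schemes
        valid_domain = strip_www(u.netloc) in allowed_css_linked_domains
        return bool(valid_scheme and valid_domain)

    print is_allowed_css_url("http://static.example.com/bg.png")       # True
    print is_allowed_css_url("https://www.static.example.com/bg.png")  # True (www. stripped)
    print is_allowed_css_url("javascript://static.example.com/x")      # False: scheme not allowed
    print is_allowed_css_url("//static.example.com/bg.png")            # False: no scheme at all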


@@ -206,6 +206,12 @@ class Results():
         else:
             raise StopIteration
 
+def strip_www(domain):
+    if domain.count('.') >= 2 and domain.startswith("www."):
+        return domain[4:]
+    else:
+        return domain
+
 r_base_url = re.compile("(?i)(?:.+?://)?(?:www[\d]*\.)?([^#]*[^#/])/?")
 def base_url(url):
     res = r_base_url.findall(url)
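
The count('.') >= 2 guard prevents strip_www from truncating a host whose registered name is literally "www". A quick interactive check of the helper exactly as defined above:

    >>> strip_www("www.example.com")
    'example.com'
    >>> strip_www("www.sub.example.com")
    'sub.example.com'
    >>> strip_www("www.com")        # only one dot: "www" is the whole name, unchanged
    'www.com'
    >>> strip_www("example.com")    # no "www." prefix, unchanged
    'example.com'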
@@ -611,9 +617,7 @@ class UrlParser(object):
         u = cls(url)
 
         # strip off any www and lowercase the hostname:
-        netloc = u.netloc.lower()
-        if len(netloc.split('.')) > 2 and netloc.startswith("www."):
-            netloc = netloc[4:]
+        netloc = strip_www(u.netloc.lower())
 
         # http://code.google.com/web/ajaxcrawling/docs/specification.html
         fragment = u.fragment if u.fragment.startswith("!") else ""
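
As a sanity check that this substitution is behavior-preserving (a sketch, assuming strip_www from the hunk above is in scope): a netloc containing n dots splits into n + 1 labels, so len(netloc.split('.')) > 2 and netloc.count('.') >= 2 are the same test:

    for netloc in ("www.example.com", "www.com", "example.com", "www.a.b.co"):
        # old inlined logic from UrlParser
        if len(netloc.split('.')) > 2 and netloc.startswith("www."):
            old = netloc[4:]
        else:
            old = netloc
        assert old == strip_www(netloc), netloc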