Verify URL scheme and domain properly in CSS properties.

utils.domain finds a domain at any position within a URL, so it is
unsuitable for validating CSS URLs.
Author: Max Goodman
Date: 2011-08-30 11:29:46 -07:00
Committed by: Neil Williams
Parent: 0828095f73
Commit: d7c2161c66
2 changed files with 22 additions and 10 deletions
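
To make the motivation concrete, here is a small Python sketch of the failure mode. naive_domain and r_domainish are hypothetical stand-ins, not the real utils.domain (whose source is not shown here); like utils.domain, the stand-in accepts a hostname-shaped substring anywhere in the URL, so a URL with a disallowed scheme can still name an allowed domain:

    import re

    # Hypothetical stand-in for utils.domain: it matches the first
    # hostname-shaped substring *anywhere* in the URL, not just in the
    # authority component.
    r_domainish = re.compile(r"([a-z0-9-]+(?:\.[a-z0-9-]+)+)", re.I)

    def naive_domain(url):
        m = r_domainish.search(url)
        return m.group(1).lower() if m else url

    allowed = set(["static.example.com"])  # stand-in for g.allowed_css_linked_domains

    # The intended case passes:
    print naive_domain("http://static.example.com/bg.png") in allowed            # True

    # But so does a javascript: URL that merely mentions the allowed domain:
    print naive_domain("javascript://static.example.com/%0aalert(1)") in allowed  # True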


@@ -22,7 +22,7 @@
 from __future__ import with_statement
 
 from r2.models import *
-from r2.lib.utils import sanitize_url, domain, randstr
+from r2.lib.utils import sanitize_url, strip_www, randstr
 from r2.lib.strings import string_dict
 
 from r2.lib.pages.things import wrap_links
@@ -37,6 +37,7 @@ from md5 import md5
 from r2.lib.contrib.nymph import optimize_png
 
 import re
+from urlparse import urlparse
 
 import cssutils
 from cssutils import CSSParser
@@ -177,6 +178,7 @@ class ValidationError(Exception):
 local_urls = re.compile(r'\A/static/[a-z./-]+\Z')
 # substitutable urls will be css-valid labels surrounded by "%%"
 custom_img_urls = re.compile(r'%%([a-zA-Z0-9\-]+)%%')
+valid_url_schemes = ('http', 'https')
 def valid_url(prop,value,report):
     """
     checks url(...) arguments in CSS, ensuring that the contents are
@@ -214,13 +216,19 @@ def valid_url(prop,value,report):
             report.append(ValidationError(msgs['broken_url']
                                           % dict(brokenurl = value.cssText),
                                           value))
-    # allowed domains are ok
-    elif domain(url) in g.allowed_css_linked_domains:
-        pass
     else:
-        report.append(ValidationError(msgs['broken_url']
-                                      % dict(brokenurl = value.cssText),
-                                      value))
+        try:
+            u = urlparse(url)
+            valid_scheme = u.scheme and u.scheme in valid_url_schemes
+            valid_domain = strip_www(u.netloc) in g.allowed_css_linked_domains
+        except ValueError:
+            u = False
+
+        # allowed domains are ok
+        if not (u and valid_scheme and valid_domain):
+            report.append(ValidationError(msgs['broken_url']
+                                          % dict(brokenurl = value.cssText),
+                                          value))
     #elif sanitize_url(url) != url:
     #    report.append(ValidationError(msgs['broken_url']
     #                                  % dict(brokenurl = value.cssText),
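
Extracted from the diff, the new acceptance test amounts to the following self-contained sketch (Python 2, matching the urlparse import above). is_allowed_css_url and the allowed_css_linked_domains set are local stand-ins for illustration, not names from the codebase; strip_www is the helper added in the second changed file below:

    from urlparse import urlparse

    valid_url_schemes = ('http', 'https')
    allowed_css_linked_domains = set(['static.example.com'])  # stand-in for g.allowed_css_linked_domains

    def strip_www(domain):
        if domain.count('.') >= 2 and domain.startswith("www."):
            return domain[4:]
        else:
            return domain

    def is_allowed_css_url(url):
        # Mirrors the patched valid_url: parse once, then require both an
        # explicit http/https scheme and a whitelisted (www-stripped) host.
        try:
            u = urlparse(url)
        except ValueError:
            return False
        valid_scheme = u.scheme and u.scheme in valid_url_schemes
        valid_domain = strip_www(u.netloc) in allowed_css_linked_domains
        return bool(valid_scheme and valid_domain)

    print is_allowed_css_url("http://static.example.com/bg.png")       # True
    print is_allowed_css_url("https://www.static.example.com/bg.png")  # True (www. stripped)
    print is_allowed_css_url("javascript://static.example.com/x")      # False: scheme not allowed
    print is_allowed_css_url("//static.example.com/bg.png")            # False: no scheme at all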


@@ -206,6 +206,12 @@ class Results():
         else:
             raise StopIteration
 
+def strip_www(domain):
+    if domain.count('.') >= 2 and domain.startswith("www."):
+        return domain[4:]
+    else:
+        return domain
+
 r_base_url = re.compile("(?i)(?:.+?://)?(?:www[\d]*\.)?([^#]*[^#/])/?")
 def base_url(url):
     res = r_base_url.findall(url)
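
The count('.') >= 2 guard prevents strip_www from truncating a host whose registered name is literally "www". A quick interactive check of the helper exactly as defined above:

    >>> strip_www("www.example.com")
    'example.com'
    >>> strip_www("www.sub.example.com")
    'sub.example.com'
    >>> strip_www("www.com")        # only one dot: "www" is the whole name, unchanged
    'www.com'
    >>> strip_www("example.com")    # no "www." prefix, unchanged
    'example.com'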
@@ -611,9 +617,7 @@ class UrlParser(object):
         u = cls(url)
 
         # strip off any www and lowercase the hostname:
-        netloc = u.netloc.lower()
-        if len(netloc.split('.')) > 2 and netloc.startswith("www."):
-            netloc = netloc[4:]
+        netloc = strip_www(u.netloc.lower())
 
         # http://code.google.com/web/ajaxcrawling/docs/specification.html
         fragment = u.fragment if u.fragment.startswith("!") else ""
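
As a sanity check that this substitution is behavior-preserving (a sketch, assuming strip_www from the hunk above is in scope): a netloc containing n dots splits into n + 1 labels, so len(netloc.split('.')) > 2 and netloc.count('.') >= 2 are the same test:

    for netloc in ("www.example.com", "www.com", "example.com", "www.a.b.co"):
        # old inlined logic from UrlParser
        if len(netloc.split('.')) > 2 and netloc.startswith("www."):
            old = netloc[4:]
        else:
            old = netloc
        assert old == strip_www(netloc), netloc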