Refactor and reorganize email canonicalization; add some tests.

The email_attrs thing wasn't actually being used (`if False`) and so
I've removed it in the process.
This commit is contained in:
Neil Williams
2012-12-21 16:09:05 -08:00
parent 9b66333c2e
commit 76594cdeb6
5 changed files with 59 additions and 62 deletions

View File

@@ -1488,3 +1488,27 @@ class GoldPrice(object):
def config_gold_price(v, key=None, data=None):
return GoldPrice(v)
def canonicalize_email(email):
"""Return the given email address without various localpart manglings.
a.s.d.f+something@gmail.com --> asdf@gmail.com
This is not at all RFC-compliant or correct. It's only intended to be a
quick heuristic to remove commonly used mangling techniques.
"""
if not email:
return ""
email = _force_utf8(email.lower())
localpart, at, domain = email.partition("@")
if not at or "@" in domain:
return ""
localpart = localpart.replace(".", "")
localpart = localpart.partition("+")[0]
return localpart + "@" + domain

View File

@@ -27,7 +27,7 @@ from r2.lib.db import tdb_cassandra
from r2.lib.memoize import memoize
from r2.lib.utils import modhash, valid_hash, randstr, timefromnow
from r2.lib.utils import UrlParser
from r2.lib.utils import constant_time_compare
from r2.lib.utils import constant_time_compare, canonicalize_email
from r2.lib.cache import sgm
from r2.lib import filters
from r2.lib.log import log_text
@@ -570,19 +570,7 @@ class Account(Thing):
return which.get(canon, None)
def canonical_email(self):
email = self.email.lower().encode('utf-8')
if email.count("@") != 1:
return "invalid@invalid.invalid"
localpart, domain = email.split("@")
# a.s.d.f+something@gmail.com --> asdf@gmail.com
localpart.replace(".", "")
plus = localpart.find("+")
if plus > 0:
localpart = localpart[:plus]
return localpart + "@" + domain
return canonicalize_email(self.email)
def cromulent(self):
"""Return whether the user has validated their email address and

View File

@@ -142,47 +142,6 @@ class AdminTools(object):
author = authors[aid]
author._incr('spammer', len(author_things) if spam else -len(author_things))
def email_attrs(self, account_ids, return_dict=True):
account_ids, single = tup(account_ids, True)
accounts = Account._byID(account_ids, data=True, return_dict=False)
rv = {}
canons = {}
aids_by_canon = {} # sounds terrifying
for a in accounts:
attrs = []
rv[a._id] = attrs
if not getattr(a, "email", None):
attrs.append(("gray", "no email specified", "X"))
else:
canon = a.canonical_email()
aids_by_canon.setdefault(canon, [])
aids_by_canon[canon].append(a._id)
verify_str = "verified email: " + canon
if getattr(a, "email_verified", None):
attrs.append(("green", verify_str, "V"))
else:
attrs.append(("gray", "un" + verify_str, "@"))
ban_reasons = Account.which_emails_are_banned(aids_by_canon.keys())
for canon, ban_reason in ban_reasons.iteritems():
if ban_reason:
for aid in aids_by_canon[canon]:
rv[aid].append(("wrong", "banned email " + ban_reason, "B"))
if single:
return rv[account_ids[0]]
elif return_dict:
return rv
else:
return filter(None, (rv.get(i) for i in account_ids))
def set_last_sr_ban(self, things):
by_srid = {}
for thing in things:

View File

@@ -74,13 +74,10 @@ class Builder(object):
authors = {}
cup_infos = {}
email_attrses = {}
friend_rels = None
if aids:
authors = Account._byID(aids, data=True, stale=self.stale) if aids else {}
cup_infos = Account.cup_info_multi(aids)
if c.user_is_admin:
email_attrses = admintools.email_attrs(aids, return_dict=True)
if user and user.gold:
friend_rels = user.friend_rels()
@@ -168,10 +165,6 @@ class Builder(object):
args['kind'] = 'special'
add_attr(w.attribs, **args)
if False and w.author and c.user_is_admin:
for attr in email_attrses[w.author._id]:
add_attr(w.attribs, attr[2], label=attr[1])
if w.author and w.author._id in cup_infos and not c.profilepage:
cup_info = cup_infos[w.author._id]
label = _(cup_info["label_template"]) % \

View File

@@ -32,5 +32,38 @@ class UtilsTest(unittest.TestCase):
expect('z', 5)
self.assertRaises(ValueError, expect, None, 6)
class TestCanonicalizeEmail(unittest.TestCase):
def test_empty_string(self):
canonical = utils.canonicalize_email("")
self.assertEquals(canonical, "")
def test_unicode(self):
canonical = utils.canonicalize_email(u"\u2713@example.com")
self.assertEquals(canonical, "\xe2\x9c\x93@example.com")
def test_localonly(self):
canonical = utils.canonicalize_email("invalid")
self.assertEquals(canonical, "")
def test_multiple_ats(self):
canonical = utils.canonicalize_email("invalid@invalid@invalid")
self.assertEquals(canonical, "")
def test_remove_dots(self):
canonical = utils.canonicalize_email("d.o.t.s@example.com")
self.assertEquals(canonical, "dots@example.com")
def test_remove_plus_address(self):
canonical = utils.canonicalize_email("fork+nork@example.com")
self.assertEquals(canonical, "fork@example.com")
def test_unicode_in_byte_str(self):
# this shouldn't ever happen, but some entries in postgres appear
# to be byte strings with non-ascii in 'em.
canonical = utils.canonicalize_email("\xe2\x9c\x93@example.com")
self.assertEquals(canonical, "\xe2\x9c\x93@example.com")
if __name__ == '__main__':
unittest.main()