mirror of
https://github.com/dropbox/zxcvbn.git
synced 2026-01-09 22:18:06 -05:00
130 lines
4.2 KiB
CoffeeScript
130 lines
4.2 KiB
CoffeeScript
matching = require '../lib/matching'
|
|
scoring = require '../lib/scoring'
|
|
|
|
fs = require 'fs'
|
|
byline = require 'byline'
|
|
sprintf = require('sprintf-js').sprintf
|
|
|
|
|
|
# Validate how the script was invoked; print the usage text and exit when it was not
# launched as `coffee <script> --nodejs|-n <xato_file> <output_file>` from a directory
# named data-scripts. Takes no arguments and returns nothing useful.
check_usage = () ->
  usage = '''

  Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
  descending frequency. That file can be found by googling around for:
  "xato 10-million-combos.txt"

  Passwords that both:
  -- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
  -- have a higher rank than the corresponding match guess number

  are excluded from the final password set, since zxcvbn would score them lower through
  other means anyhow. in practice this rules out dates and years most often and makes room
  for more useful data.

  To use, first run from zxcvbn base dir:

  npm run build

  then change into data-scripts directory and run:

  coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt

  '''
  # path.basename understands the platform's separator; splitting __dirname on '/'
  # never matched on Windows, where directories are joined with '\'.
  script_dir = require('path').basename __dirname
  valid = process.argv.length == 5 and
    process.argv[0] == 'coffee' and
    process.argv[2] in ['--nodejs', '-n'] and
    script_dir == 'data-scripts'
  unless valid
    console.log usage
    process.exit(0)
|
|
|
|
# after all passwords are counted, discard pws with counts <= CUTOFF
CUTOFF = 10

# to save memory, after every batch of size BATCH_SIZE, go through counts and delete
# long tail of entries with only one count.
BATCH_SIZE = 1000000

# maps pw -> count.
# created without a prototype: on a plain {} the test `pw of counts` is already true for
# names inherited from Object.prototype (e.g. the passwords "constructor" and "__proto__"),
# so those passwords would be "incremented" from a non-number and never counted correctly.
counts = Object.create null
skipped_lines = 0 # skipped lines in xato file -- lines w/o two tokens
line_count = 0    # current number of lines processed
|
|
|
|
# Canonical form of a password used as its key while counting: the lowercased token.
normalize = (token) -> token.toLowerCase()
|
|
|
|
# Decide whether a counted password belongs in the final list.
#
# password:  the normalized password.
# xato_rank: its 1-based position in the frequency-sorted list.
#
# Returns false when the password contains a non-ascii character, or when one of the
# spatial / repeat / sequence / regex / date matchers covers the whole password and
# scoring estimates fewer guesses for that match than xato_rank. Returns true otherwise.
should_include = (password, xato_rank) ->
  # any UTF-16 code unit above 127 disqualifies the password.
  # xato is almost entirely ascii, so in practice this only drops one or two of the top
  # passwords over the cutoff. for a different data source, a unidecode-like library
  # (as in count_wikipedia / count_wiktionary) would be the better choice.
  if /[^\x00-\x7f]/.test password
    console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
    return false

  matchers = [
    matching.spatial_match
    matching.repeat_match
    matching.sequence_match
    matching.regex_match
    matching.date_match
  ]

  # run every matcher first, then look only at matches spanning the full password
  candidates = []
  for matcher in matchers
    found = matcher.call matching, password
    candidates.push found...
  last_index = password.length - 1
  full_span = (m for m in candidates when m.i == 0 and m.j == last_index)

  for m in full_span
    # non-dictionary matching already assigns a lower guess estimate: leave the entry out
    return false if scoring.estimate_guesses(m, password) < xato_rank
  true
|
|
|
|
# Drop the long tail from a pw -> count map: every entry whose count is exactly 1 is
# deleted in place. Entries with any other count are left untouched.
#
# counts: object mapping password -> number of occurrences (mutated).
# Returns nothing.
prune = (counts) ->
  for pw, count of counts
    if count == 1
      delete counts[pw]
  # explicit return: otherwise CoffeeScript implicitly returns the loop as a comprehension,
  # allocating an array with one slot per key on every prune -- the opposite of saving memory.
  return
|
|
|
|
# Count password frequencies in the xato file and write the filtered, ranked list.
#
# xato_filename:   path of the raw xato file. Only lines made of exactly two
#                  whitespace-separated tokens are used; the second token is the password.
# output_filename: path that receives one "password count" line per kept password,
#                  in descending count order.
#
# Reads and updates the module-level counts, skipped_lines and line_count.
main = (xato_filename, output_filename) ->
  stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')

  stream.on 'readable', ->
    while null != (line = stream.read())
      line_count += 1
      if line_count % BATCH_SIZE == 0
        console.log 'counting tokens:', line_count
        prune counts
      tokens = line.trim().split /\s+/
      unless tokens.length == 2
        skipped_lines += 1
        continue
      password = normalize tokens[1]
      # own-property check: a bare `of` test is also true for names inherited from
      # Object.prototype (e.g. the password "constructor"), and `+= 1` on the inherited
      # function would turn that password's count into a string.
      if Object::hasOwnProperty.call counts, password
        counts[password] += 1
      else
        counts[password] = 1
    # keep CoffeeScript from collecting the loop's values into an unused array
    return

  stream.on 'end', ->
    console.log 'skipped lines:', skipped_lines
    pairs = []
    console.log 'copying to tuples'
    for pw, count of counts
      if count > CUTOFF
        pairs.push [pw, count]
      delete counts[pw] # save memory to avoid v8 1GB limit
    console.log 'sorting'
    pairs.sort (p1, p2) ->
      # sort by count. higher counts go first.
      p2[1] - p1[1]
    console.log 'filtering'
    pairs = pairs.filter (pair, i) ->
      # rank is the 1-based position in the sorted, not-yet-filtered list
      rank = i + 1
      [pw, count] = pair
      should_include pw, rank
    output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
    for pair in pairs
      [pw, count] = pair
      output_stream.write sprintf("%-15s %d\n", pw, count)
    output_stream.end()
|
|
|
|
# script entry point: validate the invocation (prints usage and exits when it is wrong),
# then count the xato file given as argv[3] into the output file given as argv[4].
check_usage()
[xato_path, passwords_path] = process.argv[3..4]
main xato_path, passwords_path
|