zxcvbn/data-scripts/count_xato.coffee

matching = require '../lib/matching'
scoring = require '../lib/scoring'

fs = require 'fs'
byline = require 'byline'
sprintf = require('sprintf-js').sprintf


# Validate how the script was launched. When the invocation does not look like
#   coffee count_xato.coffee --nodejs <xato_file> <output_file>
# run from inside the data-scripts directory, print the help text and exit.
check_usage = () ->
  help_text = '''

  Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
  descending frequency. That file can be found by googling around for:
  "xato 10-million-combos.txt"

  Passwords that both:
  -- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
  -- have a higher rank than the corresponding match guess number

  are excluded from the final password set, since zxcvbn would score them lower through
  other means anyhow. in practice this rules out dates and years most often and makes room
  for more useful data.

  To use, first run from zxcvbn base dir:

  npm run build

  then change into data-scripts directory and run:

  coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt

  '''
  argv = process.argv
  # evaluated lazily so the directory check only runs once the argv checks pass
  has_nodejs_flag = -> argv[2] == '--nodejs' or argv[2] == '-n'
  in_data_scripts = -> __dirname.split('/').slice(-1)[0] == 'data-scripts'
  invoked_correctly = argv.length == 5 and argv[0] == 'coffee' and has_nodejs_flag() and in_data_scripts()
  return if invoked_correctly
  console.log help_text
  process.exit(0)

# after all passwords are counted, discard pws with counts <= CUTOFF
# (only entries with count > CUTOFF are written to the output file).
CUTOFF = 10

# to save memory, after every batch of BATCH_SIZE input lines, go through counts and delete
# the long tail of entries with only one count.
BATCH_SIZE = 1000000

counts = {}       # maps normalized (lowercased) pw -> count
skipped_lines = 0 # skipped lines in xato file -- lines w/o exactly two whitespace-separated tokens
line_count = 0    # number of lines read so far (skipped lines included)

# Canonical form used as the key when counting: the token lowercased.
normalize = (token) ->
  lowered = token.toLowerCase()
  return lowered

# Decide whether `password`, sitting at 1-based frequency rank `xato_rank`,
# belongs in the final list. Returns false when:
#   - the password contains any char code above 127 (logged, then skipped), or
#   - some spatial / repeat / sequence / regex / date match covers the whole
#     password and its guess estimate is lower than the rank.
# Returns true otherwise.
should_include = (password, xato_rank) ->
  # xato is almost entirely ascii, so in practice this drops only one or two
  # entries above the cutoff. for a different data source, consider a
  # unidecode-like library instead, similar to count_wikipedia / count_wiktionary
  if /[^\x00-\x7f]/.test password
    console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
    return false

  matchers = [
    matching.spatial_match
    matching.repeat_match
    matching.sequence_match
    matching.regex_match
    matching.date_match
  ]
  found = []
  for find_matches in matchers
    Array::push.apply found, find_matches.call(matching, password)

  # only matches spanning the entire password are relevant
  last_index = password.length - 1
  spanning = found.filter (candidate) ->
    candidate.i == 0 and candidate.j == last_index

  # exclude the entry as soon as a non-dictionary match would be assigned
  # a lower guess estimate than the dictionary rank.
  beaten = spanning.some (candidate) ->
    scoring.estimate_guesses(candidate, password) < xato_rank
  return not beaten

# Delete, in place, every entry of `counts` whose count is exactly 1.
# Called periodically to keep the long tail of one-off passwords from
# exhausting memory. Returns nothing.
prune = (counts) ->
  for pw, count of counts
    delete counts[pw] if count == 1
  # explicit return: without it CoffeeScript implicitly returns the loop as a
  # comprehension, allocating a throwaway array with one slot per key -- the
  # opposite of what a memory-saving pass wants.
  return

# Stream the xato dump line by line, count normalized passwords, then write the
# surviving entries to `output_filename`, most frequent first.
#
# xato_filename   -- path to the raw dump; each usable line is "<username> <password>"
# output_filename -- path of the file to write, one "<password> <count>" per line
#
# Lines that do not split into exactly two whitespace-separated tokens are
# skipped and tallied in skipped_lines. Every BATCH_SIZE lines, entries with a
# count of 1 are pruned to bound memory use.
main = (xato_filename, output_filename) ->
  has_own = Object::hasOwnProperty
  stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')
  stream.on 'readable', ->
    while null != (line = stream.read())
      line_count += 1
      if line_count % BATCH_SIZE == 0
        console.log 'counting tokens:', line_count
        prune counts
      tokens = line.trim().split /\s+/
      unless tokens.length == 2
        skipped_lines += 1
        continue
      password = normalize tokens[1]
      # own-property check rather than `of`: `of` compiles to the JS `in`
      # operator, which also sees inherited keys, so a password such as
      # "constructor" would be "incremented" from an inherited function and
      # end up with a garbage string instead of a count.
      if has_own.call counts, password
        counts[password] += 1
      else
        counts[password] = 1
    # explicit return so the while loop is not collected into a results array
    return
  stream.on 'end', ->
    console.log 'skipped lines:', skipped_lines
    pairs = []
    console.log 'copying to tuples'
    for pw, count of counts
      if count > CUTOFF
        pairs.push [pw, count]
      delete counts[pw] # save memory to avoid v8 1GB limit
    console.log 'sorting'
    pairs.sort (p1, p2) ->
      # sort by count. higher counts go first.
      p2[1] - p1[1]
    console.log 'filtering'
    pairs = pairs.filter (pair, i) ->
      # rank is the 1-based position in the frequency-sorted list
      should_include pair[0], i + 1
    output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
    for pair in pairs
      [pw, count] = pair
      output_stream.write sprintf("%-15s %d\n", pw, count)
    output_stream.end()

# script entry point: validate the invocation, then run the count with the
# input and output paths taken from the command line.
check_usage()
[input_path, output_path] = process.argv[3..4]
main input_path, output_path