doc tweak: make usage in data-scripts consistent with filenames in data/

Dan Wheeler
2015-11-09 22:53:36 -08:00
parent 87b555ff3f
commit 92f5ce5e29
3 changed files with 7 additions and 3 deletions


@@ -16,7 +16,7 @@ def usage():
tokenize a directory of text and count unigrams.
usage:
-%s input_dir ../data/written_english.txt
+%s input_dir ../data/english_wikipedia.txt
input_dir is the root directory where sentence files live. Each file should contain
one sentence per line, with punctuation. This script will walk the directory recursively,
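The usage text above describes the script's contract: walk `input_dir` recursively, treating each line of each file as one punctuated sentence, and count unigrams. A minimal sketch of that walk-and-count loop (function name and tokenization regex are hypothetical here, not taken from the repo's script):

```python
import os
import re
from collections import Counter

def count_unigrams(input_dir):
    """Recursively walk input_dir; count lowercase word tokens per sentence line."""
    counts = Counter()
    for root, _dirs, files in os.walk(input_dir):
        for name in files:
            with open(os.path.join(root, name), encoding="utf8") as f:
                for sentence in f:
                    # drop punctuation by keeping alphabetic runs (plus apostrophes)
                    counts.update(re.findall(r"[a-z']+", sentence.lower()))
    return counts
```

The real script does more (normalization, output formatting), but the recursive walk plus a `Counter` is the core of it.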


@@ -17,7 +17,7 @@ https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists#TV_and_movie_scripts
Put those into a single directory and point it to this script:
-%s wiktionary_html_dir ../data/spoken_english.txt
+%s wiktionary_html_dir ../data/us_tv_and_film.txt
output.txt will include one line per word in the study, ordered by rank, of the form:
@@ -31,6 +31,7 @@ def parse_wiki_tokens(html_doc_str):
results = []
last3 = ['', '', '']
header = True
+skipped = 0
for line in html_doc_str.split('\n'):
last3.pop(0)
last3.append(line.strip())
@@ -49,9 +50,12 @@ def parse_wiki_tokens(html_doc_str):
#
# otherwise end up with a bunch of duplicates eg victor / victor's
if token.endswith("'s") and rank > 1000:
+skipped += 1
continue
count = int(count)
results.append((rank, token, count))
+# early docs have 1k entries, later 2k, last 1284
+assert len(results) + skipped in [1000, 2000, 1284]
return results
def normalize(token):
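The hunks above introduce a `skipped` counter so that filtered-out possessives still figure into the sanity assert on the entry count. The pattern, lifted out as a hypothetical standalone helper (the real `parse_wiki_tokens` parses HTML; this only shows the filter-and-account step):

```python
def filter_possessives(tokens, rank_cutoff=1000):
    """tokens: list of (rank, token, count). Returns (kept, skipped_count).

    Past the cutoff, "'s" forms like "victor's" would just duplicate "victor",
    so they are dropped -- but counted, so totals still check out.
    """
    kept, skipped = [], 0
    for rank, token, count in tokens:
        if token.endswith("'s") and rank > rank_cutoff:
            skipped += 1
            continue
        kept.append((rank, token, count))
    # same bookkeeping as the diff's assert: nothing is silently lost
    assert len(kept) + skipped == len(tokens)
    return kept, skipped
```

Counting skips instead of ignoring them is what lets the script assert an exact expected total per source document.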


@@ -9,7 +9,7 @@ sprintf = require('sprintf-js').sprintf
check_usage = () ->
usage = '''
-Run a frequency count on the raw 10M xato password set and keep the top 40k by
+Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF in
descending frequency. That file can be found by googling around for:
"xato 10-million-combos.txt"