This commit is contained in:
cutemeli
2025-12-22 10:35:30 +00:00
parent 0bfc6c8425
commit 5ce7ca2c5d
38927 changed files with 0 additions and 4594700 deletions

View File

@@ -1,135 +0,0 @@
#!/usr/bin/python
import os
import sys
import time
import codecs
import json
from operator import itemgetter
def usage():
    '''Return the command-line help text, with this script's path filled in.'''
    template = '''
usage:
%s data-dir src/Matchers/frequency_lists.json
generates frequency_lists.json (zxcvbn's ranked dictionary file) from word frequency data.
data-dir should contain frequency counts, as generated by the data-scripts/count_* scripts.
DICTIONARIES controls which frequency data will be included and at maximum how many tokens
per dictionary.
If a token appears in multiple frequency lists, it will only appear once in emitted .json file,
in the dictionary where it has lowest rank.
Short tokens, if rare, are also filtered out. If a token has higher rank than 10**(token.length),
it will be excluded because a bruteforce match would have given it a lower guess score.
A warning will be printed if DICTIONARIES contains a dictionary name that doesn't appear in
passed data dir, or vice-versa.
'''
    return template % sys.argv[0]
# maps dict name to num words. None value means "include all words"
# Keys must match the basenames of the count files found in the data dir
# (see parse_frequency_lists); values are the post-filter cutoffs applied
# in filter_frequency_lists.
DICTIONARIES = dict(
    us_tv_and_film = 30000,
    english_wikipedia = 30000,
    passwords = 30000,
    surnames = 10000,
    male_names = None,    # no cutoff: keep every name
    female_names = None,  # no cutoff: keep every name
)
# returns {list_name: {token: rank}}, as tokens and ranks occur in each file.
def parse_frequency_lists(data_dir):
    '''Read every frequency-count file in data_dir into ranked dictionaries.

    Each file named <list_name>.<ext> becomes an entry
    {list_name: {token: rank}}, where rank is the token's 1-based line
    number and the token is the first whitespace-separated field on the
    line. Files whose basename is not in DICTIONARIES are skipped with a
    warning; DICTIONARIES names with no matching file also get a warning.
    '''
    freq_lists = {}
    for filename in os.listdir(data_dir):
        freq_list_name, ext = os.path.splitext(filename)
        if freq_list_name not in DICTIONARIES:
            msg = 'Warning: %s appears in %s directory but not in DICTIONARY settings. Excluding.'
            print(msg % (freq_list_name, data_dir))
            continue
        token_to_rank = {}
        with codecs.open(os.path.join(data_dir, filename), 'r', 'utf8') as f:
            for i, line in enumerate(f):
                rank = i + 1  # rank starts at 1
                parts = line.split()
                if not parts:
                    # robustness fix: a blank line used to crash on [0].
                    # NOTE: its rank number is still consumed, as before.
                    continue
                token = parts[0]
                token_to_rank[token] = rank
        freq_lists[freq_list_name] = token_to_rank
    for freq_list_name in DICTIONARIES:
        if freq_list_name not in freq_lists:
            msg = 'Warning: %s appears in DICTIONARY settings but not in %s directory. Excluding.'
            # bug fix: original interpolated undefined name `freq_list`,
            # raising NameError whenever this warning path was reached.
            print(msg % (freq_list_name, data_dir))
    return freq_lists
def is_rare_and_short(token, rank):
    '''True when token is too infrequent for its length to be worth keeping.

    A token of length L is kept only while its rank stays below 10**L;
    past that, a brute-force match would guess it sooner anyway.
    '''
    threshold = 10 ** len(token)
    return rank >= threshold
def has_comma_or_double_quote(token, rank, lst_name):
    '''True when token contains a character that breaks comma-joined output.'''
    # hax, switch to csv or similar if this excludes too much.
    # simple comma joining has the advantage of being easy to process
    # client-side w/o needing a lib, and so far this only excludes a few
    # very high-rank tokens eg 'ps8,000' at rank 74868 from wikipedia list.
    return any(bad in token for bad in (',', '"'))
def filter_frequency_lists(freq_lists):
    '''
    filters frequency data according to:
    - filter out short tokens if they are too rare.
    - filter out tokens if they already appear in another dict
      at lower rank.
    - cut off final freq_list at limits set in DICTIONARIES, if any.
    '''
    # pass 1: for every token, find the single list where it ranks best.
    best_rank = {}  # token -> lowest rank seen across all freq lists
    best_list = {}  # token -> name of the freq list holding that rank
    for list_name, token_to_rank in freq_lists.items():
        for token, rank in token_to_rank.items():
            if token in best_rank:
                assert token in best_list
                assert best_list[token] != list_name, 'same token occurs multiple times in %s' % list_name
                if rank < best_rank[token]:
                    best_rank[token] = rank
                    best_list[token] = list_name
            else:
                assert token not in best_list
                best_rank[token] = rank
                best_list[token] = list_name
    # pass 2: keep each token only in its best list, dropping rare-short
    # tokens and tokens that would break the comma-joined output format.
    kept = {list_name: [] for list_name in freq_lists}          # name -> [(token, rank), ...]
    kept_counts = {list_name: 0 for list_name in freq_lists}    # name -> running token count
    for list_name, token_to_rank in freq_lists.items():
        for token, rank in token_to_rank.items():
            if best_list[token] != list_name:
                continue
            if is_rare_and_short(token, rank) or has_comma_or_double_quote(token, rank, list_name):
                continue
            kept[list_name].append((token, rank))
            kept_counts[list_name] += 1
    # pass 3: sort by rank, apply per-list cutoffs, and strip the ranks.
    result = {}
    for list_name, pairs in kept.items():
        pairs.sort(key=itemgetter(1))
        limit = DICTIONARIES[list_name]
        if limit and len(pairs) > limit:
            pairs = pairs[:limit]
        result[list_name] = [pair[0] for pair in pairs]  # discard rank post-sort
    return result
def to_kv(lst, lst_name):
    '''Render lst as a JS-style `name: "a,b,c".split(",")` key-value string.'''
    joined = ','.join(lst)
    return '%s: "%s".split(",")' % (lst_name, joined)
def main():
    '''Parse raw frequency counts and write the filtered JSON dictionary file.

    Expects exactly two CLI arguments: the data directory and the output
    path. Prints usage and exits (status 0) otherwise.
    '''
    if len(sys.argv) != 3:
        print(usage())
        sys.exit(0)
    data_dir, output_file = sys.argv[1:]
    raw_lists = parse_frequency_lists(data_dir)
    final_lists = filter_frequency_lists(raw_lists)
    with codecs.open(output_file, 'w', 'utf8') as out:
        json.dump(final_lists, out)

if __name__ == '__main__':
    main()

View File

@@ -1,105 +0,0 @@
#!/usr/bin/python
import sys
import json as simplejson
def usage():
    '''Return the command-line help text, with this script's path filled in.'''
    template = '''
constructs adjacency_graphs.json from QWERTY and DVORAK keyboard layouts
usage:
%s src/Matchers/adjacency_graphs.json
'''
    return template % sys.argv[0]
qwerty = r'''
`~ 1! 2@ 3# 4$ 5% 6^ 7& 8* 9( 0) -_ =+
qQ wW eE rR tT yY uU iI oO pP [{ ]} \|
aA sS dD fF gG hH jJ kK lL ;: '"
zZ xX cC vV bB nN mM ,< .> /?
'''
dvorak = r'''
`~ 1! 2@ 3# 4$ 5% 6^ 7& 8* 9( 0) [{ ]}
'" ,< .> pP yY fF gG cC rR lL /? =+ \|
aA oO eE uU iI dD hH tT nN sS -_
;: qQ jJ kK xX bB mM wW vV zZ
'''
keypad = r'''
/ * -
7 8 9 +
4 5 6
1 2 3
0 .
'''
mac_keypad = r'''
= / *
7 8 9 -
4 5 6 +
1 2 3
0 .
'''
def get_slanted_adjacent_coords(x, y):
    '''
    returns the six adjacent coordinates on a standard keyboard, where each row is slanted to the
    right from the last. adjacencies are clockwise, starting with key to the left, then two keys
    above, then right key, then two keys below. (that is, only near-diagonal keys are adjacent,
    so g's coordinate is adjacent to those of t,y,b,v, but not those of r,u,n,c.)
    '''
    left = (x - 1, y)
    above_left = (x, y - 1)
    above_right = (x + 1, y - 1)
    right = (x + 1, y)
    below_right = (x, y + 1)
    below_left = (x - 1, y + 1)
    return [left, above_left, above_right, right, below_right, below_left]
def get_aligned_adjacent_coords(x, y):
    '''
    returns the eight clockwise adjacent coordinates on a keypad, where each row is vert aligned.
    (doc fix: the original docstring said "nine", but the returned list has
    exactly eight neighbors — the center key itself is not included.)
    '''
    return [(x-1, y), (x-1, y-1), (x, y-1), (x+1, y-1), (x+1, y), (x+1, y+1), (x, y+1), (x-1, y+1)]
def build_graph(layout_str, slanted):
    '''
    builds an adjacency graph as a dictionary: {character: [adjacent_characters]}.
    adjacent characters occur in a clockwise order.
    for example:
    * on qwerty layout, 'g' maps to ['fF', 'tT', 'yY', 'hH', 'bB', 'vV']
    * on keypad layout, '7' maps to [None, None, None, '=', '8', '5', '4', None]
    '''
    tokens = layout_str.split()
    token_size = len(tokens[0])
    assert all(len(token) == token_size for token in tokens), 'token len mismatch:\n ' + layout_str
    # x position unit len is token len plus 1 for the following whitespace.
    x_unit = token_size + 1
    coords_for = get_slanted_adjacent_coords if slanted else get_aligned_adjacent_coords
    position_table = {}  # maps from tuple (x,y) -> characters at that position.
    for y, row in enumerate(layout_str.split('\n')):
        # the way the layouts are illustrated, each qwerty row is indented
        # one space in from the last
        slant = y - 1 if slanted else 0
        for token in row.split():
            x, remainder = divmod(row.index(token) - slant, x_unit)
            assert remainder == 0, 'unexpected x offset for %s in:\n%s' % (token, layout_str)
            position_table[(x, y)] = token
    graph = {}
    for (x, y), chars in position_table.items():
        for char in chars:
            # position in the list indicates direction
            # (for qwerty, 0 is left, 1 is top, 2 is top right, ...)
            # for edge chars like 1 or m, None stands in as a placeholder
            # so that each character in the graph has a same-length adjacency list.
            graph[char] = [position_table.get(coord) for coord in coords_for(x, y)]
    return graph
if __name__ == '__main__':
    # Expect exactly one CLI argument: the output path for the JSON graphs.
    if len(sys.argv) != 2:
        print(usage())
        sys.exit(0)
    with open(sys.argv[1], 'w') as out:
        graphs = {
            'qwerty': build_graph(qwerty, True),
            'dvorak': build_graph(dvorak, True),
            'keypad': build_graph(keypad, False),
            'mac_keypad': build_graph(mac_keypad, False),
        }
        simplejson.dump(graphs, out)
    sys.exit(0)