#!/usr/bin/env python3
# encoding: utf-8
"""Get all layout results from the results folder.

Depends on the layout info in the result files starting with 'Evolved Layout'.
"""

from check_neo import print_layout_with_statistics, get_all_data, total_cost
from layout_base import find_layout_families, Layout, Layouts
from layout_info import csv_data
from ngrams import split_uppercase_trigrams
import layout_cost
import logging
from regularity_check import regularity, std
from os import listdir, mkdir
from os.path import join, isdir
from subprocess import call
import sys
import multiprocessing as mp
# mp = mp.get_context('spawn')  # avoid copying the full memory for each process. Needs Python 3.4+.


def get_all_layouts_in_textfile(textfile):
    """Get all layouts in the given textfile.

    @return: a list of Layout objects."""
    with open(textfile, encoding="utf-8") as f:
        try:
            d = f.read()
        except UnicodeError:
            print("can’t open", textfile)
            return []
    # normal result splitting
    if "Evolved Layout" in d:
        e = d.split("Evolved Layout")
    else:  # splitting for sorted best-XX files.
        e = d.split("####")
    layout_strings = []
    for i in e[1:]:
        # the layout string is the three lines directly after the marker line
        layout_strings.append("\n".join(i.splitlines()[1:4]))
    all_layouts = []
    for l in layout_strings:
        if l.strip():
            try:
                all_layouts.append(Layout.from_string(l, Layouts.NEO2))
            except IndexError:
                print("parsing failed for the layout string:")
                print(l)
    return all_layouts
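
# get_all_layouts_in_textfile expects result files in which each layout follows a
# marker line ("Evolved Layout" in normal results, "####" in sorted best-XX files)
# and fills the next three lines; roughly (an illustrative sketch, not an exact dump):
#
#     Evolved Layout ...
#     <layer-1 line 1>
#     <layer-1 line 2>
#     <layer-1 line 3>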


def get_all_layouts_in_text_files_in(folder="results", namepart=""):
    """Get all layouts from check_neo runs saved in the text files in the given folder."""
    all_layouts = []
    for i in listdir(folder):
        if ((not i.endswith(".txt") and not i.endswith(".out")) or
                (namepart and namepart not in i)):
            continue
        print("# reading", join(folder, i), file=sys.stderr)
        all_layouts.extend(get_all_layouts_in_textfile(join(folder, i)))
    return all_layouts


def check_regularity(lay, textfile):
    segments, words = regularity(
        lay, textfile,
        output=None, output_words=None,  # avoid writing unnecessary output files
        verbose=False,
        maxsegments=320, maxwords=320)  # limit the runtime of this check by only reading the beginning of the file
    return segments, words

# 320,320 should be enough, see:
# import pylab as pl
# pl.ion()
# pl.plot([10, 20, 40, 80, 160, 320, 640], [250.78243431915746, 269.44446973415927, 272.38856521679264, 242.11454523066135, 206.93258984508503, 247.160678690749, 239.74251752409359], label="segments")
# pl.plot([20, 40, 80, 160, 320, 640], [1307.0456755768544, 1033.2352400430161, 886.1693410012002, 995.1180056582663, 738.5571115676071, 572.0787321328082], label="words")
# pl.legend()


def main(options, args):
    # ensure that irregularity is always using the same words, regardless of the cost
    # FIXME: Add a clean option for this
    layout_cost.IRREGULARITY_WORDS_RANDOMLY_SAMPLED_FRACTION = 1.0
    # reduce the cost by choosing an optimized reference text (only the lines which are closest to the ngram distribution in the corpus).
    layout_cost.IRREGULARITY_REFERENCE_TEXT = "beispieltext-prosa-best-lines.txt"
    if options.print_csv:
        print("layoutstring,total penalty per letter,key position cost,finger repeats,disbalance of fingers,top to bottom or vice versa,handswitching in trigram,(rows²/dist)²,shortcut keys,handswitching after unbalancing,movement pattern,hand disbalance, manual penalty, neighboring unbalance, asymmetric bigrams, asymmetric similar keys, irregularity, regularity segments mean,regularity segments std,regularity words mean,regularity words std")
    all_layouts = get_all_layouts_in_text_files_in(folder=options.folder, namepart=options.namepart)
    letters, number_of_letters, repeats, number_of_bigrams, trigrams, number_of_trigrams = get_all_data(datapath=options.data)
    trigrams = split_uppercase_trigrams(trigrams)
    if options.families:
        # sort the layouts by value, lowest total cost first.
        lays = []
        for lay in all_layouts:
            cost = total_cost(lay, letters=letters, repeats=repeats, trigrams=trigrams)[0]
            lays.append((cost, lay))
            print(lay.to_layer_1_string())
            print()
        lays.sort()
        # remove the cost information again.
        lays = [lay for co, lay in lays]
        layout_families = find_layout_families(lays, letters, max_diff=options.family_threshold)
        # all layouts should contain only the best from each family.
        all_layouts = [fam[0] for fam in layout_families]
        # make sure the best is shown last
        all_layouts.reverse()
    textfile = options.data
    if textfile is None:
        textfile = "beispieltext-prosa.txt"
    if options.regularity:
        def f(lay):
            return (lay,) + check_regularity(lay, textfile)

        def tuplit(t):
            """Recursively turn nested lists into tuples, so a layout can be used as a dict key."""
            return tuple(map(tuplit, t)) if isinstance(t, (list, tuple)) else t

        # with mp.Pool(3) as p:  # eats about 500 MiB per process
        regularity_data = {}
        for lay, seg, word in map(f, all_layouts):
            regularity_data[tuplit(lay)] = seg, word
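        # A parallel variant, as the commented-out Pool line above suggests, could look
        # roughly like this (a sketch only; it assumes f is made picklable, e.g. by
        # moving it to module level, and that Layout objects survive pickling):
        #     with mp.Pool(3) as pool:
        #         for lay, seg, word in pool.map(f, all_layouts):
        #             regularity_data[tuplit(lay)] = seg, word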
    for lay in all_layouts:
        if options.regularity:
            print("# checking regularity for\n" + lay.to_layer_1_string(), file=sys.stderr)
            # segment_costs, word_costs = check_regularity(lay, textfile)
            segment_costs, word_costs = regularity_data[tuplit(lay)]
            cost_segments_mean = sum(segment_costs) / len(segment_costs)
            cost_segments_std = std(segment_costs)
            cost_words_mean = sum(word_costs) / len(word_costs)
            cost_words_std = std(word_costs)
            regularity = "%s,%s,%s,%s" % (cost_segments_mean, cost_segments_std, cost_words_mean, cost_words_std)
        else:
            regularity = "nan,nan,nan,nan"
        if options.print_csv:
            csv = [str(i) for i in
                   csv_data(lay, letters=letters, repeats=repeats, number_of_letters=number_of_letters, number_of_bigrams=number_of_bigrams, trigrams=trigrams, number_of_trigrams=number_of_trigrams)]
            name_lines = lay.to_layer_1_string().splitlines()
            layoutstring = "-".join((name_lines[1], name_lines[0], name_lines[2]))
            # escape double quotes inside the quoted csv field ('\"' would have been a no-op)
            layoutstring = layoutstring.replace('"', '\\"').replace(" ", "_")
            layoutstring = '"' + layoutstring + '"'
            print(layoutstring + "," + ",".join(csv) + "," + regularity)
        else:
            print_layout_with_statistics(lay, verbose=True, letters=letters, repeats=repeats, number_of_letters=number_of_letters, number_of_bigrams=number_of_bigrams, trigrams=trigrams, number_of_trigrams=number_of_trigrams)
            if options.regularity:
                call(["./regularity_check.py", "-t", textfile, "-l", lay.to_layer_1_string()])
            print()
        if options.svg:
            if not isdir("svgs"):
                mkdir("svgs")
            cost = total_cost(lay, letters=letters, repeats=repeats, trigrams=trigrams)[0]
            cost = cost / max(1, number_of_letters)
            cost = "{:>7.4f}".format(cost)
            name_lines = lay.to_layer_1_string().splitlines()
            name = "-".join((name_lines[1], name_lines[0], name_lines[2])) + ".svg"
            name = name.replace(" ", "_")
            name = cost + "-" + name
            name = join("svgs", name)
            from bigramm_statistik import print_bigram_info
            print_bigram_info(lay, number=1000, svg=True, svg_output=name, filepath=options.data)


if __name__ == "__main__":
    from optparse import OptionParser
    parser = OptionParser(description="recheck all result layouts with the current config.")
    parser.add_option("--file", dest="data", type="string", default=None,
                      help="use the given textfile as corpus instead of the pregenerated corpus", metavar="file")
    parser.add_option("--namepart", dest="namepart", type="string", default="",
                      help="read only files whose names contain the given string", metavar="string")
    parser.add_option("--folder", dest="folder", type="string", default="results",
                      help="search for result files in the given folder (not recursive; requires a .txt or .out suffix)", metavar="string")
    parser.add_option("--csv",
                      action="store_true", dest="print_csv", default=False,
                      help="print a csv instead of the normal layout statistics")
    parser.add_option("--regularity",
                      action="store_true", dest="regularity", default=None,
                      help="check the regularity of each result layout against a text file. If --file is not given, it defaults to beispieltext-prosa.txt.")
    parser.add_option("--svg",
                      action="store_true", dest="svg", default=None,
                      help="save an svg file in the folder svgs/ for every printed layout. Can take a long time. You might want to use --families, too.")
    parser.add_option("--families",
                      action="store_true", dest="families", default=False,
                      help="sort the layouts into families and print only the best layout in each family.")
    parser.add_option("--family-threshold", dest="family_threshold", type="float", default=0.6,
                      help="treat layouts with at most the given difference as belonging to the same family. Default: 0.6", metavar="max_difference")
    (options, args) = parser.parse_args()
    main(options, args)
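
# Example invocations (a sketch; the output file name is just an example):
#     ./recheck_all_result_layouts.py --csv --folder results > rechecked.csv
#     ./recheck_all_result_layouts.py --families --svg --file beispieltext-prosa.txt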