-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_default_wf_lists.py
138 lines (101 loc) · 4.08 KB
/
create_default_wf_lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import time
import requests
from frequencyman.static_lang_data import DEFAULT_WF_LISTS_SOURCES
from frequencyman.text_processing import LangId
from frequencyman.language_data import WordFrequencyLists
def add_en_contractions(wf_list: list[str]) -> list[str]:
    """Return a copy of ``wf_list`` with English contractions inserted.

    For every contraction in the table below, the positions of its two
    expansion words are looked up in the *original* ``wf_list`` (the word
    "i" is never looked up; its position is taken to be 0). The contraction
    is then inserted into the copy at::

        max(pos1, pos2) + (abs(pos2 - pos1) + 125) * 5

    capped at the current length of the copy. ``wf_list`` itself is left
    unmodified.

    Args:
        wf_list: List of words; presumably ordered from most to least
            frequent (the caller passes a word frequency list).

    Returns:
        A new list containing every word of ``wf_list`` in its original
        relative order, plus the contractions.

    Raises:
        ValueError: If an expansion word other than "i" is not in ``wf_list``.
        Exception: If a contraction is already present in the list.
    """
    # Contraction -> the two words it stands for.
    expansions = {
        "don't": ("do", "not"),
        "i'll": ("i", "will"),
        "didn't": ("did", "not"),
        "can't": ("can", "not"),
        "you're": ("you", "are"),
        "i've": ("i", "have"),
        "we're": ("we", "are"),
        "doesn't": ("does", "not"),
        "won't": ("will", "not"),
        "we'll": ("we", "will"),
        "they're": ("they", "are"),
        "you'll": ("you", "will"),
        "haven't": ("have", "not"),
        "couldn't": ("could", "not"),
        "wouldn't": ("would", "not"),
        "isn't": ("is", "not"),
        "you've": ("you", "have"),
        "aren't": ("are", "not"),
        "shouldn't": ("should", "not"),
        "we've": ("we", "have"),
        "you'd": ("you", "would"),
        "weren't": ("were", "not"),
        "hasn't": ("has", "not"),
        "wasn't": ("was", "not"),
        "they'll": ("they", "will"),
        "we'd": ("we", "would"),
        "hadn't": ("had", "not"),
        "they've": ("they", "have"),
        "she'll": ("she", "will"),
        "would've": ("would", "have"),
        "he'll": ("he", "will"),
        "must've": ("must", "have"),
        "should've": ("should", "have"),
        "that'll": ("that", "will"),
        "they'd": ("they", "would"),
        "could've": ("could", "have"),
        "cannot": ("can", "not")
    }

    # Work on a copy so the caller's list stays untouched.
    expanded = wf_list.copy()

    for contraction, parts in expansions.items():
        first_word, second_word = parts

        # Positions always refer to the original list, not the growing copy.
        first_pos = 0 if first_word == 'i' else wf_list.index(first_word.lower())
        second_pos = wf_list.index(second_word.lower())

        # The further apart the two words are, the further down the
        # contraction is pushed below the lower-ranked of the two.
        offset = int((abs(second_pos - first_pos) + 125) * 5)
        insert_at = min(max(first_pos, second_pos) + offset, len(expanded))

        lowered = contraction.lower()
        if lowered in expanded:
            raise Exception("Contraction {} already in list!".format(contraction))

        expanded.insert(insert_at, lowered)

    return expanded
def download_default_wf_lists():
    """Download and trim the default word frequency lists.

    For every ``lang_id -> url`` entry in ``DEFAULT_WF_LISTS_SOURCES`` this:

    1. skips the entry if ``default_wf_lists/<lang_id>.txt`` already exists;
    2. downloads the source file into ``default_wf_lists`` under the URL's
       base name;
    3. reads words from it via ``WordFrequencyLists.get_words_from_file``
       until the list exceeds 50,000 words or 80,000 UTF-8 bytes;
    4. for ``lang_id == 'en'`` adds contractions via ``add_en_contractions``;
    5. writes the words, one per line, to the target file (nothing is
       written when no words were read);
    6. removes the downloaded file and sleeps one second before the next
       entry.

    Raises:
        Exception: If a download returns a non-2xx status code.
        requests.exceptions.RequestException: If the request fails or does
            not respond within the timeout.
    """
    target_dir = 'default_wf_lists'
    # Without a timeout requests waits indefinitely on a stalled connection.
    request_timeout_seconds = 30
    max_words = 50_000
    max_bytes = 80_000

    for lang_id, wf_list_url in DEFAULT_WF_LISTS_SOURCES.items():

        print(wf_list_url)

        target_file = os.path.join(target_dir, lang_id+'.txt')

        # Already generated on a previous run.
        if os.path.isfile(target_file):
            continue

        response = requests.get(wf_list_url, timeout=request_timeout_seconds)

        if response.status_code // 100 != 2:
            raise Exception("Error downloading file \"{}\". Status code: {}".format(wf_list_url, response.status_code))

        # The download keeps the URL's base name.
        temp_file = os.path.join(target_dir, os.path.basename(wf_list_url))

        try:
            with open(temp_file, 'wb') as f:
                f.write(response.content)

            wf_list: list[str] = []
            wf_list_size = 0

            # Only the first two characters of lang_id are passed to LangId
            # (presumably the base language code - confirm against LangId).
            for word, _ in WordFrequencyLists.get_words_from_file(temp_file, LangId(lang_id[:2])):
                wf_list.append(word)
                wf_list_size += len(word.encode("utf-8"))
                # Limits are checked after appending, so the word that crosses
                # a limit is still included.
                if len(wf_list) > max_words or wf_list_size > max_bytes:
                    break

            if wf_list:
                if lang_id == 'en':
                    wf_list = add_en_contractions(wf_list)

                with open(target_file, "w", encoding="utf-8") as f:
                    f.write("\n".join(wf_list))
        finally:
            # Always clean up the raw download, even when parsing or writing
            # failed, so a failed run does not leave stray files behind.
            if os.path.isfile(temp_file):
                os.remove(temp_file)

        # Pause between downloads.
        time.sleep(1)
# Script entry point: generate any missing default word frequency lists,
# then report completion.
if __name__ == "__main__":
    download_default_wf_lists()
    print("Done!")