-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
executable file
·200 lines (181 loc) · 7.16 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import random as random
import numpy
import torch
from torch.autograd import Variable
def convert2tensor(x):
    """Convert *x* (any sequence accepted by ``torch.FloatTensor``) to a float32 tensor."""
    return torch.FloatTensor(x)
def to_var(x, volatile=False):
    """Move tensor *x* to the GPU when one is available and wrap it in a Variable.

    NOTE(review): ``torch.autograd.Variable`` and its ``volatile`` flag are
    deprecated since PyTorch 0.4 (tensors are Variables now; inference mode is
    ``torch.no_grad()``). Left untouched because the rest of this file appears
    to target an old torch version — confirm before modernising.
    """
    # Device placement is decided globally by CUDA availability, not per call.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)
def load_top_k(file):
    """Load a query -> nearest-words mapping from a tab-separated file.

    Each line is ``query<TAB>word1 word2 ...``; lines without a tab-separated
    second field are silently skipped.

    Args:
        file: path to the top-k neighbour file.

    Returns:
        dict mapping each query string to its list of neighbour words.
    """
    # Renamed from `dict` — the original shadowed the builtin.
    top_k = {}
    with open(file, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                query = parts[0].strip()
                # split() already yields the list; no need to copy element-wise.
                top_k[query] = parts[1].strip().split(" ")
    return top_k
def load_word_clusters(langs, function_file_path, file_suffix, vocabs):
    """Load per-language word clusters and align them across language pairs.

    Each per-language file alternates lines: a cluster (function) name line,
    then a tab-separated line of member words. Words are mapped to vocabulary
    ids; out-of-vocabulary words are dropped.

    Args:
        langs: iterable of language codes.
        function_file_path: directory containing one cluster file per language.
        file_suffix: filename suffix appended to each language code.
        vocabs: dict of language -> vocab object exposing ``word2idx``.

    Returns:
        dict keyed by ``"lang1-lang2"`` (for every ordered pair of distinct
        languages) mapping to a pair of parallel lists of id-lists for the
        cluster names both languages share.
    """
    mappings = {}
    lang_functions = {}
    for lang in langs:
        functions = {}
        word_2_idx = vocabs[lang].word2idx
        file_path = os.path.join(function_file_path, lang + file_suffix)
        function_name = ""
        with open(file_path, 'r') as f:
            for line_index, line in enumerate(f):
                if line_index % 2 == 0:
                    # Even lines carry the cluster name.
                    function_name = line.strip().lower()
                else:
                    # Odd lines carry the tab-separated member words;
                    # keep only in-vocabulary words (original shadowed builtin `id`).
                    parts = line.strip().lower().split('\t')
                    functions[function_name] = [
                        word_2_idx[part] for part in parts if part in word_2_idx
                    ]
        lang_functions[lang] = functions
    # Align every ordered pair of distinct languages on shared cluster names.
    for lang1 in langs:
        for lang2 in langs:
            if lang1 == lang2:
                continue
            functions1 = lang_functions[lang1]
            functions2 = lang_functions[lang2]
            ids1 = []  # originals were misleadingly named dict1/dict2 (they are lists)
            ids2 = []
            for func in functions1:
                if func in functions2:
                    ids1.append(functions1[func])
                    ids2.append(functions2[func])
            mappings[lang1 + "-" + lang2] = (ids1, ids2)
    return mappings
def load_functions(file, vocab, vectors):
    """Load cluster files and average member-word embeddings per cluster.

    The file alternates lines: a cluster (function) name line, then a
    tab-separated line of member words. Each cluster's vector is the mean of
    the embeddings of its in-vocabulary words (all-zeros if none are known).

    Args:
        file: path to the alternating name/members file.
        vocab: object exposing ``word2idx``.
        vectors: indexable of embedding vectors; all assumed the same length
            as ``vectors[0]`` — TODO confirm upstream.

    Returns:
        dict mapping lowercased cluster name -> numpy mean vector.
    """
    function_vectors = {}
    function_name = ""
    with open(file, 'r') as f:
        for line_index, line in enumerate(f):
            if line_index % 2 == 0:
                function_name = line.strip().lower()
            else:
                parts = line.strip().lower().split('\t')
                vec = numpy.zeros(len(vectors[0]))
                count = 0
                for part in parts:
                    if part in vocab.word2idx:
                        # Vectorized accumulation replaces the per-element loop.
                        vec += numpy.asarray(vectors[vocab.word2idx[part]],
                                             dtype=vec.dtype)
                        count += 1
                if count > 0:
                    vec /= count
                function_vectors[function_name] = vec
    return function_vectors
def _avg_pair_offset(tokens, vocab, vectors, dim):
    """Average the embedding offsets (vec[w2] - vec[w1]) over qualifying pairs.

    Each token is ``w1##w2##score``; a pair qualifies when score > 0.2 and
    both words are in *vocab*. Returns an all-zeros vector when none qualify.
    """
    vec = numpy.zeros(dim)
    count = 0
    for token in tokens:
        parts = token.split("##")
        score = float(parts[2])
        if score > 0.2 and parts[0] in vocab.word2idx and parts[1] in vocab.word2idx:
            count += 1
            v1 = numpy.asarray(vectors[vocab.word2idx[parts[0]]], dtype=vec.dtype)
            v2 = numpy.asarray(vectors[vocab.word2idx[parts[1]]], dtype=vec.dtype)
            vec += v2 - v1
    if count > 0:
        vec /= count
    return vec


def load_prefix_vectors(prefix_file, vocab1, vocab2, vectors1, vectors2):
    """Load averaged prefix-offset vectors for two languages from one file.

    The file repeats groups of three lines: a prefix name, then the lang-1
    word pairs, then the lang-2 word pairs (pairs formatted ``w1##w2##score``).

    Args:
        prefix_file: path to the three-line-per-prefix file.
        vocab1, vocab2: objects exposing ``word2idx`` for each language.
        vectors1, vectors2: indexables of embedding vectors per language.

    Returns:
        (lang1_prefix_vectors, lang2_prefix_vectors): two dicts mapping
        lowercased prefix name -> numpy offset vector.
    """
    lang1_prefix_vectors = {}
    lang2_prefix_vectors = {}
    name = ""
    with open(prefix_file, 'r') as f:
        for line_index, line in enumerate(f):
            if line_index % 3 == 0:
                name = line.strip().lower()
            elif line_index % 3 == 1:
                tokens = line.strip().lower().split('\t')
                lang1_prefix_vectors[name] = _avg_pair_offset(
                    tokens, vocab1, vectors1, len(vectors1[0]))
            else:
                # NOTE(review): the original splits on arbitrary whitespace here
                # but on tabs for the lang-1 line — preserved as-is; confirm
                # this asymmetry matches the actual file format.
                tokens = line.strip().lower().split()
                lang2_prefix_vectors[name] = _avg_pair_offset(
                    tokens, vocab2, vectors2, len(vectors2[0]))
    return lang1_prefix_vectors, lang2_prefix_vectors
def load_linguistic_vector(langs, file_path):
    """Load per-language linguistic vectors and build aligned pair matrices.

    Reads ``<lang>.linguistic.vec`` (space-separated ``word v1 v2 ...``) for
    each language, then for every ordered pair of distinct languages builds
    two parallel float32 matrices over the words both languages share.

    Args:
        langs: iterable of language codes.
        file_path: directory containing the ``.linguistic.vec`` files.

    Returns:
        dict containing both the per-language word->vector dicts (keyed by
        language code) and, keyed by ``"lang1#lang2"``, tuples of aligned
        (matrix1, matrix2) float32 arrays.
    """
    lang_vectors = {}
    for lang in langs:
        file = os.path.join(file_path, lang + ".linguistic.vec")
        vectors = {}
        with open(file, 'r') as f:
            for line in f:
                parts = line.strip().split(" ")
                vectors[parts[0]] = [float(item) for item in parts[1:]]
        lang_vectors[lang] = vectors
    for i in range(len(langs)):
        for j in range(len(langs)):
            if i == j:
                continue
            vectors_i = lang_vectors[langs[i]]
            vectors_j = lang_vectors[langs[j]]
            # Count shared words; dim ends up as the last entry's length
            # (assumes all vectors share one dimension — TODO confirm).
            num = 0
            dim = 0
            for word in vectors_i:
                dim = len(vectors_i[word])
                if word in vectors_j:
                    num += 1
            # Allocate float32 directly instead of the original
            # zeros().astype("float32", casting="same_kind") round trip.
            matrix1 = numpy.zeros((num, dim), dtype="float32")
            matrix2 = numpy.zeros((num, dim), dtype="float32")
            row = 0
            for word in vectors_i:
                if word in vectors_j:
                    # Whole-row assignment replaces the per-element copy loop.
                    matrix1[row, :] = vectors_i[word]
                    matrix2[row, :] = vectors_j[word]
                    row += 1
            lang_vectors[langs[i] + "#" + langs[j]] = (matrix1, matrix2)
            lang_vectors[langs[j] + "#" + langs[i]] = (matrix2, matrix1)
    return lang_vectors