"""Data-loading helpers: tensor conversion, nearest-neighbour lists,
cross-lingual word clusters, prefix vectors, and linguistic embeddings."""

import os
import random

import numpy
import torch
from torch.autograd import Variable


def convert2tensor(x):
    """Convert a list (or numpy array) of floats into a FloatTensor."""
    x = torch.FloatTensor(x)
    return x


def to_var(x, volatile=False):
    """Move a tensor to the GPU when available and wrap it in a Variable."""
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)


def load_top_k(file):
    """Load a file of "query<TAB>word1 word2 ..." lines into a dict mapping
    each query word to the list of its nearest words."""
    top_k = {}
    with open(file, 'r') as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                query = parts[0].strip()
                nearest_words = parts[1].strip().split(" ")
                top_k[query] = list(nearest_words)
    return top_k


def load_word_clusters(langs, function_file_path, file_suffix, vocabs):
    """Read each language's function file (alternating lines: a function name,
    then its tab-separated member words), map members to vocabulary indices,
    and align the functions shared by every ordered language pair under the
    key "<lang1>-<lang2>"."""
    mappings = {}
    lang_functions = {}
    for lang in langs:
        functions = {}
        word_2_idx = vocabs[lang].word2idx
        file_path = os.path.join(function_file_path, lang + file_suffix)
        line_index = 0
        function_name = ""
        with open(file_path, 'r') as f:
            for line in f:
                if line_index % 2 == 0:
                    # even lines: function name
                    function_name = line.strip().lower()
                else:
                    # odd lines: tab-separated member words
                    parts = line.strip().lower().split('\t')
                    idx = [word_2_idx[part] for part in parts if part in word_2_idx]
                    functions[function_name] = idx
                line_index += 1
        lang_functions[lang] = functions

    for i in range(len(langs)):
        lang1 = langs[i]
        for j in range(len(langs)):
            lang2 = langs[j]
            if lang1 == lang2:
                continue
            lang1_functions = lang_functions[lang1]
            lang2_functions = lang_functions[lang2]
            dict1 = []
            dict2 = []
            for func in lang1_functions:
                if func in lang2_functions:
                    dict1.append(lang1_functions[func])
                    dict2.append(lang2_functions[func])
            mappings[lang1 + "-" + lang2] = (dict1, dict2)
    return mappings


def load_functions(file, vocab, vectors):
    """Read a function file (alternating name / member-word lines) and return
    the average embedding of each function's in-vocabulary members."""
    function_vectors = {}
    line_index = 0
    function_name = ""
    with open(file, 'r') as f:
        for line in f:
            if line_index % 2 == 0:
                function_name = line.strip().lower()
            else:
                parts = line.strip().lower().split('\t')
                vec = numpy.zeros(len(vectors[0]))
                count = 0
                for part in parts:
                    if part in vocab.word2idx:
                        tmp_vec = vectors[vocab.word2idx[part]]
                        count += 1.0
                        for i in range(len(tmp_vec)):
                            vec[i] += tmp_vec[i]
                if count > 0:
                    vec = vec / count
                function_vectors[function_name] = vec
            line_index += 1
    return function_vectors


def load_prefix_vectors(prefix_file, vocab1, vocab2, vectors1, vectors2):
    """Read a prefix file laid out in blocks of three lines: the prefix name,
    the lang1 word pairs, and the lang2 word pairs.  Each pair is encoded as
    "word1##word2##score" and contributes the offset (word2's vector minus
    word1's vector) when its score exceeds 0.2 and both words are in the
    vocabulary; returns the per-prefix average offsets for both languages."""
    lang1_prefix_vectors = {}
    lang2_prefix_vectors = {}
    line_index = 0
    name = ""
    with open(prefix_file, 'r') as f:
        for line in f:
            if line_index % 3 == 0:
                name = line.strip().lower()
            elif line_index % 3 == 1:
                lang1_vec = numpy.zeros(len(vectors1[0]))
                count = 0
                low = line.strip().lower().split('\t')
                for part in low:
                    parts = part.split("##")
                    score = float(parts[2])
                    if score > 0.2 and parts[0] in vocab1.word2idx and parts[1] in vocab1.word2idx:
                        count += 1.0
                        vec_tmp1 = vectors1[vocab1.word2idx[parts[0]]]
                        vec_tmp2 = vectors1[vocab1.word2idx[parts[1]]]
                        for t in range(len(vec_tmp1)):
                            lang1_vec[t] += vec_tmp2[t] - vec_tmp1[t]
                if count > 0:
                    lang1_vec = lang1_vec / count
                lang1_prefix_vectors[name] = lang1_vec
            else:
                lang2_vec = numpy.zeros(len(vectors2[0]))
                count = 0
                high = line.strip().lower().split()
                for part in high:
                    parts = part.split("##")
                    score = float(parts[2])
                    if score > 0.2 and parts[0] in vocab2.word2idx and parts[1] in vocab2.word2idx:
                        count += 1.0
                        vec_tmp1 = vectors2[vocab2.word2idx[parts[0]]]
                        vec_tmp2 = vectors2[vocab2.word2idx[parts[1]]]
                        for t in range(len(vec_tmp1)):
                            lang2_vec[t] += vec_tmp2[t] - vec_tmp1[t]
                if count > 0:
                    lang2_vec = lang2_vec / count
                lang2_prefix_vectors[name] = lang2_vec
            line_index += 1
    return lang1_prefix_vectors, lang2_prefix_vectors


def load_linguistic_vector(langs, file_path):
    """Load each language's "<lang>.linguistic.vec" file (one word followed by
    its space-separated values per line).  For every ordered language pair,
    also build two row-aligned float32 matrices over the shared words and
    store them under the key "<lang1>#<lang2>"."""
    lang_vectors = {}
    for lang in langs:
        file = os.path.join(file_path, lang + ".linguistic.vec")
        vectors = {}
        with open(file, 'r') as f:
            for line in f:
                parts = line.strip().split(" ")
                word = parts[0]
                vectors[word] = [float(item) for item in parts[1:]]
        lang_vectors[lang] = vectors

    for i in range(len(langs)):
        for j in range(len(langs)):
            if i == j:
                continue
            vectors_i = lang_vectors[langs[i]]
            vectors_j = lang_vectors[langs[j]]
            # count shared words and record the vector dimensionality
            num = 0
            dim = 0
            for f in vectors_i:
                dim = len(vectors_i[f])
                if f in vectors_j:
                    num += 1
            matrix1 = numpy.zeros((num, dim), dtype="float32")
            matrix2 = numpy.zeros((num, dim), dtype="float32")
            index = 0
            for f in vectors_i:
                if f in vectors_j:
                    matrix1[index] = vectors_i[f]
                    matrix2[index] = vectors_j[f]
                    index += 1
            lang_vectors[langs[i] + "#" + langs[j]] = (matrix1, matrix2)
            lang_vectors[langs[j] + "#" + langs[i]] = (matrix2, matrix1)
    return lang_vectors
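

# Example usage -- a minimal sketch.  The tensor helpers run as-is; the
# loaders additionally need the corresponding data files plus vocab objects
# exposing a `word2idx` dict, so the paths and language codes shown here are
# illustrative placeholders only.
#
#   vocabs = {"en": en_vocab, "de": de_vocab}
#   clusters = load_word_clusters(["en", "de"], "data/functions", ".txt", vocabs)
#   ling = load_linguistic_vector(["en", "de"], "data/linguistic")
#   en_de_matrices = ling["en#de"]   # row-aligned (matrix1, matrix2)
if __name__ == "__main__":
    batch = convert2tensor([[0.1, 0.2], [0.3, 0.4]])
    var = to_var(batch)
    print(var.size())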