word2vec.py
# -*- coding: UTF-8 -*-
import math

import numpy as np
import tensorflow as tf

from transform_data_w2v import TransformDataW2V


class Word2Vec:
    def __init__(self, output, batch_size=128, skip_window=1, embed_size=100,
                 dict_path='corpus/emr_ner_dict.utf8', num_sampled=64, steps=25000):
        self.output = output              # path prefix for the saved embedding matrix
        self.batch_size = batch_size      # number of (center, context) pairs per step
        self.skip_window = skip_window    # context window size on each side of the center word
        self.embed_size = embed_size      # dimensionality of the word vectors
        self.num_sampled = num_sampled    # negative samples drawn for the NCE loss
        self.steps = steps                # total number of training steps
        self.tran = TransformDataW2V(self.batch_size, self.skip_window, dict_path=dict_path)
        self.vocab_size = len(self.tran.dictionary)
        # Embedding matrix, initialised uniformly in [-1, 1).
        self.embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0))
    def train(self):
        # Placeholders for one batch of center words and their context-word labels.
        train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
        embed = tf.nn.embedding_lookup(self.embeddings, train_inputs)
        # Output weights and biases for the NCE (noise-contrastive estimation) loss.
        nce_weights = tf.Variable(
            tf.truncated_normal([self.vocab_size, self.embed_size],
                                stddev=1.0 / math.sqrt(self.embed_size)))
        nce_biases = tf.Variable(tf.zeros([self.vocab_size]))
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels, inputs=embed,
                           num_sampled=self.num_sampled, num_classes=self.vocab_size))
        optimizer = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
        with tf.Session() as sess:
            tf.global_variables_initializer().run()
            aver_loss = 0
            for step in range(self.steps):
                batch_inputs, batch_labels = self.tran.generate_batch()
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
                _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
                aver_loss += loss_val
                if step % 2000 == 0:
                    if step > 0:
                        aver_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print("Average loss at step ", step, ": ", aver_loss)
                    aver_loss = 0
            # Persist the trained embedding matrix for later use.
            np.save(self.output, self.embeddings.eval())
    def test(self):
        # Nearest-neighbour sanity check for a single word id; these ops are built on
        # the same graph and must be evaluated inside a session that still holds the
        # trained variables (e.g. when called from within train()'s session).
        valid_dataset = [3021]
        # Cosine similarity: L2-normalise the embeddings, then take dot products
        # between the validation word and every word in the vocabulary.
        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keep_dims=True))
        normalized_embeddings = self.embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.abs(tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True))
        sim = similarity.eval()
        print(sim)
        # Sort word ids by similarity, highest first, and show the ten nearest words.
        pair = zip(range(self.vocab_size), sim[0])
        spair = sorted(pair, key=lambda x: x[1], reverse=True)
        print(spair[0:10])


if __name__ == '__main__':
    w2v = Word2Vec('corpus/embed/embeddings', embed_size=100)
    w2v.train()
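

# Usage sketch (an assumption, not part of the original module): train() saves the
# raw embedding matrix with np.save, so it can be reloaded later as a plain NumPy
# array and indexed by word id. The 'dictionary' lookup below is hypothetical and
# would come from TransformDataW2V's vocabulary.
#
#     embeddings = np.load('corpus/embed/embeddings.npy')  # shape: (vocab_size, embed_size)
#     word_id = dictionary['<some word>']                   # id assigned by the corpus dictionary
#     vector = embeddings[word_id]                          # the embed_size-dim vector for that word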