From 5446ce816702854727e13adb3a7be489044ec954 Mon Sep 17 00:00:00 2001
From: zepingyu0512 <39480215+zepingyu0512@users.noreply.github.com>
Date: Mon, 21 May 2018 16:09:55 +0800
Subject: [PATCH] Create SRNN.py

---
 SRNN.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 SRNN.py

diff --git a/SRNN.py b/SRNN.py
new file mode 100644
index 0000000..e890a07
--- /dev/null
+++ b/SRNN.py
@@ -0,0 +1,170 @@
+'''
+author: Zeping Yu
+Sliced Recurrent Neural Network (SRNN).
+SRNN runs much faster than a standard RNN by slicing each sequence into many subsequences.
+This work was accepted by COLING 2018.
+The code is written in Keras with the TensorFlow backend. SRNN(8,2) is implemented here, using the Yelp 2013 dataset.
+'''
+
+import pandas as pd
+import numpy as np
+
+from keras.utils.np_utils import to_categorical
+from keras.preprocessing.text import Tokenizer, text_to_word_sequence
+from keras.preprocessing.sequence import pad_sequences
+from keras.models import Model
+from keras.layers import Input, Embedding, GRU, TimeDistributed, Dense
+
+#load data
+df = pd.read_csv("yelp_2013.csv")
+df = df.sample(5000)
+
+Y = df.stars.values - 1
+Y = to_categorical(Y, num_classes=5)
+X = df.text.values
+
+#set hyperparameters
+MAX_NUM_WORDS = 30000
+EMBEDDING_DIM = 200
+VALIDATION_SPLIT = 0.1
+TEST_SPLIT = 0.1
+NUM_FILTERS = 50
+MAX_LEN = 512
+Batch_size = 100
+EPOCHS = 10
+
+#shuffle the data
+indices = np.arange(X.shape[0])
+np.random.seed(2018)
+np.random.shuffle(indices)
+X = X[indices]
+Y = Y[indices]
+
+#split into training, validation and testing sets
+nb_validation_samples_val = int((VALIDATION_SPLIT + TEST_SPLIT) * X.shape[0])
+nb_validation_samples_test = int(TEST_SPLIT * X.shape[0])
+
+x_train = X[:-nb_validation_samples_val]
+y_train = Y[:-nb_validation_samples_val]
+x_val = X[-nb_validation_samples_val:-nb_validation_samples_test]
+y_val = Y[-nb_validation_samples_val:-nb_validation_samples_test]
+x_test = X[-nb_validation_samples_test:]
+y_test = Y[-nb_validation_samples_test:]
+
+#use the tokenizer to build the vocabulary
+tokenizer1 = Tokenizer(num_words=MAX_NUM_WORDS)
+tokenizer1.fit_on_texts(df.text)
+vocab = tokenizer1.word_index
+
+x_train_word_ids = tokenizer1.texts_to_sequences(x_train)
+x_test_word_ids = tokenizer1.texts_to_sequences(x_test)
+x_val_word_ids = tokenizer1.texts_to_sequences(x_val)
+
+#pad sequences to the same length
+x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=MAX_LEN)
+x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=MAX_LEN)
+x_val_padded_seqs = pad_sequences(x_val_word_ids, maxlen=MAX_LEN)
+
+#slice each padded sequence (length 512) into 8 subsequences of length 64, then each of those into 8 subsequences of length 8 - the SRNN(8,2) slicing
+x_test_padded_seqs_split = []
+for i in range(x_test_padded_seqs.shape[0]):
+    split1 = np.split(x_test_padded_seqs[i], 8)
+    a = []
+    for j in range(8):
+        s = np.split(split1[j], 8)
+        a.append(s)
+    x_test_padded_seqs_split.append(a)
+
+x_val_padded_seqs_split = []
+for i in range(x_val_padded_seqs.shape[0]):
+    split1 = np.split(x_val_padded_seqs[i], 8)
+    a = []
+    for j in range(8):
+        s = np.split(split1[j], 8)
+        a.append(s)
+    x_val_padded_seqs_split.append(a)
+
+x_train_padded_seqs_split = []
+for i in range(x_train_padded_seqs.shape[0]):
+    split1 = np.split(x_train_padded_seqs[i], 8)
+    a = []
+    for j in range(8):
+        s = np.split(split1[j], 8)
+        a.append(s)
+    x_train_padded_seqs_split.append(a)
+
+#load pre-trained GloVe word embeddings
+print("Using GloVe embeddings")
+glove_path = 'glove.6B.200d.txt'
+embeddings_index = {}
+f = open(glove_path)
+for line in f:
+    values = line.split()
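+    # each line of the GloVe file is a word followed by its embedding values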
+    word = values[0]
+    coefs = np.asarray(values[1:], dtype='float32')
+    embeddings_index[word] = coefs
+f.close()
+print('Found %s word vectors.' % len(embeddings_index))
+
+#use pre-trained GloVe word embeddings to initialize the embedding layer
+embedding_matrix = np.random.random((MAX_NUM_WORDS + 1, EMBEDDING_DIM))
+for word, i in vocab.items():
+    if i