-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmodel_zoo_for_bidaf.py
150 lines (130 loc) · 6.86 KB
/
model_zoo_for_bidaf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import Input, InputLayer
from keras.layers.core import Dense, RepeatVector, Masking, Dropout, Lambda
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers import recurrent, Dense, Input, Dropout, TimeDistributed, Flatten, concatenate, Embedding, Merge, merge, multiply, add
from keras.layers.recurrent import GRU
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from utils.layers.core_layer import highway_network_wrapper, Linear_layer, softmax_layer, tile_layer, MaskedDense
from utils.layers.attention_layer4BiDAF import FasterBidirectionAttention, softselAttention, FasterSelf_Attention4BiDAF
from utils.layers.conv_layer import build_convs, multi_conv_withlayers
def BIDAF_Model(vocab_size, char_vocab_size, embedding_matrix, cfg):
    """Build the BiDAF-style answer-span model as a Keras functional ``Model``.

    Args:
        vocab_size: Word vocabulary size, used as ``input_dim`` of the word
            embedding layer.
        char_vocab_size: Character vocabulary size, used as ``input_dim`` of
            the character embedding layer.
        embedding_matrix: Pre-trained word embedding weights. Its last axis
            gives the embedding dimension; the weights are loaded into a
            non-trainable embedding layer.
        cfg: Config object exposing ``get`` / ``getint`` / ``getfloat`` /
            ``getboolean`` (``configparser``-style). Every option is read from
            the ``'Hyper-parameters'`` section.

    Returns:
        A Keras ``Model`` with inputs
        ``[question_input, question_input_char, context_input,
        context_input_char]`` and outputs ``[answer_start, answer_end]``
        (the two softmax layers).
    """
    # Init parameters
    MAX_CHAR_SIZE = cfg.getint('Hyper-parameters', 'MAX_CHAR_SIZE')
    CHAR_EMBEDDING_DIM = cfg.getint('Hyper-parameters', 'CHAR_EMBEDDING_DIM')
    CHAR_EMBED_METHOD = cfg.get('Hyper-parameters', 'CHAR_EMBED_METHOD')
    HIDDEN_SIZE = cfg.getint('Hyper-parameters', 'HIDDEN_SIZE')
    ENCODER_LAYERS = cfg.getint('Hyper-parameters', 'ENCODER_LAYERS')
    RNN_Cell = cfg.get('Hyper-parameters', 'RNN_Cell')
    DP = cfg.getfloat('Hyper-parameters', 'DP')
    # UNROLL and MLP_LAYER are not used below; they are still read so that a
    # config missing these options keeps failing in the same place as before.
    UNROLL = cfg.getboolean('Hyper-parameters', 'UNROLL')
    MLP_LAYER = cfg.getint('Hyper-parameters', 'MLP_LAYER')
    ATT_TYPE = cfg.get('Hyper-parameters', 'ATT_TYPE')
    use_char_embed = cfg.getboolean('Hyper-parameters', 'CHAR_EMBED')
    use_highway = cfg.getboolean('Hyper-parameters', 'USE_HIGHWAY')
    use_selfatt = cfg.getboolean('Hyper-parameters', 'USE_SELFATT')
    share_sm = cfg.getboolean('Hyper-parameters', 'SHARE_SM')

    # Any value other than 'GRU' (including 'LSTM') selects the LSTM cell.
    RNN = recurrent.GRU if RNN_Cell == 'GRU' else recurrent.LSTM

    def bi_rnn():
        """Return a new sequence-returning bidirectional RNN layer."""
        return Bidirectional(
            RNN(HIDDEN_SIZE, return_sequences=True, dropout=DP))

    # Model inputs: word ids plus per-word character ids.
    question_input = Input(shape=(None,), dtype='int32', name='question_input')
    context_input = Input(shape=(None,), dtype='int32', name='context_input')
    question_char = Input(shape=(None, MAX_CHAR_SIZE,),
                          dtype='int32', name='question_input_char')
    context_char = Input(shape=(None, MAX_CHAR_SIZE,),
                         dtype='int32', name='context_input_char')

    # Word embedding: one frozen layer shared by question and context.
    EMBEDDING_DIM = embedding_matrix.shape[-1]
    layer_emb = Embedding(output_dim=EMBEDDING_DIM, input_dim=vocab_size,
                          weights=[embedding_matrix],
                          trainable=False, mask_zero=True)
    questionEmbd = layer_emb(question_input)
    contextEmbd = layer_emb(context_input)

    if use_char_embed:
        # Trainable character embedding, created without mask_zero.
        embedding_char = Embedding(
            output_dim=CHAR_EMBEDDING_DIM, input_dim=char_vocab_size,
            trainable=True, name='char_emb')
        if CHAR_EMBED_METHOD == 'RNN':
            # Encode each word's characters with a bidirectional GRU,
            # applied per word through TimeDistributed.
            char_embedding_layer = TimeDistributed(Sequential([
                InputLayer(input_shape=(MAX_CHAR_SIZE,), dtype='int32'),
                embedding_char,
                Bidirectional(GRU(units=HIDDEN_SIZE))
            ]))
            context_charEmbed = char_embedding_layer(context_char)
            question_charEmbed = char_embedding_layer(question_char)
        else:
            # Any other method: shared n-gram convolutions over characters.
            context_charEmbed = embedding_char(context_char)
            question_charEmbed = embedding_char(question_char)
            convs, NGRAM_FILTERS = build_convs(
                NGRAM_FILTERS=[1, 2, 3, 4], NUM_FILTER=HIDDEN_SIZE,
                name='char_cnn')
            # Original author's shape note for both outputs: [batch, len, 4*H].
            context_charEmbed = multi_conv_withlayers(
                context_charEmbed, convs, NGRAM_FILTERS, name='char_context')
            question_charEmbed = multi_conv_withlayers(
                question_charEmbed, convs, NGRAM_FILTERS, name='char_query')
        context = concatenate([context_charEmbed, contextEmbd])
        question = concatenate([question_charEmbed, questionEmbd])
    else:
        # Without character embeddings, fall back to word embeddings only.
        # Previously this path left `context`/`question` unbound and raised
        # NameError.
        # NOTE(review): the char inputs stay in the model's input list below
        # so the feed signature is unchanged - confirm the Keras version in
        # use accepts inputs that are not connected to the outputs.
        context = contextEmbd
        question = questionEmbd

    if use_highway:
        # Question and context have the same feature dimension, so a single
        # highway wrapper is built once and reused for both.
        cur_dim = context.get_shape().as_list()[-1]
        hn = highway_network_wrapper(cur_dim, 3)
        context = hn.build(context)
        question = hn.build(question)

    # Contextual embedding
    if share_sm:
        # The same encoder layers (shared weights) process both sequences.
        shared_encoders = [bi_rnn() for _ in range(ENCODER_LAYERS)]
        for encoder in shared_encoders:
            context = encoder(context)
            question = encoder(question)
    else:
        # Separate encoder stacks for context and question.
        for _ in range(ENCODER_LAYERS):
            context = bi_rnn()(context)
            question = bi_rnn()(question)
    context = Dropout(rate=DP, name='uP')(context)
    question = Dropout(rate=DP, name='uQ')(question)

    # Bidirectional attention between context and question.
    BiAttenlayer = FasterBidirectionAttention(
        linear_fun=ATT_TYPE, is_q2c_att=True)
    P0 = BiAttenlayer([context, question])

    if use_selfatt:
        # Self-attention block with a residual connection (lP0 + projection).
        lP0 = MaskedDense(2 * HIDDEN_SIZE, activation='relu')(P0)
        pre_selfatt = bi_rnn()(lP0)
        pre_selfatt = Dropout(rate=DP, name='pre_selfatt_drop')(pre_selfatt)
        after_selfatt = FasterSelf_Attention4BiDAF(
            linear_fun=ATT_TYPE)(pre_selfatt)
        lafter_selfatt = MaskedDense(
            2 * HIDDEN_SIZE, activation='relu')(after_selfatt)
        g0 = add([lP0, lafter_selfatt])
    else:
        g0 = bi_rnn()(P0)
    g1 = bi_rnn()(g0)

    # Tensor concatenated alongside the modelling outputs before each
    # prediction head: g0 with self-attention, otherwise the attention
    # output P0.
    skip = g0 if use_selfatt else P0

    # Start-position logits.
    D1 = concatenate([g1, skip])
    D1 = Dropout(DP)(D1)
    logits = Linear_layer(1, use_squeeze=True)(D1)

    # For end inference: a summary of g1 weighted by the start logits
    # (original author's shape note: [B, d]), tiled via tile_layer(1, g1).
    ali = softselAttention()([g1, logits])
    ali = tile_layer(1, g1)(ali)
    g2 = bi_rnn()(concatenate([skip, g1, ali, multiply([g1, ali])]))

    # End-position logits.
    D2 = concatenate([g2, skip])
    D2 = Dropout(DP)(D2)
    logits2 = Linear_layer(1, use_squeeze=True)(D2)

    yp = softmax_layer(axis=-1, name='answer_start')(logits)
    yp2 = softmax_layer(axis=-1, name='answer_end')(logits2)

    inputs = [question_input, question_char, context_input, context_char]
    outputs = [yp, yp2]
    # Keras 2 names these arguments `inputs`/`outputs`; the Keras 1 spelling
    # `input`/`output` is deprecated and rejected by later releases.
    model = Model(inputs=inputs, outputs=outputs)
    return model