Models used in this article
First, define an attention layer that will be used later.
# Custom attention layer
from tensorflow.keras import initializers, constraints, activations, regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class Attention(Layer):
    # Output: not the attention weights themselves, but the vector obtained by
    # weighting each timestep and summing over the time axis.
    # Input: step_dim is the number of RNN timesteps, i.e. the maximum input sequence length.
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)
    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True
    def compute_mask(self, input, input_mask=None):
        return None  # the layers after this one no longer need the mask, so we can simply return None here
    def call(self, x, mask=None):
        features_dim = self.features_dim
        # step_dim is the parameter we pass in; it equals input_shape[1], i.e. the RNN timesteps
        step_dim = self.step_dim
        # Reshape the input and the weight vector and take their dot product; the resulting tensor
        # has shape (batch_size*timesteps, 1). Each sample then has to be normalized separately,
        # so we reshape back with eij = K.reshape(..., (-1, timesteps)):
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)  # RNNs default to tanh; the choice of activation matters little here because a softmax follows
        a = K.exp(eij)
        if mask is not None:
            # Timesteps masked out by earlier layers must not contribute to the output,
            # so their attention weights are set to 0.
            a *= K.cast(mask, K.floatx())  # cast converts the dtype; Keras checks dtypes during computation (e.g. on GPU)
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # Equivalent to K.expand_dims(a, axis=-1): axis defaults to -1 and adds a trailing
        # dimension, e.g. shape (3,) becomes (3, 1).
        a = K.expand_dims(a)
        # Now a.shape = (batch_size, timesteps, 1) and x.shape = (batch_size, timesteps, units)
        weighted_input = x * a
        # weighted_input has shape (batch_size, timesteps, units); each timestep's output vector
        # has been multiplied by that timestep's attention weight.
        # Summing weighted_input over axis=1 returns a tensor of shape (batch_size, units).
        return K.sum(weighted_input, axis=1)
    def compute_output_shape(self, input_shape):
        # the return value is the context vector c, with shape (batch_size, units)
        return input_shape[0], self.features_dim
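
As a quick sanity check, the layer can be dropped after any recurrent layer that returns the full sequence; it collapses the time axis and returns one vector per sample. The snippet below is a minimal sketch (the vocabulary size, sequence length, and layer widths here are illustrative, not values from this article):

# Minimal usage sketch of the custom Attention layer (illustrative values).
from tensorflow.keras.layers import Input, Embedding, LSTM
from tensorflow.keras.models import Model

demo_inputs = Input(shape=(100,))             # 100 = assumed maximum sequence length
x = Embedding(5000, 32)(demo_inputs)          # 5000 = assumed vocabulary size
x = LSTM(64, return_sequences=True)(x)        # keep every timestep so Attention sees a 3D tensor
context = Attention(step_dim=100)(x)          # output shape: (batch_size, 64)
demo_model = Model(demo_inputs, context)
demo_model.summary()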
Commonly used text models
# Layers and models used below, imported from tf.keras:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, Embedding, Dense, Dropout, Flatten,
                                     SimpleRNN, LSTM, GRU, Bidirectional,
                                     Conv1D, MaxPooling1D, Multiply, concatenate)

def build_model(top_words=top_words, max_words=max_words, num_labels=num_labels, mode='LSTM', hidden_dim=[32]):
    if mode == 'RNN':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(SimpleRNN(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'MLP':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'LSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(LSTM(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'GRU':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(GRU(32))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'CNN':  # 1D convolution
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'CNN+LSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Dropout(0.25))
        model.add(Conv1D(filters=32, kernel_size=3, padding="same", activation="relu"))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(64))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation="softmax"))
    elif mode == 'BiLSTM':
        model = Sequential()
        model.add(Embedding(top_words, 32, input_length=max_words))
        model.add(Bidirectional(LSTM(64)))
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(num_labels, activation='softmax'))
    # The networks below are built with the Keras functional API
    elif mode == 'TextCNN':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        ## the word embedding is intended to use pretrained word vectors (hence trainable=False)
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        ## convolution window sizes of 3, 4 and 5
        cnn1 = Conv1D(32, 3, padding='same', strides=1, activation='relu')(layer)
        cnn1 = MaxPooling1D(pool_size=2)(cnn1)
        cnn2 = Conv1D(32, 4, padding='same', strides=1, activation='relu')(layer)
        cnn2 = MaxPooling1D(pool_size=2)(cnn2)
        cnn3 = Conv1D(32, 5, padding='same', strides=1, activation='relu')(layer)
        cnn3 = MaxPooling1D(pool_size=2)(cnn3)
        # concatenate the output vectors of the three branches
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.2)(flat)
        main_output = Dense(num_labels, activation='softmax')(drop)
        model = Model(inputs=inputs, outputs=main_output)
    elif mode == 'Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(64)(attention_mul)  # plain fully connected layer
        fla = Flatten()(mlp)
        output = Dense(num_labels, activation='softmax')(fla)
        model = Model(inputs=[inputs], outputs=output)
    elif mode == 'Attention*3':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(32, activation='relu')(attention_mul)
        attention_probs = Dense(32, activation='softmax', name='attention_vec1')(mlp)
        attention_mul = Multiply()([mlp, attention_probs])
        mlp2 = Dense(32, activation='relu')(attention_mul)
        attention_probs = Dense(32, activation='softmax', name='attention_vec2')(mlp2)
        attention_mul = Multiply()([mlp2, attention_probs])
        mlp3 = Dense(32, activation='relu')(attention_mul)
        fla = Flatten()(mlp3)
        output = Dense(num_labels, activation='softmax')(fla)
        model = Model(inputs=[inputs], outputs=output)
    elif mode == 'BiLSTM+Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        bilstm = Bidirectional(LSTM(64, return_sequences=True))(layer)  # return_sequences=True keeps the output 3-dimensional
        bilstm = Bidirectional(LSTM(64, return_sequences=True))(bilstm)
        layer = Dense(256, activation='relu')(bilstm)
        layer = Dropout(0.2)(layer)
        ## attention mechanism (the custom Attention layer defined above)
        attention = Attention(step_dim=max_words)(layer)
        layer = Dense(128, activation='relu')(attention)
        output = Dense(num_labels, activation='softmax')(layer)
        model = Model(inputs=inputs, outputs=output)
    elif mode == 'BiGRU+Attention':
        inputs = Input(name='inputs', shape=[max_words,], dtype='float64')
        layer = Embedding(top_words, 32, input_length=max_words, trainable=False)(inputs)
        attention_probs = Dense(32, activation='softmax', name='attention_vec')(layer)
        attention_mul = Multiply()([layer, attention_probs])
        mlp = Dense(64, activation='relu')(attention_mul)  # plain fully connected layer
        # bat = BatchNormalization()(mlp)
        # act = Activation('relu')
        gru = Bidirectional(GRU(32))(mlp)
        mlp = Dense(16, activation='relu')(gru)
        output = Dense(num_labels, activation='softmax')(mlp)
        model = Model(inputs=[inputs], outputs=output)
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
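
Calling the function then looks like this. This is only a minimal sketch: top_words, max_words, num_labels and the padded integer sequences X_train / one-hot labels y_train are assumed to have been prepared earlier in the article and are not defined in this section.

# Build, inspect and train one of the models (illustrative only).
model = build_model(mode='BiLSTM+Attention')
model.summary()
history = model.fit(X_train, y_train,          # X_train: padded word-index sequences, y_train: one-hot labels
                    validation_split=0.1,
                    epochs=5,
                    batch_size=64)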