BiLSTM + attention:以 BiLSTM 的输出 output_fw+output_bw 作为输入,先计算每个词对句子的贡献(注意力权重),然后以加权求和的方式对各时间步的表示进行融合。计算词对句子的贡献时,将 BiLSTM 的输出经过一个 matmul 操作(与一个可训练的权重向量相乘),之后进行 softmax 即可得到,具体可以看代码实现部分。
有几个问题,需要思考一下:
(1)BiLSTM 的输出output_fw+output_bw如果以concat的方式效果如何?
实验发现,效果和output_fw+output_bw无明显差异。
基于tensorflow的实现代码
#!/usr/bin/python
# coding=utf8
import os
import numpy as np
from datetime import datetime
import tensorflow as tf
from sklearn import metrics
from nlp_utils import *
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
class BiLSTM():
    """
    Text classifier: BiLSTM + attention (TensorFlow 1.x graph mode).

    The forward and backward LSTM outputs are fused by element-wise sum,
    a trainable attention vector scores every time step, and the
    attention-weighted sum of the sequence feeds a dense softmax layer.
    """

    def __init__(self, num_classes, max_seq_len, pretrained, embedding_pretrained,
                 vocab_size, embedding_dim, epochs, learning_rate):
        """
        :param num_classes: number of target classes
        :param max_seq_len: fixed sequence length (inputs padded/truncated to this)
        :param pretrained: whether to initialize the embedding table from
            `embedding_pretrained`
        :param embedding_pretrained: pretrained embedding matrix; expected shape
            (vocab_size, embedding_dim) — only used when `pretrained` is truthy
        :param vocab_size: vocabulary size
        :param embedding_dim: embedding dimension (also used as the LSTM hidden size)
        :param epochs: number of training epochs
        :param learning_rate: Adam learning rate
        """
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        # BUG FIX: the original hard-coded `self.pretrained = None`, silently
        # discarding the constructor argument, so pretrained embeddings could
        # never be enabled.
        self.pretrained = pretrained
        self.embedding_pretrained = embedding_pretrained
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.input_x = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.l2_loss = tf.constant(0.0)
        # NOTE(review): this regularizer is created but never attached to any
        # variable/layer, so tf.losses.get_regularization_loss() stays 0 —
        # confirm whether regularization was intended.
        self.regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        self.inference()

    def inference(self):
        """Build the graph: embedding -> BiLSTM -> attention -> dense -> loss/acc."""
        # Word-embedding lookup.
        with tf.name_scope("embedding"):
            if self.pretrained and self.embedding_pretrained is not None:
                # BUG FIX: the original called `embedding.assign(...)` without
                # ever running the returned op in a session, so the pretrained
                # weights were never actually loaded. Initializing the variable
                # from the pretrained matrix does it correctly.
                embedding = tf.get_variable(
                    "embedding",
                    initializer=tf.constant(self.embedding_pretrained, dtype=tf.float32))
            else:
                embedding = tf.get_variable("embedding", [self.vocab_size, self.embedding_dim])
            # (batch_size, max_seq_len, embedding_dim)
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
            embedding_inputs = tf.nn.dropout(embedding_inputs, self.keep_prob)
        # Bidirectional LSTM encoder.
        with tf.name_scope("lstm"):
            lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                                             lstm_bw_cell,
                                                                             embedding_inputs,
                                                                             dtype=tf.float32,
                                                                             time_major=False,
                                                                             scope=None)
        # Additive attention over the fused directions.
        with tf.name_scope("attention"):
            # Fuse forward/backward outputs by element-wise sum
            # (experiments showed no clear difference vs. concatenation).
            out = output_fw + output_bw  # (batch_size, max_seq_len, embedding_dim)
            # Trainable attention weight vector, shape (embedding_dim,).
            W = tf.Variable(tf.random_normal([self.embedding_dim], stddev=0.1))
            # Non-linear transform of the BiLSTM output before scoring.
            out_h = tf.tanh(out)
            # Per-token score: (batch_size * max_seq_len, 1)
            alpha = tf.matmul(tf.reshape(out_h, [-1, self.embedding_dim]), tf.reshape(W, [-1, 1]))
            # Normalize scores over the sequence: (batch_size, max_seq_len)
            alpha = tf.nn.softmax(tf.reshape(alpha, [-1, self.max_seq_len]))
            # Attention-weighted sum of the sequence, done as one batched matmul.
            att_out = tf.matmul(tf.transpose(out, [0, 2, 1]), tf.reshape(alpha, [-1, self.max_seq_len, 1]))
            att_out = tf.tanh(tf.squeeze(att_out, [2]))  # (batch_size, embedding_dim)
        with tf.name_scope("dropout"):
            attention_drop = tf.nn.dropout(att_out, self.keep_prob)
        # Classification head.
        with tf.name_scope("score"):
            self.logits = tf.layers.dense(attention_drop, self.num_classes, name='fc2')  # (batch_size, num_classes)
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")
        with tf.name_scope("loss"):
            # Softmax cross-entropy against one-hot labels.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            l2_loss = tf.losses.get_regularization_loss()
            self.loss = tf.reduce_mean(cross_entropy, name="loss")
            self.loss += l2_loss
            self.optim = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        with tf.name_scope("accuracy"):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="acc")

    def batch_iter(self, x, y, batch_size=32, shuffle=True):
        """
        Yield mini-batches of (x, y).

        :param x: feature array/sequence
        :param y: label array/sequence, aligned with `x`
        :param batch_size: size of each batch (last batch may be smaller)
        :param shuffle: shuffle the data once before batching; requires `x`
            and `y` to support numpy fancy indexing
        :return: generator of (x_batch, y_batch) tuples
        """
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_len))
            x_shuffle = x[shuffle_indices]
            y_shuffle = y[shuffle_indices]
        else:
            x_shuffle = x
            y_shuffle = y
        for i in range(num_batch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, data_len)
            yield (x_shuffle[start_index:end_index], y_shuffle[start_index:end_index])

    def evaluate(self, sess, x_, y_):
        """
        Compute loss and accuracy on validation data, weighted by batch size.

        :param sess: live tf.Session with initialized variables
        :param x_: validation features
        :param y_: validation one-hot labels
        :return: (mean_loss, mean_accuracy)
        """
        data_len = len(x_)
        batch_eval = self.batch_iter(x_, y_, 64)
        total_loss = 0.0
        total_acc = 0.0
        for x_batch, y_batch in batch_eval:
            batch_len = len(x_batch)
            # keep_prob=1: no dropout at evaluation time.
            feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                         self.keep_prob: 1}
            loss, acc = sess.run([self.loss, self.acc], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len
        return total_loss / data_len, total_acc / data_len

    def fit(self, train_x, train_y, val_x, val_y, batch_size, keep_prob):
        """
        Train the model; every 1000 steps evaluate on the validation set and
        checkpoint to ./model/lstm whenever validation accuracy improves.

        :param train_x: training features
        :param train_y: training one-hot labels
        :param val_x: validation features
        :param val_y: validation one-hot labels
        :param batch_size: training batch size
        :param keep_prob: dropout keep probability used during training
        """
        train_steps = 0
        best_acc_val = 0.0  # best validation accuracy seen so far
        saver = tf.train.Saver(max_to_keep=10)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in range(self.epochs):
                batch_train = self.batch_iter(train_x, train_y, batch_size)
                for x_batch, y_batch in batch_train:
                    train_steps += 1
                    feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                                 self.keep_prob: keep_prob}
                    _, train_loss, train_acc = sess.run([self.optim, self.loss,
                                                         self.acc], feed_dict=feed_dict)
                    if train_steps % 1000 == 0:
                        val_loss, val_acc = self.evaluate(sess, val_x, val_y)
                        if val_acc > best_acc_val:
                            # Checkpoint the best model.
                            best_acc_val = val_acc
                            saver.save(sess, "./model/lstm", global_step=train_steps)
                        msg = 'epoch:%d/%d,train_steps:%d,train_loss:%.4f,train_acc:%.4f,val_loss:%.4f,val_acc:%.4f'
                        print(msg % (epoch+1, self.epochs, train_steps, train_loss, train_acc, val_loss, val_acc))
if __name__ == "__main__":
train_file = "./train.txt"
val_file = "./val.txt"
data_set_train = load_data2(train_file) # 加载数据
data_set_val = load_data2(val_file) # 加载数据
word2id_dict, label2id_dict = build_dict(data_set_train) # 构建dict
save_dict(word2id_dict, "word2id_dict.txt")
save_dict(label2id_dict, "label2id_dict.txt")
batch_size = 64
max_seq_len = 64
num_classes = len(label2id_dict)
vocab_size = len(word2id_dict)
embedding_dim = 128
learning_rate = 0.001
epochs = 10
keep_prob = 0.5
train_x, train_y = convert_corpus_to_id_with_padding(data_set_train, word2id_dict, label2id_dict, max_seq_len, num_classes)
val_x, val_y = convert_corpus_to_id_with_padding(data_set_val, word2id_dict, label2id_dict, max_seq_len, num_classes)
lstm_model = BiLSTM(num_classes, max_seq_len, False, None, vocab_size, embedding_dim, epochs, learning_rate)
lstm_model.fit(train_x, train_y, val_x, val_y, batch_size, keep_prob)