BiLSTM + Attention for Text Classification in Practice

BiLSTM + attention takes the element-wise sum of the BiLSTM outputs, output_fw + output_bw, as input. It first computes each word's contribution to the sentence, then fuses the per-word representations as a weighted sum. The contribution scores are obtained by passing the BiLSTM output through a matmul with a trainable weight vector, followed by a softmax; see the code below for details.
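In equation form, with H the BiLSTM output for one sentence (shape max_seq_len x hidden_dim) and w a trainable vector, the attention weights are alpha = softmax(tanh(H)·w) and the sentence vector is c = tanh(H^T·alpha). A minimal NumPy sketch of just this pooling step (shapes only, separate from the training code below):

import numpy as np

def attention_pool(H, w):
    """H: (max_seq_len, hidden_dim) BiLSTM outputs; w: (hidden_dim,) trainable vector."""
    scores = np.tanh(H) @ w                        # (max_seq_len,)
    alpha = np.exp(scores) / np.exp(scores).sum()  # softmax over time steps
    return np.tanh(H.T @ alpha)                    # (hidden_dim,) sentence vector

H = np.random.randn(64, 128)       # max_seq_len=64, embedding_dim=128
w = np.random.randn(128)
print(attention_pool(H, w).shape)  # (128,)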

A few questions worth thinking about:

(1) What if the BiLSTM outputs output_fw and output_bw are combined by concatenation instead of element-wise addition?

Experiments show no clear difference from the element-wise sum output_fw + output_bw; a sketch of the concat variant follows.
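If you want to try it, the change is local to the attention block: concatenate along the feature axis and widen the attention vector to match. A sketch of the replacement lines, using the same variable names as the full code below:

out = tf.concat([output_fw, output_bw], axis=-1)  # batch_size * max_seq_len * (2*embedding_dim)
att_dim = 2 * self.embedding_dim                  # attention vector must match the new width
W = tf.Variable(tf.random_normal([att_dim], stddev=0.1))
out_h = tf.tanh(out)
alpha = tf.matmul(tf.reshape(out_h, [-1, att_dim]), tf.reshape(W, [-1, 1]))
alpha = tf.nn.softmax(tf.reshape(alpha, [-1, self.max_seq_len]))

The downstream weighted sum and dense layer need no changes, since tf.layers.dense infers its input width.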


Implementation in TensorFlow

#!/usr/bin/python
# coding=utf8

import os
import numpy as np
from datetime import datetime
import tensorflow as tf
from sklearn import metrics

from nlp_utils import *

#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

class BiLSTM():
    """
    文本分类,LSTM模型
    """
    def __init__(self, num_classes, max_seq_len, pretrained, embedding_pretrained, vocab_size, embedding_dim, epochs, learning_rate):
        """
        :param config:
        """
        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.pretrained = pretrained
        self.embedding_pretrained = embedding_pretrained
        self.epochs = epochs
        self.learning_rate = learning_rate

        self.input_x = tf.placeholder(tf.int32, [None, self.max_seq_len], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        self.inference()

    def inference(self):
        """
        :return:
        """
        # word-embedding lookup
        with tf.name_scope("embedding"):
            if self.pretrained:  # initialize from the pretrained word embedding matrix
                embedding = tf.get_variable("embedding",
                                            initializer=tf.constant(self.embedding_pretrained, dtype=tf.float32))
            else:
                embedding = tf.get_variable("embedding", [self.vocab_size, self.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x) # batch_size * max_seq_len * embedding_dim
            embedding_inputs = tf.nn.dropout(embedding_inputs, self.keep_prob)

        with tf.name_scope("lstm"):
            lstm_fw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_dim, forget_bias=1.0, state_is_tuple=True)
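            # the LSTM hidden size is set to embedding_dim so that output_fw and
            # output_bw have matching shapes for the element-wise sum below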
            (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                                            lstm_bw_cell,
                                                                            embedding_inputs,
                                                                            dtype=tf.float32,
                                                                            time_major=False,
                                                                            scope=None)
        with tf.name_scope("attention"):
            out = output_fw + output_bw # batch_size * max_seq_len * embedding_dim

            # trainable attention weight vector
            W = tf.Variable(tf.random_normal([self.embedding_dim], stddev=0.1)) # (embedding_dim,)
            # nonlinear transform of the BiLSTM output
            out_h = tf.tanh(out)
            alpha = tf.matmul(tf.reshape(out_h, [-1, self.embedding_dim]), tf.reshape(W, [-1, 1])) # (batch_size*max_seq_len, 1)
            alpha = tf.nn.softmax(tf.reshape(alpha, [-1, self.max_seq_len])) # (batch_size, max_seq_len)
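            # note: padded positions also receive attention weight here; for
            # variable-length inputs the scores could be masked (large negative
            # value at pad positions) before the softmax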
        
            # weighted sum over time steps using alpha, done as a single batched matmul
            att_out = tf.matmul(tf.transpose(out, [0, 2, 1]), tf.reshape(alpha, [-1, self.max_seq_len, 1]))
            att_out = tf.tanh(tf.squeeze(att_out, [2]))
        
        with tf.name_scope("dropout"):
            attention_drop = tf.nn.dropout(att_out, self.keep_prob)
        
        with tf.name_scope("score"):
            # classify
            self.logits = tf.layers.dense(attention_drop, self.num_classes,
                                          kernel_regularizer=self.regularizer, name='fc2') # batch_size * num_classes
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")

        with tf.name_scope("loss"):
            # cross-entropy loss
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)

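            # collects the L2 penalty attached via kernel_regularizer on the dense layer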
            l2_loss = tf.losses.get_regularization_loss()
            self.loss = tf.reduce_mean(cross_entropy, name="loss")
            self.loss += l2_loss

            # optim
            self.optim = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32), name="acc")


    def batch_iter(self, x, y, batch_size=32, shuffle=True):
        """
        生成batch数据
        :param x: 训练集特征变量
        :param y: 训练集标签
        :param batch_size: 每个batch的大小
        :param shuffle: 是否在每个epoch时打乱数据
        :return:
        """
        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1
 
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_len))
            x_shuffle = x[shuffle_indices]
            y_shuffle = y[shuffle_indices]
        else:
            x_shuffle = x
            y_shuffle = y
        for i in range(num_batch):
            start_index = i * batch_size
            end_index = min((i + 1) * batch_size, data_len)
            yield (x_shuffle[start_index:end_index], y_shuffle[start_index:end_index])

    def evaluate(self, sess, x_, y_):
        """
        评估 val data 的准确率和损失
        """
        data_len = len(x_)
        batch_eval = self.batch_iter(x_, y_, 64, shuffle=False)
        total_loss = 0.0
        total_acc = 0.0
        for x_batch, y_batch in batch_eval:
            batch_len = len(x_batch)
            feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                        self.keep_prob: 1}
            loss, acc = sess.run([self.loss, self.acc], feed_dict=feed_dict)
            total_loss += loss * batch_len
            total_acc += acc * batch_len

        return total_loss / data_len, total_acc / data_len

    def fit(self, train_x, train_y, val_x, val_y, batch_size, keep_prob):
        """
        训练过程
        """
        train_steps = 0
        best_acc_val = 0.0  # best validation accuracy so far
        
        if not os.path.exists("./model"):  # saver.save fails if the target directory is missing
            os.makedirs("./model")
        saver = tf.train.Saver(max_to_keep=10)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer()) # initialize variables
            for epoch in range(self.epochs):
                batch_train = self.batch_iter(train_x, train_y, batch_size)
                for x_batch, y_batch in batch_train:
                    train_steps += 1
                    feed_dict = {self.input_x: x_batch, self.input_y: y_batch,
                                self.keep_prob: keep_prob}
                    _, train_loss, train_acc = sess.run([self.optim, self.loss,
                                                        self.acc], feed_dict=feed_dict)
                    if train_steps % 1000 == 0:
                        val_loss, val_acc = self.evaluate(sess, val_x, val_y)
                        if val_acc > best_acc_val:
                            # save the best model so far
                            best_acc_val = val_acc
                            saver.save(sess, "./model/lstm", global_step=train_steps)
                    
                        msg = 'epoch:%d/%d,train_steps:%d,train_loss:%.4f,train_acc:%.4f,val_loss:%.4f,val_acc:%.4f'
                        print(msg % (epoch+1, self.epochs, train_steps, train_loss, train_acc, val_loss, val_acc))

if __name__ == "__main__":
    train_file = "./train.txt"
    val_file = "./val.txt"

    data_set_train = load_data2(train_file) # load data
    data_set_val = load_data2(val_file) # load data

    word2id_dict, label2id_dict = build_dict(data_set_train) # build word/label dicts
    save_dict(word2id_dict, "word2id_dict.txt")
    save_dict(label2id_dict, "label2id_dict.txt")
    
    batch_size = 64
    max_seq_len = 64
    num_classes = len(label2id_dict)
    vocab_size = len(word2id_dict)
    embedding_dim = 128
    learning_rate = 0.001
    epochs = 10
    keep_prob = 0.5

    train_x, train_y = convert_corpus_to_id_with_padding(data_set_train, word2id_dict, label2id_dict, max_seq_len, num_classes)
    val_x, val_y = convert_corpus_to_id_with_padding(data_set_val, word2id_dict, label2id_dict, max_seq_len, num_classes)
    
    lstm_model = BiLSTM(num_classes, max_seq_len, False, None, vocab_size, embedding_dim, epochs, learning_rate)
    lstm_model.fit(train_x, train_y, val_x, val_y, batch_size, keep_prob)
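
After training, the best checkpoint can be restored for prediction. A minimal sketch, assuming the same hyperparameters as training; texts_to_ids is a hypothetical helper that would reuse the same word2id mapping and padding as convert_corpus_to_id_with_padding:

tf.reset_default_graph()
model = BiLSTM(num_classes, max_seq_len, False, None, vocab_size, embedding_dim, epochs, learning_rate)
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint("./model"))
    x = texts_to_ids(["some input text"], word2id_dict, max_seq_len)  # hypothetical helper
    pred = sess.run(model.y_pred_cls, feed_dict={model.input_x: x, model.keep_prob: 1.0})
    print(pred)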

