【自然语言处理】文本分类模型_BiLSTM+Attention_TensorFlow实现

一、模型结构

1. Embedding层:获得词的分布式表示;

2. BiLSTM层:将词向量依次送入到双向LSTM中并得到每个cell的输出outputs;

3. Attention层:

$h_i$ 表示由BiLSTM产生的包含单词 $w_i$ 上下文信息的隐藏层向量;

通过全连接层将 $h_i$ 转换为 $u_i$: $u_i=\tanh(Wh_i+b)$;

计算 $u_i$ 与上下文向量 $u_w$ 的相似度并通过softmax转换为概率分布: $\alpha_i=\frac{\exp(u_i^Tu_w)}{\sum_{j}\exp(u_j^Tu_w)}$

$\alpha_i$ 可以看做是每个单词对句子的重要程度,因此使用 $\alpha_i$ 作为权重对所有 $h_i$ 加权求和,得到表达句子的向量: $s=\sum_{i}\alpha_{i}h_i$

(注:其中上下文向量 $u_w$ 比较玄,可以看做是一个query“单词对句子的贡献是多少”,而通过计算 $u_i$ 与 $u_w$ 的相似度可以得到每个 $u_i$ 对 $u_w$ 的贡献;其中 $u_w$ 是随机初始化并通过训练获得的)

4. Dropout+全连接层

二、使用TensorFlow实现模型

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import rnn
import math
import datetime
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
Using TensorFlow backend.

参数

max_features = 10000 # vocabulary size: passed to Tokenizer(num_words=...) and used as the model's vocab_size
maxlen = 500 # length every token sequence is padded/truncated to by pad_sequences
embedding_size = 128 # dimensionality of each word embedding
batch_size = 512 # number of samples per batch
hidden_size = 300 # units per LSTM direction (the concatenated BiLSTM output is 2 * hidden_size wide)
num_epochs = 20 # number of passes batch_iter makes over the training data
max_learning_rate = 0.005 # learning rate used at the first training step
min_learning_rate = 0.0001 # value the exponentially decayed learning rate approaches
decay_coefficient = 2.5 # learning-rate decay coefficient; scaled by len(y_train)/batch_size to get the decay speed in steps
dropout_keep_prob = 0.5 # keep probability fed to tf.nn.dropout during training (not the drop rate)
evaluate_every = 100 # run a dev-set evaluation every 100 steps

数据读取

# Load the training and test reviews from tab-separated files whose first row is the header.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are read as literal text.
train = pd.read_csv("../input/labeledTrainData.tsv", header=0,delimiter="\t", quoting=3)
test = pd.read_csv("../input/testData.tsv",header=0,delimiter="\t", quoting=3)

数据处理

# Build the tokenizer (vocabulary capped via num_words=max_features, text lower-cased)
tokenizer = Tokenizer(num_words=max_features,lower=True)
# The vocabulary is fitted on the training and test reviews together
tokenizer.fit_on_texts(list(train['review']) + list(test['review']))
#word_index = tokenizer.word_index
x_train = tokenizer.texts_to_sequences(list(train['review']))
x_train = pad_sequences(x_train,maxlen=maxlen) # pad/truncate every sequence to maxlen
y_train = to_categorical(list(train['sentiment'])) # one-hot encode the labels
x_test = tokenizer.texts_to_sequences(list(test['review']))
x_test = pad_sequences(x_test,maxlen=maxlen) # pad/truncate every sequence to maxlen
# Split into training and validation (dev) sets: 30% held out, fixed random seed
x_train,x_dev,y_train,y_dev = train_test_split(x_train,y_train,test_size=0.3,random_state=0)

构建模型

class TextBiLSTM_Attention(object):
    """BiLSTM + attention text classifier, built as a TensorFlow 1.x graph.

    Pipeline: embedding lookup -> static bidirectional LSTM -> attention
    pooling over the time steps -> dropout -> linear output layer.

    Args:
        sequence_length: number of token ids in each input sequence.
        num_classes: width of the one-hot label vector / number of logits.
        vocab_size: number of rows of the embedding matrix.
        embedding_size: dimensionality of each word embedding.
        hidden_size: units per LSTM direction; the concatenated BiLSTM
            output is ``2 * hidden_size`` wide.
        l2_reg_lambda: weight of the L2 penalty applied to the output
            layer's ``W`` and ``b``.

    Placeholders to feed: ``input_x``, ``input_y``, ``dropout_keep_prob``
    and ``learning_rate``. Tensors to fetch: ``scores``, ``predictions``,
    ``loss`` and ``accuracy``.
    """

    def __init__(self,
                 sequence_length,
                 num_classes,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 l2_reg_lambda=0.0):
        # Placeholders supplied by the caller through feed_dict.
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        # A placeholder (not a constant) so the caller can decay the learning rate per step.
        self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')

        l2_loss = tf.constant(0.0)

        # Embedding layer.
        with tf.name_scope('embedding'):
            self.W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                                 name='W', trainable=True)
            # [batch_size, sequence_length, embedding_size]
            self.embedded_words = tf.nn.embedding_lookup(self.W, self.input_x)

        # Bidirectional LSTM layer.
        with tf.name_scope('biLSTM'):
            lstm_fw_cell = tf.keras.layers.LSTMCell(hidden_size)
            lstm_bw_cell = tf.keras.layers.LSTMCell(hidden_size)
            # static_bidirectional_rnn takes one tensor per time step.
            embedded_words_list = tf.unstack(self.embedded_words, sequence_length, axis=1)
            # Only the per-step outputs are used; the final states are discarded.
            outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell,
                                                         lstm_bw_cell,
                                                         embedded_words_list,
                                                         dtype=tf.float32)
            # Back to a single tensor with the time steps on axis 1.
            self.outputs = tf.stack(outputs, axis=1)

        with tf.name_scope('attention'):
            self.W_attention = tf.get_variable(shape=[hidden_size * 2, hidden_size * 2],
                                               initializer=tf.random_normal_initializer(stddev=0.1),
                                               name='W_attention')
            self.b_attention = tf.get_variable(shape=[hidden_size * 2], name='b_attention')
            # Trainable context vector u_w that every u_i is scored against.
            self.context_vector = tf.get_variable("what_is_the_informative_word",
                                                  shape=[hidden_size * 2],
                                                  initializer=tf.random_normal_initializer(stddev=0.1))
            # Flatten to [batch_size*sequence_length, hidden_size*2] for one shared matmul.
            hidden_state = tf.reshape(self.outputs, [-1, hidden_size * 2])
            # u_i = tanh(W h_i + b)
            hidden_representation = tf.nn.tanh(tf.matmul(hidden_state, self.W_attention) + self.b_attention)
            hidden_representation = tf.reshape(hidden_representation,
                                               shape=[-1, sequence_length, hidden_size * 2])
            # Similarity u_i^T u_w, computed as an element-wise product summed over the feature axis.
            hidden_state_context_similiarity = tf.multiply(hidden_representation, self.context_vector)
            attention_logits = tf.reduce_sum(hidden_state_context_similiarity, axis=2)
            # Subtract the per-row max before the softmax to guard against overflow.
            # `keepdims` replaces the deprecated `keep_dims` argument.
            attention_logits_max = tf.reduce_max(attention_logits, axis=1, keepdims=True)
            p_attention = tf.nn.softmax(attention_logits - attention_logits_max)
            p_attention_expanded = tf.expand_dims(p_attention, axis=2)
            # Attention-weighted sum of the BiLSTM outputs gives the sentence vector.
            sentence_representation = tf.multiply(p_attention_expanded, self.outputs)
            sentence_representation = tf.reduce_sum(sentence_representation, axis=1)

        with tf.name_scope('dropout'):
            # Dropout against over-fitting; the second argument is the keep probability.
            self.rnn_drop = tf.nn.dropout(sentence_representation, self.dropout_keep_prob)

        with tf.name_scope('output'):
            W = tf.get_variable(shape=[hidden_size * 2, num_classes],
                                initializer=tf.contrib.layers.xavier_initializer(),
                                name='W')
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            # Logits: dropout output times W plus b.
            self.scores = tf.nn.xw_plus_b(self.rnn_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        with tf.name_scope('loss'):
            # Per-example softmax cross-entropy.
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.input_y)
            # Mean cross-entropy plus the weighted L2 penalty.
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
# Mini-batch generator
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield consecutive mini-batches of ``data`` for ``num_epochs`` epochs.

    Every batch holds ``batch_size`` items except possibly the last batch of
    an epoch, which holds the remainder. No empty batch is ever yielded, so
    empty ``data`` yields nothing.

    Args:
        data: sequence or array indexed along its first axis.
        batch_size: maximum number of items per batch; must be positive.
        num_epochs: number of full passes over ``data``.
        shuffle: when true, each epoch draws a fresh random permutation of
            ``data`` (converted to a NumPy array so it can be fancy-indexed).

    Yields:
        Slices of ``data`` (or of its shuffled copy) along the first axis.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    data_size = len(data)
    for _ in range(num_epochs):
        if shuffle:
            # np.asarray is a no-op for arrays and lets plain lists be shuffled too.
            epoch_data = np.asarray(data)[np.random.permutation(data_size)]
        else:
            epoch_data = data
        # Stepping by batch_size visits each start offset exactly once, so an
        # exact multiple of batch_size no longer produces an empty final batch.
        for start_index in range(0, data_size, batch_size):
            yield epoch_data[start_index:start_index + batch_size]

模型训练

# Build the graph, train with an exponentially decayed learning rate, evaluate
# on the dev set every `evaluate_every` steps, then predict the test set.
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=True,   # let TF pick another device if the requested one does not exist
        log_device_placement=False)  # do not log device placement
    # NOTE(review): the session is left open after this block, as in the original;
    # call sess.close() once it is no longer needed.
    sess = tf.Session(config=session_conf)
    # Build the model.
    nn = TextBiLSTM_Attention(sequence_length=x_train.shape[1],
                              num_classes=y_train.shape[1],
                              vocab_size=max_features,
                              embedding_size=embedding_size,
                              hidden_size=hidden_size)
    # Counts the optimizer updates applied so far.
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(nn.learning_rate)
    tvars = tf.trainable_variables()
    # Clip the global gradient norm to 5 to guard against exploding gradients.
    grads, _ = tf.clip_by_global_norm(tf.gradients(nn.loss, tvars), 5)
    grads_and_vars = tuple(zip(grads, tvars))
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    sess.run(tf.global_variables_initializer())
    # Inputs and one-hot labels are stacked column-wise so they are shuffled together.
    num_label_columns = y_train.shape[1]
    batches = batch_iter(np.hstack((x_train, y_train)), batch_size, num_epochs)
    decay_speed = decay_coefficient * len(y_train) / batch_size
    counter = 0  # number of training batches processed so far
    for batch in batches:
        if len(batch) == 0:
            # An empty batch would give a NaN mean loss and corrupt the weights.
            continue
        learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-counter / decay_speed)
        counter += 1
        # Split the stacked columns back apart; the label width follows y_train
        # instead of being hard-coded to two classes.
        x_batch, y_batch = batch[:, :-num_label_columns], batch[:, -num_label_columns:]
        # Training step.
        feed_dict = {nn.input_x: x_batch,
                     nn.input_y: y_batch,
                     nn.dropout_keep_prob: dropout_keep_prob,
                     nn.learning_rate: learning_rate}
        _, step, loss, accuracy = sess.run(
            [train_op, global_step, nn.loss, nn.accuracy],
            feed_dict)
        current_step = tf.train.global_step(sess, global_step)
        # Evaluate on the whole dev set, including the final partial batch.
        dev_size = len(y_dev)
        if current_step % evaluate_every == 0 and dev_size > 0:
            print("\nEvaluation:")
            loss_sum = 0.0
            accuracy_sum = 0.0
            for start_index in range(0, dev_size, batch_size):
                end_index = min(start_index + batch_size, dev_size)
                feed_dict = {
                    nn.input_x: x_dev[start_index:end_index],
                    nn.input_y: y_dev[start_index:end_index],
                    nn.dropout_keep_prob: 1.0  # no dropout at evaluation time
                }
                step, dev_loss, dev_accuracy = sess.run(
                    [global_step, nn.loss, nn.accuracy], feed_dict)
                # Weight each batch by its size so a short last batch is not over-counted.
                samples_in_batch = end_index - start_index
                loss_sum += dev_loss * samples_in_batch
                accuracy_sum += dev_accuracy * samples_in_batch
            loss = loss_sum / dev_size
            accuracy = accuracy_sum / dev_size
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            print("")

    # Predict the test set, in order and without dropout.
    all_predictions = []
    test_batches = batch_iter(x_test, batch_size, num_epochs=1, shuffle=False)
    for batch in test_batches:
        if len(batch) == 0:
            continue
        feed_dict = {
            nn.input_x: batch,
            nn.dropout_keep_prob: 1.0
        }
        predictions = sess.run(nn.predictions, feed_dict)
        all_predictions.extend(predictions)
Evaluation:
2019-04-16T07:07:49.449825: step 100, loss 0.410615, acc 0.817801


Evaluation:
2019-04-16T07:10:44.440823: step 200, loss 0.341489, acc 0.860073


Evaluation:
2019-04-16T07:13:39.760141: step 300, loss 0.376103, acc 0.865653


Evaluation:
2019-04-16T07:16:35.310548: step 400, loss 0.425659, acc 0.865513


Evaluation:
2019-04-16T07:19:30.973752: step 500, loss 0.45173, acc 0.863979


Evaluation:
2019-04-16T07:22:26.216372: step 600, loss 0.485102, acc 0.862584


Evaluation:
2019-04-16T07:25:21.299178: step 700, loss 0.531893, acc 0.861607

你可能感兴趣的:(自然语言处理)