NLP--Attention,HAN,文本分类

1. 基本的Attention原理。参考翻译任务中的attention。

1.1 我的理解:attention 就是为序列中每个时间步的输出计算一个权重(经过 softmax 归一化),再按这些权重对所有输出加权求和,得到整个序列的表示。看一下下面的代码就明白了。

# Attention over the RNN outputs: score every time step with a small
# tanh layer, softmax the scores across time, and use the resulting
# weights to take a weighted sum of the per-step outputs.
# `_outputs` and `self` come from the enclosing model code (not shown in this snippet).
with tf.name_scope('attention'):
    input_shape = _outputs.shape  # (batch_size, sequence_length, hidden_size)
    sequence_size = input_shape[1].value  # the length of sequences processed in the RNN layer
    hidden_size = input_shape[2].value  # hidden size of the RNN layer
    # Trainable parameters: projection matrix, bias and the context vector
    # that turns each projected step into a scalar score.
    attention_w = tf.Variable(tf.truncated_normal([hidden_size, self.config.attention_size], stddev=0.1), name='attention_w')
    attention_b = tf.Variable(tf.constant(0.1, shape=[self.config.attention_size]), name='attention_b')
    attention_u = tf.Variable(tf.truncated_normal([self.config.attention_size], stddev=0.1), name='attention_u')
    z_list = []
    # One score per time step; the loop adds a separate set of ops to the
    # graph for every step.
    for t in range(sequence_size):
        # u_t: (batch_size, attention_size)
        u_t = tf.tanh(tf.matmul(_outputs[:, t, :], attention_w) + tf.reshape(attention_b, [1, -1]))
        # z_t: (batch_size, 1), the unnormalized score of step t
        z_t = tf.matmul(u_t, tf.reshape(attention_u, [-1, 1]))
        z_list.append(z_t)
    # Concatenate the per-step scores into (batch_size, sequence_size)
    attention_z = tf.concat(z_list, axis=1)
    # Normalize the scores over the time axis (softmax acts on the last axis)
    self.alpha = tf.nn.softmax(attention_z)
    # Reshape the weights to (batch_size, sequence_size, 1) so they broadcast
    # against _outputs, then sum over time -> (batch_size, hidden_size)
    attention_output = tf.reduce_sum(_outputs * tf.reshape(self.alpha, [-1, sequence_size, 1]), 1)

2. HAN的原理(Hierarchical Attention Networks)。

NLP--Attention,HAN,文本分类_第1张图片

整个网络结构包括五个部分:
1)词序列编码器
2)基于词级的注意力层
3)句子编码器
4)基于句子级的注意力层
5)分类
整个网络结构由双向GRU网络和注意力机制组合而成。

3. 利用Attention模型进行文本分类。

这里用的是 TextRNN + Attention。下面先给出训练与预测脚本,再给出模型定义文件(rnn_model.py,包含 TRNNConfig 和 TextRNN)。

# coding: utf-8

from __future__ import print_function

import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from rnn_model import TRNNConfig, TextRNN
from word2vec_model import word2vecConfig
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab,process_file_2,process_file_test,process_file_3,process_file_w2c
import matplotlib.pyplot as plt

# Data locations. NOTE: despite the *_dir names, train_dir, test_dir,
# baseline_dir and vocab_dir are file paths, not directories.
base_dir = 'data/new_data'
train_dir = os.path.join(base_dir, 'new_train_set.csv')
test_dir = os.path.join(base_dir, 'test_set.csv')
baseline_dir = os.path.join(base_dir, 'baseline.csv')
#val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')

# Checkpoint directory and the path prefix handed to the Saver.
save_dir = 'checkpoints/textrnn'
save_path = os.path.join(save_dir, 'best_validation')  # path used when saving/restoring the model


def get_time_dif(start_time):
    """Return the wall-clock time elapsed since ``start_time``.

    Args:
        start_time: a timestamp previously obtained from ``time.time()``.

    Returns:
        A ``timedelta`` rounded to the nearest whole second.
    """
    elapsed_seconds = time.time() - start_time
    return timedelta(seconds=int(round(elapsed_seconds)))


def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict for one batch.

    Maps the module-level ``model``'s ``input_x``, ``input_y`` and
    ``keep_prob`` placeholders to the given batch inputs, batch labels and
    dropout keep probability.
    """
    return {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }


def evaluate(sess, x_, y_):
    """Evaluate the model on a data set.

    The data is processed in batches of 128 with dropout disabled
    (keep probability 1.0).

    Args:
        sess: the session used to run the model.
        x_: input samples; must support ``len``.
        y_: labels matching ``x_``.

    Returns:
        A tuple ``(y_pred_class, loss, acc)``:
        the predicted class ids of every evaluated sample, concatenated in
        the order the batches were produced, plus the sample-weighted mean
        loss and mean accuracy. For empty input an empty prediction array
        and ``0.0`` for both metrics are returned.
        NOTE(review): if ``batch_iter`` shuffles the data, the predictions
        follow that shuffled order -- confirm before aligning them with ``y_``.
    """
    data_len = len(x_)
    if data_len == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return np.zeros(shape=0, dtype=np.int64), 0.0, 0.0

    total_loss = 0.0
    total_acc = 0.0
    batch_predictions = []
    for x_batch, y_batch in batch_iter(x_, y_, 128):
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        y_pred_class, loss, acc = sess.run([model.y_pred_cls, model.loss, model.acc], feed_dict=feed_dict)
        # Keep every batch's predictions instead of only the last batch's.
        batch_predictions.append(y_pred_class)
        # Weight by batch size so a smaller final batch is averaged correctly.
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    if batch_predictions:
        all_predictions = np.concatenate(batch_predictions)
    else:
        all_predictions = np.zeros(shape=0, dtype=np.int64)
    return all_predictions, total_loss / data_len, total_acc / data_len


def train():
    """Train the model, save a checkpoint and then run ``test2()``.

    Uses the module-level ``model`` and ``config``. TensorBoard summaries
    (loss and accuracy) are written for every batch to ``tensorboard/textrnn``.
    No validation set is loaded, so there is no early stopping or
    best-checkpoint selection: the model is saved once, after the last epoch,
    to ``save_path``.
    """
    print("Configuring TensorBoard and Saver...")
    # When retraining, delete the tensorboard folder first; otherwise the new
    # curves are drawn on top of the old ones.
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    start_time = time.time()
    x_train, y_train = process_file_w2c(train_dir, config.seq_length, cat_to_id)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
    x_train = np.array(x_train)
    y_train = np.array(y_train)

    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%}, Time: {3} {4}'
    try:
        # The context manager closes the session (and frees its resources)
        # before test2() opens its own session.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            writer.add_graph(session.graph)

            print('Training and evaluating...')
            start_time = time.time()
            total_batch = 0  # number of batches processed so far
            for epoch in range(config.num_epochs):
                print('Epoch:', epoch + 1)
                for x_batch, y_batch in batch_iter(x_train, y_train, config.batch_size):
                    feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                    # One run per step: the summary, the reported loss/accuracy
                    # and the parameter update all come from the same forward
                    # pass instead of three separate passes.
                    summary, loss_train, acc_train, _ = session.run(
                        [merged_summary, model.loss, model.acc, model.optim],
                        feed_dict=feed_dict)
                    writer.add_summary(summary, total_batch)

                    time_dif = get_time_dif(start_time)
                    print(msg.format(total_batch, loss_train, acc_train, time_dif, ''))
                    total_batch += 1

            # Save the model once training has finished.
            saver.save(sess=session, save_path=save_path)
    finally:
        # Flush pending summaries to disk even if training fails.
        writer.close()

    test2()

def test2():
    """Predict classes for the test file and write them to ``baseline_dir``.

    Restores the checkpoint at ``save_path``, runs the module-level ``model``
    over the test data in batches of 128 with dropout disabled, and writes a
    CSV with the header ``id,class``. Each row holds the sample index and the
    predicted class id plus one. No labels are loaded here, so no metrics are
    computed.
    """
    print("Loading test data...")
    start_time = time.time()
    # NOTE(review): training encodes its input with process_file_w2c, while
    # this path uses process_file_test with word_to_id -- confirm that both
    # produce the same id space.
    x_test = process_file_test(test_dir, word_to_id, config.seq_length)

    batch_size = 128
    data_len = len(x_test)
    y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # predicted class ids

    # The context manager guarantees the session is closed afterwards.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        # Stepping by batch_size yields no batch at all for empty input.
        for start_id in range(0, data_len, batch_size):
            end_id = min(start_id + batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # `with` closes the file even if a write fails.
    with open(baseline_dir, 'w') as fid0:
        fid0.write("id,class" + "\n")
        for i, item in enumerate(y_pred_cls):
            fid0.write(str(i) + "," + str(item + 1) + "\n")

    # The headers are still printed for output compatibility; the reports
    # themselves need labels, which are not available in this function.
    print("Precision, Recall and F1-Score...")
    print("Confusion Matrix...")

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


def test():
    """Evaluate the restored checkpoint on labeled data and print metrics.

    Restores the checkpoint at ``save_path``, prints the loss and accuracy
    from ``evaluate``, then predicts every sample in order (batches of 128,
    dropout disabled) and prints a classification report and a confusion
    matrix.
    """
    print("Loading test data...")
    start_time = time.time()
    # NOTE(review): this evaluates on train_dir, i.e. the training file --
    # confirm that is intended. Also confirm process_file_2's signature; this
    # call passes no word_to_id argument.
    x_test, y_test = process_file_2(train_dir, cat_to_id, config.seq_length)

    batch_size = 128
    data_len = len(x_test)

    # The context manager guarantees the session is closed afterwards.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        print('Testing...')
        # Only the aggregate metrics are used; predictions are recomputed
        # below in the original sample order.
        _, loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # predicted class ids
        # Stepping by batch_size yields no batch at all for empty input.
        for start_id in range(0, data_len, batch_size):
            end_id = min(start_id + batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Per-class precision / recall / F1
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    # Run mode: optional first command-line argument, one of
    # 'train' (default, as before), 'test2' or 'test'.
    commands = {'train': train, 'test2': test2, 'test': test}
    option = sys.argv[1] if len(sys.argv) > 1 else 'train'
    if option not in commands:
        # Fail before the (expensive) model construction below.
        sys.exit("usage: python {0} [train|test2|test]".format(sys.argv[0]))

    print('Configuring RNN model...')
    config = TRNNConfig()
    w2vconfig = word2vecConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary file if it is missing
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    # Use the actual vocabulary size for the embedding table.
    config.vocab_size = len(words)
    model = TextRNN(config)

    commands[option]()
    # Sequence-length histogram, kept for reference:
    # x_train, y_train,lenXtrain = process_file_3(train_dir, word_to_id, cat_to_id, config.seq_length)
    # print(sum(lenXtrain)/len(lenXtrain))
    # plt.hist(lenXtrain, (int(sum(lenXtrain)/len(lenXtrain))))
    # plt.xlabel('Sequence Length')
    # plt.ylabel('Frequency')
    # plt.axis([0, 2500, 0, 20000])
    # plt.show()
#!/usr/bin/python
# -*- coding: utf-8 -*-

import tensorflow as tf

class TRNNConfig(object):
    """Hyper-parameters of the attention RNN text classifier."""

    # ---- model ----
    # word-embedding dimension
    embedding_dim = 200
    # length every input sequence is fixed to
    seq_length = 720
    # number of target classes
    num_classes = 19
    # vocabulary size
    vocab_size = 5000

    # ---- recurrent layers ----
    # number of stacked RNN layers
    num_layers = 2
    # units per RNN layer
    hidden_dim = 128
    # cell type: 'lstm' or 'gru'
    rnn = 'gru'

    # ---- attention ----
    # size of the attention projection
    attention_size = 100

    # ---- optimisation ----
    # dropout keep probability
    dropout_keep_prob = 0.5
    # learning rate
    learning_rate = 1e-3
    # samples per training batch
    batch_size = 128
    # number of passes over the training data
    num_epochs = 10

    # ---- reporting ----
    # print results every this many batches
    print_per_batch = 100
    # write to tensorboard every this many batches
    save_per_batch = 10

class TextRNN(object):
    """Text classifier: embedding -> multi-layer RNN -> attention -> dense layers.

    Building an instance adds the whole model to the default graph and
    exposes these attributes:
        input_x, input_y, keep_prob: input placeholders
        alpha: attention weights, one per time step
        logits, y_pred_cls: class scores and predicted class ids
        loss, optim, acc: mean cross-entropy, Adam training op, batch accuracy
    """

    def __init__(self, config):
        """Create the input placeholders and build the graph.

        Args:
            config: object providing the hyper-parameters read below
                (seq_length, num_classes, vocab_size, embedding_dim,
                hidden_dim, num_layers, rnn, attention_size, learning_rate).
        """
        self.config = config

        # The three inputs fed at run time: token ids, one-hot labels and
        # the dropout keep probability.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.rnn()

    def rnn(self):
        """Build the model graph and set the output attributes on ``self``."""

        def lstm_cell():   # LSTM cell
            return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

        def gru_cell():  # GRU cell
            return tf.contrib.rnn.GRUCell(self.config.hidden_dim)

        def dropout():  # one RNN cell with dropout applied to its output
            if (self.config.rnn == 'lstm'):
                cell = lstm_cell()
            else:
                cell = gru_cell()
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        # Embedding lookup
        with tf.device('/gpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("rnn"):
            # Stacked multi-layer RNN
            cells = [dropout() for _ in range(self.config.num_layers)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            # _outputs: (batch_size, sequence_length, hidden_size); all time
            # steps are kept because the attention layer weighs every step.
            _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)

        with tf.name_scope('attention'):
            input_shape = _outputs.shape  # (batch_size, sequence_length, hidden_size)
            sequence_size = input_shape[1].value  # the length of sequences processed in the RNN layer
            hidden_size = input_shape[2].value  # hidden size of the RNN layer
            attention_w = tf.Variable(tf.truncated_normal([hidden_size, self.config.attention_size], stddev=0.1), name='attention_w')
            attention_b = tf.Variable(tf.constant(0.1, shape=[self.config.attention_size]), name='attention_b')
            attention_u = tf.Variable(tf.truncated_normal([self.config.attention_size], stddev=0.1), name='attention_u')
            # Score all time steps at once: fold time into the batch axis so a
            # single matmul replaces one set of ops per time step.
            # (batch_size * sequence_size, hidden_size)
            flat_outputs = tf.reshape(_outputs, [-1, hidden_size])
            # (batch_size * sequence_size, attention_size)
            u = tf.tanh(tf.matmul(flat_outputs, attention_w) + tf.reshape(attention_b, [1, -1]))
            # (batch_size * sequence_size, 1): unnormalized score per step
            z = tf.matmul(u, tf.reshape(attention_u, [-1, 1]))
            # Back to batch_size * sequence_size
            attention_z = tf.reshape(z, [-1, sequence_size])
            # Normalize the scores over the time axis
            self.alpha = tf.nn.softmax(attention_z)
            # Reshape the weights to batch_size * sequence_size * 1 so they
            # broadcast against _outputs, then sum over time.
            attention_output = tf.reduce_sum(_outputs * tf.reshape(self.alpha, [-1, sequence_size, 1]), 1)

        with tf.name_scope("score"):
            # Fully connected layer followed by dropout and ReLU.
            # NOTE(review): both batch-norm layers are built with
            # training=False, i.e. they always normalize with their moving
            # statistics, and nothing in this class updates those statistics
            # -- confirm this is intended.
            batch_last = tf.layers.batch_normalization(attention_output, training=False)
            fc = tf.layers.dense(batch_last, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            # Classifier
            batch_fc = tf.layers.batch_normalization(fc, training=False)
            self.logits = tf.layers.dense(batch_fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class id

        with tf.name_scope("optimize"):
            # Loss: softmax cross-entropy averaged over the batch
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Fraction of samples whose predicted class matches the label
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

 

你可能感兴趣的:(NLP)