TensorFlow使用CNN卷积神经网络以及RNN(LSTM、GRU)循环神经网络进行中文文本分类
cnews.train数据源记录:
cnews.vocab词汇表记录:
基于TensorFlow 的CNN-RNN中文文本分类,github代码来源于(https://github.com/gaussic/text-classification-cnn-rnn)
run_rnn代码:
# coding: utf-8
from __future__ import print_function
import os
import sys
import time
from datetime import timedelta
import numpy as np
import tensorflow as tf
from sklearn import metrics
from rnn_model import TRNNConfig, TextRNN
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab
# Locations of the cnews data splits and vocabulary file, all under base_dir.
base_dir = 'data/cnews'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
# Checkpoint location; train() saves here and test() restores from here.
save_dir = 'checkpoints/textrnn'
save_path = os.path.join(save_dir, 'best_validation') # path of the checkpoint with the best validation result
def get_time_dif(start_time):
    """Return the wall-clock time elapsed since ``start_time``.

    Args:
        start_time: A timestamp previously obtained from ``time.time()``.

    Returns:
        A ``datetime.timedelta`` rounded to the nearest whole second.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict that binds one batch to the model's placeholders.

    Relies on the module-level ``model`` created in the ``__main__`` block.

    Args:
        x_batch: Batch of inputs fed to ``model.input_x``.
        y_batch: Batch of labels fed to ``model.input_y``.
        keep_prob: Value fed to ``model.keep_prob`` (1.0 is used for evaluation).

    Returns:
        A dict mapping the three model placeholders to the given values.
    """
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict
def evaluate(sess, x_, y_):
    """Compute the average loss and accuracy of the model over a data set.

    The data is processed in batches of 128 with ``keep_prob`` fed as 1.0.
    Per-batch results are weighted by batch length, so a shorter final
    batch does not skew the averages.

    Args:
        sess: The TensorFlow session used to run the model.
        x_: Inputs of the data set.
        y_: Labels of the data set, aligned with ``x_``.

    Returns:
        A ``(loss, accuracy)`` tuple averaged over all ``len(x_)`` samples.
    """
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        # Weight by batch size: sess.run returns per-batch means.
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len
def _run_training_loop(session, saver, writer, merged_summary, x_train, y_train, x_val, y_val):
    """Run the epoch/batch loop with periodic logging, checkpointing and early stopping."""
    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0  # number of batches processed so far
    best_acc_val = 0.0  # best validation accuracy seen so far
    last_improved = 0  # batch index of the last validation improvement
    require_improvement = 1000  # stop early after this many batches without improvement

    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Periodically write the training scalars to TensorBoard.
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on the current training
                # batch and on the validation set, with dropout disabled.
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                if acc_val > best_acc_val:
                    # Keep only the checkpoint with the best validation accuracy.
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = ('Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'
                       ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}')
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            # The reporting branch above overwrites keep_prob with 1.0 in the
            # shared feed dict; restore the training value so the optimizer
            # step always runs with dropout enabled.
            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # Validation accuracy has not improved for a long time: stop early.
                print("No optimization for a long time, auto-stopping...")
                return


def train():
    """Train the module-level ``model`` and keep the best checkpoint.

    Uses the globals ``config``, ``model``, ``word_to_id`` and ``cat_to_id``
    set up in the ``__main__`` block. Loss and accuracy scalars are written to
    ``tensorboard/textrnn``; the checkpoint with the best validation accuracy
    is saved to ``save_path``. Training stops early when validation accuracy
    has not improved for more than 1000 batches.
    """
    print("Configuring TensorBoard and Saver...")
    # Delete the TensorBoard folder before retraining, otherwise the new
    # curves are drawn over the old ones.
    tensorboard_dir = 'tensorboard/textrnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    try:
        # Configure the Saver.
        saver = tf.train.Saver()
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        print("Loading training and validation data...")
        start_time = time.time()
        x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
        x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
        time_dif = get_time_dif(start_time)
        print("Time usage:", time_dif)

        # The context manager closes the session even if training raises.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            writer.add_graph(session.graph)
            _run_training_loop(session, saver, writer, merged_summary,
                               x_train, y_train, x_val, y_val)
    finally:
        # Flush pending summaries to disk and release the event file.
        writer.close()
def test():
    """Evaluate the best saved checkpoint on the test set and print a report.

    Uses the globals ``config``, ``model``, ``word_to_id``, ``cat_to_id`` and
    ``categories`` set up in the ``__main__`` block. Restores the checkpoint
    at ``save_path``, prints test loss and accuracy, then prints the
    per-class precision/recall/F1 report and the confusion matrix.
    """
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    # The context manager closes the session once all predictions are made.
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        print('Testing...')
        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        batch_size = 128
        data_len = len(x_test)
        num_batch = int((data_len - 1) / batch_size) + 1

        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predicted class ids
        for i in range(num_batch):  # predict batch by batch
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation report.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
# Script entry point: build the model, then run the mode given on the
# command line ("train" or "test").
if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_rnn.py [train / test]""")

    print('Configuring RNN model...')
    config = TRNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if the file does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    # Size the model's vocabulary to the vocabulary actually loaded.
    config.vocab_size = len(words)
    model = TextRNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
https://github.com/duanzhihua/text-classification-cnn-rnn
run_rnn.py代码解读:
1,生成分类列表、分类对应编号。categories, cat_to_id = read_category()
categories: ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
cat_to_id:{'体育': 0, '财经': 1, '房产': 2, '家居': 3, '教育': 4, '科技': 5, '时尚': 6, '时政': 7, '游戏': 8, '娱乐': 9}
2,生成词汇表,词汇对应编号。 words, word_to_id = read_vocab(vocab_dir)
words: ['<PAD>', ...]
word_to_id: {'<PAD>': 0, ...}
词汇表的数据源:cnews.vocab.txt(如果没有数据源文件,也可以使用cnews_loader.py的build_vocab方法从训练集中生成词汇文件,遍历数据集的数据获取单词词汇。注:在训练时句子长度为600,有的句子的长度不够的情况下,需使用PAD补齐,
因此这里加上一个词汇<PAD>)。
初始化模型: model = TextRNN(config)
整体网络结构:输入层--》RNN(lstm、gru+dropout)----》全连接层(dropout+relu)----》输出层:分类器及softmax预测类别
3,三个占位变量 :input_x:shape=(?, 600) input_y:shape=(?, 10) keep_prob:dropout层的保留概率(即神经元被保留的概率,评估和测试时设为1.0)
4,词向量映射:
词向量表:embedding shape=(5000, 64):(5000是词汇表的大小,64是词向量的维度)
输入数据的词向量: embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
embedding_inputs shape=(?, 600, 64):(?输入数据的记录条数,600是句子长度,64是词向量维度)
input_x 是将在embedding词向量表中查询的IDs 。
5,构建多层RNN网络。
rnn_cell创建2层神经网络,RNN核cell可以选择lstm(BasicLSTMCell)或 gru(GRUCell)隐藏层128,每一个RNN核后面加一个dropout层。
计算输出: _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
_outputs:shape=(?, 600, 128)
取最后一个时序输出作为结果:last = _outputs[:, -1, :]
一个句子600个字,每个单词(一个时序)经过循环神经网络处理以后,最后一个单词时序的输出作为这个句子的结果。
6,全连接层,后面接dropout以及relu激活。
fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
fc--》dropout+relu fc: shape=(?, 128)
7,分类器及softmax预测类别
self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
self.logits : shape=(?, 10) 得出这一个句子分别对应于10个分类的分数. ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别,softmax预测类别。
8,损失函数,交叉熵。 由于句子的预测分类可能会判错,因此要计算预测的分类和正确分类的误差,这里使用softmax_cross_entropy_with_logits损失函数
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y) #shape=(?,)
self.loss = tf.reduce_mean(cross_entropy)
9,优化器。权重更新。
self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
10,模型准确率的计算。
correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
模型训练:
1,载入训练集与验证集
process_file函数的数据预处理:
1,读取文件,获取每一句的内容及标签 contents, labels
contents:[['马','晓'..],['商','瑞'..],....]
labels: ['体育'.....]
2,将文件中的每句话从文本转换为数字表示:
data_id:[[387,1197..],[199,964..],....]
label_id:[0,.....]
注意:这里每一句话的长度是不一样的。例如第一句746个字;第二句:1598个字..
3.使用keras提供的pad_sequences来将文本pad为固定长度。
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id)) # 将标签转换为one-hot表示
为了便于TensorFlow RNN模型的处理,将文本的长度固定,不够的句子使用pad_sequences填充0,0对应词汇表中的<PAD>。
max_length为600,超过600个字的句子将被截断。
x_pad:(50000, 600),其中每一个元素是一个列表[0:600],如:[1609,659,56.....],表示600个字的编号。
4. 将标签转换为one-hot表示
y_pad: (50000, 10) 每一个元素是一个数组,对应1的索引表示分类编号,如:[1,0,0,0,0,0,0,0,0,0]
训练集process_file函数预处理: x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
x_train:(50000, 600)
y_train:(50000, 10)
验证集process_file函数预处理:
x_val:(5000, 600)
y_val:(5000, 10)
2. TensorFlow 创建session,将model.loss,model.acc加入tf.summary.scalar,循环遍历每一个Epoch,及遍历每批次的数据。
x_batch: (128, 600)
y_batch: (128, 10)
将训练结果写入tensorboard scalar,输出在训练集和验证集上的性能,保存最好结果。
如果验证集正确率长期不提升,提前结束训练。
如果采用CNN卷积神经网络,run_cnn.py的示意图如下:
run_cnn.py的运行结果如下:
D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master>python runn_cnn.py train
python: can't open file 'runn_cnn.py': [Errno 2] No such file or directory
D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master>python run_cnn.py train
G:\ProgramData\Anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Configuring CNN model...
WARNING:tensorflow:From D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master\cnn_model.py:66: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.
See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:21
2018-12-20 14:34:40.444623: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
Training and evaluating...
Epoch: 1
Iter: 0, Train Loss: 2.3, Train Acc: 9.38%, Val Loss: 2.3, Val Acc: 9.10%, Time: 0:00:12 *
Iter: 100, Train Loss: 0.9, Train Acc: 81.25%, Val Loss: 1.1, Val Acc: 67.50%, Time: 0:01:57 *
Iter: 200, Train Loss: 0.59, Train Acc: 84.38%, Val Loss: 0.64, Val Acc: 80.50%, Time: 0:03:43 *
Iter: 300, Train Loss: 0.34, Train Acc: 92.19%, Val Loss: 0.43, Val Acc: 87.78%, Time: 0:10:49 *
Iter: 400, Train Loss: 0.38, Train Acc: 89.06%, Val Loss: 0.4, Val Acc: 89.38%, Time: 0:12:12 *
Iter: 500, Train Loss: 0.14, Train Acc: 98.44%, Val Loss: 0.33, Val Acc: 91.58%, Time: 0:13:38 *
Iter: 600, Train Loss: 0.11, Train Acc: 96.88%, Val Loss: 0.31, Val Acc: 92.52%, Time: 0:15:06 *
Iter: 700, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.27, Val Acc: 93.08%, Time: 0:16:31 *
Epoch: 2
Iter: 800, Train Loss: 0.13, Train Acc: 96.88%, Val Loss: 0.24, Val Acc: 93.02%, Time: 0:18:03
Iter: 900, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.27, Val Acc: 93.38%, Time: 0:19:38 *
Iter: 1000, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.23, Val Acc: 93.38%, Time: 0:21:17
Iter: 1100, Train Loss: 0.13, Train Acc: 96.88%, Val Loss: 0.24, Val Acc: 94.20%, Time: 0:22:50 *
Iter: 1200, Train Loss: 0.066, Train Acc: 98.44%, Val Loss: 0.24, Val Acc: 93.18%, Time: 0:24:27
Iter: 1300, Train Loss: 0.12, Train Acc: 95.31%, Val Loss: 0.26, Val Acc: 93.06%, Time: 0:26:07
Iter: 1400, Train Loss: 0.15, Train Acc: 96.88%, Val Loss: 0.21, Val Acc: 94.64%, Time: 0:27:40 *
Iter: 1500, Train Loss: 0.057, Train Acc: 98.44%, Val Loss: 0.19, Val Acc: 94.58%, Time: 0:29:13
Epoch: 3
Iter: 1600, Train Loss: 0.033, Train Acc: 98.44%, Val Loss: 0.23, Val Acc: 93.08%, Time: 0:30:54
Iter: 1700, Train Loss: 0.03, Train Acc: 100.00%, Val Loss: 0.18, Val Acc: 94.68%, Time: 0:32:39 *
Iter: 1800, Train Loss: 0.075, Train Acc: 96.88%, Val Loss: 0.2, Val Acc: 93.22%, Time: 0:34:21
Iter: 1900, Train Loss: 0.091, Train Acc: 96.88%, Val Loss: 0.17, Val Acc: 95.36%, Time: 0:36:09 *
Iter: 2000, Train Loss: 0.14, Train Acc: 95.31%, Val Loss: 0.17, Val Acc: 95.32%, Time: 0:38:01
Iter: 2100, Train Loss: 0.059, Train Acc: 98.44%, Val Loss: 0.17, Val Acc: 95.44%, Time: 0:39:41 *
Iter: 2200, Train Loss: 0.079, Train Acc: 96.88%, Val Loss: 0.14, Val Acc: 96.28%, Time: 0:41:12 *
Iter: 2300, Train Loss: 0.092, Train Acc: 96.88%, Val Loss: 0.19, Val Acc: 94.62%, Time: 0:42:43
Epoch: 4
Iter: 2400, Train Loss: 0.011, Train Acc: 100.00%, Val Loss: 0.17, Val Acc: 95.60%, Time: 0:44:14
Iter: 2500, Train Loss: 0.014, Train Acc: 100.00%, Val Loss: 0.21, Val Acc: 94.46%, Time: 0:45:50
Iter: 2600, Train Loss: 0.071, Train Acc: 98.44%, Val Loss: 0.19, Val Acc: 94.72%, Time: 0:47:19
Iter: 2700, Train Loss: 0.02, Train Acc: 98.44%, Val Loss: 0.16, Val Acc: 95.40%, Time: 0:55:56
Iter: 2800, Train Loss: 0.21, Train Acc: 96.88%, Val Loss: 0.23, Val Acc: 93.86%, Time: 0:59:57
Iter: 2900, Train Loss: 0.037, Train Acc: 96.88%, Val Loss: 0.21, Val Acc: 94.30%, Time: 1:01:33
Iter: 3000, Train Loss: 0.04, Train Acc: 96.88%, Val Loss: 0.17, Val Acc: 95.36%, Time: 1:03:09
Iter: 3100, Train Loss: 0.0024, Train Acc: 100.00%, Val Loss: 0.24, Val Acc: 92.74%, Time: 1:04:42
Epoch: 5
Iter: 3200, Train Loss: 0.049, Train Acc: 98.44%, Val Loss: 0.21, Val Acc: 94.08%, Time: 1:06:19
No optimization for a long time, auto-stopping...
查询Tensorflow tensorboard的web页面:
D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master>tensorboard --logdir=D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master\tensorboard\textcnn
TensorBoard 1.10.0 at http://duanzhihua:6006 (Press CTRL+C to quit)
在测试集上进行分类测试,测试准确率为96.64%:
D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master>python run_cnn.py test
G:\ProgramData\Anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
Configuring CNN model...
WARNING:tensorflow:From D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master\cnn_model.py:66: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.
See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
Loading test data...
2018-12-21 10:40:03.297798: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
Testing...
Test Loss: 0.11, Test Acc: 96.64%
Precision, Recall and F1-Score...
precision recall f1-score support
体育 0.99 0.99 0.99 1000
财经 0.95 0.99 0.97 1000
房产 0.99 1.00 0.99 1000
家居 0.97 0.91 0.94 1000
教育 0.91 0.96 0.93 1000
科技 0.97 0.97 0.97 1000
时尚 0.98 0.97 0.97 1000
时政 0.97 0.92 0.94 1000
游戏 0.99 0.96 0.98 1000
娱乐 0.96 0.98 0.97 1000
avg / total 0.97 0.97 0.97 10000
Confusion Matrix...
[[995 0 0 0 4 0 0 0 0 1]
[ 0 991 0 0 3 2 0 4 0 0]
[ 0 0 998 1 1 0 0 0 0 0]
[ 1 15 7 914 23 10 5 18 1 6]
[ 3 6 1 7 957 6 7 5 1 7]
[ 0 6 0 4 3 975 3 2 5 2]
[ 2 1 0 7 9 1 966 0 1 13]
[ 0 26 3 2 37 7 0 919 1 5]
[ 1 2 0 4 8 3 5 1 965 11]
[ 3 1 0 3 5 2 2 0 0 984]]
Time usage: 0:00:52
输入两句话,预测文本是哪个类型:
['三星ST550以全新的拍摄方式超越了以往任何一款数码相机',
'热火vs骑士前瞻:皇帝回乡二番战 东部次席唾手可得新浪体育讯北京时间3月30日7:00']
预测结果如下,预测分类正确:
D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master>python predict.py
G:\ProgramData\Anaconda3\lib\site-packages\h5py\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
WARNING:tensorflow:From D:\PycharmProjects\git_noc_text_classification_2019\text-classification-cnn-rnn-master\cnn_model.py:66: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.
See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
2018-12-21 10:46:54.016809: I T:\src\github\tensorflow\tensorflow\core\platform\cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
科技
体育