cnews新闻文件夹下载路径:链接:https://pan.baidu.com/s/1H3K94E7JGJdIrGuTKBXvIg 密码:fmdq
# coding: utf-8
read_file(): 读取文件数据;
build_vocab(): 构建词汇表,使用字符级的表示,这一函数会将词汇表存储下来,避免每一次重复处理;
read_vocab(): 读取上一步存储的词汇表,转换为{词:id}表示;
read_category(): 将分类目录固定,转换为{类别: id}表示;
to_words(): 将一条由id表示的数据重新转换为文字;
process_file(): 将数据集从文字转换为固定长度的id序列表示;
batch_iter(): 为神经网络的训练准备经过shuffle的批次的数据。
x_train [50000, 600] y_train [50000, 10]
x_val [5000, 600] y_val [5000, 10]
x_test [10000, 600] y_test [10000, 10]
import sys
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
# True when the interpreter is Python 3 or newer. The helpers below use this
# flag to choose between the Python 2 and Python 3 text/bytes handling paths.
is_py3 = sys.version_info[0] > 2
def native_word(word, encoding='utf-8'):
    """Return *word* in the interpreter's native ``str`` representation.

    Args:
        word: Text to convert.
        encoding: Codec used for encoding under Python 2.

    Returns:
        ``word.encode(encoding)`` when running on Python 2, otherwise *word*
        unchanged.
    """
    if not is_py3:
        return word.encode(encoding)
    return word
def native_content(content):
    """Return *content* as text.

    Args:
        content: A string read from a file or a source literal.

    Returns:
        ``content.decode('utf-8')`` when running on Python 2, otherwise
        *content* unchanged.
    """
    if not is_py3:
        return content.decode('utf-8')
    return content
def open_file(filename, mode='r'):
    """Open *filename* in a way that works on both Python 2 and Python 3.

    Args:
        filename: Path of the file to open.
        mode: 'r' or 'w' for read or write.

    Returns:
        The open file object. On Python 3 it is opened as UTF-8 text with
        undecodable bytes ignored; on Python 2 the plain built-in ``open`` is
        used.
    """
    if is_py3:
        return open(filename, mode, encoding='utf-8', errors='ignore')
    return open(filename, mode)
def read_file(filename):
    """Read a tab-separated ``label<TAB>content`` data file.

    Args:
        filename: Path of the data file, one sample per line.

    Returns:
        A ``(contents, labels)`` tuple of equal-length lists. Each entry of
        ``contents`` is the list of characters of one sample (the vocabulary
        is character-level); each entry of ``labels`` is the label text.
        Lines that do not consist of exactly two tab-separated fields, or
        whose content is empty, are skipped.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Blank or malformed line: skip it instead of failing on
                # the tuple unpacking.
                continue
            label, content = fields
            if content:
                contents.append(list(native_content(content)))
                labels.append(native_content(label))
    return contents, labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character-level vocabulary from the training set and save it.

    The vocabulary file holds one token per line: the padding token first,
    followed by the ``vocab_size - 1`` most frequent characters in
    descending order of frequency.

    Args:
        train_dir: Path of the training data file (read with ``read_file``).
        vocab_dir: Path of the vocabulary file to write.
        vocab_size: Total number of vocabulary entries, padding included.
    """
    data_train, _ = read_file(train_dir)

    # Count characters across every training sample.
    counter = Counter()
    for content in data_train:
        counter.update(content)

    count_pairs = counter.most_common(vocab_size - 1)
    # Reserve index 0 for the padding token used to bring every text to the
    # same length. NOTE(review): the literal was an empty string here and the
    # accompanying comment had a gap where the token name belonged, so
    # '<PAD>' is restored on the assumption that markup stripping removed it.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    # Use a context manager so the handle is flushed and closed.
    with open_file(vocab_dir, mode='w') as f:
        f.write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
    """Load the vocabulary file written by ``build_vocab``.

    Args:
        vocab_dir: Path of the vocabulary file, one token per line.

    Returns:
        A ``(words, word_to_id)`` tuple: the list of tokens in file order and
        a dict mapping each token to its line index.
    """
    with open_file(vocab_dir) as fp:
        # On Python 2 every value is converted to unicode.
        words = [native_content(line.strip()) for line in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
def read_category():
    """Return the fixed category list and its name-to-id mapping.

    Returns:
        A ``(categories, cat_to_id)`` tuple: the ten category names in a
        fixed order and a dict mapping each name to its index in that list.
    """
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    categories = [native_content(x) for x in categories]
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id
def to_words(content, words):
    """Convert a sequence of token ids back into text.

    Args:
        content: Iterable of integer ids.
        words: Sequence of tokens indexed by id.

    Returns:
        The tokens looked up in *words*, concatenated into one string.
    """
    return ''.join(words[x] for x in content)
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into padded id sequences and one-hot labels.

    Args:
        filename: Path of the data file (read with ``read_file``).
        word_to_id: Dict mapping each vocabulary token to its id.
        cat_to_id: Dict mapping each category name to its id.
        max_length: Length every id sequence is padded to by
            ``pad_sequences``.

    Returns:
        An ``(x_pad, y_pad)`` tuple: the padded id sequences and the one-hot
        encoded labels, as produced by the Keras utilities.

    Raises:
        KeyError: If a label in the file is not present in *cat_to_id*.
    """
    contents, labels = read_file(filename)

    data_id, label_id = [], []
    for content, label in zip(contents, labels):
        # Characters missing from the vocabulary are dropped.
        data_id.append([word_to_id[x] for x in content if x in word_to_id])
        label_id.append(cat_to_id[label])

    # Use the Keras pad_sequences helper to bring each text to a fixed length.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
    # Convert the labels to a one-hot representation.
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of ``(x, y)``.

    Both inputs are shuffled with the same random permutation, so the
    pairing between samples and labels is preserved.

    Args:
        x: Samples; an array or anything ``np.asarray`` accepts.
        y: Labels, with the same first dimension as *x*.
        batch_size: Maximum number of samples per batch; the last batch may
            be smaller.

    Yields:
        ``(x_batch, y_batch)`` tuples. Nothing is yielded for empty input.
    """
    # Accept plain lists as well as arrays; fancy indexing needs ndarrays.
    x = np.asarray(x)
    y = np.asarray(y)
    data_len = len(x)

    indices = np.random.permutation(data_len)
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    # Stepping through the range avoids the float-division batch count.
    for start_id in range(0, data_len, batch_size):
        end_id = min(start_id + batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
# coding: utf-8
Embedding, CNN, max pooling, fully connected, fully connected, softmax, category id.
class TCNNConfig(object):
    """Hyper-parameters of the text CNN model and its training loop.

    NOTE(review): a second ``TCNNConfig`` with ``num_filters = 256`` is
    defined further down in this file; if both live in one module the later
    definition shadows this one.
    """

    embedding_dim = 64  # dimension of the character embeddings
    seq_length = 600  # length of each input sequence
    num_classes = 10  # number of categories
    num_filters = 128  # number of convolution filters
    kernel_size = 5  # convolution kernel size
    vocab_size = 5000  # vocabulary size

    hidden_dim = 128  # units in the fully connected layer

    dropout_keep_prob = 0.5  # dropout keep probability
    learning_rate = 1e-3  # learning rate

    batch_size = 64  # samples per training batch
    num_epochs = 10  # total number of epochs

    print_per_batch = 100  # print results every this many batches
    save_per_batch = 10  # write to tensorboard every this many batches
import tensorflow as tf
class TCNNConfig(object):
    """Hyper-parameters of the text CNN model and its training loop."""

    embedding_dim = 64  # dimension of the character embeddings
    seq_length = 600  # length of each input sequence
    num_classes = 10  # number of categories
    num_filters = 256  # number of convolution filters
    kernel_size = 5  # convolution kernel size
    vocab_size = 5000  # vocabulary size

    hidden_dim = 128  # units in the fully connected layer

    dropout_keep_prob = 0.5  # dropout keep probability
    learning_rate = 1e-3  # learning rate

    batch_size = 64  # samples per training batch
    num_epochs = 10  # total number of epochs

    print_per_batch = 100  # print results every this many batches
    save_per_batch = 10  # write to tensorboard every this many batches
class TextCNN(object):
    """CNN text classifier expressed as a TensorFlow 1.x graph.

    Layers, in order: embedding, 1-D convolution, global max pooling, a
    fully connected layer with dropout and ReLU, and a final fully connected
    layer producing one logit per class.

    Attributes set on the instance:
        input_x, input_y, keep_prob: placeholders to feed at run time.
        logits: unnormalized class scores.
        y_pred_cls: predicted class index per sample.
        loss: mean softmax cross-entropy.
        optim: Adam training op minimizing ``loss``.
        acc: mean accuracy of ``y_pred_cls`` against ``input_y``.
    """

    def __init__(self, config):
        """Create the placeholders and build the graph.

        Args:
            config: Object providing ``seq_length``, ``num_classes``,
                ``vocab_size``, ``embedding_dim``, ``num_filters``,
                ``kernel_size``, ``hidden_dim`` and ``learning_rate``.
        """
        self.config = config

        # The three inputs that must be fed.
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        # Build the graph right away: callers read loss/acc/optim directly
        # after construction without calling cnn() themselves.
        self.cnn()

    def cnn(self):
        """Build the CNN graph and attach its output tensors to ``self``."""
        # Embedding lookup.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("cnn"):
            # CNN layer
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling layer
            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')

        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU activation.
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # Classifier.
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # Loss function: cross entropy.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer.
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy.
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
终端运行 python3.5 run_cnn.py train,可以开始训练。
终端运行 python3.5 run_cnn.py test 在测试集上进行测试。
# NOTE(review): machine-specific absolute paths for the dataset folder and
# the checkpoint folder; the same two assignments appear again in the script
# below. Presumably shown here as the values to adapt before running.
base_dir = '/Users/apple/Documents/ST/python/python项目/CNN_RNN_text_classification/cnews'
save_dir = '/Users/apple/Documents/ST/python/python项目/CNN_RNN_text_classification/checkpoints/textcnn'
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import sys
import time
from datetime import timedelta
import numpy as np
import tensorflow as tf
from sklearn import metrics
from cnn_model import TCNNConfig, TextCNN
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab
# Dataset location. The active value is an absolute path on one developer's
# machine; the commented-out relative path is the alternative.
#base_dir = 'data/cnews'
base_dir = '/Users/apple/Documents/ST/python/python项目/CNN_RNN_text_classification/cnews'
# Training, test, validation and vocabulary files inside the dataset folder.
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
# Checkpoint folder (also a machine-specific absolute path).
save_dir = '/Users/apple/Documents/ST/python/python项目/CNN_RNN_text_classification/checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation') # path where the best validation result is saved
def get_time_dif(start_time):
    """Return the time elapsed since *start_time*.

    Args:
        start_time: A timestamp previously obtained from ``time.time()``.

    Returns:
        A ``timedelta`` rounded to the nearest whole second.
    """
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))
def feed_data(x_batch, y_batch, keep_prob):
    """Build the feed dict for one batch.

    Args:
        x_batch: Value fed to ``model.input_x``.
        y_batch: Value fed to ``model.input_y``.
        keep_prob: Value fed to ``model.keep_prob``.

    Returns:
        A dict mapping the three placeholders of the module-level ``model``
        to the given values.
    """
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob,
    }
    return feed_dict
def evaluate(sess, x_, y_):
    """Compute the average loss and accuracy of ``model`` on a dataset.

    The data is processed in batches of 128 with ``keep_prob`` fed as 1.0;
    per-batch results are weighted by batch length so the smaller last
    batch does not skew the averages.

    Args:
        sess: TensorFlow session holding the model variables.
        x_: Input samples.
        y_: Labels matching *x_*.

    Returns:
        A ``(loss, accuracy)`` tuple averaged over all samples.
    """
    data_len = len(x_)
    batch_eval = batch_iter(x_, y_, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len

    return total_loss / data_len, total_acc / data_len
def train():
    """Train the module-level ``model`` with validation-based early stopping.

    Reads the module-level ``config``, ``word_to_id``, ``cat_to_id`` and the
    path constants. Writes loss/accuracy summaries under
    ``tensorboard/textcnn`` every ``config.save_per_batch`` batches, reports
    training and validation metrics every ``config.print_per_batch``
    batches, saves a checkpoint to ``save_path`` whenever validation
    accuracy improves, and stops once more than 1000 batches pass without
    improvement.
    """
    print("Configuring TensorBoard and Saver...")
    # Configure TensorBoard. Delete the tensorboard folder before retraining,
    # otherwise the curves of different runs overlap.
    tensorboard_dir = 'tensorboard/textcnn'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver.
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print("Loading training and validation data...")
    # Load the training and validation sets.
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # Create the session.
    session = tf.Session()
    try:
        # Variables must be initialized before any op that reads them runs.
        session.run(tf.global_variables_initializer())

        print('Training and evaluating...')
        start_time = time.time()
        total_batch = 0  # number of batches processed so far
        best_acc_val = 0.0  # best validation accuracy seen
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop early after this many batches without improvement

        flag = False
        for epoch in range(config.num_epochs):
            print('Epoch:', epoch + 1)
            batch_train = batch_iter(x_train, y_train, config.batch_size)
            for x_batch, y_batch in batch_train:
                feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

                if total_batch % config.save_per_batch == 0:
                    # Periodically write the training summaries to tensorboard.
                    s = session.run(merged_summary, feed_dict=feed_dict)
                    writer.add_summary(s, total_batch)

                if total_batch % config.print_per_batch == 0:
                    # Periodically report performance on the training batch
                    # and the validation set. Evaluate on a copy so the
                    # dropout setting used by the optimizer step below is
                    # left untouched.
                    eval_feed = dict(feed_dict)
                    eval_feed[model.keep_prob] = 1.0
                    loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=eval_feed)
                    loss_val, acc_val = evaluate(session, x_val, y_val)  # todo

                    if acc_val > best_acc_val:
                        # Save the best result.
                        best_acc_val = acc_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=save_path)
                        improved_str = '*'
                    else:
                        improved_str = ''

                    time_dif = get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                          + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

                session.run(model.optim, feed_dict=feed_dict)  # run the optimizer step
                total_batch += 1

                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has not improved for a long time:
                    # stop training early.
                    print("No optimization for a long time, auto-stopping...")
                    flag = True
                    break  # leave the batch loop
            if flag:  # leave the epoch loop as well
                break
    finally:
        # Flush pending summaries and release the session resources.
        writer.close()
        session.close()
def test():
    """Evaluate the saved best checkpoint on the test set and print a report.

    Reads the module-level ``model``, ``config``, ``categories``,
    ``word_to_id``, ``cat_to_id``, ``test_dir`` and ``save_path``. Prints
    the test loss and accuracy, the per-class precision/recall/F1 report,
    the confusion matrix and the elapsed time.
    """
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    try:
        saver = tf.train.Saver()
        saver.restore(sess=session, save_path=save_path)  # load the saved model

        loss_test, acc_test = evaluate(session, x_test, y_test)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        batch_size = 128
        data_len = len(x_test)
        y_test_cls = np.argmax(y_test, 1)
        y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # holds the predictions
        # Predict batch by batch, in order, so predictions line up with
        # y_test_cls.
        for start_id in range(0, data_len, batch_size):
            end_id = min(start_id + batch_size, data_len)
            feed_dict = {
                model.input_x: x_test[start_id:end_id],
                model.keep_prob: 1.0,
            }
            y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
    finally:
        session.close()

    # Evaluation.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
# Script entry point: build the vocabulary if needed, construct the model,
# then dispatch to train() or test() according to the single command-line
# argument.
if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    # Size the embedding table to the vocabulary actually loaded.
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
appledeMBP:CNN_RNN_text_classification apple$ python3.5 run_cnn.py train
Configuring CNN model...
Configuring TensorBoard and Saver...
Loading training and validation data...
Time usage: 0:00:18
2018-11-25 08:52:08.149886: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2018-11-25 08:52:08.149909: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2018-11-25 08:52:08.149925: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2018-11-25 08:52:08.149930: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
Training and evaluating...
Epoch: 1
Iter: 0, Train Loss: 2.3, Train Acc: 7.81%, Val Loss: 2.3, Val Acc: 9.44%, Time: 0:00:08 *
Iter: 100, Train Loss: 0.79, Train Acc: 81.25%, Val Loss: 0.99, Val Acc: 69.72%, Time: 0:01:21 *
Iter: 200, Train Loss: 0.36, Train Acc: 89.06%, Val Loss: 0.65, Val Acc: 81.68%, Time: 0:02:34 *
Iter: 300, Train Loss: 0.34, Train Acc: 90.62%, Val Loss: 0.42, Val Acc: 88.58%, Time: 0:03:44 *
Iter: 400, Train Loss: 0.28, Train Acc: 90.62%, Val Loss: 0.37, Val Acc: 89.48%, Time: 0:04:54 *
Iter: 500, Train Loss: 0.25, Train Acc: 93.75%, Val Loss: 0.3, Val Acc: 92.16%, Time: 0:06:05 *
Iter: 600, Train Loss: 0.33, Train Acc: 89.06%, Val Loss: 0.31, Val Acc: 91.16%, Time: 0:07:17
Iter: 700, Train Loss: 0.087, Train Acc: 96.88%, Val Loss: 0.28, Val Acc: 91.70%, Time: 0:08:30
Epoch: 2
Iter: 800, Train Loss: 0.11, Train Acc: 96.88%, Val Loss: 0.27, Val Acc: 91.68%, Time: 0:09:40
Iter: 900, Train Loss: 0.031, Train Acc: 98.44%, Val Loss: 0.22, Val Acc: 93.68%, Time: 0:10:51 *
Iter: 1000, Train Loss: 0.15, Train Acc: 93.75%, Val Loss: 0.23, Val Acc: 93.64%, Time: 0:12:04
Iter: 1100, Train Loss: 0.2, Train Acc: 95.31%, Val Loss: 0.24, Val Acc: 92.46%, Time: 0:13:15
Iter: 1200, Train Loss: 0.048, Train Acc: 100.00%, Val Loss: 0.19, Val Acc: 95.02%, Time: 0:14:26 *
Iter: 1300, Train Loss: 0.08, Train Acc: 96.88%, Val Loss: 0.2, Val Acc: 94.60%, Time: 0:15:37
Iter: 1400, Train Loss: 0.14, Train Acc: 95.31%, Val Loss: 0.24, Val Acc: 92.78%, Time: 0:16:47
Iter: 1500, Train Loss: 0.11, Train Acc: 96.88%, Val Loss: 0.22, Val Acc: 94.36%, Time: 0:17:57
Epoch: 3
Iter: 1600, Train Loss: 0.049, Train Acc: 98.44%, Val Loss: 0.2, Val Acc: 94.72%, Time: 0:19:07
Iter: 1700, Train Loss: 0.13, Train Acc: 96.88%, Val Loss: 0.23, Val Acc: 92.84%, Time: 0:20:22
Iter: 1800, Train Loss: 0.062, Train Acc: 98.44%, Val Loss: 0.19, Val Acc: 94.98%, Time: 0:21:35
Iter: 1900, Train Loss: 0.031, Train Acc: 100.00%, Val Loss: 0.22, Val Acc: 93.82%, Time: 0:22:48
Iter: 2000, Train Loss: 0.094, Train Acc: 95.31%, Val Loss: 0.24, Val Acc: 93.66%, Time: 0:23:59
Iter: 2100, Train Loss: 0.063, Train Acc: 96.88%, Val Loss: 0.22, Val Acc: 94.10%, Time: 0:25:11
Iter: 2200, Train Loss: 0.049, Train Acc: 98.44%, Val Loss: 0.24, Val Acc: 92.64%, Time: 0:26:21
No optimization for a long time, auto-stopping...
appledeMBP:CNN_RNN_text_classification apple$
运行 python run_cnn.py test
在测试集上的准确率达到了96.04%,且各类的precision, recall和f1-score都超过了0.9。
Configuring CNN model...
Loading test data...
Test Loss: 0.14, Test Acc: 96.04%
Precision, Recall and F1-Score...
precision recall f1-score support
体育 0.99 0.99 0.99 1000
财经 0.96 0.99 0.97 1000
房产 1.00 1.00 1.00 1000
家居 0.95 0.91 0.93 1000
教育 0.95 0.89 0.92 1000
科技 0.94 0.97 0.95 1000
时尚 0.95 0.97 0.96 1000
时政 0.94 0.94 0.94 1000
游戏 0.97 0.96 0.97 1000
娱乐 0.95 0.98 0.97 1000
avg / total 0.96 0.96 0.96 10000
Confusion Matrix...
[[991 0 0 0 2 1 0 4 1 1]
[ 0 992 0 0 2 1 0 5 0 0]
[ 0 1 996 0 1 1 0 0 0 1]
[ 0 14 0 912 7 15 9 29 3 11]
[ 2 9 0 12 892 22 18 21 10 14]
[ 0 0 0 10 1 968 4 3 12 2]
[ 1 0 0 9 4 4 971 0 2 9]
[ 1 16 0 4 18 12 1 941 1 6]
[ 2 4 1 5 4 5 10 1 962 6]
[ 1 0 1 6 4 3 5 0 1 979]]
Time usage: 0:00:05