Note: the training data required by the code is shared via a download link and extraction code.
Environment: Python 3.6, tensorflow-gpu 1.12, Windows 10, PyCharm.
**1.** This post builds on the paper *Character-level Convolutional Networks for Text Classification* and studies a reference implementation from GitHub (GitHub link).
**2.** The code does not yet use a word-embedding model. Instead, it simply counts the 5,000 most frequent characters across all texts, maps each character of an article to its id in that vocabulary, and uses the resulting id sequence as the input vector for that text (a short illustration follows these notes).
**Note:** run the scripts from a terminal:
(1) First run `python run_cnn.py train`.
(2) Then run `python run_cnn.py test`.
(3) The project consists of three files: cnn_model.py (builds the CNN model), cnews_loader.py (data loading), and run_cnn.py (the entry script).
(4) Create the data, Checkpoint, and tensorboard folders in the same directory as these files.
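As a small illustration of the character-id representation described in note 2, the snippet below builds a toy vocabulary and converts one made-up string into an id sequence (the strings are invented examples, not the cnews data):

```python
from collections import Counter

# Toy corpus: one document per string (made-up examples, not the real dataset).
texts = ['体育新闻一则', '财经新闻一则']

# Count character frequencies over the whole corpus and keep the most frequent ones.
counter = Counter(ch for text in texts for ch in text)
vocab = [''] + [ch for ch, _ in counter.most_common(5000 - 1)]  # id 0 is reserved for padding
word_to_id = {ch: i for i, ch in enumerate(vocab)}

# A document becomes the sequence of ids of its characters;
# this is what later gets padded to a fixed length of 600.
doc_ids = [word_to_id[ch] for ch in texts[0] if ch in word_to_id]
print(doc_ids)
```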
1. cnn_model.py
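cnn_model.py defines the TCNNConfig hyperparameter class and the TextCNN model that run_cnn.py imports. The file itself is not reproduced in this post, so the sketch below only illustrates the interface the other scripts rely on: the attribute names (input_x, input_y, keep_prob, loss, acc, optim, y_pred_cls) and the TCNNConfig fields come from run_cnn.py, while the network layout and hyperparameter values are assumptions loosely modeled on the referenced GitHub project.

```python
# Minimal sketch of cnn_model.py (not the original file); hyperparameter values are assumptions.
import tensorflow as tf


class TCNNConfig(object):
    """CNN hyperparameters; run_cnn.py reads these fields, the values here are assumed."""
    embedding_dim = 64        # character embedding size
    seq_length = 600          # padded text length, matches max_length in cnews_loader.py
    num_classes = 10          # ten news categories
    num_filters = 256         # number of convolution filters
    kernel_size = 5           # convolution kernel width
    vocab_size = 5000         # overwritten in run_cnn.py with the real vocabulary size
    hidden_dim = 128          # fully-connected layer size
    dropout_keep_prob = 0.5   # keep probability used during training
    learning_rate = 1e-3
    batch_size = 64
    num_epochs = 10
    print_per_batch = 100     # report train/val metrics every this many batches
    save_per_batch = 10       # write TensorBoard summaries every this many batches


class TextCNN(object):
    """Character-level CNN exposing the tensors used by run_cnn.py."""

    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32, [None, config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def cnn(self):
        # Embedding layer: map character ids to dense vectors.
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        # Convolution over the character sequence followed by global max pooling.
        with tf.name_scope('cnn'):
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters,
                                    self.config.kernel_size, name='conv')
            gmp = tf.reduce_max(conv, axis=1, name='gmp')

        # Fully-connected classifier with dropout.
        with tf.name_scope('score'):
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.nn.relu(tf.nn.dropout(fc, keep_prob=self.keep_prob))
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

        # Cross-entropy loss and Adam optimizer.
        with tf.name_scope('optimize'):
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            self.optim = tf.train.AdamOptimizer(self.config.learning_rate).minimize(self.loss)

        # Accuracy against the one-hot labels.
        with tf.name_scope('accuracy'):
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
```

Any file that exposes these attribute names will work with run_cnn.py below; the exact layer sizes mainly affect accuracy and training time.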
2. cnews_loader.py
```python
import numpy as np
import tensorflow as tf
from collections import Counter


def read_file(filename):
    """Read the data file and split each text into individual characters."""
    contents, labels = [], []
    with open(filename, mode='r', encoding='utf-8', errors='ignore') as fp:
        for line in fp:
            try:
                label, content = line.strip().split('\t')
                if content:
                    # Store every character of the text as a separate string.
                    contents.append(list(content))
                    labels.append(label)
            except:
                pass
    return contents, labels


def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build the vocabulary from the training set and save it to disk."""
    data_train, _ = read_file(train_dir)

    # Collect the characters of every text into a single list so they can be counted together.
    all_data = []
    for content in data_train:
        all_data.extend(content)

    # Count how often each character occurs and keep the top (vocab_size - 1).
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # Prepend an empty string so that id 0 is reserved for padding texts to the same length.
    words = [''] + list(words)
    with open(vocab_dir, mode='w', encoding='utf-8') as fp:
        fp.write('\n'.join(words) + '\n')


def read_vocab(vocab_dir):
    """Read the vocabulary file and build the character-to-id mapping."""
    with open(vocab_dir, mode='r', encoding='utf-8', errors='ignore') as fp:
        words = [word.strip() for word in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def read_category():
    """Return the fixed list of categories and the category-to-id mapping."""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def to_words(content, words):
    """Convert a sequence of ids back into text."""
    return ''.join(words[x] for x in content)


def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a data file into padded id sequences and one-hot labels."""
    contents, labels = read_file(filename)
    data_id, label_id = [], []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # Sequences shorter than max_length are zero-padded at the beginning by default.
    x_pad = tf.keras.preprocessing.sequence.pad_sequences(data_id, max_length)
    y_pad = tf.keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad


def batch_iter(x, y, batch_size=64):
    """Yield shuffled mini-batches of the data."""
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    # A random permutation of the indices, used to shuffle x and y together.
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
```
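These helpers are consumed by run_cnn.py in the next section. As a quick, hypothetical usage sketch (the paths simply mirror the ones run_cnn.py defines, and the functions above are assumed to be in scope):

```python
# Hypothetical usage of the loader helpers; paths mirror those defined in run_cnn.py.
# build_vocab('./data/cnews.train.txt', './data/cnews.vocab.txt', 5000)  # run once if the vocab file is missing
categories, cat_to_id = read_category()
words, word_to_id = read_vocab('./data/cnews.vocab.txt')

x_train, y_train = process_file('./data/cnews.train.txt', word_to_id, cat_to_id, 600)
print(x_train.shape, y_train.shape)  # (num_examples, 600) ids and (num_examples, 10) one-hot labels

for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
    print(x_batch.shape, y_batch.shape)  # at most 64 rows per batch
    break
```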
3. run_cnn.py
```python
from __future__ import print_function

import os
import sys
import time
import warnings
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from cnn_model import TCNNConfig, TextCNN
from cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab

warnings.filterwarnings('ignore')

train_dir = './data/cnews.train.txt'
test_dir = './data/cnews.test.txt'
val_dir = './data/cnews.val.txt'
vocab_dir = './data/cnews.vocab.txt'

save_dir = './Checkpoint'
save_path = './Checkpoint/best_validation'


def get_time_dif(start_time):
    """Return the time elapsed since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))


def feed_data(x_batch, y_batch, keep_prob):
    feed_dict = {
        model.input_x: x_batch,
        model.input_y: y_batch,
        model.keep_prob: keep_prob
    }
    return feed_dict


def evaluate(sess, x, y):
    """Evaluate the loss and accuracy on a given data set."""
    data_len = len(x)
    batch_eval = batch_iter(x, y, 128)
    total_loss = 0.0
    total_acc = 0.0
    for x_batch, y_batch in batch_eval:
        batch_len = len(x_batch)
        feed_dict = feed_data(x_batch, y_batch, 1.0)
        loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * batch_len
        total_acc += acc * batch_len
    return total_loss / data_len, total_acc / data_len


def train():
    print('Configuring TensorBoard and Saver')
    # Empty the tensorboard folder before each run, otherwise old summaries get mixed in.
    tensorboard_dir = './tensorboard'
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)

    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the Saver.
    saver = tf.train.Saver()
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print('Loading training and validation data...')
    # Load the training and validation sets.
    start_time = time.time()
    x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
    x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
    time_dif = get_time_dif(start_time)
    print('Time usage:', time_dif)

    # Create the session.
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('Training and evaluating...')
    start_time = time.time()
    total_batch = 0              # total number of batches processed
    best_acc_val = 0.0           # best validation accuracy so far
    last_improved = 0            # batch index of the last improvement
    require_improvement = 1000   # stop early if there is no improvement for 1000 batches
    flag = False

    for epoch in range(config.num_epochs):
        print('Epoch', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)

            if total_batch % config.save_per_batch == 0:
                # Write the training summaries to TensorBoard every save_per_batch batches.
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)

            if total_batch % config.print_per_batch == 0:
                # Report performance on the training and validation sets every print_per_batch batches.
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_val, y_val)

                if acc_val > best_acc_val:
                    # Save the best result so far.
                    best_acc_val = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=save_path)
                    improved_str = '*'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

            feed_dict[model.keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)  # run the optimizer
            total_batch += 1

            if total_batch - last_improved > require_improvement:
                # The validation accuracy has not improved for a long time; stop early.
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break  # leave the inner loop
        if flag:  # leave the outer loop as well
            break


def test():
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)

    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=save_path)  # load the saved model

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # store the predictions
    for i in range(num_batch):  # process the test set batch by batch
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

    # Evaluation metrics.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist yet
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
```
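The loss and accuracy scalars written to the ./tensorboard folder during training can be viewed with TensorBoard, for example by running `tensorboard --logdir=./tensorboard` and opening the printed local URL in a browser.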