https://github.com/jiegzhan/multi-class-text-classification-cnn-rnn
data ---> batch iter ---> cnn input ---> embedding ---> convolution ---> pooling ---> rnn input ---> lstm cell ---> softmax
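Roughly, the tensor shapes along this pipeline work out as follows (an illustrative sketch based on the hyper-parameters in the configuration below, not part of the original code):

import numpy as np

# Shape walk-through, assuming the TCNNRNNConfig defaults below.
batch, seq_len, emb_dim = 128, 300, 64
num_filters, max_pool, hidden, num_classes = 256, 4, 256, 2
kernel_sizes = [3, 4, 5]

reduced = int(np.ceil(seq_len * 1.0 / max_pool))       # 75 time steps fed to the RNN
x_shape      = (batch, seq_len)                         # input_x: padded token ids
emb_shape    = (batch, seq_len, emb_dim, 1)             # embedding lookup + expand_dims
conv_shape   = (batch, seq_len, 1, num_filters)         # per branch; zero padding keeps seq_len
pooled_shape = (batch, reduced, num_filters)            # max-pool over time + reshape
concat_shape = (batch, reduced, num_filters * len(kernel_sizes))   # branches concatenated
step_shape   = (batch, num_filters * len(kernel_sizes))             # one of `reduced` RNN inputs
rnn_out      = (batch, hidden)                          # last valid GRU output per example
logits       = (batch, num_classes)                     # scores before softmax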
class TCNNRNNConfig(object):
# Model hyper-parameters
embedding_dim = 64 # word embedding dimension
seq_length = 300 # sequence length
num_classes = 2 # number of classes
num_filters = 256 # number of convolution filters
kernel_size = 5 # convolution kernel size
vocab_size = 130000 # vocabulary size
max_pool_size = 4 # max-pooling window size
hidden_dim = 128 # fully-connected layer units
dropout_keep_prob = 0.8 # dropout keep probability
learning_rate = 1e-3 # learning rate
hidden_unit = 256 # number of RNN (GRU/LSTM) units
batch_size = 128 # training batch size
num_epochs = 20 # total number of epochs
print_per_batch = 100 # report results every this many batches
multi_kernel_size = '3,4,5' # kernel sizes of the parallel convolution branches
l2_reg_lambda = 0.0 # L2 regularization coefficient
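Note that vocab_size here is only a default; the training script further down overwrites it with the size of the vocabulary built from the corpus. A minimal usage sketch (hypothetical, using the TextCnnRnn class defined in the next file):

config = TCNNRNNConfig()
config.vocab_size = 160238   # e.g. len(words) returned by preocess_file, as in the run shown below
model = TextCnnRnn(config)   # builds the placeholders, CNN branches, GRU and softmax layer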
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
class TextCnnRnn(object):
def __init__(self,config):
self.config=config
self.input_x=tf.placeholder(tf.int32,[None, self.config.seq_length],name="input_x")
self.input_y=tf.placeholder(tf.float32,[None, self.config.num_classes],name="input_y")
self.keep_prob=tf.placeholder(tf.float32,None,name='keep_prob')
self.pad = tf.placeholder(tf.float32, [None, 1, self.config.embedding_dim, 1], name='pad')
self.l2_loss = tf.constant(0.0)
self.real_len = tf.placeholder(tf.int32, [None], name='real_len')
self.filter_sizes = list(map(int, self.config.multi_kernel_size.split(",")))
self.cnnrnn()
def input_embedding(self):
"""词嵌套"""
with tf.device('/cpu:0'):
embedding =tf.get_variable("embedding",[self.config.vocab_size,self.config.embedding_dim])
_input = tf.nn.embedding_lookup(embedding, self.input_x)
_input_expanded = tf.expand_dims(_input, -1)
return _input_expanded
def cnnrnn(self):
emb=self.input_embedding()
pooled_concat = []
reduced = np.int32(np.ceil((self.config.seq_length) * 1.0 / self.config.max_pool_size)) # sequence length after max-pooling, i.e. the number of RNN time steps
for i, filter_size in enumerate(self.filter_sizes):
with tf.name_scope('conv-maxpool-%s' % filter_size):
# Zero padding so that the convolution output has dimension batch x sequence_length x emb_size x channel
num_prio = (filter_size - 1) // 2
num_post = (filter_size - 1) - num_prio
pad_prio = tf.concat([self.pad] * num_prio, 1)
pad_post = tf.concat([self.pad] * num_post, 1)
emb_pad = tf.concat([pad_prio, emb, pad_post], 1)
filter_shape = [filter_size, self.config.embedding_dim, 1, self.config.num_filters]
W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
b = tf.Variable(tf.constant(0.1, shape=[self.config.num_filters]), name='b')
conv = tf.nn.conv2d(emb_pad, W, strides=[1, 1, 1, 1], padding='VALID', name='conv')
h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
# Maxpooling over the outputs
pooled = tf.nn.max_pool(h, ksize=[1, self.config.max_pool_size, 1, 1], strides=[1, self.config.max_pool_size, 1, 1], padding='SAME',
name='pool')
pooled = tf.reshape(pooled, [-1, reduced, self.config.num_filters])
pooled_concat.append(pooled)
pooled_concat = tf.concat(pooled_concat, 2)
pooled_concat = tf.nn.dropout(pooled_concat, self.keep_prob)
# lstm_cell = tf.nn.rnn_cell.LSTMCell(num_units=self.config.hidden_unit)
# lstm_cell = tf.nn.rnn_cell.GRUCell(num_units=self.config.hidden_unit)
lstm_cell = tf.contrib.rnn.GRUCell(num_units=self.config.hidden_unit)
# lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=self.dropout_keep_prob)
lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell, output_keep_prob=self.keep_prob)
self._initial_state = lstm_cell.zero_state(self.config.batch_size, tf.float32)
# inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(1, reduced, pooled_concat)]
inputs = [tf.squeeze(input_, [1]) for input_ in tf.split(pooled_concat, num_or_size_splits=int(reduced), axis=1)]
# outputs, state = tf.nn.rnn(lstm_cell, inputs, initial_state=self._initial_state, sequence_length=self.real_len)
#outputs, state = tf.contrib.rnn.static_rnn(lstm_cell, inputs, initial_state=self._initial_state,
# sequence_length=self.real_len)
outputs, state = tf.nn.static_rnn(lstm_cell, inputs, initial_state=self._initial_state, sequence_length=self.real_len)
# Collect the output at each example's last real time step into 'output' (dimension = batch x hidden_unit)
output = outputs[0]
with tf.variable_scope('Output'):
tf.get_variable_scope().reuse_variables()
one = tf.ones([1, self.config.hidden_unit], tf.float32)
for i in range(1, len(outputs)):
ind = self.real_len < (i + 1) # True where step i lies beyond this example's real length
ind = tf.to_float(ind)
ind = tf.expand_dims(ind, -1)
mat = tf.matmul(ind, one) # broadcast the 0/1 mask across the hidden units
output = tf.add(tf.multiply(output, mat), tf.multiply(outputs[i], 1.0 - mat)) # keep the previous output for padded steps, take outputs[i] otherwise
with tf.name_scope('score'):
self.W = tf.Variable(tf.truncated_normal([self.config.hidden_unit, self.config.num_classes], stddev=0.1), name='W')
b = tf.Variable(tf.constant(0.1, shape=[self.config.num_classes]), name='b')
self.l2_loss += tf.nn.l2_loss(self.W)
self.l2_loss += tf.nn.l2_loss(b)
self.scores = tf.nn.xw_plus_b(output, self.W, b, name='scores')
self.pred_y = tf.nn.softmax(self.scores, name="pred_y")
tf.add_to_collection('pred_network', self.pred_y)
self.predictions = tf.argmax(self.scores, 1, name='predictions')
with tf.name_scope('loss'):
losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,
logits=self.scores) # only named arguments accepted
self.loss = tf.reduce_mean(losses) + self.config.l2_reg_lambda * self.l2_loss
with tf.name_scope("optimize"):
# optimizer
optimizer = tf.train.AdamOptimizer(
learning_rate=self.config.learning_rate)
self.optim = optimizer.minimize(self.loss)
with tf.name_scope('accuracy'):
correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
self.acc = tf.reduce_mean(tf.cast(correct_predictions, "float"), name='accuracy')
with tf.name_scope('num_correct'):
correct = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
self.num_correct = tf.reduce_sum(tf.cast(correct, 'float'))
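A quick smoke test of the graph built above, feeding dummy values into every placeholder (a hedged sketch, assuming TensorFlow 1.x; the batch must contain exactly config.batch_size examples because the GRU's zero_state is created with that fixed size):

import numpy as np
import tensorflow as tf

config = TCNNRNNConfig()
model = TextCnnRnn(config)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    bs = config.batch_size
    reduced = int(np.ceil(config.seq_length * 1.0 / config.max_pool_size))  # 75 pooled steps
    feed = {
        model.input_x: np.random.randint(0, config.vocab_size, size=(bs, config.seq_length)),
        model.input_y: np.eye(config.num_classes)[np.random.randint(0, config.num_classes, bs)],
        model.keep_prob: 1.0,
        model.pad: np.zeros([bs, 1, config.embedding_dim, 1], dtype=np.float32),
        model.real_len: np.full(bs, reduced, dtype=np.int32),  # pretend every sequence is full length
    }
    probs, loss = sess.run([model.pred_y, model.loss], feed_dict=feed)
    print(probs.shape, loss)  # (128, 2) and a scalar cross-entropy value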
#!/usr/bin/python
# -*- coding: utf-8 -*-
from cnn_rnn_model import TextCnnRnn
from configuration import TCNNRNNConfig
from data_utils_cut import preocess_file,batch_iter
import time
import tensorflow as tf
import os
import numpy as np
from datetime import timedelta
trainpath="/Users/shuubiasahi/Desktop/tensorflow/adx/"
def run_epoch(cnnrnnmodel=True):
# Load the data
print('Loading data...')
start_time = time.time()
x_train, y_train, words = preocess_file(data_path=trainpath+"cnn.txt")
if cnnrnnmodel:
print('Using CNNRNN model...')
config = TCNNRNNConfig()
config.vocab_size = len(words)
print("vocab_size is:", config.vocab_size)
model = TextCnnRnn(config)
tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/boardlog'
end_time = time.time()
time_dif = end_time - start_time
time_dif = timedelta(seconds=int(round(time_dif)))
print('Time usage:', time_dif)
print('Constructing TensorFlow Graph...')
session = tf.Session()
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
# Configure TensorBoard
tf.summary.scalar("loss", model.loss)
tf.summary.scalar("accuracy", model.acc)
if not os.path.exists(tensorboard_dir):
os.makedirs(tensorboard_dir)
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(tensorboard_dir)
writer.add_graph(session.graph)
# Generate batches of training data
print('Generating batch...')
batch_train = batch_iter(list(zip(x_train, y_train)),
config.batch_size, config.num_epochs)
def feed_data(batch):
"""准备需要喂入模型的数据"""
x_batch, y_batch = zip(*batch)
feed_dict = {
model.input_x: x_batch,
model.input_y: y_batch,
model.real_len:real_len(x_batch)
}
return feed_dict, len(x_batch)
def real_len(batches):
# Length of each example in pooled time steps: position of the first padding id (0), divided by max_pool_size and rounded up
return [np.ceil(np.argmin(batch + [0]) * 1.0 / config.max_pool_size) for batch in batches]
def evaluate(x_, y_):
"""
模型评估
一次运行所有的数据会OOM,所以需要分批和汇总
"""
batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
total_loss = 0.0
total_acc = 0.0
cnt = 0
for batch in batch_eval:
feed_dict, cur_batch_len = feed_data(batch)
feed_dict[model.keep_prob] = 1.0
feed_dict[model.pad] = np.zeros([cur_batch_len, 1, config.embedding_dim, 1]) # the pad placeholder must be fed here as well
loss, acc = session.run([model.loss, model.acc],
feed_dict=feed_dict)
total_loss += loss * cur_batch_len
total_acc += acc * cur_batch_len
cnt += cur_batch_len
return total_loss / cnt, total_acc / cnt
# Training and evaluation
print('Training and evaluating...')
start_time = time.time()
print_per_batch = config.print_per_batch
for i, batch in enumerate(batch_train):
feed_dict, lenbatch = feed_data(batch)
feed_dict[model.keep_prob] = config.dropout_keep_prob
feed_dict[model.pad]=np.zeros([lenbatch, 1, config.embedding_dim, 1])
if i % 5 == 0: # write training summaries to TensorBoard every 5 batches
s = session.run(merged_summary, feed_dict=feed_dict)
writer.add_summary(s, i)
if i % print_per_batch == print_per_batch - 1: # report performance on the training set every print_per_batch batches
loss_train, acc_train = session.run([model.loss, model.acc],
feed_dict=feed_dict)
# loss, acc = evaluate(x_val, y_val)  # the validation set is not needed for now
# elapsed time
end_time = time.time()
time_dif = end_time - start_time
time_dif = timedelta(seconds=int(round(time_dif)))
msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'\
+ ' Time: {3}'
print(msg.format(i + 1, loss_train, acc_train, time_dif))
# if i%10==0 and i>0:
# graph=tf.graph_util.convert_variables_to_constants(session,session.graph_def,["keep_prob","input_x","score/pred_y"])
# tf.train.write_graph(graph,".","/Users/shuubiasahi/Desktop/tensorflow/modelsavegraph/graph.db",as_text=False)
if i%500==0 and i>0:
graph = tf.graph_util.convert_variables_to_constants(session, session.graph_def,
["keep_prob","real_len","pad", "input_x", "score/pred_y"])
if cnnrnnmodel:
tf.train.write_graph(graph, ".", trainpath+"graphcnnrnn.model",
as_text=False)
print("模型在第{0}步已经保存".format(i))
session.run(model.optim, feed_dict=feed_dict) # run one optimization step
# finally, evaluate on the test set
session.close()
if __name__ == '__main__':
run_epoch()
Using CNNRNN model...
vocab_size is: 160238
Time usage: 0:00:35
Constructing TensorFlow Graph...
2017-10-30 23:22:18.426329: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426342: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426346: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
2017-10-30 23:22:18.426351: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
Generating batch...
Training and evaluating...
Iter: 100, Train Loss: 0.66, Train Acc: 71.09%, Time: 0:02:47
Iter: 200, Train Loss: 0.65, Train Acc: 61.72%, Time: 0:05:38
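Because the training loop freezes the graph with convert_variables_to_constants, inference does not need the Python model class at all. A minimal loading sketch (assuming TensorFlow 1.x and the tensor names exported above; turning raw text into padded id sequences is not shown, and the file path is the one used in the training script):

import numpy as np
import tensorflow as tf

graph_def = tf.GraphDef()
with tf.gfile.GFile("/Users/shuubiasahi/Desktop/tensorflow/adx/graphcnnrnn.model", "rb") as f:
    graph_def.ParseFromString(f.read())

with tf.Graph().as_default() as graph:
    tf.import_graph_def(graph_def, name="")
    input_x = graph.get_tensor_by_name("input_x:0")
    keep_prob = graph.get_tensor_by_name("keep_prob:0")
    pad = graph.get_tensor_by_name("pad:0")
    real_len = graph.get_tensor_by_name("real_len:0")
    pred_y = graph.get_tensor_by_name("score/pred_y:0")

with tf.Session(graph=graph) as sess:
    x = np.zeros((128, 300), dtype=np.int32)  # one batch of padded id sequences, batch_size x seq_length
    probs = sess.run(pred_y, feed_dict={
        input_x: x,
        keep_prob: 1.0,
        pad: np.zeros([128, 1, 64, 1], dtype=np.float32),
        real_len: np.ones(128, dtype=np.int32),
    })
    print(probs[:3])  # softmax class probabilities for the first three examples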