首先还是感谢https://github.com/chatopera/insuranceqa-corpus-zh作者的辛苦付出,构建了保险行业的中文语料库,并且提供了一个训练以及测试例程,解决了很多人的燃眉之急,可以说是雪中送炭了。
前几篇文章是对原始例程的详细注解,但这个例程并不是基于tensorflow的,而且也只是提供了loss和accuracy的计算数据,并未提供给其他人(如用户或客户)演示的方法。比如:将每一个测试语句中的id转换为词,并且分离问题和答复,让问题与答复是否搭配能够一目了然。基于这2点重要不足,我花了一些时间和精力将原始例程进行了改写,现将程序代码贴出,大家一起分享。
源码如下(基于anaconda):
import os
import sys
import numpy as np
import tensorflow as tf
import deep_qa_1.data as corpus
import visual.loss as visual_loss
import visual.accuracy as visual_acc
# Make the parent of the working directory importable.
# NOTE: '__file__' is a quoted string literal here, not the module attribute,
# so abspath() resolves it against the current working directory and `curdir`
# ends up being that working directory.
# NOTE(review): the project imports above run before this sys.path change --
# confirm they do not depend on it.
curdir = os.path.dirname(os.path.abspath('__file__'))
sys.path.insert(0, os.path.dirname(curdir))
print(curdir)
print(sys.path)

# Hyper-parameter placeholders; init() overwrites all of them except
# model_dir.
input_layer_size = output_layer_size = layers_num = 0
epoch = batch_size = eval_every_N_steps = 0
learning_rate = 0.0
layers = []

# Directory that holds the saved checkpoints.
model_dir = "model/"
def init(hidden_layers=None,
         question_max_length=20,
         utterance_max_length=99,
         lr=0.001,
         ep=10,
         batchsize=100,
         eens=500):
    """Store the network shape and training hyper-parameters in module globals.

    Args:
        hidden_layers: Sizes of the hidden layers, in order. Any sequence is
            accepted; ``None`` (the default) means ``[100, 50]``.
        question_max_length: Number of token slots reserved for the question.
        utterance_max_length: Number of token slots reserved for the answer.
        lr: Value stored in ``learning_rate``.
        ep: Value stored in ``epoch``.
        batchsize: Value stored in ``batch_size``.
        eens: Value stored in ``eval_every_N_steps``.

    Side effects:
        Rebinds the globals ``input_layer_size``, ``output_layer_size``,
        ``layers``, ``layers_num``, ``epoch``, ``learning_rate``,
        ``batch_size`` and ``eval_every_N_steps``.
    """
    global input_layer_size
    global output_layer_size
    global layers
    global layers_num
    global epoch
    global learning_rate
    global batch_size
    global eval_every_N_steps

    # A None sentinel avoids sharing one mutable default list across calls.
    if hidden_layers is None:
        hidden_layers = [100, 50]

    # question + answer slots; the extra 1 presumably accounts for the
    # separator token between them (20 + 99 + 1 = 120 with the defaults).
    input_layer_size = question_max_length + utterance_max_length + 1
    output_layer_size = 2  # same width as the label vectors
    # Full layer-size list: input, hidden layers, output.
    layers = [input_layer_size] + list(hidden_layers) + [output_layer_size]
    layers_num = len(layers)
    epoch = ep
    learning_rate = lr
    batch_size = batchsize
    eval_every_N_steps = eens
# Shorter presets that were tried before the current one:
#   init(ep=10, lr=0.0001, eens=200)
#   init(ep=50, lr=0.0001, eens=200)
init(ep=500, lr=0.0001, eens=200)

# Test data set, loaded once and reused for evaluation and for the demo
# output.
test_data = corpus.load_test()

# Network input: one row per sample (question + separator + answer).
x_data = tf.placeholder(
    dtype=tf.float32,
    shape=[None, input_layer_size],
    name="input_data",
)

# Target output: one label row per sample.
y_target = tf.placeholder(
    dtype=tf.float32,
    shape=[None, output_layer_size],
    name="output_label",
)
'''
定义一个创建一层神经网络的函数add_layer。
这个函数的输入参数有输入数据inputs,输入的维度in_size,输出的维度out_size和激活函数activation_function,默认为None(即不使用激活函数,直接输出线性结果)。
在函数内,显式初始化这层神经网络的权重和偏置。然后对输入做矩阵乘法并加上偏置,如果定义了激活函数还要经过激活函数。
'''
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Create one fully connected layer and return its output tensor.

    Args:
        inputs: Tensor fed into the layer.
        in_size: Width of ``inputs`` (rows of the weight matrix).
        out_size: Number of units in the layer (columns of the weight matrix).
        activation_function: Optional callable applied to the linear output;
            when ``None`` the linear output is returned unchanged.

    Returns:
        ``inputs @ W + b``, passed through ``activation_function`` if given.
    """
    weights = tf.Variable(tf.random_normal([in_size, out_size]))
    # Biases start at 0.1 rather than 0.
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    linear_output = tf.add(tf.matmul(inputs, weights), biases)
    if activation_function is not None:
        return activation_function(linear_output)
    return linear_output
"""
定义feed_dict。
该函数先判断训练标记,如果训练标记为true,则遍历训练集的全部mini-batch,并把其中所有样本合并在一起;
如果训练标记为false,则获取测试集数据。
"""
def feed_dict(train):
    """Build the placeholder feed for one ``sess.run`` call.

    Args:
        train: When truthy, every sample of every mini-batch yielded by
            ``corpus.load_train()`` is merged into one feed; otherwise the
            samples of the module-level ``test_data`` are used.

    Returns:
        A dict mapping the ``x_data`` placeholder to the list of inputs and
        the ``y_target`` placeholder to the list of labels.
    """
    if train:
        samples = [
            pair
            for mini_batch in corpus.load_train()
            for pair in mini_batch
        ]
    else:
        samples = list(test_data)

    # Split the (input, label) pairs into two parallel lists.
    inputs = [x for x, _ in samples]
    labels = [y_ for _, y_ in samples]
    return {x_data: inputs, y_target: labels}
"""
创建隐藏层1(这里没有调用add_layer,而是直接创建带名称的变量w_1、b_1,后面的Saver按这些名称保存和恢复参数),输入维度是一条问答语句的尺寸(99+1+20=120),输出的维度是隐藏节点数100。
"""
# The network is built explicitly instead of through add_layer() so that each
# weight/bias variable carries a fixed name for the Saver mapping below.
# Layer sizes are taken from the values init() computed rather than repeated
# as literals, so the graph always matches the x_data / y_target placeholders.
if len(layers) != 4:
    raise ValueError(
        "this graph defines exactly two hidden layers, "
        "but init() produced layer sizes %r" % (layers,))
hidden1_size, hidden2_size = layers[1:-1]

# Hidden layer 1: input row -> hidden1_size units, sigmoid activation.
w_1 = tf.Variable(tf.random_normal([input_layer_size, hidden1_size]), name="w_1")
b_1 = tf.Variable(tf.zeros([1, hidden1_size]) + 0.1, name="b_1")
hidden_output1 = tf.sigmoid(tf.add(tf.matmul(x_data, w_1), b_1))

# Hidden layer 2: hidden1_size -> hidden2_size units, sigmoid activation.
w_2 = tf.Variable(tf.random_normal([hidden1_size, hidden2_size]), name="w_2")
b_2 = tf.Variable(tf.zeros([1, hidden2_size]) + 0.1, name="b_2")
hidden_output2 = tf.sigmoid(tf.add(tf.matmul(hidden_output1, w_2), b_2))

# Output layer: hidden2_size -> output_layer_size logits, no activation.
w_output = tf.Variable(tf.random_normal([hidden2_size, output_layer_size]), name="w_out")
b_output = tf.Variable(tf.zeros([1, output_layer_size]) + 0.1, name="b_out")
output = tf.add(tf.matmul(hidden_output2, w_output), b_output)

# Name -> variable mapping handed to tf.train.Saver; the keys are the names
# under which the parameters are stored in the checkpoint.
variables_dict = {
    'w_out': w_output,
    'w_2': w_2,
    'w_1': w_1,
    'b_out': b_output,
    'b_2': b_2,
    'b_1': b_1,
}
"""
使用tf.nn.sigmoid_cross_entropy_with_logits()对前面输出层的结果进行sigmoid处理并计算交叉熵损失cross_entropy。
"""
# Sigmoid cross-entropy between the output logits and the labels, averaged
# over all elements to obtain the scalar training loss.
diff = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_target, logits=output)
cross_entropy = tf.reduce_mean(diff)

# Minimize the loss with the Adam optimizer.
optimizer = tf.train.AdamOptimizer(learning_rate)
train_step = optimizer.minimize(cross_entropy)

# Accuracy: fraction of samples whose highest-scoring output index equals the
# index of the largest label entry.  tf.argmax is used on both sides; the
# original mixed it with the deprecated alias tf.arg_max.
correct_prediction = tf.equal(tf.argmax(output, 1), tf.argmax(y_target, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# tf.initialize_all_variables() is deprecated; use the replacement.
init_op = tf.global_variables_initializer()

# Saver restricted to the named network parameters.
saver = tf.train.Saver(variables_dict)

# Mode switch: set to True to train and save checkpoints, False to restore
# the latest checkpoint and evaluate on the test set.
is_train = False
def _print_sentences(sentences, id2word):
    """Print each id sequence as text, followed by a blank separator."""
    for sentence in sentences:
        for word in sentence:
            if word == 24998:
                # Id that is printed as nothing -- presumably a padding
                # token; TODO confirm against the corpus vocabulary.
                continue
            if word == 24999:
                # Id rendered as "?" plus a line break -- presumably the
                # question/answer separator; TODO confirm.
                print("?")
            else:
                print(id2word[str(word)], end="")
        print("\n")


with tf.Session() as sess:
    sess.run(init_op)
    if is_train:
        # Make sure the checkpoint directory exists before the first save.
        os.makedirs(model_dir, exist_ok=True)
        for i in range(epoch):
            sess.run(train_step, feed_dict=feed_dict(True))
            print("i is: %d" % i)
            if (i + 1) % 50 == 0:
                print("save model parameter")
                # Writes model/model.ckpt-<i> checkpoint files.
                saver.save(sess, model_dir + "model.ckpt", global_step=i)
        print("train finished")
    else:
        model_file = tf.train.latest_checkpoint(model_dir)
        print("model_file is: %s" % model_file)
        if model_file is None:
            # latest_checkpoint() returns None when nothing has been saved
            # yet; fail with an actionable message instead of inside restore().
            raise ValueError(
                "no checkpoint found in %r; run once with is_train = True"
                % model_dir)
        saver.restore(sess, model_file)
        val_acc = sess.run(accuracy, feed_dict=feed_dict(False))
        print('val_acc: %f' % (val_acc))

        # Demo output: turn the id sequences of the first two test samples
        # back into readable text.
        print("len of test_data is")
        print(len(test_data))
        sample_inputs = [x for (x, _) in test_data]
        _print_sentences(sample_inputs[0:2], corpus.vocab_data["id2word"])