TensorFlow is used across many deep learning applications such as speech recognition and image recognition, and it runs on anything from a phone to clusters of thousands of servers. A while ago I ran some sentiment classification experiments, using a neural network to classify the data; the results were decent, reaching 80+% accuracy.
Dataset source: a Chinese review dataset (good Chinese corpora are hard to come by; thanks to the author!)
pos data (positive reviews)
neg data (negative reviews)
Data preparation:
import random
import pandas as pd  # needed for read_excel / concat below


def loadfile():
    neg = pd.read_excel('data/neg.xls', header=None)
    pos = pd.read_excel('data/pos.xls', header=None)  # training corpus loaded
    pos['mark'] = 1
    neg['mark'] = 0  # attach sentiment labels to the corpus
    pn = pd.concat([pos, neg], ignore_index=True)  # merge the two corpora
    #neglen = len(neg)
    #poslen = len(pos)  # corpus sizes
    #print(type(neg['mark'].values[0]))
    #print(pn[:10], pn[-10:-1])
    print(len(pn[0].values), len(pn['mark'].values))
    with open('data/data.txt', 'w', encoding='utf-8') as f:
        for x in pn[0].values:
            f.write(x + '\n')
    with open('data/label.txt', 'w', encoding='utf-8') as f:
        for x in pn['mark'].values:
            f.write(str(x) + '\n')


loadfile()  # load and merge the data
-------------------------------------------------
# word segmentation and stopword removal
import jieba
import numpy as np

with open('data/stopwords', 'r', encoding='utf-8') as f:
    stopwords = []
    for line in f.readlines():
        stopwords.append(line.strip())


def split_word():
    with open('data/data.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        #lines = random.sample(lines, len(lines))  # shuffle the order
    lines_1 = []
    #word_list = []
    for line in lines:
        line = ' '.join(jieba.cut(line.strip()))  # segment each review with jieba
        #for word in line.split(' '):
        #    if word not in stopwords:
        #        word_list.append(word)
        lines_1.append(line)
    with open('data/split_data.txt', 'w', encoding='utf-8') as f1:
        for line in lines_1:
            f1.write(line + '\n')
    #print(lines_1[0])
    #with open('data/clean_data.txt', 'w', encoding='utf-8') as f:
    #    for line in lines_1:
    #        f.write(" ".join([word for word in line]) + "\n")
    with open('data/split_data.txt', 'r', encoding='utf-8') as f:
        line_list = []
        #len_list = []
        for line in f.readlines():
            line = line.strip().split(' ')
            line_1 = []
            for word in line:
                if word not in stopwords:  # drop stopwords
                    line_1.append(word)
            #len_list.append(len(line_1))
            line_list.append(line_1)
    with open('data_clean.txt', 'w', encoding='utf-8') as f1:
        for line in line_list:
            f1.write(" ".join([num for num in line]) + "\n")


split_word()  # segment the text and strip stopwords
Stopword list (the data/stopwords file):
"
..
>>
/
...
8
二
<
@
]
、
,
“
”
。
-
&
《
》
…
?
^
_
(
)
#
啊
此
这
呢
哦
仅
*
+
=
0
1
2
3
4
5
6
7
8
9
@
$
【
】
[
]
矣
兮
~
>
<
{
}
了
个
呵
的
」
「
;
%
.
.
:
—
TWILIGHT
,
\
;
.....
Building the vocabulary:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime

"""
*** preprocessing: build the vocabulary
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of sentence
UNK = "__UNK__"  # marks tokens that do not appear in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data_clean.txt'
#dataset_path_2 = "data/sentiment_XS_test.txt"


def set_dataset_path(path):
    dataset_path = path


if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()


# gen_vocabulary: build the vocabulary file
def gen_vocabulary_file(input_file, output_file, vocab_size, input_file2=None):
    f = open(input_file, encoding='utf-8')
    train_set_x = []
    #train_set_y = []
    #test_set_x = []
    #test_set_y = []
    for line in f.readlines():
        x = line.strip()
        train_set_x.append(x)
        #train_set_y.append(y)
    f.close()
    #train_set_x = train_set_x[1:]
    vocabulary = {}
    counter = 0
    for line in train_set_x:
        counter += 1
        # print(line)
        tokens = line.strip().split(' ')  # note: this step once produced garbled (non-Chinese) output
        #print(tokens)
        for word in tokens:
            if word in vocabulary:  # already in the vocabulary: increment its count
                vocabulary[word] += 1
            else:  # first occurrence
                vocabulary[word] = 1
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # print(vocabulary)
    # keep only the vocab_size most frequent tokens, which should be plenty
    if len(vocabulary_list) > vocab_size:
        vocabulary_list = vocabulary_list[:vocab_size]  # vocabulary of size vocab_size
    print(input_file, " vocabulary size:", len(vocabulary_list))
    with open(output_file, "w", encoding='utf-8') as ff:
        for word in vocabulary_list:
            ff.write(word + '\n')


print("vocabulary start convert...:")
gen_vocabulary_file(dataset_path_1, "train_set_vocabulary", 20000)
Converting sentences to ID sequences:
#coding=utf-8
import numpy as np
import random
import os
from io import open
import datetime

"""
*** preprocessing: convert sentences to ID vectors
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of sentence
UNK = "__UNK__"  # marks tokens that do not appear in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data_clean.txt'
#dataset_path_2 = "data/sentiment_XS_test.txt"


# convert each tokenised sentence into a vector of vocabulary IDs
def convert_to_vector(input_file, vocabulary_file, output_file):
    starttime = datetime.datetime.now()
    tmp_vocab = []
    with open(vocabulary_file, "r", encoding='utf-8') as f:
        tmp_vocab.extend(f.readlines())  # read the vocabulary into tmp_vocab
    tmp_vocab = [line.strip() for line in tmp_vocab]  # strip newlines
    vocab = dict([(x, y) for (y, x) in enumerate(tmp_vocab)])
    # e.g. {'硕': 3142, 'v': 577, 'I': 4789, '\ue796': 4515, '拖': 1333, '疤': 2201 ...}
    # the dict maps each token to its line number in the vocabulary file
    output_f = open(output_file, 'w', encoding='utf-8')  # output file
    train_set_x = []
    train_set_y = []
    with open(input_file, encoding='utf-8') as f:
        for line in f:
            x = line.strip().split(' ')
            #print(x)
            train_set_x.append(x)
    #train_set_x = train_set_x[1:]
    for line in train_set_x:
        line_vec = []
        for words in line:
            line_vec.append(vocab.get(words, UNK_ID))
            # if words is in vocab use vocab[words], otherwise UNK_ID (3)
        #print(line_vec)
        output_f.write(" ".join([str(num) for num in line_vec]) + "\n")
        # one space-separated line of IDs per sentence
    output_f.close()
    endtime = datetime.datetime.now()
    print("elapsed time: %d seconds" % ((endtime - starttime).seconds))


convert_to_vector(dataset_path_1, vocabulary_file="train_set_vocabulary", output_file="train_set_encode")
#convert_to_vector(dataset_path_2, vocabulary_file="train_set_vocabulary", output_file="test_set_encode")
I manually held out 10% of the data as a test set (a rough sketch of one way to do this follows).
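The split script itself is not part of the original post. Below is a minimal sketch of how it could be done, assuming the encoded sentences in train_set_encode and the labels in data/label.txt produced above are still aligned line by line, and that the output should be the four files data_process.py reads (data/train_set_encode, data/train_label.txt, data/test_set_encode, data/test_label.txt). These file names and the whole script are my assumption, not the author's code.

# split_data.py -- hold out roughly 10% of the corpus as a test set (illustrative sketch)
import random

with open('train_set_encode', 'r', encoding='utf-8') as f:
    sentences = f.readlines()          # one encoded sentence per line
with open('data/label.txt', 'r', encoding='utf-8') as f:
    labels = f.readlines()             # one 0/1 label per line, same order

pairs = list(zip(sentences, labels))
random.seed(1)                         # make the split reproducible
random.shuffle(pairs)
n_test = len(pairs) // 10              # 10% held out for testing


def dump(subset, x_path, y_path):
    # write the sentence file and the matching label file side by side
    with open(x_path, 'w', encoding='utf-8') as fx, open(y_path, 'w', encoding='utf-8') as fy:
        for x, y in subset:
            fx.write(x)
            fy.write(y)


dump(pairs[n_test:], 'data/train_set_encode', 'data/train_label.txt')
dump(pairs[:n_test], 'data/test_set_encode', 'data/test_label.txt')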
Next, build the classification model.
MLP model: mlp_model.py
#coding=utf-8
import tensorflow as tf
import numpy as np


class MLP_Model(object):
    def __init__(self, config, is_training=True):
        self.keep_prob = config.keep_prob
        self.batch_size = tf.Variable(0, dtype=tf.int32, trainable=False)
        self.is_training = is_training

        num_step = config.num_step
        self.input_data = tf.placeholder(tf.int32, [None, num_step])
        self.target = tf.placeholder(tf.int64, [None])
        #self.mask_x = tf.placeholder(tf.float32, [num_step, None])
        #emotion_embed_dim = config.emotion_embed_dim
        class_num = config.class_num
        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        max_len = config.max_len
        embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        self.new_batch_size = tf.placeholder(tf.int32, shape=[], name="new_batch_size")
        self._batch_size_update = tf.assign(self.batch_size, self.new_batch_size)

        # store layer weights & biases
        weights = {
            'h1': tf.Variable(tf.random_normal([embed_dim, hidden_neural_size])),
            'h2': tf.Variable(tf.random_normal([hidden_neural_size, hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([hidden_neural_size, class_num]))
        }
        biases = {
            'b1': tf.Variable(tf.random_normal([hidden_neural_size])),
            'b2': tf.Variable(tf.random_normal([hidden_neural_size])),
            'out': tf.Variable(tf.random_normal([class_num]))
        }

        # build the MLP network: two ReLU hidden layers applied to each time step,
        # mean pooling over time, then a linear output layer
        def multilayer_perceptron(_X, _weights, _biases):
            layer_1 = []
            layer_2 = []
            for i in range(max_len):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                layer_1.append(tf.nn.relu(
                    tf.add(tf.matmul(_X[i], _weights['h1']), _biases['b1'])))  # first hidden layer, ReLU activation
                layer_2.append(tf.nn.relu(
                    tf.add(tf.matmul(layer_1[i], _weights['h2']), _biases['b2'])))  # second hidden layer, ReLU activation
            with tf.name_scope("mean_pooling_layer"):
                out_put = tf.reduce_mean(layer_2, 0)
            return tf.matmul(out_put, _weights['out']) + _biases['out']

        #lstm_fw_cell = rnn_cell.BasicLSTMCell(hidden_neural_size, forget_bias=0.0, state_is_tuple=True)
        #lstm_bw_cell = rnn_cell.BasicLSTMCell(hidden_neural_size, forget_bias=0.0, state_is_tuple=True)
        #if self.keep_prob < 1:
        #    lstm_fw_cell = rnn_cell.DropoutWrapper(lstm_fw_cell, output_keep_prob=self.keep_prob)
        #    lstm_bw_cell = rnn_cell.DropoutWrapper(lstm_bw_cell, output_keep_prob=self.keep_prob)
        #lstm_fw_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_fw_cell]*hidden_layer_num)
        #lstm_bw_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_bw_cell]*hidden_layer_num)
        #self._initial_state = cell.zero_state(self.batch_size, dtype=tf.float32)

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        if self.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.keep_prob)  # apply dropout before splitting into time steps
        inputs_emb = tf.transpose(inputs, [1, 0, 2])
        inputs_emb = tf.reshape(inputs_emb, [-1, embed_dim])
        inputs_emb = tf.split(inputs_emb, num_step, 0)  # list of num_step tensors of shape [batch, embed_dim]

        with tf.variable_scope("mlp_layer"):
            self.logits = multilayer_perceptron(inputs_emb, weights, biases)

        #out_put = []
        #state = self._initial_state
        #with tf.variable_scope("LSTM_layer"):
        #    for time_step in range(num_step):
        #        if time_step > 0: tf.get_variable_scope().reuse_variables()
        #        (cell_output, state) = cell(inputs[:, time_step, :], state)
        #        out_put.append(cell_output)
        #out_put = out_put*self.mask_x[:, :, None]
        #with tf.name_scope("mean_pooling_layer"):
        #    out_put = tf.reduce_sum(out_put, 0)/(tf.reduce_sum(self.mask_x, 0)[:, None])
        #with tf.name_scope("Softmax_layer_and_output"):
        #    softmax_w = tf.get_variable("softmax_w", [2*hidden_neural_size, class_num], dtype=tf.float32)
        #    softmax_b = tf.get_variable("softmax_b", [class_num], dtype=tf.float32)
        #    self.logits = tf.matmul(outputs[-1], softmax_w) + softmax_b

        with tf.name_scope("loss"):
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=self.target)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            self.prediction = tf.argmax(self.logits, 1)
            correct_prediction = tf.equal(self.prediction, self.target)
            self.correct_num = tf.reduce_sum(tf.cast(correct_prediction, tf.float32))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

        # add summaries
        loss_summary = tf.summary.scalar("loss", self.cost)
        accuracy_summary = tf.summary.scalar("accuracy_summary", self.accuracy)

        if not self.is_training:
            self.saver = tf.train.Saver(tf.global_variables())
            return

        self.globle_step = tf.Variable(0, name="globle_step", trainable=False)
        self.lr = tf.Variable(0.0, trainable=False)

        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          config.max_grad_norm)

        # keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in zip(grads, tvars):
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        self.grad_summaries_merged = tf.summary.merge(grad_summaries)

        self.summary = tf.summary.merge([loss_summary, accuracy_summary, self.grad_summaries_merged])

        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.global_step = tf.Variable(0, trainable=False)
        self.saver = tf.train.Saver(tf.global_variables())

    def assign_new_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})

    def assign_new_batch_size(self, session, batch_size_value):
        session.run(self._batch_size_update, feed_dict={self.new_batch_size: batch_size_value})
Training script: mlp.py
import os
import time
import tensorflow as tf
#import datetime
#from rnn_model import RNN_Model
from mlp_model import MLP_Model
import data_process

flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 64, 'the batch_size of the training procedure')
flags.DEFINE_float('lr', 0.1, 'the learning rate')
flags.DEFINE_float('lr_decay', 0.6, 'the learning rate decay')
flags.DEFINE_integer('vocabulary_size', 40000, 'vocabulary_size')
#emotion embedding
flags.DEFINE_integer("emotion_nums", 2, 'emotion_nums')  # positive, negative
flags.DEFINE_integer("emotion_embed_dim", 128, 'emotion embedding_dim')
flags.DEFINE_integer('emdedding_dim', 128, 'embedding dim')
flags.DEFINE_integer('hidden_neural_size', 128, 'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num', 3, 'LSTM hidden layer num')
flags.DEFINE_string('dataset_path', 'data/subj0.pkl', 'dataset path')
flags.DEFINE_integer('max_len', 100, 'max_len of training sentence')
flags.DEFINE_integer('valid_num', 100, 'epoch num of validation')
flags.DEFINE_integer('checkpoint_num', 1000, 'epoch num of checkpoint')
flags.DEFINE_float('init_scale', 0.1, 'init scale')
flags.DEFINE_integer('class_num', 2, 'class num')
flags.DEFINE_float('keep_prob', 0.5, 'dropout rate')
flags.DEFINE_integer('num_epoch', 81, 'num epoch')
flags.DEFINE_integer('max_decay_epoch', 30, 'num epoch')
flags.DEFINE_integer('max_grad_norm', 5, 'max_grad_norm')
flags.DEFINE_string('out_dir', os.path.abspath(os.path.join(os.path.curdir, "review_runs2_81")), 'output directory')
flags.DEFINE_integer('check_point_every', 10, 'checkpoint every num epoch ')


class Config(object):
    hidden_neural_size = FLAGS.hidden_neural_size
    vocabulary_size = FLAGS.vocabulary_size
    embed_dim = FLAGS.emdedding_dim
    #emotion
    emotion_nums = FLAGS.emotion_nums
    emotion_embed_dim = FLAGS.emotion_embed_dim
    #
    hidden_layer_num = FLAGS.hidden_layer_num
    class_num = FLAGS.class_num
    keep_prob = FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num = FLAGS.valid_num
    out_dir = FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every


def evaluate(model, session, data, global_steps=None, summary_writer=None):
    correct_num = 0
    total_num = len(data[0])
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        fetches = model.correct_num
        feed_dict = {}
        feed_dict[model.input_data] = x
        feed_dict[model.target] = y
        #feed_dict[model.mask_x] = mask_x
        model.assign_new_batch_size(session, len(x))
        #state = session.run(model._initial_state)
        #for i, (c, h) in enumerate(model._initial_state):
        #    feed_dict[c] = state[i].c
        #    feed_dict[h] = state[i].h
        count = session.run(fetches, feed_dict)
        correct_num += count
    accuracy = float(correct_num) / total_num
    dev_summary = tf.summary.scalar('dev_accuracy', accuracy)
    dev_summary = session.run(dev_summary)
    if summary_writer:
        summary_writer.add_summary(dev_summary, global_steps)
        summary_writer.flush()
    return accuracy


def run_epoch(model, session, data, global_steps, valid_model, valid_data, train_summary_writer, valid_summary_writer=None):
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        feed_dict = {}
        feed_dict[model.input_data] = x
        #feed_dict[model.emotion_state] = []
        feed_dict[model.target] = y
        #feed_dict[model.mask_x] = mask_x
        model.assign_new_batch_size(session, len(x))
        fetches = [model.cost, model.accuracy, model.train_op, model.summary]
        #state = session.run(model._initial_state)
        #for i, (c, h) in enumerate(model._initial_state):
        #    feed_dict[c] = state[i].c
        #    feed_dict[h] = state[i].h
        cost, accuracy, _, summary = session.run(fetches, feed_dict)
        train_summary_writer.add_summary(summary, global_steps)
        train_summary_writer.flush()
        model.is_training = False
        valid_accuracy = evaluate(valid_model, session, valid_data, global_steps, valid_summary_writer)
        if global_steps % 100 == 0:
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %i step, train cost is: %f and the train accuracy is %f and the valid accuracy is %f" % (global_steps, cost, accuracy, valid_accuracy))
        global_steps += 1
    return global_steps


def train_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0
    train_data, valid_data, test_data = data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)
    print("begin training")
    # gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = MLP_Model(config=config, is_training=True)
        # train_summary_op = tf.merge_summary([model.loss_summary, model.accuracy])
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
        # dev_summary_op = tf.merge_summary([valid_model.loss_summary, valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)
        # add checkpoint
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        tf.global_variables_initializer().run()
        global_steps = 1
        begin_time = int(time.time())
        for i in range(config.num_epoch):
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            print("the %d epoch training..." % (i + 1))
            lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)
            global_steps = run_epoch(model, session, train_data, global_steps, model, valid_data, train_summary_writer, dev_summary_writer)
            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))
        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        #test_accuracy = evaluate(test_model, session, test_data)
        #print("the test data accuracy is %f" % test_accuracy)
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        print("program end!")


def main(_):
    train_step()


if __name__ == "__main__":
    tf.app.run()
Run: python mlp.py
Evaluation script: evalute.py
import os
import time
import numpy as np
import tensorflow as tf
#import datetime
#from rnn_model import RNN_Model
from mlp_model import MLP_Model
import data_process

flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 64, 'the batch_size of the training procedure')
flags.DEFINE_float('lr', 0.1, 'the learning rate')
flags.DEFINE_float('lr_decay', 0.6, 'the learning rate decay')
flags.DEFINE_integer('vocabulary_size', 40000, 'vocabulary_size')
#emotion embedding
flags.DEFINE_integer("emotion_nums", 2, 'emotion_nums')  # positive, negative
flags.DEFINE_integer("emotion_embed_dim", 128, 'emotion embedding_dim')
flags.DEFINE_integer('emdedding_dim', 128, 'embedding dim')
flags.DEFINE_integer('hidden_neural_size', 128, 'LSTM hidden neural size')
flags.DEFINE_integer('hidden_layer_num', 3, 'LSTM hidden layer num')
flags.DEFINE_string('dataset_path', 'data/subj0.pkl', 'dataset path')
flags.DEFINE_integer('max_len', 100, 'max_len of training sentence')
flags.DEFINE_integer('valid_num', 100, 'epoch num of validation')
flags.DEFINE_integer('checkpoint_num', 1000, 'epoch num of checkpoint')
flags.DEFINE_float('init_scale', 0.1, 'init scale')
flags.DEFINE_integer('class_num', 2, 'class num')
flags.DEFINE_float('keep_prob', 0.5, 'dropout rate')
flags.DEFINE_integer('num_epoch', 81, 'num epoch')
flags.DEFINE_integer('max_decay_epoch', 30, 'num epoch')
flags.DEFINE_integer('max_grad_norm', 5, 'max_grad_norm')
flags.DEFINE_string('out_dir', os.path.abspath(os.path.join(os.path.curdir, "review_runs2_81")), 'output directory')
flags.DEFINE_integer('check_point_every', 10, 'checkpoint every num epoch ')


class Config(object):
    hidden_neural_size = FLAGS.hidden_neural_size
    vocabulary_size = FLAGS.vocabulary_size
    embed_dim = FLAGS.emdedding_dim
    #emotion
    emotion_nums = FLAGS.emotion_nums
    emotion_embed_dim = FLAGS.emotion_embed_dim
    #
    hidden_layer_num = FLAGS.hidden_layer_num
    class_num = FLAGS.class_num
    keep_prob = FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    max_decay_epoch = FLAGS.max_decay_epoch
    valid_num = FLAGS.valid_num
    out_dir = FLAGS.out_dir
    max_len = FLAGS.max_len
    checkpoint_every = FLAGS.check_point_every


def evaluate(model, session, data, global_steps=None, summary_writer=None):
    #pre_label = []
    accuracy = []
    for step, (x, y, mask_x) in enumerate(data_process.batch_iter(data, batch_size=FLAGS.batch_size)):
        fetches = model.correct_num
        label = model.prediction
        feed_dict = {}
        feed_dict[model.input_data] = x
        #feed_dict[model.target] = y
        #feed_dict[model.mask_x] = mask_x
        model.assign_new_batch_size(session, len(x))
        #state = session.run(model._initial_state)
        #for i, (c, h) in enumerate(model._initial_state):
        #    feed_dict[c] = state[i].c
        #    feed_dict[h] = state[i].h
        #count = session.run(fetches, feed_dict)
        pre = session.run(label, feed_dict)
        correct_num = 0
        #pre_label.append(pre)
        for i in range(len(pre)):
            if pre[i] == y[i]:
                correct_num += 1
        accuracy.append(correct_num / len(pre))  # per-batch accuracy
    #accuracy = float(correct_num)/total_num
    #dev_summary = tf.summary.scalar('dev_accuracy', accuracy)
    #dev_summary = session.run(dev_summary)
    '''
    if summary_writer:
        summary_writer.add_summary(dev_summary, global_steps)
        summary_writer.flush()
    '''
    return accuracy


def test_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0
    train_data, valid_data, test_data = data_process.load_data(FLAGS.max_len, batch_size=config.batch_size)
    print("begin testing....")
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            test_model = MLP_Model(config=eval_config, is_training=False)
        curdir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        #curdir = "D:\\emotion_classifier\\runs3_60\\checkpoints\\model-18922"
        ckpt = tf.train.get_checkpoint_state(curdir)
        if ckpt is not None:
            print(ckpt.model_checkpoint_path)
            test_model.saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("checkpoint path does not exist, exiting!")
            tf.global_variables_initializer().run()
            return
        accs = evaluate(test_model, session, test_data)
        accuracy = np.mean(accs)
        print("test accuracy: %f" % accuracy)


test_step()
Data loading module: data_process.py
#coding=utf-8
import numpy as np
import random
import os
from io import open
import string
import datetime

"""
*** preprocessing: load the encoded data for training/evaluation
"""
PAD = "__PAD__"
GO = "__GO__"
EOS = "__EOS__"  # end of sentence
UNK = "__UNK__"  # marks tokens that do not appear in the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

dataset_path_1 = 'data/train_label.txt'
dataset_path_2 = "data/test_label.txt"


def set_dataset_path(path):
    dataset_path = path


if not os.path.exists(dataset_path_1):
    print('training dataset is null')
    exit()
if not os.path.exists(dataset_path_2):
    print('test dataset is null')
    exit()


def load_data(max_len, batch_size, n_words=40000, valid_portion=0.2, sort_by_len=False):
    f = open(dataset_path_1, 'rb')
    f1 = open(dataset_path_2, 'rb')
    f2 = open('data/train_set_encode', 'rb')
    f3 = open('data/test_set_encode', 'rb')
    print('load training label from %s\nload test label from %s' % (dataset_path_1, dataset_path_2))
    train_set_x = []
    train_set_y = []
    test_set_x = []
    test_set_y = []
    # load the labels
    for line in f.readlines():
        y = int(line.strip())
        train_set_y.append(y)
    for line1 in f1.readlines():
        y = int(line1.strip())
        test_set_y.append(y)
    # load the encoded sentences
    for line in f2.readlines():
        line = line.decode('utf-8').strip().split(' ')
        train_set_x.append(line)
    for line in f3.readlines():
        line = line.decode('utf-8').strip().split(' ')
        test_set_x.append(line)
    f.close()
    f1.close()
    f2.close()
    f3.close()

    # string matrix --> int matrix
    def string_to_int(input):
        output = []
        for line in input:
            line_vec = []
            for word in line:
                num = int(word)
                line_vec.append(num)
            output.append(line_vec)
        return output

    train_set_x = string_to_int(train_set_x)
    test_set_x = string_to_int(test_set_x)

    valid_set_y = []
    valid_set_x = []
    # split the training data into train/valid sets
    n_samples = len(train_set_x)
    sidx = np.random.permutation(n_samples)
    n_train = int(np.round(n_samples * (1. - valid_portion)))
    valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
    valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
    train_set_x = [train_set_x[s] for s in sidx[:n_train]]
    train_set_y = [train_set_y[s] for s in sidx[:n_train]]
    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    test_set = (test_set_x, test_set_y)

    # map out-of-vocabulary ids to UNK_ID
    def remove_unk(x):
        return [[UNK_ID if w >= n_words else w for w in sen] for sen in x]

    test_set_x, test_set_y = test_set
    valid_set_x, valid_set_y = valid_set
    train_set_x, train_set_y = train_set
    train_set_x = remove_unk(train_set_x)
    valid_set_x = remove_unk(valid_set_x)
    test_set_x = remove_unk(test_set_x)

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    if sort_by_len:
        sorted_index = len_argsort(test_set_x)
        test_set_x = [test_set_x[i] for i in sorted_index]
        test_set_y = [test_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(valid_set_x)
        valid_set_x = [valid_set_x[i] for i in sorted_index]
        valid_set_y = [valid_set_y[i] for i in sorted_index]
        sorted_index = len_argsort(train_set_x)
        train_set_x = [train_set_x[i] for i in sorted_index]
        train_set_y = [train_set_y[i] for i in sorted_index]

    train_set = (train_set_x, train_set_y)
    valid_set = (valid_set_x, valid_set_y)
    test_set = (test_set_x, test_set_y)

    new_train_set_x = np.zeros([len(train_set[0]), max_len])
    new_train_set_y = np.zeros(len(train_set[0]))
    new_valid_set_x = np.zeros([len(valid_set[0]), max_len])
    new_valid_set_y = np.zeros(len(valid_set[0]))
    new_test_set_x = np.zeros([len(test_set[0]), max_len])
    new_test_set_y = np.zeros(len(test_set[0]))
    mask_train_x = np.zeros([max_len, len(train_set[0])])
    mask_valid_x = np.zeros([max_len, len(valid_set[0])])
    mask_test_x = np.zeros([max_len, len(test_set[0])])

    # pad (or truncate) every sentence to max_len and build the corresponding mask
    def padding_and_generate_mask(x, y, new_x, new_y, new_mask_x):
        for i, (x, y) in enumerate(zip(x, y)):
            if len(x) <= max_len:
                new_x[i, 0:len(x)] = x
                new_mask_x[0:len(x), i] = 1
                new_y[i] = y
            else:
                new_x[i] = x[0:max_len]
                new_mask_x[:, i] = 1
                new_y[i] = y
        new_set = (new_x, new_y, new_mask_x)
        del new_x, new_y
        return new_set

    train_set = padding_and_generate_mask(train_set[0], train_set[1], new_train_set_x, new_train_set_y, mask_train_x)
    valid_set = padding_and_generate_mask(valid_set[0], valid_set[1], new_valid_set_x, new_valid_set_y, mask_valid_x)
    test_set = padding_and_generate_mask(test_set[0], test_set[1], new_test_set_x, new_test_set_y, mask_test_x)
    return train_set, valid_set, test_set


# yield the dataset batch by batch
def batch_iter(data, batch_size):
    # unpack inputs, labels and masks
    x, y, mask_x = data
    x = np.array(x)
    y = np.array(y)
    data_size = len(x)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for batch_index in range(num_batches_per_epoch):
        start_index = batch_index * batch_size
        end_index = min((batch_index + 1) * batch_size, data_size)
        return_x = x[start_index:end_index]
        return_y = y[start_index:end_index]
        return_mask_x = mask_x[:, start_index:end_index]
        # print(len(return_x))
        # print(return_x)
        # print(return_y)
        # print(return_mask_x)
        yield (return_x, return_y, return_mask_x)
The final test accuracy is around 86%, which is not bad.