import numpy as np
with np.load('rnn_data/file_name.npz') as data:
    feature = data['feature']
    label = data['label']
    rea_lenth = data['true_lenth']  # true (unpadded) length of each sequence
X = feature  # the cross-validation loops below index X; alias the loaded features here
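# Assumed array shapes, inferred from how the arrays are used below (not stated in the source):
#   feature    -> [n_samples, n_steps, n_inputs] float, zero-padded along the time axis
#   label      -> [n_samples] int, values in {0, 1}
#   true_lenth -> [n_samples] int, number of real (non-padded) time steps per sample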
# Batch iterator
class SimpleDataIterator():
    def __init__(self, X, y, true_lenth):
        self.X = X
        self.y = y
        self.true_lenth = true_lenth
        self.size = len(self.y)
        self.epochs = 0
        self.cursor = 0

    def shuffle(self):
        shuffle_index = np.random.permutation(self.size)
        self.X = [self.X[i] for i in shuffle_index]
        self.y = [self.y[i] for i in shuffle_index]
        self.true_lenth = [self.true_lenth[i] for i in shuffle_index]
        self.cursor = 0

    def next_batch(self, batch_size):  # batch_size is passed in by the caller
        # If a full batch no longer fits, count one finished epoch and reshuffle.
        if self.cursor + batch_size > self.size:
            self.epochs += 1
            self.shuffle()
        resX = np.array(self.X[self.cursor:self.cursor + batch_size])
        resy = np.array(self.y[self.cursor:self.cursor + batch_size])
        res_len = np.array(self.true_lenth[self.cursor:self.cursor + batch_size])
        self.cursor += batch_size
        return resX, resy, res_len
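# Quick sanity check of the iterator on toy data (illustrative only; the shapes and
# values here are made up and unrelated to the real dataset):
_demo = SimpleDataIterator(
    X=[np.zeros((4, 2)), np.ones((4, 2)), np.full((4, 2), 2.0)],
    y=[0, 1, 0],
    true_lenth=[3, 4, 2],
)
_xb, _yb, _lb = _demo.next_batch(2)
assert _xb.shape == (2, 4, 2) and _yb.shape == (2,) and _lb.shape == (2,)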
# Training
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.metrics import roc_curve, auc
# Needed for inline plotting in Jupyter.
%matplotlib inline
# hyperparameters
lr = 0.0001
keep_prob = 0.5
lambda_l2_reg = 0.01
training_iters = 100000
train_batch_size = 200  # 3200 train samples / 200 = 16 batches per pass; 800 test samples / 200 for testing
test_batch_size = 200
n_steps = 4950          # maximum sequence length
n_inputs = 23           # dimensionality of the input vector at each time step
n_hidden_units = 64
n_classes = 2
with tf.Graph().as_default():
    # tf Graph input; the batch dimension is None so the batch size can vary:
    # x is [batch_size, max_length, n_features].
    x = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
    y = tf.placeholder(tf.float32, [None, n_classes])
    true_lenth = tf.placeholder(tf.int32, [None])  # per-sample true sequence lengths
    # Define weights (the 'in' entries are defined but never used below).
    weights = {
        'in': tf.Variable(tf.random_normal([n_inputs, n_hidden_units])),
        'out': tf.Variable(tf.random_normal([n_hidden_units, n_classes]))
    }
    biases = {
        'in': tf.Variable(tf.constant(0.1, shape=[n_hidden_units])),
        'out': tf.Variable(tf.constant(0.1, shape=[n_classes]))
    }
    # One-hot encode the integer labels.
    indices = label
    depth = 2
    on_value = 1
    off_value = 0
    output_ = tf.one_hot(indices, depth, on_value, off_value, axis=1)
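    # e.g. label [0, 1, 1] -> output_ [[1, 0], [0, 1], [0, 1]]:
    # with 1-D indices and axis=1, the class (depth) dimension becomes the second axis.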
    def length(sequence):
        # A padded frame is all zeros, so sign(max(|frame|)) is 1 on real frames and 0 on padding.
        # (Unused here: the true lengths are fed through the true_lenth placeholder instead.)
        used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length
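    # Worked example: a single sequence [[0.3, -1.2], [0.0, 0.5], [0.0, 0.0]] gives
    # used = [1, 1, 0] and length = 2, assuming all-zero frames only ever come from padding.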
    def last_relevant(output, length):
        # Flatten [batch, time, units] to [batch * time, units], then gather the row
        # at each sequence's final valid time step.
        batch_size = tf.shape(output)[0]
        max_length = tf.shape(output)[1]
        out_size = int(output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, out_size])
        result = tf.gather(flat, index)
        return result
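    # Worked example: with batch_size=2, max_length=3 and lengths [2, 3], the flattened rows
    # are [b0t0, b0t1, b0t2, b1t0, b1t1, b1t2]; index = [0*3+1, 1*3+2] = [1, 5], i.e. the
    # last valid output of each sequence.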
    def RNN(X, weights, biases, true_lenth):
        with tf.variable_scope('init_name', initializer=tf.orthogonal_initializer()):  # orthogonal initialization
            cell = tf.contrib.rnn.GRUCell(n_hidden_units)
            # Learnable initial state, tiled to the actual (dynamic) batch size
            # rather than the fixed train_batch_size.
            init_state = tf.get_variable('init_state', [1, n_hidden_units],
                                         initializer=tf.constant_initializer(0.0))
            init_state = tf.tile(init_state, [tf.shape(X)[0], 1])
            outputs, states = tf.nn.dynamic_rnn(
                cell, X, dtype=tf.float32, sequence_length=true_lenth, initial_state=init_state)
            # NOTE: keep_prob is a Python constant, so this dropout is also active at evaluation time.
            outputs = tf.nn.dropout(outputs, keep_prob)
            last = last_relevant(outputs, true_lenth)
            results = tf.matmul(last, weights['out']) + biases['out']
        return results
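    # RNN returns unnormalized logits of shape [batch_size, n_classes].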
    def cost(output, target):
        # Masked, per-frame cross entropy averaged over the true sequence lengths.
        # NOTE: defined but never used; the name is rebound to the scalar loss below.
        cross_entropy = target * tf.log(output + 1e-10)
        cross_entropy = -tf.reduce_sum(cross_entropy, axis=2)
        mask = tf.sign(tf.reduce_max(tf.abs(target), axis=2))
        cross_entropy *= mask
        # Average over the actual sequence lengths.
        cross_entropy = tf.reduce_sum(cross_entropy, axis=1)
        cross_entropy /= tf.reduce_sum(mask, axis=1)
        return tf.reduce_mean(cross_entropy)
    pred = RNN(x, weights, biases, true_lenth)
    predict_prob = tf.nn.softmax(pred)  # class probabilities for each prediction
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
    # L2 regularization over all non-bias trainable variables
    # (lower-cased check: TF 1.x names these variables 'bias', not 'Bias').
    l2 = lambda_l2_reg * sum(
        tf.nn.l2_loss(tf_var)
        for tf_var in tf.trainable_variables()
        if 'bias' not in tf_var.name.lower()
    )
    cost += l2
    train_op = tf.train.AdamOptimizer(lr).minimize(cost)
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))  # boolean per sample
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
init = tf.initialize_all_variables()
else:
init = tf.global_variables_initializer()
    with tf.Session() as sess:
        labelR = sess.run(output_)  # materialize the one-hot labels
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        cv = StratifiedKFold(n_splits=5)
        finalRes = []
        for numFold, (train_index, test_index) in enumerate(cv.split(X, label)):
            sess.run(init)
            x_train = [X[i] for i in train_index]
            y_train = [labelR[i] for i in train_index]
            train_true_lenth = [rea_lenth[i] for i in train_index]
            x_test = [X[i] for i in test_index]
            y_test = [labelR[i] for i in test_index]
            test_true_lenth = [rea_lenth[i] for i in test_index]
            print('train_index length:', len(train_index))
            print('test_index length:', len(test_index))
            trainingData = SimpleDataIterator(x_train, y_train, train_true_lenth)
            testingData = SimpleDataIterator(x_test, y_test, test_true_lenth)
            epoch = 0        # number of completed passes over the training set
            maxAccuracy = 0  # early stopping: quit after 5 consecutive checks without a new best validation accuracy
            failNum = 0      # consecutive checks without improving on maxAccuracy
            count_ = 0
            # (Reconstructed: the original loop header and training step were corrupted in the source;
            # the epoch cap of 30 is assumed from the source's own comment.)
            while epoch < 30:
                batch_xs, batch_ys, batch_xs_len = trainingData.next_batch(train_batch_size)
                batch_xs = batch_xs.reshape([train_batch_size, n_steps, n_inputs])
                sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys, true_lenth: batch_xs_len})
                count_ += 1
                # Once enough batches for a full pass have been seen, report the current accuracy.
                if (count_ >= int(len(y_train) / train_batch_size)):
                    accur = sess.run(accuracy, feed_dict={
                        x: batch_xs,
                        y: batch_ys,
                        true_lenth: batch_xs_len,
                    })
                    print('%s%d%s%f' % ('At ', epoch, 'th accuracy:', accur))
                    # Hold out one batch of the test split as a validation set.
                    valiTem = np.array(x_test[0:train_batch_size])
                    vali_y = np.array(y_test[0:train_batch_size])
                    vali_len = test_true_lenth[0:train_batch_size]
                    valiAccur = sess.run(accuracy, feed_dict={x: valiTem.reshape([-1, n_steps, n_inputs]),
                                                              y: vali_y, true_lenth: vali_len})
                    if valiAccur > maxAccuracy:
                        maxAccuracy = valiAccur
                        failNum = 0
                    else:
                        failNum += 1
                    costVal = sess.run(cost, feed_dict={
                        x: batch_xs,
                        y: batch_ys,
                        true_lenth: batch_xs_len,
                    })
                    print('%s%f' % ('cost:', costVal))
                    if failNum >= 5:
                        print('%s%f' % ('Accuracy on validation set:', valiAccur))
                        break
                if trainingData.epochs > epoch:  # the iterator finished a full pass over the training data
                    epoch += 1
                    count_ = 0
            result = []
            prob = []  # predicted probability of the positive class for each test sample
            final_label = []
            while testingData.epochs == 0:
                batch_xt, batch_yt, batch_xt_len = testingData.next_batch(test_batch_size)
                batch_xt = np.array(batch_xt).reshape([test_batch_size, n_steps, n_inputs])
                batch_yt = np.array(batch_yt)
                temp_prob = sess.run(predict_prob, feed_dict={x: batch_xt, y: batch_yt, true_lenth: batch_xt_len})
                temp_label = np.argmax(batch_yt, 1)  # avoid building new graph ops inside the loop
                final_label.extend(temp_label)
                prob.extend(np.array(temp_prob)[:, 1])
                result.append(sess.run(accuracy, feed_dict={x: batch_xt, y: batch_yt, true_lenth: batch_xt_len}))
            fpr, tpr, thresholds = roc_curve(final_label, prob, pos_label=1)
            mean_tpr += np.interp(mean_fpr, fpr, tpr)  # interpolate this fold's TPR onto the common FPR grid
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.6f)' % (numFold, roc_auc))
            print('%d%s%f' % (numFold, "th fold accuracy:", np.mean(result)))
            finalRes.append(np.mean(result))
print("Testing accuracy:",np.mean(finalRes))
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')#画对角线
mean_tpr /= len(cv) #在mean_fpr100个点,每个点处插值插值多次取平均
mean_tpr[-1] = 1.0 #坐标最后一个点为(1,1)
mean_auc = auc(mean_fpr, mean_tpr) #计算平均AUC值
#画平均ROC曲线
plt.plot(mean_fpr, mean_tpr, 'k--',label='Mean ROC (area = %0.6f)' % mean_auc, lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
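    # Second experiment: the same cross-validation loop, but checkpoint the best-so-far model
    # with tf.train.Saver during training and restore it before evaluating each fold.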
    with tf.Session() as sess:
        labelR = sess.run(output_)
        # Create a saver object which will save all the variables.
        saver = tf.train.Saver()
        MAX_STEPS_SINCE_SAVE = 5  # early-stopping patience; value assumed, it is never defined in the source
        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)
        cv = StratifiedKFold(n_splits=5)
        finalRes = []
        for numFold, (train_index, test_index) in enumerate(cv.split(X, label)):
            highest_accuracy = 0
            steps_since_save = 0
            sess.run(init)
            x_train = [X[i] for i in train_index]
            y_train = [labelR[i] for i in train_index]
            train_true_lenth = [rea_lenth[i] for i in train_index]
            x_test = [X[i] for i in test_index]
            y_test = [labelR[i] for i in test_index]
            test_true_lenth = [rea_lenth[i] for i in test_index]
            print('train_index length:', len(train_index))
            print('test_index length:', len(test_index))
            trainingData = SimpleDataIterator(x_train, y_train, train_true_lenth)
            testingData = SimpleDataIterator(x_test, y_test, test_true_lenth)
            epoch = 0  # number of completed passes over the training set
            count_ = 0
            # (Reconstructed: the original loop header and training step were corrupted in the source;
            # the epoch cap of 30 is assumed from the source's own comment.)
            while epoch < 30:
                batch_xs, batch_ys, batch_xs_len = trainingData.next_batch(train_batch_size)
                batch_xs = batch_xs.reshape([train_batch_size, n_steps, n_inputs])
                sess.run(train_op, feed_dict={x: batch_xs, y: batch_ys, true_lenth: batch_xs_len})
                count_ += 1
                # Once enough batches for a full pass have been seen, report the current accuracy.
                if (count_ >= int(len(y_train) / train_batch_size)):
                    accur = sess.run(accuracy, feed_dict={
                        x: batch_xs,
                        y: batch_ys,
                        true_lenth: batch_xs_len,
                    })
                    print('%s%d%s%f' % ('At ', epoch, 'th accuracy:', accur))
                    if accur > highest_accuracy:
                        print(">> New Highest Accuracy, Saving Model <<")
                        saver.save(sess, 'saved_model_{0}'.format(numFold))
                        print(">> Model Saved <<")
                        highest_accuracy = accur
                        steps_since_save = 0
                    else:
                        steps_since_save += 1
                    # Hold out one batch of the test split as a validation set.
                    valiTem = np.array(x_test[0:train_batch_size])
                    vali_y = np.array(y_test[0:train_batch_size])
                    vali_len = test_true_lenth[0:train_batch_size]
                    valiAccur = sess.run(accuracy, feed_dict={x: valiTem.reshape([-1, n_steps, n_inputs]),
                                                              y: vali_y, true_lenth: vali_len})
                    costVal = sess.run(cost, feed_dict={
                        x: batch_xs,
                        y: batch_ys,
                        true_lenth: batch_xs_len,
                    })
                    print('%s%f' % ('cost:', costVal))
                    if steps_since_save > MAX_STEPS_SINCE_SAVE:
                        print("\n\n**** MODEL CONVERGED, STOPPING EARLY ****")
                        print('%s%f' % ('Accuracy on validation set:', valiAccur))
                        break
                if trainingData.epochs > epoch:  # the iterator finished a full pass over the training data
                    epoch += 1
                    count_ = 0
            # Restore the best checkpoint for this fold before testing.
            new_saver = tf.train.import_meta_graph('saved_model_{0}.meta'.format(numFold))
            new_saver.restore(sess, tf.train.latest_checkpoint('./'))
            all_vars = tf.get_collection('vars')  # NOTE: empty unless variables were added to a 'vars' collection
            # Testing
            result = []
            prob = []  # predicted probability of the positive class for each test sample
            final_label = []
            while testingData.epochs == 0:
                batch_xt, batch_yt, batch_xt_len = testingData.next_batch(test_batch_size)
                batch_xt = np.array(batch_xt).reshape([test_batch_size, n_steps, n_inputs])
                batch_yt = np.array(batch_yt)
                temp_prob = sess.run(predict_prob, feed_dict={x: batch_xt, y: batch_yt, true_lenth: batch_xt_len})
                temp_label = np.argmax(batch_yt, 1)  # avoid building new graph ops inside the loop
                final_label.extend(temp_label)
                prob.extend(np.array(temp_prob)[:, 1])
                result.append(sess.run(accuracy, feed_dict={x: batch_xt, y: batch_yt, true_lenth: batch_xt_len}))