第一次使用tensorflow 很坑 以此记录一些我自己悟出来的门道
概况:样本共有10类,文件夹的安排是这样的,一个train文件夹下放所有的训练图片,然后train下面有10个不同的文件夹,命名为c0、c1、c2…..代表种类,数据存tfrecords,使用slim,vgg16的网络,tensorboard画曲线。
整体代码:
import os
import pandas
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.contrib.slim.nets import vgg
slim=tf.contrib.slim
# --- Hyper-parameters and dataset locations ---
tfrecords_filename = 'train.tfrecords'
im_width = 224           # input width expected by the VGG-16 trunk
im_height = 224          # input height expected by the VGG-16 trunk
batch_size = 16
num_epochs = 10
lr = 0.0001              # initial learning rate for exponential decay
decay_rate = 0.1
decay_per = 40           # NOTE(review): unused -- optimize() hard-codes a decay step of 10000; confirm which was intended
train_imgdir = '/home/rsq/dataset/kaggledriver/imgs/train/'
test_imgdir = '/home/rsq/dataset/kaggledriver/imgs/test/'
train_lists = pandas.read_csv('/home/rsq/dataset/kaggledriver/driver_imgs_list.csv')
test_images = os.listdir(test_imgdir)
# NOTE(review): this path says 'kaggle' while every other path says
# 'kaggledriver' -- verify it is not a typo.
test_labels = pandas.read_csv('/home/rsq/dataset/kaggle/sample_submission.csv')
# Floor division keeps num_iter an int (range() needs an integer);
# identical to '/' under Python 2 int operands.
num_iter = len(train_lists) // batch_size
# Per-split lists of tf.summary ops; merged into one op per split later.
train_summary = []
valid_summary = []
def _bytes_feature(value):
    """Wrap a raw byte string as a tf.train.Feature holding a BytesList."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)
def _int64_feature(value):
    """Wrap an int (or list of ints) as a tf.train.Feature (Int64List).

    A scalar is promoted to a one-element list because Int64List only
    accepts a sequence.
    """
    # isinstance instead of type(...) != list: also accepts list subclasses.
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# Build the training TFRecord file once; skip the scan if it already exists.
if os.path.exists(tfrecords_filename):
    print tfrecords_filename,'already exists'
else:
    writer_train=tf.python_io.TFRecordWriter(tfrecords_filename)
    print "scaning preprocessed file to '%s'" % tfrecords_filename
    for i in range(len(train_lists['img'])):
        # Each CSV row holds the image file name and its class folder (c0..c9).
        imgname,classname=train_lists['img'][i],train_lists['classname'][i],
        img=Image.open(os.path.join(train_imgdir,classname,imgname))
        # Resize to the fixed network input size before serializing.
        img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))
        # Class id is the digit in the folder name, e.g. 'c3' -> 3.
        label=int(classname[1])
        # Store raw uint8 pixel bytes plus the integer label; one-hot
        # encoding is deferred to read time (tf.one_hot in read_and_decode).
        example=tf.train.Example(features=tf.train.Features(feature={
            'image_raw':_bytes_feature(img.tostring()),
            'label':_int64_feature(label)
            }))
        writer_train.write(example.SerializeToString())
    writer_train.close()
    print ('preprocessing done')
def read_and_decode(filename_queue):
    """Read serialized Examples from the queue and return a shuffled batch.

    Returns:
        images:  uint8 tensor of shape [batch_size, im_height, im_width, 3]
        tlabels: float tensor of shape [batch_size, 10], one-hot encoded
    """
    reader = tf.TFRecordReader()
    _, record = reader.read(filename_queue)
    parsed = tf.parse_single_example(
        record,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'label': tf.FixedLenFeature([], tf.int64),
        })
    # Recover the raw uint8 pixels and restore the [H, W, C] layout.
    decoded = tf.decode_raw(parsed['image_raw'], tf.uint8)
    decoded = tf.reshape(decoded, [im_height, im_width, 3])
    scalar_label = tf.cast(parsed['label'], tf.int64)
    # Batch first, then one-hot encode the whole label batch in one op.
    images, labels = tf.train.shuffle_batch(
        [decoded, scalar_label],
        batch_size=batch_size, capacity=256,
        min_after_dequeue=64, num_threads=2)
    tlabels = tf.one_hot(labels, 10, 1.0, 0.0)
    return images, tlabels
def infer(inputs, is_training=True):
    """Build the VGG-16 convolutional trunk plus a fine-tuned FC head.

    Args:
        inputs: uint8 image batch of shape [N, im_height, im_width, 3].
        is_training: when True, dropout is active in the FC head; pass
            False at inference time. (Bug fix: the original ignored this
            flag, so dropout stayed on during prediction.)

    Returns:
        Logits tensor of shape [N, 10]; no softmax applied.
    """
    inputs = tf.cast(inputs, tf.float32)
    # Scale pixels to [0, 1] with a TF op so the scaling is part of the
    # graph; np.multiply on a Tensor relied on fragile numpy dispatch.
    inputs = inputs * (1.0 / 255.0)
    with tf.variable_scope("vgg_16"):
        with slim.arg_scope(vgg.vgg_arg_scope()):
            # Convolutional trunk -- scope names match the public vgg_16
            # checkpoint so these layers can be restored from it.
            net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')
            # Fully-connected head under 'finetune/' scopes: trained from
            # scratch, not restored from the checkpoint.
            net = slim.flatten(net)
            net = slim.fully_connected(
                net, 4096,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=slim.l2_regularizer(0.0005),
                scope='finetune/fc1')
            net = slim.dropout(net, 0.5, is_training=is_training,
                               scope='dropout1')
            net = slim.fully_connected(
                net, 4096,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=slim.l2_regularizer(0.0005),
                scope='finetune/fc2')
            net = slim.dropout(net, 0.5, is_training=is_training,
                               scope='dropout2')
            net = slim.fully_connected(
                net, 1000,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=slim.l2_regularizer(0.0005),
                scope='finetune/fc3')
            net = slim.dropout(net, 0.5, is_training=is_training,
                               scope='dropout3')
            # Final 10-way classification layer (one logit per class c0..c9).
            net = slim.fully_connected(
                net, 10,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=slim.l2_regularizer(0.0005),
                scope='finetune/fc4')
    return net
def losses(logits, labels, train_summary):
    """Softmax cross-entropy loss averaged over the batch.

    Args:
        logits: [N, 10] unnormalized class scores.
        labels: [N, 10] one-hot target labels.
        train_summary: list of summary ops; a 'loss' scalar summary is
            appended to it in place (side effect).

    Returns:
        Scalar loss tensor.
    """
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
    train_summary.append(tf.summary.scalar('loss', loss))
    return loss
def accuracy(prediction, labels, train_summary, valid_summary):
    """Batch accuracy computed from logits against one-hot labels.

    Appends an 'acc' scalar summary to both summary lists (side effect)
    and returns the accuracy tensor. (Bug fix: the original returned
    None, so the caller's `acc = accuracy(...)` was always None; callers
    that ignored the result keep working.)
    """
    probs = tf.nn.softmax(prediction)
    correct = tf.equal(tf.argmax(probs, 1), tf.argmax(labels, 1))
    # Local renamed so it no longer shadows this function's name.
    acc = tf.reduce_mean(tf.cast(correct, 'float'))
    train_summary.append(tf.summary.scalar('acc', acc))
    valid_summary.append(tf.summary.scalar('acc', acc))
    return acc
def optimize(losses):
    """Create the SGD train op with an exponentially decayed learning rate.

    Relies on the module-level `global_step` variable (defined before this
    function is called) both for the decay schedule and so that minimize()
    increments it on every training step.

    Args:
        losses: scalar loss tensor to minimize.

    Returns:
        The training op.
    """
    # Decay step of 10000 is hard-coded here; NOTE(review): the module
    # constant decay_per is unused -- confirm which value was intended.
    learning_rate = tf.train.exponential_decay(
        lr, global_step, 10000, decay_rate, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    return optimizer.minimize(losses, global_step=global_step)
#image=tf.placeholder(tf.uint8,[])
# --- Build the training graph ---
# Reset so all variables below live in one fresh default graph.
tf.reset_default_graph()
# global_step counts optimizer steps; used by the lr-decay schedule.
global_step = tf.Variable(0,trainable=False)
# num_epochs+5 gives the input queue slack so it is not exhausted early.
filename_queue=tf.train.string_input_producer(['train.tfrecords'],num_epochs=num_epochs+5)
image,label=read_and_decode(filename_queue)
prediction=infer(image)
loss=losses(prediction,label,train_summary)
train_op=optimize(loss)
# NOTE(review): accuracy() has no return statement, so acc is None here.
acc=accuracy(prediction,label,train_summary,valid_summary)
print "training started"
# One merged summary op per split so they can be written separately.
train_merged=tf.summary.merge(train_summary)
valid_merged=tf.summary.merge(valid_summary)
with tf.Session() as sess:
    # Write summaries of each run to a distinct directory, otherwise
    # TensorBoard mixes curves from different runs into one plot.
    train_writer=tf.summary.FileWriter('./train/7.12.3.30',sess.graph)
    #valid_writer=tf.summary.FileWriter('./valid',sess.graph)
    init_op=tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()
                     )
    # Restore the pretrained weights for variables under 'vgg_16' only;
    # layers in the 'finetune/' scopes keep their fresh initialization.
    restore=slim.assign_from_checkpoint_fn(
        '../kaggletest/vgg_16.ckpt',
        slim.get_model_variables('vgg_16')
        )
    # Initialize everything first, then overwrite the trunk variables
    # from the checkpoint.
    sess.run(init_op)
    restore(sess)
    # Start the queue-runner threads that feed shuffle_batch.
    coord=tf.train.Coordinator()
    threads=tf.train.start_queue_runners(sess=sess,coord=coord)
    for e in range(num_epochs):
        avg_loss,acc=0,0
        for i in range(num_iter):
            # global_step is fetched to timestamp the summary entry.
            _,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])
            train_writer.add_summary(summaries,step)
            if i%50==0:
                print "iterator '%d' epochs %d" % (step,num_epochs)
    # Shut the input threads down cleanly before saving.
    coord.request_stop()
    coord.join(threads)
    print 'Train Done'
    saver=tf.train.Saver(slim.get_model_variables())
    saver.save(sess,'model.ckpt')
    sess.close()
# --- Inference: rebuild the graph and predict the test set ---
# Fresh graph so the training ops/variables do not collide.
tf.reset_default_graph()
im_placeholder=tf.placeholder(tf.uint8,[None,im_height,im_width,3])
# is_training=False requests inference-mode behavior from the network.
logits=infer(im_placeholder,is_training=False)
prediction=tf.nn.softmax(logits)
predicted_labels=tf.argmax(prediction,1)
# Output CSV header: image name plus one probability column per class.
array=['img','c0','c1','c2','c3','c4','c5','c6','c7','c8','c9']
tmppred=[]
with tf.Session() as sess:
    saver=tf.train.Saver()
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    # Load the weights saved at the end of training.
    saver.restore(sess,'model.ckpt')
    for i, img_path in enumerate(test_images):
        #print "\rProcessing %d/%d" % (i+1,len(test_images))
        a=[]
        img=Image.open(os.path.join(test_imgdir,img_path))
        img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))
        # Feed one image at a time as a batch of size 1.
        prob=sess.run(prediction,feed_dict={im_placeholder:np.expand_dims(img,axis=0)})
        prob=prob.tolist()
        imgname=(img_path.split('/')[-1])
        a.append(imgname)
        # One submission row: [image name, p(c0), ..., p(c9)].
        a.extend(prob[0])
        tmppred.append(a)
        #test_labels.invasive[idx]=prob[0][1]
    filename_output='predictionVGG.csv'
    pd_data = pandas.DataFrame(tmppred,columns=array)
    pd_data.to_csv(filename_output,index=False)
    print "Writing result to ",filename_output
    sess.close()
首先第一坑:使用tfrecords的话那one-hot-coding怎么办???
小仙女没有找到博客讲啊啊啊,小仙女也没找到文档啊啊啊,经过不断的尝试。。试出来了。请看
for i in range(len(train_lists['img'])):
imgname,classname=train_lists['img'][i],train_lists['classname'][i],
img=Image.open(os.path.join(train_imgdir,classname,imgname))
img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))
label=int(classname[1])#图片按类分别存在不同文件夹下,文件夹名称:c0,c1,c2...
example=tf.train.Example(features=tf.train.Features(feature={
'image_raw':_bytes_feature(img.tostring()),
'label':_int64_feature(label)
}))
writer_train.write(example.SerializeToString())
writer_train.close()
存到tfrecords里的时候还没有采用onehotcoding
然后,在读tfrecords时,请看:
def read_and_decode(filename_queue):
reader=tf.TFRecordReader()
_,serialized_example=reader.read(filename_queue)
features=tf.parse_single_example(
serialized_example,
features={
'image_raw':tf.FixedLenFeature([],tf.string),
'label':tf.FixedLenFeature([],tf.int64)
}
)
image=tf.decode_raw(features['image_raw'],tf.uint8)
image=tf.reshape(image,[im_height,im_width,3])
label=tf.cast(features['label'],tf.int64)
images,labels=tf.train.shuffle_batch([image,label],
batch_size=batch_size,capacity=256,min_after_dequeue=64,num_threads=2
)
tlabels=tf.one_hot(labels,10,1.0,0.0)#tensor类型的label可以直接使用tf.one_hot()函数变成onehotcoding
return images,tlabels
第二坑 cross validation 到底要怎样啦
此大坑包含两个小坑
我要怎样summary啊
关于这一点,可以这样
train_summary=[]
valid_summary=[]
然后分别append不同的scalar或者histogram
train_summary.append(tf.summary.scalar('acc',accuracy))
valid_summary.append(tf.summary.scalar('acc',accuracy))
然后
train_merged=tf.summary.merge(train_summary)
valid_merged=tf.summary.merge(valid_summary)
然后记录的时候
_,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])#可以写不同的merged
train_writer.add_summary(summaries,step)
step要怎么计算啦,还有sess.run(valid_merged)的时候究竟会不会读下一组batch啊
首先,tf 里面可以定义一个 global_step 变量,optimizer.minimize() 每执行一次训练 step(即每读一个 batch)就会把它加一,用来记录当前训练到了第几步
global_step = tf.Variable(0,trainable=False)
这个值tensorflow会自己记录下来,需要用的时候可以直接用
def optimize(losses):
learning_rate=tf.train.exponential_decay(lr,global_step,
10000,decay_rate,
staircase=True
)
#optimizer=tf.train.MomentumOptimizer(learning_rate,
#0.9)
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
#optimizer=tf.train.AdamOptimizer(learning_rate)
train_op=optimizer.minimize(losses,global_step=global_step)
return train_op
或者
_,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])
train_writer.add_summary(summaries,step)
if i%50==0:
print "iterator %d epochs %d" % (step,num_epochs)
所以由结果显示,在单独sess.run(valid_merged)时,并不会读下一个batch。我是初学者啊,并不知道为什么。实验结果就是这样。所以crossvalidation还是老老实实整两个tfrecords
第三坑 我用tensorboard画出来的图为什么不对啊啊啊为什么loss和acc的曲线都是乱七八糟的横坐标纵坐标对不上啊啊啊啊
关于这一点。。过了很久我才悟出来。。不同次训练得到的summary要存在不同的文件夹下。。。不要好多次训练得到的结果都放在同一文件夹下。。这样tensorboard会把这一堆结果都当成同一个变量画在同一个表格里。。。。全乱了。
这个不算坑吧,slim finetune的秘密全部隐藏在这里!
restore=slim.assign_from_checkpoint_fn(
'../kaggletest/vgg_16.ckpt',#这个是存放别人调好的模型参数的文件
slim.get_model_variables('vgg_16')#这个是选择要使用训练好的数据初始化的层。vgg_16这个是在构建网络的时候给那一部分网络取得名字,这里面对应的那几层将用这个文件里的参数初始化。其他的层将进行finetune
)
还有一点很重要,有一句话要记得加
tf.reset_default_graph()
这句看一下代码里都在什么地方,要保证所有变量在同一个graph里
下面是第一次实验得到的一些教训
如果acc和loss曲线不收敛,可以:
换一个optimizer试一试
改一下learning rate试一试
learning rate对最后的结果影响很大,调的太大有可能不收敛,结果跳啊跳啊跳,跳的太小学不到东西
而optimizer的话是不懂的,只有没被注释掉的那一行可以收敛。但是查了一下大部分任务用gradientdescent是挺好的。收敛以后精确度比adam略高。
#optimizer=tf.train.MomentumOptimizer(learning_rate,
#0.9)
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
#optimizer=tf.train.AdamOptimizer(learning_rate)
如果loss和acc波动很大,可以:
把batch_size增大
如果提示显存不足
换块显卡吧
电脑显卡太辣鸡啊,3g显存啥都干不了了啊啊啊,vgg16的网络 batchsize16已经是极限了。。训练的曲线波动好大好大。。内心血与泪