tensorflow试用踩坑

第一次使用tensorflow 很坑 以此记录一些我自己悟出来的门道
概况:样本共有10类,文件夹的安排是这样的,一个train文件夹下放所有的训练图片,然后train下面有10个不同的文件夹,命名为c0、c1、c2…..代表种类,数据存tfrecords,使用slim,vgg16的网络,tensorboard画曲线。
整体代码:

import os

import pandas
import numpy as np
from PIL import Image

import tensorflow as tf
from tensorflow.contrib.slim.nets import vgg

# Shorthand for the TF-Slim API (TF 1.x contrib).
slim=tf.contrib.slim
# Output file for the serialized training examples.
tfrecords_filename='train.tfrecords'

# All images are resized to 224x224, the input size VGG16 expects.
im_width=224
im_height=224

# Training hyperparameters.
batch_size=16
num_epochs=10
lr=0.0001
decay_rate=0.1
decay_per=40  # NOTE(review): defined but never used below.

# Dataset layout: train/ holds one subfolder per class (c0..c9).
train_imgdir='/home/rsq/dataset/kaggledriver/imgs/train/'
test_imgdir='/home/rsq/dataset/kaggledriver/imgs/test/'
# CSV listing (classname, img) pairs for the training set.
train_lists=pandas.read_csv('/home/rsq/dataset/kaggledriver/driver_imgs_list.csv')

test_images=os.listdir(test_imgdir)
# NOTE(review): this path says 'kaggle' while the others say 'kaggledriver'
# -- looks like a typo; confirm the intended location.  test_labels is also
# never used afterwards.
test_labels=pandas.read_csv('/home/rsq/dataset/kaggle/sample_submission.csv')
# Batches per epoch.  Python 2 integer division, so the remainder batch
# is silently dropped.
num_iter=len(train_lists)/batch_size

# Summary ops are collected into these lists and merged separately so
# training and validation can be written to different event files.
train_summary=[]
valid_summary=[]

def _bytes_feature(value):
    """Wrap a raw byte string in a tf.train.Feature holding a BytesList."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    """Wrap an int (or list of ints) in a tf.train.Feature holding an Int64List.

    A bare integer is promoted to a one-element list, since Int64List
    requires a sequence.
    """
    # isinstance instead of the original `type(value) != list` comparison:
    # also accepts list subclasses and is the idiomatic type check.
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

# Serialize every training image + integer label into train.tfrecords.
# NOTE(review): ALL of the code below (graph construction, training and
# inference) is nested inside this `else` branch, so nothing runs when the
# tfrecords file already exists -- almost certainly unintended; only the
# conversion step should be guarded by this check.
if os.path.exists(tfrecords_filename):
    print tfrecords_filename,'already exists'
else:
    writer_train=tf.python_io.TFRecordWriter(tfrecords_filename)
    # NOTE(review): "scaning" is a typo in this runtime message ("scanning").
    print "scaning preprocessed file to '%s'" % tfrecords_filename
    for i in range(len(train_lists['img'])):
        # Each CSV row gives the file name and its class folder (c0..c9).
        imgname,classname=train_lists['img'][i],train_lists['classname'][i],
        img=Image.open(os.path.join(train_imgdir,classname,imgname))
        # Resize to the network input size before serializing raw bytes.
        img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))

        # Class id is the digit in the folder name, e.g. 'c3' -> 3.
        # One-hot encoding happens later, at read time (tf.one_hot).
        label=int(classname[1])

        example=tf.train.Example(features=tf.train.Features(feature={
                'image_raw':_bytes_feature(img.tostring()),
                'label':_int64_feature(label)
                }))
        writer_train.write(example.SerializeToString())
    writer_train.close()

    print ('preprocessing done')

    def read_and_decode(filename_queue):
        """Read one serialized Example from the queue and return a shuffled
        batch of (uint8 images, one-hot float labels of depth 10)."""
        record_reader = tf.TFRecordReader()
        _, serialized = record_reader.read(filename_queue)

        parsed = tf.parse_single_example(
            serialized,
            features={
                'image_raw': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
            })

        # Raw bytes back into an H x W x 3 uint8 image tensor.
        decoded = tf.decode_raw(parsed['image_raw'], tf.uint8)
        decoded = tf.reshape(decoded, [im_height, im_width, 3])
        class_id = tf.cast(parsed['label'], tf.int64)

        images, labels = tf.train.shuffle_batch(
            [decoded, class_id],
            batch_size=batch_size,
            capacity=256,
            min_after_dequeue=64,
            num_threads=2)

        # Batched scalar class ids -> one-hot rows; this is where the
        # one-hot encoding happens (not at tfrecord-writing time).
        return images, tf.one_hot(labels, 10, 1.0, 0.0)

    def infer(inputs, is_training=True):
        """Build the VGG16 network and return 10-class logits.

        The conv trunk is built under variable scope "vgg_16" so its weights
        match the published vgg_16.ckpt and can be restored with slim; the
        fully connected head (finetune/fc1..fc4) is trained from scratch.

        Args:
            inputs: image batch [batch, im_height, im_width, 3], uint8 or
                float; pixel values assumed in 0..255 -- TODO confirm.
            is_training: enables dropout in the FC head.  Bug fix: the
                original accepted this flag but never used it, so dropout
                stayed active at inference time.

        Returns:
            Unscaled logits tensor of shape [batch, 10].
        """
        inputs = tf.cast(inputs, tf.float32)
        # Scale pixels into [0, 1].  tf.multiply instead of the original
        # np.multiply: graph tensors should be transformed with TF ops.
        inputs = tf.multiply(inputs, 1.0 / 255.0)
        with tf.variable_scope("vgg_16"):
            with slim.arg_scope(vgg.vgg_arg_scope()):
                net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='pool1')
                net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], scope='pool2')
                net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
                net = slim.max_pool2d(net, [2, 2], scope='pool3')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
                net = slim.max_pool2d(net, [2, 2], scope='pool4')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
                net = slim.max_pool2d(net, [2, 2], scope='pool5')

        # Freshly initialized FC head; dropout now honours is_training.
        net = slim.flatten(net)
        net = slim.fully_connected(net, 4096,
                                   weights_initializer=tf.contrib.layers.xavier_initializer(),
                                   weights_regularizer=slim.l2_regularizer(0.0005),
                                   scope='finetune/fc1')
        net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout1')

        net = slim.fully_connected(net, 4096,
                                   weights_initializer=tf.contrib.layers.xavier_initializer(),
                                   weights_regularizer=slim.l2_regularizer(0.0005),
                                   scope='finetune/fc2')
        net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout2')

        net = slim.fully_connected(net, 1000,
                                   weights_initializer=tf.contrib.layers.xavier_initializer(),
                                   weights_regularizer=slim.l2_regularizer(0.0005),
                                   scope='finetune/fc3')
        net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout3')

        net = slim.fully_connected(net, 10,
                                   weights_initializer=tf.contrib.layers.xavier_initializer(),
                                   weights_regularizer=slim.l2_regularizer(0.0005),
                                   scope='finetune/fc4')
        return net

    def losses(logits, labels, train_summary):
        """Mean softmax cross-entropy between logits and one-hot labels.

        Side effect: appends a 'loss' scalar summary op to train_summary.
        """
        per_example = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)
        mean_loss = tf.reduce_mean(per_example)
        train_summary.append(tf.summary.scalar('loss', mean_loss))
        return mean_loss

    def accuracy(prediction, labels, train_summary, valid_summary):
        """Batch classification accuracy from logits vs one-hot labels.

        Side effect: appends an 'acc' scalar summary op to both summary
        lists.

        Returns:
            Scalar accuracy tensor.  Bug fix: the original computed the
            value but never returned it, so the caller's `acc` was None.
        """
        probs = tf.nn.softmax(prediction)
        correct = tf.equal(tf.argmax(probs, 1), tf.argmax(labels, 1))
        acc = tf.reduce_mean(tf.cast(correct, 'float'))
        train_summary.append(tf.summary.scalar('acc', acc))
        valid_summary.append(tf.summary.scalar('acc', acc))
        return acc

    def optimize(losses):
        """Build the SGD train op with exponentially decayed learning rate.

        Relies on the module-level `global_step` variable; minimize()
        increments it, which drives both the decay schedule and the
        summary x-axis.
        """
        learning_rate = tf.train.exponential_decay(
            lr, global_step, 10000, decay_rate, staircase=True)
        # Plain gradient descent converged best in the author's trials
        # (Momentum and Adam were tried and discarded).
        sgd = tf.train.GradientDescentOptimizer(learning_rate)
        return sgd.minimize(losses, global_step=global_step)
    #image=tf.placeholder(tf.uint8,[]) 
    # --- Graph construction for training ---------------------------------
    # Fresh graph so everything below lives in the same default graph.
    tf.reset_default_graph()
    # Global step counter; incremented by optimizer.minimize().
    global_step = tf.Variable(0,trainable=False)
    # num_epochs+5 gives the queue a few spare epochs so training never
    # hits OutOfRangeError before the loop below finishes.
    filename_queue=tf.train.string_input_producer(['train.tfrecords'],num_epochs=num_epochs+5)
    image,label=read_and_decode(filename_queue)

    prediction=infer(image)
    loss=losses(prediction,label,train_summary)
    train_op=optimize(loss)
    # NOTE(review): accuracy() returns None (no return statement), so this
    # only registers the summary ops; `acc` itself is unusable.
    acc=accuracy(prediction,label,train_summary,valid_summary)
    print "training started"

    # Separate merged summary ops for train and validation curves.
    train_merged=tf.summary.merge(train_summary)
    valid_merged=tf.summary.merge(valid_summary)

    with tf.Session() as sess:
        #global_step = tf.Variable(0)

        # Each run should write to its own subdirectory, otherwise
        # TensorBoard mixes the curves from different runs.
        train_writer=tf.summary.FileWriter('./train/7.12.3.30',sess.graph)
        #valid_writer=tf.summary.FileWriter('./valid',sess.graph)
        # Local variables are needed for the num_epochs queue counter.
        init_op=tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()
                         )
        # Restore only the conv-trunk variables under scope "vgg_16" from
        # the published checkpoint; the finetune/* head keeps its fresh
        # initialization.  Must run AFTER init_op or init would clobber it.
        restore=slim.assign_from_checkpoint_fn(
                '../kaggletest/vgg_16.ckpt',
                slim.get_model_variables('vgg_16')
                )
        sess.run(init_op)

        restore(sess)
        # Start the input-pipeline queue runner threads.
        coord=tf.train.Coordinator()
        threads=tf.train.start_queue_runners(sess=sess,coord=coord)

        for e in range(num_epochs):
            # NOTE(review): avg_loss/acc are reset here but never updated,
            # and the fetched loss value is discarded below.
            avg_loss,acc=0,0
            for i in range(num_iter):
                _,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])
                train_writer.add_summary(summaries,step)
                if i%50==0:
                    # NOTE(review): prints the constant num_epochs, not the
                    # current epoch `e` -- looks like a bug.
                    print "iterator  '%d'  epochs    %d" % (step,num_epochs)                 
        coord.request_stop()
        coord.join(threads)
        print 'Train Done'
        # Save only the model variables (conv trunk + FC head) for reuse
        # by the inference graph below.
        saver=tf.train.Saver(slim.get_model_variables())
        saver.save(sess,'model.ckpt')
        # NOTE(review): redundant -- the `with` block already closes sess.
        sess.close()

    # --- Inference graph: rebuild the network fed by a placeholder -------
    # New graph: the queue-based training graph is discarded and the same
    # architecture is rebuilt on a feed_dict placeholder.
    tf.reset_default_graph()
    im_placeholder=tf.placeholder(tf.uint8,[None,im_height,im_width,3])

    # NOTE(review): is_training=False is passed but infer() ignores the
    # flag, so dropout remains active during prediction.
    logits=infer(im_placeholder,is_training=False)

    prediction=tf.nn.softmax(logits)

    # NOTE(review): predicted_labels is built but never used; the CSV
    # below stores the full probability vector instead.
    predicted_labels=tf.argmax(prediction,1)

    # Output CSV header: image name + one probability column per class.
    array=['img','c0','c1','c2','c3','c4','c5','c6','c7','c8','c9']
    tmppred=[]   
    with tf.Session() as sess:
        saver=tf.train.Saver()
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        # Load the weights trained above.  Restoring after the init calls
        # ensures the checkpoint values win.
        saver.restore(sess,'model.ckpt')

        for i, img_path in enumerate(test_images):
            #print "\rProcessing %d/%d" % (i+1,len(test_images))
            a=[]

            img=Image.open(os.path.join(test_imgdir,img_path))
            img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))

            # Batch of one: add a leading batch axis for the placeholder.
            prob=sess.run(prediction,feed_dict={im_placeholder:np.expand_dims(img,axis=0)})
            prob=prob.tolist()

            # Row = [image filename, p(c0), ..., p(c9)].
            imgname=(img_path.split('/')[-1])
            a.append(imgname)
            a.extend(prob[0])
            tmppred.append(a)
            #test_labels.invasive[idx]=prob[0][1]

        # Write all predictions as a Kaggle-style submission CSV.
        filename_output='predictionVGG.csv'
        pd_data = pandas.DataFrame(tmppred,columns=array)
        pd_data.to_csv(filename_output,index=False)
        print "Writing result to ",filename_output
        # NOTE(review): redundant -- the `with` block already closes sess.
        sess.close()

首先第一坑:使用tfrecords的话那one-hot-coding怎么办???
小仙女没有找到博客讲啊啊啊,小仙女也没找到文档啊啊啊,经过不断的尝试。。试出来了。请看

    for i in range(len(train_lists['img'])):
        imgname,classname=train_lists['img'][i],train_lists['classname'][i],
        img=Image.open(os.path.join(train_imgdir,classname,imgname))
        img=np.array(img.resize((im_width,im_height),Image.ANTIALIAS))

        label=int(classname[1])#图片按类分别存在不同文件夹下,文件夹名称:c0,c1,c2...

        example=tf.train.Example(features=tf.train.Features(feature={
                'image_raw':_bytes_feature(img.tostring()),
                'label':_int64_feature(label)
                }))
        writer_train.write(example.SerializeToString())
    writer_train.close()

存到tfrecords里的时候还没有采用onehotcoding
然后,在读tfrecords时,请看:

    def read_and_decode(filename_queue):
        reader=tf.TFRecordReader()
        _,serialized_example=reader.read(filename_queue)

        features=tf.parse_single_example(
                serialized_example,
                features={
                        'image_raw':tf.FixedLenFeature([],tf.string),
                        'label':tf.FixedLenFeature([],tf.int64)
                        }
                )
        image=tf.decode_raw(features['image_raw'],tf.uint8)
        image=tf.reshape(image,[im_height,im_width,3])

        label=tf.cast(features['label'],tf.int64)

        images,labels=tf.train.shuffle_batch([image,label],
            batch_size=batch_size,capacity=256,min_after_dequeue=64,num_threads=2
                                             )

        tlabels=tf.one_hot(labels,10,1.0,0.0)#tensor类型的label可以直接使用tf.one_hot()函数变成onehotcoding
        return images,tlabels

第二坑 cross validation 到底要怎样啦

此大坑包含两个小坑
我要怎样summary啊
关于这一点,可以这样

train_summary=[]
valid_summary=[]

然后分别append不同的scalar或者histogram

train_summary.append(tf.summary.scalar('acc',accuracy))
valid_summary.append(tf.summary.scalar('acc',accuracy))

然后

train_merged=tf.summary.merge(train_summary)
valid_merged=tf.summary.merge(valid_summary)

然后记录的时候

_,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])#可以写不同的merged
                train_writer.add_summary(summaries,step)

step要怎么计算啦,还有sess.run(valid_merged)的时候究竟会不会读下一组batch啊
首先,tf里面有个globalstep是用来记录读了几次batch的

global_step = tf.Variable(0,trainable=False)

这个值tensorflow会自己记录下来,需要用的时候可以直接用

    def optimize(losses):
        learning_rate=tf.train.exponential_decay(lr,global_step,
                                                 10000,decay_rate,
                                                 staircase=True
                                                 )
        #optimizer=tf.train.MomentumOptimizer(learning_rate,
                                         #0.9)
        optimizer=tf.train.GradientDescentOptimizer(learning_rate)
        #optimizer=tf.train.AdamOptimizer(learning_rate)
        train_op=optimizer.minimize(losses,global_step=global_step)
        return train_op

或者

        _,_,summaries,step=sess.run([train_op,loss,train_merged,global_step])
        train_writer.add_summary(summaries,step)
        if i%50==0:
            print "iterator  %d  epochs    %d" % (step,num_epochs)

所以由结果显示,在单独sess.run(valid_merged)时,并不会读下一个batch。我是初学者啊,并不知道为什么。实验结果就是这样。所以crossvalidation还是老老实实整两个tfrecords
第三坑 我用tensorboard画出来的图为什么不对啊啊啊为什么loss和acc的曲线都是乱七八糟的横坐标纵坐标对不上啊啊啊啊
关于这一点。。过了很久我才悟出来。。不同次训练得到的summary要存在不同的文件夹下。。。不要好多次训练得到的结果都放在同一文件夹下。。这样tensorboard会把这一堆结果都当成同一个变量画在同一个表格里。。。。全乱了。
这个不算坑吧,slim finetune的秘密全部隐藏在这里!

restore=slim.assign_from_checkpoint_fn(
                '../kaggletest/vgg_16.ckpt',#这个是存放别人调好的模型参数的文件
                slim.get_model_variables('vgg_16')#这个是选择要使用训练好的数据初始化的层。vgg_16这个是在构建网络的时候给那一部分网络取得名字,这里面对应的那几层将用这个文件里的参数初始化。其他的层将进行finetune
                )

还有一点很重要,有一句话要记得加

tf.reset_default_graph()

这句看一下代码里都在什么地方,要保证所有变量在同一个graph里
下面是第一次实验得到的一些教训

如果acc和loss曲线不收敛,可以:
换一个optimizer试一试
改一下learning rate试一试
learning rate对最后的结果影响很大,调的太大有可能不收敛,结果跳啊跳啊跳,跳的太小学不到东西
而optimizer的话是不懂的,只有没被注释掉的那一行可以收敛。但是查了一下大部分任务用gradientdescent是挺好的。收敛以后精确度比adam略高。

#optimizer=tf.train.MomentumOptimizer(learning_rate,
                                         #0.9)
optimizer=tf.train.GradientDescentOptimizer(learning_rate)
#optimizer=tf.train.AdamOptimizer(learning_rate)

如果loss和acc波动很大,可以:
把batch_size增大
如果提示显存不足
换块显卡吧
电脑显卡太辣鸡啊,3g显存啥都干不了了啊啊啊,vgg16的网络 batchsize16已经是极限了。。训练的曲线波动好大好大。。内心血与泪

你可能感兴趣的:(tensorflow)