Tensorflow基于卷积神经网络的向导程序的改进

    最近研究了一下图像识别,在Tensorflow里面有一个向导程序,是用卷积网络来对CIFAR-10的图像做类别判断的。原程序写得比较早,整个结构比较绕,用了很多函数进行封装,在具体阅读程序的时候,需要经常跳到不同的函数模块里面来读代码,可读性不是太好。因此,我参照原程序的理念,重新写了一遍,主要的区别是在数据的读取方面采用了Dataset来进行封装,另外在网络的训练和验证方面也做了一些改动。个人认为重写后的程序可读性更好,也更加简洁。在此我把改写后的程序记录在这里,方便有兴趣研究Tensorflow的朋友参考。另外我觉得自己动手重写一下代码,也能更好地加深理解,是一个不错的学习方式。下一步我的计划是在这个基础上,继续研究不同的网络架构是否能带来更好的性能,例如采用ResNet来进行识别。

    下面我将介绍一下重写后的程序,一共包括三个程序文件,cifar10_train, cifar10_test, cifar10_model。其中cifar10_train是对训练数据进行学习,cifar10_test是对测试数据进行检验,cifar10_model是用于构造神经网络模型,给cifar10_train和cifar10_test调用。

     cifar10_model文件的内容如下,基本上是和原程序一样的,只是在各层的variable_scope里面增加了reuse=tf.AUTO_REUSE,这样在训练的过程中可以方便地复用同一组参数来对测试数据进行检验:

import tensorflow as tf
def _activation_summary(x):
    """Register TensorBoard summaries for a layer's activations.

    Two summaries are created, both tagged with the name of the op that
    produced ``x``: a histogram of the activation values and a scalar with
    the fraction of activations that are exactly zero (sparsity).

    Args:
      x: Tensor
    Returns:
      nothing
    """
    op_name = x.op.name
    histogram_tag = op_name + '/activations'
    sparsity_tag = op_name + '/sparsity'
    tf.summary.histogram(histogram_tag, x)
    tf.summary.scalar(sparsity_tag, tf.nn.zero_fraction(x))
    
def _variable_on_cpu(name, shape, initializer):
    """Get a float32 variable through tf.get_variable, placed on the CPU.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable

    Returns:
      Variable Tensor
    """
    # The device scope pins the variable to '/cpu:0'.
    with tf.device('/cpu:0'):
        return tf.get_variable(
            name, shape, initializer=initializer, dtype=tf.float32)

def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal-initialized variable, optionally L2-penalized.

    The variable is obtained through _variable_on_cpu. When ``wd`` is given,
    ``wd * l2_loss(var)`` is appended to the 'losses' graph collection so the
    caller can sum it into the total loss.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of a truncated Gaussian
      wd: add L2Loss weight decay multiplied by this float. If None, weight
          decay is not added for this Variable.

    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(stddev=stddev,
                                                  dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)

    # No decay requested: nothing to register in the 'losses' collection.
    if wd is None:
        return var

    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var

# Number of output categories; sizes the final linear layer. It must live in
# this module: inference() looks the name up in its own module globals, so a
# definition in the train/test scripts is not visible here.
NUM_CLASSES = 10


#Define the model
def inference(images):
    """Build the CIFAR-10 model.

    The graph is conv1 -> pool1 -> norm1 -> conv2 -> norm2 -> pool2 ->
    local3 -> local4 -> softmax_linear. Every variable scope is opened with
    reuse=tf.AUTO_REUSE, so calling this function more than once in the same
    graph shares one set of weights between the calls.

    Args:
      images: 4-D float image batch with 3 channels in the last dimension
        (the first kernel is [5, 5, 3, 64]). The batch dimension must be
        statically known because it is read from the static shape to flatten
        the feature maps before the fully connected layers.

    Returns:
      Logits: unscaled scores with NUM_CLASSES entries per image.
    """
    # conv1
    with tf.variable_scope('conv1', reuse=tf.AUTO_REUSE) as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=5e-2,
                                             wd=None)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2', reuse=tf.AUTO_REUSE) as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=None)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2 (note: normalization precedes pooling here, the reverse of
    # the conv1 stage)
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3', reuse=tf.AUTO_REUSE) as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        # The static batch size keeps the flattened width statically known,
        # which is needed to size the weight matrix below.
        batch = images.get_shape().as_list()[0]
        flattened = tf.reshape(pool2, [batch, -1])
        dim = flattened.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(flattened, weights) + biases,
                            name=scope.name)
        _activation_summary(local3)

    # local4
    with tf.variable_scope('local4', reuse=tf.AUTO_REUSE) as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
        _activation_summary(local4)

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear', reuse=tf.AUTO_REUSE) as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1/192.0, wd=None)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear

    cifar10_train文件内容如下,这个程序主要完成几个工作:

  1. 构建数据输入队列,采用Dataset来进行封装
  2. 对每次输入的数据进行处理,即对图像进行裁剪,调整对比度等处理
  3. 定义训练的超参数,例如学习速率等等
  4. 每次训练一定步数后,输出模型的误差值,以及在测试数据上的识别正确数

     以下是程序代码:

import tensorflow as tf
import numpy as np
import os
import cifar10_model

# Paths of the five CIFAR-10 training batch files (data_batch_1..5.bin).
# range() is used instead of xrange() so the script also runs on Python 3;
# on Python 2 the resulting list is identical.
folderPath = 'cifar-10-batches-bin/'
filenames = [os.path.join(folderPath, 'data_batch_%d.bin' % i) for i in range(1, 6)]

# Layout of one record in the binary files: 1 label byte followed by the
# image bytes (depth * height * width).
imageWidth = 32
imageHeight = 32
imageDepth = 3
label_bytes = 1

# Train and test batch sizes
batch_size = 100
test_batch_size = 100

# Bytes per image and per full record (label + image)
image_bytes = imageWidth * imageHeight * imageDepth
record_bytes = label_bytes + image_bytes

# Training input: fixed-length records, shuffled with a 50000-element
# buffer, grouped into batches, and repeated 200 times. The training loop
# ends when this iterator is exhausted.
dataset = tf.data.FixedLengthRecordDataset(filenames, record_bytes)
dataset = dataset.shuffle(50000)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat(200)
iterator = dataset.make_initializable_iterator()

# Test input: a single pass over test_batch.bin in file order. The iterator
# is re-initialized each time a new evaluation pass is needed.
testfilename = os.path.join(folderPath, 'test_batch.bin')
testdataset = tf.data.FixedLengthRecordDataset(testfilename, record_bytes)
testdataset = testdataset.batch(test_batch_size)
testiterator = testdataset.make_initializable_iterator()

# Permutation that moves the channel axis last: the reshape below yields
# [batch, depth, height, width]; the transpose turns that into
# [batch, height, width, depth].
channels_last_perm = [0, 2, 3, 1]

# ---- Training records ----
# Pull one batch of raw training records and view each one as uint8 bytes.
record = iterator.get_next()
record_decoded_bytes = tf.decode_raw(record, tf.uint8)

# Byte 0 of every record is the label; it is kept with shape [batch_size, 1].
train_label_bytes = tf.slice(record_decoded_bytes, [0, 0], [batch_size, 1])
record_labels = tf.cast(train_label_bytes, tf.int32)

# The remaining image_bytes bytes are the pixels.
train_pixel_bytes = tf.slice(record_decoded_bytes, [0, 1], [batch_size, image_bytes])
train_depth_major = tf.reshape(train_pixel_bytes,
                               [batch_size, imageDepth, imageHeight, imageWidth])
train_channels_last = tf.transpose(train_depth_major, channels_last_perm)
record_images = tf.cast(train_channels_last, tf.float32)

# ---- Test records ----
# Same decoding for one batch of raw test records.
testrecord = testiterator.get_next()
testrecord_decoded_bytes = tf.decode_raw(testrecord, tf.uint8)

# Test labels are additionally flattened to a 1-D vector.
test_label_bytes = tf.slice(testrecord_decoded_bytes, [0, 0], [test_batch_size, 1])
test_label_ints = tf.cast(test_label_bytes, tf.int32)
testrecord_labels = tf.reshape(test_label_ints, [-1])

# Test pixels: cast to float first, then reshape and move channels last.
test_pixel_bytes = tf.slice(testrecord_decoded_bytes, [0, 1], [test_batch_size, image_bytes])
test_pixel_floats = tf.cast(test_pixel_bytes, tf.float32)
test_depth_major = tf.reshape(test_pixel_floats,
                              [test_batch_size, imageDepth, imageHeight, imageWidth])
testrecord_images = tf.transpose(test_depth_major, channels_last_perm)

# Randomly crop the training batch to cropHeight x cropWidth. random_crop is
# applied to the whole batch tensor in one call.
cropHeight = 24
cropWidth = 24
distorted_images = tf.random_crop(record_images, size = [batch_size, cropHeight, cropWidth, 3])

# The remaining distortions are applied image by image, so the batch is
# unstacked, processed and stacked again. The list is iterated directly
# rather than through xrange(len(...)), which does not exist on Python 3.
augmented_images = []
for image in tf.unstack(distorted_images):
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=63)
    image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
    augmented_images.append(tf.image.per_image_standardization(image))
distorted_images = tf.stack(augmented_images)

# Test images get no random distortion: crop/pad to the training crop size,
# then standardize each image.
testrecord_images = tf.image.resize_image_with_crop_or_pad(testrecord_images, cropHeight, cropWidth)
testrecord_images = tf.stack(
    [tf.image.per_image_standardization(image)
     for image in tf.unstack(testrecord_images)])

# Log the original and distorted training images so they can be inspected
# in TensorBoard.
with tf.name_scope('original'):
    tf.summary.image('images', record_images, max_outputs=10)
with tf.name_scope('distorted'):
    tf.summary.image('images', distorted_images, max_outputs=10)

NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.

global_step = tf.train.get_or_create_global_step()

# Variables that affect learning rate.
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                global_step,
                                decay_steps,
                                LEARNING_RATE_DECAY_FACTOR,
                                staircase=True)

# Logits of the model on the distorted training batch
result = cifar10_model.inference(distorted_images)

# Cross-entropy loss. tf.losses.* registers its result in the 'losses'
# collection (tf.GraphKeys.LOSSES) by default; since the named mean is added
# to that collection explicitly below, the automatic registration is turned
# off. Otherwise the cross entropy would be summed twice into total_loss.
cross_entropy = tf.losses.sparse_softmax_cross_entropy(
    labels=record_labels, logits=result, loss_collection=None)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)

# Total loss = cross entropy + the L2 weight-decay terms the model added to
# the 'losses' collection. This must be built before inference() is called
# again for the test images, because that call appends its own weight-decay
# terms to the same collection.
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

# Plain SGD. Passing global_step makes every update increment it; without
# that the step stays at 0 and the exponential decay above never happens.
opt_op = tf.train.GradientDescentOptimizer(lr).minimize(loss, global_step=global_step)

# Maintain exponential moving averages of the trainable variables; the
# averages are updated after each gradient step.
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
with tf.control_dependencies([opt_op]):
    optimize_op = ema.apply(tf.trainable_variables())

# Create the saver after ema.apply() so the moving-average shadow variables
# are part of what gets checkpointed.
saver = tf.train.Saver()

# Predicted class index for each test image. The model shares the training
# weights through AUTO_REUSE.
testresult = tf.argmax(cifar10_model.inference(testrecord_images), axis=1)

# Create the session, initialize the variables and the training iterator
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(iterator.initializer)

# Merge all summaries defined in the graph and open the event-file writer
summary_op = tf.summary.merge_all()
train_filewriter = tf.summary.FileWriter('train/', sess.graph)

# Train until the (repeated) training dataset is exhausted. Every 100 steps
# the loss is printed and summaries are written; every 1000 steps a
# checkpoint is saved and the whole test set is scored.
# print(...) with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
step = 0
try:
    while True:
        try:
            lossValue, _ = sess.run([loss, optimize_op])
            if step % 100 == 0:
                print("step %i Loss: %f" % (step, lossValue))
                # The merged summaries appear to read from the test iterator
                # as well (activation summaries of the test-time model), so
                # it is initialized first. This run also pulls one extra
                # training batch that is not trained on.
                sess.run(testiterator.initializer)
                summary = sess.run(summary_op)
                train_filewriter.add_summary(summary, step)
            if step % 1000 == 0:
                truepredictNum = 0
                # Restart the test iterator for a full evaluation pass
                sess.run(testiterator.initializer)
                saver.save(sess, 'model/my-model', global_step=step)
                while True:
                    try:
                        predictValue, testValue = sess.run([testresult, testrecord_labels])
                        truepredictNum += np.sum(predictValue==testValue)
                    except tf.errors.OutOfRangeError:
                        # Test set exhausted: report the number of hits
                        print("test correct num: %i" % (truepredictNum))
                        break
            step += 1
        except tf.errors.OutOfRangeError:
            # Training data exhausted: stop training
            break
finally:
    # Release the event file and the session even if training is interrupted
    train_filewriter.close()
    sess.close()

     具体训练的时候,可以看到大约在训练了100000个Batch之后,对于10000张测试图片的准确识别数量大约在8100左右。训练得到的模型参数以及参数的移动平均值都会被保存到文件中。随后我们可以在cifar10_test程序中用参数的移动平均值来载入模型,对测试图片进行检验,最终的准确识别数量大约为8500左右,和原程序提到的0.86的准确率非常接近。同时我们也可以看到,采用参数移动平均值的处理方式可以进一步提高模型的性能。

     cifar10_test程序的代码如下:

import os

import numpy as np
import tensorflow as tf

import cifar10_model

# Folder holding the CIFAR-10 binary files. It has to be defined in this
# script too, since nothing else here provides it; the value matches the
# training script.
folderPath = 'cifar-10-batches-bin/'

# Layout of one record: 1 label byte followed by depth * height * width
# image bytes.
imageWidth = 32
imageHeight = 32
imageDepth = 3
label_bytes = 1
NUM_CLASSES = 10  # not referenced further in this script as shown

# Test batch size
test_batch_size = 100

# Bytes per image and per full record (label + image)
image_bytes = imageWidth * imageHeight * imageDepth
record_bytes = label_bytes + image_bytes

# Single pass over test_batch.bin in batches of test_batch_size
testfilename = os.path.join(folderPath, 'test_batch.bin')
testdataset = tf.data.FixedLengthRecordDataset(testfilename, record_bytes)
testdataset = testdataset.batch(test_batch_size)
testiterator = testdataset.make_initializable_iterator()

# Decode one batch of raw test records into uint8 bytes
testrecord = testiterator.get_next()
testrecord_decoded_bytes = tf.decode_raw(testrecord, tf.uint8)

# Byte 0 of each record is the label; flatten to a 1-D int32 vector
testrecord_labels = tf.slice(testrecord_decoded_bytes, [0, 0], [test_batch_size, 1])
testrecord_labels = tf.cast(testrecord_labels, tf.int32)
testrecord_labels = tf.reshape(testrecord_labels, [-1])

# The remaining bytes are the pixels: reshape to
# [batch, depth, height, width], then move the channel axis last.
testrecord_images = tf.slice(testrecord_decoded_bytes, [0, 1], [test_batch_size, image_bytes])
testrecord_images = tf.cast(testrecord_images, tf.float32)
testrecord_images = tf.reshape(testrecord_images, 
                               [test_batch_size, imageDepth, imageHeight, imageWidth])
testrecord_images = tf.transpose(testrecord_images, [0, 2, 3, 1])

# Crop/pad to the crop size that was used during training
cropHeight = 24
cropWidth = 24
testrecord_images = tf.image.resize_image_with_crop_or_pad(testrecord_images, cropHeight, cropWidth)

# Standardize image by image: unstack the batch, standardize, stack again.
# The list is iterated directly, which avoids the Python-2-only xrange().
testrecord_images = tf.stack(
    [tf.image.per_image_standardization(image)
     for image in tf.unstack(testrecord_images)])

# Run the model on the test images and take the index of the largest logit
# as the predicted class.
testresult = tf.argmax(cifar10_model.inference(testrecord_images), axis=1)

# Build a saver that loads the moving-average values into the model
# variables: variables_to_restore() supplies the name mapping handed to the
# Saver.
variable_averages = tf.train.ExponentialMovingAverage(decay=0.9999)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)

with tf.Session() as sess:
    sess.run(testiterator.initializer)
    # Checkpoint to evaluate; change the suffix to pick another training step
    saver.restore(sess, 'model/my-model-95000')
    truepredictNum = 0
    # Score batches until the test iterator is exhausted, then report the
    # total. print(...) with one pre-formatted string behaves the same on
    # Python 2 and Python 3.
    while True:
        try:
            predictValue, testValue = sess.run([testresult, testrecord_labels])
            truepredictNum += np.sum(predictValue==testValue)
        except tf.errors.OutOfRangeError:
            print("test correct num: %i" % (truepredictNum))
            break


你可能感兴趣的:(人工智能,机器学习)