最近研究了一下图像识别。TensorFlow 里有一个教程程序,用卷积神经网络对 CIFAR-10 的图像进行分类。原程序写得比较早,整体结构比较绕,用了很多函数来封装,阅读时需要经常在不同的函数模块之间跳转,可读性不太好。因此,我参照原程序的思路重新写了一遍,主要的区别是在数据读取方面采用 Dataset 来封装,另外在网络的训练和验证方面也做了一些改动。个人认为重写后的程序可读性更好,也更简洁。在此我把改写后的程序记录下来,方便有兴趣研究 TensorFlow 的朋友参考。另外我觉得自己动手重写一遍代码,也能更好地加深理解,是一种不错的学习方式。下一步我计划在这个基础上,继续研究不同的网络架构是否能带来性能提升,例如采用 ResNet 来进行识别。
下面我将介绍一下重写后的程序,一共包括三个程序文件,cifar10_train, cifar10_test, cifar10_model。其中cifar10_train是对训练数据进行学习,cifar10_test是对测试数据进行检验,cifar10_model是用于构造神经网络模型,给cifar10_train和cifar10_test调用。
cifar10_model文件的内容如下,基本上和原程序一样,只是在各层参数的variable_scope里增加了reuse=tf.AUTO_REUSE,这样在训练过程中可以方便地复用同一组参数来构建测试用的计算图,对模型进行检验:
import tensorflow as tf
def _activation_summary(x):
    """Attach TensorBoard summaries to an activation tensor.

    Emits a histogram of the activation values and a scalar holding the
    fraction of zero entries (sparsity). Both summaries are named after the
    op that produced ``x``.

    Args:
      x: Tensor
    Returns:
      nothing
    """
    op_name = x.op.name
    tf.summary.histogram('%s/activations' % op_name, x)
    sparsity = tf.nn.zero_fraction(x)
    tf.summary.scalar('%s/sparsity' % op_name, sparsity)
def _variable_on_cpu(name, shape, initializer):
    """Create (or fetch) a float32 variable pinned to the CPU device.

    Args:
      name: name of the variable
      shape: list of ints
      initializer: initializer for Variable
    Returns:
      Variable Tensor
    """
    with tf.device('/cpu:0'):
        return tf.get_variable(
            name, shape, initializer=initializer, dtype=tf.float32)
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Create a truncated-normal-initialized variable with optional L2 decay.

    When ``wd`` is given, an L2 penalty ``wd * l2_loss(var)`` is registered
    in the 'losses' collection; otherwise no penalty is added.

    Args:
      name: name of the variable
      shape: list of ints
      stddev: standard deviation of a truncated Gaussian
      wd: add L2Loss weight decay multiplied by this float. If None, weight
          decay is not added for this Variable.
    Returns:
      Variable Tensor
    """
    initializer = tf.truncated_normal_initializer(
        stddev=stddev, dtype=tf.float32)
    var = _variable_on_cpu(name, shape, initializer)
    if wd is None:
        return var
    penalty = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
    tf.add_to_collection('losses', penalty)
    return var
# Number of output classes. Defined here because inference() needs it and
# this module previously referenced the name without ever defining it.
NUM_CLASSES = 10


#Define the model
def inference(images):
    """Build the CIFAR-10 model.

    Every variable scope is opened with reuse=tf.AUTO_REUSE, so calling this
    function more than once in the same graph shares one set of weights
    between the calls.

    Args:
      images: 4-D float image batch (NHWC, 3 channels). The spatial
        dimensions must be statically known so the flattened feature size
        can be computed; the batch dimension may be unknown.
    Returns:
      Logits, a tensor of shape [batch, NUM_CLASSES].
    """
    # conv1
    with tf.variable_scope('conv1', reuse=tf.AUTO_REUSE) as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=5e-2,
                                             wd=None)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)
    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')
    # conv2
    with tf.variable_scope('conv2', reuse=tf.AUTO_REUSE) as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=None)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)
    # norm2
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')
    # local3
    with tf.variable_scope('local3', reuse=tf.AUTO_REUSE) as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        # The flattened width comes from pool2's static non-batch dimensions,
        # so the batch size does not have to be known at graph-build time.
        dim = 1
        for size in pool2.get_shape().as_list()[1:]:
            dim *= size
        reshape = tf.reshape(pool2, [-1, dim])
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        _activation_summary(local3)
    # local4
    with tf.variable_scope('local4', reuse=tf.AUTO_REUSE) as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
        _activation_summary(local4)
    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear', reuse=tf.AUTO_REUSE) as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1/192.0, wd=None)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(softmax_linear)
    return softmax_linear
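cifar10_train文件内容如下,这个程序主要完成几个工作:
1. 用Dataset读取训练数据和测试数据,并从每条记录中解码出标签和图像;
2. 对训练图像做随机裁剪、随机左右翻转、随机亮度和对比度调整以及标准化,对测试图像做中心裁剪和标准化;
3. 调用cifar10_model构建网络,计算交叉熵损失和L2权重衰减损失,用梯度下降进行优化,并维护参数的指数移动平均值;
4. 每100步记录一次summary,每1000步保存一次模型,并在测试集上统计预测正确的图片数量。
以下是程序代码: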
import tensorflow as tf
import numpy as np
import os
import cifar10_model
#Construct the filenames that include the train cifar10 images
folderPath = 'cifar-10-batches-bin/'
# range() instead of the Python-2-only xrange() so the script runs on 2 and 3.
filenames = [os.path.join(folderPath, 'data_batch_%d.bin' % i) for i in range(1, 6)]
#Define the parameters of the cifar10 image
imageWidth = 32
imageHeight = 32
imageDepth = 3
label_bytes = 1
#Define the train and test batch size
batch_size = 100
test_batch_size = 100
#Calculate the per image bytes and record bytes
image_bytes = imageWidth * imageHeight * imageDepth
record_bytes = label_bytes + image_bytes
#Construct the dataset to read the train images
dataset = tf.data.FixedLengthRecordDataset(filenames, record_bytes)
dataset = dataset.shuffle(50000)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat(200)
iterator = dataset.make_initializable_iterator()
#Construct the dataset to read the test images
testfilename = os.path.join(folderPath, 'test_batch.bin')
testdataset = tf.data.FixedLengthRecordDataset(testfilename, record_bytes)
testdataset = testdataset.batch(test_batch_size)
testiterator = testdataset.make_initializable_iterator()
#Decode the train records from the iterator
record = iterator.get_next()
record_decoded_bytes = tf.decode_raw(record, tf.uint8)
#Get the labels from the records (first byte of each record), shape [batch_size, 1]
record_labels = tf.slice(record_decoded_bytes, [0, 0], [batch_size, 1])
record_labels = tf.cast(record_labels, tf.int32)
#Get the images from the records
record_images = tf.slice(record_decoded_bytes, [0, 1], [batch_size, image_bytes])
# The image bytes are laid out as [depth, height, width]; transpose to NHWC.
record_images = tf.reshape(record_images, [batch_size, imageDepth, imageHeight, imageWidth])
record_images = tf.transpose(record_images, [0, 2, 3, 1])
record_images = tf.cast(record_images, tf.float32)
#Decode the test records from the iterator
testrecord = testiterator.get_next()
testrecord_decoded_bytes = tf.decode_raw(testrecord, tf.uint8)
#Get the labels from the records
testrecord_labels = tf.slice(testrecord_decoded_bytes, [0, 0], [test_batch_size, 1])
testrecord_labels = tf.cast(testrecord_labels, tf.int32)
testrecord_labels = tf.reshape(testrecord_labels, [-1])
#Get the images from the records
testrecord_images = tf.slice(testrecord_decoded_bytes, [0, 1], [test_batch_size, image_bytes])
testrecord_images = tf.cast(testrecord_images, tf.float32)
testrecord_images = tf.reshape(testrecord_images,
                               [test_batch_size, imageDepth, imageHeight, imageWidth])
testrecord_images = tf.transpose(testrecord_images, [0, 2, 3, 1])
#Size of the crop fed to the network
cropHeight = 24
cropWidth = 24


def _distort_image(image):
    """Apply the random training-time augmentation to one HWC image."""
    # Crop per image: tf.random_crop draws a single offset per call, so
    # cropping the whole batch at once would give every image in the batch
    # the same crop window.
    image = tf.random_crop(image, size=[cropHeight, cropWidth, imageDepth])
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=63)
    image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
    return tf.image.per_image_standardization(image)


#Unstack the batch, distort every train image independently, then restack
distorted_images = tf.stack(
    [_distort_image(image) for image in tf.unstack(record_images)])
#Crop the test image as the train process used the crop size
testrecord_images = tf.image.resize_image_with_crop_or_pad(testrecord_images, cropHeight, cropWidth)
#Standardize each test image individually, then restack
testrecord_images = tf.stack(
    [tf.image.per_image_standardization(image)
     for image in tf.unstack(testrecord_images)])
#Log the original train images and distorted images, can see the images in tensorboard
with tf.name_scope('original'):
    tf.summary.image('images', record_images, max_outputs=10)
with tf.name_scope('distorted'):
    tf.summary.image('images', distorted_images, max_outputs=10)
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
NUM_EPOCHS_PER_DECAY = 350.0 # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1 # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1 # Initial learning rate.
global_step = tf.train.get_or_create_global_step()
# Variables that affect learning rate.
num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
# Decay the learning rate exponentially based on the number of steps.
lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                global_step,
                                decay_steps,
                                LEARNING_RATE_DECAY_FACTOR,
                                staircase=True)
#Get the inference logits by the model
result = cifar10_model.inference(distorted_images)
#Calculate the cross entropy loss
# tf.nn.* is used instead of tf.losses.* on purpose: tf.losses.* already
# registers its result in the 'losses' collection, so adding the mean again
# below would count the cross entropy twice in the total loss.
# Labels are flattened because the tf.nn op expects rank-1 labels for
# rank-2 logits.
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=tf.reshape(record_labels, [-1]), logits=result)
cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
tf.add_to_collection('losses', cross_entropy_mean)
#Add the l2 weights to the loss
# NOTE: this must stay before the second inference() call below, otherwise the
# weight-decay terms that call registers would be summed in as well.
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
#Define the optimizer
# global_step is passed so that every training step increments it; without it
# the counter stays at 0 and the decay schedule above never advances.
opt_op = tf.train.GradientDescentOptimizer(lr).minimize(loss, global_step=global_step)
#Define the Exp moving average
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
# Update the moving averages only after the gradient step has been applied.
with tf.control_dependencies([opt_op]):
    optimize_op = ema.apply(tf.trainable_variables())
#Create the saver to store the model parameters (created after ema.apply so
#that the moving-average shadow variables are saved too)
saver = tf.train.Saver()
#Get the testrecord prediction results
testresult = tf.argmax(cifar10_model.inference(testrecord_images), axis=1)
def _count_correct_test_predictions(sess):
    """Run one full pass over the test set and return the number of hits."""
    sess.run(testiterator.initializer)
    correct = 0
    while True:
        try:
            predictValue, testValue = sess.run([testresult, testrecord_labels])
        except tf.errors.OutOfRangeError:
            # The test iterator is exhausted: the whole test set was evaluated.
            return correct
        correct += np.sum(predictValue == testValue)


#Merge all the summary
summary_op = tf.summary.merge_all()
#Create the session and run the graph; the context manager closes the session
#even when training aborts with an unexpected error
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(iterator.initializer)
    train_filewriter = tf.summary.FileWriter('train/', sess.graph)
    step = 0
    try:
        while True:
            lossValue, _ = sess.run([loss, optimize_op])
            if step % 100 == 0:
                # print(...) with a single argument behaves the same on
                # Python 2 and Python 3.
                print("step %i Loss: %f" % (step, lossValue))
                # The merged summaries include the activation summaries of the
                # test-side inference graph, so the test iterator has to be
                # (re)initialized before summary_op can run.
                sess.run(testiterator.initializer)
                summary = sess.run(summary_op)
                train_filewriter.add_summary(summary, step)
            if step % 1000 == 0:
                saver.save(sess, 'model/my-model', global_step=step)
                truepredictNum = _count_correct_test_predictions(sess)
                print("test correct num: %i" % (truepredictNum))
            step += 1
    except tf.errors.OutOfRangeError:
        # The train dataset has delivered all of its epochs: training is done.
        pass
    finally:
        train_filewriter.close()
具体训练的时候,可以看到大约在训练了100000个Batch之后,对于10000张测试图片,识别正确的数量大约在8100左右。训练得到的模型参数以及参数的移动平均值都会保存到文件中。随后我们可以在cifar10_test程序中用参数的移动平均值来载入模型,对测试图片进行检验,最终识别正确的数量大约为8500,和原程序提到的0.86的准确率非常接近。同时我们也可以看到,采用参数移动平均值的处理方式可以进一步提高模型的性能。
cifar10_test程序的代码如下:
import tensorflow as tf
import numpy as np
import cifar10_model
# os is needed for os.path.join below; this script's import list omits it.
import os

#Folder that holds the cifar10 binary files (same location the train script uses)
folderPath = 'cifar-10-batches-bin/'
#Define the parameters of the cifar10 image
imageWidth = 32
imageHeight = 32
imageDepth = 3
label_bytes = 1
NUM_CLASSES = 10
#Define the test batch size
test_batch_size = 100
#Calculate the per image bytes and record bytes
image_bytes = imageWidth * imageHeight * imageDepth
record_bytes = label_bytes + image_bytes
#Construct the dataset to read the test images
testfilename = os.path.join(folderPath, 'test_batch.bin')
testdataset = tf.data.FixedLengthRecordDataset(testfilename, record_bytes)
testdataset = testdataset.batch(test_batch_size)
testiterator = testdataset.make_initializable_iterator()
#Decode the test records from the iterator
testrecord = testiterator.get_next()
testrecord_decoded_bytes = tf.decode_raw(testrecord, tf.uint8)
#Get the labels from the records (first byte of each record)
testrecord_labels = tf.slice(testrecord_decoded_bytes, [0, 0], [test_batch_size, 1])
testrecord_labels = tf.cast(testrecord_labels, tf.int32)
testrecord_labels = tf.reshape(testrecord_labels, [-1])
#Get the images from the records
testrecord_images = tf.slice(testrecord_decoded_bytes, [0, 1], [test_batch_size, image_bytes])
testrecord_images = tf.cast(testrecord_images, tf.float32)
# The image bytes are laid out as [depth, height, width]; transpose to NHWC.
testrecord_images = tf.reshape(testrecord_images,
                               [test_batch_size, imageDepth, imageHeight, imageWidth])
testrecord_images = tf.transpose(testrecord_images, [0, 2, 3, 1])
#Crop the image as the train process used the crop size
cropHeight = 24
cropWidth = 24
testrecord_images = tf.image.resize_image_with_crop_or_pad(testrecord_images, cropHeight, cropWidth)
#Standardize each image individually, then restack into a batch
testrecord_images = tf.stack(
    [tf.image.per_image_standardization(image)
     for image in tf.unstack(testrecord_images)])
#inference the logits from the test images, and get the maxvalue index
testresult = tf.argmax(cifar10_model.inference(testrecord_images), axis=1)
#Restore the moving average variables into the model
variable_averages = tf.train.ExponentialMovingAverage(decay=0.9999)
variables_to_restore = variable_averages.variables_to_restore()
saver = tf.train.Saver(variables_to_restore)
with tf.Session() as sess:
    sess.run(testiterator.initializer)
    #Specify restore from which model file
    saver.restore(sess, 'model/my-model-95000')
    truepredictNum = 0
    while True:
        try:
            predictValue, testValue = sess.run([testresult, testrecord_labels])
        except tf.errors.OutOfRangeError:
            # The test iterator is exhausted: report the total and stop.
            # print(...) with a single argument behaves the same on
            # Python 2 and Python 3.
            print("test correct num: %i" % (truepredictNum))
            break
        truepredictNum += np.sum(predictValue == testValue)