def distorted_inputs(data_dir, batch_size):
    """Construct distorted input for CIFAR training using the Reader ops.

    Args:
        data_dir: Path to the CIFAR-10 data directory.
        batch_size: Number of images per batch.

    Returns:
        images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
        labels: Labels. 1D tensor of [batch_size] size.

    Raises:
        ValueError: If one of the `data_batch_N.bin` files (N = 1..5) is
            missing from `data_dir`.
    """
    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
                 for i in xrange(1, 6)]
    for f in filenames:
        if not tf.gfile.Exists(f):
            raise ValueError('Failed to find file: ' + f)

    # Create a queue that produces the filenames to read.
    # string_input_producer shuffles the file order by default
    # (shuffle=True); pass shuffle=False to keep the listed order.
    filename_queue = tf.train.string_input_producer(filenames)

    with tf.name_scope('data_augmentation'):
        # Read examples from files in the filename queue.
        read_input = read_cifar10(filename_queue)
        # tf.cast converts the uint8 image tensor to float32 so the
        # distortions below operate on floats.
        reshaped_image = tf.cast(read_input.uint8image, tf.float32)

        height = IMAGE_SIZE
        width = IMAGE_SIZE

        # Image processing for training the network. Note the many random
        # distortions applied to the image.

        # Randomly crop a [height, width] section of the image.
        distorted_image = tf.random_crop(reshaped_image, [height, width, 3])

        # Randomly flip the image horizontally.
        distorted_image = tf.image.random_flip_left_right(distorted_image)

        # Because these operations are not commutative, consider randomizing
        # the order their operation.
        # NOTE: since per_image_standardization zeros the mean and makes
        # the stddev unit, this likely has no effect see tensorflow#1458.
        # Brightness is shifted by a random delta in [-max_delta, max_delta].
        # NOTE(review): the max_delta argument was missing from this call;
        # 63 is the value used by the reference CIFAR-10 input pipeline --
        # confirm it matches the intended augmentation strength.
        distorted_image = tf.image.random_brightness(distorted_image,
                                                     max_delta=63)
        # Contrast is scaled by a random factor in [lower, upper].
        distorted_image = tf.image.random_contrast(distorted_image,
                                                   lower=0.2, upper=1.8)

        # Standardize each image to zero mean and unit variance.
        float_image = tf.image.per_image_standardization(distorted_image)

        # Set the shapes of tensors.
        float_image.set_shape([height, width, 3])
        read_input.label.set_shape([1])

        # Ensure that the random shuffling has good mixing properties.
        min_fraction_of_examples_in_queue = 0.4
        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                                 min_fraction_of_examples_in_queue)
        print('Filling queue with %d CIFAR images before starting to train. '
              'This will take a few minutes.' % min_queue_examples)

    # Generate a batch of images and labels by building up a queue of examples.
    # Training input is shuffled, hence shuffle=True.
    return _generate_image_and_label_batch(float_image, read_input.label,
                                           min_queue_examples, batch_size,
                                           shuffle=True)
def _generate_image_and_label_batch(image, label, min_queue_examples,
                                    batch_size, shuffle):
    """Construct a queued batch of images and labels.

    Args:
        image: 3-D Tensor of [height, width, 3] of type.float32.
        label: 1-D Tensor of type.int32
        min_queue_examples: int32, minimum number of samples to retain
            in the queue that provides of batches of examples.
        batch_size: Number of images per batch.
        shuffle: boolean indicating whether to use a shuffling queue.

    Returns:
        images: Images. 4D tensor of [batch_size, height, width, 3] size.
        labels: Labels. 1D tensor of [batch_size] size.
    """
    # Create a queue that shuffles the examples, and then
    # read 'batch_size' images + labels from the example queue.
    num_preprocess_threads = 16
    queue_capacity = min_queue_examples + 3 * batch_size
    if shuffle:
        images, label_batch = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=queue_capacity,
            # Keep at least this many examples queued after each dequeue so
            # the shuffle draws from a well-mixed pool.
            min_after_dequeue=min_queue_examples)
    else:
        images, label_batch = tf.train.batch(
            [image, label],
            batch_size=batch_size,
            num_threads=num_preprocess_threads,
            capacity=queue_capacity)

    # Display the training images in the visualizer.
    tf.summary.image('images', images)

    # Labels arrive as [batch_size, 1]; flatten to [batch_size].
    return images, tf.reshape(label_batch, [batch_size])
def _variable_with_weight_decay(name, shape, stddev, wd):
    """Helper to create an initialized Variable with weight decay.

    Note that the Variable is initialized with a truncated normal distribution.
    A weight decay is added only if one is specified.

    Args:
        name: name of the variable
        shape: list of ints
        stddev: standard deviation of a truncated Gaussian
        wd: add L2Loss weight decay multiplied by this float. If None, weight
            decay is not added for this Variable.

    Returns:
        Variable Tensor
    """
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
    # Truncated normal initializer: values are drawn from a normal
    # distribution with the given stddev, and samples more than two standard
    # deviations from the mean are discarded and re-drawn.
    var = _variable_on_cpu(
        name,
        shape,
        tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
    if wd is not None:
        # Register the scaled L2 penalty so loss() can add it to the total.
        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var
def inference(images):
    """Build the CIFAR-10 model.

    Args:
        images: Images returned from distorted_inputs() or inputs().

    Returns:
        Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training runs.
    # If we only ran this model on a single GPU, we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().
    #
    # conv1
    with tf.variable_scope('conv1') as scope:
        # Kernel shape is [filter_height, filter_width, in_channels,
        # out_channels].
        # NOTE(review): the `wd` argument was missing from this call; None
        # (no weight decay) mirrors the softmax_linear layer below -- confirm.
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=5e-2,
                                             wd=None)
        # Stride 1 in every dimension with zero ("SAME") padding.
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        # One bias per output channel.
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)

    # pool1: 3x3 window, stride 2, zero padding.
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    # norm1: local response normalization (LRN), computed as
    #   sqr_sum[a, b, c, d] =
    #       sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
    #   output = input / (bias + alpha * sqr_sum) ** beta
    # LRN is rarely used in newer networks; batch normalization is a more
    # recent alternative.
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        # NOTE(review): `wd` was missing here as well; None assumed -- confirm.
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=None)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)

    # norm2
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pool2: for 24x24 inputs this yields [batch_size, 6, 6, 64].
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1], padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        # get_shape() returns a TensorShape; as_list() turns it into plain
        # ints so the static batch size can be used in the reshape.
        # For 24x24 inputs each example flattens to 6 * 6 * 64 = 2304 values.
        reshape = tf.reshape(pool2, [images.get_shape().as_list()[0], -1])
        dim = reshape.get_shape()[1].value
        # Fully connected layer with 384 units.
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        # ReLU activation: f(x) = max(x, 0).
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases,
                            name=scope.name)

    # local4
    with tf.variable_scope('local4') as scope:
        # Fully connected layer with 192 units.
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases,
                            name=scope.name)

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    # This is a fully connected layer with one output per class.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1/192.0, wd=None)
        # NOTE(review): the bias initializer was missing from this call; a
        # zero constant is assumed -- confirm.
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases,
                                name=scope.name)

    return softmax_linear
def loss(logits, labels):
    """Add L2Loss to all the trainable variables.

    Add summary for "Loss" and "Loss/avg".

    Args:
        logits: Logits from inference().
        labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

    Returns:
        Loss tensor of type float.
    """
    # Calculate the average cross entropy loss across the batch.
    labels = tf.cast(labels, tf.int64)
    # This op yields one cross-entropy value per example; the batch loss is
    # their mean, taken on the next line.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    # The total loss is defined as the cross entropy loss plus all of the weight
    # decay terms (L2 loss).
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
def _add_loss_summaries(total_loss):
    """Add summaries for losses in CIFAR-10 model.

    Generates moving average for all losses and associated summaries for
    visualizing the performance of the network.

    Args:
        total_loss: Total loss from loss().

    Returns:
        loss_averages_op: op for generating moving averages of losses.
    """
    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    tracked_losses = losses + [total_loss]
    loss_averages_op = loss_averages.apply(tracked_losses)

    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for loss_term in tracked_losses:
        # Name each loss as '(raw)' and name the moving average version of the loss
        # as the original loss name.
        tf.summary.scalar(loss_term.op.name + ' (raw)', loss_term)
        tf.summary.scalar(loss_term.op.name, loss_averages.average(loss_term))

    return loss_averages_op
def train():
    """Train CIFAR-10 for a number of steps.

    Builds one model tower per GPU (FLAGS.num_gpus), averages the tower
    gradients on the CPU, applies them with gradient descent, and runs the
    training loop for FLAGS.max_steps steps. Summaries and checkpoints are
    written to FLAGS.train_dir.

    Raises:
        AssertionError: If the loss becomes NaN during training.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        # Number of steps between learning-rate decays.
        decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        # staircase=True makes the rate drop in discrete steps, once every
        # decay_steps steps, instead of continuously.
        lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar10.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        # Prefetch batches so each tower can dequeue without waiting on input.
        batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * FLAGS.num_gpus)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope(
                            '%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                        # Dequeues one batch for the GPU
                        image_batch, label_batch = batch_queue.dequeue()
                        # Calculate the loss for one tower of the CIFAR model.
                        # This function constructs the entire CIFAR model but
                        # shares the variables across all towers.
                        loss = tower_loss(scope, image_batch, label_batch)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(
                            tf.GraphKeys.SUMMARIES, scope)

                        # Calculate the gradients for the batch of data on
                        # this CIFAR tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        # NOTE(review): the ConfigProto arguments were missing; only
        # allow_soft_placement is restored. If this file defines a
        # log_device_placement flag, pass it here as well.
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)

        # Start the queue runners (the threads that fill the input queues).
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        # Main training loop. Annotator's note: the upstream default of one
        # million steps took 7-8 hours on a GTX 1050 Ti, and the loss stopped
        # improving after roughly 400k-500k steps.
        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                # NOTE(review): the tail of this format string and the first
                # print argument were missing; 'sec/batch)' and a
                # datetime.now() timestamp are assumed -- confirm that
                # `datetime` is imported as `from datetime import datetime`.
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)