Training boils down to a few steps: read a batch of input data, run the forward model to get the logits and the loss, build the training op, and drive it inside a session:
# coding:utf-8
"""
Training interface.
"""
from abc import ABCMeta
from abc import abstractmethod


class ITrain(object):
    __metaclass__ = ABCMeta

    @abstractmethod
    def train(self):
        """
        Build the training graph and run training.
        :return: train op
        """
        pass
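As a quick illustration of how this interface is meant to be used, a hypothetical entry script (the file name and layout here are assumptions, not part of the original code) would simply instantiate the concrete trainer shown below and call train():

# hypothetical main.py -- assumes cifar10_train.py (below) is importable
from cifar10_train import CIFAR10Train

if __name__ == '__main__':
    trainer = CIFAR10Train()  # concrete ITrain implementation
    trainer.train()           # builds the graph and runs the MonitoredTrainingSession
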
# coding:utf-8
"""
CIFAR-10 training.
"""
from train import ITrain
import tensorflow as tf
from cifar10_data_input import CIFAR10DataInput
from cifar10_inference import CIFAR10Inference
import time
import datetime


class CIFAR10Train(ITrain):
    INPUT_PATH = 'input/cifar10_bin_data/*.bin'
    TRAIN_PATH = 'output/train'
    BATCH_SIZE = 128
    NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
    LEARNING_RATE_INITILAIZE = 0.1
    LEARING_RATE_DECAY_FACTOR = 0.1
    NUM_EPOCHS_PER_DECAY = 350.0

    def train(self):
        input_paths = tf.train.match_filenames_once(CIFAR10Train.INPUT_PATH)
        cifar10_input = CIFAR10DataInput(input_file_paths=input_paths,
                                         batch_size=CIFAR10Train.BATCH_SIZE,
                                         example_per_epoch_num=CIFAR10Train.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN)
        image_batch, label_batch = cifar10_input.read_data()
        tf.summary.image('images', image_batch)
        cifar10_inference = CIFAR10Inference(image_channel=3,
                                             batch_size=CIFAR10Train.BATCH_SIZE,
                                             label_class_num=10)
        logits = cifar10_inference.inference(images=image_batch)
        loss = cifar10_inference.loss(logits, label_batch)
        train_op = self._train_op(loss)

        class _LoggerHook(tf.train.SessionRunHook):
            def __init__(self):
                super(_LoggerHook, self).__init__()
                self._step = -1
                self._start_time = time.time()
                self._log_frequency = 100

            def begin(self):
                self._step = -1
                self._start_time = time.time()
                self._log_frequency = 100

            def before_run(self, run_context):
                self._step += 1
                # loss is run together with the session call; its value is handed
                # back to after_run through run_values.
                return tf.train.SessionRunArgs(loss)

            def after_run(self, run_context, run_values):
                if self._step % self._log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time
                    loss_value = run_values.results
                    examples_per_sec = self._log_frequency * CIFAR10Train.BATCH_SIZE / duration
                    sec_per_batch = float(duration / self._log_frequency)
                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=CIFAR10Train.TRAIN_PATH,
                hooks=[tf.train.StopAtStepHook(last_step=1000),  # request stop once last_step is reached
                       tf.train.NanTensorHook(loss),  # stop if loss becomes NaN
                       _LoggerHook()],
                config=tf.ConfigProto(log_device_placement=False)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

    def _train_op(self, loss):
        # global_step tracks how many training steps have run in total.
        global_step = tf.contrib.framework.get_or_create_global_step()
        num_batchs_per_epoch = CIFAR10Train.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / CIFAR10Train.BATCH_SIZE
        decay_steps = int(num_batchs_per_epoch * CIFAR10Train.NUM_EPOCHS_PER_DECAY)
        # Use exponential decay to compute a learning rate that shrinks over time.
        learning_rate = tf.train.exponential_decay(CIFAR10Train.LEARNING_RATE_INITILAIZE,
                                                   global_step,
                                                   decay_steps=decay_steps,
                                                   decay_rate=CIFAR10Train.LEARING_RATE_DECAY_FACTOR,
                                                   staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        # Maintain a moving average of the losses.
        loss_averages_op = self._add_loss_summaryies(total_loss=loss)
        # control_dependencies fixes the execution order: the loss-average update
        # must run before the optimization step. Without it, ops that may run in
        # parallel would not be ordered, so control_dependencies acts as a
        # synchronization point.
        with tf.control_dependencies([loss_averages_op]):
            opt = tf.train.GradientDescentOptimizer(learning_rate)
            grads = opt.compute_gradients(loss)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        # Add histograms for the trainable variables.
        for var in tf.trainable_variables():
            tf.summary.histogram(var.op.name, var)
        # Add histograms for the gradients.
        for grad, var in grads:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)
        # Also keep a moving average of the variables themselves: at evaluation
        # time the averaged values can replace the raw variables, which smooths
        # out step-to-step jitter.
        variable_averages = tf.train.ExponentialMovingAverage(0.9999, global_step)
        viariables_averages_op = variable_averages.apply(tf.trainable_variables())
        with tf.control_dependencies([apply_gradient_op, viariables_averages_op]):
            train_op = tf.no_op(name='train')
        return train_op

    def _add_loss_summaryies(self, total_loss):
        """
        Maintain moving averages of the total loss and the individual losses.
        :param total_loss: total loss of the current step
        :return: op that updates the loss moving averages
        """
        # ExponentialMovingAverage maintains a moving average of the given values.
        # The shadow values it creates live on the graph for the lifetime of the
        # session; for plain tensors each shadow starts at 0, and every run of the
        # update op folds the newest value into the running average.
        loss_averages = tf.train.ExponentialMovingAverage(decay=0.9, name='avg')
        losses = tf.get_collection('losses')
        # apply() creates the shadow values and returns an op that updates the
        # moving average of every tensor in the list each time it runs.
        loss_averages_op = loss_averages.apply(losses + [total_loss])
        for l in losses + [total_loss]:
            # Summarize both the raw value...
            tf.summary.scalar(l.op.name + ' (raw)', l)
            # ...and its moving average.
            tf.summary.scalar(l.op.name, loss_averages.average(l))
        return loss_averages_op
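To make the staircase schedule above concrete, here is a small standalone sketch (plain Python, no TensorFlow) of the same arithmetic that tf.train.exponential_decay with staircase=True performs for these constants:

# Reproduces the schedule computed by exponential_decay(staircase=True)
# for the constants used in CIFAR10Train.
NUM_EXAMPLES = 50000
BATCH_SIZE = 128
NUM_EPOCHS_PER_DECAY = 350.0
INITIAL_LR = 0.1
DECAY_FACTOR = 0.1

decay_steps = int(NUM_EXAMPLES / BATCH_SIZE * NUM_EPOCHS_PER_DECAY)  # 136718

def learning_rate(global_step):
    # staircase=True: the exponent only changes every decay_steps steps
    return INITIAL_LR * DECAY_FACTOR ** (global_step // decay_steps)

print(decay_steps)            # 136718
print(learning_rate(0))       # 0.1
print(learning_rate(136717))  # still 0.1
print(learning_rate(136718))  # ~0.01

So with the StopAtStepHook above capped at 1000 steps, the learning rate never actually decays; the schedule only matters for longer runs.
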
# coding:utf-8
"""
Data input interface: reads raw data and transforms it into the standard
batch format consumed by the model.
"""
import tensorflow as tf
from abc import ABCMeta
from abc import abstractmethod


class IDataInput(object):
    """
    Data input.
    """
    __metaclass__ = ABCMeta

    def __init__(self,
                 input_file_paths,
                 batch_size,
                 example_per_epoch_num,
                 parallel_thread_num=16):
        """
        Initialization.
        :param input_file_paths: list of input file paths
        :param batch_size: batch size
        :param example_per_epoch_num: number of examples per epoch, usually the total number of examples
        :param parallel_thread_num: number of threads used for parallel preprocessing
        """
        self._input_file_paths = input_file_paths
        self._batch_size = batch_size
        self._parallel_thread_num = parallel_thread_num
        self._example_per_echo_num = example_per_epoch_num

    def read_data(self):
        """
        Read the data.
        :return: (data_batch, label_batch)
        """
        # Build the queue of input file paths.
        file_path_queue = tf.train.string_input_producer(self._input_file_paths)
        record = self._read_data_from_queue(file_path_queue)
        stander_data, label = self._preprocess_data(record)
        data_batch, label_batch = self._generate_train_batch(stander_data, label, shuffle=False)
        return data_batch, label_batch

    @abstractmethod
    def _read_data_from_queue(self, file_path_queue):
        """
        Read data from the file path queue and return it in the required format.
        :param file_path_queue:
        :return:
        """
        pass

    def test_read_data_from_queue(self):
        # Build the queue of input file paths.
        file_path_queue = tf.train.string_input_producer(self._input_file_paths)
        return self._read_data_from_queue(file_path_queue)

    @abstractmethod
    def _preprocess_data(self, record):
        """
        Preprocess the record that was read. For images this includes
        distortions, added noise, and similar augmentations.
        :param record: record returned by _read_data_from_queue
        :return:
        """
        pass

    def _generate_train_batch(self, train_data, label, shuffle=True):
        """
        Build data batches through a queue.
        :param train_data: training data
        :param label: label
        :param shuffle: whether to shuffle examples before batching
        :return:
        """
        # Queue capacity, chosen to keep memory usage bounded (must be an int).
        min_after_dequeue = int(self._example_per_echo_num * 0.4)
        capacity = min_after_dequeue + 3 * self._batch_size
        if shuffle:
            data_batch, label_batch = tf.train.shuffle_batch([train_data, label],
                                                             batch_size=self._batch_size,
                                                             num_threads=self._parallel_thread_num,
                                                             capacity=capacity,
                                                             min_after_dequeue=min_after_dequeue)
        else:
            data_batch, label_batch = tf.train.batch([train_data, label],
                                                     batch_size=self._batch_size,
                                                     num_threads=self._parallel_thread_num,
                                                     capacity=capacity)
        return data_batch, tf.reshape(label_batch, [self._batch_size])
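Inside MonitoredTrainingSession the queue runners behind string_input_producer and tf.train.batch are started automatically; in a plain session you have to start them yourself or the batch ops will block. A minimal sketch of driving this pipeline on its own, assuming the concrete CIFAR10DataInput defined below and local CIFAR-10 .bin files:

# Minimal sketch for exercising the queue-based input pipeline in a plain session.
import tensorflow as tf
from cifar10_data_input import CIFAR10DataInput

paths = tf.train.match_filenames_once('input/cifar10_bin_data/*.bin')
data_input = CIFAR10DataInput(input_file_paths=paths,
                              batch_size=128,
                              example_per_epoch_num=50000)
image_batch, label_batch = data_input.read_data()

with tf.Session() as sess:
    # match_filenames_once creates a local variable, so initialize those too.
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # fill the queues
    images, labels = sess.run([image_batch, label_batch])
    print(images.shape, labels.shape)  # (128, 24, 24, 3) (128,)
    coord.request_stop()
    coord.join(threads)
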
# coding:utf-8
"""
CIFAR-10 input reading.
"""
import tensorflow as tf
from data_input import IDataInput


class CIFAR10Record(object):
    """
    A single record read from the CIFAR-10 binary files.
    """
    pass


class CIFAR10DataInput(IDataInput):
    """
    CIFAR-10 data input.
    The dataset is laid out as follows:
    data_batch_1.bin - data_batch_5.bin: 5 bin files used as the training set.
    In each bin file the first byte of a record is the label and the next 3072 bytes
    are the image: the first 1024 bytes are R, the next 1024 are G and the last 1024
    are B, i.e. 32 * 32 = 1024 pixels per channel. Each bin file holds 10000 images,
    so there are 50000 training images in total.
    test_batch.bin: 1 bin file used as the test set.
    Its structure is identical to the training set, which makes evaluation easy;
    in a real problem the test set would not contain labels.
    batches.meta.txt describes what each label means.
    """

    def __init__(self,
                 input_file_paths,
                 batch_size,
                 example_per_epoch_num,
                 parallel_thread_num=16,
                 label_bytes=1,
                 image_height=32,
                 image_width=32,
                 target_image_height=24,
                 target_image_width=24,
                 channel=3):
        """
        Initialization.
        :param label_bytes: number of bytes used by the label
        :param image_height: height of the raw input image
        :param image_width: width of the raw input image
        :param target_image_height: height of the generated image
        :param target_image_width: width of the generated image
        :param channel: number of image channels
        """
        super(CIFAR10DataInput, self).__init__(input_file_paths=input_file_paths,
                                               batch_size=batch_size,
                                               example_per_epoch_num=example_per_epoch_num,
                                               parallel_thread_num=parallel_thread_num)
        self._label_bytes = label_bytes
        self._image_height = image_height
        self._image_width = image_width
        self._target_image_height = target_image_height
        self._target_image_width = target_image_width
        self.channel = channel

    def _read_data_from_queue(self, file_path_queue):
        record = CIFAR10Record()
        record.channel = self.channel
        record.height = self._image_height
        record.width = self._image_width
        image_bytes = self._image_height * self._image_width * self.channel
        record_bytes = self._label_bytes + image_bytes
        # Read fixed-length records from the queue.
        reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
        # reader.read returns (key, value); value holds the raw bytes of one record.
        record.key, value = reader.read(file_path_queue, name='image_reader')
        # Decode the raw value into label and image bytes.
        image_label_bytes = tf.decode_raw(value, tf.uint8)
        # Extract the label.
        record.label = tf.cast(tf.slice(image_label_bytes, [0], [self._label_bytes]), tf.int32)
        # Extract the image.
        raw_image = tf.slice(image_label_bytes, [self._label_bytes], [image_bytes])
        # Reshape into a 3 * 32 * 32 tensor, because the data is stored as
        # (R, G, B) => 1024, 1024, 1024 bytes.
        raw_image = tf.reshape(raw_image, [record.channel, record.height, record.width])
        # TensorFlow expects images as 32 * 32 * 3, so transpose [0, 1, 2] (c, h, w)
        # into [1, 2, 0] (h, w, c).
        record.uint8image = tf.transpose(raw_image, [1, 2, 0])
        return record

    def _preprocess_data(self, record):
        """
        Preprocess the image by applying random distortions.
        :param record: record returned by _read_data_from_queue
        :return:
        """
        # Cast the image to float32.
        reshaped_image = tf.cast(record.uint8image, tf.float32)
        height = self._target_image_height
        width = self._target_image_width
        # Random crop.
        distorted_image = tf.random_crop(reshaped_image, [height, width, 3])
        # Random horizontal flip.
        distorted_image = tf.image.random_flip_left_right(distorted_image)
        # Random brightness.
        distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)
        # Random contrast.
        distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper=1.8)
        # Standardize the image.
        float_image = tf.image.per_image_standardization(distorted_image)
        return float_image, record.label
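The slicing and transpose in _read_data_from_queue mirror the raw CIFAR-10 record layout: 1 label byte followed by 3072 image bytes stored plane by plane (1024 R, then 1024 G, then 1024 B). A numpy sketch of the same decoding for a single record, assuming the bin files sit under input/cifar10_bin_data/ as in CIFAR10Train.INPUT_PATH:

# numpy sketch of the record layout (1 + 3 * 32 * 32 = 3073 bytes per record)
import numpy as np

record_bytes = 1 + 3 * 32 * 32  # label byte + CHW image planes

with open('input/cifar10_bin_data/data_batch_1.bin', 'rb') as f:
    raw = np.frombuffer(f.read(record_bytes), dtype=np.uint8)

label = int(raw[0])                      # first byte is the label
image_chw = raw[1:].reshape(3, 32, 32)   # (channel, height, width)
image_hwc = image_chw.transpose(1, 2, 0) # (height, width, channel), as TF expects
print(label, image_hwc.shape)            # e.g. 6 (32, 32, 3)
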
# coding:utf-8
"""
Forward-model interface.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow as tf


class IInference(object):
    """
    Builds the forward model.
    """
    __metaclass__ = ABCMeta

    def __init__(self):
        self._loss_name = 'losses'

    @abstractmethod
    def inference(self, data):
        """
        Build the forward model.
        :param data: input data
        :return: tensorflow op
        """
        pass

    @abstractmethod
    def loss(self, inference, label):
        """
        Compute the loss.
        :param inference: forward output produced by inference
        :param label: label
        :return:
        """
        pass

    def bias(self, name, shape, initializer=tf.constant_initializer(0.0)):
        """
        Create a bias variable.
        :param name: bias name
        :param shape: bias shape
        :param initializer: initializer
        :return: bias variable
        """
        return tf.get_variable(name=name,
                               shape=shape,
                               initializer=initializer)

    def viariable_with_weight_decay(self, name, shape, stddev, l2_decay):
        """
        Create a variable with optional weight decay.
        :param name: variable name
        :param shape: shape
        :param stddev: standard deviation of the truncated normal initializer
        :param l2_decay: coefficient of the L2 loss; if l2_decay is None, no L2 loss is added
        :return: variable tensor
        """
        var = tf.get_variable(name=name,
                              shape=shape,
                              initializer=tf.truncated_normal_initializer(stddev=stddev, dtype=tf.float32),
                              dtype=tf.float32)
        # Add the L2 loss term.
        if l2_decay is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(var), l2_decay, name='weight_loss')
            # The L2 regularization terms are only summed into the total loss at the
            # very end, so they are stashed in a collection for later.
            # add_to_collection maintains a key -> list-of-values mapping on the graph.
            tf.add_to_collection(self._loss_name, weight_decay)
        return var
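The 'losses' collection used by self._loss_name is just a named list on the graph: every weight registers its L2 term there, and loss() in the concrete inference class below sums the collection together with the cross entropy via tf.add_n. A toy sketch of that mechanism, independent of the CIFAR-10 model and using the collection key 'losses' directly:

# Toy sketch of the collection-based loss bookkeeping used by IInference.
import tensorflow as tf

w = tf.get_variable('w', shape=[10, 10],
                    initializer=tf.truncated_normal_initializer(stddev=0.04))
tf.add_to_collection('losses', tf.multiply(tf.nn.l2_loss(w), 0.004, name='weight_loss'))

data_loss = tf.constant(1.5, name='cross_entropy_mean')  # stands in for the real data loss
tf.add_to_collection('losses', data_loss)

total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(total_loss))  # data loss + 0.004 * l2_loss(w)
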
# coding:utf-8
"""
CIFAR-10 inference.
"""
from inference import IInference
import tensorflow as tf
import re


class CIFAR10Inference(IInference):
    """
    CIFAR-10 inference implemented as a CNN.
    """

    def __init__(self, image_channel, batch_size, label_class_num):
        """
        Initialization.
        :param image_channel: number of image channels
        :param batch_size: batch size
        :param label_class_num: number of label classes
        """
        super(CIFAR10Inference, self).__init__()
        # Convolution kernel shape: 5 * 5 * channel.
        self._kernel_width = 5
        self._kernel_height = 5
        self._image_channel = image_channel
        self._batch_size = batch_size
        self._label_class_num = label_class_num

    @staticmethod
    def activation_summary(x):
        """
        Add summaries for the activations.
        :param x:
        :return:
        """
        tower_name = 'tower'
        tensor_name = re.sub('%s_[0-9]*/' % tower_name, '', x.op.name)
        tf.summary.histogram(tensor_name + '/activations', x)
        tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))

    def inference(self, images):
        # Convolution layer 1.
        conv1_kernel_num = 64  # 64 kernels in the first layer
        with tf.variable_scope('conv1') as scope:
            kernel = self.viariable_with_weight_decay(
                name='weight',
                shape=[self._kernel_height, self._kernel_width, self._image_channel, conv1_kernel_num],
                stddev=5e-2,
                l2_decay=0.0)
            # Convolution.
            conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
            # Bias.
            bias = self.bias('bias', [conv1_kernel_num])
            pre_activation = tf.nn.bias_add(conv, bias=bias)
            # ReLU activation.
            conv1 = tf.nn.relu(pre_activation, name=scope.name)
            # Summarize conv1.
            CIFAR10Inference.activation_summary(conv1)
        # Max pooling.
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pool1')
        # Normalize pool1.
        norm1 = tf.nn.lrn(pool1, depth_radius=4, bias=1,
                          alpha=0.001 / 9.0, beta=0.75,
                          name='norm1')
        # Convolution layer 2.
        conv2_kernel_num = 64  # 64 kernels in the second layer
        with tf.variable_scope('conv2') as scope:
            kernel = self.viariable_with_weight_decay(
                name='weight',
                shape=[self._kernel_height, self._kernel_width, conv1_kernel_num, conv2_kernel_num],
                stddev=5e-2,
                l2_decay=0.0)
            # Convolution.
            conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
            # Bias.
            bias = self.bias('bias', [conv2_kernel_num], tf.constant_initializer(0.1))
            pre_activation = tf.nn.bias_add(conv, bias=bias)
            # ReLU activation.
            conv2 = tf.nn.relu(pre_activation, name=scope.name)
            # Summarize conv2.
            CIFAR10Inference.activation_summary(conv2)
        # Normalize conv2.
        norm2 = tf.nn.lrn(conv2, depth_radius=4, bias=1,
                          alpha=0.001 / 9.0, beta=0.75,
                          name='norm2')
        # Max pooling.
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='SAME', name='pool2')
        # local3: fully connected layer on top of the convolution output.
        with tf.variable_scope('local3') as scope:
            # Flatten pool2 into a batch of one-dimensional vectors.
            reshape = tf.reshape(pool2,
                                 shape=[self._batch_size, -1])
            dim = reshape.get_shape()[1].value
            weights = self.viariable_with_weight_decay('weights',
                                                       shape=[dim, 384],
                                                       stddev=0.04,
                                                       l2_decay=0.004)
            bias = self.bias('bias',
                             shape=[384],
                             initializer=tf.constant_initializer(0.1))
            local3 = tf.nn.relu(tf.matmul(reshape, weights) + bias, name=scope.name)
            self.activation_summary(local3)
        # local4: fully connected layer.
        with tf.variable_scope('local4') as scope:
            weights = self.viariable_with_weight_decay('weights',
                                                       shape=[384, 192],
                                                       stddev=0.04,
                                                       l2_decay=0.004)
            bias = self.bias('bias',
                             shape=[192],
                             initializer=tf.constant_initializer(0.1))
            local4 = tf.nn.relu(tf.matmul(local3, weights) + bias, name=scope.name)
            self.activation_summary(local4)
        # Final softmax (linear) layer.
        with tf.variable_scope('softmax') as scope:
            weights = self.viariable_with_weight_decay('weights',
                                                       shape=[192, self._label_class_num],
                                                       stddev=0.04,
                                                       l2_decay=0.004)
            bias = self.bias('bias',
                             shape=[self._label_class_num],
                             initializer=tf.constant_initializer(0.0))
            softmax_linear = tf.add(tf.matmul(local4, weights), bias, name=scope.name)
            self.activation_summary(softmax_linear)
        return softmax_linear

    def loss(self, logits, label):
        label = tf.cast(label, tf.int64)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=label,
            logits=logits,
            name='cross_entropy_per_example'
        )
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy_mean')
        tf.add_to_collection(self._loss_name, cross_entropy_mean)
        # Add the accumulated L2 terms to get the total loss.
        return tf.add_n(tf.get_collection(self._loss_name), name='total_loss')
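For reference, here is a trace of the tensor shapes that inference() produces for the 24x24 distorted crops with the default batch size of 128, worked out by hand from the layer definitions above (the original code does not print these):

# Shape trace through CIFAR10Inference.inference() with batch_size=128, 24x24x3 inputs:
# images         : (128, 24, 24, 3)
# conv1 / norm1  : (128, 24, 24, 64)   SAME conv, stride 1
# pool1          : (128, 12, 12, 64)   3x3 max pool, stride 2
# conv2 / norm2  : (128, 12, 12, 64)
# pool2          : (128,  6,  6, 64)
# reshape        : (128, 2304)         6 * 6 * 64 = 2304
# local3         : (128, 384)
# local4         : (128, 192)
# softmax_linear : (128, 10)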