最近看到AWS在18年年底发表的一篇论文(Bag of Tricks for Image Classification with Convolutional Neural Networks),是李沐和他的同事们总结的在图像分类中用到的一些技巧,可以提高分类的准确率。我也照着论文提到的技巧测试了一下:基于TensorFlow 2.1版本,搭建了一个Darknet53的模型(这也是大名鼎鼎的YOLOv3的骨干网络),并在这个基础上对ImageNet进行分类训练。
首先是Darknet53网络的搭建,具体的网络结构可以参考 https://github.com/pjreddie/darknet 里面cfg目录下的darknet53.cfg文件。代码如下:
import tensorflow as tf
from tensorflow.keras import Model
# Short alias for the Keras layers namespace; the model builders below use it as `l.<Layer>`.
l=tf.keras.layers
# NOTE(review): neither constant below is used by the classification code in this listing.
# They look like YOLO v3 detection-head settings (80 object classes; 3 boxes per cell, each
# with 1 objectness score + 4 box coordinates + the class scores) kept for later use — confirm.
category_num = 80
vector_size = 3*(1+4+category_num)
def _conv(inputs, filters, kernel_size, strides, padding, bias=False, normalize=True, activation='relu'):
    """Apply a channels-first Conv2D, optionally followed by batch norm and an activation.

    Args:
        inputs: Input tensor in channels-first layout.
        filters: Number of output channels of the convolution.
        kernel_size: Convolution kernel size.
        strides: Convolution stride.
        padding: When greater than 0, the input is explicitly zero-padded by this
            amount and the convolution runs with 'valid' padding; otherwise the
            convolution uses 'same' padding.
        bias: Whether the convolution has a bias term.
        normalize: Whether to add BatchNormalization over the channel axis (axis 1).
        activation: 'relu', 'relu6' or 'leaky_relu' (slope 0.1). Any other value,
            e.g. 'linear', adds no activation layer.

    Returns:
        The output tensor of the last layer that was applied.
    """
    explicit_pad = padding > 0
    x = inputs
    if explicit_pad:
        x = l.ZeroPadding2D(padding=padding, data_format='channels_first')(x)

    x = l.Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        # The padding was already added above, so the conv itself must not pad again.
        padding='valid' if explicit_pad else 'same',
        data_format='channels_first',
        use_bias=bias,
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l2(l=5e-4),
    )(x)

    if normalize:
        x = l.BatchNormalization(axis=1)(x)

    if activation == 'relu':
        x = l.ReLU()(x)
    elif activation == 'relu6':
        x = l.ReLU(max_value=6)(x)
    elif activation == 'leaky_relu':
        x = l.LeakyReLU(alpha=0.1)(x)
    return x
def _residual(inputs, out_channels, activation='relu', name=None):
    """Build one Darknet residual unit: 1x1 bottleneck conv, 3x3 conv, skip connection.

    The 1x1 convolution halves the channel count (out_channels // 2), the 3x3
    convolution restores it to out_channels, and the result is added to `inputs`.
    `inputs` must therefore already have `out_channels` channels.

    Args:
        inputs: Input tensor in channels-first layout.
        out_channels: Channel count of the unit's input and output.
        activation: Activation used by both convolutions (see `_conv`). Previously
            this argument was accepted but ignored and 'leaky_relu' was always
            used; it is now honoured. Callers that pass 'leaky_relu' explicitly
            get the same network as before.
        name: Optional name for the final Add layer.

    Returns:
        The output tensor of the Add layer.
    """
    squeezed = _conv(inputs, out_channels // 2, 1, 1, 0, False, True, activation)
    expanded = _conv(squeezed, out_channels, 3, 1, 1, False, True, activation)
    return l.Add(name=name)([inputs, expanded])
def darknet53_base():
    """Build the Darknet-53 classifier as a Keras functional model.

    The input is a channels-first image tensor of shape (3, H, W) with free
    spatial size. After a 32-filter stem convolution, five stages follow; each
    stage is a stride-2 3x3 convolution and then a number of residual units.
    The head averages over the spatial axes, applies a 1x1 convolution with
    1000 filters (with bias, without batch norm or activation) and flattens.

    Returns:
        A tf.keras.Model with four outputs: the float32 logits (layer 'output')
        and the float32 feature maps 'route1', 'route2' and 'route3' taken after
        the 256-, 512- and 1024-channel stages.
    """
    act = 'leaky_relu'
    # (channels, residual units, name of the feature-map tap after the stage or None)
    stages = (
        (64, 1, None),          # 64 * H/2 * W/2
        (128, 2, None),         # 128 * H/4 * W/4
        (256, 8, 'route1'),     # 256 * H/8 * W/8
        (512, 8, 'route2'),     # 512 * H/16 * W/16
        (1024, 4, 'route3'),    # 1024 * H/32 * W/32
    )

    image = tf.keras.Input(shape=(3, None, None))
    net = _conv(image, 32, 3, 1, 1, False, True, act)  # 32 * H * W

    routes = []
    for channels, repeats, route_name in stages:
        # The stride-2 convolution halves the spatial resolution and sets the stage width.
        net = _conv(net, channels, 3, 2, 1, False, True, act)
        for _ in range(repeats):
            net = _residual(net, channels, act)
        if route_name is not None:
            # Identity activation that exposes the feature map as a float32 output.
            tap = l.Activation('linear', dtype='float32', name=route_name)(net)
            routes.append(tap)

    # Global average pooling that keeps the two spatial axes as size 1.
    net = tf.reduce_mean(net, axis=[2, 3], keepdims=True)
    net = _conv(net, 1000, 1, 1, 0, True, False, 'linear')  # 1000
    net = l.Flatten(data_format='channels_first', name='logits')(net)
    net = l.Activation('linear', dtype='float32', name='output')(net)

    return tf.keras.Model(inputs=image, outputs=[net] + routes)
在以上的代码中,Darknet53模型有4个输出,其中route1、route2、route3这三个输出是留待以后搭建YOLOv3网络时用的,在图像分类中暂时用不上。
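论文介绍了以下的图像预处理步骤:
1. 随机抽取一张图片,解码为取值范围在[0, 255]的32位浮点数像素值。
2. 随机裁剪一个矩形区域,其长宽比在[3/4, 4/3]之间随机取值,面积在原图的[8%, 100%]之间随机取值,然后把裁剪出的区域缩放为224*224的图片。
3. 以0.5的概率对图片做水平翻转。
4. 用[0.6, 1.4]之间均匀采样的系数调整色调、饱和度和亮度。
5. 加入PCA噪声,其系数从正态分布N(0, 0.1)中采样。
6. 对RGB三个通道做归一化,分别减去123.68、116.779、103.939,再分别除以58.393、57.12、57.375。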
我也遵照以上的步骤进行处理,只是在第3步翻转图片之后,参照Darknet里面的方式,增加了一个随机旋转图片的步骤,旋转角度是[-7, 7]之间的一个随机数。另外,第5步的操作在论文里面没有给出详细的介绍,我是参考mxnet里面的代码来实现的。对于验证集的图片,需要把以上的第2步改为:把图片的最短边缩放为256并保持长宽比,然后在图片中间裁剪一个224*224的矩形;之后跳过第3、4、5步,直接执行第6步即可。以下是在TensorFlow 2.1版本下构建训练集和验证集的代码。这里我用到的ImageNet数据已事先整理为TFRecord格式,具体做法可以参见我之前的博客 https://blog.csdn.net/gzroy/article/details/85954329
论文中提到了以下一些技巧:
完整的训练代码如下:
import tensorflow as tf
import tensorflow_addons as tfa
import math
import os
import random
import time
import numpy as np
from darknet53_model import darknet53_base
from tensorflow.keras.mixed_precision import experimental as mixed_precision
# Short alias for the Keras layers namespace.
l = tf.keras.layers

# Make 'mixed_float16' the global Keras policy so layers created afterwards use it.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)

# Input geometry and batching.
imageWidth = 224
imageHeight = 224
imageDepth = 3
batch_size = 128
# NOTE(review): presumably the short-side length validation images are resized to
# before the centre crop — confirm against the validation parse function.
resize_min = 256

# Schedule length. train_images is presumably a rounded count of the ImageNet
# training set — confirm.
train_images = 1280000
batches_per_epoch = train_images // batch_size
train_epochs = 80
total_steps = batches_per_epoch * train_epochs

# Random-crop augmentation: aspect ratio drawn from [3/4, 4/3], area fraction from [0.08, 1].
random_min_aspect = 0.75
random_max_aspect = 1 / 0.75
random_min_area = 0.08
# NOTE(review): presumably the bound of the random rotation angle — confirm the unit
# (degrees vs. radians) where it is used.
random_angle = 7.

# NOTE(review): presumably the length of the learning-rate warm-up and the learning
# rate reached at its end — confirm against the LR callback.
initial_warmup_steps = 1000
initial_lr = 0.02

# NOTE(review): these look like the eigenvectors/eigenvalues of the ImageNet RGB
# covariance used for PCA ("lighting") noise — confirm against the augmentation code.
eigvec = tf.constant([[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]], shape=[3,3], dtype=tf.float32)
eigval = tf.constant([55.46, 4.794, 1.148], shape=[3,1], dtype=tf.float32)

# Per-channel RGB mean and standard deviation for input normalisation.
# The blue mean was 109.939, which is a typo: the "Bag of Tricks" paper (and the usual
# ImageNet statistics) use 103.939.
# NOTE: weights trained with the old value saw a slightly shifted blue channel; keep the
# old value only when you must reproduce such a checkpoint exactly.
mean_RGB = tf.constant([123.68, 116.779, 103.939], dtype=tf.float32)
std_RGB = tf.constant([58.393, 57.12, 57.375], dtype=tf.float32)

# Every entry of the two directories is used as an input file
# (os.listdir returns them in arbitrary order).
train_files_names = os.listdir('../train_tf/')
train_files = ['../train_tf/'+item for item in train_files_names]
valid_files_names = os.listdir('../valid_tf/')
valid_files = ['../valid_tf/'+item for item in valid_files_names]
# Parse TFRECORD and distort the image for train
def _parse_function(example_proto):
features = {
"image": tf.io.FixedLenFeature([], tf.string, default_value=""),
"height": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
"width": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
"channels": tf.io.FixedLenFeature([1], tf.int64, default_value=[3]),
"colorspace": tf.io.FixedLenFeature([], tf.string, default_value=""),
"img_format": tf.io.FixedLenFeature([], tf.string, default_value=""),
"label": tf.io.FixedLenFeature([1], tf.int64, default_value=[0]),
"bbox_xmin": tf.io.VarLenFeature(tf.float32),
"bbox_xmax": tf.io.VarLenFeature(tf.float32),
"bbox_ymin": tf.io.VarLenFeature(tf.float32),
"bbox_ymax": tf.io.VarLenFeature(tf.float32),
"text": tf.io.FixedLenFeature([], tf.string, default_value=""),
"filename": tf.io.FixedLenFeature([], tf.string, default_value="")
}
parsed_features = tf.io.parse_single_example(example_proto, features)
image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
image_decoded = tf.cast(image_decoded, dtype=tf.float32)
# Random crop the image
shape = tf.shape(image_decoded)
height, width = shape[0], shape[1]
random_aspect = tf.random.uniform(shape=[], minval=random_min_aspect, maxval=random_max_aspect)
random_area = tf.random.uniform(shape=[], minval=random_min_area, maxval=1.0)
crop_width = tf.math.sqrt(
tf.divide(
tf.multiply(
tf.cast(tf.multiply(height,width), tf.float32),
random_area),
random_aspect)
)
crop_height = tf.cast(crop_width * random_aspect, tf.int32)
crop_height = tf.cond(crop_height= initial_warmup_steps:
tf.keras.backend.set_value(self.model.optimizer.lr, learning_rate_fn(step))
print("Steps:{}, LR:{:6.4f}, Loss:{:4.2f}, Time:{:4.1f}s"\
.format(step, lr, logs['loss'], elasp_time))
def on_epoch_end(self, epoch, logs=None):
    """Print the validation accuracies and the wall-clock duration of the epoch.

    Args:
        epoch: Epoch index, printed as-is.
        logs: Metric dictionary; must contain 'val_output_top_1_accuracy' and
            'val_output_top_5_accuracy'.

    The duration is measured from `self.epoch_starttime`, which `on_epoch_begin` sets.
    """
    elapsed = time.time() - self.epoch_starttime
    top1 = logs['val_output_top_1_accuracy']
    top5 = logs['val_output_top_5_accuracy']
    summary = "Epoch:{}, Top-1 Accuracy:{:5.3f}, Top-5 Accuracy:{:5.3f}, Time:{:5.1f}s"
    print(summary.format(epoch, top1, top5, elapsed))
def on_epoch_begin(self, epoch, logs=None):
    """Put Keras into training mode and record when the epoch started.

    The timestamp is stored in `self.epoch_starttime` so that the epoch
    duration can be reported later. `epoch` and `logs` are not used.
    """
    backend = tf.keras.backend
    backend.set_learning_phase(True)
    self.epoch_starttime = time.time()
def on_test_begin(self, logs=None):
    """Put Keras into inference mode before evaluation starts. `logs` is not used."""
    backend = tf.keras.backend
    backend.set_learning_phase(False)
# Training driver: build callbacks and model, compile, then fit.
# TensorBoard event files and per-epoch model files go under 'darknet53_20200203/'.
tensorboard_cbk = tf.keras.callbacks.TensorBoard(log_dir='darknet53_20200203/logs')
# NOTE(review): with ModelCheckpoint's default settings a full model file is written
# after every epoch ('{epoch}' is filled in with the epoch number) — confirm.
checkpoint_cbk = tf.keras.callbacks.ModelCheckpoint(filepath='darknet53_20200203/epoch_{epoch}.h5', verbose=1)
model = darknet53_base()
model.compile(
# Only the 'output' head (the class logits) gets a loss; the route1/route2/route3
# outputs of the model are not trained directly.
loss={
'output':
tf.keras.losses.CategoricalCrossentropy(
from_logits=True, label_smoothing=0.1)
},
# NOTE(review): the callback in this file overwrites optimizer.lr during training,
# so 0.001 appears to be only the starting value — confirm against LRCallback.
optimizer=tf.keras.optimizers.SGD(
learning_rate=0.001, momentum=0.9),
# These names become the log keys 'val_output_top_1_accuracy' and
# 'val_output_top_5_accuracy' that the epoch-end report reads.
metrics={
'output':[
tf.keras.metrics.CategoricalAccuracy(
name='top_1_accuracy'),
tf.keras.metrics.TopKCategoricalAccuracy(
k=5,
name='top_5_accuracy')]
}
)
# train_input_fn / val_input_fn are defined elsewhere in this file.
train_data = train_input_fn()
val_data = val_input_fn()
# A Keras "epoch" here is 5000 steps, which is half of batches_per_epoch (10000) as
# computed in the configuration, so validation and checkpointing run every 5000 steps.
# verbose=0 turns off the built-in progress output.
# NOTE(review): LRCallback receives the current time, presumably as the training
# start time — confirm against its constructor.
_ = model.fit(
train_data,
validation_data=val_data,
epochs=2,
initial_epoch=0,
verbose=0,
callbacks=[LRCallback(time.time()), tensorboard_cbk, checkpoint_cbk],
steps_per_epoch=5000)
最终在训练了300000个Batch(30个Epoch)之后,模型在验证集上达到了Top-1 71.5%、Top-5 90.6%的准确率。这个结果离论文提到的性能以及YOLOv3的性能还有一定的差距,不过暂时还想不到能进一步提高的方法。