基于Imagenet训练的深度学习卷积网络调参心得

都说深度学习是炼丹术,调参是炼丹的核心技能。最近基于Imagenet的数据集,测试了一下不同的参数对于性能的影响,在此总结一下。

首先搭建一个深度的卷积神经网络,网络结构参照YOLO论文中的对Imagenet预训练的网络,即一个20层的卷积网络再加上一个全连接层,具体的网络结构代码如下:

import tensorflow as tf

def _conv(name, inputs, kernel_size, in_channels, out_channels, stride, padding, trainable, bias_init, training):
    """Build one conv -> bias -> leaky ReLU -> batch-norm unit inside scope `name`.

    Args:
        name: Variable scope name; also used as the name of the leaky ReLU op.
        inputs: Input tensor fed to `tf.nn.conv2d` (the strides and the
            batch-norm axis below treat the last axis as channels).
        kernel_size: Side length of the square convolution kernel.
        in_channels: Number of input channels of the kernel.
        out_channels: Number of output channels; also the bias length.
        stride: Stride applied to both spatial dimensions.
        padding: Padding mode forwarded to `tf.nn.conv2d`.
        trainable: Whether the weights, biases and batch-norm variables are trainable.
        bias_init: Constant used to initialise every bias element.
        training: Forwarded to batch normalization as its `training` flag.

    Returns:
        The batch-normalized activation tensor.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        # Variance-scaling initializer (factor 2, fan-in, non-uniform) for the kernel.
        weight_init = tf.contrib.layers.variance_scaling_initializer(
            factor=2.0, mode='FAN_IN', uniform=False)
        filters = tf.get_variable(
            name='weights',
            shape=[kernel_size, kernel_size, in_channels, out_channels],
            initializer=weight_init,
            trainable=trainable)
        feature_map = tf.nn.conv2d(inputs, filters, [1, stride, stride, 1], padding=padding)

        bias_values = tf.constant(bias_init, shape=[out_channels], dtype=tf.float32)
        offsets = tf.get_variable(name='biases', initializer=bias_values, trainable=trainable)
        pre_activation = tf.nn.bias_add(feature_map, offsets)

        # Batch normalization is deliberately applied after the activation here.
        activated = tf.nn.leaky_relu(pre_activation, alpha=0.1, name=name)
        return tf.layers.batch_normalization(
            activated,
            axis=3,
            name='bn',
            trainable=trainable,
            training=training,
            reuse=tf.AUTO_REUSE)

def inference(images, pretrain=True, wd=None, training=True):
    """Build the 20-layer convolutional classifier and return its 1000-way output.

    The network stacks twenty `_conv` units interleaved with four 2x2 max-pool
    layers, averages the final feature map over its spatial axes and applies
    one fully connected layer with a [1024, 1000] weight matrix.

    Args:
        images: Input image batch. The per-layer shape comments assume a
            224x224x3 channels-last input -- confirm against the caller.
        pretrain: Passed as the `trainable` flag of every variable.
        wd: L2 weight-decay coefficient for the fully connected weights. When
            None (the default) no weight-decay term is created or added to the
            'losses' collection.
        training: Forwarded to every batch-normalization layer.

    Returns:
        The output tensor of the final fully connected layer.
    """
    conv1 = _conv('conv1', images, 7, 3, 64, 2, 'SAME', pretrain, 0.01, training)       #112*112*64
    pool1 = tf.nn.max_pool(conv1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool1')   #56*56*64
    conv2 = _conv('conv2', pool1, 3, 64, 192, 1, 'SAME', pretrain, 0.01, training)      #56*56*192
    pool2 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool2')   #28*28*192
    conv3 = _conv('conv3', pool2, 1, 192, 128, 1, 'SAME', pretrain, 0.01, training)     #28*28*128
    conv4 = _conv('conv4', conv3, 3, 128, 256, 1, 'SAME', pretrain, 0.01, training)     #28*28*256
    conv5 = _conv('conv5', conv4, 1, 256, 256, 1, 'SAME', pretrain, 0.01, training)     #28*28*256
    conv6 = _conv('conv6', conv5, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)     #28*28*512
    pool6 = tf.nn.max_pool(conv6, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool6')   #14*14*512
    conv7 = _conv('conv7', pool6, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)     #14*14*256
    conv8 = _conv('conv8', conv7, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)     #14*14*512
    conv9 = _conv('conv9', conv8, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)     #14*14*256
    conv10 = _conv('conv10', conv9, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)   #14*14*512
    conv11 = _conv('conv11', conv10, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  #14*14*256
    conv12 = _conv('conv12', conv11, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv13 = _conv('conv13', conv12, 1, 512, 256, 1, 'SAME', pretrain, 0.01, training)  #14*14*256
    conv14 = _conv('conv14', conv13, 3, 256, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv15 = _conv('conv15', conv14, 1, 512, 512, 1, 'SAME', pretrain, 0.01, training)  #14*14*512
    conv16 = _conv('conv16', conv15, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #14*14*1024
    pool16 = tf.nn.max_pool(conv16, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool16')  #7*7*1024
    conv17 = _conv('conv17', pool16, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training) #7*7*512
    conv18 = _conv('conv18', conv17, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #7*7*1024
    conv19 = _conv('conv19', conv18, 1, 1024, 512, 1, 'SAME', pretrain, 0.01, training) #7*7*512
    conv20 = _conv('conv20', conv19, 3, 512, 1024, 1, 'SAME', pretrain, 0.01, training) #7*7*1024

    # Global average pooling over the two spatial axes, then drop the 1x1 dims.
    avg_layer = tf.reduce_mean(conv20, axis=[1,2], keepdims=True)    #1024
    flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')
    with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
        # Float literal so the stddev is 0.001 even under Python 2 integer division
        # (1/(1000) evaluates to 0 there, which would zero-initialise the weights).
        weights = tf.get_variable(initializer=tf.truncated_normal([1024,1000], dtype=tf.float32, stddev=1.0/1000), trainable=pretrain, name='weights')
        # tf.multiply cannot take None, so only build the L2 penalty when a
        # coefficient was actually supplied.
        if wd is not None:
            weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
            tf.add_to_collection('losses', weight_decay)
        biases = tf.get_variable(initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32), trainable=pretrain, name='biases')
        local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
    return local

网络训练的代码如下:

import tensorflow as tf
import os
import random
import time

# Network input geometry: width, height and channel count of each image.
imageWidth, imageHeight, imageDepth = 224, 224, 3
# Number of examples per training batch.
batch_size = 112
# NOTE(review): presumably the size the shorter image side is resized to
# before cropping -- confirm against the preprocessing code.
resize_min = 256

def distort_color(image, color_ordering=0):
    """Apply four random colour distortions in one of four fixed orders.

    Args:
        image: Image tensor to distort. The result is clipped to [0.0, 1.0],
            so a float image in that range is presumably expected -- confirm
            against the caller.
        color_ordering: Selects the order of the distortions (0, 1, 2 or 3).
            Any other value applies no distortion, only the final clipping.

    Returns:
        The distorted image with values clipped to [0.0, 1.0].
    """
    def brightness(img):
        return tf.image.random_brightness(img, max_delta=32. / 255.)

    def saturation(img):
        return tf.image.random_saturation(img, lower=0.5, upper=1.5)

    def hue(img):
        return tf.image.random_hue(img, max_delta=0.2)

    def contrast(img):
        return tf.image.random_contrast(img, lower=0.5, upper=1.5)

    # Each ordering is a rotation of the same four operations.
    orderings = (
        (0, (brightness, saturation, hue, contrast)),
        (1, (saturation, hue, contrast, brightness)),
        (2, (hue, contrast, brightness, saturation)),
        (3, (contrast, brightness, saturation, hue)),
    )
    # Every entry is checked with `==` and without early exit, mirroring a
    # sequence of independent `if` statements.
    for ordering_id, operations in orderings:
        if color_ordering == ordering_id:
            for operation in operations:
                image = operation(image)
    return tf.clip_by_value(image, 0.0, 1.0)

# Parse TFRECORD and distort the image for train
def _parse_function(example_proto):
    features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
                "height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
                "colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
                "img_format": tf.FixedLenFeature([], tf.string, default_value=""),
                "label": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
                "bbox_xmin": tf.VarLenFeature(tf.float32),
                "bbox_xmax": tf.VarLenFeature(tf.float32),
                "bbox_ymin": tf.VarLenFeature(tf.float32),
                "bbox_ymax": tf.VarLenFeature(tf.float32),
                "text": tf.FixedLenFeature([], tf.string, default_value=""),
                "filename": tf.FixedLenFeature([], tf.string, default_value="")
               }
    parsed_features = tf.parse_single_example(example_proto, features)

    image_decoded = tf.image.decode_jpeg(parsed_features["image"], channels=3)
    image_decoded = tf.image.convert_image_dtype(image_decoded, tf.float32)
    shape = tf.shape(image_decoded)
    height, width = shape[0], shape[1]
    resized_height, resized_width = tf.cond(height

测试结论如下:

1. 在每个卷积层的输出之后增加Batch Normalization,可以加快网络收敛,提高网络性能。Batch normalization应该增加在激活函数之后。增加Batch normalization之后,在调用Optimizer之前,需要确保Batch normalization的滑动平均值和滑动方差的更新操作(UPDATE_OPS)都已执行,因此需要增加以下代码:

# Collect every op registered in the graph's UPDATE_OPS collection (the
# surrounding text uses this for the batch-normalization statistics updates).
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
# Create the training op under a control dependency on those update ops, so
# they are executed whenever the optimizer step runs.
with tf.control_dependencies(update_ops):
    optimize_op = optimizer.minimize(loss, global_step=global_step)

另外,在训练时,tf.layers.batch_normalization的training参数要设置为True,在预测时要设置为False

2. 对于图像的预处理,测试了几种不同的方式:

    a. 把图像的像素值转换为0-1

    b. 把图像的像素值转换为均值为0的正态分布

    c. 随机改变图像的对比度,饱和度,亮度,色相

    经过测试,把图像像素值转换为均值为0的正态分布的效果最好

3. 对L2的weight decay的参数的测试,测试了0, 0.005, 0.0005, 0.00005这几个取值,发现0.00005这个取值效果最好

最终的训练结果为,在训练10个EPOCH后,TOP 5的准确率为83.3%, TOP 1的准确率为61.5%

你可能感兴趣的:(机器学习,人工智能)