TensorFlow学习--AlexNet实现&图像识别

AlexNet主要技术点

AlexNet使用的主要技术:
1. 使用ReLU作为CNN的激活函数,解决了Sigmoid在较深网络中的梯度弥散问题(vanishing gradient problem).
2. 训练时使用Dropout随机忽略一部分神经元,避免了模型的过拟合问题.
3. 在CNN中使用重叠的最大池化,避免了平均池化造成的模糊效果.同时让步长小于池化核的尺寸,使池化层的输出发生重叠和覆盖,提升特征的丰富性.
4. 提出了LRN层,对局部神经元的活动创建竞争机制,强化响应比较大的神经元,抑制反馈较小的神经元,增强模型的泛化能力.
5. 数据增强.随机地从256*256的原始图像中截取224*224大小的区域以及水平翻转图像(相当于增加了 $(256-224)^2 \times 2 = 2048$ 倍的数据量).进行预测时,取图片的四个角加中间共5个位置并进行翻转,即10个图像,对其进行预测并对10次结果求均值.
6. 使用CUDA加速深度卷积网络的训练,利用GPU强大的并行运算能力,处理神经网络训练时大量的矩阵运算.

AlexNet网络结构

AlexNet的网络结构:

TensorFlow学习--AlexNet实现&图像识别_第1张图片
5个卷积层+3个全连接层

AlexNet每层的超参数如图.
使用两个GPU:一个GPU运行图中上半部分的层,另一个GPU运行图中下半部分的层.两个GPU仅在某些层进行通信.
输入的图片规格为224*224*3,预处理后为227*227*3.
第一个卷积层使用96个较大的11*11尺寸的卷积核,步长为4(采用了2个GPU处理,每个GPU处理48个).原图像为RGB图像,是3通道,此处96个过滤器也是3通道的.得到的特征图大小new_feature_size = (img_size - filter_size)/stride + 1 = (227-11)/4+1 = 55,即大小为55*55.紧接着一个LRN层,然后是一个3*3的Max pooling最大池化层,步长为2.

AlexNet耗时测试

使用随机图片数据测试AlexNet前馈/反馈的平均耗时:

#!/usr/bin/python
# coding:utf-8

# TensorFlow实现AlexNet

from datetime import datetime
import math
import time
import tensorflow as tf


def convLayer(x, name, kh, kw, n_out, dh, dw, p):
    """Build a convolution + bias + ReLU layer and record its variables.

    Args:
        x: Input tensor; its last dimension is read as the channel count.
        name: Name scope of the layer; the scope is also used as the name of
            the ReLU op.
        kh: Kernel height.
        kw: Kernel width.
        n_out: Number of kernels, i.e. output channels.
        dh: Stride along the height axis.
        dw: Stride along the width axis.
        p: List that the kernel and bias variables are appended to. It is
            extended in place.

    Returns:
        A tuple ``(activation, p)``: the ReLU output tensor and the same
        parameter list that was passed in.
    """
    # Number of channels of the input x.
    n_in = x.get_shape()[-1].value
    with tf.name_scope(name) as scope:
        # Kernel of shape (kh, kw, n_in, n_out), initialised from a truncated
        # normal distribution.
        kernel = tf.Variable(tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=1e-1), name='weights')
        # Convolve x with strides dh x dw; SAME padding pads the borders.
        conv = tf.nn.conv2d(x, kernel, [1, dh, dw, 1], padding='SAME')
        # Biases start at 0.
        biases = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float32), trainable=True, name='biases')
        # conv + biases
        bias = tf.nn.bias_add(conv, biases)
        activation = tf.nn.relu(bias, name=scope)
        # Add the trainable variables (kernel, biases) to p.
        p += [kernel, biases]
        # Print the structure of the activation tensor. A single formatted
        # string gives the same output on Python 2 and Python 3.
        print('%s   %s' % (activation.op.name, activation.get_shape().as_list()))
    return activation, p


# Total number of batches to test: 100.
num_batches = 100

# Fully connected layer.
def fcLayer(x, inputData, outputData, reluFlag, name):
    """Create an ``x * w + b`` layer inside variable scope ``name``.

    Args:
        x: Input tensor multiplied by the weight matrix.
        inputData: Number of rows of the weight matrix.
        outputData: Number of columns of the weight matrix and bias length.
        reluFlag: When truthy, the result is passed through ReLU.
        name: Variable scope holding the variables ``w`` and ``b``.

    Returns:
        The layer output, rectified only if ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weights = tf.get_variable('w', shape=[inputData, outputData], dtype='float')
        offsets = tf.get_variable('b', [outputData], dtype='float')
        affine = tf.nn.xw_plus_b(x, weights, offsets, name=scope.name)
        return tf.nn.relu(affine) if reluFlag else affine


# Take `images` as input; return the last pooling layer (pool5) and the list
# of convolution parameters collected while building the network.
def AlexNet(images, classNum=None, dropoutrate=None):
    """Build the AlexNet graph and print the shape of its main layers.

    Args:
        images: Input tensor fed to the first convolution layer.
        classNum: Output size of fc8. When None the fully connected head
            (fc6-fc8) is not built; pool5 is still returned.
        dropoutrate: Forwarded unchanged as the second positional argument
            of ``tf.nn.dropout``. When None no dropout op is added.
            NOTE(review): in TF 1.x that argument is ``keep_prob`` despite
            this parameter's name -- confirm against the TF version in use.

    Returns:
        A tuple ``(pool5, parameters)`` where ``parameters`` holds the kernel
        and bias variables of the five convolution layers. Variables of the
        fully connected layers are not included.
    """
    parameters = []

    # Convolution layer 1
    conv1, parameters = convLayer(images, name='conv1', kh=11, kw=11, n_out=64, dh=4, dw=4, p=parameters)
    # LRN on conv1, followed by max pooling.
    lrn1 = tf.nn.lrn(conv1, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn1')
    # 3x3 pooling window, 2x2 stride; VALID padding never samples outside
    # the border.
    pool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
    # Print the structure of pool1 (same text on Python 2 and Python 3).
    print('%s   %s' % (pool1.op.name, pool1.get_shape().as_list()))

    # Convolution layer 2
    conv2, parameters = convLayer(pool1, name='conv2', kh=5, kw=5, n_out=192, dh=1, dw=1, p=parameters)
    # LRN
    lrn2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn2')
    # Max pooling
    pool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
    # Print the structure of pool2.
    print('%s   %s' % (pool2.op.name, pool2.get_shape().as_list()))

    # Convolution layer 3
    conv3, parameters = convLayer(pool2, name='conv3', kh=3, kw=3, n_out=384, dh=1, dw=1, p=parameters)

    # Convolution layer 4
    conv4, parameters = convLayer(conv3, name='conv4', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)

    # Convolution layer 5
    conv5, parameters = convLayer(conv4, name='conv5', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)
    pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')
    print('%s   %s' % (pool5.op.name, pool5.get_shape().as_list()))

    # The fully connected head needs a concrete class count; with the
    # default classNum=None it is skipped instead of failing while the
    # variables are created.
    if classNum is not None:
        # The flatten size assumes pool5 is 6x6x256 per image.
        fc_in = tf.reshape(pool5, [-1, 256*6*6])
        fc6 = fcLayer(fc_in, 256*6*6, 4096, True, 'fc6')
        dropout6 = fc6 if dropoutrate is None else tf.nn.dropout(fc6, dropoutrate)

        fc7 = fcLayer(dropout6, 4096, 4096, True, 'fc7')
        dropout7 = fc7 if dropoutrate is None else tf.nn.dropout(fc7, dropoutrate)

        # fc8 produces the class scores, so no ReLU is applied to it. The
        # layer is added to the graph but is not part of the return value.
        fcLayer(dropout7, 4096, classNum, False, 'fc8')

    return pool5, parameters




# Measure how long each run of an AlexNet op takes.
# Inputs: the TensorFlow Session, the op to evaluate (target) and a label
# for the report (info_string).
def time_tensorflow_run(session, target, info_string, num_steps=None):
    """Run ``target`` repeatedly and print timing statistics.

    Args:
        session: Object whose ``run(target)`` method is timed.
        target: Value passed to ``session.run`` on every step.
        info_string: Label printed in the summary line.
        num_steps: Number of timed steps. Defaults to the module-level
            ``num_batches`` when None.

    Raises:
        ValueError: If the number of timed steps is smaller than 1.
    """
    if num_steps is None:
        num_steps = num_batches
    if num_steps < 1:
        raise ValueError('num_steps must be at least 1, got %r' % (num_steps,))

    # Warm-up steps: the first 10 runs are executed but not counted, so that
    # one-off start-up costs do not distort the statistics.
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0

    for i in range(num_steps + num_steps_burn_in):
        start_time = time.time()
        session.run(target)
        # Elapsed time of this step.
        duration = time.time() - start_time
        if i >= num_steps_burn_in:
            # Only steps after the warm-up are reported and accumulated.
            if not i % 10:
                print('%s: step %d, duration = %.3f' % (datetime.now().strftime('%X'), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    # Mean (mn) and standard deviation (sd) of the time per step.
    mn = total_duration / num_steps
    # E[d^2] - E[d]^2 can come out slightly negative through floating-point
    # rounding when the durations are nearly identical; clamp it so that
    # math.sqrt does not raise.
    vr = max(total_duration_squared / num_steps - mn * mn, 0.0)
    sd = math.sqrt(vr)
    # Print the summary.
    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % (datetime.now().strftime('%X'), info_string, num_steps, mn, sd))


# Time the forward and backward passes on random image data.
def run_benchmark():
    """Build AlexNet on random inputs and time forward and gradient runs.

    The network is built in a fresh graph. The session is opened with a
    ``with`` block so that it is closed when the benchmark finishes, also
    when one of the timed runs raises.
    """
    with tf.Graph().as_default():
        batch_size = 32
        image_size = 224
        # Random image data.
        images = tf.Variable(tf.random_normal([batch_size,           # samples per step
                                               image_size, image_size,  # image size 224*224
                                               3],                   # number of channels
                                              dtype=tf.float32,      # data type
                                              stddev=1e-1))          # standard deviation
        # Build AlexNet; get pool5 and the list of trainable parameters.
        pool5, parameters = AlexNet(images, classNum=1000, dropoutrate=0.5)
        # Initialise all variables.
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)

            # Time the forward pass.
            time_tensorflow_run(sess, pool5, 'Forward')
            # Use the L2 loss of pool5 as a stand-in training objective.
            objective = tf.nn.l2_loss(pool5)
            # Gradients of the objective with respect to all parameters, to
            # imitate a training step.
            grad = tf.gradients(objective, parameters)
            # Time the forward + backward pass.
            time_tensorflow_run(sess, grad, 'Forward-backward')


# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    run_benchmark()

打印输出:

conv1   [32, 56, 56, 64]
pool1   [32, 27, 27, 64]
conv2   [32, 27, 27, 192]
pool2   [32, 13, 13, 192]
conv3   [32, 13, 13, 384]
conv4   [32, 13, 13, 256]
conv5   [32, 13, 13, 256]
pool5   [32, 6, 6, 256]

19:43:26: step 0, duration = 1.526
19:43:43: step 10, duration = 2.018
19:44:03: step 20, duration = 1.618
19:44:19: step 30, duration = 1.583
19:44:37: step 40, duration = 1.808
19:44:56: step 50, duration = 1.749
19:45:13: step 60, duration = 1.849
19:45:32: step 70, duration = 1.837
19:45:49: step 80, duration = 1.587
19:46:06: step 90, duration = 1.663
19:46:23: Forward across 100 steps, 1.789 +/- 0.210 sec / batch
19:47:30: step 0, duration = 5.831
19:48:34: step 10, duration = 5.831
19:49:49: step 20, duration = 8.383
19:50:57: step 30, duration = 6.152
19:52:48: step 40, duration = 13.673
19:54:44: step 50, duration = 10.054
19:56:32: step 60, duration = 11.055
19:58:17: step 70, duration = 10.246
20:00:06: step 80, duration = 12.227
20:02:01: step 90, duration = 10.946
20:03:31: Forward-backward across 100 steps, 9.666 +/- 2.279 sec / batch

可以看到5个卷积层以及最后一个池化层,以及每一层输出tensor的尺寸.
然后还可以看到forward以及backward运算的时间,此处没有使用GPU,因此可以看到每轮迭代的时间消耗比较大.


AlexNet实现及图像识别

# AlexNet实现
import tensorflow as tf
import numpy as np


# Convolution layer.
# With groups=2 it corresponds to AlexNet's split into an upper and a lower
# half.
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding="SAME", groups=1):
    """Grouped 2-D convolution followed by bias add and ReLU.

    Args:
        x: Input tensor; its last dimension is read as the channel count and
            is the axis that gets split into groups.
        kHeight: Kernel height.
        kWidth: Kernel width.
        strideX: Stride along axis 2 of ``x``.
        strideY: Stride along axis 1 of ``x``.
        featureNum: Total number of output feature maps over all groups.
        name: Variable scope holding the variables ``w`` and ``b``; also the
            name given to the ReLU op.
        padding: Padding mode passed to ``tf.nn.conv2d``.
        groups: Number of groups the input channels and the kernels are
            split into. The splits are done by ``tf.split``, so the channel
            count and ``featureNum`` have to be divisible by it.

    Returns:
        The ReLU activation of the concatenated group outputs plus bias.
    """
    # Number of input channels.
    channel = int(x.get_shape()[-1])

    # Convolution applied to each (input slice, kernel slice) pair.
    def conv(a, b):
        return tf.nn.conv2d(a, b, strides=[1, strideY, strideX, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Floor division keeps the dimension an int; under Python 3 a plain
        # "/" would yield a float, which is not a valid shape entry.
        w = tf.get_variable("w", shape=[kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape=[featureNum])
        # Split the input and the weights into per-group sub-tensors.
        xNew = tf.split(value=x, num_or_size_splits=groups, axis=3)
        wNew = tf.split(value=w, num_or_size_splits=groups, axis=3)
        # Compute the feature maps of every group.
        featureMap = [conv(t1, t2) for t1, t2 in zip(xNew, wNew)]
        # Merge the feature maps along the channel axis.
        mergeFeatureMap = tf.concat(axis=3, values=featureMap)
        out = tf.nn.bias_add(mergeFeatureMap, b)
        # bias_add keeps the shape of its input, so no reshape is needed;
        # reshaping to the static shape list would fail whenever a dimension
        # (e.g. the batch size) is unknown.
        return tf.nn.relu(out, name=scope.name)

# Fully connected layer.
def fcLayer(x, inputD, outputD, reluFlag, name):
    """Apply ``x * w + b`` with variables created in scope ``name``.

    Args:
        x: Input tensor.
        inputD: First dimension of the weight matrix ``w``.
        outputD: Second dimension of ``w`` and length of the bias ``b``.
        reluFlag: Whether the output is passed through ReLU.
        name: Variable scope that owns ``w`` and ``b``.

    Returns:
        The affine output, or its ReLU when ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weight = tf.get_variable("w", shape=[inputD, outputD], dtype="float")
        bias = tf.get_variable("b", [outputD], dtype="float")
        linear = tf.nn.xw_plus_b(x, weight, bias, name=scope.name)
        if not reluFlag:
            return linear
        return tf.nn.relu(linear)

# AlexNet model.
class alexNet(object):
    """AlexNet graph built at construction time, with a weight loader.

    After construction the class scores produced by the last fully
    connected layer are available as ``self.fc3``.
    """

    def __init__(self, x, keepPro, classNum, modelPath="bvlc_alexnet.npy"):
        """Store the configuration and build the network.

        Args:
            x: Input tensor fed to the first convolution layer.
            keepPro: Value forwarded as the second argument of
                ``tf.nn.dropout`` for both dropout layers.
            classNum: Output size of the last fully connected layer.
            modelPath: Path of the ``.npy`` weight file read by
                ``loadModel``.
        """
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.MODELPATH = modelPath
        self.buildCNN()

    def buildCNN(self):
        """Create all layers and store the fc8 output in ``self.fc3``."""
        # NOTE(review): both stages below pool first and normalise second.
        # The BVLC AlexNet definition appears to normalise before pooling
        # (pool-then-norm is the CaffeNet order) -- confirm which order
        # matches the weight file before changing it.
        # Convolution layer 1
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        # Max pooling, 3x3 window, 2x2 stride
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
        lrn1 = tf.nn.lrn(pool1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='norm1')
        # Convolution layer 2
        conv2 = convLayer(lrn1, 5, 5, 1, 1, 256, "conv2", groups=2)
        # Max pooling, 3x3 window, 2x2 stride
        pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
        lrn2 = tf.nn.lrn(pool2, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='lrn2')
        # Convolution layer 3
        conv3 = convLayer(lrn2, 3, 3, 1, 1, 384, "conv3")
        # Convolution layer 4
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        # Convolution layer 5
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        # Max pooling, 3x3 window, 2x2 stride
        pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')

        # Fully connected layer 1; the flatten size assumes pool5 is
        # 6x6x256 per image.
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = tf.nn.dropout(fc1, self.KEEPPRO)
        # Fully connected layer 2
        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = tf.nn.dropout(fc2, self.KEEPPRO)
        # Fully connected layer 3: raw class scores. No ReLU here -- it
        # would set every negative score to zero before the softmax.
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    # Load the model.
    def loadModel(self, sess, skipLayers=()):
        """Assign the weights stored in ``self.MODELPATH`` to the variables.

        The file is expected to hold a dict that maps a layer name to a
        sequence of arrays; 1-D arrays are assigned to the layer's ``b``
        variable, all other arrays to its ``w`` variable.

        Args:
            sess: Session used to run the assign ops.
            skipLayers: Layer names whose stored weights are not loaded.
                Empty by default, i.e. every layer in the file is loaded.
        """
        # SECURITY: the file stores a pickled dict, so allow_pickle=True is
        # required (NumPy >= 1.16.3 refuses pickled data by default).
        # Unpickling can execute arbitrary code -- only load trusted files.
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        # Layers stored in the model file.
        for name in wDict:
            if name in skipLayers:
                continue
            with tf.variable_scope(name, reuse=True):
                for p in wDict[name]:
                    # A bias has a single dimension; anything else is a
                    # weight tensor.
                    varName = 'b' if len(p.shape) == 1 else 'w'
                    sess.run(tf.get_variable(varName, trainable=False).assign(p))


import os
import cv2
import caffe_classes


# AlexNet test: classify every image in the test directory and show the
# predicted class name drawn on the image.
if __name__ == '__main__':
    dropoutPro = 1
    classNum = 1000
    testPath = "testimage"
    # Read the test images. cv2.imread returns None for files it cannot
    # decode, so those entries are skipped instead of crashing later.
    testImg = []
    for f in os.listdir(testPath):
        loaded = cv2.imread(os.path.join(testPath, f))
        if loaded is not None:
            testImg.append(loaded)

    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 is the same type.
    imgMean = np.array([104, 117, 124], np.float64)
    x = tf.placeholder("float", [1, 227, 227, 3])
    # AlexNet model
    model = alexNet(x, dropoutPro, classNum)
    score = model.fc3
    print(score)
    softmax = tf.nn.softmax(score)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the pretrained weights.
        model.loadModel(sess)
        for img in testImg:
            # Resize to the network input size and subtract the mean.
            test = cv2.resize(img.astype(np.float64), (227, 227)) - imgMean
            # Add the batch dimension.
            test = test.reshape((1, 227, 227, 3))
            # Index of the class with the highest probability.
            maxx = np.argmax(sess.run(softmax, feed_dict={x: test}))
            # Name of that class.
            res = caffe_classes.class_names[maxx]
            print(res)
            # Font used for the label.
            font = cv2.FONT_HERSHEY_SIMPLEX
            # Draw the class name. The text origin is (x, y) = (column, row)
            # while img.shape is (rows, columns, channels), so the width
            # (shape[1]) comes first.
            cv2.putText(img, res, (int(img.shape[1] / 3), int(img.shape[0] / 3)), font, 1, (0, 0, 255), 2)
            # Show the image and wait for a key press.
            cv2.imshow("test", img)
            cv2.waitKey(0)

可以看到斑马zebra和鹤crane的测试结果:

TensorFlow学习--AlexNet实现&图像识别_第2张图片

AlexNet相关链接:

1. 论文:Imagenet Classification with Deep Convolutional Neural Networks

2. 训练好的文件bvlc_alexnet.npy以及与网络对应的类别文件caffe_classes.py下载链接

你可能感兴趣的:(TensorFlow)