AlexNet使用的主要技术:
1. 使用ReLU作为CNN的激活函数,解决了Sigmoid在较深网络中的梯度弥散问题(vanishing gradient problem).
2. 训练时使用Dropout随机忽略一部分神经元,避免了模型的过拟合问题.
3. 在CNN中使用重叠的最大池化,避免了平均池化造成的模糊效果.同时让步长小于池化核的尺寸,使池化层的输出发生重叠和覆盖,提升特征的丰富性.
4. 提出了LRN层,对局部神经元的活动创建竞争机制,强化响应比较大的神经元,抑制反馈较小的神经元,增强模型的泛化能力.
5. 数据增强.随机地从256*256的原始图像中截取224*224大小的区域以及水平翻转图像(相当于增加了 $(256-224)^2 \times 2 = 2048$ 倍的数据量).进行预测时,取图片的四个角加中间共5个位置并进行翻转,即10个图像,对其进行预测并对10次结果求均值.
6. 使用CUDA加速深度卷积网络的训练,利用GPU强大的并行运算能力,处理神经网络训练时大量的矩阵运算.
AlexNet的网络结构:
AlexNet每层的超参数如图.
两个GPU,一个GPU运行图形顶部的图层部分,另一个运行图层底部的图层部分。 GPU仅在某些层进行通信。
输入的图片规格为224*224*3,预处理后为227*227*3.
第一个卷积层使用96个较大的11*11尺寸的卷积核,步长为4,(采用了2个GPU处理,每个GPU处理48个).原图像为RGB 图像,是3通道,此处96个过滤器也是3通道的.得到的特征图大小new_feature_size=(img_size - filter_size)/stride +1 = (227-11)/4+1=55即大小为55*55.紧接着一个LRN层,然后是一个3*3的Max pooling最大池化层,步长为2.
使用随机图片数据测试AlexNet前向(forward)/反向(backward)计算的平均耗时:
#!/usr/bin/python
# coding:utf-8
# TensorFlow实现AlexNet
from datetime import datetime
import math
import time
import tensorflow as tf
def convLayer(x, name, kh, kw, n_out, dh, dw, p):
    """Build a conv2d + bias + ReLU layer and record its variables.

    Args:
        x: Input tensor; its last dimension is read as the channel count.
        name: Name scope under which the layer's ops are created.
        kh: Kernel height.
        kw: Kernel width.
        n_out: Number of kernels (output channels).
        dh: Stride along the height axis.
        dw: Stride along the width axis.
        p: List collecting trainable variables; it is extended in place
            with this layer's kernel and biases.

    Returns:
        A tuple ``(activation, p)``: the ReLU output tensor and the same
        list ``p`` after ``[kernel, biases]`` has been appended.
    """
    # Number of input channels.
    n_in = x.get_shape()[-1].value
    with tf.name_scope(name) as scope:
        # Kernel of shape (kh, kw, n_in, n_out), initialised from a
        # truncated normal distribution with stddev 0.1.
        kernel = tf.Variable(
            tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=1e-1),
            name='weights')
        # Strides are dh x dw; SAME padding pads the borders.
        conv = tf.nn.conv2d(x, kernel, [1, dh, dw, 1], padding='SAME')
        # Biases start at zero.
        biases = tf.Variable(
            tf.constant(0.0, shape=[n_out], dtype=tf.float32),
            trainable=True, name='biases')
        bias = tf.nn.bias_add(conv, biases)
        activation = tf.nn.relu(bias, name=scope)
        # Register the trainable variables kernel and biases in p.
        p += [kernel, biases]
        # Report the op name and static output shape. A single pre-formatted
        # argument keeps this line valid on both Python 2 and Python 3.
        print('%s   %s' % (activation.op.name, activation.get_shape().as_list()))
        return activation, p
# Number of timed batches per benchmark run (read by time_tensorflow_run).
num_batches = 100
# 全连接层
def fcLayer(x, inputData, outputData, reluFlag, name):
    """Fully connected layer computing ``x * w + b`` with an optional ReLU.

    Args:
        x: Input tensor fed to ``tf.nn.xw_plus_b``.
        inputData: Number of input features (rows of ``w``).
        outputData: Number of output features (columns of ``w``, size of ``b``).
        reluFlag: When truthy, ReLU is applied to the affine output.
        name: Variable scope holding the variables ``w`` and ``b``.

    Returns:
        The affine output, passed through ReLU if ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weights = tf.get_variable('w', shape=[inputData, outputData], dtype='float')
        offsets = tf.get_variable('b', [outputData], dtype='float')
        affine = tf.nn.xw_plus_b(x, weights, offsets, name=scope.name)
        return tf.nn.relu(affine) if reluFlag else affine
# 接受images作为输入,返回最后一层pool5及AlexNet中所有需要训练的模型参数
def AlexNet(images, classNum=None, dropoutrate=None):
    """Build the AlexNet graph and return pool5 plus the conv parameters.

    Args:
        images: Input image tensor passed to the first convolution.
        classNum: Number of output classes for fc8. When None, the fully
            connected head (fc6-fc8) is not built.
        dropoutrate: Value passed as the second positional argument of
            ``tf.nn.dropout`` after fc6 and fc7; dropout is skipped when
            None.
            NOTE(review): in TF 1.x that positional argument is
            ``keep_prob`` despite this parameter's name -- confirm intent.

    Returns:
        A tuple ``(pool5, parameters)``: the last pooling tensor and the
        list of kernel/bias variables of the five conv layers. The fully
        connected layers are added to the graph but neither their outputs
        nor their variables are returned.
    """
    parameters = []

    # Conv layer 1, followed by LRN and 3x3 max pooling with stride 2.
    conv1, parameters = convLayer(images, name='conv1', kh=11, kw=11, n_out=64, dh=4, dw=4, p=parameters)
    lrn1 = tf.nn.lrn(conv1, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn1')
    # VALID padding: pooling windows never extend past the border.
    pool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
    print('%s   %s' % (pool1.op.name, pool1.get_shape().as_list()))

    # Conv layer 2, followed by LRN and max pooling.
    conv2, parameters = convLayer(pool1, name='conv2', kh=5, kw=5, n_out=192, dh=1, dw=1, p=parameters)
    lrn2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='lrn2')
    pool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
    print('%s   %s' % (pool2.op.name, pool2.get_shape().as_list()))

    # Conv layers 3-5, then the final max pooling.
    conv3, parameters = convLayer(pool2, name='conv3', kh=3, kw=3, n_out=384, dh=1, dw=1, p=parameters)
    conv4, parameters = convLayer(conv3, name='conv4', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)
    conv5, parameters = convLayer(conv4, name='conv5', kh=3, kw=3, n_out=256, dh=1, dw=1, p=parameters)
    pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')
    print('%s   %s' % (pool5.op.name, pool5.get_shape().as_list()))

    # Fully connected head. It needs a concrete class count, so it is only
    # built when classNum is given (the None default used to crash here).
    if classNum is not None:
        # The flatten size assumes pool5 is 6x6x256, as for 224x224 inputs.
        fc_in = tf.reshape(pool5, [-1, 256*6*6])
        fc6 = fcLayer(fc_in, 256*6*6, 4096, True, 'fc6')
        dropout6 = fc6 if dropoutrate is None else tf.nn.dropout(fc6, dropoutrate)
        fc7 = fcLayer(dropout6, 4096, 4096, True, 'fc7')
        dropout7 = fc7 if dropoutrate is None else tf.nn.dropout(fc7, dropoutrate)
        # fc8 produces class logits, so no ReLU is applied to it.
        fcLayer(dropout7, 4096, classNum, False, 'fc8')

    return pool5, parameters
# 评估AlexNet每轮计算占用的时间
# 输入TensorFlow的Session,需要测评的算子target,测试的名称info_string
def time_tensorflow_run(session, target, info_string):
    """Time repeated ``session.run(target)`` calls and print statistics.

    Runs ``num_batches`` timed iterations (module-level constant) after 10
    untimed warm-up iterations, printing the duration of every tenth
    iteration and finally the mean and standard deviation per batch.

    Args:
        session: Object whose ``run(target)`` method is being timed.
        target: The op/tensor (or list of them) passed to ``session.run``.
        info_string: Label used in the summary line.
    """
    # Warm-up iterations excluded from the statistics (e.g. to ignore
    # one-off costs such as memory loading).
    num_steps_burn_in = 10
    total_duration = 0.0
    total_duration_squared = 0.0
    for i in range(num_batches + num_steps_burn_in):
        start_time = time.time()
        session.run(target)
        duration = time.time() - start_time
        # Only iterations after the warm-up are counted.
        if i >= num_steps_burn_in:
            if not i % 10:
                print('%s: step %d, duration = %.3f' % (datetime.now().strftime('%X'), i - num_steps_burn_in, duration))
            total_duration += duration
            total_duration_squared += duration * duration
    # Mean and standard deviation of the per-iteration duration.
    mn = total_duration / num_batches
    vr = total_duration_squared / num_batches - mn * mn
    # Floating-point cancellation can make the variance slightly negative
    # when the durations are nearly equal; clamp it so sqrt cannot raise.
    sd = math.sqrt(max(vr, 0.0))
    print('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % (datetime.now().strftime('%X'), info_string, num_batches, mn, sd))
# 使用随机图片数据测试前馈和反馈计算的耗时
def run_benchmark():
    """Benchmark AlexNet forward and forward-backward passes on random data.

    Builds the network in a fresh graph on a batch of random images, then
    times evaluating pool5 ('Forward') and the gradients of an L2 loss on
    pool5 with respect to the returned parameters ('Forward-backward').
    """
    with tf.Graph().as_default():
        batch_size = 32
        image_size = 224
        # Random image batch: batch_size x image_size x image_size x 3
        # channels, normally distributed with stddev 0.1.
        images = tf.Variable(tf.random_normal(
            [batch_size, image_size, image_size, 3],
            dtype=tf.float32,
            stddev=1e-1))
        # Build AlexNet; get pool5 and the list of parameters.
        pool5, parameters = AlexNet(images, classNum=1000, dropoutrate=0.5)
        init = tf.global_variables_initializer()
        # The context manager closes the session (and frees its resources)
        # even if a benchmark step raises.
        with tf.Session() as sess:
            sess.run(init)
            # Forward pass timing.
            time_tensorflow_run(sess, pool5, 'Forward')
            # Use the L2 loss of pool5 as a stand-in training objective.
            objective = tf.nn.l2_loss(pool5)
            # Gradients of the objective w.r.t. all parameters, to simulate
            # a training step.
            grad = tf.gradients(objective, parameters)
            # Forward + backward pass timing.
            time_tensorflow_run(sess, grad, 'Forward-backward')
# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    run_benchmark()
打印输出:
conv1 [32, 56, 56, 64]
pool1 [32, 27, 27, 64]
conv2 [32, 27, 27, 192]
pool2 [32, 13, 13, 192]
conv3 [32, 13, 13, 384]
conv4 [32, 13, 13, 256]
conv5 [32, 13, 13, 256]
pool5 [32, 6, 6, 256]
19:43:26: step 0, duration = 1.526
19:43:43: step 10, duration = 2.018
19:44:03: step 20, duration = 1.618
19:44:19: step 30, duration = 1.583
19:44:37: step 40, duration = 1.808
19:44:56: step 50, duration = 1.749
19:45:13: step 60, duration = 1.849
19:45:32: step 70, duration = 1.837
19:45:49: step 80, duration = 1.587
19:46:06: step 90, duration = 1.663
19:46:23: Forward across 100 steps, 1.789 +/- 0.210 sec / batch
19:47:30: step 0, duration = 5.831
19:48:34: step 10, duration = 5.831
19:49:49: step 20, duration = 8.383
19:50:57: step 30, duration = 6.152
19:52:48: step 40, duration = 13.673
19:54:44: step 50, duration = 10.054
19:56:32: step 60, duration = 11.055
19:58:17: step 70, duration = 10.246
20:00:06: step 80, duration = 12.227
20:02:01: step 90, duration = 10.946
20:03:31: Forward-backward across 100 steps, 9.666 +/- 2.279 sec / batch
可以看到5个卷积层以及最后一个池化层,以及每一层输出tensor的尺寸.
然后还可以看到forward以及backward运算的时间,此处没有使用GPU,因此可以看到每轮迭代的时间消耗比较大.
# AlexNet实现
import tensorflow as tf
import numpy as np
# 卷积层
# groups=2 splits the convolution in two, like AlexNet's upper/lower halves
def convLayer(x, kHeight, kWidth, strideX, strideY, featureNum, name, padding="SAME", groups=1):
    """Grouped convolution + bias + ReLU.

    The input and the kernel are each split into ``groups`` parts along
    axis 3, every input part is convolved with its kernel part, and the
    results are concatenated along axis 3 again.

    Args:
        x: Input tensor; its last dimension is read as the channel count.
        kHeight: Kernel height.
        kWidth: Kernel width.
        strideX: Stride along the width axis.
        strideY: Stride along the height axis.
        featureNum: Total number of output feature maps.
        name: Variable scope holding the variables ``w`` and ``b``.
        padding: Padding mode passed to ``tf.nn.conv2d``.
        groups: Number of groups the convolution is split into.

    Returns:
        The ReLU-activated output tensor.
    """
    # Number of input channels.
    channel = int(x.get_shape()[-1])

    def conv(a, b):
        return tf.nn.conv2d(a, b, strides=[1, strideY, strideX, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Floor division keeps the shape entry an int on Python 3 as well;
        # true division would produce a float and an invalid shape.
        w = tf.get_variable("w", shape=[kHeight, kWidth, channel // groups, featureNum])
        b = tf.get_variable("b", shape=[featureNum])
        # Split the input and the weights into per-group sub-tensors.
        xNew = tf.split(value=x, num_or_size_splits=groups, axis=3)
        wNew = tf.split(value=w, num_or_size_splits=groups, axis=3)
        # Convolve each group separately, then merge the feature maps.
        featureMap = [conv(t1, t2) for t1, t2 in zip(xNew, wNew)]
        mergeFeatureMap = tf.concat(axis=3, values=featureMap)
        # bias_add keeps the input shape, so no reshape is needed; reshaping
        # to the static shape list failed whenever a dimension was unknown.
        out = tf.nn.bias_add(mergeFeatureMap, b)
        return tf.nn.relu(out, name=scope.name)
# 全连接层
def fcLayer(x, inputD, outputD, reluFlag, name):
    """Dense layer: ``x * w + b``, followed by ReLU when requested.

    Args:
        x: Input tensor fed to ``tf.nn.xw_plus_b``.
        inputD: Input feature count (first dimension of ``w``).
        outputD: Output feature count (second dimension of ``w``, size of ``b``).
        reluFlag: Truthy to apply ReLU to the result.
        name: Variable scope in which ``w`` and ``b`` live.

    Returns:
        The layer output tensor, ReLU-activated if ``reluFlag`` is truthy.
    """
    with tf.variable_scope(name) as scope:
        weight = tf.get_variable("w", shape=[inputD, outputD], dtype="float")
        bias = tf.get_variable("b", [outputD], dtype="float")
        linear = tf.nn.xw_plus_b(x, weight, bias, name=scope.name)
        # Guard clause: hand back the plain affine output when no
        # activation was requested.
        if not reluFlag:
            return linear
        return tf.nn.relu(linear)
# alexNet模型
class alexNet(object):
    """AlexNet inference graph with a loader for pre-trained weights.

    The graph is built on construction; the class scores are exposed as
    ``self.fc3`` (output of the ``fc8`` layer).
    """

    def __init__(self, x, keepPro, classNum, modelPath="bvlc_alexnet.npy"):
        """Store the configuration and build the network.

        Args:
            x: Input image tensor fed to the first convolution.
            keepPro: Value passed to ``tf.nn.dropout`` after fc6 and fc7.
            classNum: Number of output classes of fc8.
            modelPath: Path of the ``.npy`` weight file read by loadModel.
        """
        self.X = x
        self.KEEPPRO = keepPro
        self.CLASSNUM = classNum
        self.MODELPATH = modelPath
        self.buildCNN()

    def buildCNN(self):
        """Assemble the layers and store the class scores in ``self.fc3``."""
        # Conv layer 1, 3x3 max pooling with stride 2, then LRN.
        conv1 = convLayer(self.X, 11, 11, 4, 4, 96, "conv1", "VALID")
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool1')
        lrn1 = tf.nn.lrn(pool1, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='norm1')
        # Conv layer 2 (two groups), max pooling, then LRN.
        conv2 = convLayer(lrn1, 5, 5, 1, 1, 256, "conv2", groups=2)
        pool2 = tf.nn.max_pool(conv2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool2')
        lrn2 = tf.nn.lrn(pool2, depth_radius=2, alpha=2e-05, beta=0.75, bias=1.0, name='lrn2')
        # Conv layers 3-5, then the final max pooling.
        conv3 = convLayer(lrn2, 3, 3, 1, 1, 384, "conv3")
        conv4 = convLayer(conv3, 3, 3, 1, 1, 384, "conv4", groups=2)
        conv5 = convLayer(conv4, 3, 3, 1, 1, 256, "conv5", groups=2)
        pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='VALID', name='pool5')
        # Fully connected layer 1 (fc6) with dropout.
        fcIn = tf.reshape(pool5, [-1, 256 * 6 * 6])
        fc1 = fcLayer(fcIn, 256 * 6 * 6, 4096, True, "fc6")
        dropout1 = tf.nn.dropout(fc1, self.KEEPPRO)
        # Fully connected layer 2 (fc7) with dropout.
        fc2 = fcLayer(dropout1, 4096, 4096, True, "fc7")
        dropout2 = tf.nn.dropout(fc2, self.KEEPPRO)
        # Fully connected layer 3 (fc8) yields the class logits. No ReLU
        # here: clamping negative logits to zero would distort the softmax
        # probabilities computed from them.
        self.fc3 = fcLayer(dropout2, 4096, self.CLASSNUM, False, "fc8")

    def loadModel(self, sess):
        """Assign the weights stored at ``self.MODELPATH`` to the graph.

        The file is expected to hold a dict mapping a layer (variable
        scope) name to a sequence of arrays; one-dimensional arrays are
        assigned to that scope's ``b`` variable, all others to ``w``.

        Args:
            sess: Session used to run the assign ops.
        """
        # The file stores a pickled dict, so allow_pickle=True is required
        # on NumPy >= 1.16.3. SECURITY: unpickling can execute arbitrary
        # code -- only load weight files from a trusted source.
        wDict = np.load(self.MODELPATH, encoding="bytes", allow_pickle=True).item()
        for name in wDict:
            with tf.variable_scope(name, reuse=True):
                for p in wDict[name]:
                    # Biases are the only one-dimensional arrays.
                    varName = 'b' if len(p.shape) == 1 else 'w'
                    sess.run(tf.get_variable(varName, trainable=False).assign(p))
import os
import cv2
import caffe_classes
# AlexNet测试
# AlexNet test: classify every image found in the test directory and show
# the predicted class name drawn on the image.
if __name__=='__main__':
    dropoutPro = 1
    classNum = 1000
    testPath = "testimage"
    # Read the test images. cv2.imread returns None for files it cannot
    # decode, so those are skipped instead of crashing later on.
    testImg = []
    for f in os.listdir(testPath):
        img = cv2.imread(os.path.join(testPath, f))
        if img is not None:
            testImg.append(img)
    # Per-channel mean subtracted from every image. np.float64 replaces
    # the np.float alias, which was removed in NumPy 1.24.
    imgMean = np.array([104, 117, 124], np.float64)
    x = tf.placeholder("float", [1, 227, 227, 3])
    # Build the alexNet model.
    model = alexNet(x, dropoutPro, classNum)
    score = model.fc3
    print(score)
    softmax = tf.nn.softmax(score)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the pre-trained weights.
        model.loadModel(sess)
        for img in testImg:
            # Resize to the network input size and subtract the mean.
            test = cv2.resize(img.astype(np.float64), (227, 227)) - imgMean
            # Add the batch dimension.
            test = test.reshape((1, 227, 227, 3))
            # Index of the most probable class.
            maxx = np.argmax(sess.run(softmax, feed_dict={x: test}))
            # Name of the most probable class.
            res = caffe_classes.class_names[maxx]
            print(res)
            font = cv2.FONT_HERSHEY_SIMPLEX
            # Draw the class name. The text origin is (x, y), so x is
            # derived from the width (shape[1]) and y from the height
            # (shape[0]).
            cv2.putText(img, res, (int(img.shape[1] / 3), int(img.shape[0] / 3)), font, 1, (0, 0, 255), 2)
            # Show the image and wait for a key press.
            cv2.imshow("test", img)
            cv2.waitKey(0)
可以看到斑马zebra和鹤crane的测试结果:
1. 论文:ImageNet Classification with Deep Convolutional Neural Networks