VGGNet 参考论文:Very Deep Convolutional Networks for Large-Scale Image Recognition(Simonyan & Zisserman,2014)
VGGNet代码实现参考《Tensorflow实战》黄文坚
Fer2013图片48*48,train_set一共28708张图,测试集有两个,pub_test_set和pri_test_set各3589张图。一共7种表情,类别为0-6。
利用Google的Colab来跑模型,自己的电脑实在太慢了。
结果:
在经过41个epoch之后,模型收敛。
这时候对Fer2013 public测试集的分类精度为47.45%。对private测试集的分类精度为46.92%。
人眼分辨Fer2013的准确率为65%左右。
步骤:
1.将一整个Fer2013分为训练集和两个测试集,分别存为csv文件。
2.读入三个文件。
3.将48*48的图片resize为224*224的标准大小图片。
4.进行训练,在每次训练完成后都直接对两个测试集进行测试。
遇到的问题:
1.一开始没将图片resize为224*224,48*48的图片经过VGG的5层2*2的池化层之后,图片大小变为了1*1,特征全没了,训练失败。
2.学习速率过低,一开始的学习速率定为0.001,导致模型收敛极慢。
3.训练集过大,程序中需要将48*48的图像改成224*224的图。无论是在程序中进行resize还是读取resize之后的csv文件都特别慢,值得一提的是,resize之后的train_set的大小为7.53 GB。
---解决:使用OpenCV的cv2.resize来改变图像大小,速度快了很多。
---疑问:为什么OpenCV的resize比自己实现的resize快这么多?是不是因为OpenCV的resize不是用Python实现的,而是用C/C++或汇编之类的底层语言实现的?
4.用Colab来跑也还是太慢了,跑到最后才读完数据并训练了19个epoch,目标训练100个epoch,但是Colab会把长时间占用GPU的我踢掉。
---解决:在解决了上面的问题后,时间已经够模型收敛了。
VGGNet的具体代码是直接抄书的
上代码:
#@title
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 16 10:00:07 2018
@author: Administrator
"""
from datetime import datetime
import tensorflow as tf
import pandas as pd
import numpy as np
import csv
import cv2
# Load the three FER2013 splits.  In every CSV, column 0 is the emotion label
# and the remaining columns are the pixel values of one image (reshaped to
# 48x48x1 further down).  Each file is opened in a ``with`` block so its
# handle is closed as soon as pandas has finished parsing it.
with open("drive//fer2013//traincpy.csv", encoding = 'UTF-8') as f_train:
    df_train = pd.read_csv(f_train)
with open("drive//fer2013//valcpy.csv", encoding = 'UTF-8') as f_test_pub:
    df_test_pub = pd.read_csv(f_test_pub)
with open("drive//fer2013//testcpy.csv", encoding = 'UTF-8') as f_test_pri:
    df_test_pri = pd.read_csv(f_test_pri)
print('read csv file finished')

# NOTE(review): the training split skips its first data row (iloc[1:]) while
# both test splits start at row 0.  Left unchanged because the CSV layout is
# not visible here -- confirm whether dropping that sample is intentional.
train_featuresets = df_train.iloc[1: , 1: ]
train_emotionsets = df_train.iloc[1: , 0:1]
test_pub_featuresets = df_test_pub.iloc[0: , 1: ]
test_pub_emotionsets = df_test_pub.iloc[0: , 0:1]
test_pri_featuresets = df_test_pri.iloc[0: , 1: ]
test_pri_emotionsets = df_test_pri.iloc[0: , 0:1]
print('dataset load finished')
# An earlier variant read a pre-resized 224x224 training CSV
# ("trainresize.csv") and wrapped the data in tf.constant; it was abandoned
# in favour of resizing in memory below.
# Bilinear interpolation, e.g. to turn a 48*48 image into a 224*224 one.
def resize(src, new_size):
    """Resize an image with bilinear interpolation.

    Args:
        src: image array of shape (height, width, channels); a 2-D
            (height, width) array is treated as a single-channel image.
        new_size: target size, either a single int (square output) or a
            ``(width, height)`` pair.  ``None`` selects 224x224, the size
            this function originally hard-coded.

    Returns:
        A ``uint8`` array of shape (new_height, new_width, channels).  When
        the source already has the requested size, a copy of ``src`` is
        returned unchanged.

    Raises:
        ValueError: if the source or the target size is empty.
    """
    src = np.asarray(src)
    if src.ndim == 2:
        src = src[:, :, np.newaxis]

    if new_size is None:
        new_size = 224
    if np.isscalar(new_size):
        dst_w = dst_h = int(new_size)
    else:
        dst_w, dst_h = (int(v) for v in new_size)
    src_h, src_w = src.shape[:2]  # source height / width

    if src_h == dst_h and src_w == dst_w:
        return src.copy()
    if min(src_h, src_w, dst_h, dst_w) <= 0:
        raise ValueError("source and target sizes must be positive")

    scale_x = float(src_w) / dst_w  # x scaling ratio
    scale_y = float(src_h) / dst_h  # y scaling ratio

    # Position of every target pixel centre in source coordinates.  The
    # coordinates are clamped to the image so that border pixels replicate
    # the edge instead of indexing src[-1] (which wraps to the opposite
    # side) or ending up with interpolation weights that cancel out.
    src_x = np.clip((np.arange(dst_w) + 0.5) * scale_x - 0.5, 0, src_w - 1)
    src_y = np.clip((np.arange(dst_h) + 0.5) * scale_y - 0.5, 0, src_h - 1)

    # Indices of the neighbouring source columns / rows.
    x0 = np.floor(src_x).astype(np.intp)
    y0 = np.floor(src_y).astype(np.intp)
    x1 = np.minimum(x0 + 1, src_w - 1)
    y1 = np.minimum(y0 + 1, src_h - 1)

    # Fractional offsets, shaped to broadcast over (height, width, channel).
    wx = (src_x - x0)[np.newaxis, :, np.newaxis]
    wy = (src_y - y0)[:, np.newaxis, np.newaxis]

    pixels = src.astype(np.float64)
    rows_top = pixels[y0]
    rows_bottom = pixels[y1]
    # Interpolate along x on the two neighbouring rows, then along y.
    top = rows_top[:, x0] * (1.0 - wx) + rows_top[:, x1] * wx
    bottom = rows_bottom[:, x0] * (1.0 - wx) + rows_bottom[:, x1] * wx
    blended = top * (1.0 - wy) + bottom * wy

    # Round (rather than truncate) so floating-point noise cannot turn an
    # exact value such as 100 into 99, and keep the result inside uint8.
    return np.clip(np.rint(blended), 0, 255).astype(np.uint8)
# The hand-written resize above was far too slow, so OpenCV's cv2.resize is
# used for the actual 48x48 -> 224x224 upscaling.
def _upscale_all(images, progress_message, report_every):
    """Upscale every image to 224x224 with cv2, printing progress periodically."""
    upscaled = []
    for index, image in enumerate(images):
        if index % report_every == 0:
            print(progress_message, index)
        # cv2.resize(src, dsize[, dst[, fx[, fy[, interpolation]]]]) -> dst
        upscaled.append(cv2.resize(image, (224, 224), interpolation=cv2.INTER_LINEAR))
    return upscaled


print('start resize 48*48 to 224*224')

# Training split: the resized images stay in a Python list and are turned
# into an array one batch at a time during training.
train_feature = np.array(train_featuresets, dtype = 'float32').reshape(-1, 48, 48, 1)
train_emotion = np.array(train_emotionsets, dtype = 'float32').reshape(-1)
print('total ', train_feature.shape[0])
train_feature_resize = _upscale_all(train_feature, 'now resize 48 --> 224 train set', 1000)
print(len(train_feature_resize))
print('train_feature resize finished')

# Public test split: resized images are stacked into one (N, 224, 224, 1) array.
test_pub_feature = np.array(test_pub_featuresets, dtype = 'float32').reshape(-1, 48, 48, 1)
test_pub_emotion = np.array(test_pub_emotionsets, dtype = 'float32').reshape(-1)
test_pub_feature_resize = np.array(
    _upscale_all(test_pub_feature, 'now resize 48 --> 224 pub test set', 200),
    dtype = 'float32').reshape(-1, 224, 224, 1)
print(test_pub_feature_resize.shape)
print('test_pub resize finished')

# Private test split: same treatment as the public one.
test_pri_feature = np.array(test_pri_featuresets, dtype = 'float32').reshape(-1, 48, 48, 1)
test_pri_emotion = np.array(test_pri_emotionsets, dtype = 'float32').reshape(-1)
test_pri_feature_resize = np.array(
    _upscale_all(test_pri_feature, 'now resize 48 --> 224 pri test set', 200),
    dtype = 'float32').reshape(-1, 224, 224, 1)
print(test_pri_feature_resize.shape)
print('test_pri resize finished')
# Number of images per mini-batch, used by both training and evaluation.
batch_size = 32
# Not referenced anywhere in the visible code.
num_batches = 100
# Dropout keep probability, fed on every run (0.5 for training, 1 for testing).
keep_prob = tf.placeholder(tf.float32)
# Input images, 224x224 with one channel.  The batch dimension is left
# unspecified instead of repeating the literal 32, so the graph is no longer
# tied to one batch size; feeding batches of exactly batch_size still works.
X = tf.placeholder(tf.float32, [None, 224, 224, 1])
# Integer class labels, one per image in the batch.
Y = tf.placeholder(tf.int32)
# Creates a convolution layer and records its parameters in the parameter list.
def conv_op(input_op, name, kh, kw, n_out, dh, dw, p):
    """Build a convolution + bias + ReLU layer.

    Args:
        input_op: input tensor; its last dimension is taken as the number of
            input channels.
        name: name of the layer (used as the name scope).
        kh, kw: kernel height and width.
        n_out: number of output channels.
        dh, dw: stride along height and width.
        p: parameter list; the layer's kernel and biases are appended to it.

    Returns:
        The ReLU-activated output tensor of the layer.
    """
    # Number of channels of the incoming tensor.
    in_channels = input_op.get_shape()[-1].value
    with tf.name_scope(name) as scope:
        # Kernel of shape [height, width, in_channels, out_channels],
        # Xavier-initialised.
        weights = tf.get_variable(
            scope + "w",
            shape = [kh, kw, in_channels, n_out],
            dtype = tf.float32,
            initializer = tf.contrib.layers.xavier_initializer_conv2d())
        # Strides are given per input dimension; batch and channel use 1.
        # "SAME" padding lets the kernel cover the image border.
        strides = (1, dh, dw, 1)
        feature_map = tf.nn.conv2d(input_op, weights, strides, padding = "SAME")
        # Trainable biases (one per output channel), starting at 0.0.
        zero_bias = tf.constant(0.0, shape = [n_out], dtype = tf.float32)
        bias = tf.Variable(zero_bias, trainable = True, name = 'b')
        pre_activation = tf.nn.bias_add(feature_map, bias)
        activation = tf.nn.relu(pre_activation, name = scope)
    # Register this layer's parameters with the caller's list.
    p.extend([weights, bias])
    return activation
# Fully connected layer.
def fc_op(input_op, name, n_out, p):
    """Build a fully connected layer followed by ReLU.

    Args:
        input_op: input tensor; its last dimension is the input width.
        name: name of the layer (used as the name scope).
        n_out: number of output units.
        p: parameter list; the layer's weights and biases are appended to it.

    Returns:
        The output tensor of ``tf.nn.relu_layer`` for this layer.
    """
    in_width = input_op.get_shape()[-1].value
    with tf.name_scope(name) as scope:
        # Two-dimensional weight matrix, Xavier-initialised.
        weights = tf.get_variable(
            scope + "w",
            shape = [in_width, n_out],
            dtype = tf.float32,
            initializer = tf.contrib.layers.xavier_initializer())
        # Biases start at 0.1 here (the convolution layers start at 0.0).
        initial_bias = tf.constant(0.1, shape = [n_out], dtype = tf.float32)
        bias = tf.Variable(initial_bias, name = 'b')
        output = tf.nn.relu_layer(input_op, weights, bias, name = scope)
    p.extend([weights, bias])
    return output
# Max-pooling layer: each output value is the maximum of its window.
def mpool_op(input_op, name, kh, kw, dh, dw):
    """Build a max-pooling layer.

    Args:
        input_op: input tensor, typically a feature map.
        name: name given to the pooling op.
        kh, kw: pooling window height and width.
        dh, dw: stride along height and width.

    Returns:
        The pooled tensor.
    """
    # The window size and the stride are both 1 on the batch and channel
    # dimensions, so pooling only happens spatially.
    window = [1, kh, kw, 1]
    step = [1, dh, dw, 1]
    pooled = tf.nn.max_pool(input_op, ksize = window, strides = step, padding = 'SAME', name = name)
    return pooled
# Builds the VGGNet-16 network structure.
def inference_op(input_op, keep_prob):
    """Build the VGGNet-16 graph on top of ``input_op``.

    The network is five blocks of 3x3 convolutions, each followed by a 2x2
    max-pool with stride 2, then two 4096-unit fully connected layers with
    dropout and a final 7-unit layer.

    Args:
        input_op: 4-D input tensor; its height, width and channel dimensions
            must be statically known because they are used to flatten the
            last feature map.
        keep_prob: dropout keep probability used by both dropout layers
            (0.5 while training, 1.0 while predicting).

    Returns:
        A ``(predictions, softmax)`` pair: the index of the most probable
        class per sample, and the class probabilities.
    """
    p = []
    # (block number, output channels, number of conv layers).  Every pooling
    # step halves the side length.  More channels add expressive power: each
    # channel comes from its own kernel, and since different features respond
    # to different kernels, keeping many channels preserves them all.
    vgg_blocks = [(1, 64, 2), (2, 128, 2), (3, 256, 3), (4, 512, 3), (5, 512, 3)]
    net = input_op
    for block, channels, depth in vgg_blocks:
        for layer in range(1, depth + 1):
            net = conv_op(net, name = "conv%d_%d" % (block, layer), kh = 3, kw = 3,
                          n_out = channels, dh = 1, dw = 1, p = p)
        # Each pool is named after its own block; the second pool used to be
        # created under the name "pool1" as well.
        net = mpool_op(net, name = "pool%d" % block, kh = 2, kw = 2, dh = 2, dw = 2)

    # Flatten every sample into a vector of length height * width * channels.
    shp = net.get_shape()
    flattened_shape = shp[1].value * shp[2].value * shp[3].value
    resh1 = tf.reshape(net, [-1, flattened_shape], name = "resh1")

    # Two hidden fully connected layers of 4096 units, each followed by
    # dropout, which randomly drops units during training to reduce
    # over-fitting.
    fc6 = fc_op(resh1, name = "fc6", n_out = 4096, p = p)
    fc6_drop = tf.nn.dropout(fc6, keep_prob, name = "fc6_drop")
    fc7 = fc_op(fc6_drop, name = "fc7", n_out = 4096, p = p)
    fc7_drop = tf.nn.dropout(fc7, keep_prob, name = "fc7_drop")

    # Output layer with one unit per emotion class.
    # NOTE(review): fc_op applies ReLU, so these scores are non-negative
    # before the softmax -- confirm this is intended for the output layer.
    fc8 = fc_op(fc7_drop, name = "fc8", n_out = 7, p = p)
    # Class probabilities and the most probable class.
    softmax = tf.nn.softmax(fc8)
    predictions = tf.argmax(softmax, 1)
    return predictions, softmax
def train_vgg(num_epochs = 100, learning_rate = 0.006):
    """Train VGGNet-16 on the training split and test after every epoch.

    After each pass over the training data the model is evaluated on the
    public and the private test split, and the best accuracy seen so far on
    each is printed.

    Args:
        num_epochs: number of passes over the training data.
        learning_rate: step size of the gradient-descent optimizer.
    """
    predictions, softmax = inference_op(X, keep_prob)
    # inference_op returns probabilities, not logits, so the cross-entropy is
    # computed from them directly.  Passing them to
    # sparse_softmax_cross_entropy_with_logits would apply softmax twice.
    num_classes = softmax.get_shape()[-1].value
    one_hot_labels = tf.one_hot(tf.reshape(Y, [-1]), num_classes)
    true_class_prob = tf.reduce_sum(one_hot_labels * softmax, axis = 1)
    # The small constant keeps log() finite when a probability underflows to 0.
    cross_entropy = -tf.log(true_class_prob + 1e-10)
    loss = tf.reduce_mean(cross_entropy)
    train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    max_pub = 0
    max_pri = 0
    num_train = len(train_emotion)
    with tf.Session() as sess:
        # Initialise all variables.
        sess.run(tf.global_variables_initializer())
        print('now strat train')
        print('the length of train_emotion:', num_train)
        for i in range(num_epochs):
            step = 0
            # Only full batches are used for training; the range bound also
            # keeps the last full batch when the set size is an exact
            # multiple of batch_size.
            for start in range(0, num_train - batch_size + 1, batch_size):
                end = start + batch_size
                images = np.reshape(
                    np.array(train_feature_resize[start:end], dtype = 'float32'),
                    (-1, 224, 224, 1))
                train_op.run(feed_dict = {X: images, keep_prob: 0.5, Y: train_emotion[start:end]})
                if step % 100 == 0:
                    print('round: ', i, 'step: ', step)
                step += 1

            print('goint to start public prediction')
            print('the length of test_pub_emotion :', len(test_pub_emotion))
            accurate_rate = _evaluate(sess, predictions, test_pub_feature_resize, test_pub_emotion)
            if accurate_rate > max_pub:
                max_pub = accurate_rate
            print('end public prediction')
            print('the public accurate is : ', accurate_rate)
            print('the max pub_accurate :', max_pub)

            print('goint to start private prediction')
            print('the length of test_pri_emotion :', len(test_pri_emotion))
            accurate_rate = _evaluate(sess, predictions, test_pri_feature_resize, test_pri_emotion)
            if accurate_rate > max_pri:
                max_pri = accurate_rate
            # These messages used to say "public" for the private split.
            print('end private prediction')
            print('the private accurate is : ', accurate_rate)
            print('the max pri_accurate:', max_pri)


def _evaluate(sess, predictions, features, labels):
    """Return the fraction of ``features`` whose predicted class equals ``labels``."""
    total = len(labels)
    if total == 0:
        return 0.0
    correct = 0
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        batch = features[start:end]
        valid = end - start
        if valid < batch_size:
            # Pad the final partial batch with blank images so it can still
            # be fed as a batch of batch_size; the padded predictions are
            # discarded below, so every test sample is counted exactly once.
            blank = np.zeros((batch_size - valid,) + batch.shape[1:], dtype = batch.dtype)
            batch = np.concatenate([batch, blank], axis = 0)
        predict = sess.run(predictions, feed_dict = {X: batch, keep_prob: 1})
        if end % 512 == 0:
            # Occasional peek at the raw predictions.
            print(predict)
        correct += int(np.sum(predict[:valid] == labels[start:end]))
    return correct / total
# Script entry point: build the network, then train it and test it.
train_vgg()
结果:
在18个epoch之后精度为43.8%左右,且对比之前有较大提升,如果再训练应该还能更好。
最后的精度果然更好了。但是比起ResNet这样的还是有差距,并且因为参数数量还是多的,所以速度还是慢。
这里的矩阵是我随便找的几个predict数据。