Contents
1. Overview
2. Environment
3. Data Loading
4. VGG
5. ResNet
6. Webcam Emotion Recognition
1. Overview
I trained both VGG and ResNet on FER2013. This is only a bare-bones implementation with no further improvements, so the test-set accuracy is not high; it is purely an exercise. At the end of the post, the trained model is used for webcam expression detection, for reference only.
2. Environment
face_recognition==1.2.3
opencv_python==4.1.0.25
tensorflow==1.13.1
numpy==1.16.4
3. Data Loading
Being lazy, I just downloaded FER2013 already converted to jpg images from the web, then used tf.data to load the data.
tf.data is quite simple and convenient to use. Here I load the images directly from the class folders, keep only a single grayscale channel, and map the pixel values to [0, 1].
import os

import numpy as np
import tensorflow as tf

# TYPE (the list of the 7 emotion labels), batch_size, train_datasets and
# validation_datasets are defined elsewhere in the project.

def _parse_function(filename, label):
    print(filename)  # debug: prints the symbolic filename tensor during graph construction
    image_string = tf.read_file(filename)
    # some of the downloaded files are actually png, so decode accordingly
    image_decoded = tf.cond(
        tf.image.is_jpeg(image_string),
        lambda: tf.image.decode_jpeg(image_string, channels=3),
        lambda: tf.image.decode_png(image_string, channels=3))
    image_gray = tf.image.rgb_to_grayscale(image_decoded)
    image_gray = tf.cast(image_gray, tf.float32) / 255.0
    label = tf.one_hot(label, len(TYPE))
    return image_gray, label

def create_dataset(filenames, labels, batch_size=batch_size, is_shuffle=True,
                   n_repeats=-1, func_map=_parse_function):
    """Create a dataset for the train and validation splits."""
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.constant(filenames), tf.constant(labels)))
    dataset = dataset.map(func_map)
    if is_shuffle:
        dataset = dataset.shuffle(buffer_size=1000 + 3 * batch_size)
    dataset = dataset.batch(batch_size).repeat(n_repeats)
    return dataset
# train data
filenames_t = []
labels_t = []
for index, type in enumerate(TYPE):
    file_list = [os.path.join(train_datasets, str(index) + '/' + file)
                 for file in os.listdir(os.path.join(train_datasets, str(index)))
                 if file.endswith('jpg')]
    filenames_t += file_list
    labels_t += [index] * len(file_list)

# shuffle filenames and labels with the same seed so the pairing is preserved
randnum = np.random.randint(0, 100)
np.random.seed(randnum)
np.random.shuffle(filenames_t)
np.random.seed(randnum)
np.random.shuffle(labels_t)
train_dataset = create_dataset(filenames_t, labels_t)

# validation data
filenames_v = []
labels_v = []
for index, type in enumerate(TYPE):
    file_list = [os.path.join(validation_datasets, str(index) + '/' + file)
                 for file in os.listdir(os.path.join(validation_datasets, str(index)))
                 if file.endswith('jpg')]
    filenames_v += file_list
    labels_v += [index] * len(file_list)

randnum = np.random.randint(0, 100)
np.random.seed(randnum)
np.random.shuffle(filenames_v)
np.random.seed(randnum)
np.random.shuffle(labels_v)
val_dataset = create_dataset(filenames_v, labels_v)
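With both datasets built, they can be consumed in TF 1.x through a reinitializable iterator. A minimal sketch, assuming the names defined above (this wiring is mine, not from the original post):

# minimal sketch: pull batches from the datasets in TF 1.x
iterator = tf.data.Iterator.from_structure(train_dataset.output_types,
                                           train_dataset.output_shapes)
images, labels = iterator.get_next()

train_init_op = iterator.make_initializer(train_dataset)
val_init_op = iterator.make_initializer(val_dataset)

with tf.Session() as sess:
    sess.run(train_init_op)
    batch_images, batch_labels = sess.run([images, labels])
    print(batch_images.shape)  # (batch_size, 48, 48, 1)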
4. VGG
Because the FER2013 images are low resolution (only 48x48, single channel), for the first attempt I wanted to use the simplest possible network, so I started with VGG.
It is not the classic VGG16 or VGG19; I simplified it down to 4 conv layers + 4 pooling layers + 2 fully connected layers, with the fully connected layers implemented as convolutions. The network is quite simple.
The training curves are shown below:
[Figure: training loss and accuracy curves]
A quick run on the test set gives an average accuracy of about 54%, which is appallingly low.
Looking back at the dataset... besides whatever mistakes I may have made myself, the dataset has a few problems:
1. The resolution is very low.
2. There are frontal faces, profiles, dark faces, bright faces, cartoon faces... faces whose expressions even I can't tell apart.
3. Some images are, damn it, not faces at all.
Model code:
import tensorflow as tf

class VGG():
    def _max_pool(self, net, name):
        return tf.layers.max_pooling2d(net, pool_size=[2, 2], strides=[2, 2],
                                       padding='same', name=name)

    def _conv_layer(self, net, filters, activation=tf.nn.relu, name=None):
        return tf.layers.conv2d(net, filters=filters, kernel_size=[3, 3], strides=[1, 1],
                                padding='same',
                                activation=activation,
                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                name=name)

    def _bn_layer(self, net, training, name):
        return tf.layers.batch_normalization(net, training=training, name=name)

    def _dropout_layer(self, net, dropout_prob, training, name):
        return tf.layers.dropout(net, rate=dropout_prob, training=training, name=name)

    def _fc_layer(self, net, num_classes, name):
        return tf.layers.dense(net, num_classes, activation=tf.nn.relu, name=name)

    def _conv_fc_layer(self, net, filters, kernel_size, padding='same', activation=None, name=None):
        # fully connected layer implemented as a convolution
        return tf.layers.conv2d(net, filters=filters, kernel_size=kernel_size, strides=[1, 1],
                                padding=padding,
                                activation=activation,
                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                name=name)

    def predict(self, input, num_classes, dropout_prob=0.5, training=True, scope=None):
        with tf.variable_scope(scope, 'VGG', [input]):
            net = self._conv_layer(input, 16, name="conv1_1")  # 48x48x16
            net = self._max_pool(net, 'pool1')                 # 24x24x16
            net = self._conv_layer(net, 32, name="conv2_1")    # 24x24x32
            net = self._max_pool(net, 'pool2')                 # 12x12x32
            net = self._conv_layer(net, 64, name="conv3_1")    # 12x12x64
            net = self._max_pool(net, 'pool3')                 # 6x6x64
            net = self._conv_layer(net, 128, name="conv4_1")   # 6x6x128
            net = self._max_pool(net, 'pool4')                 # 3x3x128
            net = self._conv_fc_layer(net, 1024, [3, 3], 'valid', tf.nn.relu, name="fc5")  # 1x1x1024
            net = self._dropout_layer(net, dropout_prob, training, 'dp5')
            net = self._conv_fc_layer(net, 1024, [1, 1], activation=tf.nn.relu, name="fc6")  # 1x1x1024
            net = self._dropout_layer(net, dropout_prob, training, 'dp6')
            net = self._conv_fc_layer(net, num_classes, [1, 1], name="fc7")  # 1x1 x num_classes
            net = tf.squeeze(net, [1, 2], name='fc7/squeezed')
            net = tf.nn.softmax(net, name="prob")
            return net
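The post doesn't show the training loop, so below is a minimal sketch of one way to wire the model up, assuming the images/labels tensors and train_init_op from the iterator sketch in section 3. The optimizer, learning rate and step counts are my own choices, not from the repo; note that predict() already ends in a softmax, so the loss works on probabilities rather than logits:

# minimal training sketch (assumed wiring, not from the original repo)
model = VGG()
probs = model.predict(images, num_classes=len(TYPE), training=True)

# plain cross-entropy on the softmax output; the epsilon guards against log(0)
loss = -tf.reduce_mean(tf.reduce_sum(labels * tf.log(probs + 1e-8), axis=1))
accuracy = tf.reduce_mean(tf.cast(
    tf.equal(tf.argmax(probs, 1), tf.argmax(labels, 1)), tf.float32))

train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)
    for step in range(1000):
        _, loss_, acc_ = sess.run([train_op, loss, accuracy])
        if step % 100 == 0:
            print('step {}: loss {:.4f}, acc {:.4f}'.format(step, loss_, acc_))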
5. ResNet
The ResNet uses 4 stages of residual blocks and is fairly simple in structure. The training curves are shown below:
[Figure: training loss and accuracy curves]
A quick run on the test set: average accuracy around 48%.... OK then, let's just leave it at that.
Model code:
import tensorflow as tf

class resnet():
    def __init__(self, num_classes, is_training=True):
        self.is_training = is_training
        self.num_classes = num_classes
        self.layers = []

    def _conv_layer(self, net, filters, activation=None, name='Conv'):
        return tf.layers.conv2d(net, filters=filters, kernel_size=[3, 3], strides=[1, 1],
                                padding='same',
                                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                activation=activation,
                                name=name)

    def _conv_bn_relu_layer(self, net, filters, stride=1, name='Conv_bn_relu'):
        net = tf.layers.conv2d(
            net, filters=filters, kernel_size=[3, 3],
            strides=[stride, stride], padding='same', name=name)
        net = self._bn_layer(net, self.is_training)
        net = tf.nn.relu(net)
        return net

    def _bn_layer(self, net, training, name='BN'):
        return tf.layers.batch_normalization(net, training=training, name=name)

    def _max_pool(self, net, name='Max_Pool'):
        return tf.layers.max_pooling2d(net, pool_size=[2, 2], strides=[2, 2],
                                       padding='same', name=name)

    def _avg_pool(self, net, name='Avg_Pool'):
        return tf.layers.average_pooling2d(net, pool_size=[2, 2], strides=[2, 2],
                                           padding='valid', name=name)

    def _fc_layer(self, net, num_classes, name='FC'):
        return tf.layers.dense(net, num_classes, name=name)

    def _residual_block(self, input_layer, output_channel, first_block=False):
        '''
        Defines a residual block in ResNet
        :param input_layer: 4D tensor
        :param output_channel: int. return_tensor.get_shape().as_list()[-1] = output_channel
        :param first_block: if this is the first residual block of the whole network
        :return: 4D tensor.
        '''
        input_channel = input_layer.get_shape().as_list()[-1]
        # When it's time to "shrink" the image size, we use stride = 2
        if input_channel * 2 == output_channel:
            increase_dim = True
            stride = 2
        elif input_channel == output_channel:
            increase_dim = False
            stride = 1
        else:
            raise ValueError('Output and input channel do not match in residual block!')

        # The first conv layer of the first residual block does not need to be
        # batch-normalized and relu-ed.
        with tf.variable_scope('conv1_in_block'):
            if first_block:
                net = self._conv_layer(input_layer, output_channel)
            else:
                net = self._conv_bn_relu_layer(input_layer, output_channel, stride)
        with tf.variable_scope('conv2_in_block'):
            net = self._conv_bn_relu_layer(net, output_channel, 1)

        # When the channels of the input layer and conv2 do not match, average-pool
        # the input and zero-pad its depth so the shortcut can still be added
        if increase_dim is True:
            pooled_input = self._avg_pool(input_layer)
            padded_input = tf.pad(pooled_input,
                                  [[0, 0], [0, 0], [0, 0],
                                   [input_channel // 2, input_channel // 2]])
        else:
            padded_input = input_layer
        output = net + padded_input
        return output

    def build(self, input, n):
        with tf.variable_scope('conv0', regularizer=tf.contrib.layers.l2_regularizer(0.0002)):
            net = self._conv_layer(input, 16, tf.nn.relu, name='Conv1')  # 48x48
            net = self._conv_layer(net, 16, tf.nn.relu, name='Conv2')    # 48x48
            net = self._max_pool(net)                                    # 24x24
            self.layers.append(net)
        for i in range(n):
            with tf.variable_scope('conv1_%d' % i, regularizer=tf.contrib.layers.l2_regularizer(0.0002)):
                if i == 0:
                    net = self._residual_block(self.layers[-1], 16, first_block=True)  # 24x24
                else:
                    net = self._residual_block(self.layers[-1], 16)
                self.layers.append(net)
        for i in range(n):
            with tf.variable_scope('conv2_%d' % i, regularizer=tf.contrib.layers.l2_regularizer(0.0002)):  # 12x12
                net = self._residual_block(self.layers[-1], 32)
                self.layers.append(net)
        for i in range(n):
            with tf.variable_scope('conv3_%d' % i, regularizer=tf.contrib.layers.l2_regularizer(0.0002)):  # 6x6
                net = self._residual_block(self.layers[-1], 64)
                self.layers.append(net)
        for i in range(n):
            with tf.variable_scope('conv4_%d' % i, regularizer=tf.contrib.layers.l2_regularizer(0.0002)):  # 3x3
                net = self._residual_block(self.layers[-1], 128)
                self.layers.append(net)
        # assert net.get_shape().as_list()[1:] == [3, 3, 128]
        with tf.variable_scope('fc', regularizer=tf.contrib.layers.l2_regularizer(0.0002)):
            net = self._bn_layer(self.layers[-1], self.is_training)
            net = tf.nn.relu(net)
            net = tf.reduce_mean(net, [1, 2])  # global average pooling
            assert net.get_shape().as_list()[-1:] == [128]
            net = self._fc_layer(net, self.num_classes)
            self.layers.append(net)
        return self.layers[-1]

    def loss(self, logits, labels):
        # expects integer class indices, not the one-hot labels produced by the
        # tf.data pipeline above (convert with tf.argmax(labels, 1) first)
        labels = tf.cast(labels, tf.int64)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels, name='cross_entropy_per_example')
        cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
        return cross_entropy_mean
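Two wiring details are worth flagging when training this model: tf.layers.batch_normalization puts its moving-average updates into the UPDATE_OPS collection, which must run together with the train op, and loss() expects integer class indices while the tf.data pipeline above emits one-hot labels. A minimal sketch under those assumptions, reusing the images/labels tensors from section 3 (none of this wiring is from the original repo):

# minimal sketch (assumed wiring, not from the original repo)
model = resnet(len(TYPE), is_training=True)
logits = model.build(images, 2)  # n=2 -> two residual blocks per stage
# convert the pipeline's one-hot labels back to class indices for the sparse loss
loss = model.loss(logits, tf.argmax(labels, 1))
loss += tf.losses.get_regularization_loss()  # pick up the l2 regularizers

# batch_normalization keeps its moving-mean/variance updates in UPDATE_OPS;
# without this dependency the statistics are never updated for inference
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)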
6. Webcam Emotion Recognition
OpenCV reads the video/webcam stream, face_recognition detects the faces, and each face crop is fed into the network for classification. The result and code are below:
import sys

import cv2 as cv
import face_recognition
import numpy as np
import tensorflow as tf

from model_resnet import resnet

b_Saved = True   # write the annotated frames to a video file
b_Show = False   # display the frames in a window
TYPE = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
model_path = 'model/resnet/'

train_mode = tf.placeholder(tf.bool)
input = tf.placeholder(tf.float32, shape=[1, 48, 48, 1])
model = resnet(len(TYPE), train_mode)
logits = tf.nn.softmax(model.build(input, 2))
saver = tf.train.Saver()

cam = cv.VideoCapture('./video_test/emotion_test.mp4')
if not cam.isOpened():
    sys.exit()
if b_Saved:
    width = cam.get(cv.CAP_PROP_FRAME_WIDTH)
    height = cam.get(cv.CAP_PROP_FRAME_HEIGHT)
    fps = int(cam.get(cv.CAP_PROP_FPS))
    writer = cv.VideoWriter(
        './video_test/emotion_test_result.avi', cv.VideoWriter_fourcc(*'MJPG'),
        fps, (int(width), int(height))
    )

with tf.Session() as sess:
    if tf.train.latest_checkpoint(model_path) is not None:
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
    else:
        # a bare assert on a non-empty string never fires, so exit explicitly
        sys.exit('can not find checkpoint under {}!'.format(model_path))
    try:
        while True:
            ret, bgr = cam.read()
            if not ret:
                if b_Saved:
                    break
                # when only displaying, rewind and loop the video
                cam.set(cv.CAP_PROP_POS_FRAMES, 0)
                continue
            h, w, _ = bgr.shape
            # bgr = cv.resize(bgr, (w // 2, h // 2))
            gray = cv.cvtColor(bgr, cv.COLOR_BGR2GRAY)
            face_locations = face_recognition.face_locations(bgr)
            emotion_list = []
            face_list = []
            for face_location in face_locations:
                top, right, bottom, left = face_location
                face_roi = gray[top:bottom, left:right]
                face_roi = cv.resize(face_roi, (48, 48))
                face_list.append(face_roi)
                # normalize to [0, 1] and add batch/channel dims, as in training
                logits_ = sess.run(logits,
                                   feed_dict={
                                       input: np.reshape(face_roi.astype(np.float32) / 255.0,
                                                         (1,) + face_roi.shape + (1,)),
                                       train_mode: False
                                   })
                emotion = TYPE[np.argmax(logits_[0])]
                emotion_list.append(emotion)
                cv.rectangle(bgr, (left, top), (right, bottom), (0, 255, 0), 2)
                cv.rectangle(bgr, (left, top - 20), (right, top), (0, 255, 0), cv.FILLED)
                cv.putText(bgr, emotion, (left, top), cv.FONT_HERSHEY_PLAIN, 1.5,
                           (0, 0, 255), thickness=1)
            print('detect face:{}, emotion:{}'.format(len(face_locations), emotion_list))
            if b_Show:
                cv.imshow('Camera', bgr)
                for index, roi in enumerate(face_list):
                    cv.imshow('roi_%d' % index, roi)
                cv.waitKey(1)
            if b_Saved:
                writer.write(bgr)
    except Exception as e:
        print('Error:', e)
        sys.exit()
    finally:
        cam.release()
        if b_Saved:
            writer.release()
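The script above reads from a test video file; to run against a live webcam instead, OpenCV just needs the device index:

cam = cv.VideoCapture(0)  # first attached camera instead of the test video file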
Full project code: GitHub