- Date: 2020/06/28
- Author: [email protected]
- Motivation: for mask-wearing face recognition, implement inference with three kinds of input:
  1. image; 2. video; 3. camera
- References:
  face-mask-detection-tf2/inference.py
  FaceMaskDetection/tensorflow_infer.py
- Why not use the FaceMaskDetection/tensorflow_infer.py project?
  Because it does not provide training code, only video and image inference; its video fps is between 90 and 100.
- Why use the face-mask-detection-tf2/inference.py project?
  Because it provides training code, plus image and camera inference.
  This note is based on the PureHing/face-mask-detection-tf2 project. So far my only change is to inference.py: I added video-stream input and extracted the image-input code into a run_img function.
This is a lightweight face mask detection model: based on SSD, the backbone is MobileNet with RFB.
- Environment: TensorFlow 2.1, OpenCV

The project's image-input and camera-input inference starts with the imports:

import cv2
import os
import time
import numpy as np
import tensorflow as tf
from absl import flags, app
from absl.flags import FLAGS
from pathlib import Path
import logging
Log the image path and check that img_path exists:
logging.info(f"[*] image path: {img_path}")
assert os.path.exists(img_path), \
(f"Cannot find image path from {img_path}")
logging.info("[*] Predict {} image... ".format(img_path))
I0628 17:00:31.323117 139714559833920 inference.py:98] [*] image path: assets/test2.jpg
I0628 17:00:31.323309 139714559833920 inference.py:101] [*] Predict assets/test2.jpg image...
OpenCV reads images in BGR format, so the image needs to be converted to RGB:
img_raw = cv2.imread(img_path)
# get image size, (572, 950)
img_height_raw, img_width_raw, _ = img_raw.shape
img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
At this point you can display img to take a look:
cv2.imshow('img_raw', img_raw)
cv2.imshow('img', img[:, :, ::-1])
cv2.waitKey(0)
cfg['steps'] = [8, 16, 32, 64]
img: (572, 950) -> (576, 960)
pad_params: (img_h, img_w, img_pad_h, img_pad_w)
            (572, 950, 4, 10)
# pad input image to avoid unmatched shape problem
img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
# normalize pixel values to [-0.5, 0.5]
img = img / 255.0 - 0.5
def pad_input_image(img, max_steps):
    """Pad image so height and width are multiples of max_steps."""
    img_h, img_w, _ = img.shape
    img_pad_h = (max_steps - img_h % max_steps) if img_h % max_steps > 0 else 0
    img_pad_w = (max_steps - img_w % max_steps) if img_w % max_steps > 0 else 0
    # pad with the mean color of the image
    pad_val = np.mean(img, axis=(0, 1)).astype(np.uint8)
    img = cv2.copyMakeBorder(img, 0, img_pad_h, 0, img_pad_w,
                             cv2.BORDER_CONSTANT, value=pad_val.tolist())
    pad_params = (img_h, img_w, img_pad_h, img_pad_w)
    return img, pad_params
"min_sizes":[[10, 16, 24], [32, 48], [64, 96], [128, 192, 256]],
"steps": [8, 16, 32, 64]
There are four feature maps, and each point on them has [3, 2, 2, 3] anchor boxes respectively (e.g., every point on the first feature map has 3 boxes). Therefore:
total number of prior boxes = sum(feature map size * boxes per point)
In [21]: def fun(sum, a):
    ...:     for i in range(len(a)):
    ...:         if i == 1 or i == 2:
    ...:             sum = sum + np.ceil(576/a[i])*np.ceil(960/a[i])*2
    ...:         else:
    ...:             sum = sum + np.ceil(576/a[i])*np.ceil(960/a[i])*3
    ...:     print(sum)
    ...:

In [22]: a
Out[22]: [8, 16, 32, 64]

In [23]: fun(0, a)
31725.0
# sum(feature map size[i] * len(min_size[i])) 31725
priors, _ = priors_box(cfg, image_sizes=(img.shape[0], img.shape[1]))
priors = tf.cast(priors, tf.float32)
Add a batch dimension, i.e., set batch=1:
# the array based representation of the image will be used later in order to prepare the
# result image with boxes and labels on it.
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
img_np_expanded = np.expand_dims(img, axis=0)
# shape: [1, anchors number, 3 classes + 4 xyzw]
predictions = model.predict(img_np_expanded)
Use score_threshold and nms_threshold to filter out the qualifying boxes, classes, and scores:
# split three parts
boxes, classes, scores = parse_predict(predictions, priors, cfg)
At this point detection and classification are essentially done; what remains is saving and displaying the image:
# recover padding effect
boxes = recover_pad_output(boxes, pad_params)
# draw and save results
save_img_path = os.path.join('assets', 'out_'+os.path.basename(img_path))
for prior_index in range(len(boxes)):
    show_image(img_raw, boxes, classes, scores, img_height_raw,
               img_width_raw, prior_index, cfg['labels_list'])
cv2.imwrite(save_img_path, img_raw)
cv2.imshow('results', img_raw)
cv2.waitKey(0)
def run_img(img_path, cfg, show_result=True):
    img_path = Path(img_path)
    logging.info(f"[*] image path: {img_path}")
    assert img_path.exists(), (f"Cannot find image path from {img_path}")
    logging.info("[*] Predict {} image... ".format(img_path))
    # read image, default is BGR
    img_raw = cv2.imread(str(img_path))
    # get image size, (572, 950)
    img_height_raw, img_width_raw, _ = img_raw.shape
    # convert BGR to RGB
    img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
    # cv2.imshow('img_raw', img_raw)
    # cv2.imshow('img', img[:, :, ::-1])
    # cv2.waitKey(0)
    # pad input image to avoid unmatched shape problem
    img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
    # normalize to [-0.5, 0.5], padded size (576, 960)
    img = img / 255.0 - 0.5
    # sum(feature map size[i] * len(min_size[i])) 31725
    priors, _ = priors_box(cfg, image_sizes=(img.shape[0], img.shape[1]))
    priors = tf.cast(priors, tf.float32)
    # the array based representation of the image will be used later in order
    # to prepare the result image with boxes and labels on it.
    # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
    img_np_expanded = np.expand_dims(img, axis=0)
    # shape: [1, anchors number, 3 classes + 4 xyzw]
    predictions = model.predict(img_np_expanded)
    # split three parts
    boxes, classes, scores = parse_predict(predictions, priors, cfg)
    logging.info(f"scores:{scores}")
    # recover padding effect
    boxes = recover_pad_output(boxes, pad_params)
    # draw and save results
    save_img_path = Path('assets') / ('out_' + img_path.name)
    for prior_index in range(len(boxes)):
        show_image(img_raw, boxes, classes, scores, img_height_raw,
                   img_width_raw, prior_index, cfg['labels_list'])
    cv2.imwrite(str(save_img_path), img_raw)
    cv2.imshow('results', img_raw)
    cv2.waitKey(0)
- During video inference the fps is only 11~20, while reference 2 runs at 90~100 fps.
  The root cause is probably the different number of prior boxes, which depends on the feature map sizes.
  In reference 1, the feature map sizes are derived from the input resolution:
image_sizes = [720, 1280]
steps = [8, 16, 32, 64]
# sizes of the four feature maps
feature_maps = [[math.ceil(image_sizes[0] / step), math.ceil(image_sizes[1] / step)]
                for step in steps]
In reference 2, the feature map sizes are preset:
# anchor configuration
feature_map_sizes = [[33, 33], [17, 17], [9, 9], [5, 5], [3, 3]]
anchor_sizes = [[0.04, 0.056], [0.08, 0.11], [0.16, 0.22], [0.32, 0.45], [0.64, 0.72]]
anchor_ratios = [[1, 0.62, 0.42]] * 5
That is probably the main source of the fps gap (see the quick count below); another possible factor is the difference in backbone networks.
# reference 1 video inference times:
#   read_frame: 0.002267, infer time: 0.084887, write time: 0.004903
# reference 2 video inference times:
#   read_frame: 0.001686, infer time: 0.018405, write time: 0.009145
- Another point: in the final output, I only got the desired result after raising the score threshold to 0.9.
# create a video capture object
cap = cv2.VideoCapture(video_path)
# read the frame size & fps
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
fps = cap.get(cv2.CAP_PROP_FPS)
# get the total number of frames
total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
total_frames = int(total_frames)
assert cap.isOpened(), ("Video open failed.")
# set the output video codec
fourcc = cv2.VideoWriter_fourcc(*'XVID')
# create a video writer object
writer = cv2.VideoWriter(output_video_name, fourcc,
fps, (int(width), int(height)))
# read one frame
ret, img_raw = cap.read()
assert ret, 'No video found'
# convert BGR to RGB
img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
# cv2.imshow('img', img[:, :, ::-1])
# key = cv2.waitKey(0)
The rest is much the same as the image input above; the source is below. The differences are that video is saved differently from a single image, and that I set cfg.score_threshold to 0.9.
def run_on_video(video_path, output_video_name, conf_thresh, cfg, model):
    # create a video capture object
    cap = cv2.VideoCapture(video_path)
    # read the frame size & fps
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    fps = cap.get(cv2.CAP_PROP_FPS)
    # set the output video codec
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # create a video writer object
    writer = cv2.VideoWriter(output_video_name, fourcc,
                             fps, (int(width), int(height)))
    # get the total number of frames
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    assert cap.isOpened(), "Video open failed."
    # build the prior boxes once, since the frame size is fixed
    priors, _ = priors_box(cfg, image_sizes=(int(height), int(width)))
    priors = tf.cast(priors, tf.float32)
    for idx in range(total_frames):
        start_stamp = time.time()
        ret, img_raw = cap.read()
        assert ret, 'No video found'
        img = cv2.cvtColor(img_raw, cv2.COLOR_BGR2RGB)
        # reading time
        read_frame_stamp = time.time()
        # normalize to [-0.5, 0.5]
        img = img / 255.0 - 0.5
        # add a batch dimension, img shape: [1, None, None, 3]
        # predictions shape: [1, anchors number, 3 classes + 4 xyzw]
        predictions = model.predict(img[np.newaxis, ...])
        boxes, classes, scores = parse_predict(predictions, priors, cfg)
        for prior_index in range(len(classes)):
            show_image(img_raw, boxes, classes, scores, int(height), int(width),
                       prior_index, cfg['labels_list'])
        fps_str = "FPS: %.2f" % (1 / (time.time() - read_frame_stamp))
        cv2.putText(img_raw, fps_str, (25, 25), cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)
        # show frame
        cv2.imshow('result', img_raw)
        cv2.waitKey(1)
        # inference time
        inference_stamp = time.time()
        # write frame
        writer.write(img_raw)
        write_frame_stamp = time.time()
        print("%d of %d" % (idx + 1, total_frames))
        print("read_frame:%f, infer time:%f, write time:%f" %
              (read_frame_stamp - start_stamp,
               inference_stamp - read_frame_stamp,
               write_frame_stamp - inference_stamp))
    cap.release()
    writer.release()
For the final camera part: I don't have a USB camera, so I didn't dig into it; I just extracted the code and embedded it in the main function. The end result is a 240*320 window with real-time detection.
def run_camera():
    capture = cv2.VideoCapture(0)
    capture.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
    capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 240)
    priors, _ = priors_box(cfg, image_sizes=(240, 320))
    priors = tf.cast(priors, tf.float32)
    start = time.time()
    while True:
        # check the return flag instead of the frame itself: asserting on a
        # numpy array raises "truth value is ambiguous"
        ret, frame = capture.read()
        assert ret, 'No camera found'
        h, w, _ = frame.shape
        img = np.float32(frame.copy())
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img / 255.0 - 0.5
        predictions = model(img[np.newaxis, ...])
        boxes, classes, scores = parse_predict(predictions, priors, cfg)
        for prior_index in range(len(classes)):
            show_image(frame, boxes, classes, scores, h, w, prior_index, cfg['labels_list'])
        # calculate fps
        fps_str = "FPS: %.2f" % (1 / (time.time() - start))
        start = time.time()
        cv2.putText(frame, fps_str, (25, 25), cv2.FONT_HERSHEY_DUPLEX, 0.75, (0, 255, 0), 2)
        # show frame
        cv2.imshow('frame', frame)
        if cv2.waitKey(1) == ord('q'):
            exit()
The most valuable parts of this project's inference are the generation of the prior boxes and the NMS filtering mechanism. A written test once asked candidates to hand-code the IoU algorithm; I had the idea but didn't get it all down, which I regret. To be filled in later (a rough sketch is attached at the end of this note).
The parts mentioned above correspond to these two functions:
priors, _ = priors_box(cfg, image_sizes=(int(height), int(width)))
boxes, classes, scores = parse_predict(predictions, priors, cfg)