这一篇博客生成ONet的训练数据。进入`prepare_data`文件夹，打开`gen_hard_example.py`脚本，代码如下：
#coding:utf-8
import sys
#sys.path.append("../")
from prepare_data.utils import convert_to_square
sys.path.insert(0,'..')
import numpy as np
import argparse
import os
import pickle as pickle
import cv2
from train_models.mtcnn_model import P_Net, R_Net, O_Net
from train_models.MTCNN_config import config
from prepare_data.loader import TestLoader
from Detection.detector import Detector
from Detection.fcn_detector import FcnDetector
from Detection.MtcnnDetector import MtcnnDetector
from utils import *
from prepare_data.data_utils import *
#net : 24(RNet)/48(ONet)
#data: dict()
def save_hard_example(net, data, save_path):
    """Crop hard examples from saved detections and write their label files.

    Loads ``detections.pkl`` from ``save_path``, compares every detected box
    with the ground-truth boxes of its image and saves the resized crop as

    * negative  -- max IoU < 0.3, at most 60 per image, label line ``<path> 0``
    * positive  -- max IoU >= 0.65, label ``1`` plus four bbox offsets
    * part face -- 0.4 <= max IoU < 0.65, label ``-1`` plus four bbox offsets

    Relies on the module-level names ``image_size``, ``neg_dir``, ``pos_dir``
    and ``part_dir`` that the ``__main__`` section defines.

    Parameters:
    ----------
        net: int
            size tag used in the label directory name (24 for RNet, 48 for ONet)
        data: dict
            holds 'images' (image paths) and 'bboxes' (ground-truth boxes)
        save_path: str
            directory that contains ``detections.pkl``
    """
    im_idx_list = data['images']
    gt_boxes_list = data['bboxes']
    num_of_images = len(im_idx_list)
    print("processing %d images in total" % num_of_images)

    # label files, one per sample category
    neg_label_file = "../../DATA/no_LM%d/neg_%d.txt" % (net, image_size)
    pos_label_file = "../../DATA/no_LM%d/pos_%d.txt" % (net, image_size)
    part_label_file = "../../DATA/no_LM%d/part_%d.txt" % (net, image_size)

    # Context managers guarantee the label files are flushed and closed even
    # when an image fails half-way through the run.
    with open(neg_label_file, 'w') as neg_file, \
            open(pos_label_file, 'w') as pos_file, \
            open(part_label_file, 'w') as part_file:
        # Load the detections written by the previous stage.
        # NOTE(review): pickle must only be used on files this pipeline wrote.
        with open(os.path.join(save_path, 'detections.pkl'), 'rb') as det_file:
            det_boxes = pickle.load(det_file)
        print(len(det_boxes))
        print(num_of_images)
        assert len(det_boxes) == num_of_images, "incorrect detections or ground truths"

        # running indices of neg, pos and part faces, used as their image names
        n_idx = 0
        p_idx = 0
        d_idx = 0
        image_done = 0
        # im_idx_list: image paths, det_boxes: detections, gt_boxes_list: ground truth
        for im_idx, dets, gts in zip(im_idx_list, det_boxes, gt_boxes_list):
            gts = np.array(gts, dtype=np.float32).reshape(-1, 4)
            if image_done % 100 == 0:
                print("%d images done" % image_done)
            image_done += 1

            if dets.shape[0] == 0:
                continue
            img = cv2.imread(im_idx)
            if img is None:
                # cv2.imread returns None instead of raising on a missing or
                # unreadable file; skip it rather than crash on img.shape.
                print("skip unreadable image: %s" % im_idx)
                continue
            # change to square
            dets = convert_to_square(dets)
            dets[:, 0:4] = np.round(dets[:, 0:4])
            neg_num = 0
            for box in dets:
                x_left, y_top, x_right, y_bottom, _ = box.astype(int)
                width = x_right - x_left + 1
                height = y_bottom - y_top + 1

                # ignore boxes that are too small or reach beyond the image border
                if width < 20 or x_left < 0 or y_top < 0 or x_right > img.shape[1] - 1 or y_bottom > img.shape[0] - 1:
                    continue

                # IoU between the current box and all ground-truth boxes
                Iou = IoU(box, gts)
                # An image without ground truth yields an empty IoU array;
                # every detection on it is then treated as a negative.
                max_iou = np.max(Iou) if np.size(Iou) else 0.0

                is_negative = max_iou < 0.3 and neg_num < 60
                if not is_negative and max_iou < 0.4:
                    # neither negative (or quota used up), part nor positive:
                    # nothing is saved, so skip the crop and resize work
                    continue

                cropped_im = img[y_top:y_bottom + 1, x_left:x_right + 1, :]
                resized_im = cv2.resize(cropped_im, (image_size, image_size),
                                        interpolation=cv2.INTER_LINEAR)

                if is_negative:
                    # save negative image and write its label
                    save_file = get_path(neg_dir, "%s.jpg" % n_idx)
                    neg_file.write(save_file + ' 0\n')
                    cv2.imwrite(save_file, resized_im)
                    n_idx += 1
                    neg_num += 1
                    continue

                # find the ground-truth box with the highest IoU
                idx = np.argmax(Iou)
                assigned_gt = gts[idx]
                x1, y1, x2, y2 = assigned_gt

                # bbox regression label: corner offsets normalised by box size
                offset_x1 = (x1 - x_left) / float(width)
                offset_y1 = (y1 - y_top) / float(height)
                offset_x2 = (x2 - x_right) / float(width)
                offset_y2 = (y2 - y_bottom) / float(height)

                # save positive and part-face images and write labels
                if max_iou >= 0.65:
                    save_file = get_path(pos_dir, "%s.jpg" % p_idx)
                    pos_file.write(save_file + ' 1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    p_idx += 1
                elif max_iou >= 0.4:
                    save_file = os.path.join(part_dir, "%s.jpg" % d_idx)
                    part_file.write(save_file + ' -1 %.2f %.2f %.2f %.2f\n' % (
                        offset_x1, offset_y1, offset_x2, offset_y2))
                    cv2.imwrite(save_file, resized_im)
                    d_idx += 1
def t_net(prefix, epoch,
          batch_size, test_mode="PNet",
          thresh=[0.6, 0.6, 0.7], min_face_size=25,
          stride=2, slide_window=False, shuffle=False, vis=False):
    """Run the trained cascade over the WIDER annotations and mine hard examples.

    Builds the detectors required by ``test_mode``, runs ``detect_face`` on
    every annotated image, pickles the detections to ``detections.pkl`` and
    finally calls ``save_hard_example``.

    Relies on the module-level names ``data_dir`` and ``image_size`` defined
    in the ``__main__`` section. ``shuffle`` and ``vis`` are accepted but not
    used inside this function.

    Values the script passes by default (see ``parse_args``):
        prefix:        ['../data/MTCNN_model/PNet_Landmark/PNet',
                        '../data/MTCNN_model/RNet_Landmark/RNet',
                        '../data/MTCNN_model/ONet_Landmark/ONet']
        epoch:         [18, 14, 16]
        batch_size:    [2048, 256, 16]
        test_mode:     "RNet"
        thresh:        [0.3, 0.1, 0.7]
        min_face_size: 20
        stride:        2
    """
    print("Test model: ", test_mode)
    # '<prefix>-<epoch>' per stage, e.g. '../data/MTCNN_model/PNet_Landmark/PNet-18'
    checkpoints = ['%s-%s' % pair for pair in zip(prefix, epoch)]
    print(checkpoints[0])

    # PNet is always loaded: sliding-window Detector or fully-convolutional FcnDetector
    if slide_window:
        pnet = Detector(P_Net, 12, batch_size[0], checkpoints[0])
    else:
        pnet = FcnDetector(P_Net, checkpoints[0])
    detectors = [pnet, None, None]

    # RNet is needed when testing RNet or ONet
    if test_mode in ("RNet", "ONet"):
        print("==================================", test_mode)
        detectors[1] = Detector(R_Net, 24, batch_size[1], checkpoints[1])

    # ONet only when testing the full cascade
    if test_mode == "ONet":
        print("==================================", test_mode)
        detectors[2] = Detector(O_Net, 48, batch_size[2], checkpoints[2])

    basedir = '../../DATA/'
    filename = './wider_face_train_bbx_gt.txt'
    # read_annotation returns a dict with the keys 'images' and 'bboxes'
    data = read_annotation(basedir, filename)

    mtcnn_detector = MtcnnDetector(detectors=detectors,
                                   min_face_size=min_face_size,
                                   stride=stride,
                                   threshold=thresh,
                                   slide_window=slide_window)
    print("==================================")
    # the images are loaded through TestLoader, i.e. in "test" mode
    print('load test data')
    test_data = TestLoader(data['images'])
    print('finish loading')
    print('start detecting....')
    detections, _ = mtcnn_detector.detect_face(test_data)
    print('finish detecting ')

    # detections of one stage become the training data of the next stage;
    # any other test_mode falls back to 'RNet'
    save_net = {"PNet": "RNet", "RNet": "ONet"}.get(test_mode, 'RNet')

    # save the detection result
    save_path = os.path.join(data_dir, save_net)
    print('save_path is :')
    print(save_path)
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    save_file = os.path.join(save_path, "detections.pkl")
    with open(save_file, 'wb') as f:
        pickle.dump(detections, f, 1)

    # detection finished, start online hard example mining
    print("%s测试完成开始OHEM" % image_size)
    save_hard_example(image_size, data, save_path)
def parse_args():
    """Build the command-line parser and return the parsed arguments.

    Every option has a default, so the script also runs without arguments;
    the ``help`` text of each option describes its meaning.
    """
    cli = argparse.ArgumentParser(
        description='Test mtcnn',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    cli.add_argument('--test_mode', dest='test_mode', type=str, default='RNet',
                     help='test net type, can be pnet, rnet or onet')

    # one value per cascade stage (PNet, RNet, ONet)
    cli.add_argument('--prefix', dest='prefix', nargs="+", type=str,
                     default=['../data/MTCNN_model/PNet_Landmark/PNet',
                              '../data/MTCNN_model/RNet_Landmark/RNet',
                              '../data/MTCNN_model/ONet_Landmark/ONet'],
                     help='prefix of model name')
    cli.add_argument('--epoch', dest='epoch', nargs="+", type=int,
                     default=[18, 14, 16],
                     help='epoch number of model to load')
    cli.add_argument('--batch_size', dest='batch_size', nargs="+", type=int,
                     default=[2048, 256, 16],
                     help='list of batch size used in prediction')
    cli.add_argument('--thresh', dest='thresh', nargs="+", type=float,
                     default=[0.3, 0.1, 0.7],
                     help='list of thresh for pnet, rnet, onet')

    cli.add_argument('--min_face', dest='min_face', type=int, default=20,
                     help='minimum face size for detection')
    cli.add_argument('--stride', dest='stride', type=int, default=2,
                     help='stride of sliding window')

    # boolean switches, all off unless given
    switches = (
        ('--sw', 'slide_window', 'use sliding window in pnet'),
        ('--shuffle', 'shuffle', 'shuffle data on visualization'),
        ('--vis', 'vis', 'turn on visualization'),
    )
    for flag, dest, text in switches:
        cli.add_argument(flag, dest=dest, help=text, action='store_true')

    return cli.parse_args()
if __name__ == '__main__':
    # Network whose training data is generated; it fixes the crop size.
    net = 'ONet'
    if net == "RNet":
        image_size = 24
    elif net == "ONet":
        image_size = 48

    base_dir = '../../DATA/WIDER_train'
    data_dir = '../../DATA/%s' % str(image_size)

    # Output folders for the three sample categories; these names are read as
    # globals by save_hard_example.
    neg_dir, pos_dir, part_dir = (
        get_path(data_dir, sub) for sub in ('negative', 'positive', 'part'))

    # create the output directories if they are missing
    for dir_path in (neg_dir, pos_dir, part_dir):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    args = parse_args()
    print('Called with argument:')
    print(args)

    t_net(prefix=args.prefix,              # model checkpoint prefixes
          epoch=args.epoch,                # epoch of each checkpoint
          batch_size=args.batch_size,      # batch sizes used for prediction
          test_mode=args.test_mode,        # which stage is being tested
          thresh=args.thresh,              # classification thresholds
          min_face_size=args.min_face,     # minimum face size
          stride=args.stride,              # sliding-window stride
          slide_window=args.slide_window,
          shuffle=args.shuffle,
          vis=False)
这里调用了类`MtcnnDetector`里面的方法`detect_face()`，并传入参数`test_data`，在这里将代码调出来看是如何实现的：
class MtcnnDetector(object):
    """Run the PNet -> RNet -> ONet cascade over a collection of images."""

    def __init__(self,
                 detectors,
                 min_face_size=20,
                 stride=2,
                 threshold=[0.6, 0.7, 0.7],
                 scale_factor=0.79,
                 slide_window=False):
        """Store the stage detectors and the detection settings.

        Parameters:
        ----------
            detectors: sequence of three items
                [pnet, rnet, onet] detector objects; a stage is skipped in
                ``detect_face`` when its entry is falsy (e.g. None)
            min_face_size: int
                minimum face size
            stride: int
                stored as ``self.stride``
            threshold: sequence of three floats
                score thresholds for pnet, rnet and onet
            scale_factor: float
                stored as ``self.scale_factor`` (0.709 was left commented out
                next to this default as an alternative value)
            slide_window: bool
                stored as ``self.slide_window``
        """
        self.pnet_detector = detectors[0]
        self.rnet_detector = detectors[1]
        self.onet_detector = detectors[2]
        self.min_face_size = min_face_size
        self.stride = stride
        # Copy, so the shared default list (or the caller's list) is never
        # aliased between instances.
        self.thresh = list(threshold)
        self.scale_factor = scale_factor
        self.slide_window = slide_window

    def detect_face(self, test_data):
        """Detect faces in every image of ``test_data``.

        Parameters:
        ----------
            test_data: iterable with a ``size`` attribute
                yields one image per iteration; ``size`` is used as the image
                count for progress and timing output

        Returns:
        -------
            all_boxes: list
                one entry per image: the calibrated boxes of the last active
                stage, or an empty array when a stage returned no boxes
            landmarks: list
                one entry per image: the landmark value of the last active
                stage, or an empty array
        """
        all_boxes = []  # boxes of every image
        landmarks = []
        batch_idx = 0
        sum_time = 0
        t1_sum = 0
        t2_sum = 0
        t3_sum = 0
        num_of_img = test_data.size
        empty_array = np.array([])
        s_time = time.time()
        for databatch in test_data:
            # every 100 images report progress and the mean time per image
            batch_idx += 1
            if batch_idx % 100 == 0:
                c_time = (time.time() - s_time) / 100
                print("%d out of %d images done" % (batch_idx, test_data.size))
                print('%f seconds for each image' % c_time)
                s_time = time.time()
            im = databatch

            # pnet
            if self.pnet_detector:
                st = time.time()
                # landmark is ignored at this stage
                boxes, boxes_c, landmark = self.detect_pnet(im)
                t1 = time.time() - st
                sum_time += t1
                t1_sum += t1
                if boxes_c is None:
                    print("boxes_c is None...")
                    all_boxes.append(empty_array)
                    # keep landmarks aligned with all_boxes
                    landmarks.append(empty_array)
                    continue

            # rnet
            if self.rnet_detector:
                t = time.time()
                # refine the calibrated boxes returned by pnet
                boxes, boxes_c, landmark = self.detect_rnet(im, boxes_c)
                t2 = time.time() - t
                sum_time += t2
                t2_sum += t2
                if boxes_c is None:
                    all_boxes.append(empty_array)
                    landmarks.append(empty_array)
                    continue

            # onet (skipped when no onet detector was supplied)
            if self.onet_detector:
                t = time.time()
                boxes, boxes_c, landmark = self.detect_onet(im, boxes_c)
                t3 = time.time() - t
                sum_time += t3
                t3_sum += t3
                if boxes_c is None:
                    all_boxes.append(empty_array)
                    landmarks.append(empty_array)
                    continue

            # keep the boxes and landmark of this image
            all_boxes.append(boxes_c)
            landmarks.append(landmark)

        # An empty loader would otherwise make the averages divide by zero.
        denom = num_of_img if num_of_img else 1
        print('num of images', num_of_img)
        print("time cost in average" +
              '{:.3f}'.format(sum_time / denom) +
              ' pnet {:.3f} rnet {:.3f} onet {:.3f}'.format(
                  t1_sum / denom, t2_sum / denom, t3_sum / denom))
        print('boxes length:', len(all_boxes))
        return all_boxes, landmarks
这里调用了类`MtcnnDetector`里面的方法`detect_pnet()`，并传入参数`im`，在这里将代码调出来看是如何实现的：
def detect_pnet(self, im):
    """Get face candidates through pnet

    Parameters:
    ----------
        im: numpy array
            input image array
    Returns:
    -------
        boxes: numpy array
            detected boxes before calibration (coordinates and score)
        boxes_c: numpy array
            boxes after calibration
        third value: always None (no landmark at this stage)

        (None, None, None) is returned when no pyramid level yields a box.
    """
    # height, width and channel count of the image
    h, w, c = im.shape
    net_size = 12

    # initial scale, e.g. 12 / 20 = 0.6 with the default min_face_size
    scale = float(net_size) / self.min_face_size
    pyramid_im = self.processed_image(im, scale)
    level_h, level_w, _ = pyramid_im.shape

    # fcn: walk down the image pyramid until the shorter side is no longer
    # larger than the net input size
    candidates = []
    while min(level_h, level_w) > net_size:
        # class probabilities and bbox regression of the current level
        cls_map, reg = self.pnet_detector.predict(pyramid_im)
        # rows: (x1, y1, x2, y2, score, x1_offset, y1_offset, x2_offset, y2_offset)
        level_boxes = self.generate_bbox(cls_map[:, :, 1], reg, scale, self.thresh[0])

        # prepare the next, smaller pyramid level before filtering
        scale *= self.scale_factor
        pyramid_im = self.processed_image(im, scale)
        level_h, level_w, _ = pyramid_im.shape

        if level_boxes.size != 0:
            # per-level non-maximum suppression on coordinates + score
            kept = py_nms(level_boxes[:, :5], 0.5, 'Union')
            candidates.append(level_boxes[kept])

    if not candidates:
        return None, None, None

    # stack all levels row-wise and merge them with a second NMS pass
    merged = np.vstack(candidates)
    merged = merged[py_nms(merged[:, 0:5], 0.7, 'Union')]
    boxes = merged[:, :5]

    # box width / height, used to turn relative offsets into pixels
    bbw = merged[:, 2] - merged[:, 0] + 1
    bbh = merged[:, 3] - merged[:, 1] + 1

    # calibrated corners, followed by the unchanged score
    calibrated = [merged[:, 0] + merged[:, 5] * bbw,
                  merged[:, 1] + merged[:, 6] * bbh,
                  merged[:, 2] + merged[:, 7] * bbw,
                  merged[:, 3] + merged[:, 8] * bbh,
                  merged[:, 4]]
    boxes_c = np.vstack(calibrated).T
    return boxes, boxes_c, None
在上文调用了类`FcnDetector`的`predict()`方法，并传入参数`im_resized`，类的代码如下：
import tensorflow as tf
import sys
sys.path.append("../")
from train_models.MTCNN_config import config
class FcnDetector(object):
    """Restore a trained network and run it on one whole image at a time.

    net_factory: callable that builds the network
    model_path:  checkpoint to restore, e.g. '.../PNet_Landmark/PNet-18'
    """

    def __init__(self, net_factory, model_path):
        # build the network in a graph of its own
        net_graph = tf.Graph()
        with net_graph.as_default():
            # input tensors; width and height are fed per image
            self.image_op = tf.placeholder(tf.float32, name='input_image')
            self.width_op = tf.placeholder(tf.int32, name='image_width')
            self.height_op = tf.placeholder(tf.int32, name='image_height')
            batched_image = tf.reshape(
                self.image_op, [1, self.height_op, self.width_op, 3])

            # inference outputs: class probabilities and bbox regression
            self.cls_prob, self.bbox_pred, _ = net_factory(batched_image, training=False)

            session_config = tf.ConfigProto(
                allow_soft_placement=True,
                gpu_options=tf.GPUOptions(allow_growth=True))
            self.sess = tf.Session(config=session_config)
            restorer = tf.train.Saver()

            # make sure a checkpoint exists in the directory part of model_path
            ckpt_dir = '/'.join(model_path.split('/')[:-1])
            ckpt_state = tf.train.get_checkpoint_state(ckpt_dir)
            print(model_path)
            has_checkpoint = ckpt_state and ckpt_state.model_checkpoint_path
            assert has_checkpoint, "the params dictionary is not valid"

            print("restore models' param")
            restorer.restore(self.sess, model_path)

    def predict(self, databatch):
        """Feed one image and return ``(cls_prob, bbox_pred)``."""
        height, width, _ = databatch.shape
        fetches = [self.cls_prob, self.bbox_pred]
        feeds = {self.image_op: databatch,
                 self.width_op: width,
                 self.height_op: height}
        cls_prob, bbox_pred = self.sess.run(fetches, feed_dict=feeds)
        return cls_prob, bbox_pred
在上文调用了类`MtcnnDetector`里面的方法`generate_bbox()`，并传入参数`cls_cls_map[:, :, 1], reg, current_scale, self.thresh[0]`，在这里将代码调出来看是如何实现的：
def generate_bbox(self, cls_map, reg, scale, threshold):
    """
        generate bbox from feature cls_map according to the threshold
    Parameters:
    ----------
        cls_map: numpy array , n x m
            detect score for each position
        reg: numpy array , n x m x 4
            bbox regression offsets for each position
        scale: float number
            scale of this detection
        threshold: float number
            detect threshold
    Returns:
    -------
        bbox array, one row per kept position:
        (x1, y1, x2, y2, score, dx1, dy1, dx2, dy2);
        an empty array when no score exceeds the threshold
    """
    stride = 2
    cellsize = 12

    # positions whose face score exceeds the threshold
    hits = np.where(cls_map > threshold)
    rows, cols = hits[0], hits[1]
    if rows.size == 0:
        # nothing detected at this scale
        return np.array([])

    # the four regression offsets of every kept position, shape (4, k)
    offsets = np.array([reg[rows, cols, k] for k in range(4)])
    # face probability of every kept position
    score = cls_map[rows, cols]

    # Map feature-map cells back to the original image: a cell starts at
    # stride * index in the scaled image and covers cellsize pixels; dividing
    # by scale undoes the pyramid resize.
    left = stride * cols
    top = stride * rows
    boundingbox = np.vstack([np.round(left / scale),
                             np.round(top / scale),
                             np.round((left + cellsize) / scale),
                             np.round((top + cellsize) / scale),
                             score,
                             offsets])

    # transpose so that every row describes one box
    return boundingbox.T
这里调用了类`MtcnnDetector`里面的方法`detect_rnet()`，并传入参数`im, boxes_c`，在这里将代码调出来看是如何实现的：
def detect_rnet(self, im, dets):
    """Get face candidates using rnet

    Parameters:
    ----------
        im: numpy array
            input image array
        dets: numpy array
            detection results of pnet
    Returns:
    -------
        boxes: numpy array
            detected boxes before calibration
        boxes_c: numpy array
            boxes after calibration
        third value: always None (no landmark at this stage)

        (None, None, None) is returned when no score exceeds the threshold.
    """
    h, w, c = im.shape

    # make every candidate box square and snap it to whole pixels
    dets = self.convert_to_square(dets)
    dets[:, 0:4] = np.round(dets[:, 0:4])

    # dx, dy / edx, edy: start / end inside the target patch
    # x, y / ex, ey:     start / end inside the source image
    [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(dets, w, h)

    num_boxes = dets.shape[0]
    cropped_ims = np.zeros((num_boxes, 24, 24, 3), dtype=np.float32)
    for i in range(num_boxes):
        # copy the visible part of the box into a zero-filled patch,
        # then resize to the 24x24 net input and normalise
        patch = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8)
        patch[dy[i]:edy[i] + 1, dx[i]:edx[i] + 1, :] = im[y[i]:ey[i] + 1, x[i]:ex[i] + 1, :]
        resized = cv2.resize(patch, (24, 24))
        cropped_ims[i, :, :, :] = (resized - 127.5) / 128

    # face classification scores and bbox regression for every crop
    cls_scores, reg, _ = self.rnet_detector.predict(cropped_ims)
    cls_scores = cls_scores[:, 1]

    # keep only the candidates whose face score exceeds the rnet threshold
    keep_inds = np.where(cls_scores > self.thresh[1])[0]
    if len(keep_inds) == 0:
        return None, None, None

    boxes = dets[keep_inds]
    boxes[:, 4] = cls_scores[keep_inds]
    reg = reg[keep_inds]

    keep = py_nms(boxes, 0.6)
    boxes = boxes[keep]
    # adjust the box coordinates with the regression output
    boxes_c = self.calibrate_box(boxes, reg[keep])
    return boxes, boxes_c, None