一般的人脸识别应用通常都包括四个过程:
1 人脸detect,这一步主要是定位人脸在图像中的位置,输出人脸的位置矩形框
2 人脸shape predictor,这一步主要是找出眼睛眉毛鼻子嘴巴的68个定位点
3 人脸对齐alignment,这一步主要是通过投影几何变换出一张标准脸
4 人脸识别,这一步主要在对齐的人脸图像上提取128维的特征向量,根据特征向量间的距离来进行判断识别。
本篇文章主要想解读一下dlib中关于人脸对齐的源码。人脸对齐主要是一个affine transform,即仿射变换。我们在detect到人脸后会得到一个矩形位置框,需要将这个矩形里面的人脸变换到150*150大小的标准人脸,英文叫做chip。首先看一下一个结构体chip_details,它包含变换的一些基本信息:
struct chip_details
{
chip_details() : angle(0), rows(0), cols(0) {}
chip_details(const rectangle& rect_) : rect(rect_),angle(0), rows(rect_.height()), cols(rect_.width()) {}
chip_details(const drectangle& rect_) : rect(rect_),angle(0),
rows((unsigned long)(rect_.height()+0.5)), cols((unsigned long)(rect_.width()+0.5)) {}
chip_details(const drectangle& rect_, unsigned long size) : rect(rect_),angle(0)
{ compute_dims_from_size(size); }
chip_details(const drectangle& rect_, unsigned long size, double angle_) : rect(rect_),angle(angle_)
{ compute_dims_from_size(size); }
chip_details(const drectangle& rect_, const chip_dims& dims) :
rect(rect_),angle(0),rows(dims.rows), cols(dims.cols) {}
chip_details(const drectangle& rect_, const chip_dims& dims, double angle_) :
rect(rect_),angle(angle_),rows(dims.rows), cols(dims.cols) {}
template
chip_details(
const std::vector >& chip_points,
const std::vector >& img_points,
const chip_dims& dims
) :
rows(dims.rows), cols(dims.cols)
{
DLIB_CASSERT( chip_points.size() == img_points.size() && chip_points.size() >= 2,
"\t chip_details::chip_details(chip_points,img_points,dims)"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t chip_points.size(): " << chip_points.size()
<< "\n\t img_points.size(): " << img_points.size()
);
const point_transform_affine tform = find_similarity_transform(chip_points,img_points);
dlib::vector p(1,0);
p = tform.get_m()*p;
// There are only 3 things happening in a similarity transform. There is a
// rescaling, a rotation, and a translation. So here we pick out the scale and
// rotation parameters.
angle = std::atan2(p.y(),p.x());
// Note that the translation and scale part are represented by the extraction
// rectangle. So here we build the appropriate rectangle.
const double scale = length(p);
rect = centered_drect(tform(point(dims.cols,dims.rows)/2.0),
dims.cols*scale,
dims.rows*scale);
}
drectangle rect;//chip在原图中的位置大小
double angle;//chip和原图目标间的角度
unsigned long rows; //chip的行列大小
unsigned long cols;
inline unsigned long size() const
{
return rows*cols;
}
private:
void compute_dims_from_size (
unsigned long size
)
{
const double relative_size = std::sqrt(size/(double)rect.area());
rows = static_cast(rect.height()*relative_size + 0.5);
cols = static_cast(size/(double)rows + 0.5);
rows = std::max(1ul,rows);
cols = std::max(1ul,cols);
}
};
这个结构体的构造函数,主要传入chip上的特征点和实际人脸的特征点,找到一个相似变换,然后提取角度,平移,缩放三个方面的信息,角度放在angle,平移和缩放放在rect
template <typename T>
chip_details(
const std::vector<dlib::vector<T,2> >& chip_points,
const std::vector<dlib::vector<T,2> >& img_points,
const chip_dims& dims
)
需要注意的是,这里的变换是从chip到img的(img指原图),意思是给定chip上的一个点,可以找到img上面的对应点。
// Computes the chip_details needed to extract an aligned, size x size face
// chip from a 68-landmark detection.
//   det     - detection; must contain exactly 68 parts
//   size    - side length of the square output chip; must be > 0
//   padding - border added around the mean face shape, as a fraction of the
//             face size; must be >= 0
// Landmarks 17-67 are matched against a table of mean landmark positions;
// the eyebrows (17-26) and the lower lip (55-59, 65-67) are skipped.
inline chip_details get_face_chip_details (
    const full_object_detection& det,
    const unsigned long size = 200,
    const double padding = 0.2
)
{
    DLIB_CASSERT(det.num_parts() == 68,
        "\t chip_details get_face_chip_details()"
        << "\n\t You must give a detection with exactly 68 parts in it."
        << "\n\t det.num_parts(): " << det.num_parts()
    );
    DLIB_CASSERT(padding >= 0 && size > 0,
        "\t chip_details get_face_chip_details()"
        << "\n\t Invalid inputs were given to this function."
        << "\n\t padding: " << padding
        << "\n\t size: " << size
    );
    // Average positions of face points 17-67
    const double mean_face_shape_x[] = {
        0.000213256, 0.0752622, 0.18113, 0.29077, 0.393397, 0.586856, 0.689483, 0.799124,
        0.904991, 0.98004, 0.490127, 0.490127, 0.490127, 0.490127, 0.36688, 0.426036,
        0.490127, 0.554217, 0.613373, 0.121737, 0.187122, 0.265825, 0.334606, 0.260918,
        0.182743, 0.645647, 0.714428, 0.793132, 0.858516, 0.79751, 0.719335, 0.254149,
        0.340985, 0.428858, 0.490127, 0.551395, 0.639268, 0.726104, 0.642159, 0.556721,
        0.490127, 0.423532, 0.338094, 0.290379, 0.428096, 0.490127, 0.552157, 0.689874,
        0.553364, 0.490127, 0.42689
    };
    const double mean_face_shape_y[] = {
        0.106454, 0.038915, 0.0187482, 0.0344891, 0.0773906, 0.0773906, 0.0344891,
        0.0187482, 0.038915, 0.106454, 0.203352, 0.307009, 0.409805, 0.515625, 0.587326,
        0.609345, 0.628106, 0.609345, 0.587326, 0.216423, 0.178758, 0.179852, 0.231733,
        0.245099, 0.244077, 0.231733, 0.179852, 0.178758, 0.216423, 0.244077, 0.245099,
        0.780233, 0.745405, 0.727388, 0.742578, 0.727388, 0.745405, 0.780233, 0.864805,
        0.902192, 0.909281, 0.902192, 0.864805, 0.784792, 0.778746, 0.785343, 0.778746,
        0.784792, 0.824182, 0.831803, 0.824182
    };
    // Both tables are indexed with i-17 below, so each needs one entry per
    // landmark in the range 17-67.
    COMPILE_TIME_ASSERT(sizeof(mean_face_shape_x)/sizeof(double) == 68-17);
    COMPILE_TIME_ASSERT(sizeof(mean_face_shape_y)/sizeof(double) == 68-17);

    // from_points are locations in the chip, to_points the matching
    // landmarks in the source image.
    std::vector<dlib::vector<double,2> > from_points, to_points;
    for (unsigned long i = 17; i < det.num_parts(); ++i)
    {
        // Ignore the lower lip
        if ((55 <= i && i <= 59) || (65 <= i && i <= 67))
            continue;
        // Ignore the eyebrows
        if (17 <= i && i <= 26)
            continue;
        // Shift by the padding and renormalize so the padded shape still
        // spans [0,1], then scale up to chip pixel coordinates.
        dlib::vector<double,2> p;
        p.x() = (padding+mean_face_shape_x[i-17])/(2*padding+1);
        p.y() = (padding+mean_face_shape_y[i-17])/(2*padding+1);
        from_points.push_back(p*size);
        to_points.push_back(det.part(i));
    }
    return chip_details(from_points, to_points, chip_dims(size,size));
}
这个函数首先定义标准脸特征点的位置,再构造出chip_details结构体。
再接下来就是提取人脸图片了:
template <
typename image_type1,
typename image_type2
>
void extract_image_chips (
const image_type1& img,
const std::vector& chip_locations,
dlib::array& chips
)
{
// make sure requires clause is not broken
#ifdef ENABLE_ASSERTS
for (unsigned long i = 0; i < chip_locations.size(); ++i)
{
DLIB_CASSERT(chip_locations[i].size() != 0 &&
chip_locations[i].rect.is_empty() == false,
"\t void extract_image_chips()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t chip_locations["< pyr;
long max_depth = 0;
// If the chip is supposed to be much smaller than the source subwindow then you
// can't just extract it using bilinear interpolation since at a high enough
// downsampling amount it would effectively turn into nearest neighbor
// interpolation. So we use an image pyramid to make sure the interpolation is
// fast but also high quality. The first thing we do is figure out how deep the
// image pyramid needs to be.
rectangle bounding_box;
for (unsigned long i = 0; i < chip_locations.size(); ++i)
{
long depth = 0;
double grow = 2;
drectangle rect = pyr.rect_down(chip_locations[i].rect);
while (rect.area() > chip_locations[i].size())
{
rect = pyr.rect_down(rect);
++depth;
// We drop the image size by a factor of 2 each iteration and then assume a
// border of 2 pixels is needed to avoid any border effects of the crop.
grow = grow*2 + 2;
}
drectangle rot_rect;
const vector cent = center(chip_locations[i].rect);
rot_rect += rotate_point(cent,chip_locations[i].rect.tl_corner(),chip_locations[i].angle);
rot_rect += rotate_point(cent,chip_locations[i].rect.tr_corner(),chip_locations[i].angle);
rot_rect += rotate_point(cent,chip_locations[i].rect.bl_corner(),chip_locations[i].angle);
rot_rect += rotate_point(cent,chip_locations[i].rect.br_corner(),chip_locations[i].angle);
bounding_box += grow_rect(rot_rect, grow).intersect(get_rect(img));
max_depth = std::max(depth,max_depth);
}
//std::cout << "max_depth: " << max_depth << std::endl;
//std::cout << "crop amount: " << bounding_box.area()/(double)get_rect(img).area() << std::endl;
// now make an image pyramid
dlib::array::pixel_type> > levels(max_depth);
if (levels.size() != 0)
pyr(sub_image(img,bounding_box),levels[0]);
for (unsigned long i = 1; i < levels.size(); ++i)
pyr(levels[i-1],levels[i]);
std::vector > from, to;
// now pull out the chips
chips.resize(chip_locations.size());
for (unsigned long i = 0; i < chips.size(); ++i)
{
// If the chip doesn't have any rotation or scaling then use the basic version
// of chip extraction that just does a fast copy.
if (chip_locations[i].angle == 0 &&
chip_locations[i].rows == chip_locations[i].rect.height() &&
chip_locations[i].cols == chip_locations[i].rect.width())
{
impl::basic_extract_image_chip(img, chip_locations[i].rect, chips[i]);
}
else
{
set_image_size(chips[i], chip_locations[i].rows, chip_locations[i].cols);
// figure out which level in the pyramid to use to extract the chip
int level = -1;
drectangle rect = translate_rect(chip_locations[i].rect, -bounding_box.tl_corner());
while (pyr.rect_down(rect).area() > chip_locations[i].size())
{
++level;
rect = pyr.rect_down(rect);
}
// find the appropriate transformation that maps from the chip to the input
// image
from.clear();
to.clear();
from.push_back(get_rect(chips[i]).tl_corner()); to.push_back(rotate_point(center(rect),rect.tl_corner(),chip_locations[i].angle));
from.push_back(get_rect(chips[i]).tr_corner()); to.push_back(rotate_point(center(rect),rect.tr_corner(),chip_locations[i].angle));
from.push_back(get_rect(chips[i]).bl_corner()); to.push_back(rotate_point(center(rect),rect.bl_corner(),chip_locations[i].angle));
point_transform_affine trns = find_affine_transform(from,to);
// now extract the actual chip
if (level == -1)
transform_image(sub_image(img,bounding_box),chips[i],interpolate_bilinear(),trns);
else
transform_image(levels[level],chips[i],interpolate_bilinear(),trns);
}
}
}
上面函数中,首先需要构造一个图像金字塔用于图像缩放。因为如果我们的chip是150*150的,而它映射到原图上的区域是1000*1000,就需要有一个缩放的过程;如果直接从1000*1000缩放到150*150,得到的图像质量不好,因此采用一级级下采样来缩放。函数中首先寻找对应的目标下采样深度,进行图像下采样,接着通过find_affine_transform计算出仿射变换矩阵,得到矩阵后直接调用transform_image就好了。transform_image里面就不分析了,基本就是逐个像素做位置变换并填充。