dlib开源库中人脸检测部分代码(在dlib_face_detection.cpp中)的流程梳理,其中涉及到图像金字塔(双线性插值),hog特征提取,svm分类,以及其中的一些trick。
整体流程:构建图像金字塔 → 对金字塔的每一级图像提取HOG特征 → 用SVM分类(判断是否存在人脸)。
在dlib_face_detection.cpp的main函数开始,
执行 frontal_face_detector detector = get_frontal_face_detector(); 调用 frontal_face_detection.h 19:
// The frontal face detector type: an object_detector whose scanner extracts HOG
// features over an image pyramid built with pyramid_down<6>.
typedef object_detector<scan_fhog_pyramid<pyramid_down<6> > > frontal_face_detector;

// Returns the already-trained model in its serialized (string) form; it has to be
// deserialized before it can be used. Only the declaration is shown in this excerpt.
inline const std::string get_serialized_frontal_faces();

/// Builds a ready-to-use frontal face detector from the embedded, pre-trained model.
///
/// @return a detector populated by deserializing get_serialized_frontal_faces().
inline frontal_face_detector get_frontal_face_detector()
{
    // Wrap the serialized model string in a stream so deserialize() can read it.
    std::istringstream sin(get_serialized_frontal_faces());
    frontal_face_detector detector;
    deserialize(detector, sin);
    return detector;
}
执行 deserialize(detector, sin); 调用 object_detector.h 240:
template
void deserialize (
object_detector& item,
std::istream& in
)
{
int version = 0;
deserialize(version, in);
if (version == 1)
{
deserialize(item.scanner, in);
item.w.resize(1);
deserialize(item.w[0].w, in);
item.w[0].init(item.scanner);
deserialize(item.boxes_overlap, in);
}
else if (version == 2)
{
deserialize(item.scanner, in);
deserialize(item.boxes_overlap, in);
unsigned long num_detectors = 0;
deserialize(num_detectors, in);
item.w.resize(num_detectors);
for (unsigned long i = 0; i < item.w.size(); ++i)
{
deserialize(item.w[i].w, in);
item.w[i].init(item.scanner);
}
}
else
{
throw serialization_error("Unexpected version encountered while deserializing a dlib::object_detector object.");
}
}
执行253 deserialize(item.scanner, in); 调用 scan_fhog_pyramid.h 495:
template
void deserialize (
scan_fhog_pyramid& item,
std::istream& in
)
{
int version = 0;
deserialize(version, in);
if (version != 1)
throw serialization_error("Unsupported version found when deserializing a scan_fhog_pyramid object.");
deserialize(item.fe, in);
deserialize(item.feats, in);
deserialize(item.cell_size, in);
deserialize(item.padding, in);
deserialize(item.window_width, in);
deserialize(item.window_height, in);
deserialize(item.max_pyramid_levels, in);
deserialize(item.min_pyramid_layer_width, in);
deserialize(item.min_pyramid_layer_height, in);
deserialize(item.nuclear_norm_regularization_strength, in);
// When developing some feature extractor, it's easy to accidentally change its
// number of dimensions and then try to deserialize data from an older version of
// your extractor into the current code. This check is here to catch that kind of
// user error.
long dims;
deserialize(dims, in);
if (item.get_num_dimensions() != dims)
throw serialization_error("Number of dimensions in serialized scan_fhog_pyramid doesn't match the expected number.");
}
基本参数,fe, feats,cell_size,padding, window_width, window_height等
执行254 deserialize(item.boxes_overlap, in); 调用 box_overlap_testing.h 136
/// Restores a test_box_overlap from a stream.
///
/// @param item  receives a test_box_overlap built from the two stored thresholds.
/// @param in    stream holding the IoU threshold followed by the percent-covered
///              threshold.
inline void deserialize (
    test_box_overlap& item,
    std::istream& in
)
{
    // The stream order is fixed: IoU threshold first, percent-covered threshold second.
    double iou_limit;
    deserialize(iou_limit, in);

    double covered_limit;
    deserialize(covered_limit, in);

    item = test_box_overlap(iou_limit, covered_limit);
}
num_detectors = 5 //front, left,right, front left rotated, front right rotated人脸的五面
// Excerpt repeated from the version 2 branch of the object_detector deserialize:
// for every detector, read its weight vector from the stream and then call its
// init() with the shared scanner.
for (unsigned long i = 0; i < item.w.size(); ++i)
{
deserialize(item.w[i].w, in);
item.w[i].init(item.scanner);
}
对w进行赋值, w的size = 5
//初始化部分结束
然后加载图像:
load_image(img,argv[i]);
做图像金字塔:
pyramid_up(img);
在图像金字塔中首先将图像扩大至原来的二倍,再把图像尺寸变为上一张图像的5/6,直到图像尺寸小于64*64。
调用 interpolation.h 1602
// Body of pyramid_up; the function header is not part of this excerpt.
// An empty input image yields an empty output image.
if (image_size(in_img) == 0)
{
set_image_size(out_img, 0, 0);
return;
}
// Map the input's bounding rectangle one pyramid level up to find out how large
// the output has to be.
rectangle rect = get_rect(in_img);
rectangle uprect = pyr.rect_up(rect);
// If the upsampled rectangle is empty there is nothing to produce.
if (uprect.is_empty())
{
set_image_size(out_img, 0, 0);
return;
}
// Size the output from uprect's bottom/right edges, then resample the input
// into it with the supplied interpolator.
set_image_size(out_img, uprect.bottom()+1, uprect.right()+1);
resize_image(in_img, out_img, interp);
// end of enlarging the image to twice its original size
然后进行人脸检测
std::vector dets = detector(img);
调用object_detector.h 505
template <
typename image_scanner_type
>
template <
typename image_type
>
std::vector object_detector::
operator() (
const image_type& img,
double adjust_threshold
)
{
std::vector dets;
(*this)(img,dets,adjust_threshold);
std::vector final_dets(dets.size());
for (unsigned long i = 0; i < dets.size(); ++i)
final_dets[i] = dets[i].rect;
return final_dets;
}
执行507 (*this)(img,dets,adjust_threshold); 调用 object_detector.h 432:
/// Runs every weight vector over img and writes the surviving detections to final_dets.
///
/// Steps: load the image into the scanner, run detection once per weight vector,
/// pool all hits, then keep a hit only if it does not overlap one already kept.
///
/// @param img               image to search.
/// @param final_dets        cleared, then filled with the kept detections.
/// @param adjust_threshold  added to each weight vector's own threshold before
///                          detection.
template <
    typename image_scanner_type
    >
template <
    typename image_type
    >
void object_detector<image_scanner_type>::
operator() (
    const image_type& img,
    std::vector<rect_detection>& final_dets,
    double adjust_threshold
)
{
    // After this call the image pyramid has been built; per the author's notes each
    // level is 5/6 the size of the previous one.
    scanner.load(img);

    std::vector<std::pair<double, rectangle> > dets;
    std::vector<rect_detection> dets_accum;
    for (unsigned long i = 0; i < w.size(); ++i)
    {
        // The detection threshold is stored in the weight vector at index
        // get_num_dimensions(), i.e. right after the feature weights.
        const double thresh = w[i].w(scanner.get_num_dimensions());
        scanner.detect(w[i].get_detect_argument(), dets, thresh + adjust_threshold);
        for (unsigned long j = 0; j < dets.size(); ++j)
        {
            rect_detection temp;
            // Confidence is the raw score measured relative to the unadjusted threshold.
            temp.detection_confidence = dets[j].first-thresh;
            temp.weight_index = i;
            temp.rect = dets[j].second;
            dets_accum.push_back(temp);
        }
    }

    // Do non-max suppression
    final_dets.clear();
    // Sorting (descending, via reverse iterators) is only done when the hits of
    // several weight vectors were pooled together.
    if (w.size() > 1)
        std::sort(dets_accum.rbegin(), dets_accum.rend());
    for (unsigned long i = 0; i < dets_accum.size(); ++i)
    {
        if (overlaps_any_box(final_dets, dets_accum[i].rect))
            continue;
        final_dets.push_back(dets_accum[i]);
    }
}
执行433 scanner.load(img); 调用 scan_fhog_pyramid.h 630:
/// Loads img into the scanner by building its HOG feature pyramid into feats.
///
/// @param img  image to build the feature pyramid from.
template <
    typename Pyramid_type,
    typename feature_extractor_type
    >
template <
    typename image_type
    >
void scan_fhog_pyramid<Pyramid_type,feature_extractor_type>::
load (
    const image_type& img
)
{
    unsigned long width, height;
    compute_fhog_window_size(width,height); // 10*10 in the author's run

    // The window size in cells is passed as the filter rows/cols padding of the
    // feature extraction.
    impl::create_fhog_pyramid<Pyramid_type>(img, fe, feats, cell_size, height,
        width, min_pyramid_layer_width, min_pyramid_layer_height,
        max_pyramid_levels);
}
执行632 compute_fhog_window_size(width,height); 调用scan_fhog_pyramid.h 355
inline void compute_fhog_window_size(
unsigned long& width,
unsigned long& height
) const
{
const rectangle rect = centered_rect(point(0,0),window_width,window_height); //得到中心点为point的矩形
const rectangle temp = grow_rect(fe.image_to_feats(rect, cell_size, 1, 1), padding);
width = temp.width();
height = temp.height();
}
执行357 调用scan_fhog_pyramid.h 28
// Converts a rectangle from image coordinates to feature-map coordinates.
// This extractor forwards all arguments to image_to_fhog() and returns its
// result unchanged.
inline rectangle image_to_feats (
const rectangle& rect,
int cell_size,
int filter_rows_padding,
int filter_cols_padding
) const
{
return image_to_fhog(rect, cell_size, filter_rows_padding, filter_cols_padding);
}
执行28 调用fhog.h
/// Maps a rectangle from image coordinates to HOG feature-map coordinates by
/// converting its top-left and bottom-right corners with the point overload.
///
/// @param rect                 rectangle in image coordinates.
/// @param cell_size            must be > 0.
/// @param filter_rows_padding  must be > 0.
/// @param filter_cols_padding  must be > 0.
/// @return the rectangle spanned by the two converted corners.
inline rectangle image_to_fhog (
    const rectangle& rect,
    int cell_size = 8,
    int filter_rows_padding = 1,
    int filter_cols_padding = 1
)
{
    // make sure requires clause is not broken
    DLIB_ASSERT( cell_size > 0 &&
        filter_rows_padding > 0 &&
        filter_cols_padding > 0 ,
        "\t rectangle image_to_fhog()"
        << "\n\t Invalid inputs were given to this function. "
        << "\n\t cell_size: " << cell_size
        << "\n\t filter_rows_padding: " << filter_rows_padding
        << "\n\t filter_cols_padding: " << filter_cols_padding
        );

    const point top_left = image_to_fhog(rect.tl_corner(),cell_size,filter_rows_padding,filter_cols_padding);
    const point bottom_right = image_to_fhog(rect.br_corner(),cell_size,filter_rows_padding,filter_cols_padding);
    return rectangle(top_left, bottom_right);
}
调用1168
/// Maps a point from image coordinates to HOG feature-map coordinates.
///
/// @param p                    point in image coordinates.
/// @param cell_size            must be > 0.
/// @param filter_rows_padding  must be > 0.
/// @param filter_cols_padding  must be > 0.
/// @return the corresponding point in the feature map.
inline point image_to_fhog (
    point p,
    int cell_size = 8,
    int filter_rows_padding = 1,
    int filter_cols_padding = 1
)
{
    // make sure requires clause is not broken
    DLIB_ASSERT( cell_size > 0 &&
        filter_rows_padding > 0 &&
        filter_cols_padding > 0 ,
        "\t point image_to_fhog()"
        << "\n\t Invalid inputs were given to this function. "
        << "\n\t cell_size: " << cell_size
        << "\n\t filter_rows_padding: " << filter_rows_padding
        << "\n\t filter_cols_padding: " << filter_cols_padding
        );

    // There is a one pixel border around the image, so remove it first.
    const point without_border = p - point(1,1);
    // Shift contributed by the filter padding (x from the cols padding, y from the rows padding).
    const point padding_shift((filter_cols_padding-1)/2,(filter_rows_padding-1)/2);
    // Convert pixels to cells; there is also a 1 "cell" border around the HOG image formation.
    return without_border/cell_size - point(1,1) + padding_shift;
}
//image_to_fhog结束
执行scan_fhog_pyramid.h
const rectangle temp = grow_rect(fe.image_to_feats(rect, cell_size, 1, 1), padding);
调用
// Returns rect grown by num on every side; implemented as shrinking by -num.
inline const rectangle grow_rect (
const rectangle& rect,
long num
)
{
return shrink_rect(rect, -num);
}
/// Returns rect with every edge moved inward by num (a negative num moves the
/// edges outward).
///
/// @param rect  rectangle to shrink.
/// @param num   amount to move each edge by.
inline const rectangle shrink_rect (
    const rectangle& rect,
    long num
)
{
    // Left/top edges move in the positive direction, right/bottom in the negative one.
    const long new_left = rect.left()+num;
    const long new_top = rect.top()+num;
    const long new_right = rect.right()-num;
    const long new_bottom = rect.bottom()-num;
    return rectangle(new_left, new_top, new_right, new_bottom);
}
//结束compute_fhog_window_size(width,height);
调用scan_fhog_pyramid.h 574:
namespace impl
{
template <
typename pyramid_type,
typename image_type,
typename feature_extractor_type
>
void create_fhog_pyramid (
const image_type& img,
const feature_extractor_type& fe,
array > >& feats,
int cell_size,
int filter_rows_padding,
int filter_cols_padding,
unsigned long min_pyramid_layer_width,
unsigned long min_pyramid_layer_height,
unsigned long max_pyramid_levels
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
//生成金字塔图像, levels表示多少级, 图像最小是64*64
// figure out how many pyramid levels we should be using based on the image size
pyramid_type pyr;
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width && rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels); //我的输入图像是1024*1024, feats_size = 20
// build our feature pyramid
fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding);
DLIB_ASSERT(feats[0].size() == fe.get_num_planes(),
"Invalid feature extractor used with dlib::scan_fhog_pyramid. The output does not have the \n"
"indicated number of planes.");
if (feats.size() > 1)
{
typedef typename image_traits::pixel_type pixel_type;
array2d temp1, temp2;
pyr(img, temp1);
fe(temp1, feats[1], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);
fe(temp1, feats[i], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
}
}
}
}
执行 fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding); 调用scan_fhog_pyramid.h 51
template <
typename image_type
>
void operator()(
const image_type& img,
dlib::array >& hog,
int cell_size,
int filter_rows_padding,
int filter_cols_padding
) const
{
extract_fhog_features(img,hog,cell_size,filter_rows_padding,filter_cols_padding);
}
extract_fhog_features 调用 fhog.h中
template <
typename image_type,
typename T,
typename mm1,
typename mm2
>
void extract_fhog_features(
const image_type& img,
dlib::array,mm2>& hog,
int cell_size = 8,
int filter_rows_padding = 1,
int filter_cols_padding = 1
)
{
impl_fhog::impl_extract_fhog_features(img, hog, cell_size, filter_rows_padding, filter_cols_padding);
// If the image is too small then the above function outputs an empty feature map.
// But to make things very uniform in usage we require the output to still have the
// 31 planes (but they are just empty).
if (hog.size() == 0)
hog.resize(31);
}
调用fhog.h 709
提取HOG特征向量。在计算cell内每个像素的合梯度属于9个bin中的哪一个时,将x方向梯度、y方向梯度分别与9个方向单位向量的x、y分量相乘并求和(即做点积),投影(点积的绝对值)最大的方向就是该像素所属的bin。这种方法比计算方向角(反正切值)速度快,而且可以定点化。
另外,在计算属于哪个bin时,采用了插值的思想(回头补上)。HOG特征为31维:18个对比度敏感(0°~360°)的方向bin + 9个对比度不敏感(0°~180°)的方向bin + 4个纹理特征(分别对应4个邻域块归一化后的特征向量之和)。归一化时的截断系数为0.2。
//结束 impl_fhog::impl_extract_fhog_features
构建特征金字塔:
执行 pyr(img, temp1);
//create_fhog_pyramid结束
//结束 scanner.load
执行 const double thresh = w[i].w(scanner.get_num_dimensions()); 调用scan_fhog_pyramid.h 708
template <
typename Pyramid_type,
typename feature_extractor_type
>
long scan_fhog_pyramid::
get_num_dimensions (
) const
{
unsigned long width, height;
compute_fhog_window_size(width,height);
return width*height*fe.get_num_planes(); //width = 10, height = 10, fe.get_num_planes() = 31
}
执行 const double thresh = w[i].w(scanner.get_num_dimensions()); //scanner.get_num_dimensions()=3100=10*10*31
执行 scanner.detect(w[i].get_detect_argument(), dets, thresh + adjust_threshold); 调用scan_fhog_pyramid.h 1209, 839:
template <
typename Pyramid_type,
typename feature_extractor_type
>
struct processed_weight_vector >
{
processed_weight_vector(){}
typedef matrix feature_vector_type;
typedef typename scan_fhog_pyramid::fhog_filterbank fhog_filterbank;
void init (
const scan_fhog_pyramid& scanner
)
{
fb = scanner.build_fhog_filterbank(w);
}
const fhog_filterbank& get_detect_argument() const { return fb; }
feature_vector_type w;
fhog_filterbank fb;
};
template <
typename Pyramid_type,
typename feature_extractor_type
>
void scan_fhog_pyramid::
detect (
const fhog_filterbank& w,
std::vector >& dets,
const double thresh
) const
{
// make sure requires clause is not broken
DLIB_ASSERT(is_loaded_with_image() &&
w.get_num_dimensions() == get_num_dimensions(),
"\t void scan_fhog_pyramid::detect()"
<< "\n\t Invalid inputs were given to this function "
<< "\n\t is_loaded_with_image(): " << is_loaded_with_image()
<< "\n\t w.get_num_dimensions(): " << w.get_num_dimensions()
<< "\n\t get_num_dimensions(): " << get_num_dimensions()
<< "\n\t this: " << this
);
unsigned long width, height;
compute_fhog_window_size(width,height);
impl::detect_from_fhog_pyramid(feats, fe, w, thresh,
height-2*padding, width-2*padding, cell_size, height, width, dets);
}
执行
impl::detect_from_fhog_pyramid(feats, fe, w, thresh,
height-2*padding, width-2*padding, cell_size, height, width, dets);
调用scan_fhog_pyramid.h 778:
template <
typename pyramid_type,
typename feature_extractor_type,
typename fhog_filterbank
>
void detect_from_fhog_pyramid (
const array > >& feats,
const feature_extractor_type& fe,
const fhog_filterbank& w,
const double thresh,
const unsigned long det_box_height,
const unsigned long det_box_width,
const int cell_size,
const int filter_rows_padding,
const int filter_cols_padding,
std::vector >& dets
)
{
dets.clear(); //检测器初始化
array2d saliency_image; //显著图
pyramid_type pyr;
// for all pyramid levels 遍历所有金字塔图像
for (unsigned long l = 0; l < feats.size(); ++l)
{
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image); //得到显著图,区域
// now search the saliency image for any detections
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)
{ //该区域位置
rectangle rect = fe.feats_to_image(centered_rect(point(c,r),det_box_width,det_box_height),
cell_size, filter_rows_padding, filter_cols_padding);
rect = pyr.rect_up(rect, l);//图像放大到原图
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect); //根据分数排序
}
执行 const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image); 调用 scan_fhog_pyramid.h 421:
namespace impl
{
template
rectangle apply_filters_to_fhog (
const fhog_filterbank& w,
const array >& feats,
array2d& saliency_image
)
{
const unsigned long num_separable_filters = w.num_separable_filters(); //61
rectangle area;
// use the separable filters(row filter and col filter)if they would be faster than running the regular filters(二维filter).
if (num_separable_filters > w.filters.size()*std::min(w.filters[0].nr(),w.filters[0].nc())/3.0)
{
area = spatially_filter_image(feats[0], saliency_image, w.filters[0]);
for (unsigned long i = 1; i < w.filters.size(); ++i)
{
// now we filter but the output adds to saliency_image rather than
// overwriting it.
spatially_filter_image(feats[i], saliency_image, w.filters[i], 1, false, true);
}
}
else
{
saliency_image.clear();
array2d scratch;
// find the first filter to apply
unsigned long i = 0;
while (i < w.row_filters.size() && w.row_filters[i].size() == 0)
++i;
for (; i < w.row_filters.size(); ++i)
{
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
{
if (saliency_image.size() == 0)
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false);
else
area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,true);
}//横向卷积,纵向卷积比二维卷积速度快
}
if (saliency_image.size() == 0)
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
}
}
return area;
}
}
FILT(r,c) == col_filter(r)*row_filter(c) //二维卷积和行、列卷积间的关系
执行 const unsigned long num_separable_filters = w.num_separable_filters(); 调用 scan_fhog_pyramid.h 223:
/// Returns the total number of separable filters, i.e. the sum of
/// row_filters[i].size() over every entry of row_filters.
unsigned long num_separable_filters() const
{
    unsigned long total = 0;
    unsigned long plane = 0;
    while (plane < row_filters.size())
    {
        total += row_filters[plane].size();
        ++plane;
    }
    return total; // 61 in the author's run
}
执行 area = float_spatially_filter_image_separable(feats[i], saliency_image, w.row_filters[i][j], w.col_filters[i][j],scratch,false); 调用 spatial_filtering.h 571:
// Filters in_img_ with a separable filter in two passes: a horizontal pass with
// _row_filter whose result goes into scratch_, then a vertical pass with
// _col_filter over scratch_ whose result goes into out_img_.
//
// Parameters:
//   in_img_      - image to filter; must not be the same object as out_img_.
//   out_img_     - receives the result; it is set to the size of in_img_.
//   _row_filter  - non-empty vector applied along each row.
//   _col_filter  - non-empty vector applied along each column.
//   scratch_     - work buffer, set to the size of in_img_, holding the row pass output.
//   add_to       - when false the filtered value overwrites out_img_ and the border
//                  pixels are zeroed; when true the filtered value is added to the
//                  value already in out_img_.
//
// Returns the rectangle of pixels that were filtered (non_border), or a
// default-constructed rectangle when in_img_ has no pixels.
template <
typename in_image_type,
typename out_image_type,
typename EXP1,
typename EXP2
>
rectangle float_spatially_filter_image_separable (
const in_image_type& in_img_,
out_image_type& out_img_,
const matrix_exp& _row_filter,
const matrix_exp& _col_filter,
out_image_type& scratch_,
bool add_to = false
)
{
// You can only use this function with images and filters containing float
// variables.
COMPILE_TIME_ASSERT((is_float_filtering::value == true));
const_temp_matrix row_filter(_row_filter);
const_temp_matrix col_filter(_col_filter);
DLIB_ASSERT(row_filter.size() != 0 && col_filter.size() != 0 &&
is_vector(row_filter) &&
is_vector(col_filter),
"\trectangle float_spatially_filter_image_separable()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t row_filter.size(): "<< row_filter.size()
<< "\n\t col_filter.size(): "<< col_filter.size()
<< "\n\t is_vector(row_filter): "<< is_vector(row_filter)
<< "\n\t is_vector(col_filter): "<< is_vector(col_filter)
);
DLIB_ASSERT(is_same_object(in_img_, out_img_) == false,
"\trectangle float_spatially_filter_image_separable()"
<< "\n\tYou must give two different image objects"
);
const_image_view in_img(in_img_);
image_view out_img(out_img_);
// if there isn't any input image then don't do anything
if (in_img.size() == 0)
{
out_img.clear();
return rectangle();
}
// out_img holds the result
out_img.set_size(in_img.nr(),in_img.nc());
// the region the filter is applied to
// figure out the range that we should apply the filter to
const long first_row = col_filter.size()/2;
const long first_col = row_filter.size()/2;
const long last_row = in_img.nr() - ((col_filter.size()-1)/2);
const long last_col = in_img.nc() - ((row_filter.size()-1)/2);
const rectangle non_border = rectangle(first_col, first_row, last_col-1, last_row-1);
// When overwriting, the pixels outside non_border are set to zero.
if (!add_to)
zero_border_pixels(out_img, non_border);
image_view scratch(scratch_);
scratch.set_size(in_img.nr(), in_img.nc());
// apply the row filter (horizontal pass); every row is processed because the
// column pass below reads rows above and below each output row
for (long r = 0; r < in_img.nr(); ++r)
{
long c = first_col;
// SIMD path: handle 8 adjacent output columns per iteration.
for (; c < last_col-7; c+=8)
{
simd8f p,p2,p3, temp = 0, temp2=0, temp3=0;
long n = 0;
// Three filter taps per iteration, each summed into its own accumulator.
for (; n < row_filter.size()-2; n+=3)
{
// pull out the current pixel and put it into p
p.load(&in_img[r][c-first_col+n]);
p2.load(&in_img[r][c-first_col+n+1]);
p3.load(&in_img[r][c-first_col+n+2]);
temp += p*row_filter(n);
temp2 += p2*row_filter(n+1);
temp3 += p3*row_filter(n+2);
}
// Remaining taps when the filter length is not a multiple of 3.
for (; n < row_filter.size(); ++n)
{
// pull out the current pixel and put it into p
p.load(&in_img[r][c-first_col+n]);
temp += p*row_filter(n);
}
temp += temp2 + temp3;
temp.store(&scratch[r][c]);
}
// Scalar path for the columns left over after the 8-wide loop.
for (; c < last_col; ++c)
{
float p;
float temp = 0;
for (long n = 0; n < row_filter.size(); ++n)
{
// pull out the current pixel and put it into p
p = in_img[r][c-first_col+n];
temp += p*row_filter(n);
}
scratch[r][c] = temp;
}
}
// apply the column filter (vertical pass) to the row pass output in scratch
for (long r = first_row; r < last_row; ++r)
{
long c = first_col;
// SIMD path: handle 8 adjacent output columns per iteration.
for (; c < last_col-7; c+=8)
{
simd8f p, p2, p3, temp = 0, temp2 = 0, temp3 = 0;
long m = 0;
// Three filter taps per iteration, each summed into its own accumulator.
for (; m < col_filter.size()-2; m+=3)
{
p.load(&scratch[r-first_row+m][c]);
p2.load(&scratch[r-first_row+m+1][c]);
p3.load(&scratch[r-first_row+m+2][c]);
temp += p*col_filter(m);
temp2 += p2*col_filter(m+1);
temp3 += p3*col_filter(m+2);
}
// Remaining taps when the filter length is not a multiple of 3.
for (; m < col_filter.size(); ++m)
{
p.load(&scratch[r-first_row+m][c]);
temp += p*col_filter(m);
}
temp += temp2+temp3;
// save this pixel to the output image
if (add_to == false)
{
temp.store(&out_img[r][c]);
}
else
{
// Accumulate onto the values already stored in out_img.
p.load(&out_img[r][c]);
temp += p;
temp.store(&out_img[r][c]);
}
}
// Scalar path for the columns left over after the 8-wide loop.
for (; c < last_col; ++c)
{
float temp = 0;
for (long m = 0; m < col_filter.size(); ++m)
{
temp += scratch[r-first_row+m][c]*col_filter(m);
}
// save this pixel to the output image
if (add_to == false)
{
out_img[r][c] = temp;
}
else
{
out_img[r][c] += temp;
}
}
}
return non_border;
}