dlib is a cross-platform, general-purpose C++ framework with a very rich feature set covering machine learning, image processing, data compression, and more. In its Image Processing module, dlib implements a multi-scale object detection method based on Histogram of Oriented Gradients (HOG) features and an image pyramid. Among non-deep-learning approaches its detection accuracy is high, while still running comfortably in real time.
This article uses face detection as an example to walk through this part of the dlib source code.
Obtaining the pre-trained detector model
namespace dlib
{ // the 6 controls the pyramid downscaling step: each level is (6-1)/6 = 5/6 the size of the previous one
typedef object_detector<scan_fhog_pyramid<pyramid_down<6> > > frontal_face_detector;
// Note: dlib encodes the trained model file as a string inside frontal_face_detector.h;
// this function decodes that string to recover the model contents
inline const std::string get_serialized_frontal_faces();
inline frontal_face_detector get_frontal_face_detector();
}
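Before diving into the internals, here is a minimal usage sketch of the public API above (the image path is a placeholder); the call detector(img) is exactly the operator() analyzed next:

#include <dlib/image_processing/frontal_face_detector.h>
#include <dlib/image_io.h>
#include <iostream>
#include <vector>

int main()
{
    // build the detector from the string-encoded model described above
    dlib::frontal_face_detector detector = dlib::get_frontal_face_detector();

    dlib::array2d<unsigned char> img;
    dlib::load_image(img, "face.jpg");   // placeholder path

    // operator() runs the five orientation-specific detectors plus the NMS step
    // analyzed below and returns one rectangle per detected face
    std::vector<dlib::rectangle> dets = detector(img);
    std::cout << "faces found: " << dets.size() << std::endl;
    return 0;
}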
The main detection entry point, object_detector::operator(): each of the five orientation-specific detectors calls scanner.detect() to find candidate face positions in the current image, and the candidates are then merged by non-max suppression.
void object_detector::
operator() ( // the object detection interface
const image_type& img,
std::vector<rect_detection>& final_dets,
double adjust_threshold )
{
scanner.load(img);// load the current frame/image and extract its HOG feature pyramid
std::vector<std::pair<double, rectangle> > dets;// candidate positions and their scores
std::vector<rect_detection> dets_accum;
for (unsigned long i = 0; i < w.size(); ++i)// run each of the 5 orientation-specific detectors
{
const double thresh = w[i].w(scanner.get_num_dimensions());// the learned threshold (bias), stored as the last element of the weight vector
scanner.detect(w[i].get_detect_argument(), dets, thresh + adjust_threshold);// collect candidate positions
for (unsigned long j = 0; j < dets.size(); ++j)
{
rect_detection temp;
temp.detection_confidence = dets[j].first-thresh;// confidence = raw score minus the threshold
temp.weight_index = i;
temp.rect = dets[j].second;
dets_accum.push_back(temp);
}
}
// non-max suppression: keep only the best-scoring, non-overlapping windows
final_dets.clear();
if (w.size() > 1)
std::sort(dets_accum.rbegin(), dets_accum.rend());// sort in descending order of detection_confidence
for (unsigned long i = 0; i < dets_accum.size(); ++i)
{
// discard the current window if it overlaps any already-accepted one (O(n^2) overall)
if (overlaps_any_box(final_dets, dets_accum[i].rect))
continue;
final_dets.push_back(dets_accum[i]);
}
}
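overlaps_any_box() simply applies a box-overlap tester to every already-accepted rectangle. The following is a rough sketch, not a copy of dlib's code, of what such a tester checks, written with dlib's rectangle arithmetic; the 0.5 / 0.95 thresholds are illustrative assumptions, the detector actually uses the overlap tester stored with the trained model:

#include <dlib/geometry.h>

// Rough sketch of a box-overlap test in the spirit of dlib's test_box_overlap.
// The thresholds below are illustrative, not the trained values.
bool boxes_overlap(const dlib::rectangle& a, const dlib::rectangle& b)
{
    const double inner = a.intersect(b).area();   // overlap area
    if (inner == 0)
        return false;
    const double outer = (a + b).area();          // area of the box enclosing both
    return inner/outer > 0.5 ||                   // strong mutual overlap
           inner/a.area() > 0.95 ||               // or one box almost entirely covered
           inner/b.area() > 0.95;
}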
load(): load the image and extract a HOG feature pyramid
void scan_fhog_pyramid::load (const image_type& img )
{
// load the image and extract its HOG features
unsigned long width, height;
compute_fhog_window_size(width,height);// size of the sliding window in HOG cells (10x10 here); it also bounds the smallest image at the top of the pyramid
// build the image pyramid and store the HOG features of every level in feats
impl::create_fhog_pyramid(img, fe, feats, cell_size, height,
width, min_pyramid_layer_width, min_pyramid_layer_height,
max_pyramid_levels);
}
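For the face detector the numbers are easy to pin down: the trained sliding window covers about 80x80 pixels and cell_size is 8, so the window spans a 10x10 grid of HOG cells. This is the "10 10" noted in the comment above, and it is also why the separable filters seen later in apply_filters_to_fhog have length 10; the pyramid stops shrinking the image once a level becomes too small to hold that window.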
//---------------------------------------------------------------
// create_fhog_pyramid: build the image's HOG feature pyramid
void create_fhog_pyramid (
const image_type& img,
const feature_extractor_type& fe,
array<array<array2d<float> > >& feats,
int cell_size,
int filter_rows_padding,
int filter_cols_padding,
unsigned long min_pyramid_layer_width,
unsigned long min_pyramid_layer_height,
unsigned long max_pyramid_levels
)
{
unsigned long levels = 0;
rectangle rect = get_rect(img);
pyramid_type pyr;
{// compute the number of pyramid levels from the image size
do
{
rect = pyr.rect_down(rect);
++levels;
} while (rect.width() >= min_pyramid_layer_width &&
rect.height() >= min_pyramid_layer_height &&
levels < max_pyramid_levels);
}
// allocate storage for the computed number of levels
if (feats.max_size() < levels)
feats.set_max_size(levels);
feats.set_size(levels);
// build the feature pyramid
// extract the level-0 HOG features; operator() of the feature extractor ends up calling impl_extract_fhog_features
fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding);
if (feats.size() > 1)
{
typedef typename image_traits<image_type>::pixel_type pixel_type;
array2d<pixel_type> temp1, temp2;
pyr(img, temp1);// downsample once
// extract the level-1 HOG features
fe(temp1, feats[1], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
for (unsigned long i = 2; i < feats.size(); ++i)
{
pyr(temp2, temp1);// downsample again
// extract the level-i features
fe(temp1, feats[i], cell_size,filter_rows_padding,filter_cols_padding);
swap(temp1,temp2);
}
}
}
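To see how many levels the loop above produces, here is a standalone sketch of the same computation, assuming the pyramid_down<6> (ratio 5/6) used by frontal_face_detector; the 640x480 input and the 64x64 minimum layer size are illustrative values, not constants taken from the dlib source:

#include <dlib/geometry.h>
#include <dlib/image_transforms.h>
#include <iostream>

int main()
{
    dlib::pyramid_down<6> pyr;             // each level is (6-1)/6 = 5/6 of the previous one
    dlib::rectangle rect(0, 0, 639, 479);  // pretend the input image is 640x480

    unsigned long levels = 0;
    do
    {
        rect = pyr.rect_down(rect);        // shrink to the next pyramid level
        ++levels;
    } while (rect.width() >= 64 && rect.height() >= 64);

    // 480 -> 400 -> 333 -> ... ; the loop stops once a side would drop below 64
    std::cout << "pyramid levels: " << levels << std::endl;
    return 0;
}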
//---------------------------------------------------------------
// the impl_extract_fhog_features function that is eventually called
void impl_extract_fhog_features( const image_type& img_,
out_type& hog,
int cell_size,// 8 in this example, i.e. each cell covers 8x8 pixels
int filter_rows_padding,
int filter_cols_padding )
{
const_image_view<image_type> img(img_);
// unit vectors of the 9 orientations spanning the first and second quadrants
matrix<float,2,1> directions[9];
directions[0] = 1.0000, 0.0000;
directions[1] = 0.9397, 0.3420;
directions[2] = 0.7660, 0.6428;
directions[3] = 0.500, 0.8660;
directions[4] = 0.1736, 0.9848;
directions[5] = -0.1736, 0.9848;
directions[6] = -0.5000, 0.8660;
directions[7] = -0.7660, 0.6428;
directions[8] = -0.9397, 0.3420;
// allocate memory for the per-cell gradient histograms and for each cell's histogram norm
const int cells_nr = (int)((float)img.nr()/(float)cell_size + 0.5);
const int cells_nc = (int)((float)img.nc()/(float)cell_size + 0.5);
// effectively a 3-D array of size (cells_nr+2) x (cells_nc+2) x 18
array2d<matrix<float,18,1> > hist(cells_nr+2, cells_nc+2);
for (long r = 0; r < hist.nr(); ++r)
for (long c = 0; c < hist.nc(); ++c)
hist[r][c] = 0;// initialize to 0
array2d<float> norm(cells_nr, cells_nc);// the norm (gradient energy) of each cell's histogram
assign_all_pixels(norm, 0);// initialize to 0
// memory for HOG features
const int hog_nr = std::max(cells_nr-2, 0);
const int hog_nc = std::max(cells_nc-2, 0);
const int padding_rows_offset = (filter_rows_padding-1)/2;
const int padding_cols_offset = (filter_cols_padding-1)/2;
init_hog(hog, hog_nr, hog_nc, filter_rows_padding, filter_cols_padding);
const int visible_nr = std::min((long)cells_nr*cell_size,img.nr())-1;
const int visible_nc = std::min((long)cells_nc*cell_size,img.nc())-1;
// first, accumulate the orientation histograms
for (int y = 1; y < visible_nr; y++) // iterate over image rows
{
const float yp = ((float)y+0.5)/(float)cell_size - 0.5;
const int iyp = (int)std::floor(yp);
const float vy0 = yp - iyp;
const float vy1 = 1.0 - vy0;
int x;
for (x = 1; x < visible_nc; x++)
{
matrix<float, 2, 1> grad;
float v;
get_gradient(y,x,img,grad,v);// horizontal/vertical differences give the gradient (dx,dy) and its squared magnitude v
// snap the gradient to the orientation with the largest absolute dot product
float best_dot = 0;
int best_o = 0;
for (int o = 0; o < 9; o++)
{
const float dot = dlib::dot(directions[o], grad);// inner product with orientation o
if (dot > best_dot) // positive: the gradient lies in the first/second quadrant
{
best_dot = dot;
best_o = o;
}
else if (-dot > best_dot) // negative: third/fourth quadrant, use bin o+9
{
best_dot = -dot;
best_o = o+9;
}
}
v = std::sqrt(v);// convert the squared magnitude into the gradient magnitude
// add to 4 histograms around pixel using bilinear interpolation
const float xp = ((double)x + 0.5) / (double)cell_size - 0.5;
const int ixp = (int)std::floor(xp);
const float vx0 = xp - ixp;
const float vx1 = 1.0 - vx0;
// bilinear interpolation: spread the vote over the 4 neighboring cells to avoid abrupt changes
hist[iyp+1][ixp+1](best_o) += vy1*vx1*v;
hist[iyp+1+1][ixp+1](best_o) += vy0*vx1*v;
hist[iyp+1][ixp+1+1](best_o) += vy1*vx0*v;
hist[iyp+1+1][ixp+1+1](best_o) += vy0*vx0*v;
}
}
// compute each cell's gradient energy
for (int r = 0; r < cells_nr; ++r)
{
for (int c = 0; c < cells_nc; ++c)
{
for (int o = 0; o < 9; o++)
{// sum of squared contrast-insensitive bins (opposite orientations merged)
norm[r][c] += (hist[r+1][c+1](o) + hist[r+1][c+1](o+9)) *
(hist[r+1][c+1](o) + hist[r+1][c+1](o+9));
}
}
}
// the rest of the function is too long to quote verbatim; roughly it does the following:
for (int y = 0; y < hog_nr; y++)
{
for (int x = 0; x < hog_nc; x++)
{
for(int o = 0; o < 31; o++)
{
set_hog(hog,o,x,y, /* value derived from the normalized 18-bin histograms via a fixed mapping */);
}
}
}
// the result is the image's HOG feature map of shape (h/8, w/8, 31): every 8x8 block maps to a 31-dimensional vector
}
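For readers who want the skipped part spelled out, the following is a scalar sketch of that final loop (dlib's real code is SIMD-vectorized and additionally offsets x and y by the padding offsets when calling set_hog). It reuses the hist, norm, hog_nr/hog_nc and set_hog names from the excerpt above and follows the standard Felzenszwalb FHOG construction: 18 contrast-sensitive bins, 9 contrast-insensitive bins, and 4 texture (gradient-energy) features, each block-normalized and truncated at 0.2:

const float eps = 0.0001f;
for (int y = 0; y < hog_nr; y++)
{
    for (int x = 0; x < hog_nc; x++)
    {
        // 4 block-normalization factors from the 2x2 cell neighborhoods
        // around the interior cell (y+1, x+1)
        float n[4];
        n[0] = 1.0f/std::sqrt(norm[y][x]     + norm[y][x+1]   + norm[y+1][x]   + norm[y+1][x+1]   + eps);
        n[1] = 1.0f/std::sqrt(norm[y][x+1]   + norm[y][x+2]   + norm[y+1][x+1] + norm[y+1][x+2]   + eps);
        n[2] = 1.0f/std::sqrt(norm[y+1][x]   + norm[y+1][x+1] + norm[y+2][x]   + norm[y+2][x+1]   + eps);
        n[3] = 1.0f/std::sqrt(norm[y+1][x+1] + norm[y+1][x+2] + norm[y+2][x+1] + norm[y+2][x+2]   + eps);

        float t[4] = {0,0,0,0};

        // 18 contrast-sensitive orientation features (channels 0..17)
        for (int o = 0; o < 18; ++o)
        {
            float s = 0;
            for (int k = 0; k < 4; ++k)
            {
                const float h = std::min(hist[y+2][x+2](o)*n[k], 0.2f);
                s    += h;
                t[k] += h;
            }
            set_hog(hog, o, x, y, 0.5f*s);
        }

        // 9 contrast-insensitive features (channels 18..26): opposite bins merged
        for (int o = 0; o < 9; ++o)
        {
            float s = 0;
            for (int k = 0; k < 4; ++k)
                s += std::min((hist[y+2][x+2](o) + hist[y+2][x+2](o+9))*n[k], 0.2f);
            set_hog(hog, o+18, x, y, 0.5f*s);
        }

        // 4 texture / gradient-energy features (channels 27..30)
        for (int k = 0; k < 4; ++k)
            set_hog(hog, 27+k, x, y, 0.2357f*t[k]);
    }
}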
With the HOG features of every pyramid level stored in feats, detection can begin.
void detect_from_fhog_pyramid ( const array<array<array2d<float> > >& feats,
const feature_extractor_type& fe,
const fhog_filterbank& w,
const double thresh,
const unsigned long det_box_height,
const unsigned long det_box_width,
const int cell_size,
const int filter_rows_padding,
const int filter_cols_padding,
std::vector<std::pair<double, rectangle> >& dets )
{
dets.clear();// reset the output
array2d<float> saliency_image;// saliency map: the larger the value, the more likely that position contains a target
pyramid_type pyr;
// iterate over all pyramid levels
for (unsigned long l = 0; l < feats.size(); ++l)
{
// filter this level to get its saliency map and the valid detection area
const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);
// scan that area
for (long r = area.top(); r <= area.bottom(); ++r)
{
for (long c = area.left(); c <= area.right(); ++c)
{
// if we found a detection
if (saliency_image[r][c] >= thresh)// the saliency score exceeds the threshold
{
// build the detection rectangle at this position (still in feature-cell coordinates)
rectangle temp = centered_rect(point(c,r),det_box_width,det_box_height);
rectangle rect = fe.feats_to_image(temp,cell_size,
filter_rows_padding, filter_cols_padding);
rect = pyr.rect_up(rect, l);// map it back to the original image scale
dets.push_back(std::make_pair(saliency_image[r][c], rect));
}
}
}
}
std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);// sort by saliency score in descending order
}
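A concrete example of the coordinate mapping: det_box_width x det_box_height is 10x10 cells for the face detector, so feats_to_image turns a hit at feature cell (c, r) into a roughly 80x80-pixel rectangle within that pyramid level (cell_size = 8). If the hit came from level l = 3, rect_up scales it by (6/5)^3 ≈ 1.73, so the face occupies roughly 138x138 pixels in the original image; small faces are therefore found on level 0 and progressively larger faces on the coarser levels.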
Filtering a single pyramid level
rectangle apply_filters_to_fhog( const fhog_filterbank& w,// the detection model loaded from disk
const array<array2d<float> >& feats,// the HOG features of the current level
array2d<float>& saliency_image )
{
rectangle area;
saliency_image.clear();
array2d<float> scratch;
unsigned long i = 0;
for (; i < w.row_filters.size(); ++i)// one separable filter bank per feature plane (31 of them)
{// all 31 planes vote into the same saliency map
for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)// 2 or 3 separable components per plane
{// row_filters[i][j] and w.col_filters[i][j] are both length-10 vectors, so the detection window is 10x10 cells, i.e. 80x80 pixels
area = float_spatially_filter_image_separable(feats[i], saliency_image,
w.row_filters[i][j], w.col_filters[i][j],scratch,true);
// convolve row-wise then column-wise; two 1-D convolutions are faster than one 2-D convolution
}
}
if (saliency_image.size() == 0)
{
saliency_image.set_size(feats[0].nr(), feats[0].nc());
assign_all_pixels(saliency_image, 0);
}
return area;
}
Two 1-D convolutions (separable filtering)
rectangle float_spatially_filter_image_separable( const in_image_type& in_img_,
out_image_type& out_img_,
const matrix_exp<EXP1>& _row_filter,
const matrix_exp<EXP2>& _col_filter,
out_image_type& scratch_,
bool add_to = false )
{
const_temp_matrix<EXP1> row_filter(_row_filter);// the length-10 row kernel
const_temp_matrix<EXP2> col_filter(_col_filter);// the length-10 column kernel
const_image_view<in_image_type> in_img(in_img_);
image_view<out_image_type> out_img(out_img_);
image_view<out_image_type> scratch(scratch_);// intermediate buffer for the row pass
out_img.set_size(in_img.nr(),in_img.nc());// the convolution result goes into out_img
scratch.set_size(in_img.nr(),in_img.nc());
// first/last valid positions of the convolution
const long first_row = col_filter.size()/2;
const long first_col = row_filter.size()/2;
const long last_row = in_img.nr() - ((col_filter.size()-1)/2);
const long last_col = in_img.nc() - ((row_filter.size()-1)/2);
const rectangle non_border = rectangle(first_col, first_row, last_col-1, last_row-1);// region with valid filter output
// apply the row filter
for (long r = 0; r < in_img.nr(); ++r)
{
long c = first_col;
for ( ; c < last_col; ++c)
{
float temp = 0;
for (long n = 0; n < row_filter.size(); ++n)
{
temp += in_img[r][c-first_col+n]*row_filter(n);// 1-D convolution along the row
}
}
scratch[r][c] = temp;// store the row-filtered intermediate result
}
}
// apply the column filter
for (long r = first_row; r < last_row; ++r)
{
long c = first_col;
for (; c < last_col; ++c)
{
float temp = 0;
for (long m = 0; m < col_filter.size(); ++m)
{
temp += scratch[r-first_row+m][c]*col_filter(m);// 1-D convolution along the column
}
out_img[r][c] = add_to ? out_img[r][c] + temp : temp;// accumulate into out_img when add_to is true, otherwise overwrite
}
}
return non_border;// the rectangle inside which the filter output is valid
}
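The speed claim in the comment above is easy to quantify: with the 10-tap row and column filters of this detector, the separable version costs 10 + 10 = 20 multiply-adds per output pixel, whereas the equivalent 10x10 two-dimensional convolution would cost 100, a 5x saving that is paid back on every cell of every pyramid level and for every one of the 31 feature planes.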