dlib人脸检测源码解析

dlib人脸检测源码解析

dlib是基于C++的跨平台通用框架,内容非常丰富,涵盖机器学习、图像处理、数据压缩等(原作者链接)。dlib在Image Processing模块中定义了基于方向梯度直方图(Histogram of Oriented Gradients,HOG)特征和图像金字塔的多尺度目标检测方法。在非深度学习的方法中,其目标检测准确率较高,同时能保证良好的实时性。
本文将以人脸检测为例,详细解析dlib本部分源码。

源码及解析


  • frontal_face_detector.h

获取已训练的目标检测模型
namespace dlib
{    //miexp : 6 控制缩放步长 (6-1)/6
    typedef object_detector6> > > frontal_face_detector;
    // 注:dlib将模型文件编码成字符串放在frontal_face_detector.h中,该函数用于解码
    //    字符串获取模型文件内容
    inline const std::string get_serialized_frontal_faces();
    inline frontal_face_detector get_frontal_face_detector();
}


  • object_detector.h

目标检测主函数:依次用5种角度的目标检测器调用 scanner.detect() 函数,找出当前图像中包含人脸的位置。

void object_detector::
    operator() (// miexp: 目标检测接口
        const image_type& img,
        std::vector& final_dets,
        double adjust_threshold )
{
    scanner.load(img);//载入当前帧或者图像,并提取hog特征用于检测
    std::vector<std::pair<double, rectangle> > dets;//被挑选中的位置和得分
    std::vector dets_accum;
    for (unsigned long i = 0; i < w.size(); ++i)//分别检测5种角度
    {
        const double thresh = w[i].w(scanner.get_num_dimensions());
        scanner.detect(w[i].get_detect_argument(), dets, thresh + adjust_threshold);// 得到候选位置
        for (unsigned long j = 0; j < dets.size(); ++j)
        {
            rect_detection temp;
            temp.detection_confidence = dets[j].first-thresh;//计算候选位置的detection_confidence
            temp.weight_index = i;
            temp.rect = dets[j].second;
            dets_accum.push_back(temp);
        }
    }
    //  非极大值抑制,用于筛选最准确的窗口
    final_dets.clear();
    if (w.size() > 1)
    std::sort(dets_accum.rbegin(), dets_accum.rend());//按detection_confidence排序
    for (unsigned long i = 0; i < dets_accum.size(); ++i)
    {
        // 候选窗口若覆盖,则放弃当前窗口(复杂度n2)
         if (overlaps_any_box(final_dets, dets_accum[i].rect))
             continue;
         final_dets.push_back(dets_accum[i]);
    }
}


  • scan_fhog_pyramid.h

load 载入图像并以金字塔形式提取hog特征

void scan_fhog_pyramid::load (constimage_type& img )
{
    // miexp: load 载入图像并提取hog特征
     unsigned long width, height;
     compute_fhog_window_size(width,height);//金字塔顶端最小图像,与hog窗口大小相关 10 10
     //计算图像金字塔,并提取各层HOG特征存入feats中
     impl::create_fhog_pyramid(img, fe, feats, cell_size, height,
            width, min_pyramid_layer_width, min_pyramid_layer_height,
            max_pyramid_levels);
 }
//---------------------------------------------------------------
// mcreate_fhog_pyramid 构建图像HOG特征金字塔 
void create_fhog_pyramid (
            const image_type& img,
            const feature_extractor_type& fe,
            array<arrayfloat> > >& feats,
            int cell_size,
            int filter_rows_padding,
            int filter_cols_padding,
            unsigned long min_pyramid_layer_width,
            unsigned long min_pyramid_layer_height,
            unsigned long max_pyramid_levels
        )
 {
    unsigned long levels = 0;
    rectangle rect = get_rect(img);
    pyramid_type pyr;
    {//根据图像大小计算金字塔层数
      do
      {
       rect = pyr.rect_down(rect);
       ++levels;
       } while (rect.width() >= min_pyramid_layer_width && 
                rect.height() >= min_pyramid_layer_height &&
                levels < max_pyramid_levels);
    }
    //根据金字塔层数分配内存
    if (feats.max_size() < levels)
       feats.set_max_size(levels);
    feats.set_size(levels);
    // 构建特征金字塔
    // 提取第0层hog特征 ,通过()运算符重载,会调用中的 impl_extract_fhog_features函数
    fe(img, feats[0], cell_size,filter_rows_padding,filter_cols_padding);
    if (feats.size() > 1)
    {
       typedef typename image_traits::pixel_type pixel_type;
       array2d temp1, temp2;
       pyr(img, temp1);// 下采样
       //提取第1层HOG特征
       fe(temp1, feats[1], cell_size,filter_rows_padding,filter_cols_padding);
       swap(temp1,temp2);
       for (unsigned long i = 2; i < feats.size(); ++i)
       {
          pyr(temp2, temp1);//下采样
          //提取第i层特征
          fe(temp1, feats[i], cell_size,filter_rows_padding,filter_cols_padding);
          swap(temp1,temp2);
       }
     }
 }
//---------------------------------------------------------------
// 调用的extract_fhog_features函数 
void impl_extract_fhog_features( const image_type& img_, 
                                 out_type& hog, 
                                 int cell_size,//本例中为8,即每个cell中有8x8个像素点
                                 int filter_rows_padding,
                                 int filter_cols_padding ) 
{
    const_image_view img(img_);//定义第一象限和第二象限的9个方向的单位向量
    matrix<float,2,1> directions[9];
    directions[0] =  1.0000, 0.0000; 
    directions[1] =  0.9397, 0.3420;
    directions[2] =  0.7660, 0.6428;
    directions[3] =  0.500,  0.8660;
    directions[4] =  0.1736, 0.9848;
    directions[5] = -0.1736, 0.9848;
    directions[6] = -0.5000, 0.8660;
    directions[7] = -0.7660, 0.6428;
    directions[8] = -0.9397, 0.3420;
    //计算所需内存用于缓存梯度直方图及各cell梯度直方图向量的norm
    const int cells_nr = (int)((float)img.nr()/(float)cell_size + 0.5);
    const int cells_nc = (int)((float)img.nc()/(float)cell_size + 0.5);
    //三维矩阵 cells_nr+2  cells_nc+2  18 
    array2dfloat,18,1> > hist(cells_nr+2, cells_nc+2);
    for (long r = 0; r < hist.nr(); ++r)
       for (long c = 0; c < hist.nc(); ++c)
           hist[r][c] = 0;//初始化为0

    array2d<float> norm(cells_nr, cells_nc);// 各cell梯度直方图的norm值矩阵
    assign_all_pixels(norm, 0);//初始化为0
    // memory for HOG features
    const int hog_nr = std::max(cells_nr-2, 0);
    const int hog_nc = std::max(cells_nc-2, 0);
    const int padding_rows_offset = (filter_rows_padding-1)/2;
    const int padding_cols_offset = (filter_cols_padding-1)/2;
    init_hog(hog, hog_nr, hog_nc, filter_rows_padding, filter_cols_padding);
    const int visible_nr = std::min((long)cells_nr*cell_size,img.nr())-1;
    const int visible_nc = std::min((long)cells_nc*cell_size,img.nc())-1;
    //首先计算方向梯度直方图
    for (int y = 1; y < visible_nr; y++) //遍历图像纵坐标
    {
         const float yp = ((float)y+0.5)/(float)cell_size - 0.5;
         const int iyp = (int)std::floor(yp);
         const float vy0 = yp - iyp;
         const float vy1 = 1.0 - vy0;
         int x;
         for (x = 1; x < visible_nc; x++) 
         {
            matrix<float, 2, 1> grad;
            float v;
            get_gradient(y,x,img,grad,v);//做水平垂直差分,计算当前像素点的梯度(dy,dx)

            // 即找出最大的内积绝对值 和 方向
            float best_dot = 0;
            int best_o = 0;
            for (int o = 0; o < 9; o++) 
            {
               const float dot = dlib::dot(directions[o], grad);//点乘计算内积
               if (dot > best_dot) //在一、二象限内积为正
               {
                  best_dot = dot;
                  best_o = o;
               } 
               else if (-dot > best_dot) //在三、四象限内积为负
               {
                  best_dot = -dot;
                  best_o = o+9;
               }
            }
            v = std::sqrt(v);
            // add to 4 histograms around pixel using bilinear interpolation
            const float xp = ((double)x + 0.5) / (double)cell_size - 0.5;
            const int ixp = (int)std::floor(xp);
            const float vx0 = xp - ixp;
            const float vx1 = 1.0 - vx0;
            // 插值,抑制突变
             hist[iyp+1][ixp+1](best_o) += vy1*vx1*v;
             hist[iyp+1+1][ixp+1](best_o) += vy0*vx1*v;
             hist[iyp+1][ixp+1+1](best_o) += vy1*vx0*v;
             hist[iyp+1+1][ixp+1+1](best_o) += vy0*vx0*v;
         }
    }
    //计算每个cell的energy
    for (int r = 0; r < cells_nr; ++r)
    {
       for (int c = 0; c < cells_nc; ++c)
       {
          for (int o = 0; o < 9; o++) 
          {// 平方和求模
            norm[r][c] += (hist[r+1][c+1](o) + hist[r+1][c+1](o+9)) *
                          (hist[r+1][c+1](o) + hist[r+1][c+1](o+9));
          }
       }
    }
    //然后。。。。太多了不抄代码了,大致如下:
    for (int y = 0; y < hog_nr; y++) 
    {
       for (int x = 0; x < hog_nc; x++) 
       { 
            forint o = 0; o < 31; o++)
            {
               set_hog(hog,o,x,y,18维的单位向量[某种映射关系]);
            }
       }
    }
    //得到该图的hog特征,hog  (h/8 ,w/8 ,31),即图像中每个8x8的块,对应一个31维的向量 
}

前面提取到了金字塔每一层的hog特征,一并存入feats中,然后开始检测

// Runs the detector over every level of a previously built HOG feature
// pyramid and collects all locations whose score reaches the threshold.
// NOTE: pyramid_type, feature_extractor_type and fhog_filterbank are template
// parameters; the template header is omitted in this excerpt.
//
//   feats               - HOG features, one entry per pyramid level
//   fe                  - feature extractor, used to map feature coordinates
//                         back to image coordinates
//   w                   - the filter bank (detection model)
//   thresh              - minimum saliency score for a detection
//   det_box_height,
//   det_box_width       - size of the detection box in feature coordinates
//   cell_size,
//   filter_rows_padding,
//   filter_cols_padding - forwarded to fe.feats_to_image()
//   dets                - output; cleared, then filled with (score, rectangle)
void detect_from_fhog_pyramid ( const array<array<array2d<float> > >& feats,
                                const feature_extractor_type& fe,
                                const fhog_filterbank& w,
                                const double thresh,
                                const unsigned long det_box_height,
                                const unsigned long det_box_width,
                                const int cell_size,
                                const int filter_rows_padding,
                                const int filter_cols_padding,
                                std::vector<std::pair<double, rectangle> >& dets )
{
    dets.clear();
    // Saliency map: the larger a value, the more likely that position
    // contains the target.
    array2d<float> saliency_image;
    pyramid_type pyr;

    // Visit every pyramid level.
    for (unsigned long l = 0; l < feats.size(); ++l)
    {
        // Filter this level to obtain its saliency map and the region of the
        // map that holds valid responses.
        const rectangle area = apply_filters_to_fhog(w, feats[l], saliency_image);

        // Scan that region.
        for (long r = area.top(); r <= area.bottom(); ++r)
        {
            for (long c = area.left(); c <= area.right(); ++c)
            {
                // if we found a detection
                if (saliency_image[r][c] >= thresh)
                {
                    // Box centred on the hit, converted from feature
                    // coordinates to image coordinates of this level ...
                    rectangle temp = centered_rect(point(c,r),det_box_width,det_box_height);
                    rectangle rect = fe.feats_to_image(temp,cell_size,
                                              filter_rows_padding, filter_cols_padding);
                    // ... and scaled from level l back up to the original image size.
                    rect = pyr.rect_up(rect, l);
                    dets.push_back(std::make_pair(saliency_image[r][c], rect));
                }
            }
        }
    }

    // Order the detections by saliency score.
    std::sort(dets.rbegin(), dets.rend(), compare_pair_rect);
}

检测每一层

// Applies the separable filters of a detection model to the HOG features of
// one pyramid level and sums the responses into a saliency map.
// NOTE: fhog_filterbank is a template parameter; the template header is
// omitted in this excerpt.
//
//   w              - the detection model loaded from disk/string
//   feats          - HOG feature planes of the current pyramid level
//   saliency_image - output; summed filter responses
//
// Returns the rectangle reported by the last filter call (default-constructed
// when no filter was applied).
rectangle apply_filters_to_fhog( const fhog_filterbank& w,
                                 const array<array2d<float> >& feats,
                                 array2d<float>& saliency_image )
{
    rectangle area;
    saliency_image.clear();
    array2d<float> scratch;

    // One group of filters per feature plane (31 planes); all of them vote by
    // adding into the same saliency map.
    for (unsigned long i = 0; i < w.row_filters.size(); ++i)
    {
        // Each plane has 2 or 3 separable filter pairs.  According to the
        // original notes, row_filters[i][j] and col_filters[i][j] are both
        // length-10 vectors, which corresponds to an 80x80 pixel window.
        for (unsigned long j = 0; j < w.row_filters[i].size(); ++j)
        {
            // The first response must overwrite the (still empty) saliency
            // map; only later responses may be added on top of it.
            const bool accumulate = (saliency_image.size() != 0);
            // Row pass followed by column pass: two 1-D convolutions are
            // cheaper than one 2-D convolution.
            area = float_spatially_filter_image_separable(feats[i], saliency_image,
                             w.row_filters[i][j], w.col_filters[i][j], scratch, accumulate);
        }
    }

    // No filter ran at all: hand back an all-zero map of the right size.
    if (saliency_image.size() == 0)
    {
        saliency_image.set_size(feats[0].nr(), feats[0].nc());
        assign_all_pixels(saliency_image, 0);
    }
    return area;
}


  • spatial_filtering.h

两次一维卷积

// Separable 2-D filtering done as two 1-D convolutions: a row pass into a
// scratch image followed by a column pass into the output image.
// NOTE: in_image_type, out_image_type, EXP1 and EXP2 are template parameters;
// the template header is omitted in this excerpt.
//
//   in_img_     - input image
//   out_img_    - output image, resized to the input's dimensions
//   _row_filter - 1-D filter applied along rows (float[10] in this walkthrough)
//   _col_filter - 1-D filter applied along columns (float[10] in this walkthrough)
//   scratch_    - work buffer for the row-pass result
//   add_to      - when true, responses are added to out_img_ instead of
//                 overwriting it
//
// Returns the rectangle of pixels that received a full filter response.
// Pixels outside that rectangle are not written by this excerpt.
rectangle float_spatially_filter_image_separable( const in_image_type& in_img_,
                                                 out_image_type& out_img_,
                                                 const matrix_exp<EXP1>& _row_filter,
                                                 const matrix_exp<EXP2>& _col_filter,
                                                 out_image_type& scratch_,
                                                 bool add_to = false    )
{
    const_temp_matrix<EXP1> row_filter(_row_filter);
    const_temp_matrix<EXP2> col_filter(_col_filter);

    const_image_view<in_image_type> in_img(in_img_);
    image_view<out_image_type> out_img(out_img_);
    image_view<out_image_type> scratch(scratch_);

    // The result goes to out_img; scratch holds the intermediate row pass.
    out_img.set_size(in_img.nr(),in_img.nc());
    scratch.set_size(in_img.nr(),in_img.nc());

    // First and one-past-last positions at which the filters fit entirely
    // inside the image.
    const long first_row = col_filter.size()/2;
    const long first_col = row_filter.size()/2;
    const long last_row = in_img.nr() - ((col_filter.size()-1)/2);
    const long last_col = in_img.nc() - ((row_filter.size()-1)/2);

    // Region that ends up holding valid output (inclusive bounds).
    const rectangle non_border(first_col, first_row, last_col-1, last_row-1);

    // Row filter: every image row is processed, because the column pass below
    // reads rows above and below the valid region.
    for (long r = 0; r < in_img.nr(); ++r)
    {
        for (long c = first_col; c < last_col; ++c)
        {
            float temp = 0;
            for (long n = 0; n < row_filter.size(); ++n)
            {
                temp += in_img[r][c-first_col+n]*row_filter(n);  // 1-D convolution
            }
            scratch[r][c] = temp;
        }
    }

    // Column filter, applied to the row-pass result.
    for (long r = first_row; r < last_row; ++r)
    {
        for (long c = first_col; c < last_col; ++c)
        {
            float temp = 0;
            for (long m = 0; m < col_filter.size(); ++m)
            {
                temp += scratch[r-first_row+m][c]*col_filter(m);
            }
            if (add_to)
                out_img[r][c] += temp;  // accumulate on top of earlier responses
            else
                out_img[r][c] = temp;   // overwrite
        }
    }
    return non_border;
}

你可能感兴趣的:(dlib人脸检测源码解析)