最近在研究人脸对齐,Joint Cascade Face Detection and Alignment(ECCV14)这篇文章感觉不错,将对齐和人脸检测同时做了,而且速度非常快,精度也很高。不过菜鸟一下子看不懂,所以就翻了一下之前的文章,发现这些算法都是一点一点进化过来的。之前作者发表过Face Alignment at 3000 FPS via Regressing Local Binary Features,再之前CVPR2012的时候他们在Face Alignment by Explicit Shape Regression中提出了shape index feature,这个特征在3000fps中也有使用。
先简单的介绍一下论文,之后会给出注释过的源码,只注释了训练部分,因为这个代码有一段时间了,看一下思路学习一下就好。新的方法实在太多,而且性能更优越,这里只是为后续工作打基础。
作者使用了多级回归的方式得到特征点,在3000fps中也用到了多级回归。一共训练了10个强分类器,每个强分类器输出一个shape的更新参数,shape更新之后再重新生成新的特征,训练并进行下一次迭代。这10个强分类器每一个都是由500个蕨分类器组成,有点像随机森林。
在介绍蕨分类器之前要先介绍一下Shape-indexed features,蕨分类器和回归树不同的地方在于,回归树是根据最小均方差来产生特征。而蕨分类器是自己生成特征。这里用到的特征就是Shape-indexed features ,论文中提到了计算方法。具体解释会在源码中给出说明。
1. Project the regression target(vectorial delta shape) to a random direction to produce a scalar.
2. Among $P^2$ features, select a feature with highest correlation to the scalar.
3. Repeat steps 1. and 2. F times to obtain F features.
4. Construct a fern by F features with random thresholds.
随机蕨算法比较早的应用是在TLD算法中,然后被使用在ESR这篇论文中。下面我找了几篇比较好的随机蕨参考文献,感觉第一篇说的最简洁清晰,不过他说到的是分类问题,对于ESR中的回归问题其实也一样,统计落入一个分桶中的所有差值做平均,就可以得到回归值。linux下没有截图工具,之后再加一些图说明一下。
单个蕨分类器的准确率较低,所以通过类似随机森林的方法多次取特征,多次采样,对结果做voting或者average就可以得到较好的结果。
参考文献:
http://blog.csdn.net/huangynn/article/details/51730076
http://blog.sciencenet.cn/blog-465130-964430.html
http://blog.csdn.net/stayfoolish_fan/article/details/50506906
http://blog.csdn.net/stayfoolish_fan/article/details/50455359
工程以及训练样本我之后会放在github上,对原论文做了些修改去掉了论文里提到的similarity transform,可能精度会有一定下降,不过肉眼看不出,代码会清晰一些。
函数入口,设置了一些基本的参数,加载数据,然后开始训练。
#include "FaceAlignment.h"
#include <cmath>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
using namespace cv;
int main(){
char s[10];
sprintf(s,"%.4d",1);
string ss(s);
cout<int img_num = 130; //1340
int candidate_pixel_num = 400;
int fern_pixel_num = 5;
int first_level_num = 10;
int second_level_num = 500;
int landmark_num = 29;
int initial_number = 20;
vector > images;
vector bbox;
//加载训练图片
cout<<"Read images..."<for(int i = 0;i < img_num;i++){
string image_name = "/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/trainingImages/";
image_name = image_name + to_string(i+1) + ".jpg";
Mat_ temp = imread(image_name,0);
images.push_back(temp);
}
// 读取bounding_box. x,y,width,height,center_x,center_y
vector bounding_box;
ifstream fin;
fin.open("/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/boundingbox.txt");
for(int i = 0;i < img_num;i++){
BoundingBox temp;
fin>>temp.start_x>>temp.start_y>>temp.width>>temp.height;
temp.centroid_x = temp.start_x + temp.width/2.0;
temp.centroid_y = temp.start_y + temp.height/2.0;
bounding_box.push_back(temp);
}
fin.close();
// 读取特征点坐标
vectordouble > > ground_truth_shapes;
fin.open("/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/keypoints.txt");
for(int i = 0;i < img_num;i++){
Mat_<double> temp(landmark_num,2);
for(int j = 0;j < landmark_num;j++){
fin>>temp(j,0);
}
for(int j = 0;j < landmark_num;j++){
fin>>temp(j,1);
}
ground_truth_shapes.push_back(temp);
}
fin.close();
//训练模型
ShapeRegressor regressor;
regressor.Train(images,ground_truth_shapes,bounding_box,first_level_num,second_level_num,
candidate_pixel_num,fern_pixel_num,initial_number);
regressor.Save("/home/f/FaceAlignment-master/FaceAlignment/model.txt");
return 0;
}
这里也没有到真正的训练阶段,主要是预处理。
void ShapeRegressor::Train(const vector >& images, //gray scale images
const vectordouble > >& ground_truth_shapes, // a vector of N*2 matrix, where N is the number of landmarks
const vector & bounding_box, // BoundingBox of faces
int first_level_num, int second_level_num, // 10 500
int candidate_pixel_num, int fern_pixel_num, // 400 5
int initial_num){ // 20 number of initial shapes for each input image
cout<<"Start training..."<0].rows;
// data augmentation and multiple initialization
vector > augmented_images;
vector augmented_bounding_box;
vectordouble > > augmented_ground_truth_shapes;
vectordouble > > current_shapes; // 扩大之后的初始化shape,绝对坐标
// 扩大训练数据
RNG random_generator(getTickCount());
for(int i = 0;i < images.size();i++){
for(int j = 0;j < initial_num;j++){ //为每一幅图片产生initial_num个初始化shape
int index = 0;
do{
index = random_generator.uniform(0, images.size());
}while(index == i);
augmented_images.push_back(images[i]);
augmented_ground_truth_shapes.push_back(ground_truth_shapes[i]);
augmented_bounding_box.push_back(bounding_box[i]);
// 1. Select ground truth shapes of other images as initial shapes
// 2. Project current shape to bounding box of ground truth shapes
// 绝对坐标点,包含了人脸位置信息,这里先转换为相对坐标,再通过bounding_box还原,去除绝对坐标的偏差
Mat_<double> temp = ground_truth_shapes[index];
temp = ProjectShape(temp, bounding_box[index]);
temp = ReProjectShape(temp, bounding_box[i]);
current_shapes.push_back(temp);
}
}
// 求平均shape模型,结果保存为相对坐标
// get mean shape from training shapes
mean_shape_ = GetMeanShape(ground_truth_shapes,bounding_box);
// train fern cascades
fern_cascades_.resize(first_level_num);
vectordouble > > prediction;
for(int i = 0;i < first_level_num;i++){
cout<<"Training fern cascades: "<1<<" out of "<1, first_level_num);
// update current shapes
// 对每一副图片的形状进行更新,prediction[x] 中保存了n个特征点的位移更新量
for(int j = 0;j < prediction.size();j++){
current_shapes[j] = prediction[j] + ProjectShape(current_shapes[j], augmented_bounding_box[j]);
current_shapes[j] = ReProjectShape(current_shapes[j],augmented_bounding_box[j]);
}
}
}
这里开始进入正题,为每一幅图片生成400个特征点,然后放入到fern分类器中训练。生成特征点的部分稍微解释一下,从代码可以看出,就是先随机的生成400个随机点,这些点都在bounding_box里,然后计算每一个点距离shape特征点(29个point)的距离,找出距离最近的shape特征点的索引,那么这个特征点就是针对某一个shape特征点的特征。(特征有点乱,人脸特征点,我都在前面加了shape,特指人脸上的点)
vectordouble > > FernCascade::Train(const vector >& images,
const vectordouble > >& current_shapes,
const vectordouble > >& ground_truth_shapes,
const vector & bounding_box,
const Mat_<double>& mean_shape,
int second_level_num, //500
int candidate_pixel_num, //400
int fern_pixel_num, //5
int curr_level_num,
int first_level_num){ //10
Mat_<double> candidate_pixel_locations(candidate_pixel_num,2); //特征点位置坐标(相对于mean_shape,75行)
Mat_<int> nearest_landmark_index(candidate_pixel_num,1); //特征点最近shape点的索引
vectordouble > > regression_targets; //存放残差
RNG random_generator(getTickCount());
second_level_num_ = second_level_num;
// calculate regression targets: the difference between ground truth shapes and current shapes
// candidate_pixel_locations: the locations of candidate pixels, indexed relative to its nearest landmark on mean shape
// 计算残差
regression_targets.resize(current_shapes.size());
for(int i = 0;i < current_shapes.size();i++){
regression_targets[i] = ProjectShape(ground_truth_shapes[i],bounding_box[i])
- ProjectShape(current_shapes[i],bounding_box[i]);
}
// 生成 shape-indexed features 特征点
// 在整张脸中生成400个随机点,并且找到和这400个随机点最近的shape点的索引
for(int i = 0;i < candidate_pixel_num;i++){
double x = random_generator.uniform(-1.0,1.0);
double y = random_generator.uniform(-1.0,1.0);
if(x*x + y*y > 1.0){ //x,y的值代表的相对坐标,这取值范围涵盖了整个boundingbox上的所有点
i--;
continue;
}
// find nearest landmark index
double min_dist = 1e10;
int min_index = 0;
for(int j = 0;j < mean_shape.rows;j++){
double temp = pow(mean_shape(j,0)-x,2.0) + pow(mean_shape(j,1)-y,2.0);
if(temp < min_dist){
min_dist = temp;
min_index = j;
}
}
candidate_pixel_locations(i,0) = x - mean_shape(min_index,0); //
candidate_pixel_locations(i,1) = y - mean_shape(min_index,1);
nearest_landmark_index(i) = min_index;
}
// for densities: each row is the pixel densities at each candidate pixels for an image
// 求每幅图的400个特征点的特征值
vector<vector<double> > densities;
densities.resize(candidate_pixel_num);
for(int i = 0;i < images.size();i++){
Mat_<double> temp = ProjectShape(current_shapes[i],bounding_box[i]);
for(int j = 0;j < candidate_pixel_num;j++){
//这里不确定,应该是特征点相对于shape点的绝对坐标,但是计算方法有点奇怪,修改之后会偏移较大
double project_x = (candidate_pixel_locations(j,0) + candidate_pixel_locations(j,1))*bounding_box[i].width/2.0;
double project_y = (candidate_pixel_locations(j,0) + candidate_pixel_locations(j,1))*bounding_box[i].height/2.0;
int index = nearest_landmark_index(j);
int real_x = project_x + current_shapes[i](index,0);
int real_y = project_y + current_shapes[i](index,1);
//不能越界
real_x = std::max(0.0,std::min((double)real_x,images[i].cols-1.0));
real_y = std::max(0.0,std::min((double)real_y,images[i].rows-1.0));
densities[j].push_back((int)images[i](real_y,real_x)); //j索引的是400个特征点的值,i索引的是所有图片
}
}
//求 densities 的协方差 densities里面存储了所有训练图片的特征点
// calculate the covariance between densities at each candidate pixels
Mat_<double> covariance(candidate_pixel_num,candidate_pixel_num);
Mat_<double> mean;
for(int i = 0;i < candidate_pixel_num;i++){
for(int j = i;j< candidate_pixel_num;j++){
double correlation_result = calculate_covariance(densities[i],densities[j]);
covariance(i,j) = correlation_result;
covariance(j,i) = correlation_result;
}
}
// train ferns
// 训练蕨分类器,每个蕨分类器的输出对n个shape点的坐标做修正
vectordouble > > prediction;
prediction.resize(regression_targets.size());
for(int i = 0;i < regression_targets.size();i++){
prediction[i] = Mat::zeros(mean_shape.rows,2,CV_64FC1);
}
ferns_.resize(second_level_num);
clock_t t = clock();
for(int i = 0;i < second_level_num;i++){
vectordouble > > temp = ferns_[i].Train(densities,covariance,candidate_pixel_locations,nearest_landmark_index,regression_targets,fern_pixel_num);
// update regression targets
for(int j = 0;j < temp.size();j++){
prediction[j] = prediction[j] + temp[j];
//boost? 每次都根据残差修改训练参数,但是没有像adaboost修改样本权重
regression_targets[j] = regression_targets[j] - temp[j];
}
//打印训练时间
if((i+1) % 50 == 0){
cout<<"Fern cascades: "<< curr_level_num << " out of "<< first_level_num<<"; ";
cout<<"Ferns: "<1<<" out of "<double remaining_level_num= (first_level_num - curr_level_num) * 500 + second_level_num - i;
double time_remaining = 0.02 * double(clock() - t) / CLOCKS_PER_SEC * remaining_level_num;
cout<<"Expected remaining time: "
<< (int)time_remaining / 60<<"min "<<(int)time_remaining % 60 <<"s"<return prediction;
}
通过计算协方差求出5对特征点,根据特征点的差值产生一个2^5的分桶,统计每个分桶中落入的shape和ground_truth shape差值,求平均之后就是某个分桶中的shape的更新值。
vectordouble > > Fern::Train(const vector<vector<double> >& candidate_pixel_intensity, //特征点的特征值
const Mat_<double>& covariance, //特征点之间的协方差(找出最有辨别力的特征点)
const Mat_<double>& candidate_pixel_locations, //特征点的坐标(相对坐标)
const Mat_<int>& nearest_landmark_index, //特征点索引
const vectordouble > >& regression_targets, //残差
int fern_pixel_num){ //有效特征对的数量5
fern_pixel_num_ = fern_pixel_num;
landmark_num_ = regression_targets[0].rows;
selected_pixel_index_.create(fern_pixel_num,2); //the index of selected pixels pairs in fern
selected_pixel_locations_.create(fern_pixel_num,4); //the locations of selected pixel pairs stored in the format (x_1,y_1,x_2,y_2) for each row
selected_nearest_landmark_index_.create(fern_pixel_num,2);
int candidate_pixel_num = candidate_pixel_locations.rows;
// select pixel pairs from candidate pixels, this selection is based on the correlation between pixel
// densities and regression targets
// for details, please refer to "Face Alignment by Explicit Shape Regression"
// threshold_: thresholds for each pair of pixels in fern
threshold_.create(fern_pixel_num,1);
// get a random direction
RNG random_generator(getTickCount());
for(int i = 0;i < fern_pixel_num;i++){
Mat_<double> random_direction(landmark_num_ ,2);
random_generator.fill(random_direction,RNG::UNIFORM,-1.1,1.1);
normalize(random_direction,random_direction);
vector<double> projection_result(regression_targets.size(), 0);
// project regression targets along the random direction
// 将regression targets 向随机方向投影
for(int j = 0;j < regression_targets.size();j++){
double temp = 0;
temp = sum(regression_targets[j].mul(random_direction))[0];
projection_result[j] = temp; //随机方向的投影
}
Mat_<double> covariance_projection_density(candidate_pixel_num,1);
// 求随机方向投影和特征点的协方差
for(int j = 0;j < candidate_pixel_num;j++){
covariance_projection_density(j) = calculate_covariance(projection_result,candidate_pixel_intensity[j]);
}
// find max correlation
// 找到方差最大的特征点
double max_correlation = -1;
int max_pixel_index_1 = 0;
int max_pixel_index_2 = 0;
for(int j = 0;j < candidate_pixel_num;j++){
for(int k = 0;k < candidate_pixel_num;k++){
double temp1 = covariance(j,j) + covariance(k,k) - 2*covariance(j,k);
if(abs(temp1) < 1e-10){
continue;
}
bool flag = false;
//???
for(int p = 0;p < i;p++){
if(j == selected_pixel_index_(p,0) && k == selected_pixel_index_(p,1)){
flag = true;
break;
}else if(j == selected_pixel_index_(p,1) && k == selected_pixel_index_(p,0)){
flag = true;
break;
}
}
if(flag){
continue;
}
double temp = (covariance_projection_density(j) - covariance_projection_density(k))
/ sqrt(temp1);
if(abs(temp) > max_correlation){
max_correlation = temp;
max_pixel_index_1 = j;
max_pixel_index_2 = k;
}
}
}
selected_pixel_index_(i,0) = max_pixel_index_1;
selected_pixel_index_(i,1) = max_pixel_index_2;
selected_pixel_locations_(i,0) = candidate_pixel_locations(max_pixel_index_1,0);
selected_pixel_locations_(i,1) = candidate_pixel_locations(max_pixel_index_1,1);
selected_pixel_locations_(i,2) = candidate_pixel_locations(max_pixel_index_2,0);
selected_pixel_locations_(i,3) = candidate_pixel_locations(max_pixel_index_2,1);
selected_nearest_landmark_index_(i,0) = nearest_landmark_index(max_pixel_index_1);
selected_nearest_landmark_index_(i,1) = nearest_landmark_index(max_pixel_index_2);
// get threshold for this pair
double max_diff = -1;
for(int j = 0;j < candidate_pixel_intensity[max_pixel_index_1].size();j++){
double temp = candidate_pixel_intensity[max_pixel_index_1][j] - candidate_pixel_intensity[max_pixel_index_2][j];
if(abs(temp) > max_diff){
max_diff = abs(temp);
}
}
threshold_(i) = random_generator.uniform(-0.2*max_diff,0.2*max_diff);
}
// determine the bins of each shape
// 5个bit的分桶,统计落入每一个分桶的索引
vector<vector<int> > shapes_in_bin;
int bin_num = pow(2.0,fern_pixel_num);
shapes_in_bin.resize(bin_num);
for(int i = 0;i < regression_targets.size();i++){
int index = 0;
for(int j = 0;j < fern_pixel_num;j++){
double density_1 = candidate_pixel_intensity[selected_pixel_index_(j,0)][i];
double density_2 = candidate_pixel_intensity[selected_pixel_index_(j,1)][i];
if(density_1 - density_2 >= threshold_(j)){
index = index + pow(2.0,j);
}
}
shapes_in_bin[index].push_back(i);
}
// get bin output
vectordouble > > prediction;
prediction.resize(regression_targets.size());
bin_output_.resize(bin_num);
for(int i = 0;i < bin_num;i++){ //针对每一个分桶计算prediction[i]
Mat_<double> temp = Mat::zeros(landmark_num_,2, CV_64FC1);
int bin_size = shapes_in_bin[i].size();
//求总的差值
for(int j = 0;j < bin_size;j++){
int index = shapes_in_bin[i][j];
temp = temp + regression_targets[index];
}
if(bin_size == 0){
bin_output_[i] = temp;
continue;
}
// 正则化,防止过拟合
temp = (1.0/((1.0+1000.0/bin_size) * bin_size)) * temp;
bin_output_[i] = temp;
// 对每一个落入分桶中的shape的位移量进行更新
for(int j = 0;j < bin_size;j++){
int index = shapes_in_bin[i][j];
prediction[index] = temp;
}
}
return prediction;
}