Note: Ubuntu has good support for the VLC player, so installing VLC is recommended:
sudo apt install vlc
The Development of Object Detection
The era of detectors based on traditional hand-crafted features: detectors were built on hand-crafted features, and a variety of detection algorithms were designed to compensate for the limited expressive power of those features.
Object detection then went through several milestone advances: bounding-box regression, the rise of deep neural networks, multi-reference windows (Multi-References, also known as anchors), hard-example mining and focusing, and multi-scale, multi-resolution detection.
Current face detection methods fall into two broad categories: knowledge-based and statistics-based.
"Knowledge-based methods mainly use prior knowledge and treat the face as a combination of facial organs, detecting faces from the features of the eyes, eyebrows, mouth, nose, etc., and the geometric relationships among them. Statistics-based methods treat the face as a holistic pattern, a 2-D pixel matrix; they build a face pattern space statistically from a large number of face image samples and decide whether a face is present according to a similarity measure. Many methods have been developed under these two frameworks. As new methods keep being proposed and application conditions change, systems that combine knowledge models with statistical models are likely to become a future research trend." (From the thesis "Research on AdaBoost-based Face Detection and Eye Localization Algorithms".)
Knowledge-based face detection methods
Template matching
Facial features
Shape and edge features
Texture features
Color features
Statistics-based face detection methods
Principal component analysis and eigenfaces
Neural network methods
Support vector machines
Hidden Markov models
The AdaBoost algorithm
Haar classifier = Haar-like features + integral image + AdaBoost + cascade
1. Haar-like feature detection for faces
A face exhibits some characteristic brightness patterns: in a frontal face image the eye region is darker than the cheeks, the lip region is darker than its surroundings, and the nose region is brighter than the cheeks on either side. Based on these observations, Viola and Jones (VJ) used four kinds of rectangle features:
A and B are edge features, C is a line (fine-detail) feature, and D is a diagonal feature.
A Haar feature sums the pixels under the white region and under the black region separately and takes the difference of the two sums; this can be implemented as an image convolution.
Later work introduced additional feature-detection operators.
2. Build the integral image and use it to evaluate many different rectangle features quickly.
For any point in the integral image, the value at that point equals the sum of all pixels above and to the left of it (inclusive):
$s(x,y)=\sum_{x'\le x}\sum_{y'\le y} f(x',y')$
The integral image can be built incrementally with the recurrence:
$s(x,y)=f(x,y)+s(x-1,y)+s(x,y-1)-s(x-1,y-1)$
It also has the following property.
The figure above marks four regions: A, B, C and D.
The integral image value at point 1 is sum(A);
at point 2 it is sum(A+B);
at point 3 it is sum(A+C);
at point 4 it is sum(A+B+C+D);
therefore:
the sum of the gray values of all pixels in region D is:
sum(A+B+C+D) - sum(A+C) - sum(A+B) + sum(A)
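The following is a minimal NumPy sketch of building an integral image and evaluating a rectangle sum with the four corner lookups described above (the array and the example rectangle are illustrative only):

```python
import numpy as np

def integral_image(img):
    # s[y, x] = sum of img[0:y+1, 0:x+1]
    return img.cumsum(axis=0).cumsum(axis=1)

def rect_sum(s, x, y, w, h):
    # sum over the w x h rectangle with top-left corner (x, y), using 4 lookups
    A = s[y - 1, x - 1] if (x > 0 and y > 0) else 0   # sum(A)
    B = s[y - 1, x + w - 1] if y > 0 else 0           # sum(A+B)
    C = s[y + h - 1, x - 1] if x > 0 else 0           # sum(A+C)
    D = s[y + h - 1, x + w - 1]                       # sum(A+B+C+D)
    return D - B - C + A

img = np.arange(36).reshape(6, 6)
s = integral_image(img)
print(rect_sum(s, 2, 1, 3, 2) == img[1:3, 2:5].sum())   # True
# a two-rectangle Haar-like feature is then just rect_sum(white) - rect_sum(black)
```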
3. Extracting image features
Since four kinds of rectangles are used to extract the face features:
| Feature | Minimum pixels to describe it | Configurations |
| --- | --- | --- |
| Two-adjacent rectangles (A, B) | at least 2 pixels | horizontal and vertical variants |
| Three-adjacent rectangles (C) | at least 3 pixels | horizontal and vertical variants |
| Four-adjacent rectangles (D) | at least 4 pixels | only one variant |
From the convolution point of view, convolving a W*H image with an m*n filter (stride 1) produces an image of size (W-m+1)*(H-n+1). Each pixel of the output is the inner product of an m*n local patch of the original image with the m*n filter, so the output has exactly as many pixels as there are m*n rectangles in the original image; for example, a 24*24 image contains (24-2+1)*(24-2+1) = 529 distinct 2*2 rectangles.
4. Training with AdaBoost (selecting the useful features)
Haar feature extraction produces a huge number of features: for a 24*24 image, the two-, three- and four-rectangle features yield 162,336 feature values. Useful features have to be selected out of this enormous pool, so VJ used the AdaBoost algorithm for training and feature selection.
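The number 162,336 can be reproduced by enumerating every position and scale of the five standard VJ feature types (two two-rectangle, two three-rectangle, one four-rectangle) in a 24x24 window; a small sketch:

```python
def count_positions(W, H, w, h):
    # number of placements of a feature whose base cell is w x h,
    # scaled by any integer factor in x and y, inside a W x H window
    total = 0
    for i in range(1, W // w + 1):
        for j in range(1, H // h + 1):
            total += (W - i * w + 1) * (H - j * h + 1)
    return total

bases = [(2, 1), (1, 2), (3, 1), (1, 3), (2, 2)]
print(sum(count_positions(24, 24, w, h) for w, h in bases))   # 162336
```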
AdaBoost is a general classifier-boosting algorithm; the weak classifiers it uses are not tied to any particular algorithm. AdaBoost lets us select better combinations of rectangle features; here each rectangle-feature combination acts as a classifier, and the classifiers are stored in the form of binary decision trees.
The principle of AdaBoost
The AdaBoost procedure used in VJ
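As an illustration of the boosting idea, here is a generic sketch with one-dimensional threshold stumps as weak learners (not the exact VJ procedure, whose weak learners are thresholds on single Haar features):

```python
import numpy as np

def adaboost_train(X, y, n_rounds=10):
    # X: (n_samples, n_features), y: labels in {-1, +1}
    n, d = X.shape
    w = np.full(n, 1.0 / n)                      # sample weights
    ensemble = []                                # (feature, threshold, polarity, alpha)
    for _ in range(n_rounds):
        best = None
        for j in range(d):                       # pick the stump with lowest weighted error
            for thr in np.unique(X[:, j]):
                for pol in (+1, -1):
                    pred = np.where(pol * (X[:, j] - thr) >= 0, 1, -1)
                    err = w[pred != y].sum()
                    if best is None or err < best[0]:
                        best = (err, j, thr, pol)
        err, j, thr, pol = best
        alpha = 0.5 * np.log((1 - err) / max(err, 1e-10))   # weak-classifier weight
        pred = np.where(pol * (X[:, j] - thr) >= 0, 1, -1)
        w *= np.exp(-alpha * y * pred)           # up-weight misclassified samples
        w /= w.sum()
        ensemble.append((j, thr, pol, alpha))
    return ensemble

def adaboost_predict(ensemble, X):
    score = sum(a * np.where(p * (X[:, j] - t) >= 0, 1, -1) for j, t, p, a in ensemble)
    return np.sign(score)
```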
5. Building the cascaded (layered) classifier
/*
* CART classifier
*/
typedef struct CvCARTHaarClassifier
{
CV_INT_HAAR_CLASSIFIER_FIELDS()
int count;
int* compidx;
CvTHaarFeature* feature;
CvFastHaarFeature* fastfeature;
float* threshold;
int* left;
int* right;
float* val;
} CvCARTHaarClassifier;
Producing a strong classifier takes T rounds of boosting; the concrete steps are as follows:
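At detection time, each stage of the cascade applies one such boosted strong classifier with its own threshold; a window is reported as a face only if it passes every stage, so most negative windows are rejected cheaply by the first few stages. A schematic of this evaluation flow (the stage and score functions here are hypothetical placeholders):

```python
def cascade_predict(window, stages):
    # stages: list of (strong_classifier, stage_threshold); each strong classifier
    # returns the weighted sum of its weak-classifier outputs for this window
    for strong_classifier, stage_threshold in stages:
        if strong_classifier(window) < stage_threshold:
            return False        # rejected early; the remaining stages are skipped
    return True                 # survived all stages: report a detection
```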
6. Non-maximum suppression
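Overlapping detections of the same face then need to be merged. Below is a generic greedy IoU-based non-maximum suppression sketch (OpenCV's cascade actually merges detections by neighbor grouping; this is only to illustrate the idea):

```python
import numpy as np

def nms(boxes, scores, iou_thresh=0.3):
    # boxes: (N, 4) array of [x1, y1, x2, y2]; returns indices of the kept boxes
    order = np.argsort(scores)[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]
        xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        area_i = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
        area_r = (boxes[rest, 2] - boxes[rest, 0]) * (boxes[rest, 3] - boxes[rest, 1])
        iou = inter / (area_i + area_r - inter)
        order = rest[iou <= iou_thresh]   # drop boxes that overlap the kept one too much
    return keep
```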
Further reading: the Viola-Jones (VJ) papers and the OpenCV haartraining tool.
The test code below uses the pre-trained face detection cascade models that ship with OpenCV; after installing OpenCV they can be found under /usr/share/opencv/haarcascades.
#include <opencv2/opencv.hpp>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <iostream>
#include <vector>
using namespace cv;
using namespace std;
void facesDetect(Mat in_image,vector<Rect> &rect)
{
CascadeClassifier cascade;
cascade.load("haarcascade_frontalface_default.xml");
    cascade.detectMultiScale(in_image, rect, 1.1, 3, 0); // run the cascade detector
    //printf("number of faces detected: %zu\n", rect.size());
    //for (size_t i = 0; i < rect.size(); i++)
    //{
    //    rectangle(in_image, Rect(rect[i].x, rect[i].y, rect[i].width, rect[i].height),
    //              Scalar(255, 0, 0), 2, 1.0);
    //}
}
void eyesDetect(Mat in_image,Mat &out_image)
{
vector<Rect> tmp_rect;
facesDetect(in_image,tmp_rect);
CascadeClassifier cascade;
cascade.load("haarcascade_eye.xml");
for(int i=0;i<tmp_rect.size();i++)
{
vector<Rect> rect;
Mat tmp_mat=in_image(tmp_rect[i]);
// imshow("face",tmp_mat);
// waitKey(1);
        cascade.detectMultiScale(tmp_mat, rect, 1.1, 3, 0);// run the eye cascade on the face ROI
if(rect.size() ==2)
{
rectangle(in_image,Rect(tmp_rect[i].x,tmp_rect[i].y,tmp_rect[i].width,tmp_rect[i].height), \
Scalar(255,0,0),2,1.0);
for(int j=0;j<rect.size();j++)
{
Point center;
int radius;
center.x = cvRound((tmp_rect[i].x+rect[j].x + rect[j].width * 0.5));
center.y = cvRound((tmp_rect[i].y+rect[j].y + rect[j].height * 0.5));
radius = cvRound((rect[j].width + rect[j].height) *0.5);
circle(in_image, center, radius,Scalar(255,255,0) , 2);
}
}
        // printf("face %d has %zu candidate eyes\n", i, rect.size());
}
out_image=in_image.clone();
}
void smilesDetect(Mat in_image,Mat &out_image)
{
vector<Rect> tmp_rect;
CascadeClassifier cascade;
cascade.load("haarcascade_smile.xml");
    cascade.detectMultiScale(in_image, tmp_rect, 1.1, 3, 0);// run the smile cascade
CascadeClassifier cascade2;
cascade2.load("haarcascade_eye.xml");
for(int i=0;i<tmp_rect.size();i++)
{
vector<Rect> rect;
Mat tmp_mat=in_image(tmp_rect[i]);
// imshow("face",tmp_mat);
// waitKey(1);
        cascade2.detectMultiScale(tmp_mat, rect, 1.1, 3, 0);// run the eye cascade inside each candidate region
if(rect.size() ==2)
{
rectangle(in_image,Rect(tmp_rect[i].x,tmp_rect[i].y,tmp_rect[i].width,tmp_rect[i].height), \
Scalar(255,0,0),2,1.0);
for(int j=0;j<rect.size();j++)
{
Point center;
int radius;
center.x = cvRound((tmp_rect[i].x+rect[j].x + rect[j].width * 0.5));
center.y = cvRound((tmp_rect[i].y+rect[j].y + rect[j].height * 0.5));
radius = cvRound((rect[j].width + rect[j].height) *0.5);
circle(in_image, center, radius,Scalar(255,255,0) , 2);
}
}
        // printf("region %d has %zu candidate eyes\n", i, rect.size());
}
out_image=in_image.clone();
}
int main()
{
Mat srcImg, dstImg, grayImg,smileImg;
VideoCapture capture(0);
while(1)
{
capture >>srcImg;
        //optional resizing
        // resize(srcImg, srcImg, Size(srcImg.cols/4, srcImg.rows/4), 0, 0, INTER_LINEAR); //bilinear interpolation
        // imshow("source", srcImg);
grayImg.create(srcImg.size(), srcImg.type());
        cvtColor(srcImg, grayImg, COLOR_BGR2GRAY);
eyesDetect(grayImg,dstImg);
smilesDetect(grayImg,smileImg);
        //5. display the results
        imshow("detection result", dstImg);
imshow("smile",smileImg);
waitKey(1);
}
}
Essential reading on face detection
1. Introduction to HOG
2. How the HOG operator extracts features
Overview of HOG features
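With the parameters used in the code below (64x128 detection window, 16x16 blocks, 8x8 block stride, 8x8 cells, 9 orientation bins), the HOG descriptor length works out to 3780, which is also where the 3780 in the later training code comes from:

```python
def hog_dim(win=(64, 128), block=(16, 16), stride=(8, 8), cell=(8, 8), nbins=9):
    blocks_x = (win[0] - block[0]) // stride[0] + 1                  # 7
    blocks_y = (win[1] - block[1]) // stride[1] + 1                  # 15
    cells_per_block = (block[0] // cell[0]) * (block[1] // cell[1])  # 4
    return blocks_x * blocks_y * cells_per_block * nbins

print(hog_dim())   # 7 * 15 * 4 * 9 = 3780
```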
#include <opencv2/opencv.hpp>
#include <vector>
int main()
{
cv::Mat img;
cv::VideoCapture capture(0);
cv::HOGDescriptor hog(cv::Size(64, 128), cv::Size(16, 16), cv::Size(8, 8), cv::Size(8, 8), 9);
hog.setSVMDetector(cv::HOGDescriptor::getDefaultPeopleDetector());
std::vector<cv::Rect> regions;
while(1)
{
capture>>img;
hog.detectMultiScale(img, regions, 0, cv::Size(8, 8), cv::Size(32, 32), 1.05, 1);
        // draw the detections
for (size_t i = 0;i < regions.size();i++)
{
cv::rectangle(img, regions[i], cv::Scalar(0, 0, 255), 2);
}
        cv::imshow("pedestrian detection", img);
cv::waitKey(1);
}
return 0;
}
Training + detection
#include <opencv2/opencv.hpp>
#include <opencv2/ml.hpp>
#include <cctype>
#include <cfloat>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
using namespace cv;
using namespace cv::ml;   // SVM, ROW_SAMPLE (OpenCV 3.x ml module)
#define FILEPATH "./"     // data directory; not part of the original snippet, adjust to your own paths
///////////// HOG + SVM, approach 2 /////////////
void Train()
{
    /////////////// read training sample image paths and labels ///////////////
    //image paths and labels
vector<string> imagePath;
vector<int> imageClass;
int numberOfLine = 0;
string buffer;
ifstream trainingData(string(FILEPATH)+"TrainData.txt");
unsigned long n;
while (!trainingData.eof())
{
getline(trainingData, buffer);
if (!buffer.empty())
{
++numberOfLine;
if (numberOfLine % 2 == 0)
{
                //read the sample label
imageClass.push_back(atoi(buffer.c_str()));
}
else
{
                //read the image path
imagePath.push_back(buffer);
}
}
}
    //close the file
trainingData.close();
    /////////////// compute HOG features of the samples ///////////////
    //feature matrix of the samples
int numberOfSample = numberOfLine / 2;
    Mat featureVectorOfSample(numberOfSample, 3780, CV_32FC1);//one sample per row
    //sample labels
Mat classOfSample(numberOfSample, 1, CV_32SC1);
Mat convertedImage;
Mat trainImage;
    // compute the HOG features
for (vector<string>::size_type i = 0; i <= imagePath.size() - 1; ++i)
{
        //load the image
Mat src = imread(imagePath[i], -1);
if (src.empty())
{
cout << "can not load the image:" << imagePath[i] << endl;
continue;
}
//cout << "processing:" << imagePath[i] << endl;
        // resize to the canonical detection window
resize(src, trainImage, Size(64, 128));
        // extract the HOG descriptor
        HOGDescriptor hog(Size(64, 128), Size(16, 16), Size(8, 8), Size(8, 8), 9);
vector<float> descriptors;
double time1 = getTickCount();
        hog.compute(trainImage, descriptors);//winStride can be set here if the image is larger than 64x128
double time2 = getTickCount();
double elapse_ms = (time2 - time1) * 1000 / getTickFrequency();
//cout << "HOG dimensions:" << descriptors.size() << endl;
//cout << "Compute time:" << elapse_ms << endl;
        //store into the feature matrix
for (vector<float>::size_type j = 0; j <= descriptors.size() - 1; ++j)
{
featureVectorOfSample.at<float>(i, j) = descriptors[j];
}
        //store the label into the label matrix
        //!! note: the label type must be int
classOfSample.at<int>(i, 0) = imageClass[i];
}
    /////////////// train the SVM classifier ///////////////
    //set the parameters; note the use of Ptr
Ptr<SVM> svm = SVM::create();
svm->setType(SVM::C_SVC);
    svm->setKernel(SVM::LINEAR);//must be a linear SVM, because HOGDescriptor's detection function only supports linear detectors!
    svm->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, 1000, FLT_EPSILON));
    //train the SVM
svm->train(featureVectorOfSample, ROW_SAMPLE, classOfSample);
    //save the classifier (it contains the SVM parameters, support vectors, alpha and rho)
svm->save(string(FILEPATH) + "Classifier.xml");
    /*
    The XML file produced after SVM training contains an array of support vectors, an array called alpha, and a floating-point value rho.
    Multiply the alpha matrix by the support vectors: alpha*supportVector gives a row vector; multiply that row vector by -1
    and then append rho as its last element. The result is a detector that can directly replace OpenCV's default
    pedestrian detector via cv::HOGDescriptor::setSVMDetector().
    */
    //get the support vectors: the matrix is CV_32F by default
Mat supportVector = svm->getSupportVectors();//
    //get alpha and rho
    Mat alpha;//the Lagrange multiplier alpha of each support vector, float64 by default
    Mat svIndex;//indices of the support vectors
float rho = svm->getDecisionFunction(0, alpha, svIndex);
    //convert the type: it must be converted to 32-bit float
Mat alpha2;
alpha.convertTo(alpha2, CV_32FC1);
    //result matrix: product of the two matrices
Mat result(1, 3780, CV_32FC1);
result = alpha2*supportVector;
    //multiply by -1. Why?
    //because svm.predict evaluates alpha*sv*x - rho and treats a negative value as a positive sample, while the HOG detection function uses rho + alpha*sv*x with the sign flipped
for (int i = 0; i < 3780; ++i)
result.at<float>(0, i) *= -1;
    //save the detector to a file for use by HOG detection
    //these are the actual decision-function parameters (w); HOG can use them directly for detection
FILE *fp = fopen((string(FILEPATH) + "HOG_SVM.txt").c_str(), "wb");
for (int i = 0; i<3780; i++)
{
fprintf(fp, "%f \n", result.at<float>(0,i));
}
fprintf(fp, "%f", rho);
fclose(fp);
}
// detection with the trained classifier
void Detect()
{
Mat img;
FILE* f = 0;
char _filename[1024];
    // path of the file listing the test images
f = fopen((string(FILEPATH) + "TestData.txt").c_str(), "rt");
if (!f)
{
fprintf(stderr, "ERROR: the specified file could not be loaded\n");
return;
}
    //load the trained decision-function parameters (note: not the classifier saved by svm->save)
vector<float> detector;
ifstream fileIn(string(FILEPATH) + "HOG_SVM.txt", ios::in);
float val = 0.0f;
while (!fileIn.eof())
{
fileIn >> val;
detector.push_back(val);
}
fileIn.close();
    //set up HOG
HOGDescriptor hog;
    hog.setSVMDetector(detector);// use our own trained detector
    //hog.setSVMDetector(HOGDescriptor::getDefaultPeopleDetector());// or use the detector trained in the CVPR'05 paper, which makes the Train() step unnecessary
namedWindow("people detector", 1);
    // detect on the images
for (;;)
{
        // read the file name
char* filename = _filename;
if (f)
{
if (!fgets(filename, (int)sizeof(_filename)-2, f))
break;
//while(*filename && isspace(*filename))
// ++filename;
if (filename[0] == '#')
continue;
            //strip trailing whitespace
int l = (int)strlen(filename);
while (l > 0 && isspace(filename[l - 1]))
--l;
filename[l] = '\0';
img = imread(filename);
}
printf("%s:\n", filename);
if (!img.data)
continue;
fflush(stdout);
vector<Rect> found, found_filtered;
double t = (double)getTickCount();
// run the detector with default parameters. to get a higher hit-rate
// (and more false alarms, respectively), decrease the hitThreshold and
// groupThreshold (set groupThreshold to 0 to turn off the grouping completely).
        //multi-scale detection
hog.detectMultiScale(img, found, 0, Size(8, 8), Size(32, 32), 1.05, 2);
t = (double)getTickCount() - t;
printf("detection time = %gms\n", t*1000. / cv::getTickFrequency());
size_t i, j;
        //remove rectangles that are fully contained in another one, keeping the larger
for (i = 0; i < found.size(); i++)
{
Rect r = found[i];
for (j = 0; j < found.size(); j++)
if (j != i && (r & found[j]) == r)
break;
if (j == found.size())
found_filtered.push_back(r);
}
        // shrink the rectangles slightly
for (i = 0; i < found_filtered.size(); i++)
{
Rect r = found_filtered[i];
// the HOG detector returns slightly larger rectangles than the real objects.
// so we slightly shrink the rectangles to get a nicer output.
r.x += cvRound(r.width*0.1);
r.width = cvRound(r.width*0.8);
r.y += cvRound(r.height*0.07);
r.height = cvRound(r.height*0.8);
rectangle(img, r.tl(), r.br(), cv::Scalar(0, 255, 0), 3);
}
imshow("people detector", img);
int c = waitKey(0) & 255;
if (c == 'q' || c == 'Q' || !f)
break;
}
if (f)
fclose(f);
return;
}
void HOG_SVM2()
{
    //when using the default CVPR'05 detector, Train() is unnecessary and Detect() can be used directly
Train();
Detect();
}
///////////// HOG + SVM, approach 1 /////////////
void HOG_SVM1()
{
    /////////////// read training sample image paths and labels ///////////////
    //image paths and labels
vector<string> imagePath;
vector<int> imageClass;
int numberOfLine = 0;
string buffer;
ifstream trainingData(string(FILEPATH) + "TrainData.txt", ios::in);
unsigned long n;
while (!trainingData.eof())
{
getline(trainingData, buffer);
if (!buffer.empty())
{
++numberOfLine;
if (numberOfLine % 2 == 0)
{
                //read the sample label
imageClass.push_back(atoi(buffer.c_str()));
}
else
{
                //read the image path
imagePath.push_back(buffer);
}
}
}
trainingData.close();
    /////////////// compute HOG features of the samples ///////////////
    //feature matrix of the samples
int numberOfSample = numberOfLine / 2;
    Mat featureVectorOfSample(numberOfSample, 3780, CV_32FC1);//one sample per row
    //sample labels
Mat classOfSample(numberOfSample, 1, CV_32SC1);
    //compute the HOG features of the training samples
for (vector<string>::size_type i = 0; i <= imagePath.size() - 1; ++i)
{
        //load the image
Mat src = imread(imagePath[i], -1);
if (src.empty())
{
cout << "can not load the image:" << imagePath[i] << endl;
continue;
}
cout << "processing" << imagePath[i] << endl;
        //resize
Mat trainImage;
resize(src, trainImage, Size(64, 128));
        //extract the HOG descriptor
HOGDescriptor hog(Size(64, 128), Size(16, 16), Size(8, 8), Size(8, 8), 9);
vector<float> descriptors;
        hog.compute(trainImage, descriptors);//winStride can be set here if the image is larger than 64x128
cout << "HOG dimensions:" << descriptors.size() << endl;
        //store into the feature matrix
for (vector<float>::size_type j = 0; j <= descriptors.size() - 1; ++j)
{
featureVectorOfSample.at<float>(i, j) = descriptors[j];
}
        //store the label into the label matrix
        //!! note: the label type must be int
classOfSample.at<int>(i, 0) = imageClass[i];
}
    /////////////// train the SVM classifier ///////////////
    //set the parameters
    //following the OpenCV 3.0 demo
Ptr<SVM> svm = SVM::create();
svm->setKernel(SVM::RBF);
svm->setType(SVM::C_SVC);
svm->setC(10);
svm->setCoef0(1.0);
svm->setP(1.0);
svm->setNu(0.5);
    svm->setTermCriteria(TermCriteria(TermCriteria::EPS, 1000, FLT_EPSILON));
    //train the SVM
svm->train(featureVectorOfSample, ROW_SAMPLE, classOfSample);
    //save the classifier
svm->save("Classifier.xml");
    /////////////// classify with the trained classifier ///////////////
vector<string> testImagePath;
    ifstream testData(string(FILEPATH) + "TestData.txt", ios::in);
while (!testData.eof())
{
getline(testData, buffer);
        //read a test image path
if (!buffer.empty())
testImagePath.push_back(buffer);
}
testData.close();
    ofstream fileOfPredictResult(string(FILEPATH) + "PredictResult.txt"); //file for the final prediction results
for (vector<string>::size_type i = 0; i <= testImagePath.size() - 1; ++i)
{
        //load the test image
Mat src = imread(testImagePath[i], -1);
if (src.empty())
{
cout << "Can not load the image:" << testImagePath[i] << endl;
continue;
}
        //resize
Mat testImage;
resize(src, testImage, Size(64, 64));
        //extract the HOG descriptor of the test image
        HOGDescriptor hog(Size(64, 64), Size(16, 16), Size(8, 8), Size(8, 8), 9);
vector<float> descriptors;
hog.compute(testImage, descriptors);
cout << "HOG dimensions:" << descriptors.size() << endl;
Mat featureVectorOfTestImage(1, descriptors.size(), CV_32FC1);
for (int j = 0; j <= descriptors.size() - 1; ++j)
{
featureVectorOfTestImage.at<float>(0, j) = descriptors[j];
}
        //classify the test image and write the result to the file
int predictResult = svm->predict(featureVectorOfTestImage);
char line[512];
//printf("%s %d\r\n", testImagePath[i].c_str(), predictResult);
std::sprintf(line, "%s %d\n", testImagePath[i].c_str(), predictResult);
fileOfPredictResult << line;
}
fileOfPredictResult.close();
}
int main()
{
    //HOG+SVM approach 1: directly outputs the class label
HOG_SVM1();
    //HOG+SVM approach 2: outputs rectangles around the detected objects in the image
//HOG_SVM2();
}
Link to the original article
Python version
HOG feature extraction
HOG has a known weakness: it copes poorly with occlusion, and detection also becomes difficult when body poses vary strongly or the object's orientation changes.
DPM = HOG + SVM + sliding window
The idea of DPM: given an input image, extract image features, build a response (filter) template for the object of interest, convolve it over the image to obtain a response map, and determine the object location from the distribution of responses.
Building such a response template amounts to hand-designing a convolution kernel, a fairly elaborate one; convolving this kernel with the image yields a response map. For example, take a kernel formed from the HOG features of an upright, standing person and convolve it with the gradient image: the target region will then respond strongly.
This raises a problem: a person can appear in many poses, lying down, crouching, sitting and so on, so a single template built from an upright standing person will fail. In other words, objects in images deform, and a fixed template is no longer adequate. The solution is part templates: instead of one template for the whole person, we build templates for the person's parts, such as the head, arms and legs. That, in essence, is the DPM algorithm.
To summarize: a single HOG template is rigid; it expects a person in a specific pose such as standing, with limited articulation. DPM instead detects the whole person first, then the parts, and combines both sources of evidence to make the decision.
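Schematically, the DPM score at a root location is the root-filter response plus, for each part, the best part response in a neighbourhood minus a deformation penalty for drifting away from the part's anchor. A brute-force sketch of this scoring rule (the response maps, anchors and deformation weights are placeholders; the real implementation uses HOG pyramids and generalized distance transforms, and evaluates parts at twice the root resolution):

```python
import numpy as np

def dpm_score(root_resp, part_resps, anchors, deform, y, x, radius=4):
    # root_resp: 2-D root filter response map
    # part_resps: list of 2-D part filter response maps
    # anchors: per-part (dy, dx) anchor offsets relative to the root location
    # deform: per-part (wy, wx) quadratic deformation weights
    H, W = root_resp.shape
    score = root_resp[y, x]
    for resp, (ay, ax), (wy, wx) in zip(part_resps, anchors, deform):
        best = -np.inf
        for py in range(max(0, y + ay - radius), min(H, y + ay + radius + 1)):
            for px in range(max(0, x + ax - radius), min(W, x + ax + radius + 1)):
                penalty = wy * (py - (y + ay)) ** 2 + wx * (px - (x + ax)) ** 2
                best = max(best, resp[py, px] - penalty)   # best displaced part placement
        score += best
    return score
```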
The OpenCV contrib modules include a DPM module, containing DPM training code, a usage demo, and pre-trained DPM model files.
#include <opencv2/dpm.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/videoio.hpp>
#include <iostream>
#include <string>
#include <vector>
using namespace cv;
using namespace cv::dpm;
using namespace std;
const string model_path = "inriaperson.xml";
int main(int argc, char** argv)
{
cv::Mat img;
cv::VideoCapture capture(0);
#ifdef HAVE_TBB
cout << "Running with TBB" << endl;
#else
#ifdef _OPENMP
cout << "Running with OpenMP" << endl;
#else
cout << "Running without OpenMP and without TBB" << endl;
#endif
#endif
cv::Ptr<DPMDetector> detector = \
DPMDetector::create(vector<string>(1, model_path));
vector<DPMDetector::ObjectDetection> ds;
Scalar color(0, 255, 255); // yellow
Mat frame;
while(1)
{
capture>>frame;
cv::Mat image;
image=frame.clone();
double t = (double)getTickCount();
// detection
detector->detect(image, ds);
// compute frame per second (fps)
t = ((double)getTickCount() - t) / getTickFrequency();//elapsed time
cout << t <<" " <<ds.size() << endl;
// draw boxes
string text = format("%0.1f fps", 1.0 / t);
for (unsigned int i = 0; i < ds.size(); i++)
{
rectangle(frame, ds[i].rect, color, 2);
}
// draw text on image
Scalar textColor(0, 0, 250);
putText(frame, text, Point(10, 50), FONT_HERSHEY_PLAIN, 2, textColor, 2);
imshow("cut", frame);//注意需要加waitkey,否则会只显示最后切割的图片
waitKey(1);
}
return 0;
}
AlexNet
Environments
Prepare pytorch-summary
pytorch-summary is a very useful tool for understanding a model's structure; for example, it can print the output dimensions of each layer.
Clone the repository and cd into its directory.
git clone https://github.com/sksq96/pytorch-summary
python setup.py build
python setup.py install
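After installation, printing a model summary is a single call; for example, with the AlexNet defined in model.py below (a minimal sketch):

```python
from torchsummary import summary
from model import AlexNet        # the AlexNet defined in model.py below

net = AlexNet(num_classes=5)
summary(net, (3, 224, 224), device="cpu")   # prints per-layer output shapes and parameter counts
```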
model.py
import torch.nn as nn
import torch
'''
Use nn.Sequential to pack a sequence of layers into a single module.
'''
class AlexNet(nn.Module):
def __init__(self, num_classes=1000, init_weights=False):
super(AlexNet, self).__init__()
        # convolutional feature extractor
self.features = nn.Sequential(
nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=2), # input[3, 224, 224] output[48, 55, 55]
            nn.ReLU(inplace=True),  # inplace=True overwrites the input tensor to save memory
nn.MaxPool2d(kernel_size=3, stride=2), # output[48, 27, 27]
nn.Conv2d(48, 128, kernel_size=5, padding=2), # output[128, 27, 27]
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2), # output[128, 13, 13]
nn.Conv2d(128, 192, kernel_size=3, padding=1), # output[192, 13, 13]
nn.ReLU(inplace=True),
nn.Conv2d(192, 192, kernel_size=3, padding=1), # output[192, 13, 13]
nn.ReLU(inplace=True),
nn.Conv2d(192, 128, kernel_size=3, padding=1), # output[128, 13, 13]
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2), # output[128, 6, 6]
)
        # classifier head; Dropout between the fully connected layers helps prevent overfitting
self.classifier = nn.Sequential(
nn.Dropout(p=0.5),
nn.Linear(128 * 6 * 6, 2048),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Linear(2048, 2048),
nn.ReLU(inplace=True),
nn.Linear(2048, num_classes),
)
if init_weights:
self._initialize_weights()
def forward(self, x):
x = self.features(x)
        x = torch.flatten(x, start_dim=1)  # flatten from dim 1 onward (dim 0 is the batch dimension)
x = self.classifier(x)
return x
def _initialize_weights(self):
        for m in self.modules():  # iterate over all modules in the network
            if isinstance(m, nn.Conv2d):  # check the layer type, here: is it a conv layer?
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:  # if the layer has a bias term, zero-initialize it
nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):  # fully connected layer
                nn.init.normal_(m.weight, 0, 0.01)  # initialize weights from a normal distribution
nn.init.constant_(m.bias, 0)
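A quick sanity check of the forward pass (a small sketch; the training script below uses num_classes=5 for the flower dataset):

```python
import torch
from model import AlexNet

net = AlexNet(num_classes=5, init_weights=True)
x = torch.randn(1, 3, 224, 224)    # dummy batch of one RGB image
print(net(x).shape)                # torch.Size([1, 5])
```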
train.py
# coding=UTF-8
import torch
import torch.nn as nn
from torchvision import transforms, datasets, utils
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim
from model import AlexNet
import os
import json
import time
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# data preprocessing: define the data_transform dict
data_transform = {
"train": transforms.Compose([transforms.RandomResizedCrop(224), # 随机裁剪,裁剪到224*224
transforms.RandomHorizontalFlip(), # 水平方向随机翻转
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),
"val": transforms.Compose([transforms.Resize((224, 224)), # cannot 224, must (224, 224)
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])}
# os.getcwd() returns the current working directory; ".." is its parent, "../.." the grandparent
data_root = os.path.abspath(os.path.join(os.getcwd(), "..")) # get data root path
image_path = data_root + "/alexnet/flower_data" # flower data set path
train_dataset = datasets.ImageFolder(root=image_path + "/train",
transform=data_transform["train"])
train_num = len(train_dataset)
flower_list = train_dataset.class_to_idx  # class-name-to-index mapping, i.e. {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
cla_dict = dict((val, key) for key, val in flower_list.items())  # invert the dict so that key becomes 0 and value becomes daisy
# inverting the mapping lets us look up the class name directly from the predicted index
# write dict into json file
json_str = json.dumps(cla_dict, indent=4)
with open('class_indices.json', 'w') as json_file:  # save to a json file
json_file.write(json_str)
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_dataset,
batch_size=batch_size, shuffle=True,
num_workers=0)
validate_dataset = datasets.ImageFolder(root=image_path + "/val",
transform=data_transform["val"])
val_num = len(validate_dataset)
validate_loader = torch.utils.data.DataLoader(validate_dataset,
batch_size=batch_size, shuffle=False,
num_workers=0)
# test_data_iter = iter(validate_loader)
# test_image, test_label = test_data_iter.next()
#
# def imshow(img):
# img = img / 2 + 0.5 # unnormalize
# npimg = img.numpy()
# plt.imshow(np.transpose(npimg, (1, 2, 0)))
# plt.show()
#
# print(' '.join('%5s' % flower_list[test_label[j]] for j in range(4)))
# imshow(utils.make_grid(test_image))
net = AlexNet(num_classes=5, init_weights=True)
net.to(device)
loss_function = nn.CrossEntropyLoss()
pata = list(net.parameters())  # inspect the network parameters
optimizer = optim.Adam(net.parameters(), lr=0.0002)
save_path = './AlexNet.pth'
best_acc = 0.0
for epoch in range(10):
# train
    net.train()  # enable dropout during training
running_loss = 0.0
    t1 = time.perf_counter()  # time one training epoch
for step, data in enumerate(train_loader, start=0):
images, labels = data
optimizer.zero_grad()
outputs = net(images.to(device))
loss = loss_function(outputs, labels.to(device))
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
# print train process
rate = (step + 1) / len(train_loader)
a = "*" * int(rate * 50)
b = "." * int((1 - rate) * 50)
print("\rtrain loss: {:^3.0f}%[{}->{}]{:.3f}".format(int(rate * 100), a, b, loss))
print()
print(time.perf_counter()-t1)
# validate
    net.eval()  # disable dropout during evaluation
acc = 0.0 # accumulate accurate number / epoch
with torch.no_grad():
for data_test in validate_loader:
test_images, test_labels = data_test
outputs = net(test_images.to(device))
predict_y = torch.max(outputs, dim=1)[1]
acc += (predict_y == test_labels.to(device)).sum().item()
accurate_test = acc / val_num
if accurate_test > best_acc:
best_acc = accurate_test
torch.save(net.state_dict(), save_path)
print('[epoch %d] train_loss: %.3f test_accuracy: %.3f' %
(epoch + 1, running_loss / step, acc / val_num))
print('Finished Training')
predict.py
import torch
from model import AlexNet
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
import json
data_transform = transforms.Compose(
[transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
# load image
img = Image.open("../tulips.jpg")
plt.imshow(img)
# [N, C, H, W]
img = data_transform(img)  # preprocessing already moves the channel dimension to the front
# expand batch dimension
img = torch.unsqueeze(img, dim=0)  # add a batch dimension at the front
# read class_indict
try:
json_file = open('./class_indices.json', 'r')
class_indict = json.load(json_file)
except Exception as e:
print(e)
exit(-1)
# create model
model = AlexNet(num_classes=5)
# load model weights
model_weight_path = "./AlexNet.pth"
model.load_state_dict(torch.load(model_weight_path))
model.eval()
with torch.no_grad():
# predict class
    output = torch.squeeze(model(img))     # squeeze away the batch dimension
    predict = torch.softmax(output, dim=0)  # convert to a probability distribution
    predict_cla = torch.argmax(predict).numpy()  # index of the highest probability
print(class_indict[str(predict_cla)], predict[predict_cla].item())
plt.show()
split_data.py
import os
from shutil import copy
import random
def mkfile(file):
if not os.path.exists(file):
os.makedirs(file)
file = 'flower_data'
flower_class = [cla for cla in os.listdir(file) if ".txt" not in cla]
mkfile('flower_data/train')
for cla in flower_class:
mkfile('flower_data/train/'+cla)
mkfile('flower_data/val')
for cla in flower_class:
mkfile('flower_data/val/'+cla)
split_rate = 0.1
for cla in flower_class:
cla_path = file + '/' + cla + '/'
images = os.listdir(cla_path)
num = len(images)
eval_index = random.sample(images, k=int(num*split_rate))
for index, image in enumerate(images):
if image in eval_index:
image_path = cla_path + image
new_path = 'flower_data/val/' + cla
copy(image_path, new_path)
else:
image_path = cla_path + image
new_path = 'flower_data/train/' + cla
copy(image_path, new_path)
print("\r[{}] processing [{}/{}]".format(cla, index+1, num), end="")
print()
print("processing done!")
alexnet.py
import torch
from torch import nn
from torch.nn import functional as F
import torchvision
def main():
print('cuda device count: ', torch.cuda.device_count())
net = torchvision.models.alexnet(pretrained=True)
#net.fc = nn.Linear(512, 2)
net.eval()
net = net.to('cuda:0')
print(net)
tmp = torch.ones(2, 3, 224, 224).to('cuda:0')
out = net(tmp)
print('alexnet out:', out.shape)
torch.save(net, "alexnet.pth")
if __name__ == '__main__':
main()
inference.py
import torch
from torch import nn
import torchvision
import os
import struct
from torchsummary import summary
def main():
print('cuda device count: ', torch.cuda.device_count())
net = torch.load('alexnet.pth')
net = net.to('cuda:0')
net.eval()
print('model: ', net)
#print('state dict: ', net.state_dict().keys())
tmp = torch.ones(1, 3, 224, 224).to('cuda:0')
print('input: ', tmp)
out = net(tmp)
#for l in list(net.classifier.modules())[1:]:
# print('-', l)
print('output:', out)
summary(net, (3, 224, 224))
f = open("alexnet.wts", 'w')
f.write("{}\n".format(len(net.state_dict().keys())))
for k,v in net.state_dict().items():
print('key: ', k)
print('value: ', v.shape)
vr = v.reshape(-1).cpu().numpy()
f.write("{} {}".format(k, len(vr)))
for vv in vr:
f.write(" ")
f.write(struct.pack(">f", float(vv)).hex())
f.write("\n")
if __name__ == '__main__':
main()
run
cd alexnet
python alexnet.py     # run inference and save the model to alexnet.pth first
python inference.py   # then run inference again and export the weights file alexnet.wts
Introduction to OverFeat
OverFeat
(1) FCN: fully convolutional network
(2) offset max-pooling
Introduction to SSD
The core design idea of SSD is to extract features from multiple layers of the network and perform bounding-box regression and classification on each of these feature maps. It has the following characteristics: