摘要:
本文主要关注opencv常规版和cuda版的模板匹配算法,网上cuda版的资料不多,这里做个记录,以后用到也好有个参考。
@[toc]
1. opencv cuda版配置
opencv cuda版需要自己用cmake编译,编译过程并不复杂:先用cmake生成vs项目,再用vs编译出opencv_worldXXX.dll。编译过程可参考link1、link2。
-
编译后的opencv 目录
-
vs项目配置:包含目录、库目录、链接器
注:这里的opencv_world341.lib对应的opencv_world341.dll有两种办法加入到项目中,一是:在系统环境变量中添加它的路径,二是:把它直接复制到项目.exe所在路径。如果有第三种方法,比如在vs项目的什么地方添加一下什么路径就行的话,还请告知,因为我总觉得这两者都不怎么高效,特别是需要移植到不同计算机上时。
2. 源码
- 包含cpu、gpu版的模板匹配算法demo示例,还有一个速度对比测试。
#include <chrono>
#include <cstdio>
#include <ctime>
#include <iostream>
#include <map>
#include <string>

#include <opencv2/opencv.hpp>
#include <opencv2/cudaimgproc.hpp>
/**
 * @brief Template matching on the GPU via the OpenCV CUDA module.
 *
 * Uploads both images to device memory, runs cv::cuda::TemplateMatching with
 * the method selected by @p mode, downloads the response map into @p result
 * and reports the best score together with its location.
 *
 * @param srcImage  Image to search in.
 * @param tempImage Template to look for; must not be larger than @p srcImage.
 * @param result    Receives the response map downloaded from the device.
 * @param matchVal  Receives the best score: the minimum of @p result for the
 *                  two SQDIFF modes, the maximum for every other mode.
 * @param matchLoc  Receives the position of that best score inside @p result.
 * @param mode      0 = TM_SQDIFF, 1 = TM_SQDIFF_NORMED, 2 = TM_CCORR,
 *                  3 = TM_CCORR_NORMED, 4 = TM_CCOEFF, 5 = TM_CCOEFF_NORMED.
 *                  Any other value falls back to TM_CCOEFF_NORMED.
 * @return 0 on success, -1 when an input is empty or the template does not
 *         fit inside the source image.
 */
int gpuTemplateMatch(const cv::Mat &srcImage, const cv::Mat &tempImage, cv::Mat &result,
                     double &matchVal, cv::Point &matchLoc, int mode)
{
    if (srcImage.empty() || tempImage.empty())
    {
        std::cout << "ERROR:In function gpuTemplateMatch: input image is empty! \n";
        return -1;
    }
    // Same precondition the CPU variant enforces: the template has to fit
    // inside the searched image, otherwise the response map has no valid size.
    if (tempImage.cols > srcImage.cols || tempImage.rows > srcImage.rows)
    {
        std::cout << "ERROR:In function gpuTemplateMatch: source image's size should not be smaller than template's \n";
        return -1;
    }

    // Translate the numeric mode into an OpenCV matching method.
    int method = cv::TM_CCOEFF_NORMED; // fallback for unknown modes
    switch (mode)
    {
    case 0: // R = sum (t-Roi)^2
        method = cv::TM_SQDIFF;
        break;
    case 1: // R = sum (t-Roi)^2 / sqrt(sum t^2 * sum Roi^2)
        method = cv::TM_SQDIFF_NORMED;
        break;
    case 2: // R = sum t*Roi
        method = cv::TM_CCORR;
        break;
    case 3: // R = sum t*Roi / sqrt(sum t^2 * sum Roi^2)
        method = cv::TM_CCORR_NORMED;
        break;
    case 4: // R = sum t1*Roi1, with t1 = t - t_mean, Roi1 = Roi - Roi_mean
        method = cv::TM_CCOEFF;
        break;
    case 5: // R = sum t1*Roi1 / sqrt(sum t1^2 * sum Roi1^2)
        method = cv::TM_CCOEFF_NORMED;
        break;
    default:
        break;
    }

    cv::cuda::GpuMat d_srcImage;
    cv::cuda::GpuMat d_tempImage;
    cv::cuda::GpuMat d_result;
    d_srcImage.upload(srcImage);
    d_tempImage.upload(tempImage);

    cv::Ptr<cv::cuda::TemplateMatching> alg =
        cv::cuda::createTemplateMatching(srcImage.type(), method);
    alg->match(d_srcImage, d_tempImage, d_result);
    d_result.download(result);

    // Squared-difference scores are best when smallest; correlation scores
    // are best when largest.
    const bool bestIsMinimum = (method == cv::TM_SQDIFF || method == cv::TM_SQDIFF_NORMED);
    if (bestIsMinimum)
    {
        cv::minMaxLoc(result, &matchVal, nullptr, &matchLoc, nullptr);
    }
    else
    {
        cv::minMaxLoc(result, nullptr, &matchVal, nullptr, &matchLoc);
    }
    return 0;
}
/**
 * @brief Template matching on the CPU with cv::matchTemplate.
 *
 * See https://docs.opencv.org/3.4.12/de/da9/tutorial_template_matching.html
 *
 * @param srcImage  Image to search in.
 * @param tempImage Template to look for; must not be larger than @p srcImage.
 * @param result    Receives the response map produced by cv::matchTemplate.
 * @param matchVal  Receives the best score: the minimum of @p result for the
 *                  two SQDIFF modes, the maximum for every other mode.
 * @param matchLoc  Receives the position of that best score inside @p result.
 * @param mode      0 = TM_SQDIFF, 1 = TM_SQDIFF_NORMED, 2 = TM_CCORR,
 *                  3 = TM_CCORR_NORMED, 4 = TM_CCOEFF, 5 = TM_CCOEFF_NORMED.
 *                  Any other value falls back to TM_CCOEFF_NORMED.
 * @return 0 on success, -1 when an input is empty or the template does not
 *         fit inside the source image.
 */
int cpuTemplateMatch(const cv::Mat &srcImage, const cv::Mat &tempImage, cv::Mat &result,
                     double &matchVal, cv::Point &matchLoc, int mode)
{
    if (srcImage.empty() || tempImage.empty())
    {
        std::cout << "ERROR:In function cpuTemplateMatch: input image is empty! \n";
        return -1;
    }
    // The template has to fit inside the searched image (equal size is fine).
    if (tempImage.cols > srcImage.cols || tempImage.rows > srcImage.rows)
    {
        std::cout << "ERROR:In function cpuTemplateMatch: source image's size should not be smaller than template's \n";
        return -1;
    }

    // Translate the numeric mode into an OpenCV matching method.
    int method = cv::TM_CCOEFF_NORMED; // fallback for unknown modes
    switch (mode)
    {
    case 0: // R = sum (t-Roi)^2
        method = cv::TM_SQDIFF;
        break;
    case 1: // R = sum (t-Roi)^2 / sqrt(sum t^2 * sum Roi^2)
        method = cv::TM_SQDIFF_NORMED;
        break;
    case 2: // R = sum t*Roi
        method = cv::TM_CCORR;
        break;
    case 3: // R = sum t*Roi / sqrt(sum t^2 * sum Roi^2)
        method = cv::TM_CCORR_NORMED;
        break;
    case 4: // R = sum t1*Roi1, with t1 = t - t_mean, Roi1 = Roi - Roi_mean
        method = cv::TM_CCOEFF;
        break;
    case 5: // R = sum t1*Roi1 / sqrt(sum t1^2 * sum Roi1^2)
        method = cv::TM_CCOEFF_NORMED;
        break;
    default:
        break;
    }

    cv::matchTemplate(srcImage, tempImage, result, method);

    // Squared-difference scores are best when smallest; correlation scores
    // are best when largest.
    const bool bestIsMinimum = (method == cv::TM_SQDIFF || method == cv::TM_SQDIFF_NORMED);
    if (bestIsMinimum)
    {
        cv::minMaxLoc(result, &matchVal, nullptr, &matchLoc, nullptr);
    }
    else
    {
        cv::minMaxLoc(result, nullptr, &matchVal, nullptr, &matchLoc);
    }
    return 0;
}
int speedTest()
{
std::map matchMode =
{
{0,"TM_SQDIFF"},
{1,"TM_SQDIFF_NORMED"},
{2,"TM_CCORR"},
{3,"TM_CCORR_NORMED"},
{4,"TM_CCOEFF"},
{5,"cv::TM_CCOEFF_NORMED"}
};
/*std::string srcPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-1.png";
std::string tempPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-2.png";
cv::Mat srcImage = cv::imread(srcPath, 0);
cv::Mat tempImage = cv::imread(tempPath, 0);*/
cv::Mat srcImage = cv::Mat::zeros(cv::Size(200,200),CV_8UC1);
cv::Mat tempImage = cv::Mat::ones(cv::Size(20, 20), CV_8UC1);
clock_t start, end;
double matchVal;
cv::Point matchLoc;
cv::Mat result;
cv::cuda::GpuMat d_srcImage, d_tempImage, d_result;
cv::Ptr alg;
int TIMES = 10;
for (int mode = 0; mode < 6; mode++)
{
for (int size = 100; size < 1000; size += 100)
{
//resize the image
cv::resize(srcImage, srcImage, cv::Size(2000 , 2000 ));
cv::resize(tempImage, tempImage, cv::Size(size, size));
d_srcImage.upload(srcImage);
d_tempImage.upload(tempImage);
//gpu match
start = clock();
for (int times = 0; times < TIMES; times++)
{
gpuTemplateMatch(srcImage, tempImage, result, matchVal, matchLoc, mode);
}
end = clock();
auto runtime_gpu = (end - start) / TIMES * 1000 / CLOCKS_PER_SEC;
//cpu match
start = clock();
for (int times = 0; times < TIMES; times++)
{
cpuTemplateMatch(srcImage, tempImage, result, matchVal, matchLoc, mode);
}
end = clock();
auto runtime_cpu = (end - start) / TIMES * 1000 / CLOCKS_PER_SEC;
//gpu compute only
start = clock();
for (int times = 0; times < TIMES; times++)
{
alg = cv::cuda::createTemplateMatching(srcImage.type(), mode);//
alg->match(d_srcImage, d_tempImage, d_result);
}
end = clock();
auto runtime_gpuComputing = (end - start) / TIMES * 1000 / CLOCKS_PER_SEC;
printf("[+++++++++++++++++++++++++++++++++++++++++++++]\n");
printf("srcSize=[%d,%d], tempSize=[%d,%d]\n", srcImage.rows, srcImage.cols, tempImage.rows, tempImage.cols);
printf("match mode:%s\n", matchMode[mode].c_str());
printf("gpu total runtime:%d ms\n", runtime_gpu);
printf("cpu total runtime:%d ms\n",runtime_cpu);
printf("cpuT / gpuT :%3f\n", double(runtime_cpu) / double(runtime_gpu));
printf("gpu compute time:%dms\n", runtime_gpuComputing);
}
}
return 0;
}
int gpuTemplateMatchDemo()
{
std::string srcPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-1.png";
std::string tempPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-2.png";
cv::Mat srcImage = cv::imread(srcPath, 0);
cv::Mat tempImage = cv::imread(tempPath, 0);
//match
double matchVal;
cv::Point matchLoc;
cv::Mat result;
int mode = 3;
gpuTemplateMatch(srcImage, tempImage, result, matchVal, matchLoc, mode);
//show result
std::cout << "matchVal = " << matchVal << std::endl;
cv::Point topLeft = matchLoc;
cv::Point bottomRight = cv::Point(topLeft.x + tempImage.cols, topLeft.y + tempImage.rows);
cv::Mat drawImage = cv::imread(srcPath);
cv::rectangle(drawImage, cv::Rect(topLeft, bottomRight), cv::Scalar(0, 255, 0),2);
cv::imshow("srcImage", srcImage);
cv::imshow("tempImage", tempImage);
cv::imshow("drawImage", drawImage);
//show results
cv::normalize(result, result, 0, 1, cv::NORM_MINMAX);
cv::imshow("result", result);
cv::waitKey(0);
cv::destroyAllWindows();
return 0;
}
int cpuTemplateMatchDemo()
{
//prepare image and template
std::string srcPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-1.png";
std::string tempPath = "K:\\imageData\\totalBoard\\image2\\00000103_1-2.png";
cv::Mat srcImage = cv::imread(srcPath, 0);
cv::Mat tempImage = cv::imread(tempPath, 0);
//match
double matchVal;
cv::Point matchLoc;
cv::Mat result;
int mode = 1;
cpuTemplateMatch(srcImage, tempImage, result, matchVal, matchLoc, mode);
//show result
std::cout << "matchVal = " << matchVal << std::endl;
cv::Point topLeft = matchLoc;
cv::Point bottomRight = cv::Point(topLeft.x + tempImage.cols, topLeft.y + tempImage.rows);
cv::Mat drawImage = cv::imread(srcPath);
cv::rectangle(drawImage, cv::Rect(topLeft, bottomRight), cv::Scalar(0, 255, 0),2);
cv::imshow("srcImage", srcImage);
cv::imshow("tempImage", tempImage);
cv::imshow("drawImage", drawImage);
//show results
cv::normalize(result, result, 0, 1, cv::NORM_MINMAX);
cv::imshow("result", result);
cv::waitKey(0);
cv::destroyAllWindows();
return 0;
}
// Entry point: runs the GPU demo, the CPU demo and the speed comparison in
// that order. The individual return codes are not inspected.
int main()
{
    using Stage = int (*)();
    const Stage stages[] = { &gpuTemplateMatchDemo, &cpuTemplateMatchDemo, &speedTest };
    for (const Stage stage : stages)
    {
        stage();
    }
    return 0;
}
3. 结果
-
demo
- 速度对比(部分结果)
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[100,100]
match mode:TM_SQDIFF_NORMED
gpu total runtime:103 ms
cpu total runtime:106 ms
cpuT / gpuT :1.029126
gpu compute time:91ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[200,200]
match mode:TM_SQDIFF_NORMED
gpu total runtime:103 ms
cpu total runtime:95 ms
cpuT / gpuT :0.922330
gpu compute time:90ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[300,300]
match mode:TM_SQDIFF_NORMED
gpu total runtime:101 ms
cpu total runtime:99 ms
cpuT / gpuT :0.980198
gpu compute time:89ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[400,400]
match mode:TM_SQDIFF_NORMED
gpu total runtime:101 ms
cpu total runtime:97 ms
cpuT / gpuT :0.960396
gpu compute time:90ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[500,500]
match mode:TM_SQDIFF_NORMED
gpu total runtime:100 ms
cpu total runtime:94 ms
cpuT / gpuT :0.940000
gpu compute time:90ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[600,600]
match mode:TM_SQDIFF_NORMED
gpu total runtime:111 ms
cpu total runtime:91 ms
cpuT / gpuT :0.819820
gpu compute time:102ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[700,700]
match mode:TM_SQDIFF_NORMED
gpu total runtime:111 ms
cpu total runtime:91 ms
cpuT / gpuT :0.819820
gpu compute time:102ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[800,800]
match mode:TM_SQDIFF_NORMED
gpu total runtime:110 ms
cpu total runtime:89 ms
cpuT / gpuT :0.809091
gpu compute time:102ms
[+++++++++++++++++++++++++++++++++++++++++++++]
srcSize=[2000,2000], tempSize=[900,900]
match mode:TM_SQDIFF_NORMED
gpu total runtime:108 ms
cpu total runtime:85 ms
cpuT / gpuT :0.787037
gpu compute time:101ms
-
速度对比测试时的GPU状态
4. 总结
GPU加速模板匹配看起来效果并不是很好。测试了不同大小的图片,只有部分情况下速度会超过CPU;本来觉得应该会有几倍的加速效果,但其实并没有,大多数情况下反而变慢了。开始觉得是cpu向gpu传图的过程耗时较多,后来去掉传图过程只看匹配过程,它的计算还是比cpu慢,不知道是不是因为这块GPU太低端了。