最近在学习CUDA编程,利用周末撸了一个GPU版本的中心线提取算法(灰度重心法)。
实际测试了100多张激光条纹图像的中心线提取:对100万像素的图像,GPU的计算速度比CPU快约1/3;对400万像素的图像,GPU的计算速度约为CPU的3倍;对1600万像素的图像,GPU的计算速度约为CPU的10~15倍。
下面放上main函数,其他文件放在下载链接里面了。毕竟是知识付费的时代,要几个积分也不算多吧哈哈。
/********************************************************************************
** @auth: taify
** @date: 2022/11/28
** @Ver : V1.0.0
** @desc: 中心线灰度重心法cpu和gpu版,gpu版本适合处理大图及一次性处理多张图的情况
*********************************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "libcpu/centerline_cpu.h"
#include "libgpu/centerline_gpu.h"
//#define SHOW_RESULT
/**
 * Benchmark driver for the CPU and GPU gray-centroid centerline extractors.
 *
 * Reads ./images/1.bmp ... ./images/132.bmp as grayscale and uploads a copy of
 * each image to the GPU. It then runs the same pipeline twice (3x3 Gaussian
 * blur, to-zero threshold at 100, centerline extraction), once with the CPU
 * implementation and once with the GPU kernel, and prints the elapsed time of
 * each pass. When SHOW_RESULT is defined, the extracted points are drawn on
 * the processed images and written to ./cpu/ and ./gpu/.
 *
 * @param argc unused
 * @param argv unused
 * @return EXIT_SUCCESS when both passes complete; EXIT_FAILURE when an input
 *         image cannot be read or a CUDA runtime call reports an error.
 */
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int image_num = 132;
    const unsigned int threads_per_block = 1024;

    // Reports a failed CUDA runtime call and tells the caller whether to go on.
    auto cuda_ok = [](cudaError_t status, const char* what) -> bool {
        if (status == cudaSuccess)
        {
            return true;
        }
        std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
        return false;
    };

    // clock() counts ticks, not milliseconds; convert explicitly so the "ms"
    // label is correct on every platform, not only where CLOCKS_PER_SEC == 1000.
    auto elapsed_ms = [](clock_t begin, clock_t end) -> double {
        return 1000.0 * static_cast<double>(end - begin) / CLOCKS_PER_SEC;
    };

    std::vector<cv::Mat> mats(image_num);
    std::vector<cv::cuda::GpuMat> gpumats(image_num);
    for (int i = 0; i < image_num; i++)
    {
        const std::string path = "./images/" + std::to_string(i + 1) + ".bmp";
        cv::Mat image = cv::imread(path, 0);
        if (image.empty())
        {
            // imread signals failure with an empty Mat; stop before it reaches
            // resize/upload.
            std::cerr << "failed to read image: " << path << std::endl;
            return EXIT_FAILURE;
        }
        // The target size equals the source size, so the dimensions are unchanged.
        cv::resize(image, image, cv::Size(image.cols, image.rows));
        mats[i] = image;
        gpumats[i].upload(image);
    }

    // ---- CPU pass ----
    clock_t t1 = clock();
    for (int i = 0; i < image_num; i++)
    {
        cv::GaussianBlur(mats[i], mats[i], cv::Size(3, 3), 0, 0);
        cv::threshold(mats[i], mats[i], 100, 255, cv::THRESH_TOZERO);

        // One output value per image column.
        std::vector<float> pts(mats[i].cols, 0.0f);
        centerline_cpu(mats[i], pts.data());
        //std::cout << pts[0] << std::endl;
#ifdef SHOW_RESULT
        cv::cvtColor(mats[i], mats[i], cv::COLOR_GRAY2BGR);
        for (int j = 0; j < mats[i].cols; j++)
        {
            cv::circle(mats[i], cv::Point(j, static_cast<int>(pts[j])), 0, cv::Scalar(0, 0, 255), -1);
        }
        cv::imwrite("./cpu/" + std::to_string(i + 1) + ".bmp", mats[i]);
#endif // SHOW_RESULT
    }
    clock_t t2 = clock();
    std::cout <<"cpu time cost: " << elapsed_ms(t1, t2) << "ms" << std::endl;

    // ---- GPU pass ----
    // The filter does not depend on the image, so build it once instead of
    // once per iteration.
    cv::Ptr<cv::cuda::Filter> guass_filter = cv::cuda::createGaussianFilter(CV_8U, CV_8U, cv::Size(3, 3), 0, 0);
    for (int i = 0; i < image_num; i++)
    {
        guass_filter->apply(gpumats[i], gpumats[i]);
        cv::cuda::threshold(gpumats[i], gpumats[i], 100, 255, cv::THRESH_TOZERO);

        const int cols = gpumats[i].cols;
        const size_t bytes = sizeof(float) * static_cast<size_t>(cols);

        // Zero-initialised host buffer. The original zero-fill loop reused the
        // name `i` and therefore indexed gpumats with the column counter.
        std::vector<float> pts(cols, 0.0f);
        float* dev_pts = nullptr;

        dim3 threads(threads_per_block, 1, 1);
        // NOTE(review): floor division is kept because the kernel's bounds
        // handling is not visible here. Columns beyond the last full block of
        // 1024 are not processed and stay 0; switch to ceil-div once the kernel
        // is confirmed to guard its column index.
        dim3 blocks(cols / threads.x, 1, 1);

        bool ok = cuda_ok(cudaMalloc(&dev_pts, bytes), "cudaMalloc");
        ok = ok && cuda_ok(cudaMemcpy(dev_pts, pts.data(), bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
        if (ok && blocks.x > 0)
        {
            // A zero-block grid is an invalid launch configuration, so images
            // narrower than one block skip the launch and keep the zeroed output.
            centerline_gpu<<<blocks, threads>>>(gpumats[i], dev_pts);
            ok = cuda_ok(cudaGetLastError(), "centerline_gpu launch");
        }
        // The blocking copy also surfaces any error raised while the kernel ran.
        ok = ok && cuda_ok(cudaMemcpy(pts.data(), dev_pts, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");

        // Release the per-image device buffer; it was previously leaked on
        // every iteration. cudaFree accepts a null pointer.
        cudaFree(dev_pts);
        if (!ok)
        {
            return EXIT_FAILURE;
        }
        //std::cout << pts[0] << std::endl;
#ifdef SHOW_RESULT
        // Draw on the GPU-processed image rather than on mats[i], which already
        // carries the CPU overlay in the same colour.
        cv::Mat gpu_view;
        gpumats[i].download(gpu_view);
        cv::cvtColor(gpu_view, gpu_view, cv::COLOR_GRAY2BGR);
        for (int j = 0; j < gpu_view.cols; j++)
        {
            cv::circle(gpu_view, cv::Point(j, static_cast<int>(pts[j])), 0, cv::Scalar(0, 0, 255), -1);
        }
        cv::imwrite("./gpu/" + std::to_string(i + 1) + ".bmp", gpu_view);
#endif // SHOW_RESULT
    }
    clock_t t3 = clock();
    std::cout << "gpu time cost: " << elapsed_ms(t2, t3) << "ms" << std::endl;
    return EXIT_SUCCESS;
}
工程下载地址:点击跳转
该工程需要配置CUDA和带GPU模块的OpenCV。
VS2019+CUDA11.1 Release x64编译的OpenCV4.5.5(带CUDA和contrib模块)
之前的文章:激光条纹中心线提取算法总结和复现