官方的pdf文档中已经对CUDA怎么牵手OpenCV做了很好的说明。详情请下载文档1,文档2。
我也是博采众家之长为己所用,总结下CUDA牵手OpenCV的方法。
包含两个文件,kernel.cu以及app.cpp。
#ifndef _KERNEL_CU_
#define _KERNEL_CU_
// <ctime> supplies clock(), clock_t and CLOCKS_PER_SEC used by cudaInverseImg
// below. (The original include line lost its header name in the HTML export.)
#include <ctime>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// 8-bit pixel/byte type used throughout this translation unit.
#define BYTE unsigned char
__global__ void InverseImg_kernel(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep)
{
    // Photographic negative: every payload byte becomes 255 - value.
    // nWidth is the number of payload bytes per row (the caller passes
    // width * channels); nWidthStep is the allocated row stride in bytes,
    // so padding bytes past nWidth are never touched.
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x >= nWidth || y >= nHeight)
        return; // thread falls outside the image (grid is rounded up)
    const int idx = y * nWidthStep + x;
    pImgOut[idx] = 255 - pImgIn[idx];
}
extern "C"
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels)
{
    // Computes pImgOut = 255 - pImgIn for every payload byte of an 8-bit
    // image on the GPU and returns the elapsed CPU time in seconds, or a
    // negative value if device allocation fails.
    //
    // The measured time includes device allocation and host<->device
    // transfers, NOT just the kernel -- which is why this can look slower
    // than the CPU for small images.
    //
    // pImgOut/pImgIn : host buffers of nWidthStep * nHeight bytes (they may
    //                  alias: the input is fully uploaded before the output
    //                  is written back)
    // nWidthStep     : bytes per row (>= nWidth * nChannels)
    clock_t start = clock();

    const size_t nBytes = (size_t)nWidthStep * (size_t)nHeight;
    BYTE* d_pImgOut = NULL;
    BYTE* d_pImgIn = NULL;
    if (cudaMalloc((void**)&d_pImgOut, nBytes) != cudaSuccess ||
        cudaMalloc((void**)&d_pImgIn, nBytes) != cudaSuccess)
    {
        // Free whichever allocation (if any) succeeded; cudaFree(NULL) is a no-op.
        cudaFree(d_pImgOut);
        cudaFree(d_pImgIn);
        return -1.0;
    }

    // Upload the source image; clear the destination.
    cudaMemcpy(d_pImgIn, pImgIn, nBytes, cudaMemcpyHostToDevice);
    cudaMemset(d_pImgOut, 0, nBytes);

    // One thread per payload byte (nWidth * nChannels bytes per row); the
    // grid is rounded up and the kernel bounds-checks the tail.
    const int nRowBytes = nWidth * nChannels;
    dim3 ts(16, 16);
    dim3 bs((nRowBytes + ts.x - 1) / ts.x, (nHeight + ts.y - 1) / ts.y);
    InverseImg_kernel<<< bs, ts >>>(d_pImgOut, d_pImgIn, nRowBytes, nHeight, nWidthStep);
    cudaGetLastError(); // surface (and clear) any launch-configuration error

    // Blocking device->host copy; also synchronizes with the kernel.
    cudaMemcpy(pImgOut, d_pImgOut, nBytes, cudaMemcpyDeviceToHost);

    cudaFree(d_pImgOut);
    cudaFree(d_pImgIn);

    clock_t finish = clock();
    return (double)(finish - start) / CLOCKS_PER_SEC;
}
#endif
#include
using namespace cv;
#define BYTE unsigned char
extern "C"
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels);
void main(void)
{
IplImage* img = cvLoadImage("1.jpg", CV_LOAD_IMAGE_GRAYSCALE);
cvShowImage("原始图", img);
BYTE* pImgIn = (BYTE* ) img->imageData;
BYTE* pImgOut = (BYTE*)img->imageData;
int nWidth = img->width;
int nHeight = img->height;
int nDepth = img->depth;
int nWidthStep = img->widthStep;
int nChannels = img->nChannels;
double time = cudaInverseImg(pImgOut, pImgIn, nWidth, nHeight, nWidthStep, nChannels);
printf("time : %f", time);
IplImage* imgOut = cvCreateImageHeader(cvSize(nWidth, nHeight), nDepth, nChannels);
cvSetData(imgOut, pImgOut, nWidthStep);
cvShowImage("反相图", imgOut);
cvWaitKey(0);
}
包含两个文件,kernel.cu以及app.cpp。这个例子分别编写了CPU以及GPU处理图像的代码。最让我弄不明白的是,GPU运算的时间比CPU运算的时间长,另外GPU镜像时也有问题。但是自行利用Mat编写CUDA内核函数的思路还是在这里的(目前是这样,因为这个方法并不是主流的方法,所以我也没有深究。原谅我的不求甚解……)。
// NOTE(review): the first include lost its header name in the HTML export;
// restored to the OpenCV umbrella header this file needs for cv::Mat/uchar.
#include <opencv2/opencv.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void kernel_channel_1(uchar* srcData, uchar* dstData, int rows, int cols)
{
    // Horizontal mirror of a continuous single-channel 8-bit image.
    // Launch convention (see gpuMirrorImg): the x grid dimension walks rows
    // and the y grid dimension walks columns.
    // NOTE(review): the original indexed with a stride of `rows` and reversed
    // BOTH coordinates, producing a 180-degree rotation instead of the
    // horizontal mirror that cpuMirrorImg computes -- the mismatch the
    // author observed.
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows && col < cols)
    {
        // Row-major layout: the stride between rows is `cols` elements.
        dstData[row * cols + col] = srcData[row * cols + (cols - 1 - col)];
    }
}
__global__ void kernel_channel_3(uchar3* srcData, uchar3* dstData, int rows, int cols)
{
    // Horizontal mirror of a continuous 3-channel 8-bit image (one uchar3
    // per pixel). Same launch convention as kernel_channel_1: grid x walks
    // rows, grid y walks columns.
    // NOTE(review): the original reversed both coordinates with a stride of
    // `rows`, i.e. a 180-degree rotation, not the horizontal mirror the CPU
    // reference computes.
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows && col < cols)
    {
        // Row-major layout: the stride between rows is `cols` pixels.
        dstData[row * cols + col] = srcData[row * cols + (cols - 1 - col)];
    }
}
extern "C"
static int iDivUp(int a, int b)
{
    // Integer division rounded up: ceil(a / b) for non-negative a and b > 0.
    int quotient = a / b;
    if (a % b)
        ++quotient;
    return quotient;
}
extern "C"
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    // GPU path: horizontally mirrors src into dst (same contract as
    // cpuMirrorImg). Supports 1- and 3-channel 8-bit images; any other
    // channel count leaves dst untouched. dst must already be allocated
    // with src's size and type.
    // NOTE(review): assumes src/dst are continuous (no row padding), since
    // the pixel data is copied as one rows*cols block -- confirm with callers.
    const int rowNumber = src.rows;
    const int colNumber = src.cols;
    dim3 threads(16, 16);
    // Grid x covers rows, grid y covers columns -- the kernels index with
    // the same convention. (The original added +15 on top of iDivUp's own
    // round-up, launching a redundant extra block in each dimension.)
    dim3 grid(iDivUp(rowNumber, threads.x), iDivUp(colNumber, threads.y));

    switch (src.channels())
    {
    case 1:
    {
        const size_t memSize = sizeof(uchar) * rowNumber * colNumber;
        uchar* uSrcData = NULL;
        uchar* uDstData = NULL;
        cudaMalloc((void**)&uSrcData, memSize);
        cudaMalloc((void**)&uDstData, memSize);
        cudaMemcpy(uSrcData, src.data, memSize, cudaMemcpyHostToDevice);
        cudaMemset(uDstData, 0, memSize);
        kernel_channel_1<<< grid, threads >>>(uSrcData, uDstData, rowNumber, colNumber);
        // Blocking copy; also synchronizes with the kernel.
        cudaMemcpy(dst.data, uDstData, memSize, cudaMemcpyDeviceToHost);
        cudaFree(uSrcData);
        cudaFree(uDstData);
        break; // missing in the original: a 1-channel image fell through into the 3-channel path
    }
    case 3:
    {
        const size_t memSize = sizeof(uchar3) * rowNumber * colNumber;
        uchar3* vSrcData = NULL;
        uchar3* vDstData = NULL;
        cudaMalloc((void**)&vSrcData, memSize);
        cudaMalloc((void**)&vDstData, memSize);
        cudaMemcpy(vSrcData, src.data, memSize, cudaMemcpyHostToDevice);
        cudaMemset(vDstData, 0, memSize);
        kernel_channel_3<<< grid, threads >>>(vSrcData, vDstData, rowNumber, colNumber);
        cudaMemcpy(dst.data, vDstData, memSize, cudaMemcpyDeviceToHost);
        cudaFree(vSrcData);
        cudaFree(vDstData);
        break;
    }
    default:
        break;
    }
}
extern "C"
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    // CPU reference: horizontally mirrors src into dst, row by row.
    // Supports 1- and 3-channel 8-bit images; any other channel count
    // leaves dst untouched. dst must already be allocated with src's
    // size and type.
    const int rowNumber = src.rows;
    const int colNumber = src.cols;
    switch (src.channels())
    {
    case 1:
    {
        for (int i = 0; i < rowNumber; i++)
        {
            const uchar* uSrcData = src.ptr<uchar>(i);
            uchar* uDstData = dst.ptr<uchar>(i);
            for (int j = 0; j < colNumber; j++)
            {
                uDstData[j] = uSrcData[colNumber - 1 - j];
            }
        }
        break; // missing in the original: a 1-channel image fell through into the 3-channel path
    }
    case 3:
    {
        for (int i = 0; i < rowNumber; i++)
        {
            // ptr<Vec3b>() restores the template argument the HTML export
            // stripped; plain ptr() returns uchar*, which does not convert
            // to Vec3b*.
            const cv::Vec3b* vSrcData = src.ptr<cv::Vec3b>(i);
            cv::Vec3b* vDstData = dst.ptr<cv::Vec3b>(i);
            for (int j = 0; j < colNumber; j++)
            {
                vDstData[j] = vSrcData[colNumber - 1 - j];
            }
        }
        break;
    }
    default:
        break;
    }
}
#include
#include
using namespace cv;
using namespace std;
extern "C"
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst);
extern "C"
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst);
void main()
{
Mat srcImage = imread("1.jpg");
Mat dstImageCpu = srcImage.clone();
const int64 startCpu = getTickCount();
cpuMirrorImg(srcImage, dstImageCpu);
const double timeSecCpu = (getTickCount() - startCpu) / getTickFrequency();
cout << "CPU Time : " << timeSecCpu * 1000 << " ms" << endl;
Mat dstImageGpu = Mat::zeros(srcImage.size(), srcImage.type());
const int64 startGpu = getTickCount();
gpuMirrorImg(srcImage, dstImageGpu);
const double timeSecGpu = (getTickCount() - startGpu) / getTickFrequency();
cout << "GPU Time : " << timeSecGpu * 1000 << " ms" << endl;
imshow("source", srcImage);
imshow("mirror [CPU]", dstImageCpu);
imshow("mirror [GPU]", dstImageGpu);
waitKey(0);
}
这个例程来源开头提过的官方文档。很不错,一级棒!!!该例程由三个文件组成kernel.cu,swap_rb.cpp,app.cpp。
// NOTE(review): the three include lines lost their header names in the HTML
// export; restored to the headers this translation unit actually needs.
#include <opencv2/opencv.hpp>
#include <opencv2/core/cuda.hpp>        // cv::cuda::PtrStep / PtrStepSz
#include <opencv2/core/cuda_types.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace cv;
using namespace cv::cuda;
__global__ void swap_rb_kernel(const PtrStepSz<uchar3> src, PtrStep<uchar3> dst)
{
    // One thread per pixel: swap the first and third channels (BGR <-> RGB).
    // The <uchar3> template arguments were stripped by the HTML export;
    // restored from the uchar3/make_uchar3 usage below.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < src.cols && y < src.rows)
    {
        uchar3 v = src(y, x); // Reads pixel in GPU memory. Valid! We are on GPU!
        dst(y, x) = make_uchar3(v.z, v.y, v.x);
    }
}
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream)
{
    // Launches swap_rb_kernel over the whole image on the given stream.
    // 32x8 = 256 threads per block; grid rounded up to cover every pixel.
    dim3 block(32, 8);
    dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
    // The original launch read "<<0, stream >>>" -- the HTML export ate the
    // "<<<grid, block," part of the configuration.
    swap_rb_kernel<<< grid, block, 0, stream >>>(src, dst);
    // With the default (NULL) stream, block until the kernel finishes so the
    // caller can use the result immediately.
    if (stream == 0)
        cudaDeviceSynchronize();
}
#include
#include
#include
using namespace cv;
using namespace cv::cuda;
void swap_rb_caller(const PtrStepSz& src, PtrStep dst, cudaStream_t stream);
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
{
CV_Assert(src.type() == CV_8UC3);
dst.create(src.size(), src.type()); // create if not allocated yet
cudaStream_t s = StreamAccessor::getStream(stream);
swap_rb_caller(src, dst, s);
}
// NOTE(review): two include lines lost their header names in the HTML export;
// restored to headers consistent with the names this file uses.
#include <opencv2/opencv.hpp>
#include "opencv2/core.hpp"
#include <opencv2/core/cuda.hpp>   // GpuMat, Stream
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
// Implemented in swap_rb.cpp.
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
int main()
{
    // Loads an image, swaps its R/B channels on the GPU, and shows
    // original vs. result side by side. (void main is non-standard C++;
    // changed to int main with explicit returns.)
    Mat srcImage = imread("1.jpg");
    if (srcImage.empty())
        return -1;
    Mat dstImage = Mat::zeros(srcImage.size(), srcImage.type());
    GpuMat srcImageGpu(srcImage);   // upload to the device
    GpuMat dstImageGpu;
    dstImageGpu.create(srcImageGpu.size(), srcImageGpu.type());
    swap_rb(srcImageGpu, dstImageGpu);
    dstImageGpu.download(dstImage); // back to the host for display
    imshow("source image", srcImage);
    imshow("gpu image", dstImage);
    waitKey(0);
    return 0;
}
例子来源于OpenCV,Google下关键字“OpenCV CUDA”就能搜索到。包含一个文件app.cpp。
// Include names restored (lost in the HTML export) to match the upstream
// OpenCV sample gpu/houghlines.cpp this example is based on.
#include <cmath>
#include <iostream>
#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
static void help()
{
    // Prints usage. "<image_name>" is restored below -- the original
    // angle-bracket text was eaten by the HTML export.
    cout << "This program demonstrates line finding with the Hough transform." << endl;
    cout << "Usage:" << endl;
    cout << "./gpu-example-houghlines <image_name>, Default is ../data/pic1.png\n" << endl;
}
int main(int argc, const char* argv[])
{
const string filename = argc >= 2 ? argv[1] : "1.jpg";
Mat src = imread(filename, IMREAD_GRAYSCALE);
if (src.empty())
{
help();
cout << "can not open " << filename << endl;
return -1;
}
Mat mask;
cv::Canny(src, mask, 100, 200, 3);
Mat dst_cpu;
cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR);
Mat dst_gpu = dst_cpu.clone();
vector lines_cpu;
{
const int64 start = getTickCount();
cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "CPU Time : " << timeSec * 1000 << " ms" << endl;
cout << "CPU Found : " << lines_cpu.size() << endl;
}
for (size_t i = 0; i < lines_cpu.size(); ++i)
{
Vec4i l = lines_cpu[i];
line(dst_cpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
}
GpuMat d_src(mask);
GpuMat d_lines;
{
const int64 start = getTickCount();
Ptr hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 50, 5);
hough->detect(d_src, d_lines);
const double timeSec = (getTickCount() - start) / getTickFrequency();
cout << "GPU Time : " << timeSec * 1000 << " ms" << endl;
cout << "GPU Found : " << d_lines.cols << endl;
}
vector lines_gpu;
if (!d_lines.empty())
{
lines_gpu.resize(d_lines.cols);
Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]);
d_lines.download(h_lines);
}
for (size_t i = 0; i < lines_gpu.size(); ++i)
{
Vec4i l = lines_gpu[i];
line(dst_gpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
}
imshow("source", src);
imshow("detected lines [CPU]", dst_cpu);
imshow("detected lines [GPU]", dst_gpu);
waitKey();
return 0;
}
看过官方提供的文档之后,我才将CUDA正确的牵手OpenCV。官方已经说的很好了,感觉说啥都是多余的。就到这里吧!今天阳光不错,等下配眼镜。注意身体,尤其是眼睛,不光在说我自己,也在说正在阅读的你。
参考:
《GPGPU编程技术——从GLSL、CUDA到OpenCL》♥♥♥♥♥
《数字图像处理高级应用——基于MATLAB与CUDA的实现》♥♥♥
《基于CUDA的并行程序设计》♥♥♥
《CUDA专家手册》♥♥♥♥♥
《高性能CUDA应用设计与开发》♥♥♥♥