并行计算之路<4>——CUDA牵手OpenCV

官方的pdf文档中已经对CUDA怎么牵手OpenCV做了很好的说明。详情请下载文档1,文档2。

我也是博采众家之长为己所用,总结下CUDA牵手OpenCV的方法。

用IplImage

包含两个文件,kernel.cu以及app.cpp。

kernel.cu

#ifndef _KERNEL_CU_
#define _KERNEL_CU_

#include

#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#define BYTE unsigned char

// Inverts an 8-bit image: every byte b becomes 255 - b.
//
// Launch layout: a 2D grid of 2D blocks; each thread handles one byte.
// The x index selects the byte inside a row and the y index selects the row.
//
//   pImgOut    - device buffer receiving the inverted bytes
//   pImgIn     - device buffer holding the source bytes
//   nWidth     - number of bytes to process per row
//   nHeight    - number of rows
//   nWidthStep - distance in bytes between the starts of two consecutive rows
//
// Threads that fall outside the nWidth x nHeight area do nothing.
__global__ void InverseImg_kernel(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep)
{
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard clause: the grid is rounded up, so some threads have no byte to process.
    if (col >= nWidth || row >= nHeight)
    {
        return;
    }

    // Input and output share the same layout, so one offset serves both.
    const int offset = row * nWidthStep + col;
    pImgOut[offset] = 255 - pImgIn[offset];
}

// Inverts an 8-bit image on the GPU and reports how long the whole round trip took.
//
//   pImgOut    - host buffer receiving the result (nWidthStep * nHeight bytes);
//                may be the same buffer as pImgIn
//   pImgIn     - host buffer with the source image (nWidthStep * nHeight bytes)
//   nWidth     - image width in pixels
//   nHeight    - image height in rows
//   nWidthStep - bytes between the starts of consecutive rows (row stride)
//   nChannels  - interleaved channels per pixel (one byte each)
//
// Returns the elapsed CPU time in seconds, measured with clock() around the
// allocation, both transfers and the kernel. Returns -1.0 without touching
// pImgOut if the arguments are invalid or if any CUDA call fails.
extern "C"
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels)
{
    // Reject inputs that would lead to a zero-sized launch or out-of-range indexing.
    if (!pImgOut || !pImgIn || nWidth <= 0 || nHeight <= 0 || nChannels <= 0)
    {
        return -1.0;
    }

    // Number of payload bytes in one row; the kernel treats each byte as one element.
    const int nRowBytes = nWidth * nChannels;
    if (nWidthStep < nRowBytes)
    {
        // A stride shorter than the row would make the last row run past the buffer.
        return -1.0;
    }

    // Start CPU-side timing.
    const clock_t start = clock();

    // Widen before multiplying so large images do not overflow int.
    const size_t nBytes = sizeof(BYTE) * (size_t)nWidthStep * (size_t)nHeight;

    // Allocate device buffers; keep them zero-initialised so cleanup is always safe.
    BYTE* d_pImgOut = 0;
    BYTE* d_pImgIn = 0;
    cudaError_t status = cudaMalloc((void**)&d_pImgOut, nBytes);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&d_pImgIn, nBytes);
    }

    // Upload the source image.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(d_pImgIn, pImgIn, nBytes, cudaMemcpyHostToDevice);
    }

    // Clear the output so the row padding bytes come back as zero.
    if (status == cudaSuccess)
    {
        status = cudaMemset(d_pImgOut, 0, nBytes);
    }

    // Run the kernel: one thread per byte, 16x16 threads per block.
    if (status == cudaSuccess)
    {
        const dim3 ts(16, 16);
        const dim3 bs((nRowBytes + 15) / 16, (nHeight + 15) / 16);
        InverseImg_kernel<<< bs, ts >>>(d_pImgOut, d_pImgIn, nRowBytes, nHeight, nWidthStep);

        // A launch does not return an error code; it has to be queried explicitly.
        status = cudaGetLastError();
    }

    // Download the result. This blocking copy also waits for the kernel to finish.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(pImgOut, d_pImgOut, nBytes, cudaMemcpyDeviceToHost);
    }

    // Release device memory on every path (cudaFree accepts a null pointer).
    cudaFree(d_pImgOut);
    cudaFree(d_pImgIn);

    if (status != cudaSuccess)
    {
        return -1.0;
    }

    // Stop CPU-side timing.
    const clock_t finish = clock();
    return (double)(finish - start) / CLOCKS_PER_SEC;
}

#endif

app.cpp

#include    

using namespace cv;   

#define BYTE unsigned char

extern "C"
double cudaInverseImg(BYTE* pImgOut, BYTE* pImgIn, int nWidth, int nHeight, int nWidthStep, int nChannels);

void main(void)
{
    IplImage* img = cvLoadImage("1.jpg", CV_LOAD_IMAGE_GRAYSCALE);

    cvShowImage("原始图", img);

    BYTE* pImgIn = (BYTE* ) img->imageData;
    BYTE* pImgOut = (BYTE*)img->imageData;
    int nWidth = img->width;
    int nHeight = img->height;
    int nDepth = img->depth;
    int nWidthStep = img->widthStep;
    int nChannels = img->nChannels;

    double time = cudaInverseImg(pImgOut, pImgIn, nWidth, nHeight, nWidthStep, nChannels);

    printf("time : %f", time);

    IplImage* imgOut = cvCreateImageHeader(cvSize(nWidth, nHeight), nDepth, nChannels);
    cvSetData(imgOut, pImgOut, nWidthStep);
    cvShowImage("反相图", imgOut);

    cvWaitKey(0);
}

牵牵

并行计算之路<4>——CUDA牵手OpenCV_第1张图片

用Mat

包含两个文件,kernel.cu以及app.cpp。这个例子分别编写了CPU以及GPU处理图像的代码。最让我弄不明白的是,GPU运算的时间比CPU运算的时间长,另外GPU镜像时也有问题。但是自行利用Mat编写CUDA内核函数的思路还是在这里的(目前是这样,因为这个方法并不是主流的方法,所以我也没有深究。原谅我的不求甚解……)。

kernel.cu

#include    

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Device helper shared by both kernels: horizontal mirror (left-right flip) of a
// tightly packed image with `rows` rows of `cols` elements of type T.
// Each thread starts at its own (x, y) and then advances by the total number of
// threads in that dimension, so the result is correct for any 2D launch size.
template <typename T>
__device__ __forceinline__ void mirrorRowsDevice(const T* __restrict__ srcData,
                                                 T* __restrict__ dstData,
                                                 int rows, int cols)
{
    const int strideX = gridDim.x * blockDim.x;
    const int strideY = gridDim.y * blockDim.y;

    for (int y = blockIdx.y * blockDim.y + threadIdx.y; y < rows; y += strideY)
    {
        // Row stride is the number of columns, because the buffer is packed row by row.
        const size_t rowOffset = (size_t)y * (size_t)cols;
        for (int x = blockIdx.x * blockDim.x + threadIdx.x; x < cols; x += strideX)
        {
            // Same row, column counted from the right-hand edge.
            dstData[rowOffset + x] = srcData[rowOffset + (cols - 1 - x)];
        }
    }
}

// Horizontal mirror of a single-channel 8-bit image.
// srcData and dstData are distinct device buffers of rows * cols elements, packed
// without row padding. Expects a 2D launch; x maps to columns and y to rows.
__global__ void kernel_channel_1(uchar* srcData, uchar* dstData, int rows, int cols)
{
    mirrorRowsDevice<uchar>(srcData, dstData, rows, cols);
}

// Horizontal mirror of a three-channel 8-bit image, one uchar3 per pixel.
// Same buffer and launch requirements as kernel_channel_1.
__global__ void kernel_channel_3(uchar3* srcData, uchar3* dstData, int rows, int cols)
{
    mirrorRowsDevice<uchar3>(srcData, dstData, rows, cols);
}

// Integer division of a by b, rounded up. File-local helper, so it carries no
// extern "C" linkage specification.
static int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

// Mirrors an 8-bit, 1- or 3-channel image horizontally on the GPU.
//
//   src - source image; rows may be padded (src.step is honoured)
//   dst - destination image; (re)allocated to the size and type of src if needed
//
// Images that are empty, not 8-bit, or have another channel count are left
// untouched. A failing CUDA call raises a cv::Exception through CV_Assert after
// the device buffers have been released.
extern "C"
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    const int rowNumber = src.rows;
    const int colNumber = src.cols;
    const int channels = src.channels();

    if (src.empty() || src.depth() != CV_8U || (channels != 1 && channels != 3))
    {
        return;
    }

    // No-op when the caller already supplied a matching destination.
    dst.create(src.size(), src.type());

    // The device copy is tightly packed: one row is cols * bytes-per-pixel.
    const size_t rowBytes = (size_t)colNumber * src.elemSize();
    const size_t memSize = rowBytes * (size_t)rowNumber;

    unsigned char* dSrcData = 0;
    unsigned char* dDstData = 0;

    cudaError_t status = cudaMalloc((void**)&dSrcData, memSize);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&dDstData, memSize);
    }

    // Pitched copy strips any host-side row padding while uploading.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy2D(dSrcData, rowBytes, src.data, src.step,
                              rowBytes, rowNumber, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        const dim3 threads(16, 16);
        // x covers the columns and y covers the rows, matching the kernels.
        const dim3 grid(iDivUp(colNumber, threads.x), iDivUp(rowNumber, threads.y));

        if (channels == 1)
        {
            kernel_channel_1<<<grid, threads>>>(dSrcData, dDstData, rowNumber, colNumber);
        }
        else
        {
            kernel_channel_3<<<grid, threads>>>(reinterpret_cast<uchar3*>(dSrcData),
                                                reinterpret_cast<uchar3*>(dDstData),
                                                rowNumber, colNumber);
        }

        // Launch errors are only visible through an explicit query.
        status = cudaGetLastError();
    }

    // Blocking copy: waits for the kernel and restores the host row stride of dst.
    if (status == cudaSuccess)
    {
        status = cudaMemcpy2D(dst.data, dst.step, dDstData, rowBytes,
                              rowBytes, rowNumber, cudaMemcpyDeviceToHost);
    }

    // Release device memory on every path (cudaFree accepts a null pointer).
    cudaFree(dSrcData);
    cudaFree(dDstData);

    CV_Assert(status == cudaSuccess);
}


// Mirrors an image horizontally (left-right flip) on the CPU.
//
//   src - source image with 1 or 3 channels; elements are accessed as uchar /
//         cv::Vec3b, so the data is expected to be 8-bit
//   dst - destination image; (re)allocated to the size and type of src if needed.
//         It must not share pixel data with src, because rows are read while
//         they are being written.
//
// Images with any other channel count are left untouched.
extern "C"
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst)
{
    const int rowNumber = src.rows;
    const int colNumber = src.cols;

    // No-op when the caller already supplied a matching destination.
    dst.create(src.size(), src.type());

    switch (src.channels())
    {
    case 1:
        for (int i = 0; i < rowNumber; i++)
        {
            const uchar* uSrcData = src.ptr<uchar>(i);
            uchar* uDstData = dst.ptr<uchar>(i);
            for (int j = 0; j < colNumber; j++)
            {
                uDstData[j] = uSrcData[colNumber - 1 - j];
            }
        }
        // Without this break the 1-channel rows would be re-read as 3-byte pixels.
        break;

    case 3:
        for (int i = 0; i < rowNumber; i++)
        {
            const cv::Vec3b* vSrcData = src.ptr<cv::Vec3b>(i);
            cv::Vec3b* vDstData = dst.ptr<cv::Vec3b>(i);
            for (int j = 0; j < colNumber; j++)
            {
                vDstData[j] = vSrcData[colNumber - 1 - j];
            }
        }
        break;

    default:
        break;
    }
}

app.cpp

#include 
#include   
using namespace cv;
using namespace std;

extern "C"
void cpuMirrorImg(const cv::Mat& src, cv::Mat& dst);

extern "C"
void gpuMirrorImg(const cv::Mat& src, cv::Mat& dst);

// Loads "1.jpg", mirrors it once on the CPU and once on the GPU, prints both
// timings and displays the three images.
// Returns 0 on success and -1 if the image cannot be loaded.
int main()
{
    Mat srcImage = imread("1.jpg");
    if (srcImage.empty())
    {
        // imread returns an empty Mat when the file is missing or unreadable.
        cout << "can not open 1.jpg" << endl;
        return -1;
    }

    Mat dstImageCpu = srcImage.clone();
    const int64 startCpu = getTickCount();
    cpuMirrorImg(srcImage, dstImageCpu);
    const double timeSecCpu = (getTickCount() - startCpu) / getTickFrequency();
    cout << "CPU Time : " << timeSecCpu * 1000 << " ms" << endl;

    Mat dstImageGpu = Mat::zeros(srcImage.size(), srcImage.type());

    // Untimed warm-up call: the first CUDA call in a process also pays for
    // one-time runtime initialisation, which would otherwise be billed to the
    // measured run below.
    gpuMirrorImg(srcImage, dstImageGpu);

    const int64 startGpu = getTickCount();
    gpuMirrorImg(srcImage, dstImageGpu);
    const double timeSecGpu = (getTickCount() - startGpu) / getTickFrequency();
    cout << "GPU Time : " << timeSecGpu * 1000 << " ms" << endl;

    imshow("source", srcImage);
    imshow("mirror [CPU]", dstImageCpu);
    imshow("mirror [GPU]", dstImageGpu);

    waitKey(0);
    return 0;
}

尴尬

利用GpuMat

这个例程来源于开头提到的官方文档。很不错,一级棒!!!该例程由三个文件组成:kernel.cu、swap_rb.cpp、app.cpp。

kernel.cu

#include 
#include 
#include 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace cv;
using namespace cv::cuda;

// Swaps the first and third channel of every pixel of a 3-channel 8-bit image.
//
//   src - source image view in GPU memory (carries rows, cols and row step)
//   dst - destination image view in GPU memory, at least as large as src
//
// Expects a 2D launch: x indexes columns and y indexes rows. Threads outside
// the image do nothing. PtrStepSz / PtrStep are parameterised on uchar3 because
// pixels are read and written as uchar3 below.
__global__ void swap_rb_kernel(const PtrStepSz<uchar3> src, PtrStep<uchar3> dst)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < src.cols && y < src.rows)
    {
        uchar3 v = src(y, x); // Reads pixel in GPU memory. Valid! We are on GPU!
        dst(y, x) = make_uchar3(v.z, v.y, v.x);
    }
}

// Host-side launcher for swap_rb_kernel.
//
//   src    - source image view in GPU memory
//   dst    - destination image view in GPU memory
//   stream - CUDA stream to launch on; with the default stream (0) the call
//            blocks until the kernel has finished, otherwise it returns right
//            after enqueueing the launch
//
// An image without rows or columns is skipped, since a grid with a zero
// dimension is not a valid launch configuration.
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream)
{
    if (src.cols <= 0 || src.rows <= 0)
        return;

    dim3 block(32, 8);
    // Round up so the grid covers the whole image; the kernel bounds-checks the rest.
    dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
    swap_rb_kernel<<<grid, block, 0, stream>>>(src, dst);
    if (stream == 0)
        cudaDeviceSynchronize();
}

swap_rb.cpp

#include 
#include 
#include 

using namespace cv;
using namespace cv::cuda;

void swap_rb_caller(const PtrStepSz& src, PtrStep dst, cudaStream_t stream);
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
{
    CV_Assert(src.type() == CV_8UC3);
    dst.create(src.size(), src.type()); // create if not allocated yet
    cudaStream_t s = StreamAccessor::getStream(stream);
    swap_rb_caller(src, dst, s);
}

app.cpp

#include 
#include "opencv2/core.hpp"
#include 
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"

using namespace std;
using namespace cv;
using namespace cv::cuda;

void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());

void main()
{
    Mat srcImage = imread("1.jpg");
    Mat dstImage = Mat::zeros(srcImage.size(), srcImage.type());

    GpuMat srcImageGpu(srcImage);
    GpuMat dstImageGpu;
    dstImageGpu.create(srcImageGpu.size(), srcImageGpu.type());
    swap_rb(srcImageGpu, dstImageGpu);
    dstImageGpu.download(dstImage);

    imshow("source image", srcImage);
    imshow("gpu image", dstImage);
    waitKey(0);
}

真相

并行计算之路<4>——CUDA牵手OpenCV_第2张图片

OpenCV自带的CUDA库

例子来源于OpenCV,Google一下关键字“OpenCV CUDA”就能搜索到。包含一个文件app.cpp。

代码

#include 
#include 
#include "opencv2/core.hpp"
#include 
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/cudaimgproc.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
// Prints a short description of the program and its command-line usage.
// The default file name shown here matches the one main() falls back to.
static void help()
{
    cout << "This program demonstrates line finding with the Hough transform." << endl;
    cout << "Usage:" << endl;
    cout << "./gpu-example-houghlines <image_name>, Default is 1.jpg\n" << endl;
}
int main(int argc, const char* argv[])
{
    const string filename = argc >= 2 ? argv[1] : "1.jpg";
    Mat src = imread(filename, IMREAD_GRAYSCALE);
    if (src.empty())
    {
        help();
        cout << "can not open " << filename << endl;
        return -1;
    }
    Mat mask;
    cv::Canny(src, mask, 100, 200, 3);
    Mat dst_cpu;
    cv::cvtColor(mask, dst_cpu, COLOR_GRAY2BGR);
    Mat dst_gpu = dst_cpu.clone();
    vector lines_cpu;
    {
        const int64 start = getTickCount();
        cv::HoughLinesP(mask, lines_cpu, 1, CV_PI / 180, 50, 60, 5);
        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "CPU Time : " << timeSec * 1000 << " ms" << endl;
        cout << "CPU Found : " << lines_cpu.size() << endl;
    }
    for (size_t i = 0; i < lines_cpu.size(); ++i)
    {
        Vec4i l = lines_cpu[i];
        line(dst_cpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
    }
    GpuMat d_src(mask);
    GpuMat d_lines;
    {
        const int64 start = getTickCount();
        Ptr hough = cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0f), 50, 5);
        hough->detect(d_src, d_lines);
        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "GPU Time : " << timeSec * 1000 << " ms" << endl;
        cout << "GPU Found : " << d_lines.cols << endl;
    }
    vector lines_gpu;
    if (!d_lines.empty())
    {
        lines_gpu.resize(d_lines.cols);
        Mat h_lines(1, d_lines.cols, CV_32SC4, &lines_gpu[0]);
        d_lines.download(h_lines);
    }
    for (size_t i = 0; i < lines_gpu.size(); ++i)
    {
        Vec4i l = lines_gpu[i];
        line(dst_gpu, Point(l[0], l[1]), Point(l[2], l[3]), Scalar(0, 0, 255), 3, LINE_AA);
    }
    imshow("source", src);
    imshow("detected lines [CPU]", dst_cpu);
    imshow("detected lines [GPU]", dst_gpu);
    waitKey();
    return 0;
}

一线牵

并行计算之路<4>——CUDA牵手OpenCV_第3张图片

尾巴

看过官方提供的文档之后,我才让CUDA正确地牵手OpenCV。官方已经说得很好了,感觉说啥都是多余的。就到这里吧!今天阳光不错,等下去配眼镜。注意身体,尤其是眼睛,不光是在说我自己,也是在说正在阅读的你。

参考:
《GPGPU编程技术——从GLSL、CUDA到OpenCL》♥♥♥♥♥
《数字图像处理高级应用——基于MATLAB与CUDA的实现》♥♥♥
《基于CUDA的并行程序设计》♥♥♥
《CUDA专家手册》♥♥♥♥♥
《高性能CUDA应用设计与开发》♥♥♥♥

你可能感兴趣的:(并行计算,cuda,并行计算,opencv,kernel)