CUDA学习笔记一


先来看一下OpenCV背景建模算法的GPU测试样例:

#include <iostream>
#include <string>

#include "opencv2/core.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/cudabgsegm.hpp"
#include "opencv2/cudalegacy.hpp"
#include "opencv2/video.hpp"
#include "opencv2/highgui.hpp"

using namespace std;
using namespace cv;
using namespace cv::cuda;

enum Method
{
    MOG,
    MOG2,
    GMG,
    FGD_STAT
};

int main(int argc, const char** argv)
{
    cv::CommandLineParser cmd(argc, argv,
        "{ c camera |             | use camera }"
        "{ f file   | ../data/768x576.avi | input video file }"
        "{ m method | mog         | method (mog, mog2, gmg, fgd) }"
        "{ h help   |             | print help message }");

    if (cmd.has("help") || !cmd.check())
    {
        cmd.printMessage();
        cmd.printErrors();
        return 0;
    }

    bool useCamera = cmd.has("camera");
    string file = cmd.get<string>("file");
    string method = cmd.get<string>("method");

    if (method != "mog"
        && method != "mog2"
        && method != "gmg"
        && method != "fgd")
    {
        cerr << "Incorrect method" << endl;
        return -1;
    }

    Method m = method == "mog" ? MOG :
               method == "mog2" ? MOG2 :
               method == "fgd" ? FGD_STAT :
                                  GMG;

    VideoCapture cap;

    if (useCamera)
        cap.open(0);
    else
        cap.open(file);

    if (!cap.isOpened())
    {
        cerr << "can not open camera or video file" << endl;
        return -1;
    }

    Mat frame;
    cap >> frame;

    GpuMat d_frame(frame);

    Ptr<BackgroundSubtractor> mog = cuda::createBackgroundSubtractorMOG();
    Ptr<BackgroundSubtractor> mog2 = cuda::createBackgroundSubtractorMOG2();
    Ptr<BackgroundSubtractor> gmg = cuda::createBackgroundSubtractorGMG(40);
    Ptr<BackgroundSubtractor> fgd = cuda::createBackgroundSubtractorFGD();

    GpuMat d_fgmask;
    GpuMat d_fgimg;
    GpuMat d_bgimg;

    Mat fgmask;
    Mat fgimg;
    Mat bgimg;

    switch (m)
    {
    case MOG:
        mog->apply(d_frame, d_fgmask, 0.01);
        break;

    case MOG2:
        mog2->apply(d_frame, d_fgmask);
        break;

    case GMG:
        gmg->apply(d_frame, d_fgmask);
        break;

    case FGD_STAT:
        fgd->apply(d_frame, d_fgmask);
        break;
    }

    namedWindow("image", WINDOW_NORMAL);
    namedWindow("foreground mask", WINDOW_NORMAL);
    namedWindow("foreground image", WINDOW_NORMAL);
    if (m != GMG)
    {
        namedWindow("mean background image", WINDOW_NORMAL);
    }

    for(;;)
    {
        cap >> frame;
        if (frame.empty())
            break;
        d_frame.upload(frame);

        int64 start = cv::getTickCount();

        //update the model
        switch (m)
        {
        case MOG:
            mog->apply(d_frame, d_fgmask, 0.01);
            mog->getBackgroundImage(d_bgimg);
            break;

        case MOG2:
            mog2->apply(d_frame, d_fgmask);
            mog2->getBackgroundImage(d_bgimg);
            break;

        case GMG:
            gmg->apply(d_frame, d_fgmask);
            break;

        case FGD_STAT:
            fgd->apply(d_frame, d_fgmask);
            fgd->getBackgroundImage(d_bgimg);
            break;
        }

        double fps = cv::getTickFrequency() / (cv::getTickCount() - start);
        std::cout << "FPS : " << fps << std::endl;

        d_fgimg.create(d_frame.size(), d_frame.type());
        d_fgimg.setTo(Scalar::all(0));
        d_frame.copyTo(d_fgimg, d_fgmask);

        d_fgmask.download(fgmask);
        d_fgimg.download(fgimg);
        if (!d_bgimg.empty())
            d_bgimg.download(bgimg);

        imshow("image", frame);
        imshow("foreground mask", fgmask);
        imshow("foreground image", fgimg);
        if (!bgimg.empty())
            imshow("mean background image", bgimg);

        int key = waitKey(30);
        if (key == 27)
            break;
    }

    return 0;
}

OpenCV提供的部分接口可以完成一些Cuda编程的基本处理,比如将图像从CPU到GPU拷贝 (Mat --> GpuMat),upload,download。OpenCV封装和屏蔽了CUDA底层的函数,这样有好处也有弊端,对于某些关注算法应用的人来说,很好,只要几行代码,便可以使用算法的GPU并行版本了,但是对于我们这些研究算法并行实现细节的人来说,就不太方便。而且对上层封装越方便,下层就更不容易抽离。所以在这里我假想已经获取了uchar或unchar3的图像数据。而且从Mat到GpuMat的过程很慢……在另外一篇vibe的博客中介绍过OpenCV封装的数据类型。
相对来说我更喜欢这样的打开方式:

__global__ void swap_rb_kernel(const uchar3* src,uchar3* dst,int width,int height)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.x + blockIdx.y * blockDim.y;
    
    if(x < width && y < height)
    {
        uchar3 v = src[y * width + x];
        dst[y * width + x].x = v.z;
        dst[y * width + x].y = v.y;
        dst[y * width + x].z = v.x;
    }
}

void swap_rb_caller(const uchar3* src,uchar3* dst,int width,int height)
{
    dim3 block(32,8);
    dim3 grid((width + block.x - 1)/block.x,(height + block.y - 1)/block.y);
    
    swap_rb_kernel<<<grid,block,0>>>(src,dst,width,height);
    cudaThreadSynchronize();
}

int main()
{
    Mat image = imread("lena.jpg");
    imshow("src",image);
    
    size_t memSize = image.cols*image.rows*sizeof(uchar3);
    uchar3* d_src = NULL;
    uchar3* d_dst = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_src,memSize));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_dst,memSize));
    CUDA_SAFE_CALL(cudaMempcy(d_src,image.data,memSize,cudaMemcpyHostToDevice));
    
    swap_rb_caller(d_src,d_dst,image.cols,image.rows);
    
    CUDA_SAFE_CALL(cudaMempcy(image.data,d_dst,memSize,cudaMemcpyDeviceToHost));
    imshow("gpu",image);
    waitKey(0);
    
    CUDA_SAFE_CALL(cudaFree(d_src));
    CUDA_SAFE_CALL(cudaFree(d_dst));
    return 0;
}

所以在这里以后更加关注算法的__device__函数实现或者kernel函数。


参考:http://www.cnblogs.com/dwdxdy/p/3528711.html

你可能感兴趣的:(CUDA,opencv)