CUDA(Compute Unified Device Architecture,统一计算架构)是由NVIDIA所推出的一种集成技术,是其对于GPGPU(General-Purpose computing on Graphics Processing Units,图形处理器通用计算)的正式名称。通过该技术,开发者可以利用NVIDIA GeForce 8系列及之后的GPU进行通用计算,从而极大地提升计算密集型应用的运行效率。CUDA通常用于游戏开发、视频编解码、图像处理等领域。
CUDA is a parallel computing platform and programming model developed by NVIDIA for general-purpose computing on graphics processing units (GPUs).
With CUDA, developers can dramatically speed up computing applications by harnessing the power of GPUs.

“Hello World”版使用

当我们使用Windows上的Visual Studio进行开发时,如果选择CUDA应用开发,IDE会自动生成一个“Hello World”版核函数,内容如下:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"


cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector addition on the device: c[i] = a[i] + b[i].
//
// Launch layout: the element index is taken from threadIdx.x alone (blockIdx
// is not used), so the kernel is meant to run as a single 1-D block with one
// thread per element.
// Preconditions: c, a and b are device pointers to at least blockDim.x ints.
// The kernel takes no length argument and performs no bounds check, so the
// launch must not use more threads than the arrays have elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Host entry point of the vector-add sample.
//
// Adds two fixed 5-element integer vectors on the GPU through addWithCuda()
// and prints the element-wise sums.
// Returns 0 on success, 1 if the addition or the final device reset fails.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output array receiving a[i] + b[i]; must hold `size` ints.
//   a, b - host input arrays of `size` ints each.
//   size - number of elements. The kernel is launched as one block of `size`
//          threads, so `size` must be non-zero and must not exceed the
//          device's per-block thread limit; otherwise the launch fails and the
//          launch error is returned.
//
// Returns cudaSuccess when every step succeeded, otherwise the status of the
// first failing CUDA call (a message is also written to stderr).
//
// Every exit path goes through the Error label so the three device buffers
// are released exactly once, whether or not an earlier step failed.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for the success and failure paths. Pointers that were
    // never allocated are still 0, which cudaFree accepts as a no-op.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}


CUDA编程需要先弄清Host和Device这两个基本概念:Host为主机(一般指CPU),Device为设备(GPU)。以__global__修饰的函数表示一个核函数(kernel),它在Device上运行,通常由Host(CPU)端代码通过<<<...>>>语法启动,且返回类型必须为void。以__host__修饰的函数与C/C++中的普通函数相同,是由CPU调用并执行的函数,该修饰符一般可以省略。

另外在__global__ void addKernel(int *c, const int *a, const int *b)函数中我们可以看到threadIdx.x的身影,这就涉及到CUDA编程中threadIdx, blockIdx, blockDim, gridDim等概念,主要是线程集束的分配和索引所需要的概念。在实际使用时会有1D/2D/3D等区别。可以参考官方教程手册《CUDA_C_Programming_Guide》。
函数cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size){}是CPU调用核函数addKernel(int *c, const int *a, const int *b)的主机函数,有cudaSetDevice、cudaMalloc、cudaMemcpy、cudaGetLastError、cudaDeviceSynchronize、cudaFree等操作,用于GPU内存分配、拷贝与释放、设备设置、错误处理以及数据同步等。代码addKernel<<<1, size>>>(dev_c, dev_a, dev_b);是真正启动核函数的操作。

NVIDIA有一个比较好的入门博客《An Easy Introduction to CUDA C and C++》,概念讲的比较全面。


#include <cstdio>

#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace std;
using namespace cv;
using namespace cv::cuda;

typedef unsigned char uchar;

// Per-pixel mask multiply: for every pixel, each of the three channels of
// input1_c3 is multiplied by the single-channel value of input2_c1 and divided
// by 255 (integer arithmetic), and the result is stored in dst.
//
// Launch layout: 2-D grid of 2-D blocks; thread x maps to the image column and
// thread y to the image row. Threads that fall outside the mask's
// cols x rows extent exit without touching memory, so the grid may overshoot.
// Preconditions: input1_c3 and dst must be at least as large as input2_c1,
// because only the mask's dimensions are checked.
__global__ void mutiply_kernel(const PtrStepSz<uchar3> input1_c3, const PtrStepSz<uchar> input2_c1, PtrStep<uchar3> dst)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Grid tail: the grid is rounded up to whole blocks, so some threads lie
    // beyond the image and must do nothing.
    if (x >= input2_c1.cols || y >= input2_c1.rows)
        return;

    // PtrStep::operator() takes (row, column), i.e. (y, x). Indexing with
    // (x, y) would transpose the access and run out of bounds on non-square
    // images.
    const int weight = input2_c1(y, x);
    const uchar3 pixel = input1_c3(y, x);

    // channel * weight is at most 255 * 255, and dividing by 255 brings it
    // back into the 0..255 range, so the narrowing casts cannot overflow.
    uchar3 out;
    out.x = static_cast<uchar>(pixel.x * weight / 255);
    out.y = static_cast<uchar>(pixel.y * weight / 255);
    out.z = static_cast<uchar>(pixel.z * weight / 255);
    dst(y, x) = out;
}

// Host-side launcher for mutiply_kernel.
//
// Parameters:
//   input1_c3 - 3-channel 8-bit source image in device memory.
//   input2_c1 - 1-channel 8-bit mask in device memory; its cols/rows define
//               the processed area and the launch grid.
//   dst       - 3-channel 8-bit destination in device memory.
//   stream    - CUDA stream to launch on. When it is the default stream (0)
//               the call blocks until the kernel has finished; for any other
//               stream the launch is asynchronous and the caller is
//               responsible for synchronising before reading dst.
//
// The function returns void, so failures are reported on stderr.
void cu_mul(const PtrStepSz<uchar3>& input1_c3, const PtrStepSz<uchar>& input2_c1, PtrStepSz<uchar3> dst, cudaStream_t stream)
{
    // Nothing to do for an empty mask; a zero-sized grid would be rejected as
    // an invalid launch configuration.
    if (input2_c1.cols <= 0 || input2_c1.rows <= 0)
        return;

    const dim3 block(32, 8);
    // Ceil-division so that a partially filled last block still covers the
    // right and bottom edges of the image.
    const dim3 grid((input2_c1.cols + block.x - 1) / block.x,
                    (input2_c1.rows + block.y - 1) / block.y);

    mutiply_kernel<<<grid, block, 0, stream>>>(input1_c3, input2_c1, dst);

    // A kernel launch returns nothing; configuration errors only show up here.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "mutiply_kernel launch failed: %s\n", cudaGetErrorString(status));
        return;
    }

    if (stream == 0) {
        // Default stream: wait for completion so execution errors surface now.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize failed after mutiply_kernel: %s\n", cudaGetErrorString(status));
        }
    }
}


#include "cuda_runtime.h"
#include "device_launch_parameters.h"


using namespace std;
using namespace cv;
using namespace cv::cuda;

void cu_mul(const PtrStepSz<uchar3>& input1_c3, const PtrStepSz<uchar>& input2_c1, PtrStepSz<uchar3> dst, cudaStream_t stream);

// Driver for the mask-multiply example.
//
// Loads "src.jpg" as a 3-channel image and "mask.png" as a 1-channel mask,
// uploads both to the GPU, runs cu_mul() on a cv::cuda::Stream, prints the
// elapsed time of the GPU step and writes the result to "out-cu.png".
// Returns 0 on success and 1 if an input cannot be read, the two images differ
// in size, or the output cannot be written.
int main()
{
    const std::string input_src_path = "src.jpg";
    const std::string input_mask_path = "mask.png";

    cv::Mat src = cv::imread(input_src_path, CV_LOAD_IMAGE_COLOR);
    // cu_mul takes the mask as PtrStepSz<uchar>, i.e. one byte per pixel, so
    // the mask has to be decoded as a single-channel image.
    cv::Mat mask = cv::imread(input_mask_path, CV_LOAD_IMAGE_GRAYSCALE);
    if (src.empty() || mask.empty()) {
        std::cerr << "Failed to read " << input_src_path << " or " << input_mask_path << std::endl;
        return 1;
    }
    // The kernel bounds-checks against the mask only, so both images must
    // cover the same area.
    if (src.size() != mask.size()) {
        std::cerr << "Source and mask sizes differ" << std::endl;
        return 1;
    }

    cv::cuda::GpuMat src_gpu;
    cv::cuda::GpuMat mask_gpu;
    cv::cuda::GpuMat r_gpu;
    // The kernel reads device memory; the host images must be uploaded first.
    src_gpu.upload(src);
    mask_gpu.upload(mask);
    r_gpu.create(src.size(), src.type());

    cv::cuda::Stream stream;
    double time1 = static_cast<double>(cv::getTickCount());
    // Launch the CUDA kernel that performs the per-pixel multiply.
    cu_mul(src_gpu, mask_gpu, r_gpu, StreamAccessor::getStream(stream));
    // The launch is asynchronous; wait for the stream so the measured
    // interval covers the kernel execution and r_gpu is complete.
    stream.waitForCompletion();
    double time2 = static_cast<double>(cv::getTickCount());
    double time_cuda = (time2 - time1) / cv::getTickFrequency();
    std::cout << "Time use: " << time_cuda << "s" << std::endl; // print the elapsed time

    // Copy the result back to host memory before writing it to disk.
    cv::Mat result;
    r_gpu.download(result);
    if (!cv::imwrite("out-cu.png", result)) {
        std::cerr << "Failed to write out-cu.png" << std::endl;
        return 1;
    }

    return 0;
}

