


下面以向量加法为例,介绍CUDA程序的实现。在我的版本中,除CUDA相关函数外,其余代码均以C++的方式实现,并尽可能使用STL的数据结构和算法。注意:CUDA核函数内部的日志输出不支持C++的输出流std::cout,只能使用C语言函数printf。另外,核函数内部的日志输出会大幅增加运行时间,因此核函数内的调试输出语句在使用完毕后必须注释掉。




// Standard headers used by this listing. The CUDA runtime API itself is made
// available implicitly by nvcc when compiling a .cu file.
#include <algorithm>  // std::copy
#include <chrono>     // std::chrono::high_resolution_clock, std::chrono::duration
#include <cstddef>    // size_t
#include <iostream>   // std::cout, std::cerr
#include <iterator>   // std::advance, std::ostream_iterator
#include <vector>     // std::vector

// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;

namespace {
// Number of float elements in each vector: 1 << 20 ==> 1048576
constexpr size_t kDataSize = 1 << 20;
// Size in bytes of one vector of kDataSize floats.
constexpr size_t kDataBytes = kDataSize * sizeof(float);
// Number of threads in each block of the kernel launch.
constexpr size_t kThreadPerBlock = 256;
}  // namespace

// Element-wise vector addition kernel:
//   device_result[i] = device_input1[i] + device_input2[i]  for 0 <= i < num.
//
// Each thread handles at most one element, selected by its global index
// blockDim.x * blockIdx.x + threadIdx.x. Threads whose index is >= num do
// nothing, so the launch may contain more threads than there are elements.
//
// device_input1, device_input2: input arrays, read at indices [0, num).
// num:                          number of elements to add.
// device_result:                output array, written at indices [0, num).
__global__ void AddVector(const float *const device_input1,
                          const float *const device_input2, const int num,
                          float *const device_result) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  // 	   blockDim.x, blockIdx.x, threadIdx.x, i);

  // Bounds check: the last block may extend past the end of the data.
  if (i < num) {
    // printf("device_input1[%d] = %.2f, device_input2[%d] = %.2f\n", i,
    // device_input1[i], i, device_input2[i]);
    device_result[i] = device_input1[i] + device_input2[i];
  }
}

// Adds two vectors of kDataSize floats on the GPU using explicit device
// allocations and host<->device copies. Every stage is timed with
// std::chrono::high_resolution_clock and its duration is printed in
// milliseconds. Returns 0 on success and 1 if any CUDA runtime call fails.
int main() {
  // Prints a diagnostic for a failed CUDA runtime call; returns true on
  // success. On failure main() returns immediately and relies on process exit
  // to release any device memory that is still allocated.
  const auto succeeded = [](const cudaError_t status, const char *const what) {
    if (status == cudaSuccess) {
      return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
    return false;
  };

  auto global_start = std::chrono::high_resolution_clock::now();

  // Initialize the input data on the host.
  auto start = std::chrono::high_resolution_clock::now();
  std::vector<float> host_input1(kDataSize, 0.0f), host_input2(kDataSize, 0.0f),
      host_result(kDataSize, 0.0f);
  for (size_t i = 0; i < kDataSize; ++i) {
    host_input1[i] = static_cast<float>(i);
    host_input2[i] = static_cast<float>(2 * i);
  }
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> duration = stop - start;
  std::cout << "The time taken to initialize data on the host is "
            << duration.count() << "ms. \n";

  // Allocate memories for the device pointers.
  start = std::chrono::high_resolution_clock::now();
  float *device_input1 = nullptr;
  float *device_input2 = nullptr;
  float *device_result = nullptr;
  if (!succeeded(cudaMalloc(&device_input1, kDataBytes), "cudaMalloc") ||
      !succeeded(cudaMalloc(&device_input2, kDataBytes), "cudaMalloc") ||
      !succeeded(cudaMalloc(&device_result, kDataBytes), "cudaMalloc")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to allocate memories on the device is "
            << duration.count() << "ms. \n";

  // Copy data from host to device
  start = std::chrono::high_resolution_clock::now();
  if (!succeeded(cudaMemcpy(device_input1, host_input1.data(), kDataBytes,
                            cudaMemcpyHostToDevice),
                 "cudaMemcpy (host to device)") ||
      !succeeded(cudaMemcpy(device_input2, host_input2.data(), kDataBytes,
                            cudaMemcpyHostToDevice),
                 "cudaMemcpy (host to device)")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to copy data frome the host to the device is "
            << duration.count() << "ms. \n";

  // Free up host memories
  start = std::chrono::high_resolution_clock::now();
  // The inputs now live on the device. Swapping with an empty temporary
  // releases the host buffers (clear() alone would keep the capacity).
  std::vector<float>().swap(host_input1);
  std::vector<float>().swap(host_input2);
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up memories on the host is "
            << duration.count() << "ms. \n";

  // Ceiling division so that a partially filled last block is still launched.
  const size_t block_per_grid =
      (kDataSize + kThreadPerBlock - 1) / kThreadPerBlock;
  std::cout << "block_per_grid = " << block_per_grid << std::endl;

  // Invoke the kernel function.
  start = std::chrono::high_resolution_clock::now();
  AddVector<<<block_per_grid, kThreadPerBlock>>>(
      device_input1, device_input2, static_cast<int>(kDataSize), device_result);
  // A kernel launch returns no status; launch errors are read back here.
  if (!succeeded(cudaGetLastError(), "AddVector launch")) {
    return 1;
  }
  // Kernel launches are asynchronous: wait for completion so that the timer
  // measures the kernel itself rather than only the launch overhead.
  if (!succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to perform the kernel task is "
            << duration.count() << "ms. \n";

  // Copy data from device to host.
  start = std::chrono::high_resolution_clock::now();
  if (!succeeded(cudaMemcpy(host_result.data(), device_result, kDataBytes,
                            cudaMemcpyDeviceToHost),
                 "cudaMemcpy (device to host)")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to copy data from the device to the host is "
            << duration.count() << "ms. \n";

  // Free up device memories.
  start = std::chrono::high_resolution_clock::now();
  if (!succeeded(cudaFree(device_input1), "cudaFree") ||
      !succeeded(cudaFree(device_input2), "cudaFree") ||
      !succeeded(cudaFree(device_result), "cudaFree")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up memories on the device is "
            << duration.count() << "ms. \n";

  // Show first 20 elements.
  start = std::chrono::high_resolution_clock::now();
  size_t display_num = 20;
  auto iter_end = host_result.begin();
  std::advance(iter_end, display_num);
  std::cout << "The first " << display_num << " results are:\n";
  // ostream_iterator<int> converts each float result to int before printing.
  std::copy(host_result.begin(), iter_end,
            std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to output the results is " << duration.count()
            << "ms. \n";

  auto global_stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> global_duration =
      global_stop - global_start;
  std::cout << "The whole time is " << global_duration.count() << "ms. \n";

  return 0;
}

上述代码中,我使用了如下C++ 11版的计时代码,请大家熟悉这种计时操作方式:

  // Timing pattern: take a clock reading before and after the measured code,
  // then convert the difference to milliseconds stored as a double.
  auto start = std::chrono::high_resolution_clock::now();
  //  Here is the code that is timed.
  // ...
  auto stop = std::chrono::high_resolution_clock::now();
  // duration<double, std::milli> makes count() return fractional milliseconds.
  std::chrono::duration<double, std::milli> duration = stop - start;
  std::cout << "The time taken to initialize data on the host is "
            << duration.count() << "ms. \n";


  // Free up host memories
  // Swapping with an empty temporary releases the vector's buffer; clear()
  // alone would only reset the size and keep the capacity allocated.
  std::vector<float>().swap(host_input1);
  std::vector<float>().swap(host_input2);


// Element-wise vector addition kernel:
//   device_result[i] = device_input1[i] + device_input2[i]  for 0 <= i < num.
//
// Each thread handles at most one element, selected by its global index
// blockDim.x * blockIdx.x + threadIdx.x. Threads whose index is >= num do
// nothing.
__global__ void AddVector(const float *const device_input1,
                          const float *const device_input2, const int num,
                          float *const device_result) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  // 	   blockDim.x, blockIdx.x, threadIdx.x, i);

  // Bounds check: the last block may extend past the end of the data.
  if (i < num) {
    // printf("device_input1[%d] = %.2f, device_input2[%d] = %.2f\n", i,
    // device_input1[i], i, device_input2[i]);
    device_result[i] = device_input1[i] + device_input2[i];
  }
}

还有另一种所谓的网格跨步循环(grid-stride loop)写法,代码如下:

// Element-wise vector addition using a grid-stride loop:
//   device_result[i] = device_input1[i] + device_input2[i]  for 0 <= i < num.
//
// Each thread starts at its global index and advances by the total number of
// threads in the grid, so any launch configuration covers all num elements.
// A non-positive num makes the kernel a no-op.
__global__ void AddVectorWithStride(const float *const device_input1,
                                    const float *const device_input2,
                                    const int num, float *const device_result) {
  // Comparing a size_t index with a negative int would convert the int to a
  // huge unsigned value and run the loop out of bounds, so reject it up front.
  if (num <= 0) {
    return;
  }
  const size_t count = static_cast<size_t>(num);

  // Widen before multiplying so the products cannot wrap in 32-bit arithmetic.
  const size_t index =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  // Notice that the stride of the loop is blockDim.x * gridDim.x which is the
  // total number of threads in the grid. So if there are 1280 threads in the
  // grid, thread 0 will compute elements 0, 1280, 2560, etc.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  // Rather than assume that the thread grid is large enough to cover the entire
  // data array, this kernel loops over the data array one grid-size at a time.
  for (size_t i = index; i < count; i += stride) {
    device_result[i] = device_input1[i] + device_input2[i];
  }
}


  // Print the first display_num results using STL iterators and algorithms.
  size_t display_num = 20;
  // iter_end is advanced to one past the last element to be printed.
  auto iter_end = host_result.begin();
  std::advance(iter_end, display_num);
  std::cout << "The first " << display_num << " results are:\n";
  // ostream_iterator<int> converts each float result to int before writing it,
  // followed by a single space.
  std::copy(host_result.begin(), iter_end,
            std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;


cmake_minimum_required(VERSION 3.0.0)
# Set the project name and its version
project(vector_add VERSION 0.1.0)

# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -g")

# Disable warning messages
# CMP0074 only exists in CMake 3.12 and later; setting an unknown policy is an
# error, so guard it to stay compatible with the declared minimum version.
if(POLICY CMP0074)
  cmake_policy(SET CMP0074 NEW)
endif()

# Find CUDA
# NOTE: the FindCUDA module is deprecated since CMake 3.10 in favour of
# first-class CUDA language support (project(... LANGUAGES CUDA)).
find_package(CUDA REQUIRED)

# These flags embed debugging information for both host and device code
list(APPEND CUDA_NVCC_FLAGS -G -g)

# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)


  1. 使用cmake的编译:
mkdir build && cd build
# 生成Makefile
cmake ..
# 构建目标,也可直接使用:make
cmake --build .
  2. 如直接使用nvcc编译,指令为:
nvcc -O3 -G -g -std=c++17 -o vector_add vector_add.cu


  1. GCC编译器版本必须为9.1以上版本(Ubuntu 20.04 2021年以后的版本默认就是GCC 9.3)


// First call

The time taken to initialize data on the host is 5.98281ms.
The time taken to allocate memories on the device is 1645.37ms.
The time taken to copy data frome the host to the device is 1.8091ms.
The time taken to free up memories on the host is 0.505051ms.
block_per_grid = 4096
The time taken to perform the kernel task is 0.050156ms.
The time taken to copy data from the device to the host is 1.14139ms.
The time taken to free up memories on the device is 0.234709ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.005323ms.
The whole time is 1655.22ms.

// Second call

The time taken to initialize data on the host is 5.7178ms.
The time taken to allocate memories on the device is 82.1604ms.
The time taken to copy data frome the host to the device is 1.57324ms.
The time taken to free up memories on the host is 0.473982ms.
block_per_grid = 4096
The time taken to perform the kernel task is 0.009277ms.
The time taken to copy data from the device to the host is 0.801573ms.
The time taken to free up memories on the device is 0.136609ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.007459ms.
The whole time is 90.9926ms.


在上面的实现中,我们需要分别在host和device上分配内存,并在两者之间拷贝数据,这很容易出错。CUDA 6.0引入了统一内存(Unified Memory)来避免这种麻烦:简单来说,统一内存使用一块托管内存来统一管理host和device上的内存,并自动在host和device之间传输数据。对应的内存分配函数为cudaMallocManaged:

 cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);




// Standard headers used by this listing. The CUDA runtime API itself is made
// available implicitly by nvcc when compiling a .cu file.
#include <algorithm>  // std::copy
#include <chrono>     // std::chrono::high_resolution_clock, std::chrono::duration
#include <cstddef>    // size_t
#include <iostream>   // std::cout, std::cerr
#include <iterator>   // std::ostream_iterator

// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;

namespace {
// Number of float elements in each vector: 1 << 20 ==> 1048576
constexpr size_t kDataSize = 1 << 20;
// Size in bytes of one vector of kDataSize floats.
constexpr size_t kDataBytes = kDataSize * sizeof(float);
// Number of threads in each block of the kernel launch.
constexpr size_t kThreadPerBlock = 256;
}  // namespace

// Element-wise vector addition kernel:
//   managed_result[i] = managed_input1[i] + managed_input2[i]  for 0 <= i < num.
//
// Each thread handles at most one element, selected by its global index
// blockDim.x * blockIdx.x + threadIdx.x. Threads whose index is >= num do
// nothing, so the launch may contain more threads than there are elements.
__global__ void AddVector(const float *const managed_input1,
                          const float *const managed_input2, const int num,
                          float *const managed_result) {
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  // 	   blockDim.x, blockIdx.x, threadIdx.x, i);

  // Bounds check: the last block may extend past the end of the data.
  if (i < num) {
    // printf("managed_input1[%d] = %.2f, managed_input2[%d] = %.2f\n", i,
    // managed_input1[i], i, managed_input2[i]);
    managed_result[i] = managed_input1[i] + managed_input2[i];
  }
}

// Adds two vectors of kDataSize floats on the GPU using unified (managed)
// memory, so no explicit host<->device copies are written. Every stage is
// timed with std::chrono::high_resolution_clock and its duration is printed
// in milliseconds. Returns 0 on success and 1 if any CUDA runtime call fails.
int main() {
  // Prints a diagnostic for a failed CUDA runtime call; returns true on
  // success. On failure main() returns immediately and relies on process exit
  // to release any managed memory that is still allocated.
  const auto succeeded = [](const cudaError_t status, const char *const what) {
    if (status == cudaSuccess) {
      return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << '\n';
    return false;
  };

  auto global_start = std::chrono::high_resolution_clock::now();

  // Use unified memories for the data pointers.
  auto start = std::chrono::high_resolution_clock::now();
  float *managed_input1 = nullptr;
  float *managed_input2 = nullptr;
  float *managed_result = nullptr;
  if (!succeeded(cudaMallocManaged(&managed_input1, kDataBytes),
                 "cudaMallocManaged") ||
      !succeeded(cudaMallocManaged(&managed_input2, kDataBytes),
                 "cudaMallocManaged") ||
      !succeeded(cudaMallocManaged(&managed_result, kDataBytes),
                 "cudaMallocManaged")) {
    return 1;
  }
  // Managed pointers are written directly from host code here.
  for (size_t i = 0; i < kDataSize; ++i) {
    managed_input1[i] = static_cast<float>(i);
    managed_input2[i] = static_cast<float>(2 * i);
  }
  auto stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> duration = stop - start;
  std::cout
      << "The time taken to allocate managed memories and initialize data is "
      << duration.count() << "ms. \n";

  // Ceiling division so that a partially filled last block is still launched.
  const size_t block_per_grid =
      (kDataSize + kThreadPerBlock - 1) / kThreadPerBlock;
  std::cout << "block_per_grid = " << block_per_grid << std::endl;

  // Invoke the kernel function.
  start = std::chrono::high_resolution_clock::now();
  AddVector<<<block_per_grid, kThreadPerBlock>>>(
      managed_input1, managed_input2, static_cast<int>(kDataSize),
      managed_result);
  // A kernel launch returns no status; launch errors are read back here.
  if (!succeeded(cudaGetLastError(), "AddVector launch")) {
    return 1;
  }

  // Synchronize the device to obtain correct results.
  // The launch is asynchronous and there is no blocking cudaMemcpy in this
  // version, so the host must wait before it reads managed_result.
  if (!succeeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to perform the kernel task is "
            << duration.count() << "ms. \n";

  // Show first 20 elements.
  start = std::chrono::high_resolution_clock::now();
  size_t display_num = 20;
  std::cout << "The first " << display_num << " results are:\n";
  // ostream_iterator<int> converts each float result to int before printing.
  std::copy(managed_result, managed_result + display_num,
            std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to output the results is " << duration.count()
            << "ms. \n";

  // Free up managed memories.
  // Restart the timer so this stage does not include the output stage above.
  start = std::chrono::high_resolution_clock::now();
  if (!succeeded(cudaFree(managed_input1), "cudaFree") ||
      !succeeded(cudaFree(managed_input2), "cudaFree") ||
      !succeeded(cudaFree(managed_result), "cudaFree")) {
    return 1;
  }
  stop = std::chrono::high_resolution_clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up managed memories is "
            << duration.count() << "ms. \n";

  auto global_stop = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> global_duration =
      global_stop - global_start;
  std::cout << "The whole time is " << global_duration.count() << "ms. \n";

  return 0;
}


// Synchronize the device to obtain correct results.
// Kernel launches return to the host immediately; block here until the device
// has finished before the host reads the managed result buffer.
cudaDeviceSynchronize();


First call:

The time taken to allocate managed memories and initialize data is 1693.49ms.
block_per_grid = 4096
The time taken to perform the kernel task is 2.88018ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.066513ms.
The time taken to free up managed memories is 0.956636ms.
The whole time is 1697.41ms.

Second call:

The time taken to allocate managed memories and initialize data is 74.3958ms.
block_per_grid = 4096
The time taken to perform the kernel task is 2.47183ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.037014ms.
The time taken to free up managed memories is 0.624669ms.
The whole time is 77.5455ms.


CUDA GDB在host端的调试指令和普通GDB无差别,在device端的调试指令主要有如下几种:

cuda thread lane warp block sm grid device kernel

关于CUDA GDB的详细介绍,请参考CUDA官方文档:https://docs.nvidia.com/cuda/cuda-gdb/index.html

nvcc -O3 -G -g -std=c++17 -o vector_add vector_add.cu


cuda-gdb -q vector_add
# 在22行处设置断点
b vector_add.cu:22
# 运行程序
r
# 显示CUDA相关信息
cuda thread lane warp block sm grid device kernel
# 切换到第128个CUDA线程
cuda thread (128)
# 显示变量blockDim.x的值
p blockDim.x
# 显示变量i的值
p i
# 设置条件断点:当i等于512时中断
b if i == 512
# 继续运行程序
c
# 显示变量i的值
p i
# 显示显卡信息
info cuda devices

