贺志国
下面以向量加法为例,介绍CUDA实现代码,以下是具体代码`vector_add.cu`。我的版本除CUDA相关函数外,其他代码均以C++的方式实现,并且尽可能调用STL库的数据结构和算法。注意:CUDA核函数内部的日志输出不支持C++的输出流`std::cout`,而只能使用C语言函数`printf`。另外,核函数内部的日志输出会大幅增加运行时间。因此,核函数内的调试输出语句使用完毕后,必须注释掉。
#include <cuda_runtime.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <iterator>
#include <vector>
// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;
namespace {
// 1 << 20 ==> 1048576
constexpr size_t kDataSize = 1 << 20;
constexpr size_t kDataBytes = kDataSize * sizeof(float);
constexpr size_t kThreadPerBlock = 256;
} // namespace
// Element-wise vector addition kernel.
//
// Each thread derives one flat index from the x components of blockDim,
// blockIdx and threadIdx. If that index is below `num`, the thread stores
// device_input1[index] + device_input2[index] into device_result[index];
// otherwise it does nothing. Only the x dimension of the launch is used.
//
// Parameters:
//   device_input1, device_input2  read-only operands, indexed up to num - 1.
//   num                           number of elements to add.
//   device_result                 output array, indexed up to num - 1.
__global__ void AddVector(const float *const device_input1,
                          const float *const device_input2, const int num,
                          float *const device_result) {
  const int index = blockDim.x * blockIdx.x + threadIdx.x;
  // Debug aid (slow, keep disabled):
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  //        blockDim.x, blockIdx.x, threadIdx.x, index);

  // Threads past the end of the data have nothing to do.
  if (index >= num) {
    return;
  }

  // Debug aid (slow, keep disabled):
  // printf("device_input1[%d] = %.2f, device_input2[%d] = %.2f\n", index,
  //        device_input1[index], index, device_input2[index]);
  const float lhs = device_input1[index];
  const float rhs = device_input2[index];
  device_result[index] = lhs + rhs;
}
// Host driver for the explicit-copy vector-add example.
//
// Steps, each timed with std::chrono and reported on stdout:
//   1. fill two host vectors (i and 2 * i),
//   2. allocate three device buffers,
//   3. copy the inputs to the device,
//   4. release the host input vectors,
//   5. launch AddVector and wait for it to finish,
//   6. copy the result back,
//   7. free the device buffers,
//   8. print the first 20 results.
//
// Returns 0 on success and 1 if any CUDA runtime call reports an error (the
// error text is written to stderr).
int main() {
  using Clock = std::chrono::high_resolution_clock;
  using Milliseconds = std::chrono::duration<double, std::milli>;

  const auto global_start = Clock::now();

  // Declared up front so the error path below can release them.
  float *device_input1 = nullptr;
  float *device_input2 = nullptr;
  float *device_result = nullptr;

  // Returns true when `status` is cudaSuccess; otherwise logs the failure.
  const auto cuda_ok = [](const cudaError_t status, const char *const what) {
    if (status == cudaSuccess) {
      return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
  };

  // Error exit used before the regular free step: releases whatever device
  // buffers exist (cudaFree on a null pointer performs no operation) and
  // yields the failure exit code.
  const auto bail_out = [&]() {
    cudaFree(device_input1);
    cudaFree(device_input2);
    cudaFree(device_result);
    return 1;
  };

  // Initialize the host data.
  auto start = Clock::now();
  std::vector<float> host_input1(kDataSize, 0.0f),
      host_input2(kDataSize, 0.0f), host_result(kDataSize, 0.0f);
  for (size_t i = 0; i < kDataSize; ++i) {
    host_input1[i] = static_cast<float>(i);
    host_input2[i] = static_cast<float>(2 * i);
  }
  auto stop = Clock::now();
  Milliseconds duration = stop - start;
  std::cout << "The time taken to initialize data on the host is "
            << duration.count() << "ms. \n";

  // Allocate memories for the device pointers.
  start = Clock::now();
  if (!cuda_ok(cudaMalloc(reinterpret_cast<void **>(&device_input1),
                          kDataBytes),
               "cudaMalloc(device_input1)") ||
      !cuda_ok(cudaMalloc(reinterpret_cast<void **>(&device_input2),
                          kDataBytes),
               "cudaMalloc(device_input2)") ||
      !cuda_ok(cudaMalloc(reinterpret_cast<void **>(&device_result),
                          kDataBytes),
               "cudaMalloc(device_result)")) {
    return bail_out();
  }
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to allocate memories on the device is "
            << duration.count() << "ms. \n";

  // Copy data from host to device
  start = Clock::now();
  if (!cuda_ok(cudaMemcpy(device_input1, host_input1.data(), kDataBytes,
                          cudaMemcpyHostToDevice),
               "cudaMemcpy(device_input1)") ||
      !cuda_ok(cudaMemcpy(device_input2, host_input2.data(), kDataBytes,
                          cudaMemcpyHostToDevice),
               "cudaMemcpy(device_input2)")) {
    return bail_out();
  }
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to copy data frome the host to the device is "
            << duration.count() << "ms. \n";

  // Free up host memories
  start = Clock::now();
  host_input1.clear();
  host_input1.shrink_to_fit();
  host_input2.clear();
  host_input2.shrink_to_fit();
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up memories on the host is "
            << duration.count() << "ms. \n";

  // Ceiling division so a partial last block still covers the tail elements.
  const size_t block_per_grid =
      (kDataSize + kThreadPerBlock - 1) / kThreadPerBlock;
  std::cout << "block_per_grid = " << block_per_grid << std::endl;

  // Invoke the kernel function.
  start = Clock::now();
  AddVector<<<block_per_grid, kThreadPerBlock>>>(
      device_input1, device_input2, static_cast<int>(kDataSize),
      device_result);
  // A launch returns before the kernel has run, so an invalid launch
  // configuration is only visible through cudaGetLastError(), and the timer
  // must not stop until cudaDeviceSynchronize() has returned; otherwise only
  // the launch overhead would be measured.
  if (!cuda_ok(cudaGetLastError(), "AddVector launch") ||
      !cuda_ok(cudaDeviceSynchronize(), "AddVector execution")) {
    return bail_out();
  }
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to perform the kernel task is "
            << duration.count() << "ms. \n";

  // Copy data from device to host.
  start = Clock::now();
  if (!cuda_ok(cudaMemcpy(host_result.data(), device_result, kDataBytes,
                          cudaMemcpyDeviceToHost),
               "cudaMemcpy(host_result)")) {
    return bail_out();
  }
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to copy data from the device to the host is "
            << duration.count() << "ms. \n";

  // Free up device memories. All three frees are attempted even if one fails.
  start = Clock::now();
  const bool freed_input1 =
      cuda_ok(cudaFree(device_input1), "cudaFree(device_input1)");
  const bool freed_input2 =
      cuda_ok(cudaFree(device_input2), "cudaFree(device_input2)");
  const bool freed_result =
      cuda_ok(cudaFree(device_result), "cudaFree(device_result)");
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up memories on the device is "
            << duration.count() << "ms. \n";
  if (!(freed_input1 && freed_input2 && freed_result)) {
    return 1;
  }

  // Show first 20 elements.
  start = Clock::now();
  size_t display_num = 20;
  auto iter_end = host_result.begin();
  std::advance(iter_end, display_num);
  std::cout << "The first " << display_num << " results are:\n";
  // ostream_iterator<int> converts each float to int before printing.
  std::copy(host_result.begin(), iter_end,
            std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to output the results is " << duration.count()
            << "ms. \n";

  const auto global_stop = Clock::now();
  const Milliseconds global_duration = global_stop - global_start;
  std::cout << "The whole time is " << global_duration.count() << "ms. \n";
  return 0;
}
上述代码中,我使用了如下C++ 11版的计时代码,请大家熟悉这种计时操作方式:
auto start = std::chrono::high_resolution_clock::now();
// Here is the code that is timed.
// ...
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = stop - start;
std::cout << "The time taken to initialize data on the host is "
<< duration.count() << "ms. \n";
另外,`std::vector`的内存清理不能只调用`clear`函数(它只销毁元素,不改变容量),而必须在之后调用`shrink_to_fit`才能真正释放占用的内存空间(按照C++标准,`shrink_to_fit`是一个非强制性请求,是否释放由标准库实现决定):
// Free up host memories
host_input1.clear();
host_input1.shrink_to_fit();
host_input2.clear();
host_input2.shrink_to_fit();
内核函数:
// Element-wise vector addition kernel.
//
// Each thread derives one flat index from the x components of blockDim,
// blockIdx and threadIdx. If that index is below `num`, the thread stores
// device_input1[index] + device_input2[index] into device_result[index];
// otherwise it does nothing. Only the x dimension of the launch is used.
__global__ void AddVector(const float *const device_input1,
                          const float *const device_input2, const int num,
                          float *const device_result) {
  const int index = blockDim.x * blockIdx.x + threadIdx.x;
  // Debug aid (slow, keep disabled):
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  //        blockDim.x, blockIdx.x, threadIdx.x, index);

  // Threads past the end of the data have nothing to do.
  if (index >= num) {
    return;
  }

  // Debug aid (slow, keep disabled):
  // printf("device_input1[%d] = %.2f, device_input2[%d] = %.2f\n", index,
  //        device_input1[index], index, device_input2[index]);
  const float lhs = device_input1[index];
  const float rhs = device_input2[index];
  device_result[index] = lhs + rhs;
}
还有另一种所谓的网格跨步循环(grid-stride loop)写法,代码如下:
// Element-wise vector addition kernel that loops over the data.
//
// Each thread starts at its own flat global index (x dimension only) and
// advances by blockDim.x * gridDim.x, the total number of threads in the
// grid, until it passes `num`. For every index i it visits it stores
// device_input1[i] + device_input2[i] into device_result[i]. The result is
// therefore complete for any grid size, including a grid smaller than the
// data.
//
// Parameters:
//   device_input1, device_input2  read-only operands, indexed up to num - 1.
//   num                           number of elements; values <= 0 do nothing.
//   device_result                 output array, indexed up to num - 1.
__global__ void AddVectorWithStride(const float *const device_input1,
                                    const float *const device_input2,
                                    const int num, float *const device_result) {
  // Comparing a size_t index against a negative int would convert the int to
  // a huge unsigned value and let the loop run out of bounds, so reject
  // non-positive counts first and compare against an unsigned copy.
  if (num <= 0) {
    return;
  }
  const size_t count = static_cast<size_t>(num);

  // Widen before multiplying: the built-in index variables are 32-bit
  // unsigned, so the product would otherwise wrap before being stored.
  const size_t index =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  // Notice that the stride of the loop is blockDim.x * gridDim.x which is the
  // total number of threads in the grid. So if there are 1280 threads in the
  // grid, thread 0 will compute elements 0, 1280, 2560, etc.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  // Rather than assume that the thread grid is large enough to cover the
  // entire data array, this kernel loops over the data array one grid-size at
  // a time.
  for (size_t i = index; i < count; i += stride) {
    device_result[i] = device_input1[i] + device_input2[i];
  }
}
这种写法不假设网格(grid)中的线程总数足以覆盖整个数组:每个线程从自己的全局索引出发,以网格的线程总数(blockDim.x * gridDim.x)为步长循环,依次处理多个元素。例如,网格共有1280个线程时,线程0依次处理第0、1280、2560……个元素。这样无论启动多少个线程,都能处理完所有数据。这两种内核函数在我的GPU显卡上执行时间上似乎没有差异。
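显示结果时,注意我使用的是STL库函数`std::copy`,大家要学会这种用法,自己的水平就会不断提高。需要注意的是,这里的`std::ostream_iterator<int>`会把每个`float`结果转换为`int`后再输出,小数部分会被截断;如果需要保留小数,应改用`std::ostream_iterator<float>`。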
size_t display_num = 20;
auto iter_end = host_result.begin();
std::advance(iter_end, display_num);
std::cout << "The first " << display_num << " results are:\n";
std::copy(host_result.begin(), iter_end,
std::ostream_iterator<int>(std::cout, " "));
std::cout << std::endl;
CMake的编译配置文件如下:
# Policy CMP0074, which is set explicitly below, was introduced in CMake 3.12,
# so that is the oldest version this script can be configured with.
cmake_minimum_required(VERSION 3.12)
# Set the project name and its version
project(vector_add VERSION 0.1.0)
# Set the c++17 standard
set(CMAKE_CXX_STANDARD 17)
# Enable warnings and set the -O3 optimization level with debug information
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -g")
# Let find_package() honor <PackageName>_ROOT variables
cmake_policy(SET CMP0074 NEW)
# Find CUDA
find_package(CUDA REQUIRED)
# These flags embed debugging information for both host (-g) and device (-G)
# code. They are appended so that NVCC flags supplied by the user are kept.
list(APPEND CUDA_NVCC_FLAGS -G -g)
# Generate a cuda executable file.
cuda_add_executable(${PROJECT_NAME} ${PROJECT_NAME}.cu)
mkdir build && cd build
# 生成Makefile
cmake ..
# 构建目标,也可直接使用:make
cmake --build .
也可以不使用CMake,直接使用`nvcc`编译,指令为:nvcc -O3 -G -g -std=c++17 vector_add.cu -o vector_add
注意:CUDA代码带调试信息会极大降低效率,除非调试需要,否则不要添加-G选项。
以下是运行结果,从结果可见,第一次调用因初始化显卡设备导致耗时较长,第二次调用就恢复正常了,整个调用过程中,CUDA内存分配耗时最长:
// First call
The time taken to initialize data on the host is 5.98281ms.
The time taken to allocate memories on the device is 1645.37ms.
The time taken to copy data frome the host to the device is 1.8091ms.
The time taken to free up memories on the host is 0.505051ms.
block_per_grid = 4096
The time taken to perform the kernel task is 0.050156ms.
The time taken to copy data from the device to the host is 1.14139ms.
The time taken to free up memories on the device is 0.234709ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.005323ms.
The whole time is 1655.22ms.
// Second call
The time taken to initialize data on the host is 5.7178ms.
The time taken to allocate memories on the device is 82.1604ms.
The time taken to copy data frome the host to the device is 1.57324ms.
The time taken to free up memories on the host is 0.473982ms.
block_per_grid = 4096
The time taken to perform the kernel task is 0.009277ms.
The time taken to copy data from the device to the host is 0.801573ms.
The time taken to free up memories on the device is 0.136609ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.007459ms.
The whole time is 90.9926ms.
*/
在上面的实现中,我们需要单独在host和device上进行内存分配,并且要进行数据拷贝,这是很容易出错的。CUDA 6.0引入统一内存(Unified Memory)来避免这种麻烦,简单来说就是统一内存使用一个托管内存来共同管理host和device中的内存,并且自动在host和device中进行数据传输,具体函数为`cudaMallocManaged`:
cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flag=0);
注意:使用统一内存,不会降低耗时,也不会减少内存占用,只会降低代码工作量。一般来说,要部署于生产环境的代码,尽可能不使用统一内存分配。利用统一内存,可以将上面的程序简化如下:
#include <cuda_runtime.h>
#include <algorithm>
#include <chrono>
#include <iostream>
#include <iterator>
#include <vector>
// For units of time such as h, min, s, ms, us, ns
using namespace std::literals::chrono_literals;
namespace {
// 1 << 20 ==> 1048576
constexpr size_t kDataSize = 1 << 20;
constexpr size_t kDataBytes = kDataSize * sizeof(float);
constexpr size_t kThreadPerBlock = 256;
} // namespace
// Element-wise vector addition kernel for the unified-memory example.
//
// Each thread derives one flat index from the x components of blockDim,
// blockIdx and threadIdx. If that index is below `num`, the thread stores
// managed_input1[index] + managed_input2[index] into managed_result[index];
// otherwise it does nothing. Only the x dimension of the launch is used.
//
// Parameters:
//   managed_input1, managed_input2  read-only operands, indexed up to num - 1.
//   num                             number of elements to add.
//   managed_result                  output array, indexed up to num - 1.
__global__ void AddVector(const float *const managed_input1,
                          const float *const managed_input2, const int num,
                          float *const managed_result) {
  const int index = blockDim.x * blockIdx.x + threadIdx.x;
  // Debug aid (slow, keep disabled):
  // printf("blockDim.x = %d, blockIdx.x = %d, threadIdx.x = %d, i = %d\n",
  //        blockDim.x, blockIdx.x, threadIdx.x, index);

  // Threads past the end of the data have nothing to do.
  if (index >= num) {
    return;
  }

  // Debug aid (slow, keep disabled):
  // printf("managed_input1[%d] = %.2f, managed_input2[%d] = %.2f\n", index,
  //        managed_input1[index], index, managed_input2[index]);
  const float lhs = managed_input1[index];
  const float rhs = managed_input2[index];
  managed_result[index] = lhs + rhs;
}
// Host driver for the unified-memory vector-add example.
//
// Steps, each timed with std::chrono and reported on stdout:
//   1. allocate three managed buffers and fill the two inputs (i and 2 * i),
//   2. launch AddVector and wait for it to finish,
//   3. print the first 20 results directly from the managed result buffer,
//   4. free the managed buffers.
//
// Returns 0 on success and 1 if any CUDA runtime call reports an error (the
// error text is written to stderr).
int main() {
  using Clock = std::chrono::high_resolution_clock;
  using Milliseconds = std::chrono::duration<double, std::milli>;

  const auto global_start = Clock::now();

  // Declared up front so the error path below can release them.
  float *managed_input1 = nullptr;
  float *managed_input2 = nullptr;
  float *managed_result = nullptr;

  // Returns true when `status` is cudaSuccess; otherwise logs the failure.
  const auto cuda_ok = [](const cudaError_t status, const char *const what) {
    if (status == cudaSuccess) {
      return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
    return false;
  };

  // Error exit used before the regular free step: releases whatever managed
  // buffers exist (cudaFree on a null pointer performs no operation) and
  // yields the failure exit code.
  const auto bail_out = [&]() {
    cudaFree(managed_input1);
    cudaFree(managed_input2);
    cudaFree(managed_result);
    return 1;
  };

  // Use unified memories for the data pointers.
  auto start = Clock::now();
  if (!cuda_ok(cudaMallocManaged(reinterpret_cast<void **>(&managed_input1),
                                 kDataBytes),
               "cudaMallocManaged(managed_input1)") ||
      !cuda_ok(cudaMallocManaged(reinterpret_cast<void **>(&managed_input2),
                                 kDataBytes),
               "cudaMallocManaged(managed_input2)") ||
      !cuda_ok(cudaMallocManaged(reinterpret_cast<void **>(&managed_result),
                                 kDataBytes),
               "cudaMallocManaged(managed_result)")) {
    return bail_out();
  }
  for (size_t i = 0; i < kDataSize; ++i) {
    managed_input1[i] = static_cast<float>(i);
    managed_input2[i] = static_cast<float>(2 * i);
  }
  auto stop = Clock::now();
  Milliseconds duration = stop - start;
  std::cout
      << "The time taken to allocate managed memories and initialize data is "
      << duration.count() << "ms. \n";

  // Ceiling division so a partial last block still covers the tail elements.
  const size_t block_per_grid =
      (kDataSize + kThreadPerBlock - 1) / kThreadPerBlock;
  std::cout << "block_per_grid = " << block_per_grid << std::endl;

  // Invoke the kernel function.
  start = Clock::now();
  AddVector<<<block_per_grid, kThreadPerBlock>>>(
      managed_input1, managed_input2, static_cast<int>(kDataSize),
      managed_result);
  // Synchronize the device to obtain correct results: the launch returns
  // before the kernel has run, so the host must wait before reading
  // managed_result. cudaGetLastError() reports an invalid launch
  // configuration; the synchronize call reports failures during execution.
  if (!cuda_ok(cudaGetLastError(), "AddVector launch") ||
      !cuda_ok(cudaDeviceSynchronize(), "AddVector execution")) {
    return bail_out();
  }
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to perform the kernel task is "
            << duration.count() << "ms. \n";

  // Show first 20 elements.
  start = Clock::now();
  size_t display_num = 20;
  std::cout << "The first " << display_num << " results are:\n";
  // ostream_iterator<int> converts each float to int before printing.
  std::copy(managed_result, managed_result + display_num,
            std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to output the results is " << duration.count()
            << "ms. \n";

  // Free up managed memories. The timer is restarted here so the reported
  // value covers only the frees and not the output phase above. All three
  // frees are attempted even if one fails.
  start = Clock::now();
  const bool freed_input1 =
      cuda_ok(cudaFree(managed_input1), "cudaFree(managed_input1)");
  const bool freed_input2 =
      cuda_ok(cudaFree(managed_input2), "cudaFree(managed_input2)");
  const bool freed_result =
      cuda_ok(cudaFree(managed_result), "cudaFree(managed_result)");
  stop = Clock::now();
  duration = stop - start;
  std::cout << "The time taken to free up managed memories is "
            << duration.count() << "ms. \n";
  if (!(freed_input1 && freed_input2 && freed_result)) {
    return 1;
  }

  const auto global_stop = Clock::now();
  const Milliseconds global_duration = global_stop - global_start;
  std::cout << "The whole time is " << global_duration.count() << "ms. \n";
  return 0;
}
注意,使用统一内存的方式,获取最终的计算结果前,需要调用如下代码来同步device的结果,如果不添加同步语句,最终的结果会不正确:
// Synchronize the device to obtain correct results.
cudaDeviceSynchronize();
使用统一内存方式的运行结果如下:
First call:
The time taken to allocate managed memories and initialize data is 1693.49ms.
block_per_grid = 4096
The time taken to perform the kernel task is 2.88018ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.066513ms.
The time taken to free up managed memories is 0.956636ms.
The whole time is 1697.41ms.
Second call:
The time taken to allocate managed memories and initialize data is 74.3958ms.
block_per_grid = 4096
The time taken to perform the kernel task is 2.47183ms.
The first 20 results are:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57
The time taken to output the results is 0.037014ms.
The time taken to free up managed memories is 0.624669ms.
The whole time is 77.5455ms.
CUDA GDB在host端的调试指令和普通GDB无差别,在device端的调试指令主要有如下几种:
cuda thread lane warp block sm grid device kernel
关于CUDA GDB的详细介绍,请参考CUDA官方网站:https://docs.nvidia.com/cuda/cuda-gdb/#cuda-gdb-extensions
编译时添加如下指令,CUDA程序才能被调试:
nvcc -O3 -G -g -std=c++17 vector_add.cu -o vector_add
上述指令中,`-G`表示设备端(device)代码带调试信息,`-g`表示主机端(host)代码带调试信息。
调试过程示意图如下所示:
我在该调试过程中演示了如下指令:
# 启动CUDA GDB
cuda-gdb -q vector_add
# 在22行处设置断点
b vector_add.cu:22
# 运行程序
r
# 显示CUDA相关信息
cuda thread lane warp block sm grid device kernel
# 切换到第128个CUDA线程
cuda thread (128)
# 显示变量blockDim.x的值
p blockDim.x
# 显示变量i的值
p i
# 设置条件断点:当i等于512时中断
b vector_add.cu:26 if i == 512
# 继续运行程序
c
# 显示变量i的值
p i
# 显示显卡信息
info cuda devices