一个良好的cuda编程习惯里,我们习惯在调用一个cuda runtime api时,例如cudaMalloc() cudaMemcpy()我们就用error handler进行包装。这样
可以方便我们排查错误的来源
具体来说,CUDA的runtime API都会返回一个cudaError(枚举类), 可以通过枚举类来查看到它里面要么是成功了要么就是各种错误
__FILE__, __LINE__
这两个宏分别代表当前的文件名和行号, 下面错误信息中的文件名和行号就是由它们而来
ERROR: src/matmul_gpu_basic.cu:62, CODE:cudaErrorInvalidConfiguration, DETAIL:invalid configuration argument
至于这里两个,宏定义, 一个是用来检查CUDA Runtime API, 一个是检查核函数的。检查kernel function的时候,用LAST_KERNEL_CHECK()
, 这个放在同步后面, 确保之前的所有CUDA操作(包括kernel的执行)都已经完成, 再来检查
有cudaPeekAtLastError或者cudaGetLastError, 区别是是否传播错误
kernelFunction<<<numBlocks, numThreads>>>();
cudaError_t err1 = cudaPeekAtLastError(); // 只查看,不清除错误状态
cudaError_t err2 = cudaGetLastError(); // 查看并清除错误状态
// NOTE(review): the original header names were stripped (likely the <...>
// part was swallowed during extraction); restored to the headers this code
// actually needs — TODO confirm against the original file.
#include <cuda_runtime.h>  // cudaError_t, cudaSuccess, cudaGetErrorName, cudaGetErrorString
#include <stdio.h>         // printf

// Wrap a CUDA runtime API call; on failure reports file/line and exits.
#define CUDA_CHECK(call) __cudaCheck(call, __FILE__, __LINE__)
// Check the most recent kernel launch; place after a synchronization point.
#define LAST_KERNEL_CHECK() __kernelCheck(__FILE__, __LINE__)
#define BLOCKSIZE 16
// Report a failed CUDA runtime API call — file, line, error name and
// description — then terminate the process. Invoked via CUDA_CHECK.
inline static void __cudaCheck(cudaError_t err, const char* file, const int line) {
    if (err == cudaSuccess) {
        return;  // nothing to report
    }
    printf("ERROR: %s:%d, ", file, line);
    printf("CODE:%s, DETAIL:%s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    exit(1);
}
// Check for an error left behind by the most recent kernel launch.
//
// Kernel launches do not return a cudaError_t themselves; launch-time
// failures (such as an invalid launch configuration) must be fetched
// afterwards with cudaPeekAtLastError or cudaGetLastError. Peek is used
// here so the sticky error state is not cleared. Invoked via the
// LAST_KERNEL_CHECK macro, placed after a synchronization point so that
// preceding CUDA work (including the kernel) has completed.
inline static void __kernelCheck(const char* file, const int line) {
    cudaError_t err = cudaPeekAtLastError();
    if (err == cudaSuccess) {
        return;
    }
    printf("ERROR: %s:%d, ", file, line);
    printf("CODE:%s, DETAIL:%s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    exit(1);
}
这里分配之前矩阵乘法的blockSize = 64, 那么一个线程块里面有64x64=4096个线程,超出了1024的限制, 下面是不用LAST_KERNEL_CHECK()和用了的区别
不加是不会报错的
matmul in cpu uses 4092.84 ms
matmul in GPU Warmup uses 199.453 ms
matmul in GPU blockSize = 1 uses 13.1558 ms
matmul in GPU blockSize = 16 uses 13.0716 ms
matmul in GPU blockSize = 32 uses 13.0694 ms
matmul in GPU blockSize = 64 uses 2.00626 ms
res is different in 0, cpu: 260.89050293, gpu: 0.00000000
Matmul result is different
加了会出现报错, 这个错误 cudaErrorInvalidConfiguration 表示在执行CUDA kernel时,传递给 kernel 的配置参数无效。具体来说,CUDA kernel的配置包括线程块的数量、线程块内线程的数量等。
matmul in cpu uses 4115.42 ms
matmul in GPU Warmup uses 201.464 ms
matmul in GPU blockSize = 1 uses 13.1182 ms
matmul in GPU blockSize = 16 uses 13.0607 ms
matmul in GPU blockSize = 32 uses 13.0602 ms
ERROR: src/matmul_gpu_basic.cu:69, CODE:cudaErrorInvalidConfiguration, DETAIL:invalid configuration argument
// 分配grid, block
dim3 dimBlock(blockSize, blockSize);
int gridDim = (width + blockSize - 1) / blockSize;
dim3 dimGrid(gridDim, gridDim);
写成了
// 分配grid, block
dim3 dimBlock(blockSize, blockSize);
int gridDim = (width + blockSize - 1) / blockSize;
dim3 dimGrid(gridDim);
matmul in cpu uses 4152.26 ms
matmul in GPU Warmup uses 189.667 ms
matmul in GPU blockSize = 1 uses 2.92747 ms
matmul in GPU blockSize = 16 uses 2.85372 ms
matmul in GPU blockSize = 32 uses 2.86483 ms
res is different in 32768, cpu: 260.76977539, gpu: 0.00000000
这个没有报错, 因为这里grid只声明了一维, 线程块的数量不足以覆盖整个矩阵, 没有被线程块覆盖的部分根本不会被计算, 所以运行速度快了很多。以后如果CUDA编程中速度突然快了很多, 要先检查是否其实没有完成完整的计算。
当进行CUDA编程时,了解硬件规格是非常重要的,因为这些规格限制了你可以使用的并行策略和优化方式。
*********************Architecture related**********************
Device id: 7
Device name: NVIDIA GeForce RTX 3090
Device compute capability: 8.6
GPU global memory size: 23.70GB
L2 cache size: 6.00MB
Shared memory per block: 48.00KB
Shared memory per SM: 100.00KB
Device clock rate: 1.69GHz
Device memory clock rate: 9.75GHz
Number of SM: 82
Warp size: 32
*********************Parameter related************************
Max block numbers: 16
Max threads per block: 1024
Max block dimension size: 1024:1024:64
Max grid dimension size: 2147483647:65535:65535
// NOTE(review): the original header names were stripped during extraction;
// restored to the headers this translation unit actually uses — TODO confirm.
#include <cuda_runtime.h>  // cudaError_t, cudaGetErrorName, cudaGetErrorString
#include <stdio.h>         // printf, fprintf, vsnprintf
#include <stdarg.h>        // va_list / va_start / va_end for __log_info

// Wrap a CUDA runtime API call; on failure reports file/line and exits.
#define CUDA_CHECK(call) __cudaCheck(call, __FILE__, __LINE__)
// FIX: this was declared as LAST_KERNEL_CHECK(call) but ignored its argument,
// while the other version of this macro takes no parameter at all. A variadic
// parameter list accepts both call styles, so no existing caller breaks.
#define LAST_KERNEL_CHECK(...) __kernelCheck(__FILE__, __LINE__)
// Printf-style logging; forwards all arguments to __log_info.
#define LOG(...) __log_info(__VA_ARGS__)
#define BLOCKSIZE 16
// Print "ERROR: <file>:<line>, CODE:<name>, DETAIL:<description>" and abort
// the process when a CUDA runtime call did not succeed. Used via CUDA_CHECK.
static void __cudaCheck(cudaError_t err, const char* file, const int line) {
    if (err == cudaSuccess) return;
    printf("ERROR: %s:%d, ", file, line);
    printf("CODE:%s, DETAIL:%s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    exit(1);
}
// Inspect the sticky error from the most recent kernel launch without
// clearing it (cudaPeekAtLastError, as opposed to cudaGetLastError which
// clears), and abort with a diagnostic if one is pending.
// Used via the LAST_KERNEL_CHECK macro.
static void __kernelCheck(const char* file, const int line) {
    cudaError_t err = cudaPeekAtLastError();
    if (err == cudaSuccess) return;
    printf("ERROR: %s:%d, ", file, line);
    printf("CODE:%s, DETAIL:%s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    exit(1);
}
// Variadic printf-style logging helper: formats into a fixed-size buffer
// (silently truncated if longer) and writes it to stdout with a trailing
// newline. A recommended way to print logs; used through the LOG macro.
static void __log_info(const char* format, ...) {
    char buffer[1000];
    va_list ap;
    va_start(ap, format);
    vsnprintf(buffer, sizeof(buffer), format, ap);
    va_end(ap);
    fprintf(stdout, "%s\n", buffer);
}
// NOTE(review): the three system header names were lost during extraction;
// the code below needs at least the first two — TODO confirm the third
// against the original file.
#include <cuda_runtime.h>  // cudaGetDeviceCount, cudaSetDevice, cudaDeviceProp
#include <stdio.h>         // printf
#include <string>          // presumably used by utils.hpp — verify
#include "utils.hpp"
// Enumerate every visible CUDA device and dump its architecture-related and
// launch-parameter-related properties via the LOG helper from utils.hpp.
int main(){
    int count;
    int index = 0;
    // FIX: these runtime API calls were unchecked — the whole point of these
    // notes is to wrap every CUDA runtime call in CUDA_CHECK.
    CUDA_CHECK(cudaGetDeviceCount(&count));
    while (index < count) {
        CUDA_CHECK(cudaSetDevice(index));
        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, index));
        LOG("%-40s", "*********************Architecture related**********************");
        LOG("%-40s%d%s", "Device id: ", index, "");
        LOG("%-40s%s%s", "Device name: ", prop.name, "");
        LOG("%-40s%.1f%s", "Device compute capability: ", prop.major + (float)prop.minor / 10, "");
        // FIX: "meory" -> "memory" typo in the label.
        LOG("%-40s%.2f%s", "GPU global memory size: ", (float)prop.totalGlobalMem / (1<<30), "GB");
        LOG("%-40s%.2f%s", "L2 cache size: ", (float)prop.l2CacheSize / (1<<20), "MB");
        LOG("%-40s%.2f%s", "Shared memory per block: ", (float)prop.sharedMemPerBlock / (1<<10), "KB");
        LOG("%-40s%.2f%s", "Shared memory per SM: ", (float)prop.sharedMemPerMultiprocessor / (1<<10), "KB");
        // clockRate / memoryClockRate are reported in kHz, hence *1E-6 -> GHz.
        LOG("%-40s%.2f%s", "Device clock rate: ", prop.clockRate*1E-6, "GHz");
        // FIX: "Ghz" -> "GHz" unit typo in the label.
        LOG("%-40s%.2f%s", "Device memory clock rate: ", prop.memoryClockRate*1E-6, "GHz");
        LOG("%-40s%d%s", "Number of SM: ", prop.multiProcessorCount, "");
        LOG("%-40s%d%s", "Warp size: ", prop.warpSize, "");
        LOG("%-40s", "*********************Parameter related************************");
        LOG("%-40s%d%s", "Max block numbers: ", prop.maxBlocksPerMultiProcessor, "");
        LOG("%-40s%d%s", "Max threads per block: ", prop.maxThreadsPerBlock, "");
        // FIX: added the missing trailing space so this label aligns with the
        // others inside the %-40s field.
        LOG("%-40s%d:%d:%d%s", "Max block dimension size: ", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2], "");
        LOG("%-40s%d:%d:%d%s", "Max grid dimension size: ", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2], "");
        index ++;
        printf("\n");
    }
    return 0;
}