cudaOpenMP项目展示了如何在CUDA项目中运用OpenMP技术。
该项目位于CUDA Samples的0_Simple/cudaOpenMP文件夹下。
在正式开始剖析代码之前,让我们先来了解一下OpenMP的背景知识。
__global__ void kernelAddConstant(int *g_a, const int b) { int idx = blockIdx.x * blockDim.x + threadIdx.x; g_a[idx] += b; }可以看到,每一个CUDA线程先由blockIdx.x * blockDim.x + threadIdx.x计算出自己的全局索引idx,然后从全局内存g_a中读取该位置的数值,加上常量b后再写回原位置。
int num_gpus = 0; printf("%s Starting...\n\n", argv[0]); cudaGetDeviceCount(&num_gpus); printf("number of host CPUs:\t%d\n", omp_get_num_procs()); printf("number of CUDA devices:\t%d\n", num_gpus); unsigned int n = num_gpus * 8192; unsigned int nbytes = n * sizeof(int); int *a = 0; int b = 3; a = (int *)malloc(nbytes); for (unsigned int i = 0; i < n; ++i) a[i] = i;上面的代码中,函数cudaGetDeviceCount()用来获得系统中可用的GPU数目,它的值保存在变量num_gpus中。omp_get_num_procs()是OpenMP提供的库函数,它的作用是获取函数调用时系统中可用的CPU数目。变量n保存待处理的整数的数目,nbytes保存这些整数所需要的内存空间。指针a指示这部分空间的起始地址,代码最后对这部分内存的每一个元素赋初值。至此,初始化过程结束。
omp_set_num_threads(2 * num_gpus); #pragma omp parallel { unsigned int cpu_thread_id = omp_get_thread_num(); unsigned int num_cpu_threads = omp_get_num_threads(); int gpu_id = -1; checkCudaErrors(cudaSetDevice(cpu_thread_id % num_gpus)); checkCudaErrors(cudaGetDevice(&gpu_id)); printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id); int *d_a = 0; int *sub_a = a + cpu_thread_id * n / num_cpu_threads; unsigned int nbytes_per_kernel = nbytes / num_cpu_threads; dim3 gpu_threads(128); dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads)); checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel)); checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel)); checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice)); kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b); checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaFree(d_a)); }计算过程首先调用OpenMP函数omp_set_num_threads(),将执行并行代码片段的CPU线程数目设置为GPU数目的两倍(2 * num_gpus)。
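if (cudaSuccess != cudaGetLastError()) printf("%s\n", cudaGetErrorString(cudaGetLastError())); bool bResult = correctResult(a, n, b); if (a) free(a); cudaDeviceReset(); exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);这部分代码首先检查GPU执行是否出错,然后调用函数correctResult()检查执行结果是否正确,最后释放主机内存、调用cudaDeviceReset()重置设备并退出。需要注意的是,cudaGetLastError()在返回错误码的同时会把错误状态重置为cudaSuccess,因此这里第二次调用cudaGetLastError()通常拿不到刚才检测到的错误;更稳妥的写法是先把返回值保存到一个cudaError_t变量中,再用它做判断并传给cudaGetErrorString()。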
/**
 * Host-side check that every element of an array equals its index plus a
 * constant.
 *
 * @param data array to verify; must hold at least n readable elements
 * @param n    number of elements to check
 * @param b    constant expected to have been added to each element
 * @return 1 if data[i] == i + b for every i in [0, n) (including when n <= 0,
 *         since nothing is checked), otherwise 0
 */
int correctResult(int *data, const int n, const int b) {
  for (int i = 0; i < n; ++i) {
    // `!==` is not a C/C++ operator; the inequality comparison is `!=`.
    if (data[i] != i + b) {
      return 0;  // first mismatch is enough to fail the whole check
    }
  }
  return 1;
}