Preface:
How CUDA threads are organized (that is, the block dimension configuration) has a crucial impact on program performance.
Thread indexing:
A matrix is stored linearly in memory in row-major order.
Inside a kernel, a thread's unique index is very useful. To determine it (taking the 2D case as an example), you need:
- the thread and block indices
- the coordinates of the element within the matrix
- the offset into linear global memory
First, map the thread and block indices to matrix coordinates:
ix = threadIdx.x + blockIdx.x * blockDim.x
iy = threadIdx.y + blockIdx.y * blockDim.y
Then use these variables to compute the linear address:
idx = iy * nx + ix
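The same mapping can be checked directly in code. Below is a minimal, self-contained sketch (not part of the original test program; the kernel name printThreadIndex and the tiny 8x6 matrix are made up for illustration) that prints each thread's block/thread indices, matrix coordinates, and row-major offset. Device-side printf requires compute capability 2.0 or higher, which the -arch=sm_20 build used later satisfies.

#include <stdio.h>
#include <cuda_runtime.h>

// For every thread: print its thread/block indices, the matrix coordinate
// (ix, iy) it covers, and the corresponding row-major linear offset idx.
__global__ void printThreadIndex(int nx, int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        printf("thread (%d,%d) in block (%d,%d) -> coord (%d,%d) -> idx %d\n",
               threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx);
}

int main(void)
{
    int nx = 8, ny = 6;                 // small, made-up matrix size
    dim3 block(4, 2);                   // 4x2 threads per block
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    printThreadIndex<<<grid, block>>>(nx, ny);
    cudaDeviceSynchronize();            // flush device-side printf output
    cudaDeviceReset();
    return 0;
}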
Together, these formulas relate the block and thread indices, the matrix coordinates, and the linear address.
Keep in mind that adjacent threads have consecutive threadIdx.x values: the threads with indices (0,0), (1,0), (2,0), (3,0), ... are contiguous, not (0,0), (0,1), (0,2), (0,3), ..., which differs from the way we usually index matrices in linear algebra.
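To make the ordering concrete, here is a small host-only sketch (made up for illustration, assuming a row width of nx = 8) showing that stepping threadIdx.x moves through consecutive offsets, while stepping threadIdx.y jumps by a whole row:

#include <stdio.h>

int main(void)
{
    int nx = 8;  /* hypothetical row width, purely for illustration */

    /* threads (0,0), (1,0), (2,0), (3,0): consecutive threadIdx.x, same iy = 0 */
    for (int tx = 0; tx < 4; tx++)
        printf("thread (%d,0) -> idx %d\n", tx, 0 * nx + tx);   /* prints 0, 1, 2, 3 */

    /* threads (0,0), (0,1), (0,2), (0,3): stepping threadIdx.y jumps a whole row */
    for (int ty = 0; ty < 4; ty++)
        printf("thread (0,%d) -> idx %d\n", ty, ty * nx + 0);   /* prints 0, 8, 16, 24 */

    return 0;
}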
Next, using 2D matrix addition as an example, we test how the CUDA thread configuration (block size and block count) affects program performance; a 2D grid of 2D blocks is used throughout.
Test environment: one Tesla M2070, CUDA 6.0
Operating system: Red Hat 4.1.2-50, gcc version 4.1.2 20080704
Test code:
// Thread assignment test
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>

#define PRECISION 1e-5

#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err != cudaSuccess)
    {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}

__global__ void sumMatrix2DKernel(float *d_MatA, float *d_MatB, float *d_MatC, int nx, int ny)
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int idy = threadIdx.y + blockDim.y * blockIdx.y;
    int tid = nx*idy + idx;

    if(idx < nx && idy < ny)
        d_MatC[tid] = d_MatA[tid] + d_MatB[tid];
}

void sumMatrix2DOnHost(float *h_A, float *h_B, float *hostRef, int nx, int ny)
{
    for(int i=0; i< nx*ny; i++)
        hostRef[i] = h_A[i] + h_B[i];
}

int main(int argc, char **argv)
{
    printf("%s Program Starting...\n", argv[0]);

    // set up device
    int devID = 0;
    cudaDeviceProp deviceProp;
    HANDLE_ERROR(cudaGetDeviceProperties(&deviceProp, devID));
    printf("Using Device %d: %s\n", devID, deviceProp.name);
    HANDLE_ERROR(cudaSetDevice(devID));

    // set up data size of matrix
    int nx = 1<<14;
    int ny = 1<<14;
    int nxy = nx*ny;
    int nBytes = nxy * sizeof(float);
    printf("Matrix size: nx= %d, ny= %d\n", nx, ny);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    // initialize data at host side
    clock_t iStart, iEnd;
    iStart = clock();
    for(int i=0; i<nxy; i++)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }
    iEnd = clock();
    double iElaps = (double)(iEnd-iStart)/CLOCKS_PER_SEC;

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = clock();
    sumMatrix2DOnHost(h_A, h_B, hostRef, nx, ny);
    iEnd = clock();
    iElaps = (double)(iEnd-iStart)/CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnHost() elapsed %f sec..\n", iElaps);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);

    ///////////////////////////////////////////////////////////////////////////////////
    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    //int dimy = 16;
    dim3 block(dimx, dimy);
    dim3 grid((nx+block.x-1)/block.x, (ny+block.y-1)/block.y);

    iStart = clock();
    sumMatrix2DKernel <<< grid, block >>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();
    iEnd = clock();
    iElaps = (double)(iEnd-iStart)/CLOCKS_PER_SEC;
    printf("--sumMatrix2DOnGPU<<<(%d,%d),(%d,%d)>>> elapsed %f sec..\n",
           grid.x, grid.y, block.x, block.y, iElaps);
    ///////////////////////////////////////////////////////////////////////////////////

    // copy kernel result back to host side
    cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);

    // check device results
    for(int i=0; i< nxy; i++)
    {
        if(fabs(gpuRef[i]-hostRef[i]) > PRECISION)
        {
            fprintf(stderr, "Result verification failed at element %d\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // free device global memory
    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    cudaDeviceReset();

    printf("Test Passed..\n");
    return 0;
}
Compile and run:
$ nvcc -arch=sm_20 sumMatrix2D.cu -o sumMatrix2D
$ ./sumMatrix2D
Program output:
./sumMatrix2D Program Starting...
Using Device 0: Tesla M2070
Matrix size: nx= 16384, ny= 16384
--sumMatrix2DOnHost() elapsed 1.410000 sec..
--sumMatrix2DOnGPU<<<(512,512),(32,32)>>> elapsed 0.070000 sec..
Test Passed..
Now change the block size to (32, 16), which makes the block count 512*1024. Compile and run again:
./sumMatrix2D Program Starting...
Using Device 0: Tesla M2070
Matrix size: nx= 16384, ny= 16384
--sumMatrix2DOnHost() elapsed 1.410000 sec..
--sumMatrix2DOnGPU<<<(512,1024),(32,16)>>> elapsed 0.040000 sec..
Test Passed..
As you can see, performance nearly doubles. Intuitively, the second configuration launches twice as many blocks as the first, and the speedup is indeed due to the increased block count. However, if the block count keeps increasing, performance drops again.
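For reference, the block counts quoted here follow directly from the grid computation in the test code. The host-only sketch below (not part of the original program) simply re-evaluates that arithmetic for the three block shapes tried in this section, assuming nx = ny = 16384 as in the test:

#include <stdio.h>

int main(void)
{
    int nx = 1 << 14, ny = 1 << 14;                      /* 16384 x 16384, as in the test */
    int shapes[3][2] = { {32, 32}, {32, 16}, {16, 16} }; /* block (dimx, dimy) */

    for (int i = 0; i < 3; i++) {
        int bx = shapes[i][0], by = shapes[i][1];
        int gx = (nx + bx - 1) / bx;                     /* same round-up as the kernel launch */
        int gy = (ny + by - 1) / by;
        printf("block (%2d,%2d) -> grid (%4d,%4d) = %7d blocks\n",
               bx, by, gx, gy, gx * gy);
    }
    return 0;
}

This prints grids of (512,512), (512,1024), and (1024,1024), i.e. 262144, 524288, and 1048576 blocks respectively.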
Now change the block size to (16, 16), which makes the block count 1024*1024. Compile and run again:
./sumMatrix2D Program Starting...
Using Device 0: Tesla M2070
Matrix size: nx= 16384, ny= 16384
--sumMatrix2DOnHost() elapsed 1.400000 sec..
--sumMatrix2DOnGPU<<<(1024,1024),(16,16)>>> elapsed 0.050000 sec..
Test Passed..
A performance analysis of thread-block configurations is covered in later sections.