最近在做并行计算, 应用的是典型的计算快速傅立叶变换 FFT, 程序设计的环境是 Window7, GTX 660ti 使用的软件操作是 CUDA 6.0, OpenCL1.1 , VC2005
现做规模为 2048 的 FFT 做512 次, 这样得到的结果用 matlab 来衡量程序计算结果的正确性。 在保证正确性的条件下, 测量程序中 512次规模为2048 FFT 计算时间,
先用CUDA cufft计算, 在用OpenCL采用两种方法计算, 一种是普通的并行蝶形计算方法, 另外一种择采用矩阵 FFT算法, 通过上述比较CUDA 与 OpenCL 的运行时间及计算精度
1 #include "cuda_runtime.h" 2 #include "device_launch_parameters.h" 3 #include4 #include "cufft.h" 5 6 #define Nx 2048 // FFT 7 #define Ny 512 // FFT총수 8 #define pi 3.1415926 9 10 int main(void) 11 { 12 Mytime g_timer; 13 14 float *h_fk; 15 h_fk =(float*)malloc(sizeof(float)*Nx*Ny); 16 for (int j=0; j< Nx*Ny; j++) 17 { 18 h_fk[j] = j%Nx; 19 } 20 21 #if 0 22 FILE *fid_h_fk; 23 fid_h_fk = fopen("h_fk.txt","wt"); 24 if(fid_h_fk==NULL) 25 { 26 //cout<<"error"< 27 printf( "Error : file open failed \n " ); 28 system("pause"); 29 } 30 31 for (int j = 0; j < Nx; j++) 32 { 33 fprintf(fid_h_fk, "%f \n" , h_fk[j]); 34 } 35 fclose(fid_h_fk); 36 #endif 37 / 38 /cufft 39 int i,j; 40 cufftComplex *h_hatCH; 41 h_hatCH=(cufftComplex*)malloc(sizeof(cufftComplex)*(Nx/2+1)*Ny); 42 43 cufftReal *d_fk; 44 cufftComplex *d_cufft_CH; 45 46 cudaMalloc((void**)&d_fk, Nx * Ny * sizeof(cufftReal)); 47 cudaMalloc((void**)&d_cufft_CH, (Nx/2+1) * Ny * sizeof(cufftComplex)); 48 49 cufftHandle plan; 50 51 cufftPlan1d(&plan, Nx, CUFFT_R2C,Ny); // Real float To Complex 52 53 g_timer.Start( &g_timer ); 54 55 cufftExecR2C(plan, (cufftReal*)d_fk, (cufftComplex*)d_cufft_CH); 56 57 cudaMemcpy(d_fk, h_fk, Nx * Ny * sizeof(cufftReal), cudaMemcpyHostToDevice); 58 59 60 /// d_cufft_CH output 61 62 cudaMemcpy(h_hatCH, d_cufft_CH, sizeof(cufftComplex)*(Nx/2+1)*Ny, cudaMemcpyDeviceToHost); 63 64 gfFrametime = g_timer.End( &g_timer ); 65 printf("Time = %f Sec \n", gfFrametime); 66 #if 0 67 FILE *fid_h_hatCH; 68 fid_h_hatCH = fopen("h_hatCH1.txt","wt"); 69 if(fid_h_hatCH==NULL) 70 { 71 printf( "Error : file open failed \n " ); 72 system("pause"); 73 } 74 75 for (int j = 0; j < (Nx/2+1)*Ny; j++) 76 { 77 if (h_hatCH[j].y == 0) 78 fprintf(fid_h_hatCH, "%f\n" , h_hatCH[j].x); 79 else if (h_hatCH[j].y > 0) 80 fprintf(fid_h_hatCH, "%f+%fi\n" , h_hatCH[j].x ,h_hatCH[j].y ); 81 else fprintf(fid_h_hatCH, "%f%fi\n" , h_hatCH[j].x ,h_hatCH[j].y ); 82 } 83 fclose(fid_h_hatCH); 84 #endif 85 86 FILE *fid_h_hatCH; 87 fid_h_hatCH = fopen("h_hatCH1.txt","wt"); 88 if(fid_h_hatCH==NULL) 89 { 90 printf( "Error : file open failed \n " ); 91 system("pause"); 92 } 93 94 for (int j = 0; j < Nx/2; j++) 95 { 96 fprintf(fid_h_hatCH, "%f\n" , sqrt(pow(h_hatCH[j].x,2)+pow(h_hatCH[j].y,2))); 97 } 98 fclose(fid_h_hatCH); 99 100 cufftDestroy(plan); 101 cudaFree(d_fk); 102 cudaFree(d_cufft_CH); 103 free(h_hatCH); 104 free(h_fk); 105 106 return 0; 107 108 }
运行时间为: 0.003841 sec
如果用 OpenCL 蝶形运算方法, Device kernel 源码为
1 #define PI 3.14159265358979323846 2 #define PI_2 1.57079632679489661923 3 4 /* 회전인자 생성 */ 5 __kernel void spinFact(__global float2* w, int n) 6 { 7 unsigned int i = get_global_id(0); 8 9 float2 angle = (float2)(2*i*PI/(float)n, (2*i*PI/(float)n)+PI_2); 10 w[i] = cos(angle); 11 } 12 13 /* 비트 역순 */ 14 __kernel void bitReverse(__global float2 *dst, __global float2 *src, int m, int n) 15 { 16 unsigned int gid = get_global_id(0); 17 unsigned int nid = get_global_id(1); 18 19 unsigned int j = gid; 20 j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; 21 j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; 22 j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; 23 j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; 24 j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; 25 26 j >>= (32-m); 27 28 dst[nid*n+j] = src[nid*n+gid]; 29 } 30 31 /* 버터플라이 연산 */ 32 __kernel void butterfly(__global float2 *x, __global float2* w, int m, int n, int iter, uint flag) 33 { 34 unsigned int gid = get_global_id(0); 35 unsigned int nid = get_global_id(1); 36 37 int butterflySize = 1 << (iter-1); 38 int butterflyGrpDist = 1 << iter; 39 int butterflyGrpNum = n >> iter; 40 int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); 41 int butterflyGrpOffset = gid & (butterflySize-1); 42 43 int a = nid * n + butterflyGrpBase + butterflyGrpOffset; 44 int b = a + butterflySize; 45 46 int l = butterflyGrpNum * butterflyGrpOffset; 47 48 float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; 49 50 xa = x[a]; 51 xb = x[b]; 52 xbxx = xb.xx; 53 xbyy = xb.yy; 54 55 wab = as_float2(as_uint2(w[l]) ^ (uint2)(0x0, flag)); 56 wayx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x80000000, 0x0)); 57 wbyx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x0, 0x80000000)); 58 59 resa = xa + xbxx*wab + xbyy*wayx; 60 resb = xa - xbxx*wab + xbyy*wbyx; 61 62 x[a] = resa; 63 x[b] = resb; 64 }
计算时间为 0.060381 sec
采用矩阵算法的源码为
1 /// 2 //! Matrix multiplication on the device: C = A * B 3 //! uiWA is A's width and uiWB is B's width 4 //// 5 __kernel void 6 matrixMul( __global float* C, __global float* A, __global float2* B, 7 __local float* As, __local float* Bsr,__local float* Bsi, int uiWA, int uiWB) 8 { 9 // Block index 10 int bx = get_group_id(0); 11 int by = get_group_id(1); 12 13 // Thread index 14 int tx = get_local_id(0); 15 int ty = get_local_id(1); 16 17 // Index of the first sub-matrix of A processed by the block 18 int aBegin = uiWA * BLOCK_SIZE * by; 19 20 // Index of the last sub-matrix of A processed by the block 21 int aEnd = aBegin + uiWA - 1; 22 23 // Step size used to iterate through the sub-matrices of A 24 int aStep = BLOCK_SIZE; 25 26 // Index of the first sub-matrix of B processed by the block 27 int bBegin = BLOCK_SIZE * bx; 28 29 // Step size used to iterate through the sub-matrices of B 30 int bStep = BLOCK_SIZE * uiWB; 31 32 // Csub is used to store the element of the block sub-matrix 33 // that is computed by the thread 34 float2 Csub =(float2)(0.0f, 0.0f); 35 36 // Loop over all the sub-matrices of A and B 37 // required to compute the block sub-matrix 38 for (int a = aBegin, b = bBegin; 39 a <= aEnd; 40 a += aStep, b += bStep) { 41 42 // Load the matrices from device memory 43 // to shared memory; each thread loads 44 // one element of each matrix 45 AS(ty, tx) = A[a + uiWA * ty + tx]; 46 BSr(ty, tx) = B[b + uiWB * ty + tx].x; 47 BSi(ty, tx) = B[b + uiWB * ty + tx].y; 48 49 // Synchronize to make sure the matrices are loaded 50 barrier(CLK_LOCAL_MEM_FENCE); 51 52 // Multiply the two matrices together; 53 // each thread computes one element 54 // of the block sub-matrix 55 #pragma unroll 56 for (int k = 0; k < BLOCK_SIZE; ++k) 57 Csub +=(float2)(AS(ty, k)*BSr(k, tx),AS(ty, k)* BSi(k, tx)); 58 59 // Synchronize to make sure that the preceding 60 // computation is done before loading two new 61 // sub-matrices of A and B in the next iteration 62 barrier(CLK_LOCAL_MEM_FENCE); 63 } 64 65 // Write the block sub-matrix to device memory; 66 // each thread writes one element 67 C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = 10*log10(Csub.x*Csub.x + Csub.y * Csub.y) ; 68 69 }
计算时间为 0.024020 sec
精确度上 OpenCL明显优于 CUDA