CUFFT中FFT点数对FFT性能的影响测试

基2 FFT比非基2 FFT的运行速度要快。在NVIDIA GPU设备上使用FFT时也应尽量使用基2 FFT,因为使用GPU本身就是为了追求快速计算。本文测试了一个二维复数矩阵在基2 FFT和非基2 FFT下的性能差别(只沿二维矩阵的一个维度做FFT,另一个维度通过cuFFT的batch参数批量处理)。从我测试的结果来看,基2 FFT比非基2 FFT快约5倍。测试的数据尺寸:基2 FFT为512×16384,非基2 FFT为428×16384,测试数据是用matlab生成的。

% Generate the random input data used by the cuFFT benchmark.
% (Generated with MATLAB here; it could also be generated inside the GPU program.)
close all; clear; clc;

num_batches = 16384;   % number of FFTs in the batch (columns of the matrix)

% One 428-point complex signal, stored as interleaved I/Q reals
% (2 values per complex sample), replicated into every column.
a1 = randn(428 * 2, 1);
a2 = repmat(a1, 1, num_batches);
clear a1;

% Same layout for the 512-point (radix-2) case.
b1 = randn(512 * 2, 1);
b2 = repmat(b1, 1, num_batches);
clear b1;

% Write the data to disk so the C program can read it back.
output_file_428 = 'D:/cufft_test/428.dat';
output_file_512 = 'D:/cufft_test/512.dat';

fid1 = fopen(output_file_428, 'w');
if fid1 < 0
    error('Cannot open %s for writing', output_file_428);
end
fid2 = fopen(output_file_512, 'w');
if fid2 < 0
    fclose(fid1);
    error('Cannot open %s for writing', output_file_512);
end

% fwrite stores the matrix column by column, so each column (one signal)
% ends up contiguous in the file, as single-precision floats.
fwrite(fid1, a2, 'single');
fwrite(fid2, b2, 'single');

fclose(fid1);
fclose(fid2);

GPU测试程序:

#include #include #include #include #include #include #include //使用openmp开启了CPU并行 using namespace std; //设备端头文件 #include #include typedef float2 Complex; //复数类型 #define datasize_428 7012352 #define datasize_512 8388608 char data_file_428[200] = "D://cufft_test//428.dat"; char data_file_512[200] = "D://cufft_test//512.dat"; int main(int argc, char* argv[]) { clock_t start, end, start1, end1, start2, end2; time_t timer; struct tm *tblock; /* gets time of day */ timer = time(NULL); /* converts date/time to a structure */ tblock = localtime(&timer); start = clock(); system("cls"); system("color 0a"); printf("Local time: %s", asctime(tblock)); cudaSetDevice(cutGetMaxGflopsDeviceId()); int devID; cudaDeviceProp props; //get number of SMs on this GPU cutilSafeCall(cudaGetDevice(&devID)); cutilSafeCall(cudaGetDeviceProperties(&props, devID)); printf("Device %d: /"%s/" with Compute %d.%d capability/n", devID, props.name, props.major, props.minor); cudaSetDevice(cutGetMaxGflopsDeviceId()); //读取428点的测试数据 Complex *echodata_428 = new Complex[datasize_428]; //存放回波复数据 float *echo_iq_428 = new float[2 * datasize_428]; //用于读取存在硬盘上的IQ回波 FILE *fp_echo_428; int ncount = 0; //用于数据校验 printf("Now start to read float type echo data from disk/n"); fp_echo_428 = fopen(data_file_428, "rb"); if (fp_echo_428 == NULL) { printf("Open echo data file fails!/n"); } else { ncount = fread(echo_iq_428, sizeof(float), 2 * datasize_428, fp_echo_428); if (ncount != 2 * datasize_428) { printf("Read data from disk error, not completely!/n"); } else { printf("Read echo data sucesses/n"); fclose(fp_echo_428); } } printf("now, start to transform the [I Q] data to complex type/n"); int i; #pragma omp parallel for num_threads(8) private(i) for (i = 0; i < datasize_428; i++) { echodata_428[i].x = echo_iq_428[2 * i]; echodata_428[i].y = echo_iq_428[2 * i + 1]; } #pragma omp barrier /*free(echo_iq); echo_iq = NULL;*/ printf("transform the [I Q] echo data to complex type successes/n"); //读取512点的测试数据 Complex 
*echodata_512 = new Complex[datasize_512]; //存放回波复数据 float *echo_iq_512 = new float[2 * datasize_512]; //用于读取存在硬盘上的IQ回波 FILE *fp_echo_512; ncount = 0; //用于数据校验 printf("Now start to read float type echo data from disk/n"); fp_echo_512 = fopen(data_file_512, "rb"); if (fp_echo_512 == NULL) { printf("Open echo data file fails!/n"); } else { ncount = fread(echo_iq_512, sizeof(float), 2 * datasize_512, fp_echo_512); if (ncount != 2 * datasize_512) { printf("Read data from disk error, not completely!/n"); } else { printf("Read echo data sucesses/n"); fclose(fp_echo_512); } } printf("now, start to transform the [I Q] data to complex type/n"); #pragma omp parallel for num_threads(8) private(i) for (i = 0; i < datasize_512; i++) { echodata_512[i].x = echo_iq_512[2 * i]; echodata_512[i].y = echo_iq_512[2 * i + 1]; } #pragma omp barrier /*free(echo_iq); echo_iq = NULL;*/ printf("transform the [I Q] echo data to complex type successes/n"); //428点傅里叶变换 start1 = clock(); printf("start to do 428 points fft/n"); Complex *g_idata_428_fft; cufftHandle plan_428; //创建CUFFT句柄 //在GPU上为信号开辟空间 cutilSafeCall(cudaMalloc((void **)&g_idata_428_fft, datasize_428 * 8)); //将开辟的显存全部置零 cutilSafeCall(cudaMemset((void *)g_idata_428_fft, '/0', datasize_428 * 8)); //拷贝内存信号到显存 cutilSafeCall(cudaMemcpy(g_idata_428_fft, echodata_428, datasize_428 * 8, cudaMemcpyHostToDevice)); //start1 = clock(); cufftSafeCall(cufftPlan1d(&plan_428, 428, CUFFT_C2C, 16384)); cufftSafeCall(cufftExecC2C(plan_428, (cufftComplex *)g_idata_428_fft, (Complex *)g_idata_428_fft, CUFFT_FORWARD)); end1= clock(); //拷贝显存数据到内存 cutilSafeCall(cudaMemcpy(echodata_428, g_idata_428_fft, datasize_428 * 8, cudaMemcpyDeviceToHost)); cufftSafeCall(cufftDestroy(plan_428)); cutilSafeCall(cudaFree(g_idata_428_fft)); printf("428 points fft finishes/n"); //512点傅里叶变换 start2 = clock(); printf("start to do 512 points fft/n"); Complex *g_idata_512_fft; cufftHandle plan_512; //创建CUFFT句柄 //在GPU上为信号开辟空间 cutilSafeCall(cudaMalloc((void **)&g_idata_512_fft, 
datasize_512 * 8)); //将开辟的显存全部置零 cutilSafeCall(cudaMemset((void *)g_idata_512_fft, '/0', datasize_512 * 8)); //拷贝内存信号到显存 cutilSafeCall(cudaMemcpy(g_idata_512_fft, echodata_512, datasize_512 * 8, cudaMemcpyHostToDevice)); //start2 = clock(); cufftSafeCall(cufftPlan1d(&plan_512, 512, CUFFT_C2C, 16384)); cufftSafeCall(cufftExecC2C(plan_512, (cufftComplex *)g_idata_512_fft, (Complex *)g_idata_512_fft, CUFFT_FORWARD)); end2 = clock(); //拷贝显存数据到内存 cutilSafeCall(cudaMemcpy(echodata_512, g_idata_512_fft, datasize_512 * 8, cudaMemcpyDeviceToHost)); cufftSafeCall(cufftDestroy(plan_512)); cutilSafeCall(cudaFree(g_idata_512_fft)); printf("512 points fft finishes/n"); end = clock(); double duration = double(end - start)/ CLOCKS_PER_SEC; printf("This Tool uses %f s/n", duration); double duration1 = double(end1 - start1)/ CLOCKS_PER_SEC; printf("428 points fft uses %f s/n", duration1); double duration2 = double(end2 - start2)/ CLOCKS_PER_SEC; printf("512 points fft uses %f s/n", duration2); getchar(); return 0; }  

你可能感兴趣的:(CUFFT中FFT点数对FFT性能的影响测试)