CUDA调用cuFFT后对复数求模

      当我们使用显卡调用cuFFT库计算FFT后(FFT计算,请参考https://blog.csdn.net/endlch/article/details/46724811),需要对cufftComplex*类型的数据进行进一步处理,比如取模,两个复数相乘等操作,恰巧,库里面也配套了cuComplex.h,其中包含复数基本操作函数,主机和设备端均可调用。我目前还没找到针对复数数组取模的现成并行函数,就写了个简单的核函数。以下是代码。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include 
#include 
#include 
#include 
#include 
using namespace std;




__global__ void abs_complex(cufftComplex* pSrc, float* pDst)
{
	int tx = threadIdx.x;
	pDst[tx] = cuCabsf(pSrc[tx]);
}
int main()
{
	cufftComplex* hSrc = (cufftComplex*)malloc(sizeof(cufftComplex) * 10);
	float* hDst = (float*)malloc(sizeof(float) * 10);
	cufftComplex* pSrc;
	float* pDst;
	cudaMalloc((void **)(&pSrc), sizeof(cufftComplex) * 10);
	cudaMalloc((void **)(&pDst), sizeof(float) * 10);
	memset(hSrc, 0, sizeof(cufftComplex) * 10);
	for (int i = 0; i < 10; i++)
	{
		hSrc[i].x = i;
		hSrc[i].y = i;
	}
	cudaMemcpy(pSrc, hSrc, 10 * sizeof(cufftComplex), cudaMemcpyHostToDevice);
	dim3 block(10);
	dim3 grid(1);
	abs_complex << > > (pSrc, pDst);
	cudaMemcpy(hDst, pDst, sizeof(float) *  10, cudaMemcpyDeviceToHost);

	for (int i = 0; i < 10; i++)
	{
		//hDst[i] = cuCabsf(hSrc[i]);
		cout<

其中重点是取模函数 float cuCabsf (cuFloatComplex x)    (来自cuComplex.h):

__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
    float a = cuCrealf(x);
    float b = cuCimagf(x);
    float v, w, t;
    a = fabsf(a);
    b = fabsf(b);
    if (a > b) {
        v = a;
        w = b; 
    } else {
        v = b;
        w = a;
    }
    t = w / v;
    t = 1.0f + t * t;
    t = v * sqrtf(t);
    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
        t = v + w;
    }
    return t;
}
求模函数这样写原因之一是为了不让实部或虚部平方后的数不超float的32位,但可能会影响精度。

你可能感兴趣的:(CUDA调用cuFFT后对复数求模)