// CUDA header: pinned-host / device point-buffer struct and its helpers.
// Pairs pinned-host mirror buffers with their device counterparts.
// Named `PointGpu` with host_*/device_* members to match every use site in
// this file (the declarations and definitions below all refer to `PointGpu`
// and `host_X`/`device_X` — the previous name/field spelling never compiled).
struct PointGpu
{
    // host (CPU) side — allocated pinned via cudaMallocHost by initPointGPU
    float *host_X;
    float *host_Y;
    float *host_Z;
    // device (GPU) side — allocated via cudaMalloc by initPointGPU
    float *device_X;
    float *device_Y;
    float *device_Z;
};
// Allocate the PointGpu struct (pinned) plus its host and device buffers.
void initPointGPU(PointGpu **_HostPointGPU,int arraySize);
// Release every buffer owned by a PointGpu, then the struct itself.
void freePointGPU(PointGpu &_HostPointGPU);
// Core kernel: fills the device arrays. Indexes with threadIdx only, so it
// assumes a single-block launch; bounds-guarded against arraySize.
__global__ void addPoint(PointGpu* _PointGPU,int arraySize);
// Test driver exercising the struct above.
void PointGpuMethod_Test();
// Test 2: variant where the members are plain values (not pointers),
// handled as an array of structs.
void PointGpuMethod_dim1();
///
// Allocate one PointGpu in pinned host memory plus its six data buffers:
// three pinned host staging arrays and three device arrays of `arraySize`
// floats each. The struct and host buffers are pinned so a later class
// wrapper can copy device data straight into the matching host memory.
//
// On any allocation failure everything already allocated is released and
// *_HostPointGPU is set to nullptr so the caller can detect the failure
// (previously no return code was checked and the out-param could be left
// pointing at a half-initialized struct).
void initPointGPU(PointGpu **_HostPointGPU, int arraySize)
{
    if (_HostPointGPU == nullptr)
        return;
    *_HostPointGPU = nullptr;          // defined value even on failure
    if (arraySize <= 0)
        return;

    PointGpu *p = nullptr;
    if (cudaMallocHost((void**)&p, sizeof(PointGpu)) != cudaSuccess || p == nullptr)
        return;

    // Zero the members first so a partial failure can be cleaned up safely
    // (cudaMallocHost returns uninitialized memory).
    p->host_X = p->host_Y = p->host_Z = nullptr;
    p->device_X = p->device_Y = p->device_Z = nullptr;

    const size_t bytes = sizeof(float) * (size_t)arraySize;
    // Pinned host staging buffers, then the device buffers. Short-circuit
    // evaluation stops at the first failure, leaving the rest nullptr.
    bool ok = cudaMallocHost((void**)&p->host_X, bytes) == cudaSuccess
           && cudaMallocHost((void**)&p->host_Y, bytes) == cudaSuccess
           && cudaMallocHost((void**)&p->host_Z, bytes) == cudaSuccess
           && cudaMalloc((void**)&p->device_X, bytes) == cudaSuccess
           && cudaMalloc((void**)&p->device_Y, bytes) == cudaSuccess
           && cudaMalloc((void**)&p->device_Z, bytes) == cudaSuccess;
    if (!ok)
    {
        // Members that were never allocated are still nullptr; freeing a
        // null pointer is a no-op for both cudaFree and cudaFreeHost.
        freePointGPU(*p);
        return;
    }
    *_HostPointGPU = p;
}
// Tear down everything initPointGPU allocated: the device buffers, the
// pinned host buffers, and finally the pinned struct itself.
void freePointGPU(PointGpu &_HostPointGPU)
{
    // Device-side arrays first.
    cudaFree(_HostPointGPU.device_X);
    cudaFree(_HostPointGPU.device_Y);
    cudaFree(_HostPointGPU.device_Z);
    // Pinned host-side staging arrays.
    cudaFreeHost(_HostPointGPU.host_X);
    cudaFreeHost(_HostPointGPU.host_Y);
    cudaFreeHost(_HostPointGPU.host_Z);
    // The struct was itself allocated with cudaMallocHost, so release it the
    // same way; the reference must refer to that pinned allocation.
    cudaFreeHost(&_HostPointGPU);
}
// Core kernel: writes fixed marker values into the device arrays (no real
// computation — the point is to verify the allocation/copy plumbing).
// Grid layout: 1-D grid of 1-D blocks. The index is computed globally and
// bounds-checked, so any launch configuration covering arraySize works
// (the previous threadIdx.x-only index was correct for single-block
// launches only; for <<<1, N>>> the behavior is unchanged).
__global__ void addPoint(PointGpu *_PointGPU, int arraySize)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (_PointGPU != nullptr && tid < arraySize)
    {
        _PointGPU->device_X[tid] = 9;
        _PointGPU->device_Y[tid] = 8;
        _PointGPU->device_Z[tid] = 10;
    }
}
// Next: how the pieces above are used.
// Demo driver: allocate a PointGpu, fill its device arrays with the kernel,
// download the results into the pinned host mirrors, print, and free.
// NOTE(review): the original body was corrupted — a bare non-comment text
// line and a garbled line that merged this function's tail with the next
// function; the tail (print loop body, cleanup, closing brace) is
// reconstructed here.
void PointGpuMethod_Test()
{
    // Plain host-side result arrays (value-initialized to zero, which makes
    // the original redundant memset calls unnecessary).
    const int arraySize = 10;
    float X[arraySize] = { 0 };
    float Y[arraySize] = { 0 };
    float Z[arraySize] = { 0 };

    PointGpu *source = nullptr;
    initPointGPU(&source, arraySize);   // allocate pinned-host + device buffers
    if (source == nullptr)
        return;                         // allocation failed

    // Single-block launch; the kernel bounds-guards tid < arraySize.
    addPoint<<<1, arraySize>>>(source, arraySize);
    cudaDeviceSynchronize();            // wait for the kernel before copying

    // Download the device results into the pinned host mirrors.
    cudaMemcpy(source->host_Z, source->device_Z, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(source->host_Y, source->device_Y, arraySize * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(source->host_X, source->device_X, arraySize * sizeof(float), cudaMemcpyDeviceToHost);

    // Mirror into the plain arrays (models the eventual class-wrapper flow).
    memcpy(Z, source->host_Z, sizeof(float) * arraySize);
    memcpy(Y, source->host_Y, sizeof(float) * arraySize);
    memcpy(X, source->host_X, sizeof(float) * arraySize);

    std::cout << "GPU下载的数据:" << std::endl;
    for (int i = 0; i < arraySize; i++)
    {
        std::cout << Z[i] << " " << Y[i] << " " << X[i] << std::endl;
    }

    freePointGPU(*source);  // releases buffers and the struct itself
}
cudaDeviceSynchronize();
//获取数值.赋值,并输出数据.
float buffer[3][2] = {0};
cudaMemcpy(buffer[0], source[i]->device_X, sizeof(float),cudaMemcpyKind::cudaMemcpyDeviceToHost);
cudaMemcpy(buffer[1], source[i]->device_Y, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
cudaMemcpy(buffer[2], source[i]->device_Z, sizeof(float), cudaMemcpyKind::cudaMemcpyDeviceToHost);
std::cout << buffer[0][0] << " " << buffer[1][0] << " " << buffer[2][0] << " "<