1. Introduction
Taking a 2D array as an example: CUDA stores a 2D array as a one-dimensional block of memory in row-major order. To improve efficiency when allocating the 2D space and accessing the data, each row should be memory-aligned; in CUDA 10.2 (on the test device used here) the row pitch is padded up to a multiple of 512 bytes. For parallel processing, a 2D thread block can be used to index the array row by row, which is also the approach used in the example from the official documentation.
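To make the pitch concrete, here is a minimal sketch (not part of the original example; it only illustrates the idea) that allocates a 1024 x 129 float array with cudaMallocPitch and prints the pitch it returns. The exact value is device dependent; with a 512-byte alignment granularity, a 129-float row (516 bytes) would be padded to 1024 bytes. Compile it with nvcc as a .cu file.
#include "cuda_runtime.h"
#include <stdio.h>
int main()
{
    float* devPtr = NULL;
    size_t pitch = 0;  // row stride in bytes, >= 129 * sizeof(float)
    cudaMallocPitch((void**)&devPtr, &pitch, sizeof(float) * 129, 1024);
    // sizeof(float) * 129 = 516 bytes are requested per row; the returned pitch is the
    // padded row width actually used on the device (e.g. 1024 bytes with 512-byte alignment).
    printf("requested row width = %zu bytes, pitch = %zu bytes\n", sizeof(float) * 129, pitch);
    cudaFree(devPtr);
    return 0;
}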
2. Usage
First create the two 2D arrays, then add them together. The code is as follows.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
// Test data size: M rows, N columns
#define M 1024
#define N 129
// Operating on a 2D array through pitched 1D memory, part 2: adding two arrays
cudaError_t AddData_2DArray(/*float* d_transMat, size_t pitch,*/size_t blockSize, size_t threadSize);
int main()
{
    size_t blockSize = 512;
    size_t threadSize = 256;
    AddData_2DArray(blockSize, threadSize);
    return 0;
}
// The two arrays being added have the same dimensions, so their pitches are equal
// d_transMat1 and d_transMat2 are added element-wise; the result is stored in d_transMat1
__global__ void Add_transMat(float* d_transMat1, size_t pitch1, float* d_transMat2, size_t pitch2)
{
    size_t pitch = pitch1;  // pitch1 == pitch2 here, since both matrices have the same dimensions
    // Grid-stride loop over the M rows
    for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < M; j += blockDim.y * gridDim.y)
    {
        // A row starts at base pointer + row index * pitch; the pitch is in bytes, hence the char* cast
        float* row_d_transMat1 = (float*)((char*)d_transMat1 + j * pitch);
        float* row_d_transMat2 = (float*)((char*)d_transMat2 + j * pitch);
        // Grid-stride loop over the N columns of the row
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        {
            row_d_transMat1[i] = 1;
            row_d_transMat2[i] = 2;
            row_d_transMat1[i] += row_d_transMat2[i];
        }
    }
}
cudaError_t AddData_2DArray(/*float* d_transMat, size_t pitch,*/ size_t blockSize, size_t threadSize)
{
    float* d_transMat1 = NULL, * d_transMat2 = NULL;
    float* transMat1 = NULL;
    size_t pitch1, pitch2;
    cudaError_t cudaStatus;
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Host buffer for the result, stored contiguously (no padding)
    transMat1 = (float*)malloc(sizeof(float) * M * N);
    // Allocate pitched GPU buffers for the two matrices.
    cudaStatus = cudaMallocPitch(&d_transMat1, &pitch1, sizeof(float) * N, M);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocPitch failed!");
        goto Error;
    }
    cudaStatus = cudaMallocPitch(&d_transMat2, &pitch2, sizeof(float) * N, M);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMallocPitch failed!");
        goto Error;
    }
    // Launch the kernel; the grid-stride loops inside cover all M*N elements.
    Add_transMat<<<blockSize, threadSize>>>(d_transMat1, pitch1, d_transMat2, pitch2);
    // Copy the padded device matrix back into the contiguous host buffer.
    cudaStatus = cudaMemcpy2D(transMat1, sizeof(float) * N, d_transMat1, pitch1, sizeof(float) * N, M, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy2D failed!");
        goto Error;
    }
    for (int i = 0; i < N * M; i++)
        cout << i << " " << transMat1[i] << endl;
Error:
    cudaFree(d_transMat1);
    cudaFree(d_transMat2);
    free(transMat1);
    return cudaStatus;
}
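Section 1 mentions indexing the array by row with a 2D thread block, but the launch above is one-dimensional (the grid-stride loops still cover every element because blockDim.y and gridDim.y are both 1). If a true 2D launch is wanted, it can be configured with dim3 as in the sketch below; the block and grid sizes are illustrative choices, not taken from the original code.
dim3 threads(32, 8);                          // 32 threads along the columns, 8 along the rows
dim3 blocks((N + threads.x - 1) / threads.x,  // enough blocks to cover the N columns
            (M + threads.y - 1) / threads.y); // and the M rows
Add_transMat<<<blocks, threads>>>(d_transMat1, pitch1, d_transMat2, pitch2);
With this configuration each thread typically handles one element, and the grid-stride loops in Add_transMat simply execute a single iteration per thread.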