CUDA使用多维数组

//https://stackoverflow.com/questions/40388242/multidimensional-array-allocation-with-cuda-unified-memory-on-power-8

#include <cassert>
#include <iostream>

// Allocates an a x b x c x d array of T with cudaMallocManaged and returns a
// T**** handle so elements can be addressed as ary[i][j][k][l].
//
// Two allocations are made:
//   * one contiguous buffer of a*b*c*d elements (row-major, l fastest), and
//   * one pointer table holding, back to back, the a first-level pointers,
//     the a*b second-level pointers and the a*b*c third-level pointers.
// ary[0][0][0] therefore points at the start of the element buffer.
//
// Parameters: a, b, c, d - extents of the four dimensions; all must be > 0.
// Returns:    the pointer-table handle; release it with free_4d_flat().
// Failure (bad extent or allocation error) trips an assert.
template <typename T>
T**** create_4d_flat(int a, int b, int c, int d) {
	assert(a > 0 && b > 0 && c > 0 && d > 0);

	// Do every size/offset computation in size_t: the int products
	// a*b*c*d and ((i*b + j)*c + k)*d overflow for large extents.
	const size_t na = static_cast<size_t>(a);
	const size_t nb = static_cast<size_t>(b);
	const size_t nc = static_cast<size_t>(c);
	const size_t nd = static_cast<size_t>(d);

	T *base;
	cudaError_t err = cudaMallocManaged(&base, na * nb * nc * nd * sizeof(T));
	assert(err == cudaSuccess);

	T ****ary;
	err = cudaMallocManaged(&ary, (na + na * nb + na * nb * nc) * sizeof(T*));
	assert(err == cudaSuccess);

	// Start of the second- and third-level sections inside the pointer table.
	T ****const level2 = ary + na;
	T ****const level3 = level2 + na * nb;

	for (size_t i = 0; i < na; i++) {
		ary[i] = (T ***)(level2 + i * nb);
		for (size_t j = 0; j < nb; j++) {
			const size_t row = i * nb + j;  // flattened (i, j)
			ary[i][j] = (T **)(level3 + row * nc);
			for (size_t k = 0; k < nc; k++)
				ary[i][j][k] = base + (row * nc + k) * nd;
		}
	}
	return ary;
}

// Releases an array obtained from create_4d_flat(): first the element buffer
// (reachable through ary[0][0][0]), then the pointer table itself.
// A null handle is accepted and ignored.
template <typename T>
void free_4d_flat(T**** ary) {
	// The handle must be tested before it is dereferenced below.
	if (!ary) return;
	if (ary[0][0][0]) cudaFree(ary[0][0][0]);
	cudaFree(ary);
}


// Writes 0, 1, 2, ... into data[i][j][k][l] in row-major order (l fastest),
// i.e. every element receives its own flat index ((i*b + j)*c + k)*d + l.
//
// Launch layout: 1-D grid of 1-D blocks of any size. The work is split with a
// grid-stride loop over the flat index, so <<<1, 1>>> still fills the whole
// array, just serially. No shared memory is used.
//
// Parameters: data       - four-level pointer table addressing a x b x c x d
//                          elements of T
//             a, b, c, d - extents; nothing is written if any is <= 0.
template <typename T>
__global__ void fill(T**** data, int a, int b, int c, int d) {
	if (a <= 0 || b <= 0 || c <= 0 || d <= 0) return;

	const unsigned long long int nb = b, nc = c, nd = d;
	const unsigned long long int total = (unsigned long long int)a * nb * nc * nd;
	const unsigned long long int stride = (unsigned long long int)gridDim.x * blockDim.x;

	for (unsigned long long int idx = (unsigned long long int)blockIdx.x * blockDim.x + threadIdx.x;
	     idx < total; idx += stride) {
		// Peel the flat index apart, fastest-varying dimension first.
		unsigned long long int rest = idx;
		const int l = (int)(rest % nd); rest /= nd;
		const int k = (int)(rest % nc); rest /= nc;
		const int j = (int)(rest % nb);
		const int i = (int)(rest / nb);
		data[i][j][k][l] = idx;
	}
}

// Prints the free and total amounts reported by cudaMemGetInfo to stdout.
// If the query fails, the CUDA error string is written to stderr instead of
// printing uninitialised values.
void report_gpu_mem()
{
	size_t free_bytes = 0, total_bytes = 0;
	const cudaError_t err = cudaMemGetInfo(&free_bytes, &total_bytes);
	if (err != cudaSuccess) {
		std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(err) << std::endl;
		return;
	}
	std::cout << "Free = " << free_bytes << " Total = " << total_bytes << std::endl;
}

// Demo driver: allocates a 64 x 63 x 62 x 5 managed array through
// create_4d_flat, fills it on the device with fill, then checks on the host
// that element n of the flat buffer holds the value n.
// Returns 0 on success and -1 on a CUDA error or a data mismatch.
int main() {
	const int A = 64, B = 63, C = 62, D = 5;

	report_gpu_mem();

	unsigned long long int ****data2;
	std::cout << "allocating..." << std::endl;
	// T appears only in the return type, so it has to be named explicitly.
	data2 = create_4d_flat<unsigned long long int>(A, B, C, D);

	report_gpu_mem();

	fill<<<1, 1>>>(data2, A, B, C, D);
	// Launch-configuration errors are reported by cudaGetLastError; faults
	// inside the kernel surface at the synchronize call.
	cudaError_t err = cudaGetLastError();
	if (err == cudaSuccess) err = cudaDeviceSynchronize();
	if (err != cudaSuccess) {
		std::cerr << "fill kernel failed: " << cudaGetErrorString(err) << std::endl;
		free_4d_flat(data2);
		return -1;
	}

	std::cout << "validating..." << std::endl;
	const unsigned long long int *flat = data2[0][0][0];  // start of the element buffer
	const unsigned long long int count = (unsigned long long int)A * B * C * D;
	int status = 0;
	for (unsigned long long int i = 0; i < count; i++) {
		if (flat[i] != i) {
			std::cout << "mismatch at " << i << " was " << flat[i] << std::endl;
			status = -1;
			break;
		}
	}

	// Release the array on the failure path as well.
	free_4d_flat(data2);
	return status;
}

 

 

//https://devtalk.nvidia.com/default/topic/821510/is-it-possible-to-process-multidimensional-arrays-inside-the-kernel-/

#include 
#include 
#include 

#include 
#include 

// Extents of the three outer dimensions of full_t::array; main() launches one
// kernel per (file, mol, atom) index triple.
#define FILE_MAX 3
#define MOL_MAX 3
#define ATOM_MAX 3
// Extents of the three inner dimensions (shared by full_t::array and
// coord_t::data); also used as the thread-block dimensions in main().
#define XMAX 5
#define YMAX 5
#define ZMAX 5

// Whole data set: one fixed-size six-dimensional int array, indexed as
// array[file][mol][atom][z][y][x].
struct full {
	int array[FILE_MAX][MOL_MAX][ATOM_MAX][ZMAX][YMAX][XMAX];
};
typedef struct full full_t;

// One z/y/x sub-volume of ints, indexed as data[z][y][x].
struct coord {
	int data[ZMAX][YMAX][XMAX];
};
typedef struct coord coord_t;

// Stores z*100 + y*10 + x + 1 into a->data[z][y][x], where (x, y, z) is the
// calling thread's global 3-D index.
//
// Launch layout: a 3-D grid/block covering at least XMAX x YMAX x ZMAX
// threads. Threads whose index falls outside those extents do nothing, so an
// oversized launch cannot write past the end of a->data.
// No shared memory is used.
__global__ void sample(coord_t * a)
{
	const int x = threadIdx.x + blockDim.x * blockIdx.x;
	const int y = threadIdx.y + blockDim.y * blockIdx.y;
	const int z = threadIdx.z + blockDim.z * blockIdx.z;

	// Guard against launches larger than the array.
	if (x >= XMAX || y >= YMAX || z >= ZMAX) return;

	// Same final value as writing z*100 + y*10 + x and then incrementing it.
	a->data[z][y][x] = z * 100 + y * 10 + x + 1;
}

// Demo driver: allocates one full_t in managed memory, runs the sample kernel
// once per (file, mol, atom) sub-volume, then prints every element.
// Returns 0 on success and 1 if any CUDA call fails.
int main(void)
{
	int i, j, k;
	int x, y, z;
	dim3 BlockPerGrid(1, 1, 1);
	dim3 ThreadPerBlock(XMAX, YMAX, ZMAX);
	cudaError_t err;

	full_t *top = NULL;
	err = cudaMallocManaged(&top, sizeof(full_t));
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Each launch gets the address of one [ZMAX][YMAX][XMAX] slice, viewed as
	// a coord_t. The slices are disjoint, so the launches are queued
	// back to back and waited for once after the loops. The loops stop at
	// the first launch error.
	err = cudaSuccess;
	for (i = 0; i < FILE_MAX && err == cudaSuccess; i++) {
		for (j = 0; j < MOL_MAX && err == cudaSuccess; j++) {
			for (k = 0; k < ATOM_MAX && err == cudaSuccess; k++) {
				sample<<<BlockPerGrid, ThreadPerBlock>>>((coord_t *) &(top->array[i][j][k][0][0][0]));
				err = cudaGetLastError();
			}
		}
	}
	// The host reads the managed buffer below, so all kernels must have
	// finished; in-kernel faults are also reported here.
	if (err == cudaSuccess) err = cudaDeviceSynchronize();
	if (err != cudaSuccess) {
		fprintf(stderr, "sample kernel failed: %s\n", cudaGetErrorString(err));
		cudaFree(top);
		return 1;
	}

	for (i = 0; i < FILE_MAX; i++) {
		for (j = 0; j < MOL_MAX; j++) {
			for (k = 0; k < ATOM_MAX; k++) {
				for (z = 0; z < ZMAX; z++) {
					for (y = 0; y < YMAX; y++) {
						for (x = 0; x < XMAX; x++) {
							printf(" %d", top->array[i][j][k][z][y][x]);
						}
						printf("\n");
					}
					printf("\n");
				}
				printf("\n");
			}
			printf("\n");
		}
		printf("\n");
		printf("\n");
	}

	cudaFree(top);
	return 0;
}

 

你可能感兴趣的:(CUDA)