CUDA(16)之内存对齐

摘要

本文主要讲述CUDA内存对齐。

 

1. 背景

CUDA内存对齐的背景就不说了。

 

2. 采用SoA设计/构造并行的数据结构

array of structures(AoS)和structure of arrays(SoA)是C语言中两种基本的数据组织方式。SoA的内存布局更适合并行计算的数据结构设计。SoA在并行计算上的具体实现见下面过程分析。

 

// Number of threads per block; each array below holds one element per thread.
#define threads 16

// Structure-of-Arrays (SoA) layout: every field is a contiguous array of
// `threads` ints. When thread i accesses element i of a given field, the
// 16 concurrent accesses of a half-warp fall on consecutive, aligned
// addresses in global memory, so they coalesce into a single transaction.
struct T {  
    int s0[threads];  
    int s1[threads];
    int s2[threads];  
    int s3[threads];
    int s4[threads];  
    int s5[threads];
    int s6[threads];  
    int s7[threads];
    int s8[threads];  
    int s9[threads];
    int s10[threads];  
    int s11[threads];
    int s12[threads];  
    int s13[threads];
    int s14[threads];  
    int s15[threads];
}; 

SoA结构的数据类型T如上面代码所示,T在CUDA的global memory中的存储方式是理解并行数据结构设计的关键:thread0访问s0[0],thread1访问s0[1],......,thread15访问s0[15];随后所有线程再依次访问s1,s2,......,s15数组中各自对应下标的元素。

任何一次并发的16个threads访问的内存总是对齐且连续的;比如,并发的thread0,thread1,......,thread15第一次访问的global memory的对齐的地址为s0[0],s0[1],s0[2],......,s0[15].

 

3. global memory内存对齐代码测试

 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

using namespace std;


// Threads per block for the test kernel; each field holds one int per thread.
#define threads 160

// Structure-of-Arrays (SoA) test type: 16 independent int arrays of length
// `threads`. Thread i writes element i of every field, so each of the 16
// field-wise stores performed by the block is a fully coalesced access to a
// contiguous, aligned run of global memory.
struct T {  
	int s1[threads];  
    int s2[threads];
	int s3[threads];  
    int s4[threads];
	int s5[threads];  
    int s6[threads];
	int s7[threads];  
    int s8[threads];
	int s9[threads];  
    int s10[threads];
	int s11[threads];  
    int s12[threads];
	int s13[threads];  
    int s14[threads];
	int s15[threads];  
    int s16[threads];
};  

// Kernel: thread `tid` writes element `tid` of every field of the SoA
// struct, storing the field's ordinal (s1 gets 1, s2 gets 2, ...).
// Expected launch: a 1-D grid with at least `threadsNum` total threads;
// adjacent threads hit adjacent ints, so every store is coalesced.
__global__ void initStruct(T *data, const int threadsNum){
	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
	// Guard clause: threads past the end of the arrays do nothing.
	if (tid >= (unsigned int)threadsNum) return;

	data->s1[tid]  = 1;
	data->s2[tid]  = 2;
	data->s3[tid]  = 3;
	data->s4[tid]  = 4;
	data->s5[tid]  = 5;
	data->s6[tid]  = 6;
	data->s7[tid]  = 7;
	data->s8[tid]  = 8;
	data->s9[tid]  = 9;
	data->s10[tid] = 10;
	data->s11[tid] = 11;
	data->s12[tid] = 12;
	data->s13[tid] = 13;
	data->s14[tid] = 14;
	data->s15[tid] = 15;
	data->s16[tid] = 16;
}
  

      

// Host driver: allocates one SoA struct T in device global memory, launches
// the initialization kernel (one thread per element), copies the struct back,
// and prints all 16 fields row by row so the result can be inspected.
// Fix: the print loop header had been corrupted ("for (int i=0; is1[i] ...")
// — the condition, increment, and first output statement are restored here.
int main(int argc, char **argv) {

	// use device 0
	int dev = 0;
	cudaSetDevice(dev);

	// device-side SoA buffer
	T *data;
	cudaMalloc((T **)&data, sizeof(T));

	// host-side copy for verification
	T *res = (T *)malloc(sizeof(T));

	// execution configuration: a single block of `threads` threads,
	// one thread per array element
	dim3 block (threads,1);
	dim3 grid (1,1);

	// kernel launch
	initStruct<<< grid, block >>> (data, threads);

	// catch launch-configuration errors (kernel launches don't return one)
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		cout << "kernel launch failed: " << cudaGetErrorString(err) << endl;
	}

	// blocking copy back; also synchronizes with the kernel
	cudaMemcpy(res, data, sizeof(T), cudaMemcpyDeviceToHost);

	// print: row i holds the values written by thread i (expected 1..16)
	for (int i = 0; i < threads; i++) {
		cout << res->s1[i] <<" ";
		cout << res->s2[i] <<" ";
		cout << res->s3[i] <<" ";
		cout << res->s4[i] <<" ";
		cout << res->s5[i] <<" ";
		cout << res->s6[i] <<" ";
		cout << res->s7[i] <<" ";
		cout << res->s8[i] <<" ";
		cout << res->s9[i] <<" ";
		cout << res->s10[i] <<" ";
		cout << res->s11[i] <<" ";
		cout << res->s12[i] <<" ";
		cout << res->s13[i] <<" ";
		cout << res->s14[i] <<" ";
		cout << res->s15[i] <<" ";
		cout << res->s16[i] <<" ";
		cout << endl;
	}

	// free memories both host and device
	cudaFree(data);
	free(res);

	// reset device
	cudaDeviceReset();

	return EXIT_SUCCESS;
}

 

 

 

 

 

 

你可能感兴趣的:(GPU编程)