/* asum: sum of all entries of a vector.
* This code only calculates one block to show the usage of shared memory and synchronization */
/* Element type shared by the kernels and the host code in this file. */
typedef double FLOAT;
/* Sum all entries of x and assign the result to *y.
 *
 * Launch layout: one block of exactly 256 threads. x must hold at least 256
 * elements and y at least one; the fixed-size shared buffer and the unrolled
 * steps below hard-code that size.
 *
 * Every step reads shared-memory values that other threads wrote in the
 * previous step, so each step is followed by a __syncthreads() barrier to
 * avoid a data race. The barriers sit outside the `if` bodies so that every
 * thread of the block reaches them. Note that frequent use of __syncthreads()
 * costs kernel throughput.
 */
__global__ void reduction_1(const FLOAT *x, FLOAT *y)
{
    __shared__ FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem (x is the input vector) */
    sdata[tid] = x[tid];
    __syncthreads();

    /* reduction using shared mem, with the for loop unrolled by hand */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) sdata[tid] += sdata[tid + 32];
    __syncthreads();

    if (tid < 16) sdata[tid] += sdata[tid + 16];
    __syncthreads();

    if (tid < 8) sdata[tid] += sdata[tid + 8];
    __syncthreads();

    if (tid < 4) sdata[tid] += sdata[tid + 4];
    __syncthreads();

    if (tid < 2) sdata[tid] += sdata[tid + 2];
    __syncthreads();

    /* the last pair is added directly by thread 0 */
    if (tid == 0) {
        *y = sdata[0] + sdata[1];
    }
}
/* Sum all entries of x and assign the result to y[0].
 *
 * Launch layout: one block of exactly 256 threads; x must hold at least 256
 * elements and y at least one.
 *
 * The first two steps span several warps and therefore need block-wide
 * __syncthreads() barriers. Once only threads 0..31 (a single warp) remain,
 * the block barrier is replaced by the cheaper __syncwarp(). Each step first
 * reads the partner value into a register, synchronizes, then writes, then
 * synchronizes again, so no lane can overwrite a value another lane has not
 * read yet. (__syncwarp requires CUDA 9 or later.)
 */
__global__ void reduction_2(const FLOAT *x, FLOAT *y)
{
    /* volatile keeps the compiler from caching shared-memory values in registers */
    __shared__ volatile FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem */
    sdata[tid] = x[tid];
    __syncthreads();

    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    /* final steps: all 32 lanes of warp 0 take this branch together */
    if (tid < 32) {
        FLOAT v = sdata[tid];

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1) {
            v += sdata[tid + offset];
            __syncwarp();   /* every lane has read before any lane writes */
            sdata[tid] = v;
            __syncwarp();   /* every write is visible before the next read */
        }
    }

    if (tid == 0) y[0] = sdata[0];
}
/* Warp-level tail of the block reduction: folds sdata[0..63] into sdata[0].
 *
 * __device__ functions can only be called from code running on the GPU.
 *
 * Preconditions: must be called by all 32 threads of one warp, with tid in
 * 0..31 (the kernel in this file calls it under `if (tid < 32)`), and sdata
 * must have at least 64 valid entries because index tid + 32 is read.
 *
 * Each step reads the partner value into a register, synchronizes the warp,
 * writes the partial sum back, and synchronizes again. The explicit
 * __syncwarp() calls replace the assumption that a warp runs in lockstep.
 * (__syncwarp requires CUDA 9 or later.)
 */
__device__ void warpReduce(volatile FLOAT *sdata, int tid)
{
    FLOAT v = sdata[tid];

    #pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        v += sdata[tid + offset];
        __syncwarp();   /* every lane has read before any lane writes */
        sdata[tid] = v;
        __syncwarp();   /* every write is visible before the next read */
    }
}
/* Sum all entries of x and assign the result to y[0].
 *
 * Launch layout: one block of exactly 256 threads; x must hold at least 256
 * elements and y at least one.
 *
 * The steps that span several warps are separated by __syncthreads()
 * barriers; the last 64 partial sums are folded by warpReduce(), which is
 * executed by threads 0..31 only.
 */
__global__ void reduction_3(const FLOAT *x, FLOAT *y)
{
    __shared__ FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem */
    sdata[tid] = x[tid];
    __syncthreads();

    /* reduction using shared mem */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) warpReduce(sdata, tid);

    /* thread 0 took part in warpReduce, so sdata[0] is final here */
    if (tid == 0) y[0] = sdata[0];
}
/* Launch one reduction kernel on a single block of n threads, wait for it to
 * finish, copy the scalar result back and print it.
 *
 * kernel: reduction kernel to launch; name: label used in the printed line;
 * n: number of threads in the block; dx: device input vector; dy: device
 * output scalar.
 *
 * Returns 0 on success, nonzero if the launch, the execution or the copy
 * reported a CUDA error.
 */
static int run_reduction(void (*kernel)(const FLOAT *, FLOAT *), const char *name,
                         int n, const FLOAT *dx, FLOAT *dy)
{
    FLOAT as = 0;
    cudaError_t err;

    /* call GPU */
    kernel<<<1, n>>>(dx, dy);
    err = cudaGetLastError();               /* launch-configuration errors */

    /* let GPU finish */
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    /* copy data from GPU */
    if (err == cudaSuccess) {
        err = cudaMemcpy(&as, dy, sizeof(FLOAT), cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess) {
        printf("%s failed: %s\n", name, cudaGetErrorString(err));
        return 1;
    }

    printf("%s, answer: 256, calculated by GPU:%g\n", name, as);
    return 0;
}

/* Fill a 256-element vector with ones, run the three reduction kernels on it
 * and print each computed sum next to the expected answer (256).
 *
 * Returns 0 on success, -1 if GPU memory could not be allocated, -2 if host
 * memory could not be allocated, -3 if a copy or a kernel failed.
 */
int main()
{
    int N = 256; /* must be 256 */
    int nbytes = N * sizeof(FLOAT);
    FLOAT *dx = NULL, *dy = NULL, *hx = NULL;
    int i;
    int status = 0;

    /************** allocate GPU mem ***************/
    cudaMalloc((void **)&dx, nbytes);
    cudaMalloc((void **)&dy, sizeof(FLOAT));

    if (dx == NULL || dy == NULL) {
        printf("couldn't allocate GPU memory\n");
        /* cudaFree(NULL) is a no-op, so whichever pointer was set is released */
        cudaFree(dx);
        cudaFree(dy);
        return -1;
    }

    printf("allocated %e MB on GPU\n", nbytes / (1024.f * 1024.f));

    /**************** allocate CPU mem ************/
    hx = (FLOAT *) malloc(nbytes);

    if (hx == NULL) {
        printf("couldn't allocate CPU memory\n");
        cudaFree(dx);
        cudaFree(dy);
        return -2;
    }

    printf("allocated %e MB on CPU\n", nbytes / (1024.f * 1024.f));

    /****************** init *********************/
    for (i = 0; i < N; i++) {
        hx[i] = 1;
    }

    /* copy data to GPU */
    if (cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        printf("couldn't copy data to GPU\n");
        status = -3;
    }

    /* run each kernel in turn; stop at the first failure */
    if (status == 0 && run_reduction(reduction_1, "reduction_1", N, dx, dy)) status = -3;
    if (status == 0 && run_reduction(reduction_2, "reduction_2", N, dx, dy)) status = -3;
    if (status == 0 && run_reduction(reduction_3, "reduction_3", N, dx, dy)) status = -3;

    /* release device and host buffers */
    cudaFree(dx);
    cudaFree(dy);
    free(hx);

    return status;
}
/* Sum all entries of x and assign the result to *y.
 *
 * NOTE(review): a definition of reduction_1 also appears earlier in this
 * file; only one of the two can be kept, since a second definition of the
 * same kernel is a redefinition error.
 *
 * Launch layout: one block of exactly 256 threads. x must hold at least 256
 * elements and y at least one; the fixed-size shared buffer and the unrolled
 * steps below hard-code that size.
 *
 * Every step reads shared-memory values that other threads wrote in the
 * previous step, so each step is followed by a __syncthreads() barrier to
 * avoid a data race. The barriers sit outside the `if` bodies so that every
 * thread of the block reaches them. Note that frequent use of __syncthreads()
 * costs kernel throughput.
 */
__global__ void reduction_1(const FLOAT *x, FLOAT *y)
{
    __shared__ FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem (x is the input vector) */
    sdata[tid] = x[tid];
    __syncthreads();

    /* reduction using shared mem, with the for loop unrolled by hand */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) sdata[tid] += sdata[tid + 32];
    __syncthreads();

    if (tid < 16) sdata[tid] += sdata[tid + 16];
    __syncthreads();

    if (tid < 8) sdata[tid] += sdata[tid + 8];
    __syncthreads();

    if (tid < 4) sdata[tid] += sdata[tid + 4];
    __syncthreads();

    if (tid < 2) sdata[tid] += sdata[tid + 2];
    __syncthreads();

    /* the last pair is added directly by thread 0 */
    if (tid == 0) {
        *y = sdata[0] + sdata[1];
    }
}
/* Sum all entries of x and assign the result to y[0].
 *
 * NOTE(review): a definition of reduction_2 also appears earlier in this
 * file; only one of the two can be kept, since a second definition of the
 * same kernel is a redefinition error.
 *
 * Launch layout: one block of exactly 256 threads; x must hold at least 256
 * elements and y at least one.
 *
 * The first two steps span several warps and therefore need block-wide
 * __syncthreads() barriers. Once only threads 0..31 (a single warp) remain,
 * the block barrier is replaced by the cheaper __syncwarp(). Each step first
 * reads the partner value into a register, synchronizes, then writes, then
 * synchronizes again, so no lane can overwrite a value another lane has not
 * read yet. (__syncwarp requires CUDA 9 or later.)
 */
__global__ void reduction_2(const FLOAT *x, FLOAT *y)
{
    /* volatile keeps the compiler from caching shared-memory values in registers */
    __shared__ volatile FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem */
    sdata[tid] = x[tid];
    __syncthreads();

    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    /* final steps: all 32 lanes of warp 0 take this branch together */
    if (tid < 32) {
        FLOAT v = sdata[tid];

        #pragma unroll
        for (int offset = 32; offset > 0; offset >>= 1) {
            v += sdata[tid + offset];
            __syncwarp();   /* every lane has read before any lane writes */
            sdata[tid] = v;
            __syncwarp();   /* every write is visible before the next read */
        }
    }

    if (tid == 0) y[0] = sdata[0];
}
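/* On the first point: it was already covered in section 2. On the second
 * point: the smallest unit of execution on the GPU is a warp, and one warp
 * has 32 threads; this is also why the program, from `if (tid < 32)` ...
 * (the original note is cut off at this point). */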
/* Warp-level tail of the block reduction: folds sdata[0..63] into sdata[0].
 *
 * __device__ functions can only be called from code running on the GPU.
 *
 * NOTE(review): a definition of warpReduce also appears earlier in this
 * file; only one of the two can be kept, since a second definition of the
 * same function is a redefinition error.
 *
 * Preconditions: must be called by all 32 threads of one warp, with tid in
 * 0..31 (the kernel in this file calls it under `if (tid < 32)`), and sdata
 * must have at least 64 valid entries because index tid + 32 is read.
 *
 * Each step reads the partner value into a register, synchronizes the warp,
 * writes the partial sum back, and synchronizes again. The explicit
 * __syncwarp() calls replace the assumption that a warp runs in lockstep.
 * (__syncwarp requires CUDA 9 or later.)
 */
__device__ void warpReduce(volatile FLOAT *sdata, int tid)
{
    FLOAT v = sdata[tid];

    #pragma unroll
    for (int offset = 32; offset > 0; offset >>= 1) {
        v += sdata[tid + offset];
        __syncwarp();   /* every lane has read before any lane writes */
        sdata[tid] = v;
        __syncwarp();   /* every write is visible before the next read */
    }
}
/* Sum all entries of x and assign the result to y[0].
 *
 * NOTE(review): a definition of reduction_3 also appears earlier in this
 * file; only one of the two can be kept, since a second definition of the
 * same kernel is a redefinition error.
 *
 * Launch layout: one block of exactly 256 threads; x must hold at least 256
 * elements and y at least one.
 *
 * The steps that span several warps are separated by __syncthreads()
 * barriers; the last 64 partial sums are folded by warpReduce(), which is
 * executed by threads 0..31 only.
 */
__global__ void reduction_3(const FLOAT *x, FLOAT *y)
{
    __shared__ FLOAT sdata[256];
    int tid = threadIdx.x;

    /* load data to shared mem */
    sdata[tid] = x[tid];
    __syncthreads();

    /* reduction using shared mem */
    if (tid < 128) sdata[tid] += sdata[tid + 128];
    __syncthreads();

    if (tid < 64) sdata[tid] += sdata[tid + 64];
    __syncthreads();

    if (tid < 32) warpReduce(sdata, tid);

    /* thread 0 took part in warpReduce, so sdata[0] is final here */
    if (tid == 0) y[0] = sdata[0];
}