__syncthreads() acts as a barrier at which all threads in the block must wait before any is allowed to proceed.
//计算向量的内积程序
#include<stdio.h>
#define imin(a,b) (a<b?a:b)
//N为输入的向量的规模
const int N=33*1024;
const int threadsPerBlock=256;
const int blocksPerGrid=
imin(32,(N+threadsPerBlock-1)/threadsPerBlock);
__global__ void dot(float *a,float *b,float *c)
{
//每一个块上都有cache变量的拷贝,相互之间不影响
__shared__ float cache[threadsPerBlock];
//tid为线程的偏移量
int tid=threadIdx.x+blockIdx.x*blockDim.x;
int cacheIndex=threadIdx.x;
float temp=0;
while(tid<N)
{
temp+=a[tid]*b[tid];
//增加的下标量为进程总数
tid+=blockDim.x*gridDim.x;
}
cache[cacheIndex]=temp;
//同步化当前块上的线程
__syncthreads();
int i=blockDim.x/2;
//在块内计算部分和
while(i!=0)
{
if(cacheIndex<i)
cache[cacheIndex]+=cache[cacheIndex+i];
__syncthreads();
i/=2;
}
if(cacheIndex==0)
c[blockIdx.x]=cache[0];
}
int main(void)
{
float *a,*b,c,*partial_c;
float *dev_a,*dev_b,*dev_partial_c;
a=(float*)malloc(N*sizeof(float));
b=(float*)malloc(N*sizeof(float));
partial_c=(float*)malloc(blocksPerGrid*sizeof(float));
cudaMalloc((void**)&dev_a,N*sizeof(float));
cudaMalloc((void**)&dev_b,N*sizeof(float));
cudaMalloc((void**)&dev_partial_c,blocksPerGrid*sizeof(float));
for(int i=0;i<N;i++)
{
a[i]=i;
b[i]=i*2;
}
cudaMemcpy(dev_a,a,N*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,N*sizeof(float),cudaMemcpyHostToDevice);
dot<<<blocksPerGrid,threadsPerBlock>>>(dev_a,dev_b,dev_partial_c);
cudaMemcpy(partial_c,dev_partial_c,blocksPerGrid*sizeof(float),cudaMemcpyDeviceToHost);
c=0;
for(int i=0;i<blocksPerGrid;i++)
c+=partial_c[i];
//此程序相当于计算0,1,...N-1的平方和
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
//测试向量内积的正确性
printf("Does GPU value %.6g = %.6g?\n",c,
2*sum_squares((float)(N-1)));
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_partial_c);
free(a);
free(b);
free(partial_c);
//测试输出
int j;
scanf("%d",&j);
}