CUDA计算向量内积的程序(源自CUDA范例编程)

__syncthreads() acts as a barrier at which all threads in the block must wait before any is allowed to proceed.
//计算向量的内积程序

#include<stdio.h>
#define imin(a,b)    (a<b?a:b)

//N为输入的向量的规模
const int N=33*1024;
const int threadsPerBlock=256;
const int blocksPerGrid=
			imin(32,(N+threadsPerBlock-1)/threadsPerBlock);

__global__ void dot(float *a,float *b,float *c)
{
	//每一个块上都有cache变量的拷贝,相互之间不影响
	__shared__ float cache[threadsPerBlock];
	//tid为线程的偏移量
	int tid=threadIdx.x+blockIdx.x*blockDim.x;
	int cacheIndex=threadIdx.x;
	float temp=0;
	while(tid<N)
	{
		temp+=a[tid]*b[tid];
		//增加的下标量为进程总数
		tid+=blockDim.x*gridDim.x;

	}
	cache[cacheIndex]=temp;
	//同步化当前块上的线程
	__syncthreads();

	int i=blockDim.x/2;
	//在块内计算部分和
	while(i!=0)
	{
		if(cacheIndex<i)
			cache[cacheIndex]+=cache[cacheIndex+i];
		__syncthreads();
		i/=2;
	}

	if(cacheIndex==0)
		c[blockIdx.x]=cache[0];
	
}


int main(void)
{

	float *a,*b,c,*partial_c;

	float *dev_a,*dev_b,*dev_partial_c;

	a=(float*)malloc(N*sizeof(float));
	b=(float*)malloc(N*sizeof(float));
	partial_c=(float*)malloc(blocksPerGrid*sizeof(float));

	cudaMalloc((void**)&dev_a,N*sizeof(float));
	cudaMalloc((void**)&dev_b,N*sizeof(float));
	cudaMalloc((void**)&dev_partial_c,blocksPerGrid*sizeof(float));


	for(int i=0;i<N;i++)
	{

		a[i]=i;
		b[i]=i*2;
	}

	cudaMemcpy(dev_a,a,N*sizeof(float),cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b,b,N*sizeof(float),cudaMemcpyHostToDevice);

	dot<<<blocksPerGrid,threadsPerBlock>>>(dev_a,dev_b,dev_partial_c);

	cudaMemcpy(partial_c,dev_partial_c,blocksPerGrid*sizeof(float),cudaMemcpyDeviceToHost);

	c=0;
	for(int i=0;i<blocksPerGrid;i++)
		c+=partial_c[i];
    //此程序相当于计算0,1,...N-1的平方和
	#define sum_squares(x)   (x*(x+1)*(2*x+1)/6)
    //测试向量内积的正确性
	printf("Does GPU value %.6g = %.6g?\n",c,
		2*sum_squares((float)(N-1)));


	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_partial_c);

	free(a);
	free(b);
	free(partial_c);
	//测试输出
	int j;
	scanf("%d",&j);






}

 

你可能感兴趣的:(编程,c,cache,CUDA,测试,float)