// CUDA: using multiple streams

/* Introduction to CUDA streams: this example uses two streams. */
#include <stdio.h>
#include <stdlib.h>
/* Both expansions are parenthesized so they stay a single value inside larger
   expressions such as M / N or x % N. */
#define N (1024*1024)//size (element count) of each chunk transferred from CPU to GPU
#define M (N*20)//total amount of data (element count) held on the CPU
/*
 * Test whether the current device can copy data between host and device
 * while a kernel is executing.
 *
 * Returns true when the device reports at least one asynchronous copy engine.
 * Returns false when it reports none, and also when the current device or its
 * properties cannot be queried (the properties struct would otherwise be read
 * uninitialized).
 */
bool support_overlap(){
	int curDev = 0;
	if(cudaGetDevice(&curDev)!=cudaSuccess)
		return false;

	cudaDeviceProp prop;
	if(cudaGetDeviceProperties(&prop,curDev)!=cudaSuccess)
		return false;

	/* asyncEngineCount is the documented replacement for the deprecated
	   deviceOverlap flag: 0 means no copy/kernel overlap is possible. */
	return prop.asyncEngineCount>0;
}
/* Print the failing call site and terminate when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
	do {                                                                  \
		cudaError_t err_ = (call);                                        \
		if (err_ != cudaSuccess) {                                        \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(err_));                            \
			exit(EXIT_FAILURE);                                           \
		}                                                                 \
	} while (0)

/*
 * Element-wise add of one chunk: c[tid] = a[tid] + b[tid].
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
 * global index is >= N do nothing. No shared memory is used.
 * a, b and c must each point to at least N ints of device memory.
 *
 * NOTE(review): the text of this kernel after "if(tid" was lost in the
 * original (markup stripping). The bound check and the addition below are
 * reconstructed from the kernel's name and signature - confirm.
 */
__global__ void add(int* a,int* b,int* c){
	int tid = threadIdx.x+blockIdx.x*blockDim.x;
	if(tid<N)
		c[tid] = a[tid]+b[tid];
}

/*
 * Adds two host arrays of M ints chunk by chunk on the GPU, alternating the
 * chunks between two streams so that copies in one stream can overlap with
 * kernel execution in the other, then prints the elapsed time in milliseconds.
 *
 * NOTE(review): everything from the start of main up to the first kernel
 * launch was missing from the original text. Allocation, host initialisation,
 * the host-to-device copies and the launch configuration are reconstructed
 * from the variable names that survive in the tail of the function - confirm
 * against the author's original.
 *
 * Returns 0 on success; any failing CUDA call terminates the process.
 */
int main(){
	/* Two streams are pointless if the device cannot overlap copy and compute. */
	if(!support_overlap()){
		printf("Device cannot overlap memory copies with kernel execution\n");
		return 0;
	}

	/* NOTE(review): block size is an assumption; the original value was lost. */
	const int threads = 256;
	const int blocks = (N+threads-1)/threads;
	const size_t chunkBytes = sizeof(int)*N;
	const size_t totalBytes = sizeof(int)*M;

	cudaEvent_t start,stop;
	float elapsedTime;
	CUDA_CHECK(cudaEventCreate(&start));
	CUDA_CHECK(cudaEventCreate(&stop));

	cudaStream_t stream1,stream2;
	CUDA_CHECK(cudaStreamCreate(&stream1));
	CUDA_CHECK(cudaStreamCreate(&stream2));

	/* One set of chunk-sized device buffers per stream. */
	int *d_a1,*d_b1,*d_c1;
	int *d_a2,*d_b2,*d_c2;
	CUDA_CHECK(cudaMalloc((void**)&d_a1,chunkBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b1,chunkBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_c1,chunkBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_a2,chunkBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_b2,chunkBytes));
	CUDA_CHECK(cudaMalloc((void**)&d_c2,chunkBytes));

	/* Page-locked host buffers: required for cudaMemcpyAsync to be truly
	   asynchronous, and released with cudaFreeHost below. */
	int *a,*b,*c;
	CUDA_CHECK(cudaHostAlloc((void**)&a,totalBytes,cudaHostAllocDefault));
	CUDA_CHECK(cudaHostAlloc((void**)&b,totalBytes,cudaHostAllocDefault));
	CUDA_CHECK(cudaHostAlloc((void**)&c,totalBytes,cudaHostAllocDefault));

	/* NOTE(review): input values are placeholders; the original initialisation was lost. */
	for(int i=0;i<M;i++){
		a[i] = i;
		b[i] = 2*i;
	}

	CUDA_CHECK(cudaEventRecord(start,0));
	/* Each iteration handles two chunks: [i, i+N) on stream1 and [i+N, i+2N) on stream2.
	   Work is queued breadth-first (same stage for both streams back to back). */
	for(int i=0;i<M;i+=N*2){
		CUDA_CHECK(cudaMemcpyAsync(d_a1,a+i,chunkBytes,cudaMemcpyHostToDevice,stream1));
		CUDA_CHECK(cudaMemcpyAsync(d_a2,a+i+N,chunkBytes,cudaMemcpyHostToDevice,stream2));
		CUDA_CHECK(cudaMemcpyAsync(d_b1,b+i,chunkBytes,cudaMemcpyHostToDevice,stream1));
		CUDA_CHECK(cudaMemcpyAsync(d_b2,b+i+N,chunkBytes,cudaMemcpyHostToDevice,stream2));
		add<<<blocks,threads,0,stream1>>>(d_a1,d_b1,d_c1);
		CUDA_CHECK(cudaGetLastError());
		add<<<blocks,threads,0,stream2>>>(d_a2,d_b2,d_c2);
		CUDA_CHECK(cudaGetLastError());
		/* Copy back the whole chunk (N ints), not a single int. */
		CUDA_CHECK(cudaMemcpyAsync(c+i,d_c1,chunkBytes,cudaMemcpyDeviceToHost,stream1));
		CUDA_CHECK(cudaMemcpyAsync(c+i+N,d_c2,chunkBytes,cudaMemcpyDeviceToHost,stream2));
	}
	/* Synchronize the CPU with the GPU: wait until both streams have drained. */
	CUDA_CHECK(cudaStreamSynchronize(stream1));
	CUDA_CHECK(cudaStreamSynchronize(stream2));
	CUDA_CHECK(cudaEventRecord(stop,0));
	CUDA_CHECK(cudaEventSynchronize(stop));
	CUDA_CHECK(cudaEventElapsedTime(&elapsedTime,start,stop));
	printf("%f",elapsedTime);

	CUDA_CHECK(cudaFree(d_a1));
	CUDA_CHECK(cudaFree(d_b1));
	CUDA_CHECK(cudaFree(d_c1));
	CUDA_CHECK(cudaFree(d_a2));
	CUDA_CHECK(cudaFree(d_b2));
	CUDA_CHECK(cudaFree(d_c2));
	CUDA_CHECK(cudaFreeHost(a));
	CUDA_CHECK(cudaFreeHost(b));
	/* Release c here; freeing b a second time would double-free b and leak c. */
	CUDA_CHECK(cudaFreeHost(c));
	CUDA_CHECK(cudaEventDestroy(start));
	CUDA_CHECK(cudaEventDestroy(stop));
	CUDA_CHECK(cudaStreamDestroy(stream1));
	CUDA_CHECK(cudaStreamDestroy(stream2));
	return 0;
}

// You may also be interested in: (cuda)