/* Demonstration of CUDA streams: split a large host buffer into chunks and,
 * for each chunk, enqueue "copy to GPU" -> "compute" -> "copy back to CPU"
 * on one non-default stream so transfers can overlap with kernel execution. */
#include <stdio.h>
#include <stdlib.h>

#define N (1024*1024)   // elements per chunk transferred CPU->GPU each iteration
#define M (N*10)        // total number of elements held on the CPU

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Return true if the current device can overlap kernel execution with
 * host<->device copies. (deviceOverlap is deprecated in favor of
 * asyncEngineCount on newer toolkits, but is still populated.) */
bool support_overlap() {
    cudaDeviceProp prop;
    int dev;
    CUDA_CHECK(cudaGetDevice(&dev));
    CUDA_CHECK(cudaGetDeviceProperties(&prop, dev));
    return prop.deviceOverlap != 0;
}

/* Element-wise c[tid] = a[tid] + b[tid] over one chunk of N ints.
 * Expects a 1-D launch providing at least N threads; the guard protects
 * against any excess threads in the tail. */
__global__ void add(int* a, int* b, int* c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaStream_t stream;                  // declare the stream
    int *a, *b, *c, *d_a, *d_b, *d_c;

    /* BUG FIX: the original wrote `!support_overlap` (address of the
     * function, always non-null), so the capability check never fired.
     * The function must actually be called. */
    if (!support_overlap()) {
        printf("Sorry,the device cannot support overlap.\n");
        return 0;
    }

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));
    CUDA_CHECK(cudaStreamCreate(&stream));    // initialize the stream

    /* Device buffers hold one chunk (N ints) at a time and are reused. */
    CUDA_CHECK(cudaMalloc((void**)&d_a, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_b, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_c, N * sizeof(int)));

    /* Allocate page-locked (pinned) host memory; cudaMemcpyAsync is only
     * truly asynchronous when the host buffer is pinned. */
    CUDA_CHECK(cudaHostAlloc((void**)&a, M * sizeof(int), cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&b, M * sizeof(int), cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&c, M * sizeof(int), cudaHostAllocDefault));

    /* Fill host memory with random numbers. */
    for (int i = 0; i < M; i++) {
        a[i] = rand();
        b[i] = rand();
    }

    /* Split the input into chunks and enqueue the three steps
     * (H2D copy, kernel, D2H copy) for each chunk on the stream. */
    for (int i = 0; i < M; i += N) {
        // pinned host memory is copied to the device asynchronously
        CUDA_CHECK(cudaMemcpyAsync(d_a, a + i, N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_b, b + i, N * sizeof(int),
                                   cudaMemcpyHostToDevice, stream));
        add<<<N / 256, 256, 0, stream>>>(d_a, d_b, d_c);
        CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors
        /* BUG FIX: the original copied only sizeof(int) (4 bytes) back,
         * so all but the first element of each result chunk stayed
         * uninitialized on the host. Copy the full chunk. */
        CUDA_CHECK(cudaMemcpyAsync(c + i, d_c, N * sizeof(int),
                                   cudaMemcpyDeviceToHost, stream));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));   // wait for all queued work

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("%f", elapsedTime);

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFreeHost(b));
    /* BUG FIX: the original freed `b` twice (undefined behavior) and
     * leaked the pinned buffer `c`. */
    CUDA_CHECK(cudaFreeHost(c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}