CUDA编程模型系列十( CUDA Stream / CUDA 流 / 多流执行)
本系列视频目的是帮助开发者们一步步地学会利用CUDA编程模型加速GPU应用, 我们的口号是: 让GPU飞起来
本期我介绍了CUDA 中Stream的概念和使用方法, CUDA流也是很多CUDA加速库(cuBLAS, cuDNN, TensorRT)中常用的手段, 它能让多个执行队列并行执行, 还能让这些队列执行的过程中相对独立, 彼此不受影响
// cuda stream
// cuda lib, cudnn cublas tensort
// stream-- 一系列的指令执行队列
// mul-stream -- asyn -- order-- asyn
#include
#include
// a[] + b[] = c[]
#define N (1024 * 1024)
#define FULL_SIZE (N * 30)
//stream[2]
__global__ void kernel(int *a, int *b, int *c)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if( idx < N)
{
int idx1 = (idx + 1)%256;
int idx2 = (idx + 2)%256;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0;
float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0;
c[idx] = (as + bs)/2;
}
}
int main()
{
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice(&whichDevice);
cudaGetDeviceProperties(&prop, whichDevice);
if( !prop.deviceOverlap )
{
printf("Your device will not support speed up from multi-streams\n");
return 0;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t my_stream[3];
int *h_a, *h_b, *h_c;
int *d_a0, *d_b0, *d_c0;
int *d_a1, *d_b1, *d_c1;
int *d_a2, *d_b2, *d_c2;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaStreamCreate(&my_stream[0]);
cudaStreamCreate(&my_stream[1]);
cudaStreamCreate(&my_stream[2]);
cudaMalloc((void**) &d_a0, N * sizeof(int));
cudaMalloc((void**) &d_b0, N * sizeof(int));
cudaMalloc((void**) &d_c0, N * sizeof(int));
cudaMalloc((void**) &d_a1, N * sizeof(int));
cudaMalloc((void**) &d_b1, N * sizeof(int));
cudaMalloc((void**) &d_c1, N * sizeof(int));
cudaMalloc((void**) &d_a2, N * sizeof(int));
cudaMalloc((void**) &d_b2, N * sizeof(int));
cudaMalloc((void**) &d_c2, N * sizeof(int));
cudaHostAlloc((void**) &h_a, FULL_SIZE * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**) &h_b, FULL_SIZE * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**) &h_c, FULL_SIZE * sizeof(int), cudaHostAllocDefault);
for(int i = 0; i<FULL_SIZE; i++)
{
h_a[i] = rand()%1024;
h_b[i] = rand()%1024;
}
cudaEventRecord(start);
for(int i = 0; i < FULL_SIZE; i += N * 1)
{
cudaMemcpyAsync(d_a0, h_a+i, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[0]);
//cudaMemcpyAsync(d_a1, h_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[1]);
//cudaMemcpyAsync(d_a2, h_a+i+N+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[2]);
cudaMemcpyAsync(d_b0, h_a+i, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[0]);
//cudaMemcpyAsync(d_b1, h_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[1]);
//cudaMemcpyAsync(d_b2, h_a+i+N+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[2]);
kernel<<<N/256, 256, 0, my_stream[0]>>>(d_a0, d_b0, d_c0);
//kernel<<>>(d_a1, d_b1, d_c1);
//kernel<<>>(d_a2, d_b2, d_c2);
cudaMemcpyAsync(h_c+i, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);
//cudaMemcpyAsync(h_c+i+N, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);
//cudaMemcpyAsync(h_c+i+N+N, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);
}
cudaStreamSynchronize(my_stream[0]);
cudaStreamSynchronize(my_stream[1]);
cudaStreamSynchronize(my_stream[2]);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("Time: %3.2f ms\n", elapsedTime);
// cudaFree
return 0;
}