// Asynchronous execution across multiple CUDA streams (多个流之间的异步执行)

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <ctime>
#include <iostream>
using namespace std;


const int N = 8192*8192;
const int threads = 256;
const int blocks = 16*2048/threads;


// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, size).
// Uses a grid-stride loop, so the kernel is correct for any grid/block
// configuration (including one smaller than `size`).
__global__ void kernel(int* a, int* b, int* c, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = gridDim.x * blockDim.x;
    while (idx < size)
    {
        c[idx] = a[idx] + b[idx];
        idx += stride;
    }
}


// Compares three ways of computing c = a + b over N ints:
//   1. Multi-stream: chunked async H2D copies, per-stream kernel launches,
//      and async D2H copies, so copy and compute can overlap.
//   2. Single launch over the whole array (one H2D/compute/D2H sequence).
//   3. A plain CPU loop as the baseline.
// Kernel timings are measured with CUDA events; the CPU loop with clock().
int main(void)
{
    int *a_h, *a_d, *b_h, *b_d, *c_h, *c_d;

    // Pinned (page-locked) host memory is required for cudaMemcpyAsync to
    // truly run asynchronously; pageable malloc() memory silently falls
    // back to synchronous staging copies and kills the overlap.
    cudaMallocHost((void**)&a_h, N * sizeof(int));
    cudaMallocHost((void**)&b_h, N * sizeof(int));
    cudaMallocHost((void**)&c_h, N * sizeof(int));

    cudaMalloc((void**)&a_d, N * sizeof(int));
    cudaMalloc((void**)&b_d, N * sizeof(int));
    cudaMalloc((void**)&c_d, N * sizeof(int));

    for (int i = 0; i < N; i++)
    {
        a_h[i] = i;
        b_h[i] = 2 * i;          // so c[i] == 3*i, used by the check below
    }

    const int nStream = 8;
    const int chunkElems = N / nStream;                 // elements per stream
    const int chunkBytes = chunkElems * sizeof(int);    // bytes per stream
    int offset;
    cudaStream_t stream[nStream];
    for (int i = 0; i < nStream; i++)
    {
        cudaStreamCreate(&stream[i]);
    }

    // Stage each chunk's inputs on its own stream.
    for (int i = 0; i < nStream; i++)
    {
        offset = i * chunkElems;
        cudaMemcpyAsync(a_d + offset, a_h + offset, chunkBytes, cudaMemcpyHostToDevice, stream[i]);
        cudaMemcpyAsync(b_d + offset, b_h + offset, chunkBytes, cudaMemcpyHostToDevice, stream[i]);
    }

    cudaEvent_t start, stop;
    float elapsedtime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    // Events recorded on the legacy default stream synchronize with all
    // other streams, so start/stop bracket every per-stream launch.
    cudaEventRecord(start, 0);
    for (int i = 0; i < nStream; i++)
    {
        offset = i * chunkElems;
        // Launch on stream[i] so each chunk's compute can overlap with
        // other chunks' copies.
        kernel<<<blocks, threads, 0, stream[i]>>>(a_d + offset, b_d + offset, c_d + offset, chunkElems);
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedtime, start, stop);
    cout << elapsedtime << endl;

    for (int i = 0; i < nStream; i++)
    {
        offset = i * chunkElems;
        cudaMemcpyAsync(c_h + offset, c_d + offset, chunkBytes, cudaMemcpyDeviceToHost, stream[i]);
    }
    // The D2H copies above are asynchronous: the host must wait for all
    // streams before it is safe to read c_h.
    cudaDeviceSynchronize();

    // Every result should be 3*i; print anything that is not.
    for (int i = 0; i < N; i++)
    {
        if (c_h[i] % 3 != 0)
            cout << c_h[i] << "  ";
    }
    cout << "GPU completed....." << endl;

    // --- Single-launch version over the full array, for comparison. ---
    int *e_d, *f_d, *g_d;
    cudaMalloc((void**)&e_d, N * sizeof(int));
    cudaMalloc((void**)&f_d, N * sizeof(int));
    cudaMalloc((void**)&g_d, N * sizeof(int));
    cudaMemcpy(e_d, a_h, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(f_d, b_h, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start, 0);
    kernel<<<blocks, threads>>>(e_d, f_d, g_d, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedtime, start, stop);
    cout << elapsedtime << endl;

    // --- CPU baseline. ---
    clock_t begin, end;
    begin = clock();
    for (int i = 0; i < N; i++)
    {
        c_h[i] = a_h[i] + b_h[i];
    }
    end = clock() - begin;
    cout << end << endl;
    cout << "CPU completed....." << endl;

    // Release every resource that was created above.
    for (int i = 0; i < nStream; i++)
    {
        cudaStreamDestroy(stream[i]);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFreeHost(a_h);
    cudaFreeHost(b_h);
    cudaFreeHost(c_h);
    cudaFree(a_d);
    cudaFree(b_d);
    cudaFree(c_d);
    cudaFree(e_d);
    cudaFree(f_d);
    cudaFree(g_d);
    return 0;
}

// [Figure 1 from the original post: timing output screenshot, omitted from this listing.]