// Two-stream copy/compute example, depth-first issue order: inside the main
// loop every operation for stream0 (copy a, copy b, kernel, copy c back) is
// enqueued before any operation for stream1.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "book.h"  // expected to provide HANDLE_ERROR (definition not shown here)

#define N (1024 * 1024)
#define FULL_DATA_SIZE (N * 20)
#define THREADS_PER_BLOCK 256
// Ceil-div so that every one of the N elements in a chunk gets a thread.
#define BLOCKS_PER_CHUNK ((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK)

// Element-wise kernel over one chunk of N ints.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// global index is >= N do nothing.
// For element idx it averages a[idx] with a[(idx + 1) % 256] and
// a[(idx + 2) % 256] (note the two neighbours always come from the first 256
// entries), does the same for b, and stores the mean of both averages,
// truncated to int, in c[idx].
// NOTE(review): the three-term sums are computed in int before the division;
// they can overflow if the inputs are close to INT_MAX.
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

// Processes FULL_DATA_SIZE ints in chunks of N using two streams and prints
// the elapsed time measured with CUDA events. Returns 0 in all visible paths.
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    // Start the timer.
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Create the two streams.
    cudaStream_t stream0, stream1;
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;  // device buffers used by stream0
    int *dev_a1, *dev_b1, *dev_c1;  // device buffers used by stream1

    // Allocate one chunk-sized (N ints) buffer set per stream on the device.
    HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int)));

    // Allocate page-locked host memory for use with the asynchronous copies.
    HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    // Walk over the full data set; each iteration handles two chunks of N
    // ints, the first on stream0 and the second on stream1.
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // stream0: asynchronously copy the inputs to the device.
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream0));
        kernel<<<BLOCKS_PER_CHUNK, THREADS_PER_BLOCK, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        // A launch does not return a status; pick up configuration errors here.
        HANDLE_ERROR(cudaGetLastError());

        // stream0: copy the result back to page-locked host memory.
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream0));

        // stream1: asynchronously copy the inputs to the device.
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int),
                                     cudaMemcpyHostToDevice, stream1));
        kernel<<<BLOCKS_PER_CHUNK, THREADS_PER_BLOCK, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        HANDLE_ERROR(cudaGetLastError());

        // stream1: copy the result back to page-locked host memory.
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int),
                                     cudaMemcpyDeviceToHost, stream1));
    }

    // Synchronize both streams before stopping the timer.
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time taken: %3.1f ms\n", elapsedTime);

    // Release host memory, device memory, streams and events.
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    return 0;
}
如果同时调度某个流的所有操作,那么很容易在无意中阻塞另一个流的复制操作或者核函数执行。要解决这个问题,在将操作放入流的队列时应采用宽度优先方式,而非深度优先方式。
// Two-stream copy/compute example, breadth-first issue order: inside the main
// loop each kind of operation is enqueued for stream0 and then stream1 before
// moving on to the next kind, instead of enqueueing all of one stream's work
// first.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "book.h"  // expected to provide HANDLE_ERROR (definition not shown here)

#define N (1024 * 1024)
#define FULL_DATA_SIZE (N * 20)
#define THREADS_PER_BLOCK 256
// Ceil-div so that every one of the N elements in a chunk gets a thread.
#define BLOCKS_PER_CHUNK ((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK)

// Element-wise kernel over one chunk of N ints.
//
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// global index is >= N do nothing.
// For element idx it averages a[idx] with a[(idx + 1) % 256] and
// a[(idx + 2) % 256] (note the two neighbours always come from the first 256
// entries), does the same for b, and stores the mean of both averages,
// truncated to int, in c[idx].
// NOTE(review): the three-term sums are computed in int before the division;
// they can overflow if the inputs are close to INT_MAX.
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}

// Processes FULL_DATA_SIZE ints in chunks of N using two streams and prints
// the elapsed time measured with CUDA events. Returns 0 in all visible paths.
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    // Start the timer.
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Create the two streams.
    cudaStream_t stream0, stream1;
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;  // device buffers used by stream0
    int *dev_a1, *dev_b1, *dev_c1;  // device buffers used by stream1

    // Allocate one chunk-sized (N ints) buffer set per stream on the device.
    HANDLE_ERROR(cudaMalloc((void **)&dev_a0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c0, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_a1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b1, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c1, N * sizeof(int)));

    // Allocate page-locked host memory for use with the asynchronous copies.
    HANDLE_ERROR(cudaHostAlloc((void **)&host_a, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_b, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void **)&host_c, FULL_DATA_SIZE * sizeof(int),
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    // Walk over the full data set; each iteration handles two chunks of N
    // ints, the first on stream0 and the second on stream1.
    // NOTE(review): the loop header, the host-to-device copies and the launch
    // configurations were lost from the published listing; they are
    // reconstructed here from the surviving fragments — confirm against the
    // original source.
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // enqueue copies of a in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));

        // enqueue copies of b in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
                                     N * sizeof(int),
                                     cudaMemcpyHostToDevice,
                                     stream1));

        // enqueue kernels in stream0 and stream1
        kernel<<<BLOCKS_PER_CHUNK, THREADS_PER_BLOCK, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        // A launch does not return a status; pick up configuration errors here.
        HANDLE_ERROR(cudaGetLastError());
        kernel<<<BLOCKS_PER_CHUNK, THREADS_PER_BLOCK, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        HANDLE_ERROR(cudaGetLastError());

        // enqueue copies of c from device to locked memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream0));
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
                                     N * sizeof(int),
                                     cudaMemcpyDeviceToHost,
                                     stream1));
    }

    // Synchronize both streams before stopping the timer.
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("Time taken: %3.1f ms\n", elapsedTime);

    // Release host memory, device memory, streams and events.
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    return 0;
}