本文是Nvidia 90美金的课程笔记
无论是从出色的性能,还是从易用性来看,CUDA计算平台都是加速计算的制胜法宝。CUDA 提供了一种可扩展 C、C++、Python 和 Fortran 等语言的编码范式,该范式能够在世界上性能超强劲的并行处理器 NVIDIA GPU 上运行经加速的大规模并行代码。CUDA 可以毫不费力地大幅加速应用程序,具有适用于DNN、BLAS、图形分析和FFT等更多运算的高度优化库生态系统,并且还附带功能强大的命令行和可视化性能分析器。
https://www.nvidia.com/en-us/gpu-accelerated-applications/
gridDim.x 网格中的块数,图中为2
blockIdx.x网格中块的索引,图中为0,1
blockDim.x块中线程数 图中为4
threadIdx.x块中线程的索引,图中为,0,1,2,3
流多处理器(Streaming Multiprocessors)
统一内存(UM)
nsight-sys
命令示例
nvcc -o vector-add-no-prefetch 01-vector-add/01-vector-add.cu -run
nsys profile --stats=true -o vector-add-no-prefetch-report ./vector-add-no-prefetch
包含
使用跨网格循环来处理比网格更大的数组
CUDA错误处理功能
#include
#include
inline cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
void initWith(float num, float *a, int N)
{
for(int i = 0; i < N; ++i)
{
a[i] = num;
}
}
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
result[i] = a[i] + b[i];
}
}
void checkElementsAre(float target, float *array, int N)
{
for(int i = 0; i < N; i++)
{
if(array[i] != target)
{
printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
exit(1);
}
}
printf("SUCCESS! All values added correctly.\n");
}
int main()
{
const int N = 2<<20;
size_t size = N * sizeof(float);
float *a;
float *b;
float *c;
checkCuda( cudaMallocManaged(&a, size) );
checkCuda( cudaMallocManaged(&b, size) );
checkCuda( cudaMallocManaged(&c, size) );
initWith(3, a, N);
initWith(4, b, N);
initWith(0, c, N);
size_t threadsPerBlock;
size_t numberOfBlocks;
threadsPerBlock = 256;
numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
addVectorsInto<<>>(c, a, b, N);
checkCuda( cudaGetLastError() );
checkCuda( cudaDeviceSynchronize() );
checkElementsAre(7, c, N);
checkCuda( cudaFree(a) );
checkCuda( cudaFree(b) );
checkCuda( cudaFree(c) );
}
包含
查询设备信息
异步内存预取
cudaMemPrefetchAsync(pointerToSomeUMData, size, deviceId);
cudaMemPrefetchAsync(pointerToSomeUMData, size, cudaCpuDeviceId);
将内存预取回CPU
#include
void initWith(float num, float *a, int N)
{
for(int i = 0; i < N; ++i)
{
a[i] = num;
}
}
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
result[i] = a[i] + b[i];
}
}
void checkElementsAre(float target, float *vector, int N)
{
for(int i = 0; i < N; i++)
{
if(vector[i] != target)
{
printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
exit(1);
}
}
printf("Success! All values calculated correctly.\n");
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);
const int N = 2<<24;
size_t size = N * sizeof(float);
float *a;
float *b;
float *c;
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
/*
* Prefetching can also be used to prevent CPU page faults.
*/
cudaMemPrefetchAsync(a, size, cudaCpuDeviceId);
cudaMemPrefetchAsync(b, size, cudaCpuDeviceId);
cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
initWith(3, a, N);
initWith(4, b, N);
initWith(0, c, N);
cudaMemPrefetchAsync(a, size, deviceId);
cudaMemPrefetchAsync(b, size, deviceId);
cudaMemPrefetchAsync(c, size, deviceId);
size_t threadsPerBlock;
size_t numberOfBlocks;
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;
cudaError_t addVectorsErr;
cudaError_t asyncErr;
addVectorsInto<<>>(c, a, b, N);
addVectorsErr = cudaGetLastError();
if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
asyncErr = cudaDeviceSynchronize();
if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
/*
* Prefetching can also be used to prevent CPU page faults.
*/
cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
checkElementsAre(7, c, N);
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
包含CUDA并发流
cudaStream_t stream; // CUDA流的类型为 cudaStream_t
cudaStreamCreate(&stream); // 注意,必须将一个指针传递给 cudaCreateStream
someKernel<<stream
作为第4个EC参数传递
cudaStreamDestroy(stream); // 注意,将值(而不是指针)传递给 cudaDestroyStream
流用于并行进行数据初始化的核函数
#include
__global__
void initWith(float num, float *a, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
a[i] = num;
}
}
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
result[i] = a[i] + b[i];
}
}
void checkElementsAre(float target, float *vector, int N)
{
for(int i = 0; i < N; i++)
{
if(vector[i] != target)
{
printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
exit(1);
}
}
printf("Success! All values calculated correctly.\n");
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
const int N = 2<<24;
size_t size = N * sizeof(float);
float *a;
float *b;
float *c;
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);
cudaMemPrefetchAsync(a, size, deviceId);
cudaMemPrefetchAsync(b, size, deviceId);
cudaMemPrefetchAsync(c, size, deviceId);
size_t threadsPerBlock;
size_t numberOfBlocks;
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;
cudaError_t addVectorsErr;
cudaError_t asyncErr;
/*
* Create 3 streams to run initialize the 3 data vectors in parallel.
*/
cudaStream_t stream1, stream2, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
/*
* Give each `initWith` launch its own non-standard stream.
*/
initWith<<>>(3, a, N);
initWith<<>>(4, b, N);
initWith<<>>(0, c, N);
addVectorsInto<<>>(c, a, b, N);
addVectorsErr = cudaGetLastError();
if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
asyncErr = cudaDeviceSynchronize();
if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
checkElementsAre(7, c, N);
/*
* Destroy streams when they are no longer needed.
*/
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
手动内存管理CUDA API 调用的代码。
手动分配主机和设备内存
使用流实现数据传输和代码的重叠执行
核函数和内存复制回主机重叠执行
int *host_a, *device_a; // Define host-specific and device-specific arrays.
cudaMalloc(&device_a, size); // device_a
is immediately available on the GPU.
cudaMallocHost(&host_a, size); // host_a
is immediately available on CPU, and is page-locked, or pinned.
initializeOnHost(host_a, N); // No CPU page faulting since memory is already allocated on the host.
// cudaMemcpy
takes the destination, source, size, and a CUDA-provided variable for the direction of the copy.
cudaMemcpy(device_a, host_a, size, cudaMemcpyHostToDevice);
kernel<<
// cudaMemcpy
can also copy data from device to host.
cudaMemcpy(host_a, device_a, size, cudaMemcpyDeviceToHost);
verifyOnHost(host_a, N);
cudaFree(device_a);
cudaFreeHost(host_a); // Free pinned memory like this.
#include
__global__
void initWith(float num, float *a, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
a[i] = num;
}
}
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i = index; i < N; i += stride)
{
result[i] = a[i] + b[i];
}
}
void checkElementsAre(float target, float *vector, int N)
{
for(int i = 0; i < N; i++)
{
if(vector[i] != target)
{
printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
exit(1);
}
}
printf("Success! All values calculated correctly.\n");
}
int main()
{
int deviceId;
int numberOfSMs;
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
const int N = 2<<24;
size_t size = N * sizeof(float);
float *a;
float *b;
float *c;
float *h_c;
cudaMalloc(&a, size);
cudaMalloc(&b, size);
cudaMalloc(&c, size);
cudaMallocHost(&h_c, size);
size_t threadsPerBlock;
size_t numberOfBlocks;
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;
cudaError_t addVectorsErr;
cudaError_t asyncErr;
/*
* Create 3 streams to run initialize the 3 data vectors in parallel.
*/
cudaStream_t stream1, stream2, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
/*
* Give each `initWith` launch its own non-standard stream.
*/
initWith<<>>(3, a, N);
initWith<<>>(4, b, N);
initWith<<>>(0, c, N);
for (int i = 0; i < 4; ++i)
{
cudaStream_t stream;
cudaStreamCreate(&stream);
addVectorsInto<<>>(&c[i*N/4], &a[i*N/4], &b[i*N/4], N/4);
cudaMemcpyAsync(&h_c[i*N/4], &c[i*N/4], size/4, cudaMemcpyDeviceToHost, stream);
cudaStreamDestroy(stream);
}
addVectorsErr = cudaGetLastError();
if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));
asyncErr = cudaDeviceSynchronize();
if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
checkElementsAre(7, h_c, N);
/*
* Destroy streams when they are no longer needed.
*/
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);
cudaFree(a);
cudaFree(b);
cudaFree(c);
cudaFreeHost(h_c);
}
练习作业
https://yangwc.com/2019/06/20/NbodySimulation/