#include <stdio.h>
#include <cuda_runtime.h>
__device__ float devData; //- statically allocated device variable (visible to all device code; host code may not dereference it directly)
__global__ void checkGlobalVariable()
{
printf("threadIdx.x=%d devData = %0.2f\n", threadIdx.x, devData);
devData += 2.0f;
}
int main(int argc, char **argv)
{
float init_value = 3.14f;
//- Note that the symbol devData itself is passed, not its address; no copy direction is needed because the function name already implies it
cudaMemcpyToSymbol(devData, &init_value, sizeof(float));
dim3 block(3);
dim3 grid(2);
checkGlobalVariable<<<grid, block>>>();
cudaDeviceSynchronize();
//- Again the symbol devData is passed, not its address; the copy direction is implied by the function name
cudaMemcpyFromSymbol(&init_value, devData, sizeof(float));
printf("devData=%0.2f\n", init_value);
return 0;
}
[mmhe@k231 chapter4]$ ./test
threadIdx.x=0 devData = 3.14
threadIdx.x=1 devData = 3.14
threadIdx.x=2 devData = 3.14
threadIdx.x=0 devData = 3.14
threadIdx.x=1 devData = 3.14
threadIdx.x=2 devData = 3.14
devData=5.14
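As a side note on the same example, the device address of a __device__ symbol can also be obtained explicitly and then used with plain cudaMemcpy. A minimal sketch of my own (the devData declaration matches the example above; this variant is not taken from the original text):
#include <stdio.h>
#include <cuda_runtime.h>
__device__ float devData;   //- same declaration as in the example above
int main(void)
{
    float value = 3.14f, back = 0.0f;
    float *d_ptr = NULL;
    cudaGetSymbolAddress((void **)&d_ptr, devData); //- fetch the device-side address of the symbol
    cudaMemcpy(d_ptr, &value, sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(&back, d_ptr, sizeof(float), cudaMemcpyDeviceToHost);
    printf("devData = %0.2f\n", back);
    return 0;
}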
In the CUDA framework, host memory comes in two kinds: (1) pageable memory, allocated with malloc and released with free; (2) pinned memory, allocated with cudaMallocHost and released with cudaFreeHost. Pinned memory consumes more physical memory, but host-device transfers involving it are faster.
#include <stdlib.h>
#include <cuda_runtime.h>
int main(int argc, char **argv)
{
int *pin_arr, *page_arr;
int nElem = 1<<30;
cudaMallocHost((void **)&pin_arr, nElem * sizeof(int));
page_arr = (int *)malloc(nElem * sizeof(int));
int *d_arr;
cudaMalloc((void **)&d_arr, nElem * sizeof(int));
cudaMemcpy(d_arr, page_arr, nElem * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(page_arr, d_arr, nElem * sizeof(int), cudaMemcpyDeviceToHost);
// cudaMemcpy(d_arr, pin_arr, nElem * sizeof(int), cudaMemcpyHostToDevice);
// cudaMemcpy(pin_arr, d_arr, nElem * sizeof(int), cudaMemcpyDeviceToHost);
free(page_arr);
cudaFreeHost(pin_arr);
cudaDeviceReset();
return 0;
}
==45339== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 84.54% 3.21079s 1 3.21079s 3.21079s 3.21079s [CUDA memcpy DtoH]
15.46% 587.34ms 1 587.34ms 587.34ms 587.34ms [CUDA memcpy HtoD]
==48677== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 50.19% 574.67ms 1 574.67ms 574.67ms 574.67ms [CUDA memcpy DtoH]
49.81% 570.23ms 1 570.23ms 570.23ms 570.23ms [CUDA memcpy HtoD]
The upper half was obtained with pageable memory, the lower half with pinned memory. For the device-to-host transfer, pinned memory is far faster than pageable memory, while for the host-to-device transfer the two perform roughly the same.
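For completeness, the same transfers can also be timed inside the program with CUDA events instead of nvprof. The following is a minimal sketch of my own (buffer size and variable names are illustrative, not from the original):
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
    size_t nBytes = (size_t)(1 << 26) * sizeof(int);   //- illustrative size, smaller than in the example above
    int *h_pin, *d_arr;
    cudaMallocHost((void **)&h_pin, nBytes);           //- pinned host buffer
    cudaMalloc((void **)&d_arr, nBytes);
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaMemcpy(d_arr, h_pin, nBytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);            //- elapsed time in milliseconds
    printf("HtoD: %.3f ms, %.2f GB/s\n", ms, nBytes / (ms * 1e-3) / (1024.0 * 1024.0 * 1024.0));
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_arr);
    cudaFreeHost(h_pin);
    cudaDeviceReset();
    return 0;
}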
Zero-copy memory is a region that both the host and the device can access directly; it mainly serves as a fallback when device memory runs short. Because every device access to it goes over the PCIe bus, reading and writing this region frequently degrades performance noticeably.
Taking vector addition as an example, compare the kernel's efficiency when A and B reside in global memory (GMEM) versus in zero-copy memory.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
__host__ void init(int *arr, int nElem)
{
for (int i = 0; i < nElem; i++)
{
arr[i] = i;
}
}
__global__ void sumArr(int *arrA, int *arrB, int *arrC, int nElem)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nElem)
{
arrC[index] = arrA[index] + arrB[index];
}
}
__host__ void check(int *arrA, int *arrB, int *arrC, int nElem)
{
for (int i = 0; i < nElem; i++)
{
if (arrC[i] != (arrA[i] + arrB[i]))
{
printf("error at i = %d\n", i);
return;
}
}
printf("result is correct\n");
}
int main(int argc, char **argv)
{
int nElem = 1<<24;
int *h_arrA, *h_arrB, *h_arrC, *d_arrA_G, *d_arrB_G, *d_arrC_G, *d_arrA_U, *d_arrB_U, *d_arrC_U;
h_arrA = (int *)malloc(nElem * sizeof(int));
h_arrB = (int *)malloc(nElem * sizeof(int));
h_arrC = (int *)malloc(nElem * sizeof(int));
cudaMalloc((void **)&d_arrA_G, nElem * sizeof(int));
cudaMalloc((void **)&d_arrB_G, nElem * sizeof(int));
cudaMalloc((void **)&d_arrC_G, nElem * sizeof(int));
cudaHostAlloc((void **)&d_arrA_U, nElem * sizeof(int), cudaHostAllocMapped);
cudaHostAlloc((void **)&d_arrB_U, nElem * sizeof(int), cudaHostAllocMapped);
cudaHostAlloc((void **)&d_arrC_U, nElem * sizeof(int), cudaHostAllocMapped);
init(h_arrA, nElem);
init(h_arrB, nElem);
init(d_arrA_U, nElem);
init(d_arrB_U, nElem);
cudaMemcpy(d_arrA_G, h_arrA, nElem * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_arrB_G, h_arrB, nElem * sizeof(int), cudaMemcpyHostToDevice);
dim3 block(512);
dim3 grid((nElem + block.x - 1) / block.x);
//- A and B in global memory, C in global memory
sumArr<<<grid, block>>>(d_arrA_G, d_arrB_G, d_arrC_G, nElem);
cudaMemcpy(h_arrC, d_arrC_G, nElem * sizeof(int), cudaMemcpyDeviceToHost);
check(h_arrA, h_arrB, h_arrC, nElem);
// //- A and B in zero-copy memory, C in global memory
// sumArr<<<grid, block>>>(d_arrA_U, d_arrB_U, d_arrC_G, nElem);
// cudaMemcpy(h_arrC, d_arrC_G, nElem * sizeof(int), cudaMemcpyDeviceToHost);
// check(h_arrA, h_arrB, h_arrC, nElem);
// //- A and B in zero-copy memory, C in zero-copy memory as well
// sumArr<<<grid, block>>>(d_arrA_U, d_arrB_U, d_arrC_U, nElem);
// cudaDeviceSynchronize(); //- Important: since the host reads d_arrC_U directly and no cudaMemcpy is issued, an explicit synchronization is required here.
// check(d_arrA_U, d_arrB_U, d_arrC_U, nElem);
free(h_arrA);
free(h_arrB);
free(h_arrC);
cudaFree(d_arrA_G);
cudaFree(d_arrB_G);
cudaFree(d_arrC_G);
cudaFreeHost(d_arrA_U);
cudaFreeHost(d_arrB_U);
cudaFreeHost(d_arrC_U);
cudaDeviceReset();
return 0;
}
#- A and B in global memory, C in global memory
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 60.39% 42.203ms 1 42.203ms 42.203ms 42.203ms [CUDA memcpy DtoH]
27.05% 18.908ms 2 9.4539ms 9.3450ms 9.5628ms [CUDA memcpy HtoD]
12.56% 8.7768ms 1 8.7768ms 8.7768ms 8.7768ms sumArr(int*, int*, int*, int)
#- A and B in zero-copy memory, C in global memory
GPU activities: 54.52% 39.520ms 1 39.520ms 39.520ms 39.520ms [CUDA memcpy DtoH]
29.03% 21.047ms 2 10.523ms 10.407ms 10.639ms [CUDA memcpy HtoD]
16.45% 11.926ms 1 11.926ms 11.926ms 11.926ms sumArr(int*, int*, int*, int)
#- A and B in zero-copy memory, C in zero-copy memory as well
GPU activities: 66.12% 49.425ms 2 24.713ms 21.864ms 27.561ms [CUDA memcpy HtoD]
33.88% 25.330ms 1 25.330ms 25.330ms 25.330ms sumArr(int*, int*, int*, int)
The impact on performance is clearly significant. Zero-copy memory is not suited to large data sets; as the book notes, the slowdown becomes more pronounced as the amount of data grows.
In the zero-copy example above we passed the zero-copy pointer straight to the kernel and let device code dereference it. Hidden behind this is a mechanism called Unified Virtual Addressing (UVA). On older CUDA versions that lack this feature, the same pointer cannot be dereferenced in both host and device code; instead, the device-side address of the zero-copy buffer must first be obtained through an API call and then passed to the kernel, for example:
int *d_arrA_U_ref, *d_arrB_U_ref, *d_arrC_U_ref;
cudaHostGetDevicePointer((void **)&d_arrA_U_ref, d_arrA_U, 0);
cudaHostGetDevicePointer((void **)&d_arrB_U_ref, d_arrB_U, 0);
cudaHostGetDevicePointer((void **)&d_arrC_U_ref, d_arrC_U, 0);
sumArr<<<grid, block>>>(d_arrA_U_ref, d_arrB_U_ref, d_arrC_U_ref, nElem);
The two approaches perform identically; passing the pointer directly under UVA simply makes the code more readable and maintainable.
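Whether a device supports mapped (zero-copy) memory and UVA can be queried from its properties. A minimal sketch of my own, assuming device 0:
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("canMapHostMemory  = %d\n", prop.canMapHostMemory);   //- device can map page-locked host memory (zero-copy)
    printf("unifiedAddressing = %d\n", prop.unifiedAddressing);  //- device shares one virtual address space with the host (UVA)
    return 0;
}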
A corollary worth spelling out, taking L1-cached memory-load transactions with a 128-byte granularity as the example: starting from the array's base address, memory is divided into consecutive 128-byte segments. If the addresses requested by the 32 threads of a warp all fall within one such segment, a single 128-byte load transaction is enough; if they spread across two segments, two transactions are required. The key point is that the segments are counted from the array's base address.
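A concrete instance of this rule with my own numbers (4-byte ints, the array base taken as the segment origin):
$$\text{offset}=0:\ \text{indices } 0\text{–}31 \rightarrow \text{bytes } [0,127] \Rightarrow 1 \text{ transaction};\qquad \text{offset}=1:\ \text{indices } 1\text{–}32 \rightarrow \text{bytes } [4,131] \Rightarrow 2 \text{ transactions}$$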
#include <stdlib.h>
#include <cuda_runtime.h>
__global__ void sumArrOffSet(int *arrA, int *arrB, int *arrC, int nElem, int offset)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int k = index + offset;
if (index < nElem)
{
arrC[index] = arrA[k] + arrB[k];
}
}
int main(int argc, char **argv)
{
int nElem = 1<<14;
int *d_arrA, *d_arrB, *d_arrC;
cudaMalloc((void **)&d_arrA, nElem * sizeof(int));
cudaMalloc((void **)&d_arrB, nElem * sizeof(int));
cudaMalloc((void **)&d_arrC, nElem * sizeof(int));
dim3 block(512);
dim3 grid((nElem + block.x - 1) / block.x);
sumArrOffSet<<<grid, block>>>(d_arrA, d_arrB, d_arrC, nElem, atoi(argv[1]));
cudaDeviceSynchronize();
cudaFree(d_arrA);
cudaFree(d_arrB);
cudaFree(d_arrC);
cudaDeviceReset();
return 0;
}
[mmhe@k231 chapter4]$ nvprof --metrics gld_transactions,gst_transactions ./test 0
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: sumArrOffSet(int*, int*, int*, int, int)
1 gld_transactions Global Load Transactions 1024 1024 1024
1 gst_transactions Global Store Transactions 512 512 512
[mmhe@k231 chapter4]$ nvprof --metrics gld_transactions,gst_transactions ./test 1
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: sumArrOffSet(int*, int*, int*, int, int)
1 gld_transactions Global Load Transactions 2048 2048 2048
1 gst_transactions Global Store Transactions 512 512 512
[mmhe@k231 chapter4]$ nvprof --metrics gld_transactions,gst_transactions ./test 16
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: sumArrOffSet(int*, int*, int*, int, int)
1 gld_transactions Global Load Transactions 2048 2048 2048
1 gst_transactions Global Store Transactions 512 512 512
[mmhe@k231 chapter4]$ nvprof --metrics gld_transactions,gst_transactions ./test 32
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: sumArrOffSet(int*, int*, int*, int, int)
1 gld_transactions Global Load Transactions 1024 1024 1024
1 gst_transactions Global Store Transactions 512 512 512
The vector processed here has 16384 elements with blockDim.x = 512, so there are 32 blocks of 16 warps each, i.e. 512 warps in total; the load transaction counts above follow from this, as worked out below.
Stores, on the other hand, can be issued at several granularities, which lets them achieve higher utilization; that is why the store transaction count stays at 512 for every offset.
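The arithmetic (my own accounting, consistent with the counters reported above):
$$\frac{16384}{32} = 512 \text{ warps};\qquad \text{aligned (offset 0 or 32)}: 512 \times 2 \text{ arrays} = 1024 \text{ load transactions}$$
$$\text{misaligned (offset 1 or 16)}: 512 \times 2 \times 2 = 2048 \text{ load transactions};\qquad \text{stores to arrC}: 512$$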
#include <stdio.h>
#include <cuda_runtime.h>
struct point
{
int x;
int y;
};
__global__ void kernel(point *arr, int nElem)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nElem)
{
arr[index].x = arr[index].x + 2;
arr[index].y = arr[index].y + 2;
}
}
int main(int argc, char **argv)
{
int nElem = 1<<14;
struct point *d_arr;
cudaMalloc((void **)&d_arr, nElem * sizeof(point));
dim3 block(512);
dim3 grid((nElem + block.x - 1)/block.x);
kernel<<<grid, block>>>(d_arr, nElem);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
The array has 16384 elements and the block size is 512, so there are 32 blocks of 16 warps each.
When the struct object is the basic element of the array, the members x and y sit next to each other in memory, i.e. [x, y, x, y, …, x, y]. A warp's accesses to x therefore spread over two 128-byte segments counted from the array's base address, so loading x needs 2 transactions; loading x and y over the whole kernel then takes 2048 load transactions in total, at only 50% efficiency (see the worked count below).
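The count behind those figures (my own arithmetic, under the 128-byte-segment model above):
$$\text{sizeof(point)} = 8\,\mathrm{B};\quad 32 \times 8\,\mathrm{B} = 256\,\mathrm{B} = 2 \text{ segments per member per warp};\quad \text{efficiency} = \frac{128\,\mathrm{B}}{256\,\mathrm{B}} = 50\%$$
$$512 \text{ warps} \times 2 \text{ members} \times 2 \text{ transactions} = 2048 \text{ load transactions}$$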
#include <stdlib.h>
#include <cuda_runtime.h>
struct point
{
int *x;
int *y;
};
__global__ void init(point *A, int nElem)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nElem)
{
A->x[index] = index;
A->y[index] = index;
}
}
__global__ void kernel(point *A, int nElem)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nElem)
{
A->x[index] += 2;
A->y[index] += 2;
}
}
int main(int argc, char **argv)
{
int nElem = 32;
point *h_arr;
point *d_arr;
h_arr = (point *)malloc(sizeof(point));
cudaMalloc((void **)&(h_arr->x), nElem * sizeof(int));
cudaMalloc((void **)&(h_arr->y), nElem * sizeof(int));
cudaMalloc((void **)&d_arr, sizeof(point));
cudaMemcpy(d_arr, h_arr, sizeof(point), cudaMemcpyHostToDevice);
dim3 block(32);
dim3 grid((nElem + block.x - 1)/block.x);
init<<<grid, block>>>(d_arr, nElem);
cudaDeviceSynchronize();
kernel<<<grid, block>>>(d_arr, nElem);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}
Invocations Metric Name Metric Description Min Max Avg
Device "Tesla K80 (0)"
Kernel: init(point*, int)
1 gld_transactions Global Load Transactions 2 2 2
1 gst_transactions Global Store Transactions 2 2 2
1 gld_efficiency Global Memory Load Efficiency 25.00% 25.00% 25.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
Kernel: kernel(point*, int)
1 gld_transactions Global Load Transactions 4 4 4
1 gst_transactions Global Store Transactions 2 2 2
1 gld_efficiency Global Memory Load Efficiency 85.00% 85.00% 85.00%
1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00%
This result is quite interesting, so let us account for it step by step (with 32 elements there is only a single warp overall, which keeps the analysis simple).
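One accounting of my own that reproduces these counters (a reconstruction, not taken from the original text): in init each thread first reads the two 8-byte pointers A->x and A->y. Both reads are uniform across the warp, so each is served by one transaction that moves 32 bytes of which only 8 are useful, giving 2 load transactions at 8/32 = 25% efficiency; the two 128-byte stores into x[] and y[] are fully coalesced, giving 2 store transactions at 100%. In kernel the same two pointer loads are joined by two fully coalesced 128-byte data loads, hence 4 load transactions with
$$\text{gld\_efficiency} = \frac{8 + 8 + 128 + 128}{32 + 32 + 128 + 128} = \frac{272}{320} = 85\%$$
and again 2 fully coalesced stores at 100%.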
A K80 board integrates two GPUs. Each GPU has a 384-bit memory bus and a 2505 MHz memory clock with double data rate; at 8 bits per byte, the board's peak memory bandwidth is:
$$\frac{2 \times 384\,\mathrm{bit} \times 2.505\,\mathrm{GHz} \times 2}{8\,\mathrm{bit/byte}} = 480.96\,\mathrm{GB/s}$$
$$\text{effective bandwidth (GB/s)} = \frac{\text{bytes read} + \text{bytes written}}{\text{time} \times 1024 \times 1024 \times 1024}$$
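Expressed as a small helper of my own (the full program below computes the same quantity inline in its printf calls):
//- Effective bandwidth in GB/s from the number of bytes moved and the elapsed time in seconds.
static double effective_bandwidth_gbs(double bytesRead, double bytesWritten, double seconds)
{
    return (bytesRead + bytesWritten) / (seconds * 1024.0 * 1024.0 * 1024.0);
}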
Whether global loads go through the L1 cache changes the load granularity: with L1 caching enabled a load is serviced in 128-byte lines, with it disabled in 32-byte segments.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include"../../CodeSamples/common/common.h"
__global__ void copyRow(int *arrA, int *arrB, int row, int col)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < col && y < row)
{
arrB[y * col + x] = arrA[y * col + x];
}
}
__global__ void copyCol(int *arrA, int *arrB, int row, int col)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < col && y < row)
{
arrB[x * col + y] = arrA[x * col + y];
}
}
__global__ void NaiveRow(int *arrA, int *arrB, int row, int col)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < col && y < row)
{
arrB[x * col + y] = arrA[y * col + x];
}
}
__global__ void NaiveCol(int *arrA, int *arrB, int row, int col)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < col && y < row)
{
arrB[y * col + x] = arrA[x * col + y];
}
}
int main(int argc, char **argv)
{
cudaSetDevice(5);
int row = 1<<14;
int col = 1<<14;
int nElem = row * col;
int *d_arrA, *d_arrB;
cudaMalloc((void **)&d_arrA, nElem * sizeof(int));
cudaMalloc((void **)&d_arrB, nElem * sizeof(int));
dim3 block(atoi(argv[1]), atoi(argv[2]));
dim3 grid((col + block.x - 1) / block.x, (row + block.y - 1) / block.y); //- grid.x covers the columns, grid.y covers the rows
printf("gridDim:(%d,%d,%d) blockDim:(%d,%d,%d)\n", grid.x, grid.y, grid.z, block.x, block.y, block.z);
double iStart = seconds();
copyRow<<<grid, block>>>(d_arrA, d_arrB, row, col);
cudaDeviceSynchronize();
double iElaps = seconds() - iStart;
printf("copyRow time=%fs, bandwidth = %fGB/s\n", iElaps, 2.0*row*col*sizeof(int)/iElaps/1024/1024/1024);
iStart = seconds();
copyCol<<<grid, block>>>(d_arrA, d_arrB, row, col);
cudaDeviceSynchronize();
iElaps = seconds() - iStart;
printf("copyCol time=%fs, bandwidth = %fGB/s\n", iElaps, 2.0*row*col*sizeof(int)/iElaps/1024/1024/1024);
iStart = seconds();
NaiveRow<<<grid, block>>>(d_arrA, d_arrB, row, col);
cudaDeviceSynchronize();
iElaps = seconds() - iStart;
printf("NaiveRow time=%fs, bandwidth = %fGB/s\n", iElaps, 2.0*row*col*sizeof(int)/iElaps/1024/1024/1024);
iStart = seconds();
NaiveCol<<<grid, block>>>(d_arrA, d_arrB, row, col);
cudaDeviceSynchronize();
iElaps = seconds() - iStart;
printf("NaiveCol time=%fs, bandwidth = %fGB/s\n", iElaps, 2.0*row*col*sizeof(int)/iElaps/1024/1024/1024);
cudaFree(d_arrA);
cudaFree(d_arrB);
return 0;
}
L1 cache enabled
Kernel   BandWidth      Notes                           Load throughput  Store throughput  Load efficiency  Store efficiency
copyRow  41.340298GB/s  Upper bound                     23.154GB/s       23.154GB/s        100.00%          100.00%
copyCol  21.524815GB/s  Lower bound                     308.22GB/s       77.056GB/s        3.12%            12.50%
NaiveRow 30.102732GB/s  coalesced read / strided write  15.946GB/s       127.57GB/s        100.00%          12.50%
NaiveCol 22.602213GB/s  strided read / coalesced write  379.29GB/s       11.853GB/s        3.12%            100.00%
L1 cache disabled
Kernel   BandWidth      Notes                           Load throughput  Store throughput  Load efficiency  Store efficiency
copyRow  41.362517GB/s  Upper bound                     23.460GB/s       23.460GB/s        100.00%          100.00%
copyCol  13.826092GB/s  Lower bound                     53.674GB/s       53.674GB/s        12.50%           12.50%
NaiveRow 29.522903GB/s  coalesced read / strided write  15.419GB/s       123.35GB/s        100.00%          12.50%
NaiveCol 17.568648GB/s  strided read / coalesced write  72.086GB/s       9.0108GB/s        12.50%           100.00%