问题很简单:当我们在kernel中声明一个局部数组时,这个数组是放在寄存器(register)中,还是放在local memory中呢?下面通过几个测试来看一下:
首先是一些定义:
// Not referenced by the kernels shown here; presumably the number of threads
// per block used by the host-side launch code (not shown) -- TODO confirm.
#define BLOCK_SIZE 32
// Not referenced by the kernels shown here; presumably the number of blocks
// used by the host-side launch code (not shown) -- TODO confirm.
#define GRID_SIZE 1
// Base element count of the local array a[] declared in each kernel
// (kernel3 declares ARRAY_SIZE + 5 elements).
#define ARRAY_SIZE 32
// kernel1: local array accessed only through the counter of a fixed-trip loop
// ("static indexing").
//
// Each thread copies buf[tid] into a[0..4], sums those five elements and
// writes the sum back to buf[tid].
//
// buf: read and written at index tid with no bounds check, so it must hold at
//      least one float per launched thread.
//
// In the nvprof run on a Tesla K20c shown alongside this code, this kernel
// reports zero local loads and zero local stores.
__global__ void kernel1(float *buf) {
// Only the first 5 of the ARRAY_SIZE elements are ever written or read.
float a[ARRAY_SIZE];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll
for (int i = 0; i < 5; ++i) {
a[i] = buf[tid];
}
float sum = 0.f;
#pragma unroll
for (int i = 0; i < 5; ++i) {
// Static indexing: the index is just the counter of a 5-iteration loop
// marked with #pragma unroll.
sum += a[i];
}
buf[tid] = sum;
}
// kernel2: local array read at an offset that is only known at run time but is
// the same for every thread ("dynamic indexing with uniform access").
//
// Each thread copies buf[tid] into a[0..4], then sums
// a[idx_begin .. idx_begin + 4] and writes the sum back to buf[tid].
//
// buf:       read and written at index tid with no bounds check, so it must
//            hold at least one float per launched thread.
// idx_begin: offset added to the loop counter when reading a[].
//
// NOTE(review): only a[0..4] are written before the read loop, so any
// idx_begin other than 0 reads elements that were never assigned. idx_begin is
// also not range-checked: it must satisfy 0 <= idx_begin and
// idx_begin + 4 < ARRAY_SIZE for the reads to stay inside a[].
__global__ void kernel2(float *buf, int idx_begin) {
float a[ARRAY_SIZE];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll
for (int i = 0; i < 5; ++i) {
a[i] = buf[tid];
}
float sum = 0.f;
#pragma unroll
for (int i = 0; i < 5; ++i) {
// Dynamic indexing with uniform access: the index depends on the run-time
// argument idx_begin, which is identical for all threads.
// Per the author's measurement (nvprof, Tesla K20c), a[] is placed in local
// memory; each access results in one local load instruction (LDL) per warp,
// with 0 replays.
sum += a[i+idx_begin];
}
buf[tid] = sum;
}
// kernel3: local array read at an offset that differs from thread to thread
// ("dynamic indexing with non-uniform access").
//
// Each thread copies buf[tid] into a[0..4], then sums
// a[threadIdx.x .. threadIdx.x + 4] and writes the sum back to buf[tid].
//
// buf:    read and written at index tid with no bounds check, so it must hold
//         at least one float per launched thread.
// idxBuf: NOTE(review): never used in this body.
//
// NOTE(review): only a[0..4] are written before the read loop, so every thread
// with threadIdx.x > 0 reads elements that were never assigned.
__global__ void kernel3(float *buf, int *idxBuf) {
// ARRAY_SIZE + 5 elements: the largest index read below is threadIdx.x + 4,
// which stays inside a[] as long as threadIdx.x <= ARRAY_SIZE.
float a[ARRAY_SIZE+5];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
#pragma unroll
for (int i = 0; i < 5; ++i) {
a[i] = buf[tid];
}
float sum = 0.f;
#pragma unroll
for (int i = 0; i < 5; ++i) {
// Dynamic indexing with non-uniform access: the index depends on
// threadIdx.x, so every thread of a warp reads a different element.
// Per the author's measurement (nvprof, Tesla K20c), a[] is placed in local
// memory; each access results in one local load instruction (LDL) per warp,
// which is replayed 31 times.
sum += a[i+threadIdx.x];
}
buf[tid] = sum;
}
在这个kernel中,同样使用了动态索引,因此数组a是在local memory中分配的,我们可以通过SASS代码来确认。需要说明的是,这里同一个warp内32个线程的索引各不相同,从local memory读取时,各线程访问的数组元素不在同一个cache line中,因此每条读指令(LDL)都会replay 31次;5条读指令一共replay 5×31=155次,与下面nvprof的结果一致。
wenx@LINUX-14-04:~/work/$ nvprof --events local_load,local_store,__local_ld_mem_divergence_replays,__local_st_mem_divergence_replays ./a.out
==7893== NVPROF is profiling process 7893, command: ./a.out
==7893== Profiling application: ./a.out
==7893== Profiling result:
==7893== Event result:
Invocations Event Name Min Max Avg
Device "Tesla K20c (0)"
Kernel: kernel3(float*, int*)
1 local_load 5 5 5
1 local_store 5 5 5
1 __local_ld_mem_divergence_replays 155 155 155
1 __local_st_mem_divergence_replays 0 0 0
Kernel: kernel2(float*, int)
1 local_load 5 5 5
1 local_store 5 5 5
1 __local_ld_mem_divergence_replays 0 0 0
1 __local_st_mem_divergence_replays 0 0 0
Kernel: kernel1(float*)
1 local_load 0 0 0
1 local_store 0 0 0
1 __local_ld_mem_divergence_replays 0 0 0
1 __local_st_mem_divergence_replays 0 0 0