CUDA Learning Notes (6): Shared Memory and Global Memory

  Shared memory is a fast on-chip memory space inside each streaming multiprocessor (SM). Its main purpose is to hold data that all threads of a thread block (Block) access frequently. For a streaming processor (SP), only registers are faster to access, and shared memory is far faster than global memory. It is also a scarce resource, however: there are typically only a few tens of KBytes per SM.

  Taking the "Quadro K620" in my machine as an example:

Hardware resource                        Value
Streaming processors (SP)                128 * 3 = 384
Streaming multiprocessors (SM/SMM)       3
Global memory                            2048 MByte
Shared memory per SM                     64 KByte
Registers per SM                         65536 32-bit registers


  Global memory is several orders of magnitude larger than shared memory, while shared memory is correspondingly much faster to access. Looking at the hardware:

[Figure 1: SMs with on-chip shared memory, connected to off-chip DRAM]

  Each SM has its own shared memory, only 64 KByte in size. All of the SMs sit on the same chip, and that chip is connected through the PCB to the memory chips (DRAM) that hold global memory.

  • When an SP accesses data in shared memory, the access stays on-chip and the data is available almost immediately.
  • When an SP accesses the DRAM chips, the request has to go through the memory controller and a series of further steps before the data arrives.

  To actually see the speed advantage of shared memory, we need to place the frequently accessed data directly into shared memory, so that the program avoids hammering global memory and wasting time on data transfers and accesses.
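  As a minimal sketch of this staging pattern (the kernel name, TILE size and buffers below are illustrative only, not part of the histogram code later in this note): each block copies a tile of global data into shared memory once, synchronizes, and then serves all further reads from the on-chip copy.

// Minimal sketch of the shared-memory staging pattern (illustrative names only).
// Assumes the kernel is launched with blockDim.x == TILE.
#define TILE 256

extern "C" __global__ void stage_and_reuse(const float *global_in, float *global_out)
{
    __shared__ float tile[TILE];

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread copies one element from global memory into shared memory.
    tile[threadIdx.x] = global_in[i];
    // Wait until the whole tile is visible to every thread of the block.
    __syncthreads();

    // From here on, repeated reads hit the fast on-chip shared memory
    // instead of going back out to global memory (DRAM).
    float sum = 0.0f;
    for (int k = 0; k < TILE; ++k)
        sum += tile[k];

    global_out[i] = sum;
}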

  As an example, suppose we need to count how many times each value occurs in an ultra-high-resolution image or in a 20 MByte one-dimensional array, i.e. build a histogram. Combined with the atomic operations (atomic functions) from the earlier note, shared memory can at least halve the computation time; the exact gain varies across GPU architectures.

*Note 4: a histogram records the distribution of values in a data set, i.e. how many times each value occurs.
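  For reference, the same statistic on the CPU is just a single pass over the data (plain C++; the function name is illustrative):

// CPU reference: count how many times each byte value (0..255) occurs.
#include <vector>
#include <cstddef>

std::vector<unsigned int> histogram_cpu(const unsigned char *data, size_t n)
{
    std::vector<unsigned int> hist(256, 0);
    for (size_t i = 0; i < n; ++i)
        ++hist[data[i]];   // one increment per input element
    return hist;
}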

  Scheme 1:

  1. Keep the histogram array directly in global memory; each thread reads its portion of the source data from global memory.
  2. Use an atomic operation to increment ("++") the element of the global histogram array that corresponds to the value just read.

  Scheme 2:

  1. Keep the histogram array in global memory, and give each thread block its own histogram array in shared memory.
  2. Each thread of the block reads its portion of the source data from global memory, then uses an atomic operation to increment ("++") the element of the shared histogram array that corresponds to the value just read.
  3. Synchronize the threads of the block to guarantee that every thread has finished its atomic updates.
  4. Add the block's shared histogram array into the global histogram array.

  In this way, Scheme 2 uses shared memory to avoid most of the accesses to global memory, which is where the speedup comes from.

  Kernel implementation of Scheme 1:

// This is kernel function file !
#define hist_MAX_VALUE 256

extern "C" __global__ void kernel_func(unsigned int * thread_index_array, unsigned char *source_data_array, \
    unsigned int * histogram_array, unsigned int *clock_counter_array)
{
    // Compute the global thread index
    unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \
        threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
    // Total number of threads
    unsigned int thread_num = gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z;

    // Record the thread index
    thread_index_array[thread_index] = thread_index;

    unsigned int counter_1, counter_2;

    counter_1 = clock();

    unsigned int value = source_data_array[thread_index];
    atomicAdd(&histogram_array[value], 1);

    counter_2 = clock();

    clock_counter_array[thread_index] = counter_1;
    clock_counter_array[thread_index + thread_num] = counter_2;
}

[Figure 2: program output of Scheme 1]

Result: Scheme 1 takes 11.35 ms to build the histogram of 10 MByte of data.


  Kernel implementation of Scheme 2:

// This is kernel function file !
#define hist_MAX_VALUE 256

extern "C" __global__ void kernel_func(unsigned int * thread_index_array, unsigned char *source_data_array, \
    unsigned int * histogram_array, unsigned int *clock_counter_array)
{
    // Compute the global thread index
    unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \
        threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
    // Total number of threads
    unsigned int thread_num = gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z;

    // Record the thread index
    thread_index_array[thread_index] = thread_index;

    unsigned int counter_1, counter_2;

    counter_1 = clock();

    // Initialize the shared-memory histogram array
    __shared__ unsigned int histogram_shared[hist_MAX_VALUE];
    histogram_shared[thread_index % hist_MAX_VALUE] = 0;
    __syncthreads();

    // Per-block histogram
    unsigned int value = source_data_array[thread_index];
    atomicAdd(&histogram_shared[value], 1);
    __syncthreads();
    // Add this block's shared-memory counts into the global-memory histogram.
    // Guarded so that each of the 256 bins is flushed exactly once per block.
    if (threadIdx.x < hist_MAX_VALUE)
        atomicAdd(&histogram_array[threadIdx.x], histogram_shared[threadIdx.x]);

    counter_2 = clock();

    clock_counter_array[thread_index] = counter_1;
    clock_counter_array[thread_index + thread_num] = counter_2;
}

[Figure 3: program output of Scheme 2]

Result: Scheme 2 takes 7.33 ms to build the histogram of 10 MByte of data.

  Here I use two registers, "counter_1" and "counter_2", to keep the timing accurate: experimentally, an SP needs only about 6 clock cycles (roughly 6 nanoseconds) to access a register, whereas a single thread takes about 3000 clock cycles in total, so the measurement is reasonably trustworthy.
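  The raw clock() samples are only cycle counts; converting them to time means dividing by the core clock rate. A small host-side helper (an assumed name, doing essentially what the commented-out output loop in main_code.cpp does) could look like this:

// Sketch: convert two per-thread clock() samples into milliseconds.
// clock_rate_hz is deviceProps.clockRate * 1000, since cudaDeviceProp::clockRate is reported in kHz.
static float elapsed_ms(unsigned int c1, unsigned int c2, int clock_rate_hz)
{
    // Unsigned subtraction still gives the right cycle count even if the
    // 32-bit counter wrapped around once between the two samples.
    unsigned int cycles = c2 - c1;
    return (float)cycles / ((float)clock_rate_hz / 1000.0f);   // cycles per millisecond
}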

  Judging by these results, the speedup is roughly 1.5x (11.35 ms vs. 7.33 ms); on other GPUs it can reach 2x to 4x or even more.



  As a supplement, here is the structure of an SMM. The SMM is an improved variant of the SM with exactly the same function; the difference is that its streaming processors (SP), registers and related resources are partitioned internally into four blocks, mainly to obtain more bandwidth.

[Figure 4: SMM block diagram]

  I believe the main reason is that my "Quadro K620" already has this hardware improvement: it replaces the traditional SM with the SMM structure, which greatly increases bandwidth, and it also has a 2 MByte L2 cache, which greatly speeds up access to global memory. That is why the shared-memory speedup is lower here than on other, gaming-oriented GPUs. This can be verified in reference 5.





My "main_code.cpp":

// CUDA Shared memory

// C/C++ IO
#include <climits>
#include <cstdio>
#include <iostream>
using namespace std;

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
// NVRTC
#include <nvrtc_helper.h>

// timer
#include <windows.h>
#include <helper_timer.h>


// GPU core clock frequency
int GPU_clock_rate;
// Maximum number of threads per block
int max_thread_per_block;
// Query the GPU hardware properties
void check_Cuda_information(int main_argc, char ** main_argv);
// Print the PTX code
void print_PTX(char * PTX_ptr, int size_of_PTX);


int main(int argc, char **argv)
{
    // Host variables (system memory)
    const int ARRAY_LENGTH = 10 * 1024 * 1024, STATIS_size = UCHAR_MAX + 1;
    unsigned int *thread_index_array;
    unsigned char *source_data_array;
    unsigned int *clock_counter_array, *histogram_array;
    float *cost_time_array;

    // Device variables (GPU memory)
    CUdeviceptr dev_thread_index_array, dev_source_data_array, dev_clock_counter_array, dev_histogram_array;

    // Query the GPU hardware properties
    check_Cuda_information(argc, &argv[0]);

#pragma region(CPU hardware timer - start timing)
    // Time with QueryPerformanceCounter(), in microseconds
    LARGE_INTEGER  large_interger;
    double CPU_counter_frequency;
    __int64
        CPU_c1_start, CPU_c2_compile, CPU_c3_alloc, CPU_c4_caculate, CPU_c5_copyout, CPU_c6_end;

    QueryPerformanceFrequency(&large_interger);
    CPU_counter_frequency = large_interger.QuadPart;
    printf("CPU硬件计数器频率:\t%.2lf Hz\r\n\r\n", CPU_counter_frequency);

    // Take a timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c1_start = large_interger.QuadPart;
#pragma endregion


#pragma region(Read and compile the CUDA kernel)
    // Read, compile and load the CUDA kernel
    char *ptx, *kernel_file;
    size_t ptxSize;

    // Open the kernel's .cu file and compile it to PTX
    kernel_file = sdkFindFilePath("kernel.cu", argv[0]);
    // If kernel_file cannot be found this reports: error: unable to open ./xxxx.cu for reading!
    compileFileToPTX(kernel_file, NULL, NULL, &ptx, &ptxSize, 0);
    // Print the PTX code
    //print_PTX(ptx, (int)ptxSize);


    // Select the GPU device and load the kernel onto it
    CUmodule module = loadPTX(ptx, argc, argv);
    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "kernel_func"));

    // CPU timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c2_compile = large_interger.QuadPart;
#pragma endregion


#pragma region(Allocate and initialize host and device memory)
    thread_index_array = (unsigned int*)malloc(ARRAY_LENGTH * sizeof(unsigned int));
    source_data_array = (unsigned char*)malloc(ARRAY_LENGTH * sizeof(unsigned char));
    histogram_array = (unsigned int*)malloc(STATIS_size * sizeof(unsigned int));
    clock_counter_array = (unsigned int*)malloc(2 * ARRAY_LENGTH * sizeof(unsigned int));
    cost_time_array = (float*)malloc(ARRAY_LENGTH * sizeof(float));

    checkCudaErrors(cuMemAlloc(&dev_thread_index_array, ARRAY_LENGTH * sizeof(unsigned int)));
    checkCudaErrors(cuMemAlloc(&dev_source_data_array, 2 * ARRAY_LENGTH * sizeof(unsigned char)));
    checkCudaErrors(cuMemAlloc(&dev_histogram_array, STATIS_size * sizeof(unsigned int)));
    checkCudaErrors(cuMemAlloc(&dev_clock_counter_array, 2 * ARRAY_LENGTH * sizeof(unsigned int)));

    for (int i = 0; i < ARRAY_LENGTH; i++)
    {
        source_data_array[i] = (unsigned char)(rand() % STATIS_size);
    }

    // Copy in the data to be counted
    checkCudaErrors(cuMemcpyHtoD(dev_source_data_array, source_data_array, ARRAY_LENGTH * sizeof(unsigned char)));
    checkCudaErrors(cuMemsetD32(dev_histogram_array, 0, STATIS_size));

    // CPU timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c3_alloc = large_interger.QuadPart;
#pragma endregion


#pragma region(Launch the kernel and check for errors after it completes)
    // Define the grid size (block_rect) and block size (thread_rect)
    dim3 block_rect(max(1, ARRAY_LENGTH / max_thread_per_block), 1, 1);
    dim3 thread_rect(min(max_thread_per_block, ARRAY_LENGTH), 1, 1);

    cout << "block_rect :\t" << block_rect.x << "\t" << block_rect.y << "\t" << block_rect.z << "\t" << endl;
    cout << "thread_rect :\t" << thread_rect.x << "\t" << thread_rect.y << "\t" << thread_rect.z << "\t" << endl;

    void *arr[] = { (void *)&dev_thread_index_array, (void*)&dev_source_data_array, \
        (void *)&dev_histogram_array, (void *)&dev_clock_counter_array };

    // Launch the kernel
    checkCudaErrors(cuLaunchKernel(kernel_addr, \
        block_rect.x, block_rect.y, block_rect.z, \
        thread_rect.x, thread_rect.y, thread_rect.z, \
        0, 0, \
        &arr[0], 0));

    // Synchronize and check the kernel execution
    checkCudaErrors(cuCtxSynchronize());


    // CPU timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c4_caculate = large_interger.QuadPart;
#pragma endregion


#pragma region(Copy data back and print the results)
    checkCudaErrors(cuMemcpyDtoH(thread_index_array, dev_thread_index_array, ARRAY_LENGTH * sizeof(int)));
    checkCudaErrors(cuMemcpyDtoH(histogram_array, dev_histogram_array, STATIS_size * sizeof(unsigned int)));
    checkCudaErrors(cuMemcpyDtoH(clock_counter_array, dev_clock_counter_array, 2 * ARRAY_LENGTH * sizeof(unsigned int)));


    // Print the results
    /*
    float temp_float;
    printf("\r\n\tthread index\tresult\t\tclock counter\tcost time(ms)\r\n");
    for (int i = 0; i < 128; i++)
    {
        printf("\t%u\t", thread_index_array[i]);
        printf("\t%u\t\t", source_data_array[i]);
        printf("%u\t", \
            ((clock_counter_array[i + ARRAY_LENGTH] - clock_counter_array[i])));

        if (clock_counter_array[i + ARRAY_LENGTH] > clock_counter_array[i])
        {
            temp_float = ((float)((clock_counter_array[i + ARRAY_LENGTH] - clock_counter_array[i])))\
                / (float)(GPU_clock_rate / 1000);
        }
        else
        {
            temp_float = ((float)((MAXUINT32 + clock_counter_array[i + ARRAY_LENGTH] - clock_counter_array[i])))\
                / (float)(GPU_clock_rate / 1000);
        }

        cost_time_array[i] = temp_float;
        printf("\t%f\r\n", cost_time_array[i]);
    }
    cout << endl;
    for (int i = 0; i < STATIS_size; i++)
    {
        printf("%d\t%u\r\n", i, histogram_array[i]);
    }
    */


    // CPU timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c5_copyout = large_interger.QuadPart;
#pragma endregion


#pragma region(Free host and device memory)
    // Free device memory
    checkCudaErrors(cuMemFree(dev_thread_index_array));
    checkCudaErrors(cuMemFree(dev_source_data_array));
    checkCudaErrors(cuMemFree(dev_histogram_array));
    checkCudaErrors(cuMemFree(dev_clock_counter_array));
    // Free host memory
    free(thread_index_array);
    free(source_data_array);
    free(histogram_array);
    free(clock_counter_array);
    free(cost_time_array);

    // CPU timestamp
    QueryPerformanceCounter(&large_interger);
    CPU_c6_end = large_interger.QuadPart;
#pragma endregion


#pragma region(Stop the CPU timer and print the timing results)


    printf("\r\n编译.cu时长:\t\t%.4lf 毫秒\r\n", (double)(CPU_c2_compile - CPU_c1_start) / CPU_counter_frequency * 1000);
    printf("分配/初始化数据时长:\t%.4lf 毫秒\r\n", (double)(CPU_c3_alloc - CPU_c2_compile) / CPU_counter_frequency * 1000);
    printf("GPU计算时长:\t\t%.4lf 毫秒\r\n", (double)(CPU_c4_caculate - CPU_c3_alloc) / CPU_counter_frequency * 1000);
    printf("传出数据时长:\t\t%.4lf 毫秒\r\n", (double)(CPU_c5_copyout - CPU_c4_caculate) / CPU_counter_frequency * 1000);
    printf("释放内存时长:\t\t%.4lf 毫秒\r\n", (double)(CPU_c6_end - CPU_c5_copyout) / CPU_counter_frequency * 1000);
    printf("CPU计时总时长:\t\t%.4lf 毫秒\r\n\r\n", (double)(CPU_c6_end - CPU_c1_start) / CPU_counter_frequency * 1000);
#pragma endregion



    return 0;
}



// Query the GPU hardware properties
void check_Cuda_information(int main_argc, char ** main_argv)
{
    // Device ID
    int devID;
    // Device properties
    cudaDeviceProp deviceProps;

    // Print the command-line arguments
    cout << "argc = " << main_argc << endl;
    for (int i = 0; i < main_argc; i++)
    {
        printf("argv[%d] = %s\r\n", i, main_argv[i]);
    }
    cout << endl;

    // Get the device ID
    devID = findCudaDevice(main_argc, (const char **)main_argv);

    // Get the GPU information
    checkCudaErrors((CUresult)cudaGetDeviceProperties(&deviceProps, devID));
    cout << "devID = " << devID << endl;
    // GPU name
    cout << "CUDA device is \t\t\t" << deviceProps.name << endl;
    // Maximum number of threads per block
    cout << "CUDA max Thread per Block is \t" << deviceProps.maxThreadsPerBlock << endl;
    max_thread_per_block = deviceProps.maxThreadsPerBlock;
    // Maximum number of threads per multiprocessor (SM)
    cout << "CUDA max Thread per SM is \t" << deviceProps.maxThreadsPerMultiProcessor << endl;
    // Number of SMs on the GPU
    cout << "CUDA SM counter\t\t\t" << deviceProps.multiProcessorCount << endl;
    // Warp size
    cout << "CUDA Warp size is \t\t" << deviceProps.warpSize << endl;
    // Shared memory per SM
    cout << "CUDA shared memory per SM is \t" << deviceProps.sharedMemPerMultiprocessor << "\tbyte" << endl;
    // L2 cache size
    cout << "CUDA L2 cache size is \t\t" << deviceProps.l2CacheSize << "\tbyte" << endl;
    // Maximum 1D texture size (in elements)
    cout << "CUDA texture 1D size is \t" << deviceProps.maxTexture1D << "\telements" << endl;
    // Global memory size
    cout << "CUDA global memory size is \t" << deviceProps.totalGlobalMem / 1024 / 1024 << "\tMB" << endl;
    // Number of registers per block
    cout << "CUDA register per block is \t" << deviceProps.regsPerBlock << "\t" << endl;
    // Number of registers per SM
    cout << "CUDA register per SM is \t" << deviceProps.regsPerMultiprocessor << "\t" << endl;
    // GPU clock frequency
    //  int    clockRate;       /**< Clock frequency in kilohertz */
    cout << "GPU clock frequency is \t\t" << deviceProps.clockRate << "\tkHz" << endl;
    GPU_clock_rate = deviceProps.clockRate * 1000;


    cout << endl;
}


void print_PTX(char * PTX_ptr, int size_of_PTX)
{
    // Print the PTX
    printf("\r\nHere is the PTX code:\r\n\r\n");
    for (int i = 0; i < size_of_PTX; i++)
    {
        printf("%c", *(PTX_ptr + i));
    }
    printf("\r\nAbove is the PTX code--------------------------\r\n\r\n");

    return;
}

My "kernel.cu":

// This is kernel function file !
#define hist_MAX_VALUE 256

extern "C" __global__ void kernel_func(unsigned int * thread_index_array, unsigned char *source_data_array, \
    unsigned int * histogram_array, unsigned int *clock_counter_array)
{
    // Compute the global thread index
    unsigned int block_index = blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
    unsigned int thread_index = block_index * blockDim.x * blockDim.y * blockDim.z + \
        threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
    // Total number of threads
    unsigned int thread_num = gridDim.x * gridDim.y * gridDim.z * blockDim.x * blockDim.y * blockDim.z;

    // Record the thread index
    thread_index_array[thread_index] = thread_index;

    unsigned int counter_1, counter_2;

    counter_1 = clock();
#if(1)
    // Initialize the shared-memory histogram array
    __shared__ unsigned int histogram_shared[hist_MAX_VALUE];
    histogram_shared[thread_index % hist_MAX_VALUE] = 0;
    __syncthreads();

    // Per-block histogram
    unsigned int value = source_data_array[thread_index];
    atomicAdd(&histogram_shared[value], 1);
    __syncthreads();
    // Add this block's shared-memory counts into the global-memory histogram.
    // Guarded so that each of the 256 bins is flushed exactly once per block.
    if (threadIdx.x < hist_MAX_VALUE)
        atomicAdd(&histogram_array[threadIdx.x], histogram_shared[threadIdx.x]);

#else
    unsigned int value = source_data_array[thread_index];
    atomicAdd(&histogram_array[value], 1);
#endif
    counter_2 = clock();

    clock_counter_array[thread_index] = counter_1;
    clock_counter_array[thread_index + thread_num] = counter_2;
}





References:

1. "CUDA Parallel Programming" (《CUDA并行程序设计》), China Machine Press

2. NVIDIA official documentation on atomic functions

3. CUDA Toolkit Documentation

4. CUDA Toolkit Documentation, Shared Memory section

5. GM107 whitepaper: NVIDIA GeForce GTX 750 Ti Whitepaper
