

矩阵乘法是 CUDA samples 中的一个示例,所以就从这个较为简单的程序开始熟悉 CUDA 吧。下面的代码分为三个部分。

  • CPU版本:使用三层循环进行常规的矩阵乘法运算。
  • cuda global memory 版本:使用多个block进行并行计算,但GPU线程访问都是在global memory中。
  • cuda shared memory 版本:也就是 CUDA 提供的样例代码。基本思路如下:对矩阵 A*B=C,用 block 去划分 C 矩阵,得到若干个子矩阵 SubC。利用分块矩阵乘法公式,在计算某一个 SubC 时,只取对应的 A 和 B 的子块,并读到 shared memory 中。这样在计算同一个 block 内的 C(x,y) 时,A 和 B 的子矩阵都是从 shared memory 读取。和前面多次读取 global memory 的版本相比,少了很多访存延迟。



#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"


// CPU reference implementation: C = A * B using the classic triple loop.
//
// All matrices are dense, row-major float arrays:
//   A is rowA x colA, B is rowB x colB, C is rowA x colB (written by this call).
//
// Returns 0 on success, or -1 (leaving C untouched) when the inner
// dimensions do not agree (colA != rowB).
int matrixMutilCPU(float *A, float *B, int rowA, int colA, int rowB, int colB,float *C){
    if (colA != rowB)
        return -1;
    for (int i = 0; i < rowA; i++){
        for (int j = 0; j < colB; j++){ // one iteration per element C(i, j)
            // Accumulate locally and store once per output element.
            float tmp = 0.0f;
            for (int k = 0; k < colA; k++){
                tmp += (A[colA*i+k]*B[colB*k+j]);
            }
            C[i*colB+j] = tmp;
        }
    }
    return 0;
}

// Global-memory matrix multiplication: C = A * B.
//
// Each thread computes exactly one element of C, reading its row of A and
// its column of B directly from the pointers passed in (no shared-memory
// staging).
//
// Layout: A, B and C are row-major; A has colA columns, B and C have colB
// columns. The number of rows of B must equal colA.
//
// Launch requirements: a 2D grid of 2D blocks where
//   gridDim.x * blockDim.x == colB            (columns of C)
//   gridDim.y * blockDim.y == number of rows of A / C
// The kernel is not given the row count, so it cannot bounds-check; the
// caller must launch a grid that covers C exactly.
__global__ void matrixMutilGPU_slow(float *A, float *B, int colA, int colB, float *C){
    // Global coordinates of the C element owned by this thread.
    int row = blockDim.y*blockIdx.y + threadIdx.y;
    int col = blockDim.x*blockIdx.x + threadIdx.x;

    float tsum = 0.0f;
    // The dot product runs over the shared dimension (columns of A == rows
    // of B), i.e. colA -- not colB, which is only correct for square inputs.
    for (int k = 0; k < colA; k++){
        tsum += (A[row*colA+k] * B[k*colB+col]);
    }
    C[row*colB+col] = tsum;
}

// Shared-memory tiled matrix multiplication: C = A * B.
//
// C is partitioned into blockSize x blockSize tiles, one per thread block.
// For each tile of C the block walks along the matching row-band of A and
// column-band of B one tile at a time, stages the two tiles in shared
// memory, and every thread accumulates one element of the result from the
// staged tiles.
//
// Layout: A, B and C are row-major; A has colA columns, B and C have colB
// columns.
//
// Launch requirements (not checked inside the kernel):
//   blockDim == (blockSize, blockSize)
//   gridDim  == (colB / blockSize, rowsOfA / blockSize)
//   rowsOfA, colA and colB are all multiples of blockSize.
// Static shared memory used: 2 * blockSize * blockSize * sizeof(float).
template <int blockSize>
__global__ void matrixMutilGPU(float *A, float *B, int colA,int colB, float *C){
    int xb = blockIdx.x;
    int yb = blockIdx.y;
    int x = threadIdx.x;
    int y = threadIdx.y;

    //  A11 A12 A13    B11 B12 B13
    //  A21 A22 A23  * B21 B22 B23
    //  A31 A32 A33    B31 B32 B33
    //  e.g. tile C22 is computed as A21*B12 + A22*B22 + A23*B32.
    float *BeginA = A + yb* blockSize*colA;   // first tile in this block's row-band of A
    float *EndA = BeginA + colA;              // one past the last tile of that row-band
    float *BeginB = B + blockSize*xb;         // first tile in this block's column-band of B
    int stepA = blockSize;                    // A tiles advance to the right
    int stepB = blockSize*colB;               // B tiles advance downwards

    // Tiles staged for the current step; shared by all threads of the block.
    __shared__ float As[blockSize][blockSize];
    __shared__ float Bs[blockSize][blockSize];

    float tsum = 0.0f;
    for (; BeginA < EndA; BeginA += stepA, BeginB += stepB){
        // Each thread loads one element of each tile into shared memory.
        As[y][x] = *(BeginA + y*colA + x);
        Bs[y][x] = *(BeginB + y*colB + x);

        // Every element of both tiles must be written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < blockSize;k++){
            tsum = tsum + As[y][k] * Bs[k][x];
        }

        // No thread may overwrite the tiles for the next step while another
        // thread is still reading the current ones.
        __syncthreads();
    }

    // Write the result: locate this block's tile in C, then this thread's
    // element inside the tile.
    float *BeginC = C + yb*blockSize*colB + xb*blockSize;
    BeginC[y*colB + x] = tsum;
}

// Prints a diagnostic and terminates the program if a CUDA runtime call
// (or kernel launch) did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what, int line){
    if (status != cudaSuccess){
        fprintf(stderr, "%s returned error %s (code %d), line(%d)\n",
                what, cudaGetErrorString(status), (int)status, line);
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: multiplies two constant-filled matrices on the CPU and
// then on the GPU (shared-memory kernel), printing the wall-clock cost and
// C[0] for each so the results can be compared by eye.
int main(int argc,char **argv){
    (void)argc;
    (void)argv;

    //1.construct Mat A & Mat B
    int rowA, colA,rowB,colB;
    rowA = 1024;
    colA = 1024;
    rowB = 1024;
    colB = 1024;

    // C has the rows of A and the columns of B.
    const size_t memSizeA = sizeof(float)*rowA*colA;
    const size_t memSizeB = sizeof(float)*rowB*colB;
    const size_t memSizeC = sizeof(float)*rowA*colB;

    float *Ah = (float *)malloc(memSizeA);
    float *Bh = (float *)malloc(memSizeB);
    float *Ch = (float *)malloc(memSizeC);
    if (Ah == NULL || Bh == NULL || Ch == NULL){
        fprintf(stderr, "host allocation failed\n");
        free(Ah);
        free(Bh);
        free(Ch);
        return EXIT_FAILURE;
    }
    for (int k = 0; k < colA*rowA; k++)
        Ah[k] = 1.0f;
    for (int k = 0; k < colB*rowB; k++)
        Bh[k] = 0.1f;

    clock_t t1 = clock();
    if (matrixMutilCPU(Ah, Bh, rowA, colA, rowB, colB, Ch) != 0){
        fprintf(stderr, "matrix dimensions do not match (colA != rowB)\n");
        free(Ah);
        free(Bh);
        free(Ch);
        return EXIT_FAILURE;
    }
    clock_t t2 = clock();
    double time = (t2 - t1)*1.0 / CLOCKS_PER_SEC;
    printf("CPU cost:%.8lf s\n",time);
    printf("C[0]=%.8lf\n", Ch[0]);
    // Clear the host result so the GPU output cannot be confused with the CPU one.
    for (int k = 0; k < colB*rowA; k++)
        Ch[k] = 0.0f;

    //2.set GPU params
    const int blockSize = 16;
    // The kernels do no bounds checking, so the tiles must cover the
    // matrices exactly.
    if (rowA % blockSize != 0 || colA % blockSize != 0 || colB % blockSize != 0){
        fprintf(stderr, "matrix dimensions must be multiples of %d\n", blockSize);
        free(Ah);
        free(Bh);
        free(Ch);
        return EXIT_FAILURE;
    }
    dim3 threads(blockSize, blockSize);
    dim3 grid(colB / threads.x, rowA / threads.y);
    float *Ad, *Bd, *Cd;

    checkCuda(cudaMalloc(&Ad, memSizeA), "cudaMalloc (Ad)", __LINE__);
    checkCuda(cudaMalloc(&Bd, memSizeB), "cudaMalloc (Bd)", __LINE__);
    checkCuda(cudaMalloc(&Cd, memSizeC), "cudaMalloc (Cd)", __LINE__);

    //3. move data from host to device
    checkCuda(cudaMemcpy(Ad, Ah, memSizeA, cudaMemcpyHostToDevice), "cudaMemcpy (Ad,Ah)", __LINE__);
    checkCuda(cudaMemcpy(Bd, Bh, memSizeB, cudaMemcpyHostToDevice), "cudaMemcpy (Bd,Bh)", __LINE__);
    checkCuda(cudaMemcpy(Cd, Ch, memSizeC, cudaMemcpyHostToDevice), "cudaMemcpy (Cd,Ch)", __LINE__);

    //4. launch the kernel
    t1 = clock();
    //matrixMutilGPU_slow<<<grid, threads>>>(Ad, Bd, colA, colB, Cd); // global-memory version
    matrixMutilGPU<blockSize><<<grid, threads>>>(Ad, Bd, colA, colB, Cd); // shared-memory version
    // Launch-configuration errors are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch", __LINE__);

    //5.move data to host
    // This blocking copy also waits for the kernel to finish, so the
    // measured GPU time includes the device-to-host transfer.
    checkCuda(cudaMemcpy(Ch, Cd, memSizeC, cudaMemcpyDeviceToHost), "cudaMemcpy (Ch,Cd)", __LINE__);
    t2 = clock();
    time = (t2 - t1)*1.0 / CLOCKS_PER_SEC;
    printf("GPU cost:%.8lf s\n", time);
    printf("C[0]=%.8lf\n", Ch[0]);

    //6.release memory
    checkCuda(cudaFree(Ad), "cudaFree (Ad)", __LINE__);
    checkCuda(cudaFree(Bd), "cudaFree (Bd)", __LINE__);
    checkCuda(cudaFree(Cd), "cudaFree (Cd)", __LINE__);
    free(Ah);
    free(Bh);
    free(Ch);

    return 0;
}



| block大小 | 4*4 | 16*16 | 32*32 | 64*64 |
| --- | --- | --- | --- | --- |
| CPU | 2.386s | 2.386s | 2.386s | 2.386s |
| GPU_基础版本 | 0.036s | 0.011s | 0.011s | kernel函数未运行 |
| GPU_共享内存版本 | 0.027s | 0.005s | 0.005s | kernel函数未运行 |

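GPU提速的效果是很明显的,GPU版本单纯计算的速度更快,这里统计的时间还包括把结果数据从显存搬到内存中的时间。kernel未运行,初步判断是显存不够:分的子块越大,内存占用也就越多。但是基础版本(只使用 global memory)为什么也会内存不足呢?有清楚细节的可以在下方留言指教。(注:更可能的原因是 64*64 的 block 含有 4096 个线程,超过了每个 block 最多 1024 个线程的硬件限制,kernel 启动时就会直接失败,与显存大小无关;在 kernel 启动后调用 cudaGetLastError() 即可看到对应的错误。)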
