cuda入门demo(1)——矩阵相乘

因为实验需要,零零碎碎入门了cuda,但都是用完就扔了,缺乏总结。相隔好几个月又有使用需求,发现有所淡忘,这就是不及时总结的后果,知识碎片化,难以管理!

最近定一个小目标,把今年学习的cuda编程知识从零碎化掌握转化到系统化掌握,所以开一个cuda入门demo系列,记录一些经典demo。希望写完这个系列,自己达成的目标是:能像手搓力扣一样手搓cuda(希望不是想peach)

⚠️主要是自己温习用,只保证代码正确性,不保证讲解的详细性。

目录

      • 矩阵相乘
      • 心得

矩阵相乘

/**
 * 实现两个1000*1000的矩阵相乘
 */
#include <cstdlib>
#include <iostream>

// Brings std::cout / std::endl into scope for the host-side logging below.
using namespace std;

const int N = 1000;   // number of rows in each matrix
const int M = 1000;   // number of columns in each matrix


// Computes c = a * b for N x M integer matrices held in pitched device memory,
// one thread per output element.
//
// Expected launch: a 2D grid of 2D blocks covering M columns (x) by N rows (y);
// out-of-range threads are filtered by the bounds check below.
// Precondition: a, b and c were all allocated with the SAME pitch — the kernel
// receives a single pitch value (here all three come from cudaMallocPitch with
// identical row widths; strictly, each pitch should be passed separately).
// Note: the product is only well-defined because N == M (square matrices).
__global__
void matrix_mul_gpu(const int* __restrict__ a, const int* __restrict__ b,
                    int* c, size_t pitch) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;   // output row index
    int col = blockIdx.x * blockDim.x + threadIdx.x;   // output column index
    if (row < N && col < M) {
        // pitch is the row stride in BYTES, so row offsets are computed on a
        // char* and cast back to int*; adding col to an int* then advances by
        // col * sizeof(int) bytes automatically.
        int* e = (int*) ((char*)c + row * pitch) + col;            // &c[row][col]
        const int* row_a = (const int*) ((char*)a + row * pitch);  // start of a's row
        const int* col_b = b + col;                                // top of b's column
        int res = 0;
        // Dot product of a's row with b's column over the shared dimension.
        for (int i = 0; i < N; ++ i) {
            res += row_a[i] * (*(const int*) ((char*)col_b + i * pitch));
        }
        *e = res;
    }
}


// Aborts the program with a diagnostic when a CUDA runtime call fails.
// Kernel launches do not return errors directly; they are checked via
// cudaGetLastError() / cudaDeviceSynchronize() below.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        cerr << what << ": " << cudaGetErrorString(err) << endl;
        exit(1);
    }
}


int main() {
    // Host-side matrices, row-major: element (i, j) lives at index i * M + j.
    int *h_a = (int *) malloc(M * N * sizeof(int));
    int *h_b = (int *) malloc(M * N * sizeof(int));
    int *h_c = (int *) malloc(M * N * sizeof(int));
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        cerr << "host allocation failed" << endl;
        exit(1);
    }

    // Fill a with 1s and b with 2s, so every element of a * b is
    // 1 * 2 * N = 2000 — the value the verification below expects.
    for (int i = 0; i < M * N; ++ i) {
        h_a[i] = 1;
        h_b[i] = 2;
    }

    cout << 1 << endl;   // progress marker: host init done

    // Pitched allocations keep every device row aligned; each pitch is the
    // row stride in BYTES and may exceed M * sizeof(int).
    int *d_a, *d_b, *d_c;
    size_t pitch_a, pitch_b, pitch_c;
    checkCuda(cudaMallocPitch((void**)&d_a, &pitch_a, M * sizeof(int), N), "cudaMallocPitch d_a");
    checkCuda(cudaMallocPitch((void**)&d_b, &pitch_b, M * sizeof(int), N), "cudaMallocPitch d_b");
    checkCuda(cudaMallocPitch((void**)&d_c, &pitch_c, M * sizeof(int), N), "cudaMallocPitch d_c");

    // The kernel takes a single pitch and applies it to all three matrices,
    // so fail loudly if the allocator ever returned different strides.
    if (pitch_a != pitch_b || pitch_a != pitch_c) {
        cerr << "pitch mismatch between device allocations" << endl;
        exit(1);
    }

    cout << 2 << endl;   // progress marker: device alloc done

    // Only the inputs need to go to the device; h_c is uninitialized and the
    // kernel overwrites every element of d_c anyway.
    checkCuda(cudaMemcpy2D(d_a, pitch_a, h_a, M * sizeof(int), M * sizeof(int), N, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy2D(d_b, pitch_b, h_b, M * sizeof(int), M * sizeof(int), N, cudaMemcpyHostToDevice), "copy b to device");

    cout << 3 << endl;   // progress marker: H2D copies done

    // One thread per output element; ceil-divide so partial edge blocks still
    // cover the last rows/columns (the kernel bounds-checks them away).
    const dim3 block_dim(32, 32);
    const dim3 grid_dim((M + block_dim.x - 1) / block_dim.x,
                        (N + block_dim.y - 1) / block_dim.y);

    matrix_mul_gpu<<<grid_dim, block_dim>>>(d_a, d_b, d_c, pitch_a);
    checkCuda(cudaGetLastError(), "kernel launch");         // bad launch config
    checkCuda(cudaDeviceSynchronize(), "kernel execution"); // async in-kernel faults

    // Only the result comes back; d_a and d_b were never modified.
    checkCuda(cudaMemcpy2D(h_c, M * sizeof(int), d_c, pitch_c, M * sizeof(int), N, cudaMemcpyDeviceToHost), "copy c to host");

    // Verify every element equals 2000; both loops stop at the first mismatch
    // (the original inner `break` only escaped the inner loop).
    bool if_all_2000 = true;
    for (int i = 0; i < N && if_all_2000; ++ i) {
        for (int j = 0; j < M && if_all_2000; ++ j) {
            if (h_c[i * M + j] != 2000) {
                if_all_2000 = false;
            }
        }
    }

    // Print the top-left 10x10 corner as a spot check.
    for (int i = 0; i < 10; i ++ ) {
        for (int j = 0; j < 10; j ++ ) {
            cout << h_c[i * M + j] << ' ';
        }
        cout << endl;
    }

    cout << (if_all_2000 ? "all 2000" : "not all 2000") << endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

心得

cudaMallocPitch和cudaMemcpy2D这两个函数一开始给我写麻了,误解了pitch的含义。

强烈推荐直接去官网看英文文档,非常好使:

https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
涉及gpu内存分配的部分都在这里

你可能感兴趣的:(并行计算,&,CUDA,C++,矩阵,c++,gpu,cuda)