因为实验需要,零零碎碎入门了cuda,但都是用完就扔了,缺乏总结。相隔好几个月又有使用需求,发现有所淡忘,这就是不及时总结的后果,知识碎片化,难以管理!
最近定一个小目标,把今年学习的cuda编程知识从零碎化掌握转化到系统化掌握,所以开一个cuda入门demo系列,记录一些经典demo。希望写完这个系列,自己达成的目标是:能像手搓力扣一样手搓cuda(希望不是想peach)
⚠️主要是自己温习用,只保证代码正确性,不保证讲解的详细性。
/**
* 实现两个1000*1000的矩阵相乘
*/
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>
using namespace std;
const int N = 1000;
const int M = 1000;
__global__
void matrix_mul_gpu(int* a, int* b, int* c, size_t pitch) {
// a b c用一维数组抽象表示二维数组
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < N && col < M) {
// 每行的字节数为pitch,char的字节数为1,+操作是移动若干字节,所以用char来计算再转回去
// 转回去后再直接加col,应该自动加的是col * size(int)这么多字节
int* e = (int*) ((char*)c + row * pitch) + col;
int* row_a = (int*) ((char*)a + row * pitch); // a的row行首地址
int* col_b = b + col; // b的col列首地址
int res = 0;
for (int i = 0; i < N; ++ i) {
res += (row_a[i] * (*(int*) ((char*)col_b + i * pitch)));
}
*e = res;
}
}
int main() {
int *h_a, *h_b, *h_c;
h_a = (int *) malloc(M * N * sizeof(int));
h_b = (int *) malloc(M * N * sizeof(int));
h_c = (int *) malloc(M * N * sizeof(int));
// 初始化
for (int i = 0; i < M * N; ++ i) {
h_a[i] = 1;
h_b[i] = 2;
}
cout << 1 << endl;
int *d_a, *d_b, *d_c;
size_t pitch_a, pitch_b, pitch_c;
cudaMallocPitch((void**)&d_a, &pitch_a, M * sizeof(int), N);
cudaMallocPitch((void**)&d_b, &pitch_b, M * sizeof(int), N);
cudaMallocPitch((void**)&d_c, &pitch_c, M * sizeof(int), N);
cout << 2 << endl;
cudaMemcpy2D(d_a, pitch_a, h_a, M * sizeof(int), M * sizeof(int), N, cudaMemcpyHostToDevice);
cudaMemcpy2D(d_b, pitch_b, h_b, M * sizeof(int), M * sizeof(int), N, cudaMemcpyHostToDevice);
cudaMemcpy2D(d_c, pitch_c, h_c, M * sizeof(int), M * sizeof(int), N, cudaMemcpyHostToDevice);
cout << 3 << endl;
const dim3 block_dim(32, 32);
const int U = (N + block_dim.x - 1) / block_dim.x;
const dim3 grid_dim(U, U);
matrix_mul_gpu<<<grid_dim, block_dim>>>(d_a, d_b, d_c, pitch_a);
cudaMemcpy2D(h_a, M * sizeof(int), d_a, pitch_a, M * sizeof(int), N, cudaMemcpyDeviceToHost);
cudaMemcpy2D(h_b, M * sizeof(int), d_b, pitch_b, M * sizeof(int), N, cudaMemcpyDeviceToHost);
cudaMemcpy2D(h_c, M * sizeof(int), d_c, pitch_c, M * sizeof(int), N, cudaMemcpyDeviceToHost);
bool if_all_2000 = true;
for (int i = 0; i < N; ++ i) {
for (int j = 0; j < N; ++ j) {
if (h_c[i * M + j] != 2000) {
if_all_2000 = false;
break;
}
}
}
for (int i = 0; i < 10; i ++ ) {
for (int j = 0; j < 10; j ++ ) {
cout << h_c[i * M + j] << ' ';
}
cout << endl;
}
cout << (if_all_2000 ? "all 2000" : "not all 2000") << endl;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
return 0;
}
cudaMallocPitch和cudaMemcpy2D这两个函数一开始把我写麻了,因为误解了pitch的含义:pitch是每一行实际占用的字节数(为对齐会补上填充),可能大于一行有效数据的宽度 M * sizeof(int),所以按行寻址必须用 pitch 而不是逻辑行宽。
强烈推荐直接去官网看英文文档,非常好使:
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY
涉及gpu内存分配的部分都在这里