// Matrix multiplication implemented on the GPU with CUDA

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>


// Kernel: integer matrix product C = A * B, one output element per thread.
//   A is read as m rows of n ints, B as n rows of k ints, and C is written
//   as m rows of k ints (all row-major, as fixed by the indexing below).
//   The x thread index selects the column of C, the y index the row, so the
//   launch needs a 2D grid covering at least k threads in x and m in y.
__global__ void matrixMulti(int *A,int *B,int *C,int m,int n,int k){

    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that land outside the m x k output have nothing to compute.
    if(row >= m || col >= k){
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    for(int t = 0; t < n; ++t){
        const int lhs = A[row * n + t];
        const int rhs = B[t * k + col];
        acc += lhs * rhs;
    }

    C[row * k + col] = acc;

}



// Aborts the program with a readable message when a CUDA runtime call fails.
//   status: return value of the CUDA call being checked.
//   what:   short description of the call, included in the message.
// The process exits immediately; outstanding host/device allocations are
// left for process teardown to reclaim.
static void checkCuda(cudaError_t status,const char *what){
    if(status != cudaSuccess){
        fprintf(stderr,"CUDA error during %s: %s\n",what,cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Fills two random integer matrices on the host, multiplies them on the GPU
// with matrixMulti, and prints the m x k result to stdout.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main(){

    // Matrix sizes: A is m x n, B is n x k, C is m x k
    int m = 1000;
    int n = 1000;
    int k = 1000;

    // Byte counts are computed in size_t so the element-count product is not
    // evaluated in int before being widened.
    size_t bytesA = (size_t)m * n * sizeof(int);
    size_t bytesB = (size_t)n * k * sizeof(int);
    size_t bytesC = (size_t)m * k * sizeof(int);

    // Allocate host memory for the matrices
    int *h_A,*h_B,*h_C;
    h_A = (int *)malloc(bytesA);
    h_B = (int *)malloc(bytesB);
    h_C = (int *)malloc(bytesC);
    if(h_A == NULL || h_B == NULL || h_C == NULL){
        fprintf(stderr,"host memory allocation failed\n");
        // free(NULL) is a no-op, so releasing all three is safe here.
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize the input matrices (A first, then B, so the rand() sequence
    // is consumed in the same order as before)
    for(int i=0;i<m;i++){
        for(int j=0;j<n;j++){
            h_A[i * n + j] = rand()%1024;
        }
    }
    for(int i=0;i<n;i++){
        for(int j=0;j<k;j++){
            h_B[i * k + j] = rand()%1024;
        }
    }

    // Allocate device memory for the matrices
    int *d_A,*d_B,*d_C;
    checkCuda(cudaMalloc((void **)&d_A,bytesA),"cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void **)&d_B,bytesB),"cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void **)&d_C,bytesC),"cudaMalloc(d_C)");

    // Copy the input matrices from host to device
    checkCuda(cudaMemcpy(d_A,h_A,bytesA,cudaMemcpyHostToDevice),"copy of A to device");
    checkCuda(cudaMemcpy(d_B,h_B,bytesB,cudaMemcpyHostToDevice),"copy of B to device");

    // Launch configuration: 16x16 threads per block, with enough blocks
    // (ceiling division) to cover k columns in x and m rows in y
    int blockx = 16;
    int blocky = 16;
    dim3 block(blockx,blocky);
    dim3 grid((k + blockx - 1) / blockx,(m + blocky - 1) / blocky);

    // Launch the kernel
    matrixMulti<<<grid,block>>>(d_A,d_B,d_C,m,n,k);
    // A launch does not return a status; configuration errors show up in
    // cudaGetLastError, execution faults at the next synchronizing call.
    checkCuda(cudaGetLastError(),"matrixMulti launch");
    checkCuda(cudaDeviceSynchronize(),"matrixMulti execution");

    // Copy the result from device to host
    checkCuda(cudaMemcpy(h_C,d_C,bytesC,cudaMemcpyDeviceToHost),"copy of C to host");

    // Print the result, one matrix row per line
    for(int i=0;i<m;i++){
        for(int j=0;j<k;j++){
            printf("%d,   ",h_C[i * k + j]);
        }
        printf("\n");
    }

    // Release host and device memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCuda(cudaFree(d_A),"cudaFree(d_A)");
    checkCuda(cudaFree(d_B),"cudaFree(d_B)");
    checkCuda(cudaFree(d_C),"cudaFree(d_C)");

    return 0;
}

// You may also be interested in: (matrices, algorithms)