// CUDA matrix multiplication using shared memory (tiled GEMM demo)

#include <cstdio>
#include <cstdlib>
#include <cmath>


// Tile edge length; each thread block is blockSize x blockSize threads.
#define blockSize 16
// Matrix dimensions: A is M x N, B is N x K, C is M x K (row-major).
#define M 1000
#define N 500
#define K 1000

// Unified-memory buffers, accessible from both host and device.
__managed__ int A[M * N];
__managed__ int B[N * K];
__managed__ int C_GPU[M * K];  // result computed by the GPU kernel
__managed__ int C_CPU[M * K];  // reference result computed on the CPU


// Tiled matrix multiply on the GPU: c (m x k) = a (m x n) * b (n x k),
// all row-major. Expects a 2D launch with blockDim == (blockSize, blockSize)
// and a grid covering ceil(k/blockSize) x ceil(m/blockSize) tiles.
__global__ void gpu_Matrix_Multi(int *a,int *b,int *c,int m,int n,int k){

    // Shared-memory tiles holding one blockSize x blockSize sub-matrix
    // of a and of b per iteration.
    __shared__ int sub_a[blockSize][blockSize];
    __shared__ int sub_b[blockSize][blockSize];

    // Global (column, row) coordinates of the output element this thread owns.
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    int sum = 0; // accumulator for the dot product

    // Ceil-divide so the final partial tile is processed when n is not a
    // multiple of blockSize (the original n/blockSize floored and silently
    // dropped the last n % blockSize terms of every dot product).
    for(int step = 0; step < (n + blockSize - 1) / blockSize; step++){

        // Stage this block's tile of a into shared memory, zero-padding
        // loads that fall outside the matrix.
        int step_x = blockSize * step + threadIdx.x;
        int step_y = iy;
        int index = step_y * n + step_x;
        if(step_x >= n || step_y >= m){ // out of bounds
            sub_a[threadIdx.y][threadIdx.x] = 0;
        }else{
            sub_a[threadIdx.y][threadIdx.x] = a[index];
        }

        // Stage this block's tile of b, likewise zero-padded.
        step_x = ix;
        step_y = blockSize * step + threadIdx.y;
        index = step_y * k + step_x;
        if(step_x >= k || step_y >= n){ // out of bounds
            sub_b[threadIdx.y][threadIdx.x] = 0;
        }else{
            sub_b[threadIdx.y][threadIdx.x] = b[index];
        }

        // Whole tile must be written before any thread reads it.
        __syncthreads();

        // Row threadIdx.y of sub_a times column threadIdx.x of sub_b.
        // The original read sub_a[threadIdx.x][i] * sub_b[i][threadIdx.y]
        // (x/y swapped relative to how the tiles were loaded), which
        // produced incorrect results.
        for(int i = 0; i < blockSize; i++){
            sum += sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();

    }

    // Write the result, guarding the grid tail.
    if(ix < k && iy < m){
        c[iy * k + ix] = sum;
    }

}

// Single-threaded host reference: c (m x k) = a (m x n) * b (n x k),
// all matrices row-major in flat arrays.
void cpu_Matrix_Multi(int *a,int *b,int *c,int m,int n,int k){

    for(int row = 0; row < m; row++){
        for(int col = 0; col < k; col++){
            // Dot product of row `row` of a with column `col` of b.
            int acc = 0;
            for(int p = 0; p < n; p++){
                acc += a[row * n + p] * b[p * k + col];
            }
            c[row * k + col] = acc;
        }
    }

}


int main(){

    // Events for coarse wall-clock timing of the GPU and CPU paths.
    cudaEvent_t start,stop_GPU,stop_CPU;
    cudaEventCreate(&start);
    cudaEventCreate(&stop_GPU);
    cudaEventCreate(&stop_CPU);

    // Fill A (M x N) with random values.
    for(int i=0; i<M; i++){
        for(int j=0; j<N; j++){
            A[i * N + j] = rand()%100;
        }
    }
    // Fill B (N x K). The original wrote this loop's values into A,
    // leaving B entirely zero-initialized.
    for(int i=0; i<N; i++){
        for(int j=0; j<K; j++){
            B[i * K + j] = rand()%100;
        }
    }

    dim3 block(blockSize,blockSize);
    // Ceil-divide so the grid covers the whole K x M output.
    dim3 grid((K + blockSize - 1)/blockSize,(M + blockSize - 1)/blockSize);

    cudaEventRecord(start);

    gpu_Matrix_Multi<<<grid,block>>>(A,B,C_GPU,M,N,K);

    // Launch-configuration errors only surface via cudaGetLastError.
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cudaEventRecord(stop_GPU);
    cudaEventSynchronize(stop_GPU);

    cpu_Matrix_Multi(A,B,C_CPU,M,N,K);

    cudaEventRecord(stop_CPU);
    cudaEventSynchronize(stop_CPU);

    float time_GPU,time_CPU;
    cudaEventElapsedTime(&time_GPU,start,stop_GPU);
    // Measure the CPU run alone (stop_GPU..stop_CPU). The original measured
    // start..stop_CPU, which also included the whole GPU run.
    cudaEventElapsedTime(&time_CPU,stop_GPU,stop_CPU);
    printf("GPU Time:%.3lf,   CPU Time:%.3lf\n",time_GPU,time_CPU);

    // Compare the full M x K results element-wise. The original iterated
    // j < N and indexed with i * N + j, so it checked the wrong elements
    // and skipped half of each row. Integer results compare exactly.
    bool flag = true;
    for(int i=0; i<M && flag; i++){
        for(int j=0; j<K; j++){
            if(C_GPU[i * K + j] != C_CPU[i * K + j]){
                flag = false;
                break;
            }
        }
    }
    printf("%s\n",flag?"Pass!":"Error!");

    cudaEventDestroy(start);
    cudaEventDestroy(stop_GPU);
    cudaEventDestroy(stop_CPU);

    return 0;
}

// (scraped article footer: "You may also be interested in: matrices, algorithms")