利用GPU和CUDA实现矩阵转置

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>


// Kernel: writes the transpose of the n x n row-major matrix A into B.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes the columns of
// A and the y dimension indexes its rows. The grid must supply at least n
// threads along each dimension; surplus threads fall through the bounds check.
// No shared memory is used.
//
// Preconditions: A and B each hold n * n ints and must not overlap (an
// in-place transpose would race between the threads handling (r, c) and
// (c, r)).
//
// Offsets are computed in size_t so that row * n + col cannot overflow a
// 32-bit int when n is large (n > 46340).
__global__ void matrixTransposition(const int * __restrict__ A, int * __restrict__ B, int n){

    // A non-positive n describes an empty matrix. Bail out before converting
    // to size_t, which would turn a negative n into a huge unsigned bound.
    if(n <= 0){
        return;
    }

    const size_t dim = (size_t)n;
    const size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch rarely divides n evenly.
    if(col < dim && row < dim){

        // Element (row, col) of A becomes element (col, row) of B.
        B[col * dim + row] = A[row * dim + col];

    }

}



// Reports a failed CUDA runtime call (expression text, file, line, and the
// runtime's error string) on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *expr, const char *file, int line){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error at %s:%d: %s failed: %s\n",
                file, line, expr, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Wraps a CUDA runtime call so that its return code is never ignored.
#define CUDA_CHECK(call) checkCuda((call), #call, __FILE__, __LINE__)

// Prints an n x n row-major matrix, one row per output line.
static void printMatrix(const int *m, int n){
    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%d,   ",m[i * n + j]);
        }
        printf("\n");
    }
}

// Demo driver: fills an n x n matrix with pseudo-random values in [0, 5],
// transposes it on the GPU, and prints the input and the result.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main(){

    // Matrix dimension (the matrix is n x n).
    const int n = 10;

    // Element and byte counts, computed once in size_t so the product cannot
    // overflow int.
    const size_t count = (size_t)n * (size_t)n;
    const size_t bytes = count * sizeof(int);

    // Allocate host memory for the input and the transposed output.
    int *h_A = (int *)malloc(bytes);
    int *h_B = (int *)malloc(bytes);
    if(h_A == NULL || h_B == NULL){
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_A);   // free(NULL) is a no-op
        free(h_B);
        return EXIT_FAILURE;
    }

    // Initialize the input matrix in row-major order.
    for(size_t k=0;k<count;k++){
        h_A[k] = rand()%6;
    }

    // Allocate device memory for both matrices.
    int *d_A = NULL;
    int *d_B = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_A,bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B,bytes));

    // Copy the input matrix from host to device.
    CUDA_CHECK(cudaMemcpy(d_A,h_A,bytes,cudaMemcpyHostToDevice));

    // Launch configuration: 16 x 16 threads per block, with the grid rounded
    // up (ceil-div) so that every element is covered.
    const int blockx = 16;
    const int blocky = 16;
    dim3 block(blockx,blocky);
    dim3 grid((n + blockx - 1) / blockx,(n + blocky - 1) / blocky);

    // Launch the kernel. A launch does not return an error code, so query the
    // runtime right afterwards to catch a bad launch configuration.
    matrixTransposition<<<grid,block>>>(d_A,d_B,n);
    CUDA_CHECK(cudaGetLastError());

    // Copy the result from device to host. This blocking copy also surfaces
    // any error raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(h_B,d_B,bytes,cudaMemcpyDeviceToHost));

    // Print the input and the transposed matrix.
    printf("原矩阵为:\n");
    printMatrix(h_A,n);
    printf("矩阵转置后为:\n");
    printMatrix(h_B,n);

    // Release host and device memory.
    free(h_A);
    free(h_B);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    return 0;
}

你可能感兴趣的:(矩阵,算法)