// CUDA: matrix transpose before and after eliminating shared-memory bank conflicts

#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>

// Side length of the square shared-memory tile used by the transpose kernels.
// The kernels index the tile directly with threadIdx, so thread blocks are
// launched as TILE_DIM x TILE_DIM threads.
constexpr int TILE_DIM = 32;

// Element type used for all matrix data in this file.
using real = float;

/**
 * Tiled transpose of a square, row-major N x N matrix: B[c][r] = A[r][c].
 *
 * Launch layout: blockDim must be (TILE_DIM, TILE_DIM), because the tile is
 * indexed directly with threadIdx; the grid needs ceil(N / TILE_DIM) blocks
 * along both x and y. Each block uses TILE_DIM * TILE_DIM elements of static
 * shared memory.
 *
 * This is the unpadded variant. A tile row is exactly TILE_DIM elements wide,
 * so the column-wise read S[threadIdx.x][threadIdx.y] makes the lanes of a
 * warp (consecutive threadIdx.x) touch addresses TILE_DIM elements apart.
 * With 4-byte elements and 32 banks those all fall into the same bank and the
 * read is serialized. transpose2 is the padded counterpart.
 *
 * In-place use (B == A) is not supported: a block overwrites a tile of B that
 * another block may not have read from A yet.
 *
 * @param A  input matrix, N * N elements
 * @param B  output matrix, N * N elements
 * @param N  number of rows and columns
 */
__global__ void transpose1(const real* A, real* B, const int N)
{
    __shared__ real S[TILE_DIM][TILE_DIM];

    // Top-left corner of the input tile handled by this block.
    const int bx = blockIdx.x * TILE_DIM;
    const int by = blockIdx.y * TILE_DIM;

    // Load phase: threadIdx.x walks along a row of A, so the read is coalesced.
    const int nx1 = bx + threadIdx.x;
    const int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        // The row offset is computed in size_t: ny1 * N overflows int once N > 46340.
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(ny1) * N + nx1];
    }

    // Every thread of the block must reach the barrier, so it stays outside the guards.
    __syncthreads();

    // Store phase: the roles of threadIdx.x / threadIdx.y are swapped so that
    // threadIdx.x walks along a row of B (coalesced write) while the tile is
    // read column-wise.
    const int nx2 = bx + threadIdx.y;
    const int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        B[static_cast<size_t>(nx2) * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}

/**
 * Tiled transpose of a square, row-major N x N matrix: B[c][r] = A[r][c],
 * using a padded shared-memory tile.
 *
 * Launch layout: blockDim must be (TILE_DIM, TILE_DIM), because the tile is
 * indexed directly with threadIdx; the grid needs ceil(N / TILE_DIM) blocks
 * along both x and y. Each block uses TILE_DIM * (TILE_DIM + 1) elements of
 * static shared memory.
 *
 * The extra column makes each tile row TILE_DIM + 1 elements wide, so the
 * column-wise read S[threadIdx.x][threadIdx.y] steps through addresses
 * TILE_DIM + 1 elements apart. With 4-byte elements and 32 banks, the lanes
 * of a warp then land in distinct banks instead of all sharing one.
 *
 * In-place use (B == A) is not supported: a block overwrites a tile of B that
 * another block may not have read from A yet.
 *
 * @param A  input matrix, N * N elements
 * @param B  output matrix, N * N elements
 * @param N  number of rows and columns
 */
__global__ void transpose2(const real* A, real* B, const int N)
{
    // The +1 column is padding only; it is never read or written.
    __shared__ real S[TILE_DIM][TILE_DIM + 1];

    // Top-left corner of the input tile handled by this block.
    const int bx = blockIdx.x * TILE_DIM;
    const int by = blockIdx.y * TILE_DIM;

    // Load phase: threadIdx.x walks along a row of A, so the read is coalesced.
    const int nx1 = bx + threadIdx.x;
    const int ny1 = by + threadIdx.y;
    if (nx1 < N && ny1 < N)
    {
        // The row offset is computed in size_t: ny1 * N overflows int once N > 46340.
        S[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(ny1) * N + nx1];
    }

    // Every thread of the block must reach the barrier, so it stays outside the guards.
    __syncthreads();

    // Store phase: the roles of threadIdx.x / threadIdx.y are swapped so that
    // threadIdx.x walks along a row of B (coalesced write) while the tile is
    // read column-wise.
    const int nx2 = bx + threadIdx.y;
    const int ny2 = by + threadIdx.x;
    if (nx2 < N && ny2 < N)
    {
        B[static_cast<size_t>(nx2) * N + ny2] = S[threadIdx.x][threadIdx.y];
    }
}

// Prints a diagnostic and terminates the process when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what, const char* file, int line)
{
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA error at %s:%d in '%s': %s\n",
                     file, line, what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Wraps a CUDA runtime call so that its return code is never silently dropped.
#define CUDA_CHECK(call) checkCuda((call), #call, __FILE__, __LINE__)

/**
 * Demo driver: fills an N x N matrix on the host, transposes it on the GPU
 * with transpose2, copies the result back and verifies it on the host.
 *
 * @return EXIT_SUCCESS when the result matches the expected transpose,
 *         EXIT_FAILURE on a mismatch (CUDA errors terminate via CUDA_CHECK).
 */
int main()
{
    const int N = 1024;
    const size_t count = static_cast<size_t>(N) * N;
    const size_t bytes = count * sizeof(real);

    // Host buffers are owned by std::vector, so they are released on every return path.
    std::vector<real> h_A(count);
    std::vector<real> h_B(count);

    // Initialize matrix A with its own linear index; N * N - 1 is still
    // exactly representable in float, so every element is unique.
    for (size_t i = 0; i < count; ++i)
    {
        h_A[i] = static_cast<real>(i);
    }

    real* d_A = NULL;
    real* d_B = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, bytes));
    CUDA_CHECK(cudaMalloc(&d_B, bytes));

    CUDA_CHECK(cudaMemcpy(d_A, &h_A[0], bytes, cudaMemcpyHostToDevice));

    // Ceil-div so that the grid covers N even when it is not a multiple of TILE_DIM.
    const int tiles = (N + TILE_DIM - 1) / TILE_DIM;
    dim3 dimGrid(tiles, tiles, 1);
    dim3 dimBlock(TILE_DIM, TILE_DIM, 1);

    transpose2<<<dimGrid, dimBlock>>>(d_A, d_B, N);
    // A launch does not return a status; invalid configurations surface here.
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy waits for the kernel and reports any error raised while it ran.
    CUDA_CHECK(cudaMemcpy(&h_B[0], d_B, bytes, cudaMemcpyDeviceToHost));

    // Cleanup of device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));

    // Verify on the host. A transpose only moves values, so exact equality is expected.
    size_t mismatches = 0;
    for (int row = 0; row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            const size_t src = static_cast<size_t>(row) * N + col;
            const size_t dst = static_cast<size_t>(col) * N + row;
            if (h_B[dst] != h_A[src])
            {
                ++mismatches;
            }
        }
    }

    if (mismatches != 0)
    {
        std::fprintf(stderr, "transpose2: FAILED, %lu mismatching elements\n",
                     static_cast<unsigned long>(mismatches));
        return EXIT_FAILURE;
    }

    std::printf("transpose2: OK (%d x %d)\n", N, N);
    return EXIT_SUCCESS;
}

// You may also be interested in: (matrices, algorithms)