CUDA: Array Reduction Using Shared Memory

The kernel below has each thread block load 128 elements into shared memory and fold them down to a single partial sum per block.

#include <cuda_runtime.h>
#include <iostream>

// Kernel and problem-size definitions
typedef float real;            // assume real is float
const int N = 1024 * 1024;     // problem size; adjust to your needs

void __global__ reduce_shared(real* d_x, real* d_y)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    __shared__ real s_y[128];               // one slot per thread in the block
    s_y[tid] = (n < N) ? d_x[n] : 0.0f;     // load from global memory, pad with zero
    __syncthreads();

    // Binary folding: halve the number of active threads each iteration,
    // adding the upper half of s_y into the lower half.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        d_y[bid] = s_y[0];    // thread 0 writes this block's partial sum
    }
}

int main() {
    real* h_x = new real[N];
    real* h_y = new real[N / 128];    // one partial sum per block

    for (int i = 0; i < N; ++i) {
        h_x[i] = 1.0f; // for simplicity, initialize every element of h_x to 1.0f
    }

    real* d_x, * d_y;
    cudaMalloc((void**)&d_x, sizeof(real) * N);
    cudaMalloc((void**)&d_y, sizeof(real) * N / 128);

    cudaMemcpy(d_x, h_x, sizeof(real) * N, cudaMemcpyHostToDevice);

    dim3 block(128);    // must match the size of s_y in the kernel
    dim3 grid((N + block.x - 1) / block.x);

    reduce_shared<<<grid, block>>>(d_x, d_y);

    cudaMemcpy(h_y, d_y, sizeof(real) * N / 128, cudaMemcpyDeviceToHost);

    // Print the first partial sum as an example (expected: 128, since each
    // block sums 128 elements that are all 1.0f).
    std::cout << "The first element of result is: " << h_y[0] << std::endl;

    cudaFree(d_x);
    cudaFree(d_y);
    delete[] h_x;
    delete[] h_y;

    return 0;
}
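Note that reduce_shared only produces one partial sum per block; d_y (copied back to h_y) still holds N / 128 values that must be reduced again to obtain the sum of the whole array. Below is a minimal sketch of that final step done on the host, assuming the same h_y, N, and block size of 128 as above; the helper name sum_partials is hypothetical and not part of the original code.

#include <iostream>

// Host-side final pass: reduce the per-block partial sums produced by
// reduce_shared into one total. `partial` and `count` correspond to
// h_y and N / 128 in the program above (hypothetical helper).
double sum_partials(const float* partial, int count)
{
    double total = 0.0;                 // accumulate in double to limit rounding error
    for (int i = 0; i < count; ++i) {
        total += partial[i];
    }
    return total;
}

// Usage in main(), after the cudaMemcpy back to h_y:
//     std::cout << "Total: " << sum_partials(h_y, N / 128) << std::endl;
//     // Expected: 1.04858e+06, i.e. N, since every input element is 1.0f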
