CUDA Convolution Acceleration (Part 1)

The motivation for this series is simple: one of the main reasons to learn CUDA is to accelerate image processing, and image processing relies heavily on convolution. Convolution can be computed in several ways; this series focuses on the convolution of two-dimensional matrices. Starting from the most primitive method (the one in undergraduate textbooks) and moving up to optimized algorithms suited to large data sets, a parallel design approach is given for each.

With machine learning being so popular, many people also do their image processing in Python. The awkward part is that Python itself has no native support for GPGPU programming, so you end up dropping down to another language anyway. Having a broader set of languages at your disposal is never a bad thing!

Below is the plain-vanilla way of doing convolution in CUDA: a straight port of the most basic convolution algorithm to the GPU. We will call it version two (version one being the original CPU version). The algorithm itself is not difficult, but it exercises some CUDA fundamentals and lays the groundwork for the deeper material to come.

The most primitive way to compute a convolution on a computer is the discrete sum of products, i.e. out[n] = Σ_m f[n-m] · g[m]. The whole convolution then comes down to a couple of nested loops.
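
For reference, this is roughly what that "version one" looks like on the CPU; a minimal sketch of the textbook double loop (the function name cpu_conv and the use of std::vector are my own choices, not from the original post):

#include <vector>
#include <cstddef>

// Naive 1-D discrete convolution: out[n] = sum over m of a[n - m] * b[m].
// The output has length a.size() + b.size() - 1.
std::vector<float> cpu_conv(const std::vector<float> &a, const std::vector<float> &b)
{
    std::vector<float> out(a.size() + b.size() - 1, 0.0f);
    for (size_t n = 0; n < out.size(); ++n)
    {
        for (size_t m = 0; m < b.size(); ++m)
        {
            if (n >= m && n - m < a.size())   // keep the index into a in range
            {
                out[n] += a[n - m] * b[m];
            }
        }
    }
    return out;
}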

Here is the simplest possible CUDA program for the procedure above:

// Naive kernel: one thread per output element, everything read from global memory.
__global__ void conv_kernel(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;   // global output index
    if (tid >= len_out)
    {
        return;
    }

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - m;                     // index into ina for this tap
        if (0 <= k && k < len_a)             // skip out-of-range terms at the borders
        {
            sum += ina[k] * inb[m];
        }
    }
    out[tid] = sum;
}
// Shared-memory version: each block first stages its slice of ina in shared
// memory, so most reads in the inner loop hit fast on-chip memory.
__global__ void conv2_kernel(const float *ina, const float *inb, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int tx = threadIdx.x;                  // thread index within the block
    const size_t len_s = blockDim.x;                      // block width in x = size of the shared tile

    extern __shared__ float sIna[];                       // dynamically sized shared memory

    // Stage this block's portion of ina into shared memory. The out-of-range
    // check is done after the barrier so that every thread reaches __syncthreads().
    if (tid < len_a)
    {
        sIna[tx] = ina[tid];
    }
    __syncthreads();

    if (tid >= len_out)
    {
        return;
    }

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - len_b + m + 1;                      // global index into ina
        int sk = tx - len_b + m + 1;                      // corresponding index into the shared tile
        if (0 <= sk && sk < len_s)
        {
            sum += sIna[sk] * inb[m];                     // fast path: element is in this block's tile
        }
        else
        {
            if (0 <= k && k < len_a)
            {
                sum += ina[k] * inb[m];                   // slow path: fall back to global memory
            }
        }
    }
    out[tid] = sum;
}

// Constant-memory version: the filter coefficients live in __constant__ memory,
// which is cached and broadcast efficiently when all threads read the same
// element. c_b must be filled with cudaMemcpyToSymbol before the launch.
__constant__ static float c_b[1024];
__global__ void conv3_kernel(const float *ina, float *out, size_t len_a, size_t len_b, size_t len_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int tx = threadIdx.x;                  // thread index within the block
    const size_t len_s = blockDim.x;                      // block width in x

    extern __shared__ float sIna[];                       // dynamically sized shared memory

    const float *inb = &c_b[0];                           // read the filter from constant memory

    // Stage this block's portion of ina into shared memory; the barrier is
    // reached by every thread, the out-of-range check comes afterwards.
    if (tid < len_a)
    {
        sIna[tx] = ina[tid];
    }
    __syncthreads();

    if (tid >= len_out)
    {
        return;
    }

    float sum = 0.0f;
    for (int m = 0; m < len_b; ++m)
    {
        int k = tid - len_b + m + 1;
        int sk = tx - len_b + m + 1;
        if (0 <= sk && sk < len_s)
        {
            sum += sIna[sk] * inb[m];                     // element available in the shared tile
        }
        else
        {
            if (0 <= k && k < len_a)
            {
                sum += ina[k] * inb[m];                   // fall back to global memory
            }
        }
    }
    out[tid] = sum;
}
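
For the constant-memory version to work, c_b has to be filled from the host before the kernel is launched. A minimal sketch of that step (the names h_b_ptr, len_b and the launch configuration are assumptions for illustration, not taken from the post):

// Copy the filter taps into constant memory (len_b must not exceed 1024).
// h_b_ptr is assumed to point at len_b floats on the host.
checkCudaErrors( cudaMemcpyToSymbol(c_b, h_b_ptr, len_b * sizeof(float)) );

// Launch with one float of dynamic shared memory per thread, as the kernel expects.
conv3_kernel<<<numBlocks, numThreads, numThreads * sizeof(float)>>>(
    raw_point_a, raw_point_result, len_a, len_b, len_out);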

 
   
Three versions in one go, each built on a classic acceleration technique: the naive kernel that reads everything from global memory, a version that stages the block's slice of the input in shared memory, and a version that additionally keeps the filter coefficients in constant memory.
Below is main():
#include <iostream>
#include <cmath>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <helper_cuda.h>   // checkCudaErrors, from the CUDA samples

using std::cout;
using std::endl;

// N_A, N_B, NUM_THREADS and the conv()/conv2() launch wrappers are defined
// elsewhere in the full source.
int main(int argc, char const *argv[])
{
    thrust::host_vector<float> h_a(N_A);
    thrust::device_vector<float> d_a(N_A);
    thrust::host_vector<float> h_b(N_B);
    thrust::device_vector<float> d_b(N_B);

    size_t N = h_a.size() + h_b.size() - 1;                      // full convolution length
    size_t L = pow( 2, static_cast<int>(log2(N - 1)) + 1 );      // smallest power of two >= N

    thrust::host_vector<float> h_result(N);
    thrust::device_vector<float> d_result(N);

    thrust::sequence(h_a.begin(), h_a.end());
    // thrust::sequence(h_b.begin(), h_b.end());
    thrust::sequence(h_b.rbegin(), h_b.rend());

    // get raw pointers for the kernels
    float *raw_point_a = thrust::raw_pointer_cast( &d_a[0] );
    float *raw_point_b = thrust::raw_pointer_cast( &d_b[0] );
    float *raw_point_result = thrust::raw_pointer_cast( &d_result[0] );

    int numThreads = NUM_THREADS;
    int numBlocks = (L + numThreads - 1) / numThreads;

    cudaEvent_t start, stop;
    checkCudaErrors( cudaEventCreate(&start) );
    checkCudaErrors( cudaEventCreate(&stop) );
    cudaEventRecord(start);

    // copy a and b to the device
    thrust::copy(h_a.begin(), h_a.end(), d_a.begin());
    thrust::copy(h_b.begin(), h_b.end(), d_b.begin());

    // conv(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, L, numBlocks, numThreads);
    conv2(raw_point_a, raw_point_b, raw_point_result, N_A, N_B, N, numBlocks, numThreads);

    thrust::copy(d_result.begin(), d_result.end(), h_result.begin());

    cudaEventRecord(stop);
    checkCudaErrors( cudaEventSynchronize(stop) );   // cudaThreadSynchronize() is deprecated
    float time = 0;
    cudaEventElapsedTime(&time, start, stop);

    cout << "run times: " << time << " ms " << endl;

    // for (thrust::host_vector<float>::iterator i = h_result.begin(); i != h_result.end(); ++i)
    // {
    //     cout << *i << "\t";
    // }
    cout << endl;

    return 0;
}
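
The conv2() wrapper called above is not shown in this post; it is just the host-side launch of conv2_kernel. A minimal sketch of what such a wrapper could look like (an assumed shape, not the original author's code), passing one float of dynamic shared memory per thread as the kernel expects:

void conv2(const float *ina, const float *inb, float *out,
           size_t len_a, size_t len_b, size_t len_out,
           int numBlocks, int numThreads)
{
    // Third launch parameter: size of the dynamic shared-memory tile sIna.
    conv2_kernel<<<numBlocks, numThreads, numThreads * sizeof(float)>>>(
        ina, inb, out, len_a, len_b, len_out);
    checkCudaErrors( cudaGetLastError() );
}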

 
  
