双线性插值GPU加速

传统的物体检测算法中常用的方式有Sliding Window,通过这种方式进行物体搜索需要做大量的图像缩放,会占用较多的计算资源,本文将介绍如何通过GPU加速图像缩放算法。
较为常用的图像缩放算法有双线性插值法,算法的思想比较简单,可以参考百度百科。

下面是我依据百度百科的介绍实现的GPU和CPU版本的双线性插值算法。具体的加速效果根据CPU和GPU的型号有一定差异,跟GPU代码的写法也有一定的关系,这里水平有限,没有合理利用GPU资源(比如共享内存)。经过多次测试,GPU版本相对CPU版本的加速比在5-20倍之间。不过和opencv的版本比较了一下,我的实现比opencv慢了5倍左右,而且opencv的效果也更清晰一些(应该不是传统的双线性插值),看来算法还是关键,不能光拼硬件。
这里代码的执行效率不高,GPU的编程需要对硬件有一定的了解,还有很多可以优化的部分。

// NOTE(review): the original header names were lost when this listing was
// extracted (only "#include " remained). The list below is reconstructed from
// the names the code uses - confirm against the original project.
#include <iostream>
#include <stdexcept>
#include <string>
#include "time.h"
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>

using namespace std;
using namespace cv;

// Bilinear resize kernel: one thread computes one destination pixel.
//
// Launch layout: 2D grid of 2D blocks covering at least
// resize_cols x resize_rows threads; surplus threads exit immediately.
// No shared memory is used.
//
// Both images are indexed as tightly packed, interleaved 3-channel 8-bit
// buffers: pixel (col, row) channel c lives at (col + row*cols)*3 + c.
//
// Source coordinates use the pixel-centre convention
//   src = (dst + 0.5) * (src_size / dst_size) - 0.5
// and are clamped to the image so the four neighbours are always in bounds.
//
// cuda_original_img : device pointer to the source image (read only)
// cuda_resize_img   : device pointer to the destination image (written)
// original_cols/rows: source width / height in pixels
// resize_cols/rows  : destination width / height in pixels
__global__ void gpu_bilinear(const uchar *cuda_original_img, uchar *cuda_resize_img,
                            int original_cols, int original_rows, int resize_cols, int resize_rows){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= resize_cols || y >= resize_rows || original_cols <= 0 || original_rows <= 0){
        return;
    }

    // Map the destination pixel centre back into the source image. The x axis
    // scales with the column counts and the y axis with the row counts; the
    // arithmetic is done in float so the fractional part is kept.
    float fx = (x + 0.5f) * original_cols / resize_cols - 0.5f;
    float fy = (y + 0.5f) * original_rows / resize_rows - 0.5f;
    fx = fminf(fmaxf(fx, 0.0f), original_cols - 1.0f);
    fy = fminf(fmaxf(fy, 0.0f), original_rows - 1.0f);

    // Top-left neighbour; fx/fy are non-negative here so truncation is floor.
    const int x0 = int(fx);
    const int y0 = int(fy);
    // Right/bottom neighbours, clamped so the last row/column never reads past the image.
    const int x1 = min(x0 + 1, original_cols - 1);
    const int y1 = min(y0 + 1, original_rows - 1);

    // Fractional distances from the top-left neighbour, each in [0, 1).
    const float wx = fx - x0;
    const float wy = fy - y0;

    const int A = (x0 + y0*original_cols) * 3;  // top-left
    const int B = (x1 + y0*original_cols) * 3;  // top-right
    const int C = (x0 + y1*original_cols) * 3;  // bottom-left
    const int D = (x1 + y1*original_cols) * 3;  // bottom-right
    const int out = (x + y*resize_cols) * 3;

    for (int c = 0; c < 3; ++c){
        const float top = cuda_original_img[A + c] * (1.0f - wx) + cuda_original_img[B + c] * wx;
        const float bottom = cuda_original_img[C + c] * (1.0f - wx) + cuda_original_img[D + c] * wx;
        const float value = top * (1.0f - wy) + bottom * wy;
        // The weights form a convex combination, so value stays within
        // [0, 255]; add 0.5 to round to nearest instead of truncating.
        cuda_resize_img[out + c] = (uchar)(value + 0.5f);
    }
}


// Resizes a 3-channel 8-bit image on the GPU with gpu_bilinear.
//
// input_img : source image, must be CV_8UC3
// scale     : scale factor applied to both width and height
//
// Returns a new CV_8UC3 image of size int(cols*scale) x int(rows*scale), or an
// empty cv::Mat when the input is empty or the scaled size is not positive.
// Throws std::runtime_error if any CUDA call (allocation, copy, launch) fails.
cv::Mat gpu_resize(cv::Mat input_img, float scale){
    const int resize_w = int(input_img.cols*scale);
    const int resize_h = int(input_img.rows*scale);
    if (input_img.empty() || resize_w <= 0 || resize_h <= 0){
        return cv::Mat();
    }
    CV_Assert(input_img.type() == CV_8UC3);
    // The kernel indexes the buffer as tightly packed rows, so a ROI/padded
    // image has to be compacted before it is uploaded.
    if (!input_img.isContinuous()){
        input_img = input_img.clone();
    }
    cv::Mat resize_img(resize_h, resize_w, CV_8UC3);

    const size_t resize_size = size_t(resize_w) * resize_h * 3 * sizeof(uchar);
    const size_t original_size = size_t(input_img.cols) * input_img.rows * 3 * sizeof(uchar);

    uchar *cuda_resize_img = 0;
    uchar *cuda_original_img = 0;

    // Run the steps in order, stopping at the first failure; the buffers are
    // released on every path below.
    cudaError_t status = cudaMalloc(&cuda_resize_img, resize_size);
    if (status == cudaSuccess){
        status = cudaMalloc(&cuda_original_img, original_size);
    }
    if (status == cudaSuccess){
        status = cudaMemcpy(cuda_original_img, input_img.data, original_size, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess){
        // 16 x 16 = 256 threads per block: a multiple of the warp size and
        // below the per-block thread limit (a 256 x 256 block cannot launch).
        const int BLOCK_SIZE = 16;
        const dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
        // Ceil-divide the OUTPUT size: one thread per destination pixel.
        const dim3 grids((resize_w + BLOCK_SIZE - 1) / BLOCK_SIZE,
                         (resize_h + BLOCK_SIZE - 1) / BLOCK_SIZE);
        gpu_bilinear<<<grids, blocks>>>(cuda_original_img, cuda_resize_img,
                                        input_img.cols, input_img.rows, resize_w, resize_h);
        // Kernel launches do not return a status; fetch launch errors explicitly.
        status = cudaGetLastError();
    }
    if (status == cudaSuccess){
        // Blocking copy: also waits for the kernel and reports execution errors.
        status = cudaMemcpy(resize_img.data, cuda_resize_img, resize_size, cudaMemcpyDeviceToHost);
    }

    // cudaFree accepts a null pointer, so this is safe after a partial failure.
    cudaFree(cuda_resize_img);
    cudaFree(cuda_original_img);

    if (status != cudaSuccess){
        throw std::runtime_error(std::string("gpu_resize: CUDA error: ") + cudaGetErrorString(status));
    }
    return resize_img;
}

// Writes one bilinearly interpolated pixel into resize_img.
//
// orign_img  : source image, read as 3 interleaved 8-bit channels
// resize_img : destination image, written as 3 interleaved 8-bit channels
// original_w : source x coordinate (column, may be fractional)
// original_h : source y coordinate (row, may be fractional)
// resize_w   : destination column to write
// resize_h   : destination row to write
//
// The source coordinate is clamped to the image, so the four neighbours read
// here are always inside orign_img. Does nothing when orign_img is empty.
void cpu_bilinear(cv::Mat const orign_img, cv::Mat &resize_img, float original_w, float original_h, int resize_w, int resize_h){
    if (orign_img.empty()){
        return;
    }
    const float max_x = float(orign_img.cols - 1);
    const float max_y = float(orign_img.rows - 1);
    const float fx = original_w < 0.0f ? 0.0f : (original_w > max_x ? max_x : original_w);
    const float fy = original_h < 0.0f ? 0.0f : (original_h > max_y ? max_y : original_h);

    // Top-left neighbour; fx/fy are non-negative so truncation equals floor.
    const int x0 = int(fx);
    const int y0 = int(fy);
    // Right/bottom neighbours, clamped at the last column/row.
    const int x1 = x0 + 1 < orign_img.cols ? x0 + 1 : x0;
    const int y1 = y0 + 1 < orign_img.rows ? y0 + 1 : y0;

    // Fractional distances from the top-left neighbour, each in [0, 1).
    const float wx = fx - x0;
    const float wy = fy - y0;

    // Row pointers honour the matrix row step, so non-continuous images work.
    const uchar *top = orign_img.ptr<uchar>(y0);
    const uchar *bottom = orign_img.ptr<uchar>(y1);
    uchar *dst = resize_img.ptr<uchar>(resize_h) + resize_w * 3;

    for (int c = 0; c < 3; ++c){
        const float upper = top[x0 * 3 + c] * (1.0f - wx) + top[x1 * 3 + c] * wx;
        const float lower = bottom[x0 * 3 + c] * (1.0f - wx) + bottom[x1 * 3 + c] * wx;
        const float value = upper * (1.0f - wy) + lower * wy;
        // Convex combination of 8-bit values: stays in [0, 255]; round to nearest.
        dst[c] = uchar(value + 0.5f);
    }
}

// Resizes a 3-channel 8-bit image on the CPU, one cpu_bilinear call per
// destination pixel.
//
// input_img : source image
// scale     : scale factor applied to both width and height
//
// Returns a new CV_8UC3 image of size int(cols*scale) x int(rows*scale), or an
// empty cv::Mat when the input is empty or the scaled size is not positive.
cv::Mat cpu_resize(cv::Mat input_img, float scale){
    const int resize_w = static_cast<int>(input_img.cols*scale);
    const int resize_h = static_cast<int>(input_img.rows*scale);
    if (input_img.empty() || resize_w <= 0 || resize_h <= 0){
        return cv::Mat();
    }
    cv::Mat resize_img(resize_h, resize_w, CV_8UC3);

    // Source pixels per destination pixel, computed in float so the
    // fractional part of the source coordinate is kept.
    const float ratio_x = static_cast<float>(input_img.cols) / resize_w;
    const float ratio_y = static_cast<float>(input_img.rows) / resize_h;
    const float max_x = static_cast<float>(input_img.cols - 1);
    const float max_y = static_cast<float>(input_img.rows - 1);

    for (int i = 0; i < resize_h; ++i){
        // Pixel-centre mapping, clamped to the valid source rows.
        float original_h = (i + 0.5f) * ratio_y - 0.5f;
        original_h = original_h < 0.0f ? 0.0f : (original_h > max_y ? max_y : original_h);
        for (int j = 0; j < resize_w; ++j){
            float original_w = (j + 0.5f) * ratio_x - 0.5f;
            original_w = original_w < 0.0f ? 0.0f : (original_w > max_x ? max_x : original_w);
            cpu_bilinear(input_img, resize_img, original_w, original_h, j, i);
        }
    }
    return resize_img;
}

int main()
{
    uchar *test;
    cudaMalloc(&test, sizeof(uchar));//第一次初始化时间比较久
    cudaFree(test);
    Mat img = imread("./1.jpg");
    clock_t start, finish;
    double   duration;
    start = clock();
    Mat resize_c;
    Mat resize_g;
    Mat resize_cv;
    float scale = 1.789;
    for (int i = 0; i < 100; ++i){
        //resize_c = cpu_resize(img, scale);
        resize_g = gpu_resize(img, scale);
        //resize(img, resize_cv, Size(img.rows*scale, img.cols*scale));

    }
    finish = clock();
    resize_c = cpu_resize(img, scale);
    resize_g = gpu_resize(img, scale);
    resize(img, resize_cv, Size(img.rows*scale, img.cols*scale));
    imshow("Opencv", resize_cv);
    imshow("Cpu", resize_c);
    imshow("Gpu", resize_g);
    cvWaitKey(1);
    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    cout << duration << endl;
    //system("pause");
    return 0;
}

下面贴一个CMakeLists.txt,有些同学可能喜欢在linux下操作。linux下的编译速度比windows快了10倍的样子吧,实在没法忍受在windows下调试。而且在windows下有时可能因为显存的问题,正常的代码运行结果也会出错+_+

# Build script for the bilinear-resize demo: compiles test.cu with nvcc and
# links it against OpenCV.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(test_cuda)


FIND_PACKAGE(CUDA REQUIRED)
FIND_PACKAGE(OpenCV REQUIRED)

# Use the include paths reported by the packages found above rather than a
# hard-coded /usr/local/cuda-7.5 path, so the build follows whichever toolkit
# and OpenCV installation were actually located.
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS})

set(CUDA_NVCC_FLAGS_DEBUG "-g -G")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
# GPU architectures to generate code for.
# NOTE(review): recent CUDA toolkits no longer accept compute_20 / compute_30;
# adjust this list to the installed toolkit and target GPU.
set(GENCODE -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_35,code=compute_35)
set(GENCODE ${GENCODE} -gencode=arch=compute_20,code=sm_20)

CUDA_ADD_EXECUTABLE(test_cuda test.cu OPTIONS ${GENCODE})
target_link_libraries(test_cuda ${OpenCV_LIBS})

有点偷懒,几乎是只贴了代码。gpu的调试比较麻烦,之前因为一些边界问题调的有些烦。。。

你可能感兴趣的:(cuda)