传统的物体检测算法中常用的方式有Sliding Window,通过这种方式进行物体搜索需要做大量的图像缩放,会占用较多的计算资源,本文将介绍如何通过GPU加速图像缩放算法。
较为常用的图像缩放算法有双线性插值法,算法的思想比较简单,可以参考百度百科。
下面是我依据百度百科的介绍实现的GPU和CPU版本的双线性插值算法。具体的加速效果因CPU和GPU的型号而异,也和GPU代码的写法有关;这里水平有限,没有合理利用GPU资源(比如共享内存)。经过多次测试,GPU版本相对CPU版本的加速比在5-20倍之间。不过和OpenCV的版本比较了一下,我的实现速度仍然慢了5倍左右,而且OpenCV的效果也更清晰一些(应该不是传统的双线性插值),看来算法还是关键,不能光拼硬件。
这里代码的执行效率不高,GPU的编程需要对硬件有一定的了解,还有很多可以优化的部分。
// NOTE(review): the header names on the bare `#include` lines were lost (most likely
// stripped as HTML tags when the post was published). The list below is reconstructed
// from the symbols this file uses -- confirm against the original project.
#include <iostream>
#include "time.h"
#include <cmath>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
/*
 * Bilinear resize kernel for an interleaved 3-channel, 8-bit image.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per destination pixel
 * (x indexes columns, y indexes rows). Threads that fall outside the
 * destination image return without touching memory. No shared memory is used.
 *
 * cuda_original_img : source pixels, original_rows * original_cols * 3 bytes,
 *                     rows tightly packed (no padding between rows)
 * cuda_resize_img   : destination pixels, resize_rows * resize_cols * 3 bytes
 * original_cols/rows, resize_cols/rows : image sizes in pixels, all > 0
 */
__global__ void gpu_bilinear(const uchar *cuda_original_img, uchar *cuda_resize_img,
    int original_cols, int original_rows, int resize_cols, int resize_rows){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= resize_cols || y >= resize_rows){
        return;
    }

    // Source coordinates of this destination pixel. The division is done in
    // float so the fractional part survives, and each axis is scaled by its
    // own dimension (x by columns, y by rows).
    float fx = (x + 1) * (float)original_cols / resize_cols - 1.0f;
    float fy = (y + 1) * (float)original_rows / resize_rows - 1.0f;

    // Clamp into the source image so every neighbour index below is valid.
    fx = fminf(fmaxf(fx, 0.0f), (float)(original_cols - 1));
    fy = fminf(fmaxf(fy, 0.0f), (float)(original_rows - 1));

    // Top-left neighbour (A); fx, fy are non-negative so truncation equals floor.
    const int ax = (int)fx;
    const int ay = (int)fy;
    // Right / bottom neighbours, replicated at the image border instead of
    // reading past the end of the row or of the buffer.
    const int bx = (ax + 1 < original_cols) ? ax + 1 : ax;
    const int cy = (ay + 1 < original_rows) ? ay + 1 : ay;

    // Fractional offsets from A; the four weights always sum to 1.
    const float dx = fx - ax;
    const float dy = fy - ay;
    const float wA = (1.0f - dx) * (1.0f - dy);
    const float wB = dx * (1.0f - dy);
    const float wC = (1.0f - dx) * dy;
    const float wD = dx * dy;

    // Byte offsets of the four neighbours and of the destination pixel.
    const int A = (ax + ay * original_cols) * 3;
    const int B = (bx + ay * original_cols) * 3;
    const int C = (ax + cy * original_cols) * 3;
    const int D = (bx + cy * original_cols) * 3;
    const int out = (x + y * resize_cols) * 3;

    for (int c = 0; c < 3; ++c){
        const float value = cuda_original_img[A + c] * wA
            + cuda_original_img[B + c] * wB
            + cuda_original_img[C + c] * wC
            + cuda_original_img[D + c] * wD;
        // value lies in [0, 255]; add 0.5 to round to nearest instead of truncating.
        cuda_resize_img[out + c] = (uchar)(value + 0.5f);
    }
}
// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool gpu_resize_check(cudaError_t status, const char *what){
    if (status == cudaSuccess){
        return true;
    }
    std::cerr << "gpu_resize: " << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

/*
 * Resizes a CV_8UC3 image by `scale` on the GPU using the gpu_bilinear kernel.
 *
 * input_img : non-empty CV_8UC3 image (copied to a continuous buffer if needed)
 * scale     : factor applied to both width and height; the output size is
 *             int(cols*scale) x int(rows*scale)
 * returns   : the resized image, or an empty cv::Mat when the input is
 *             unsupported or any CUDA call fails (the reason is printed to stderr)
 */
cv::Mat gpu_resize(cv::Mat input_img, float scale){
    const int resize_w = int(input_img.cols*scale);
    const int resize_h = int(input_img.rows*scale);
    if (input_img.empty() || input_img.type() != CV_8UC3 || resize_w <= 0 || resize_h <= 0){
        std::cerr << "gpu_resize: expected a non-empty CV_8UC3 image and a positive output size" << std::endl;
        return cv::Mat();
    }
    // The upload below copies one flat buffer, so rows must not be padded.
    if (!input_img.isContinuous()){
        input_img = input_img.clone();
    }

    cv::Mat resize_img(resize_h, resize_w, CV_8UC3);
    const size_t resize_size = (size_t)resize_w * resize_h * 3;
    const size_t original_size = (size_t)input_img.cols * input_img.rows * 3;

    uchar *cuda_resize_img = 0;
    uchar *cuda_original_img = 0;

    bool ok = gpu_resize_check(cudaMalloc((void **)&cuda_resize_img, resize_size * sizeof(uchar)),
        "cudaMalloc(resized image)");
    ok = ok && gpu_resize_check(cudaMalloc((void **)&cuda_original_img, original_size * sizeof(uchar)),
        "cudaMalloc(original image)");
    ok = ok && gpu_resize_check(cudaMemcpy(cuda_original_img, input_img.data, original_size * sizeof(uchar),
        cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok){
        // 16 x 16 = 256 threads per block: a multiple of the warp size (32) and
        // within the per-block thread limit. A 256 x 256 block would exceed that
        // limit and make the launch fail.
        const int BLOCK_SIZE = 16;
        dim3 blocks(BLOCK_SIZE, BLOCK_SIZE);
        // Ceil-divide the OUTPUT size: one thread per destination pixel.
        dim3 grids((resize_w + BLOCK_SIZE - 1) / BLOCK_SIZE, (resize_h + BLOCK_SIZE - 1) / BLOCK_SIZE);
        gpu_bilinear<<<grids, blocks>>>(cuda_original_img, cuda_resize_img,
            input_img.cols, input_img.rows, resize_w, resize_h);
        // Launch-configuration errors are only visible through cudaGetLastError().
        ok = gpu_resize_check(cudaGetLastError(), "gpu_bilinear launch");
    }
    // This blocking copy also surfaces errors raised while the kernel was running.
    ok = ok && gpu_resize_check(cudaMemcpy(resize_img.data, cuda_resize_img, resize_size * sizeof(uchar),
        cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    // Always release device memory; cudaFree on a null pointer is a no-op.
    cudaFree(cuda_resize_img);
    cudaFree(cuda_original_img);
    return ok ? resize_img : cv::Mat();
}
/*
 * Writes one bilinearly interpolated pixel into resize_img.
 *
 * orign_img   : source image; accessed as 3 interleaved 8-bit channels
 * resize_img  : destination image, same channel layout; must already be allocated
 * original_w  : source x coordinate (column, may be fractional)
 * original_h  : source y coordinate (row, may be fractional)
 * resize_w    : destination column to write
 * resize_h    : destination row to write
 *
 * Coordinates outside the source are clamped to its border, and the right /
 * bottom neighbours are replicated at the edge so no read leaves the image.
 */
void cpu_bilinear(cv::Mat const orign_img, cv::Mat &resize_img, float original_w, float original_h, int resize_w, int resize_h){
    const int last_col = orign_img.cols - 1;
    const int last_row = orign_img.rows - 1;

    if (original_w < 0.0f){
        original_w = 0.0f;
    }
    else if (original_w > last_col){
        original_w = (float)last_col;
    }
    if (original_h < 0.0f){
        original_h = 0.0f;
    }
    else if (original_h > last_row){
        original_h = (float)last_row;
    }

    // Top-left neighbour; the coordinates are non-negative so truncation equals floor.
    const int x0 = (int)original_w;
    const int y0 = (int)original_h;
    const int x1 = (x0 < last_col) ? x0 + 1 : x0;
    const int y1 = (y0 < last_row) ? y0 + 1 : y0;

    // Fractional distance from the top-left neighbour along each axis.
    const float dx = original_w - x0;
    const float dy = original_h - y0;

    // Row pointers honour the Mat's row stride, so padded rows are handled too.
    const uchar *top = orign_img.ptr<uchar>(y0);
    const uchar *bottom = orign_img.ptr<uchar>(y1);
    uchar *out = resize_img.ptr<uchar>(resize_h) + resize_w * 3;

    for (int c = 0; c < 3; ++c){
        // Interpolate horizontally on both rows, then vertically between them.
        const float upper = top[x0 * 3 + c] * (1.0f - dx) + top[x1 * 3 + c] * dx;
        const float lower = bottom[x0 * 3 + c] * (1.0f - dx) + bottom[x1 * 3 + c] * dx;
        // The result lies in [0, 255]; add 0.5 to round to nearest.
        out[c] = uchar(upper * (1.0f - dy) + lower * dy + 0.5f);
    }
}

/*
 * Resizes a 3-channel 8-bit image by `scale` on the CPU with bilinear interpolation.
 *
 * input_img : source image (read as 3 interleaved 8-bit channels)
 * scale     : factor applied to both width and height
 * returns   : a CV_8UC3 image of size int(cols*scale) x int(rows*scale), or an
 *             empty cv::Mat when the input is empty or the output size is not positive
 */
cv::Mat cpu_resize(cv::Mat input_img, float scale){
    int resize_w = static_cast<int>(input_img.cols*scale);
    int resize_h = static_cast<int>(input_img.rows*scale);
    if (input_img.empty() || resize_w <= 0 || resize_h <= 0){
        return cv::Mat();
    }
    cv::Mat resize_img(resize_h, resize_w, CV_8UC3);

    // Source distance covered by one destination pixel. Computed in float:
    // integer division here would discard the fractional part and reduce the
    // whole resize to nearest-neighbour sampling.
    const float step_w = (float)input_img.cols / resize_w;
    const float step_h = (float)input_img.rows / resize_h;

    for (int i = 0; i < resize_h; ++i){
        const float original_h = i * step_h;
        for (int j = 0; j < resize_w; ++j){
            const float original_w = j * step_w;
            cpu_bilinear(input_img, resize_img, original_w, original_h, j, i);
        }
    }
    return resize_img;
}
/*
 * Demo driver: times 100 GPU resizes of ./1.jpg, then shows the OpenCV, CPU
 * and GPU results side by side. Returns 1 when the image cannot be loaded.
 */
int main()
{
    // The first CUDA call pays the one-off runtime initialisation cost;
    // do it here so it is not counted in the timing below.
    uchar *test;
    cudaMalloc(&test, sizeof(uchar));
    cudaFree(test);

    Mat img = imread("./1.jpg");
    if (img.empty()){
        std::cerr << "failed to load ./1.jpg" << std::endl;
        return 1;
    }

    clock_t start, finish;
    double duration;
    Mat resize_c;
    Mat resize_g;
    Mat resize_cv;
    const float scale = 1.789f;
    // cv::Size takes (width, height), i.e. (cols, rows); truncate like the
    // CPU/GPU versions do so all three outputs have the same size.
    const Size target_size(int(img.cols*scale), int(img.rows*scale));

    start = clock();
    for (int i = 0; i < 100; ++i){
        //resize_c = cpu_resize(img, scale);
        resize_g = gpu_resize(img, scale);
        //resize(img, resize_cv, target_size);
    }
    finish = clock();

    // Print the timing before blocking on the display windows.
    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    cout << duration << endl;

    resize_c = cpu_resize(img, scale);
    resize_g = gpu_resize(img, scale);
    resize(img, resize_cv, target_size);

    imshow("Opencv", resize_cv);
    // imshow rejects empty images, so skip a result that failed to compute.
    if (!resize_c.empty()){
        imshow("Cpu", resize_c);
    }
    if (!resize_g.empty()){
        imshow("Gpu", resize_g);
    }
    // Wait for a key press; a 1 ms wait would close the windows immediately.
    waitKey(0);
    //system("pause");
    return 0;
}
下面贴一个CMakeLists.txt,有些同学可能喜欢在Linux下操作。Linux下的编译速度大概比Windows快10倍,实在没法忍受在Windows下调试。另外,有时可能因为显存的问题,正确的代码运行结果也会出错+_+
# Build script for the CUDA + OpenCV resize demo: compiles test.cu into test_cuda.
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
PROJECT(test_cuda)
FIND_PACKAGE(CUDA REQUIRED)
FIND_PACKAGE(OpenCV REQUIRED)
# Use the include directories reported by the find modules instead of a
# hard-coded /usr/local/cuda-7.5 path, so the build follows whichever
# toolkit and OpenCV installation were actually found.
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS} ${OpenCV_INCLUDE_DIRS})
# FindCUDA expects multiple nvcc flags as a list (semicolon separated),
# not as a single space-separated string.
set(CUDA_NVCC_FLAGS_DEBUG -g -G)
set(CMAKE_CXX_FLAGS_DEBUG "-g")
# Target GPU architectures. NOTE(review): compute_20/sm_20 is only accepted by
# older CUDA toolkits -- drop that line when building with a newer one.
set(GENCODE -gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_35,code=compute_35)
set(GENCODE ${GENCODE} -gencode=arch=compute_20,code=sm_20)
CUDA_ADD_EXECUTABLE(test_cuda test.cu OPTIONS ${GENCODE})
target_link_libraries(test_cuda ${OpenCV_LIBS})
有点偷懒,几乎只贴了代码。GPU的调试比较麻烦,之前因为一些边界问题调得有些烦……