【cuda】——npp/cuda图像预处理resize+norm对比

1. npp

核心代码:

// 2. NPP image preprocessing:
//    upload -> bilinear resize (8u, packed) -> convert to 32f -> scale -> split into planes.
bool keepRation = 0 ,keepCenter= 0;   // not consumed by the NPP calls below
int width_in = img.cols;
int height_in = img.rows;
// The upload below copies img.step[0] bytes per row, so the device image has
// that same row pitch. Passing width_in * 3 to NPP would be wrong whenever the
// host rows are padded (step[0] != cols * 3).
int src_step = static_cast<int>(img.step[0]);
NppiSize srcSize = {width_in, height_in};
NppiRect srcROI = {0, 0, width_in, height_in};
int dst_width = inputDim.d[2];
int dst_height = inputDim.d[1];
NppiSize dstSize = {dst_width, dst_height};
NppiRect dstROI  = {0, 0, dst_width, dst_height};
// NOTE(review): bgr2rgb is declared but no channel swap is performed below;
// plane i simply receives interleaved channel i of the input image.
int bgr2rgb[3] = {2, 1, 0};
Npp32f m_scale[3] = {0.003921569, 0.003921569, 0.003921569};   // ~1/255 per channel

// Line steps in bytes for the intermediate and output images.
const int u8_packed_step  = dst_width * 3;
const int f32_packed_step = dst_width * 3 * static_cast<int>(sizeof(float));
const int f32_plane_step  = dst_width * static_cast<int>(sizeof(float));

// Three consecutive float planes inside mCudaBuffers[0]. The offsets are taken
// on the typed Npp32f pointer (in elements), so they are correct regardless of
// the pointer type mCudaBuffers[0] is declared with.
const int plane_elems = dst_width * dst_height;
Npp32f* r_plane = (Npp32f*)(mCudaBuffers[0]);
Npp32f* g_plane = r_plane + plane_elems;
Npp32f* b_plane = r_plane + 2 * plane_elems;
Npp32f* dst_planes[3] = {r_plane, g_plane, b_plane};

CUDA_CHECK(cudaMemcpy(mCudaImg, img.data, img.step[0]*img.rows, cudaMemcpyHostToDevice));
// NOTE(review): every nppi* call returns an NppStatus that is not inspected
// here; a failing step would go unnoticed.
nppiResize_8u_C3R( (Npp8u*)mCudaImg, src_step, srcSize, srcROI,
                   (Npp8u*)gpu_img_resize_buf, u8_packed_step, dstSize, dstROI,
                   NPPI_INTER_LINEAR);
// Convert the resized image to 32-bit float.
nppiConvert_8u32f_C3R( (Npp8u*)gpu_img_resize_buf, u8_packed_step,
                       (Npp32f*)gpu_img_plane, f32_packed_step, dstSize);
// Multiply every channel by its scale factor, in place.
nppiMulC_32f_C3IR(m_scale, (Npp32f*)gpu_img_plane, f32_packed_step, dstSize);
// De-interleave the packed 3-channel image into the three output planes.
nppiCopy_32f_C3P3R( (Npp32f*)gpu_img_plane, f32_packed_step, dst_planes, f32_plane_step, dstSize );

耗时(输出为累计平均值:每一行是截至当前帧的平均耗时)

cost: 113  ms
cost: 68  ms
cost: 52.6667  ms
cost: 45.5  ms
cost: 41  ms
cost: 37.8333  ms
cost: 35.7143  ms
cost: 34  ms
cost: 32.7778  ms
cost: 31.6  ms
cost: 30.8182  ms
cost: 30.0833  ms
cost: 29.6154  ms
cost: 29.0714  ms
cost: 28.6667  ms
cost: 28.3125  ms
cost: 27.8824  ms
cost: 27.6111  ms
cost: 27.3158  ms
cost: 27.05  ms
cost: 26.8095  ms
cost: 26.5455  ms
cost: 26.3478  ms
cost: 26.125  ms
cost: 25.96  ms
cost: 25.8077  ms
cost: 25.6667  ms
cost: 25.5  ms
cost: 25.3793  ms
cost: 25.2333  ms
cost: 25.129  ms
cost: 25  ms
cost: 24.9091  ms
cost: 24.7941  ms
cost: 24.7143  ms
cost: 24.5833  ms
cost: 24.5135  ms
cost: 24.3947  ms
cost: 24.3077  ms
cost: 24.2  ms
cost: 24.0976  ms

2. cuda

核心代码:

#include 
#include 
#include 
#include 
// Reads the pixel at (x, y) from a tightly packed w-by-h image of uchar3
// (row pitch is exactly w elements) and returns its three channels divided
// by 255. Coordinates outside the image yield the constant (0.5, 0.5, 0.5),
// which acts as the border / padding value for the caller.
__forceinline__ __device__ float3 get(const uchar3* src, int x,int y,int w,int h){
    const bool inside = (x >= 0) && (x < w) && (y >= 0) && (y < h);
    if (!inside) return make_float3(0.5f, 0.5f, 0.5f);
    const uchar3 px = src[y*w + x];
    // Float literals keep the division in single precision; a bare `255.`
    // is a double and would promote every division to double arithmetic.
    return make_float3(px.x / 255.0f, px.y / 255.0f, px.z / 255.0f);
}

// Bilinear resize + normalization, one thread per destination pixel.
//
// Launch layout: 1D grid of 1D blocks covering at least dstW * dstH threads;
// no shared memory is used.
//
//   src            packed uchar3 source image, srcW x srcH, row pitch == srcW
//   dst            planar float output: dst[c * dstW * dstH + y * dstW + x]
//   scaleX/scaleY  source pixels per destination pixel along each axis
//   shiftX/shiftY  offset, in destination pixels, subtracted before scaling
//
// Source samples that fall outside the image take the border value that
// get() returns for out-of-range coordinates.
__global__ void resizeNormKernel(uchar3* src,float *dst,int dstW, int dstH,int srcW,int srcH,
                                                float scaleX, float scaleY,float shiftX, float shiftY) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard on the flat index before any division so surplus threads in the
    // last block (and an empty destination) exit immediately.
    if (idx >= dstW * dstH)
        return;
    const int x = idx % dstW;
    const int y = idx / dstW;

    // Inverse mapping of the scaling transform using pixel-centre alignment
    // (the +0.5 / -0.5 convention, as in OpenCV).
    const float w = (x - shiftX + 0.5f) * scaleX - 0.5f;
    const float h = (y - shiftY + 0.5f) * scaleY - 0.5f;

    // floorf, not an (int) cast: a cast truncates toward zero, so for negative
    // coordinates it would select the wrong neighbours and produce
    // interpolation weights outside [0, 1].
    const int h_low = (int)floorf(h);
    const int w_low = (int)floorf(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional parts and their complements form the four bilinear weights.
    const float lh = h - h_low;
    const float lw = w - w_low;
    const float hh = 1.0f - lh, hw = 1.0f - lw;
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const float3 v1 = get(src,w_low,h_low,srcW,srcH);
    const float3 v2 = get(src,w_high,h_low,srcW,srcH);
    const float3 v3 = get(src,w_low,h_high,srcW,srcH);
    const float3 v4 = get(src,w_high,h_high,srcW,srcH);

    // Write the three channels to three consecutive planes.
    const int stride = dstW*dstH;
    const int pix = y*dstW + x;
    dst[pix]            = w1 * v1.x + w2 * v2.x + w3 * v3.x + w4 * v4.x;
    dst[stride + pix]   = w1 * v1.y + w2 * v2.y + w3 * v3.y + w4 * v4.y;
    dst[stride*2 + pix] = w1 * v1.z + w2 * v2.z + w3 * v3.z + w4 * v4.z;
}

// Enqueues resizeNormKernel on `stream` to resize and normalize one image.
//
//   p            source image passed to the kernel as packed uchar3 with row
//                pitch w; must be dereferenceable from device code
//   d            output buffer for 3 float planes of in_w * in_h elements each
//   w, h         SOURCE image width / height
//   in_w, in_h   DESTINATION width / height (note the naming: these are the
//                dimensions of the produced output, not of `p`)
//   keepration   use one common scale factor (the larger of the two axis
//                ratios) for both axes instead of scaling each independently
//   keepcenter   with keepration, shift so the scaled image is centred
//   stream       CUDA stream the kernel is launched on
//
// Returns 0 (cudaSuccess) when the launch was accepted, otherwise the
// cudaError_t value as an int. The launch is asynchronous: this function
// does not synchronize, so execution errors surface at a later sync point.
int resizeAndNorm(void * p,float *d,int w,int h,int in_w,int in_h, bool keepration ,bool keepcenter,cudaStream_t stream){
    // Reject inputs that would divide by zero below or yield an empty grid.
    if (!p || !d || w <= 0 || h <= 0 || in_w <= 0 || in_h <= 0)
        return static_cast<int>(cudaErrorInvalidValue);

    // Source pixels per destination pixel along each axis.
    float scaleX = (w*1.0f / in_w);
    float scaleY = (h*1.0f / in_h);
    float shiftX = 0.f ,shiftY = 0.f;
    if (keepration) {
        const float common = scaleX > scaleY ? scaleX : scaleY;
        scaleX = scaleY = common;
        if (keepcenter) {
            // Half of the destination extent left uncovered by the scaled source.
            shiftX = (in_w - w/scaleX)/2.f;
            shiftY = (in_h - h/scaleY)/2.f;
        }
    }

    // One thread per destination pixel. 256 threads per block stays well
    // below the per-block limit, unlike the hardware-maximum 1024.
    const int n = in_w*in_h;
    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;
    resizeNormKernel<<<gridSize, blockSize, 0, stream>>>((uchar3*)(p),d,in_w,in_h,w,h,scaleX,scaleY,shiftX,shiftY);

    // A kernel launch reports configuration errors only via cudaGetLastError().
    return static_cast<int>(cudaGetLastError());
}

自己写的cuda函数耗时(输出为累计平均值)

cost: 21  ms
cost: 22  ms
cost: 22  ms
cost: 22.75  ms
cost: 22.8  ms
cost: 23.1667  ms
cost: 23.1429  ms
cost: 23  ms
cost: 23.1111  ms
cost: 22.9  ms
cost: 23  ms
cost: 22.9167  ms
cost: 23.1538  ms
cost: 23.1429  ms
cost: 23  ms
cost: 22.875  ms
cost: 22.7647  ms
cost: 22.6667  ms
cost: 22.6316  ms
cost: 22.55  ms
cost: 22.5714  ms
cost: 22.5  ms
cost: 22.5217  ms
cost: 22.4583  ms
cost: 22.48  ms
cost: 22.4231  ms
cost: 22.4444  ms
cost: 22.3929  ms
cost: 22.4138  ms
cost: 22.3667  ms
cost: 22.3871  ms
cost: 22.3438  ms
cost: 22.3636  ms
cost: 22.3235  ms
cost: 22.3143  ms
cost: 22.25  ms
cost: 22.2162  ms
cost: 22.1579  ms
cost: 22.1538  ms
cost: 22.1  ms
cost: 22.0732  ms

3. 总结

  • 上面打印的是累计平均耗时:npp 首次调用约 113 ms,明显慢于自写cuda算子的首帧(约 21 ms),很可能是首次调用的初始化开销拉高了前期平均值;稳定之后二者的平均耗时相差不大(约 24 ms 对 22 ms)
  • 自写cuda,需要懂c,cuda,数据排列。但是灵活度更大
  • 用npp,只需要知道数据排列。但是灵活度不够。

你可能感兴趣的:(cuda,opencv,cuda,opencv,深度学习,npp)