核心代码（NPP 实现）:
// 2. NPP image preprocessing.
// Uploads the host image, resizes it to the network input size, converts it to
// float, multiplies every channel by ~1/255 and finally splits the interleaved
// pixels into three consecutive planes starting at mCudaBuffers[0].
// NOTE(review): assumes img holds 8-bit, 3-channel interleaved pixels and that
// mCudaImg, gpu_img_resize_buf, gpu_img_plane and mCudaBuffers[0] are device
// buffers large enough for the sizes used below — TODO confirm at the call site.
// NOTE(review): the NppStatus returned by the nppi* calls is not checked here;
// wire it into the surrounding function's error handling.
bool keepRation = false, keepCenter = false;  // not used by the NPP path shown here

// Source geometry. The row pitch comes from the Mat itself so that the step
// passed to NPP matches the layout of the bytes uploaded by cudaMemcpy below,
// even when rows carry padding.
int width_in = img.cols;
int height_in = img.rows;
const int src_step = static_cast<int>(img.step[0]);
NppiSize srcSize = {width_in, height_in};
NppiRect srcROI = {0, 0, width_in, height_in};

// Destination geometry, read from the network input dimensions.
int dst_width = inputDim.d[2];
int dst_height = inputDim.d[1];
NppiSize dstSize = {dst_width, dst_height};
NppiRect dstROI = {0, 0, dst_width, dst_height};
const int dst_step_8u = dst_width * 3;
const int dst_step_32f = static_cast<int>(dst_width * 3 * sizeof(Npp32f));
const int plane_step_32f = static_cast<int>(dst_width * sizeof(Npp32f));

// NOTE(review): bgr2rgb is not referenced below; channels are written to the
// planes in source order. If img is BGR, the plane named r_plane receives the
// first (blue) channel — confirm this is what the network expects.
int bgr2rgb[3] = {2, 1, 0};
Npp32f m_scale[3] = {0.003921569f, 0.003921569f, 0.003921569f};

// Plane pointers are offset in float elements from the typed base pointer, so
// the result does not depend on the pointer type stored in mCudaBuffers.
const size_t plane_elems = static_cast<size_t>(dst_width) * dst_height;
Npp32f* r_plane = (Npp32f*)(mCudaBuffers[0]);
Npp32f* g_plane = r_plane + plane_elems;
Npp32f* b_plane = r_plane + 2 * plane_elems;
Npp32f* dst_planes[3] = {r_plane, g_plane, b_plane};

// Host -> device upload of the whole pitched image.
CUDA_CHECK(cudaMemcpy(mCudaImg, img.data, img.step[0]*img.rows, cudaMemcpyHostToDevice));

// Bilinear resize of the interleaved 8-bit image.
nppiResize_8u_C3R( (Npp8u*)mCudaImg, src_step, srcSize, srcROI,
                   (Npp8u*)gpu_img_resize_buf, dst_step_8u, dstSize, dstROI,
                   NPPI_INTER_LINEAR);
// Convert to 32-bit float.
nppiConvert_8u32f_C3R( (Npp8u*)gpu_img_resize_buf, dst_step_8u,
                       (Npp32f*)gpu_img_plane, dst_step_32f, dstSize);
// Multiply every channel by its scale factor, in place.
nppiMulC_32f_C3IR(m_scale, (Npp32f*)gpu_img_plane, dst_step_32f, dstSize);
// Interleaved (C3) -> planar (P3) copy into the output buffer.
nppiCopy_32f_C3P3R( (Npp32f*)gpu_img_plane, dst_step_32f, dst_planes, plane_step_32f, dstSize );
耗时（NPP 实现）:
cost: 113 ms
cost: 68 ms
cost: 52.6667 ms
cost: 45.5 ms
cost: 41 ms
cost: 37.8333 ms
cost: 35.7143 ms
cost: 34 ms
cost: 32.7778 ms
cost: 31.6 ms
cost: 30.8182 ms
cost: 30.0833 ms
cost: 29.6154 ms
cost: 29.0714 ms
cost: 28.6667 ms
cost: 28.3125 ms
cost: 27.8824 ms
cost: 27.6111 ms
cost: 27.3158 ms
cost: 27.05 ms
cost: 26.8095 ms
cost: 26.5455 ms
cost: 26.3478 ms
cost: 26.125 ms
cost: 25.96 ms
cost: 25.8077 ms
cost: 25.6667 ms
cost: 25.5 ms
cost: 25.3793 ms
cost: 25.2333 ms
cost: 25.129 ms
cost: 25 ms
cost: 24.9091 ms
cost: 24.7941 ms
cost: 24.7143 ms
cost: 24.5833 ms
cost: 24.5135 ms
cost: 24.3947 ms
cost: 24.3077 ms
cost: 24.2 ms
cost: 24.0976 ms
核心代码（自己写的 CUDA kernel 实现）:
#include
#include
#include
#include
// Reads source pixel (x, y) and returns its three channels divided by 255.
//
// src  : pixels indexed as src[y * w + x], i.e. rows of w uchar3 with no padding.
// x, y : pixel coordinates; anything outside [0, w) x [0, h) is not read and
//        yields the constant (0.5, 0.5, 0.5) instead.
// w, h : source width and height in pixels.
//
// Float literals are used throughout so the divisions stay in single precision.
__forceinline__ __device__ float3 get(const uchar3* src, int x, int y, int w, int h) {
    if (x < 0 || x >= w || y < 0 || y >= h) {
        return make_float3(0.5f, 0.5f, 0.5f);
    }
    const uchar3 px = src[y * w + x];
    return make_float3(float(px.x) / 255.0f,
                       float(px.y) / 255.0f,
                       float(px.z) / 255.0f);
}
// Bilinear resize + normalisation from interleaved 8-bit pixels to planar float.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel;
// threads whose flat index is >= dstW * dstH exit immediately. No shared memory.
//
// src            : source pixels, read through get() as src[y * srcW + x].
// dst            : output of 3 * dstW * dstH floats; channel c of pixel (x, y)
//                  is written to dst[c * dstW * dstH + y * dstW + x].
// dstW, dstH     : destination size in pixels.
// srcW, srcH     : source size in pixels.
// scaleX, scaleY : factors applied when mapping a destination coordinate back
//                  to a source coordinate.
// shiftX, shiftY : offsets subtracted from the destination coordinate before
//                  scaling.
//
// Samples that fall outside the source are supplied by get(), which returns
// 0.5 per channel for out-of-range coordinates.
__global__ void resizeNormKernel(uchar3* src,float *dst,int dstW, int dstH,int srcW,int srcH,
                                 float scaleX, float scaleY,float shiftX, float shiftY) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int pixels = dstW * dstH;
    // Guard before the division below so an empty destination cannot divide by zero.
    if (idx >= pixels)
        return;
    const int x = idx % dstW;
    const int y = idx / dstW;

    // Inverse mapping with pixel-centre alignment (the +0.5 / -0.5 terms).
    const float w = (x - shiftX + 0.5f) * scaleX - 0.5f;
    const float h = (y - shiftY + 0.5f) * scaleY - 0.5f;

    // floorf, not a plain int cast: a cast truncates toward zero, which for
    // negative coordinates picks the wrong neighbours and produces
    // interpolation weights outside [0, 1].
    const int h_low = (int)floorf(h);
    const int w_low = (int)floorf(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    // Fractional distances to the low neighbours and their complements.
    const float lh = h - h_low;
    const float lw = w - w_low;
    const float hh = 1.0f - lh, hw = 1.0f - lw;
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const float3 v1 = get(src, w_low,  h_low,  srcW, srcH);
    const float3 v2 = get(src, w_high, h_low,  srcW, srcH);
    const float3 v3 = get(src, w_low,  h_high, srcW, srcH);
    const float3 v4 = get(src, w_high, h_high, srcW, srcH);

    // idx == y * dstW + x; each channel goes to its own plane.
    dst[idx]              = w1 * v1.x + w2 * v2.x + w3 * v3.x + w4 * v4.x;
    dst[pixels + idx]     = w1 * v1.y + w2 * v2.y + w3 * v3.y + w4 * v4.y;
    dst[pixels * 2 + idx] = w1 * v1.z + w2 * v2.z + w3 * v3.z + w4 * v4.z;
}
// Launches resizeNormKernel on `stream` to resize and normalise an image.
//
// p           : source pixels; passed to the kernel as uchar3* and so must be
//               a pointer the kernel can dereference.
// d           : output buffer, passed to the kernel as its float destination.
// w, h        : SOURCE image size (forwarded as the kernel's srcW / srcH).
// in_w, in_h  : DESTINATION size (forwarded as the kernel's dstW / dstH); one
//               thread is launched per destination pixel.
// keepration  : when true, both axes use the larger of the two scale factors.
// keepcenter  : when true together with keepration, shifts the mapping by half
//               of the destination extent left uncovered on each axis.
// stream      : CUDA stream for the launch; the call does not synchronise.
//
// Returns 0 on success, -1 for null pointers, non-positive sizes or a
// destination too large for the kernel's 32-bit index, otherwise the
// cudaError_t value reported by cudaGetLastError() after the launch.
int resizeAndNorm(void * p,float *d,int w,int h,int in_w,int in_h, bool keepration ,bool keepcenter,cudaStream_t stream){
    // Reject inputs that would divide by zero or produce an empty grid.
    if (p == nullptr || d == nullptr || w <= 0 || h <= 0 || in_w <= 0 || in_h <= 0) {
        return -1;
    }

    // Source-over-destination ratio per axis.
    float scaleX = static_cast<float>(w) / in_w;
    float scaleY = static_cast<float>(h) / in_h;
    float shiftX = 0.f, shiftY = 0.f;
    if (keepration) {
        scaleX = scaleY = (scaleX > scaleY ? scaleX : scaleY);
    }
    if (keepration && keepcenter) {
        shiftX = (in_w - w / scaleX) / 2.f;
        shiftY = (in_h - h / scaleY) / 2.f;
    }

    // 256 threads per block keeps the launch valid regardless of how many
    // registers the kernel needs per thread.
    const int blockSize = 256;
    // Count destination pixels in 64 bits; the kernel indexes with int.
    const long long n = static_cast<long long>(in_w) * in_h;
    if (n > 2147483647LL - blockSize) {
        return -1;
    }
    const int gridSize = static_cast<int>((n + blockSize - 1) / blockSize);

    resizeNormKernel<<<gridSize, blockSize, 0, stream>>>((uchar3*)(p),d,in_w,in_h,w,h,scaleX,scaleY,shiftX,shiftY);

    // Kernel launches do not return a status; pick up launch errors here.
    const cudaError_t err = cudaGetLastError();
    return err == cudaSuccess ? 0 : static_cast<int>(err);
}
耗时（自己写的 CUDA 函数）:
cost: 21 ms
cost: 22 ms
cost: 22 ms
cost: 22.75 ms
cost: 22.8 ms
cost: 23.1667 ms
cost: 23.1429 ms
cost: 23 ms
cost: 23.1111 ms
cost: 22.9 ms
cost: 23 ms
cost: 22.9167 ms
cost: 23.1538 ms
cost: 23.1429 ms
cost: 23 ms
cost: 22.875 ms
cost: 22.7647 ms
cost: 22.6667 ms
cost: 22.6316 ms
cost: 22.55 ms
cost: 22.5714 ms
cost: 22.5 ms
cost: 22.5217 ms
cost: 22.4583 ms
cost: 22.48 ms
cost: 22.4231 ms
cost: 22.4444 ms
cost: 22.3929 ms
cost: 22.4138 ms
cost: 22.3667 ms
cost: 22.3871 ms
cost: 22.3438 ms
cost: 22.3636 ms
cost: 22.3235 ms
cost: 22.3143 ms
cost: 22.25 ms
cost: 22.2162 ms
cost: 22.1579 ms
cost: 22.1538 ms
cost: 22.1 ms
cost: 22.0732 ms