首先,原理和 CPU 端的实现我这里就不赘述了,大家可以移步此篇文章,讲得很详细,GPU 端的代码也是按照这篇文章的思路实现的。链接:图像处理之双线性插值法。
下面直接贴代码:
CPU端实现:
/**
 * CPU reference: bilinear resize of matSrc into the already-allocated matDst1.
 *
 * The pixel data of both matrices is accessed as interleaved unsigned 8-bit
 * samples through Mat::data / Mat::step. The destination size is taken from
 * matDst1 (rows/cols); its buffer is written in place.
 *
 * Interpolation weights are converted to fixed point with 11 fractional bits
 * (scale 2048), so the product of a horizontal and a vertical weight carries
 * 22 fractional bits, which the final `>> 22` removes (truncating).
 *
 * NOTE(review): assumes matDst1 has the same channel count as matSrc and that
 * both hold 8-bit data -- neither is checked here; confirm against callers.
 *
 * @param matDst1 destination image (header passed by value, pixel buffer written).
 * @param matSrc  source image.
 */
void Resize(cv::Mat matDst1, cv::Mat matSrc) {
    // With an empty image the scale factors are undefined (division by zero)
    // and there is nothing to read or write.
    if (matSrc.empty() || matDst1.empty()) {
        return;
    }

    uchar* dataDst = matDst1.data;
    const size_t stepDst = matDst1.step;
    const uchar* dataSrc = matSrc.data;
    const size_t stepSrc = matSrc.step;
    const int iWidthSrc = matSrc.cols;
    const int iHeightSrc = matSrc.rows;
    // Samples per pixel; used as the pixel stride instead of a hard-coded 3.
    const int channels = matSrc.channels();
    const float scale_x = (float)iWidthSrc / matDst1.cols;
    const float scale_y = (float)iHeightSrc / matDst1.rows;

    for (int j = 0; j < matDst1.rows; ++j)
    {
        // Map the destination pixel centre back into source coordinates.
        float fy = (float)((j + 0.5) * scale_y - 0.5);
        int sy = cvFloor(fy);
        fy -= sy;
        // Outside the source: replicate the border row. The fractional part
        // must be reset together with the index, otherwise the border row is
        // blended with its neighbour using a meaningless weight.
        if (sy < 0) {
            fy = 0, sy = 0;
        }
        if (sy >= iHeightSrc - 1) {
            fy = 0, sy = iHeightSrc - 1;
        }
        // Second row; clamped so it stays in bounds on the last row (its
        // weight is zero there because fy == 0).
        const int sy1 = std::min(sy + 1, iHeightSrc - 1);

        short cbufy[2];
        cbufy[0] = cv::saturate_cast<short>((1.f - fy) * 2048);
        cbufy[1] = 2048 - cbufy[0];

        const uchar* rowTop = dataSrc + (size_t)sy * stepSrc;
        const uchar* rowBottom = dataSrc + (size_t)sy1 * stepSrc;
        uchar* rowDst = dataDst + (size_t)j * stepDst;

        for (int i = 0; i < matDst1.cols; ++i)
        {
            float fx = (float)((i + 0.5) * scale_x - 0.5);
            int sx = cvFloor(fx);
            fx -= sx;
            if (sx < 0) {
                fx = 0, sx = 0;
            }
            // Replicate the last column (index width-1), not width-2.
            if (sx >= iWidthSrc - 1) {
                fx = 0, sx = iWidthSrc - 1;
            }
            const int sx1 = std::min(sx + 1, iWidthSrc - 1);

            short cbufx[2];
            cbufx[0] = cv::saturate_cast<short>((1.f - fx) * 2048);
            cbufx[1] = 2048 - cbufx[0];

            const int offLeft = sx * channels;
            const int offRight = sx1 * channels;
            uchar* pixDst = rowDst + (size_t)i * channels;

            for (int k = 0; k < channels; ++k)
            {
                // Weights sum to 2048 * 2048, so the int accumulator peaks at
                // 255 * 2^22, which fits in 32 bits.
                const int acc = rowTop[offLeft + k] * cbufx[0] * cbufy[0] +
                                rowBottom[offLeft + k] * cbufx[0] * cbufy[1] +
                                rowTop[offRight + k] * cbufx[1] * cbufy[0] +
                                rowBottom[offRight + k] * cbufx[1] * cbufy[1];
                pixDst[k] = (uchar)(acc >> 22);
            }
        }
    }
}
GPU端实现:
/**
 * Bilinear resize of an interleaved 8-bit image; each thread produces one
 * destination pixel (all of its `chanel` samples).
 *
 * Launch layout: 2D grid of 2D blocks covering at least widthDst x heightDst
 * threads; surplus threads exit immediately. No shared memory is used.
 *
 * Weights are fixed point with 11 fractional bits (scale 2048); the product of
 * a horizontal and a vertical weight has 22 fractional bits, removed by the
 * final `>> 22` (truncating). Samples outside the source replicate the nearest
 * border row/column.
 *
 * @param dataSrc   source pixels, dereferenced on the device.
 * @param widthSrc  source width in pixels.
 * @param heightSrc source height in pixels.
 * @param stepSrc   distance between consecutive source rows, in bytes.
 * @param dataDst   destination pixels, written on the device.
 * @param widthDst  destination width in pixels.
 * @param heightDst destination height in pixels.
 * @param stepDst   distance between consecutive destination rows, in bytes.
 * @param scale_x   source/destination width ratio.
 * @param scale_y   source/destination height ratio.
 * @param chanel    samples per pixel; used both as loop bound and pixel stride.
 */
__global__ void CudaResizeKernel(uint8_t* dataSrc, int widthSrc, int heightSrc, int stepSrc, uint8_t* dataDst, int widthDst, int heightDst, int stepDst, float scale_x, float scale_y, int chanel) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Grid tail guard, plus protection against an empty source (no valid
    // row/column to clamp to).
    if (x >= widthDst || y >= heightDst || widthSrc <= 0 || heightSrc <= 0) {
        return;
    }

    // Map the destination pixel centre into source coordinates. Float literals
    // keep the arithmetic in single precision; the result may differ by an ulp
    // from a host reference that evaluates this expression in double.
    float fy = (y + 0.5f) * scale_y - 0.5f;
    int sy = (int)floorf(fy);
    fy -= sy;
    // Border replication: reset the fraction together with the index so the
    // border row is not blended with a stale weight.
    if (sy < 0) {
        fy = 0.f, sy = 0;
    }
    if (sy >= heightSrc - 1) {
        fy = 0.f, sy = heightSrc - 1;
    }
    // Neighbour row, clamped so a 1-row source or the last row stays in bounds
    // (its weight is zero in that case).
    const int sy1 = min(sy + 1, heightSrc - 1);

    float fx = (x + 0.5f) * scale_x - 0.5f;
    int sx = (int)floorf(fx);
    fx -= sx;
    if (sx < 0) {
        fx = 0.f, sx = 0;
    }
    // Replicate the last column (index widthSrc-1), not widthSrc-2.
    if (sx >= widthSrc - 1) {
        fx = 0.f, sx = widthSrc - 1;
    }
    const int sx1 = min(sx + 1, widthSrc - 1);

    // fx, fy are in [0, 1), so the weights lie in [0, 2048] and need no
    // saturation. Round-to-nearest matches a host reference that rounds when
    // converting the weight to an integer.
    const int wy0 = __float2int_rn((1.f - fy) * 2048.f);
    const int wy1 = 2048 - wy0;
    const int wx0 = __float2int_rn((1.f - fx) * 2048.f);
    const int wx1 = 2048 - wx0;

    // 64-bit row offsets so large images cannot overflow int arithmetic.
    const uint8_t* rowTop = dataSrc + (size_t)sy * stepSrc;
    const uint8_t* rowBottom = dataSrc + (size_t)sy1 * stepSrc;
    uint8_t* pixDst = dataDst + (size_t)y * stepDst + (size_t)x * chanel;
    const int offLeft = sx * chanel;
    const int offRight = sx1 * chanel;

    for (int k = 0; k < chanel; ++k)
    {
        // Weights sum to 2^22, so the accumulator peaks at 255 * 2^22 < 2^31.
        const int acc = rowTop[offLeft + k] * wx0 * wy0 +
                        rowBottom[offLeft + k] * wx0 * wy1 +
                        rowTop[offRight + k] * wx1 * wy0 +
                        rowBottom[offRight + k] * wx1 * wy1;
        pixDst[k] = (uint8_t)(acc >> 22);
    }
}
GPU端调用:
/**
 * Host-side launcher for CudaResizeKernel: one thread per destination pixel.
 *
 * The launch is asynchronous and goes to the default stream; this function
 * neither synchronizes nor inspects the launch status, so callers should check
 * cudaGetLastError() / synchronize before consuming dataDst.
 *
 * @param dataSrc   source pixels; passed straight to the kernel, so it must be
 *                  addressable from device code.
 * @param widthSrc  source width in pixels.
 * @param heightSrc source height in pixels.
 * @param stepSrc   source row stride, forwarded to the kernel.
 * @param dataDst   destination pixels; must be addressable from device code.
 * @param widthDst  destination width in pixels.
 * @param heightDst destination height in pixels.
 * @param stepDst   destination row stride, forwarded to the kernel.
 * @param chanel    samples per pixel, forwarded to the kernel.
 */
void CudaResize(uint8_t* dataSrc, int widthSrc, int heightSrc, int stepSrc, uint8_t* dataDst, int widthDst, int heightDst, int stepDst, int chanel) {
    // Empty images: nothing to compute, and the scale factors / grid size
    // below would be invalid (division by zero, zero-sized grid).
    if (widthSrc <= 0 || heightSrc <= 0 || widthDst <= 0 || heightDst <= 0) {
        return;
    }

    // 256 threads per block; x spans a full warp so neighbouring threads write
    // neighbouring destination pixels.
    const dim3 dimBlock(32, 8);
    // Ceil-division. The parentheses matter: `w + 32 - 1 / 32` evaluates
    // `1 / 32` first (== 0) and yields w + 32 blocks.
    const dim3 dimGrid((widthDst + dimBlock.x - 1) / dimBlock.x,
                       (heightDst + dimBlock.y - 1) / dimBlock.y);

    const float xScale = (float)widthSrc / widthDst;
    const float yScale = (float)heightSrc / heightDst;

    // Forward the caller's channel count rather than a literal 3.
    CudaResizeKernel<<<dimGrid, dimBlock>>>(dataSrc, widthSrc, heightSrc, stepSrc, dataDst, widthDst, heightDst, stepDst, xScale, yScale, chanel);
}
效率还没有测试过,欢迎沟通和学习。本人qq:37593238,添加请注明题目;如需转载,请注明出处。