我与计算机视觉-[CUDA]-[Opencv.Resize的CPU实现和GPU实现]

首先,原理和CPU端的实现我这里就不赘述了,大家可以移步此篇文章,讲得很详细,并且GPU端的代码也是按照这篇文章的思路去做的。链接:图像处理之双线性插值法。

下面直接贴代码:

CPU端实现:

/**
 * Bilinear resize of an 8-bit interleaved image (CPU reference implementation).
 *
 * Every destination pixel centre is mapped back into the source with the
 * half-pixel convention ((dst + 0.5) * scale - 0.5). The four neighbouring
 * source samples are blended with 11-bit fixed-point weights (2048 == 1.0),
 * so the product of an x- and a y-weight carries 22 fractional bits, which
 * the final shift removes.
 *
 * Samples outside the source are replaced by the nearest edge sample: both
 * neighbour indices are clamped independently, so the borders are exact and
 * a source with a single row or column is never read out of bounds.
 *
 * @param matDst1 Destination image; its size defines the output size. It is
 *                taken by value, but cv::Mat copies share their pixel buffer,
 *                so the caller's image receives the result. Must already be
 *                allocated with the same type as matSrc.
 * @param matSrc  Source image with 8-bit samples and any channel count.
 */
void Resize(cv::Mat matDst1, cv::Mat matSrc) {
	// Nothing to sample from, or nothing to write to.
	if (matSrc.empty() || matDst1.empty()) {
		return;
	}
	// The loops below address raw 8-bit samples and use one channel count for both images.
	CV_Assert(matSrc.depth() == CV_8U && matDst1.type() == matSrc.type());

	const int iWidthSrc = matSrc.cols;
	const int iHeightSrc = matSrc.rows;
	const int channels = matSrc.channels();

	const float scale_x = (float)iWidthSrc / matDst1.cols;
	const float scale_y = (float)iHeightSrc / matDst1.rows;

	for (int j = 0; j < matDst1.rows; ++j)
	{
		float fy = (float)((j + 0.5) * scale_y - 0.5);
		const int sy = cvFloor(fy);
		fy -= sy;

		// Clamp the two contributing rows separately (edge replication).
		const int sy0 = std::min(std::max(sy, 0), iHeightSrc - 1);
		const int sy1 = std::min(std::max(sy + 1, 0), iHeightSrc - 1);

		const int wy0 = cv::saturate_cast<short>((1.f - fy) * 2048);
		const int wy1 = 2048 - wy0;

		const uchar* srcRow0 = matSrc.ptr<uchar>(sy0);
		const uchar* srcRow1 = matSrc.ptr<uchar>(sy1);
		uchar* dstRow = matDst1.ptr<uchar>(j);

		for (int i = 0; i < matDst1.cols; ++i)
		{
			float fx = (float)((i + 0.5) * scale_x - 0.5);
			const int sx = cvFloor(fx);
			fx -= sx;

			// Clamp the two contributing columns separately (edge replication).
			const int sx0 = std::min(std::max(sx, 0), iWidthSrc - 1);
			const int sx1 = std::min(std::max(sx + 1, 0), iWidthSrc - 1);

			const int wx0 = cv::saturate_cast<short>((1.f - fx) * 2048);
			const int wx1 = 2048 - wx0;

			const uchar* p00 = srcRow0 + sx0 * channels;
			const uchar* p01 = srcRow0 + sx1 * channels;
			const uchar* p10 = srcRow1 + sx0 * channels;
			const uchar* p11 = srcRow1 + sx1 * channels;
			uchar* out = dstRow + i * channels;

			for (int k = 0; k < channels; ++k)
			{
				// Worst case 255 * 2048 * 2048 still fits in a signed 32-bit int.
				const int acc = p00[k] * wx0 * wy0 +
					p10[k] * wx0 * wy1 +
					p01[k] * wx1 * wy0 +
					p11[k] * wx1 * wy1;
				out[k] = (uchar)(acc >> 22);
			}
		}
	}
}

GPU端实现:

/**
 * Bilinear resize kernel for 8-bit interleaved images.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per destination pixel
 * (x from the .x dimensions, y from the .y dimensions). Threads that fall
 * outside the destination do nothing, so the grid may overshoot. No shared
 * memory is used.
 *
 * The destination pixel centre is mapped into the source with the half-pixel
 * convention ((dst + 0.5) * scale - 0.5). The four neighbours are blended
 * with 11-bit fixed-point weights (2048 == 1.0); the weight products carry
 * 22 fractional bits, removed by the final shift. Neighbour indices are
 * clamped independently to the source extent (edge replication), so border
 * pixels are exact and single-row / single-column sources are safe.
 *
 * @param dataSrc   Source pixels (read only by this kernel).
 * @param widthSrc  Source width in pixels.
 * @param heightSrc Source height in pixels.
 * @param stepSrc   Source row stride in bytes.
 * @param dataDst   Destination pixels.
 * @param widthDst  Destination width in pixels.
 * @param heightDst Destination height in pixels.
 * @param stepDst   Destination row stride in bytes.
 * @param scale_x   widthSrc / widthDst.
 * @param scale_y   heightSrc / heightDst.
 * @param chanel    Number of interleaved 8-bit channels per pixel.
 */
__global__ void CudaResizeKernel(uint8_t* dataSrc, int widthSrc, int heightSrc, int stepSrc, uint8_t* dataDst, int widthDst, int heightDst, int stepDst, float scale_x, float scale_y, int chanel) {

	const int x = blockIdx.x * blockDim.x + threadIdx.x;
	const int y = blockIdx.y * blockDim.y + threadIdx.y;

	// Grid tail, or no source samples to read.
	if (x >= widthDst || y >= heightDst || widthSrc <= 0 || heightSrc <= 0) {
		return;
	}

	// Fused multiply-add: a single rounding and no double-precision arithmetic.
	float fy = fmaf(y + 0.5f, scale_y, -0.5f);
	const int sy = (int)floorf(fy);
	fy -= sy;
	const int sy0 = min(max(sy, 0), heightSrc - 1);
	const int sy1 = min(max(sy + 1, 0), heightSrc - 1);

	// Round to nearest so the weights agree with a rounding saturate_cast on the host;
	// fy is in [0, 1), so the weight always lies in [0, 2048].
	const int wy0 = __float2int_rn((1.f - fy) * 2048.f);
	const int wy1 = 2048 - wy0;

	float fx = fmaf(x + 0.5f, scale_x, -0.5f);
	const int sx = (int)floorf(fx);
	fx -= sx;
	const int sx0 = min(max(sx, 0), widthSrc - 1);
	const int sx1 = min(max(sx + 1, 0), widthSrc - 1);

	const int wx0 = __float2int_rn((1.f - fx) * 2048.f);
	const int wx1 = 2048 - wx0;

	// Byte offsets are computed in size_t so large images cannot overflow int.
	const uint8_t* srcRow0 = dataSrc + (size_t)sy0 * stepSrc;
	const uint8_t* srcRow1 = dataSrc + (size_t)sy1 * stepSrc;
	const uint8_t* p00 = srcRow0 + (size_t)sx0 * chanel;
	const uint8_t* p01 = srcRow0 + (size_t)sx1 * chanel;
	const uint8_t* p10 = srcRow1 + (size_t)sx0 * chanel;
	const uint8_t* p11 = srcRow1 + (size_t)sx1 * chanel;
	uint8_t* out = dataDst + (size_t)y * stepDst + (size_t)x * chanel;

	for (int k = 0; k < chanel; ++k)
	{
		// Worst case 255 * 2048 * 2048 still fits in a signed 32-bit int.
		const int acc = p00[k] * wx0 * wy0 +
			p10[k] * wx0 * wy1 +
			p01[k] * wx1 * wy0 +
			p11[k] * wx1 * wy1;
		out[k] = (uint8_t)(acc >> 22);
	}
}

GPU端调用:

/**
 * Host-side launcher for CudaResizeKernel.
 *
 * Covers the destination image with a 2D grid (one thread per destination
 * pixel) and passes the source/destination scale ratios to the kernel.
 * The pointers are handed to the kernel unchanged, so they must be
 * dereferenceable from device code. The launch is asynchronous and is issued
 * on the default stream; this function neither synchronizes nor reports
 * errors, so callers should check cudaGetLastError() and synchronize before
 * reading the result.
 *
 * @param dataSrc   Source pixels.
 * @param widthSrc  Source width in pixels.
 * @param heightSrc Source height in pixels.
 * @param stepSrc   Source row stride in bytes.
 * @param dataDst   Destination pixels.
 * @param widthDst  Destination width in pixels.
 * @param heightDst Destination height in pixels.
 * @param stepDst   Destination row stride in bytes.
 * @param chanel    Number of interleaved 8-bit channels per pixel.
 */
void CudaResize(uint8_t* dataSrc, int widthSrc, int heightSrc, int stepSrc, uint8_t* dataDst, int widthDst, int heightDst, int stepDst, int chanel) {
	// A zero-sized grid is an invalid launch configuration, and the scale
	// factors below would divide by zero.
	if (widthSrc <= 0 || heightSrc <= 0 || widthDst <= 0 || heightDst <= 0 || chanel <= 0) {
		return;
	}

	// 256 threads per block: a multiple of the warp size that stays well below
	// the 1024-thread-per-block limit.
	const int kBlockW = 32;
	const int kBlockH = 8;
	const dim3 dimBlock(kBlockW, kBlockH);
	// Ceiling division; the parentheses matter, otherwise "1 / 32" is evaluated first.
	const dim3 dimGrid((widthDst + kBlockW - 1) / kBlockW, (heightDst + kBlockH - 1) / kBlockH);

	const float xScale = (float)widthSrc / widthDst;
	const float yScale = (float)heightSrc / heightDst;
	CudaResizeKernel<<<dimGrid, dimBlock>>>(dataSrc, widthSrc, heightSrc, stepSrc, dataDst, widthDst, heightDst, stepDst, xScale, yScale, chanel);
}

 效率还没有测试过,欢迎沟通和交流学习。本人QQ:37593238,添加请注明题目;如需转载请注明出处。

 

 

你可能感兴趣的:(学习记录,深度学习,边缘计算,自动驾驶,cuda,图像处理)