基于CUDA的图像处理——超快速图像旋转

 图像旋转特点:每个像素的计算相互独立,适合并行处理;旋转后像素对应的原图位置为浮点数,需要进行插值(下文代码采用最简单的取整/最近邻方式)。

有以下几种方法:

  1. 基于opencv的仿射变换;cv::warpAffine();
  2. 基于cpu或多线程遍历所有像素;
  3. 基于CUDA遍历所有像素:速度超快,不妨试试;如果看不懂下面的代码,建议先了解CUDA编程基础。

 *.cpp文件

// Rotates a single-channel 8-bit image (and optionally its mask) about the image
// centre on the GPU and returns the result in newly allocated host buffers.
//
// Parameters:
//   img      - host pointer to imgSize.width * imgSize.height source bytes (row-major).
//   imgMask  - optional host pointer to a mask with the same layout as img; may be nullptr.
//   imgSize  - source dimensions; both must be positive.
//   angle    - rotation angle in radians (it is passed straight to cos/sin).
//   dst      - out: buffer of dstSize.width * dstSize.height rotated pixels, allocated with new[].
//   dstMask  - out: buffer of the same size holding the rotated mask, allocated with new[].
//   dstSize  - out: size of the axis-aligned bounding box of the rotated image.
//
// Returns 0 on success. Returns -1 on invalid arguments or on any CUDA failure; in
// that case dst, dstMask and dstSize are left untouched and nothing is leaked.
// On success the caller owns dst and dstMask and must release them with delete[].
int RotateImage(const unsigned char* img, const unsigned char* imgMask, const cv::Size imgSize, const float angle, unsigned char*& dst, unsigned char*& dstMask, cv::Size& dstSize)
	{
		// Check each dimension separately: area() is positive for two negative dimensions.
		if (img == nullptr || imgSize.width <= 0 || imgSize.height <= 0)
			return -1;

		// Bounding box of the rotated image. Absolute values are taken by hand in
		// double so that no integer abs() overload can silently truncate them.
		const double c = cos((double)angle);
		const double s = sin((double)angle);
		const double absC = c < 0 ? -c : c;
		const double absS = s < 0 ? -s : s;
		const double boxW = imgSize.width * absC + imgSize.height * absS;
		const double boxH = imgSize.width * absS + imgSize.height * absC;
		// Written as negated comparisons so that a NaN angle is rejected as well.
		// The pixel count must fit in int because the launcher takes an int thread count.
		if (!(boxW >= 1.0) || !(boxH >= 1.0) || !(boxW * boxH <= 2147483647.0))
			return -1;

		cv::Size outSize;
		outSize.width = (int)boxW;
		outSize.height = (int)boxH;

		const size_t srcBytes = sizeof(unsigned char) * (size_t)imgSize.width * (size_t)imgSize.height;
		const size_t dstPixels = (size_t)outSize.width * (size_t)outSize.height;
		const size_t dstBytes = sizeof(unsigned char) * dstPixels;

		// Device buffers; all start as nullptr so that cleanup can free them unconditionally.
		unsigned char* gpu_img = nullptr;      // source image
		unsigned char* gpu_imgMask = nullptr;  // source mask (stays nullptr when no mask is given)
		cv::Size* gpu_imgSize = nullptr;
		float* gpu_angle = nullptr;
		unsigned char* gpu_dst = nullptr;      // rotated image
		unsigned char* gpu_dstMask = nullptr;  // rotated mask
		cv::Size* gpu_dstSize = nullptr;

		auto releaseDevice = [&]()
		{
			cudaFree(gpu_img);
			cudaFree(gpu_imgMask);
			cudaFree(gpu_imgSize);
			cudaFree(gpu_angle);
			cudaFree(gpu_dst);
			cudaFree(gpu_dstMask);
			cudaFree(gpu_dstSize);
		};

		// Malloc: the output buffers are sized exactly, since the output size is already known.
		bool ok = cudaMalloc((void**)&gpu_img, srcBytes) == cudaSuccess;
		if (imgMask != nullptr)
			ok = ok && cudaMalloc((void**)&gpu_imgMask, srcBytes) == cudaSuccess;
		ok = ok && cudaMalloc((void**)&gpu_imgSize, sizeof(cv::Size)) == cudaSuccess;
		ok = ok && cudaMalloc((void**)&gpu_angle, sizeof(float)) == cudaSuccess;
		ok = ok && cudaMalloc((void**)&gpu_dst, dstBytes) == cudaSuccess;
		ok = ok && cudaMalloc((void**)&gpu_dstMask, dstBytes) == cudaSuccess;
		ok = ok && cudaMalloc((void**)&gpu_dstSize, sizeof(cv::Size)) == cudaSuccess;

		// Memcpy host -> device
		ok = ok && cudaMemcpy(gpu_img, img, srcBytes, cudaMemcpyHostToDevice) == cudaSuccess;
		if (imgMask != nullptr)
			ok = ok && cudaMemcpy(gpu_imgMask, imgMask, srcBytes, cudaMemcpyHostToDevice) == cudaSuccess;
		ok = ok && cudaMemcpy(gpu_imgSize, &imgSize, sizeof(cv::Size), cudaMemcpyHostToDevice) == cudaSuccess;
		ok = ok && cudaMemcpy(gpu_angle, &angle, sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess;
		ok = ok && cudaMemcpy(gpu_dstSize, &outSize, sizeof(cv::Size), cudaMemcpyHostToDevice) == cudaSuccess;

		// Compute: one thread per output pixel. A null mask pointer tells the kernel
		// to mark every covered output pixel as valid.
		if (ok)
		{
			RotateImage_CUDA(gpu_img, gpu_imgMask, gpu_imgSize, gpu_angle, gpu_dst, gpu_dstMask, gpu_dstSize, (int)dstPixels);
			// Launch-configuration errors surface through cudaGetLastError, faults
			// during execution through the synchronisation.
			ok = cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess;
		}
		if (!ok)
		{
			releaseDevice();
			return -1;
		}

		// Host result buffers. No memset is needed: the copies below overwrite every byte.
		unsigned char* hostDst = nullptr;
		unsigned char* hostMask = nullptr;
		try
		{
			hostDst = new unsigned char[dstPixels];
			hostMask = new unsigned char[dstPixels];
		}
		catch (...)
		{
			delete[] hostDst;
			releaseDevice();
			throw;
		}

		ok = cudaMemcpy(hostDst, gpu_dst, dstBytes, cudaMemcpyDeviceToHost) == cudaSuccess;
		ok = ok && cudaMemcpy(hostMask, gpu_dstMask, dstBytes, cudaMemcpyDeviceToHost) == cudaSuccess;
		releaseDevice();
		if (!ok)
		{
			delete[] hostDst;
			delete[] hostMask;
			return -1;
		}

#ifdef _DEBUG
		// Non-owning views over the results, kept only for inspection in a debugger.
		cv::Mat mat_dst = cv::Mat(outSize, CV_8UC1, hostDst, 0);
		cv::Mat mat_dstMask = cv::Mat(outSize, CV_8UC1, hostMask, 0);
#endif

		// Publish the outputs only once everything has succeeded.
		dst = hostDst;
		dstMask = hostMask;
		dstSize = outSize;
		return 0;
	}

*.cu文件 

// Forward declaration of the rotation kernel, so the launcher below can
// instantiate it even though the definition appears later in the file.
template <typename T>
__global__ void RotateImage_CUDA_SubFunction(T* img, T* maskOld, cv::Size* imgsize, float* angle,
	T* result, T* maskNew, cv::Size* resultsize);

// Host-side launcher for the rotation kernel.
//
// All pointer arguments are forwarded unchanged to the kernel and must therefore
// be device pointers. maskOld may be nullptr.
// threads is the number of output pixels; one thread is launched per pixel in a
// 1D grid. The launch is asynchronous: the caller is responsible for checking
// cudaGetLastError() and for synchronising before reading the results.
void RotateImage_CUDA(unsigned char* img, unsigned char* maskOld, cv::Size*  imgsize, float* angle,
	unsigned char* result, unsigned char* maskNew, cv::Size* resultsize, int threads)
{
	// A zero-sized grid is an invalid launch configuration; there is nothing to do anyway.
	if (threads <= 0)
		return;

	const unsigned int block_size = 256;
	// Ceil-division in unsigned arithmetic so that threads + block_size - 1 cannot overflow int.
	const unsigned int grid_size = ((unsigned int)threads + block_size - 1u) / block_size;
	RotateImage_CUDA_SubFunction<unsigned char> <<<grid_size, block_size>>> (img, maskOld, imgsize, angle,
		result, maskNew, resultsize);
}
// Rotation kernel: each thread produces one output pixel by mapping it back into
// the source image (rotation about the image centres by -*angle) and copying the
// source pixel it lands on.
//
// Launch layout: 1D grid of 1D blocks with at least
// resultsize->width * resultsize->height threads; surplus threads exit immediately.
// No shared memory is used.
//
//   img, maskOld  - source image and source mask, row-major, imgsize->width per row.
//                   maskOld may be nullptr, in which case covered pixels get mask 255.
//   imgsize       - source dimensions.
//   angle         - rotation angle, passed directly to cosf/sinf (radians).
//   result        - output image, row-major, resultsize->width per row.
//   maskNew       - output mask, same layout as result.
//   resultsize    - output dimensions.
//
// Output pixels whose source position falls outside the source image are set to 0
// in both result and maskNew.
template <typename T>
__global__ void RotateImage_CUDA_SubFunction(T* img, T* maskOld, cv::Size* imgsize, float* angle,
	T* result, T* maskNew, cv::Size*  resultsize)
{
	const int dstW = resultsize->width;
	const int dstH = resultsize->height;
	const int srcW = imgsize->width;
	const int srcH = imgsize->height;

	// Flat thread index == flat output pixel index.
	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard before any division by dstW, and drop the surplus threads of the last block.
	if (dstW <= 0 || dstH <= 0 || tid >= (unsigned int)(dstW * dstH))
		return;

	// Output coordinates relative to the centre of the output image.
	const float dx = (float)(tid % dstW) - 0.5f * dstW;
	const float dy = (float)(tid / dstW) - 0.5f * dstH;

	// Inverse rotation, then shift to the centre of the source image.
	const float cosInv = cosf(-*angle);
	const float sinInv = sinf(-*angle);
	const float fx = dx * cosInv - dy * sinInv + 0.5f * srcW;
	const float fy = dx * sinInv + dy * cosInv + 0.5f * srcH;

	// floorf instead of a plain cast: a cast truncates toward zero, which would map
	// positions in (-1, 0) onto column/row 0 and wrongly treat them as inside the image.
	const int x_org = (int)floorf(fx);
	const int y_org = (int)floorf(fy);

	if (x_org >= 0 && x_org < srcW && y_org >= 0 && y_org < srcH)
	{
		const int srcIdx = x_org + y_org * srcW;
		result[tid] = img[srcIdx];
		maskNew[tid] = (maskOld == nullptr) ? (T)255 : maskOld[srcIdx];
	}
	else
	{
		result[tid] = 0;
		maskNew[tid] = 0;
	}
}

你可能感兴趣的:(CUDA,C++,c++)