cuda图像灰度化

 注意事项:

  1. kernel函数内的某些变量一定要初始化,切记!
  2. 感觉CUDA计算uchar4类型的图像还是不可行,uchar3都很顺利
  3. 别忘了在kernel函数里加if设置有效线程的数量
  4. kernel函数的调用必须在.cu文件中;外部.cpp要调用kernel函数的话,必须先在.cu中对kernel封装一层再暴露出去(在.cu自己的.h中暴露,.cpp只需在自己的.h中包含.cu的.h文件)
  5. 调试CUDA代码的一般步骤
  • 先检查需要传入的数据的类型是否对应:uchar3—CV_8UC3、float—CV_32FC1、int—CV_32SC1、double—CV_64FC1、int2—CV_32SC2
  • 再在kernel函数内输出检查拷贝进GPU的数据是否正确
  • 以上前提工作做好以后,才是检查kernel函数算法是不是写对了

 


#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include "opencv2/highgui.hpp"  // actually located under /usr/include
#include "opencv2/opencv.hpp"

using namespace cv;
using namespace std;

// Prints a prompt and blocks until one character can be read from stdin.
// The statements are wrapped in a braced block so that the macro acts as a
// single statement after an unbraced `if`/`for`/`while` (previously only the
// printf was guarded). The prompt has no trailing newline, so stdout is
// flushed explicitly to make sure it is visible before the read blocks.
// Both `PAUSE` and `PAUSE;` remain valid at statement level.
#define PAUSE { printf("Press Enter key to continue..."); fflush(stdout); fgetc(stdin); }

// Converts a packed 3-channel 8-bit image to a single-channel gray image,
// one thread per pixel.
//
// Launch layout: 2D grid of 2D blocks; the x dimension maps to image columns
// and the y dimension to image rows. The grid may overshoot the image; the
// surplus threads exit without touching memory.
//
//   d_in      device buffer of imgheight * imgwidth pixels, indexed row by row
//             (row * imgwidth + column)
//   d_out     device buffer of imgheight * imgwidth bytes, same indexing
//   imgheight number of rows
//   imgwidth  number of columns
//
// The components are weighted as 0.299 * x + 0.587 * y + 0.114 * z, and the
// float result is converted to unsigned char on store (fraction discarded).
__global__ void rgb2grayincuda(uchar3 * const d_in, unsigned char * const d_out,
	uint imgheight, uint imgwidth)
{
	const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
	const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads in the partial blocks along the right/bottom edge have no pixel.
	if (col >= imgwidth || row >= imgheight)
	{
		return;
	}

	const unsigned int offset = row * imgwidth + col;
	const uchar3 px = d_in[offset];
	d_out[offset] = 0.299f * px.x + 0.587f * px.y + 0.114f * px.z;
}

// Host-side reference for the gray conversion.
//
//   d_in      interleaved 3-bytes-per-pixel image, imgheight * imgwidth pixels
//             laid out row by row (despite the d_ prefix this is dereferenced
//             on the host)
//   d_out     output buffer of imgheight * imgwidth bytes, one per pixel
//   imgheight number of rows
//   imgwidth  number of columns
//
// Each output byte is 0.299 * c0 + 0.587 * c1 + 0.114 * c2 of the matching
// input pixel, with the float result converted to unsigned char on store.
void rgb2grayincpu(unsigned char * const d_in, unsigned char * const d_out,
	uint imgheight, uint imgwidth)
{
	// Running pixel counter; equals row * imgwidth + col at every step.
	uint pixel = 0;

	for (uint row = 0; row < imgheight; ++row)
	{
		for (uint col = 0; col < imgwidth; ++col, ++pixel)
		{
			const uint base = pixel * 3;
			d_out[pixel] = 0.299f * d_in[base]
				+ 0.587f * d_in[base + 1]
				+ 0.114f * d_in[base + 2];
		}
	}
}



// Prints a summary of every CUDA device reported by the runtime to stdout.
//
// If the device count cannot be queried, the error is reported on stderr and
// nothing else is printed. A device whose properties cannot be read is
// reported on stderr and skipped.
void print_GPU_device_info()
{
	// Initialised so that a failed query can never leave a garbage loop bound.
	int deviceCount = 0;
	cudaError_t status = cudaGetDeviceCount(&deviceCount);
	if (status != cudaSuccess)
	{
		std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(status) << std::endl;
		return;
	}

	for (int i = 0; i < deviceCount; i++)
	{
		cudaDeviceProp devProp;
		status = cudaGetDeviceProperties(&devProp, i);
		if (status != cudaSuccess)
		{
			std::cerr << "cudaGetDeviceProperties failed for device " << i << ": "
				<< cudaGetErrorString(status) << std::endl;
			continue;
		}

		std::cout << "使用GPU device " << i << ": " << devProp.name << std::endl;
		std::cout << "设备全局内存总量: " << devProp.totalGlobalMem / 1024 / 1024 << "MB" << std::endl;
		std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
		std::cout << "每个SM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
		// Warps per SM = threads per SM / warp size; use the size the device reports.
		std::cout << "每个SM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / devProp.warpSize << std::endl;
		std::cout << "每个Block的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
		std::cout << "每个Block的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
		std::cout << "每个Block中可用的32位寄存器数量: " << devProp.regsPerBlock << std::endl;
		std::cout << "======================================================" << std::endl;
	}
}



// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Loads "123.png", converts it to gray three ways (CUDA kernel, plain CPU
// loop, OpenCV cvtColor), prints the wall time of each and shows the result.
// Returns 0 on success, 1 if the image cannot be loaded; CUDA failures
// terminate the process via checkCuda().
int main(void)
{
	print_GPU_device_info();

	Mat srcImage = imread("123.png");
	// imread returns an empty Mat when the file is missing or unreadable;
	// imshow and the uploads below must not run on it.
	if (srcImage.empty())
	{
		fprintf(stderr, "Failed to load image 123.png\n");
		return 1;
	}
	// The raw buffer is uploaded as tightly packed uchar3 pixels, so the
	// image must be 8-bit 3-channel and stored without row padding.
	if (srcImage.type() != CV_8UC3)
	{
		fprintf(stderr, "Expected an 8-bit 3-channel image\n");
		return 1;
	}
	if (!srcImage.isContinuous())
	{
		srcImage = srcImage.clone();
	}
	imshow("srcImage", srcImage);

	const uint imgheight = srcImage.rows;
	const uint imgwidth = srcImage.cols;
	// Byte counts are computed in size_t so large images do not wrap in 32 bits.
	const size_t pixelCount = (size_t)imgheight * imgwidth;

	Mat grayImage(srcImage.rows, srcImage.cols, CV_8UC1, Scalar(0));

	uchar3 *d_in = NULL;
	unsigned char *d_out = NULL;

	checkCuda(cudaMalloc((void**)&d_in, pixelCount * sizeof(uchar3)), "cudaMalloc(d_in)");
	checkCuda(cudaMalloc((void**)&d_out, pixelCount * sizeof(unsigned char)), "cudaMalloc(d_out)");

	// NOTE: imread stores pixels in B,G,R order, while rgb2grayincuda and
	// rgb2grayincpu weight the channels in the order they are stored as
	// R,G,B. Their output therefore differs from cvtColor(COLOR_BGR2GRAY);
	// only the timings are compared here.
	checkCuda(cudaMemcpy(d_in, srcImage.data, pixelCount * sizeof(uchar3), cudaMemcpyHostToDevice),
		"cudaMemcpy(host to device)");

	// One thread per pixel; the grid is rounded up to cover the whole image.
	dim3 threadsPerBlock(32, 32);
	dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
		(imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

	clock_t start, end;
	start = clock();

	rgb2grayincuda<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);
	// A launch does not return a status: configuration errors are reported by
	// cudaGetLastError, execution errors by the following synchronisation.
	checkCuda(cudaGetLastError(), "rgb2grayincuda launch");

	// The launch is asynchronous; wait for completion before stopping the clock.
	checkCuda(cudaDeviceSynchronize(), "rgb2grayincuda execution");
	end = clock();

	printf("cuda exec time is %.8f\n", (double)(end - start) / CLOCKS_PER_SEC);

	checkCuda(cudaMemcpy(grayImage.data, d_out, pixelCount * sizeof(unsigned char), cudaMemcpyDeviceToHost),
		"cudaMemcpy(device to host)");

	checkCuda(cudaFree(d_in), "cudaFree(d_in)");
	checkCuda(cudaFree(d_out), "cudaFree(d_out)");

	start = clock();

	rgb2grayincpu(srcImage.data, grayImage.data, imgheight, imgwidth);

	end = clock();

	printf("cpu exec time is %.8f\n", (double)(end - start) / CLOCKS_PER_SEC);

	start = clock();
	cvtColor(srcImage, grayImage, COLOR_BGR2GRAY);

	end = clock();

	printf("opencv-cpu exec time is %.8f\n", (double)(end - start) / CLOCKS_PER_SEC);

	// grayImage now holds the cvtColor result (the last conversion run).
	imshow("grayImage", grayImage);
	waitKey(0);

	return 0;

}

 

参考:https://blog.csdn.net/lingsuifenfei123/article/details/83444159

你可能感兴趣的:(cuda图像处理)