CUDA图像处理NPP库-CUDA和OpenCV联合编程

全图像素值相加

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <opencv2/opencv.hpp>


/*================================
* @brief Sum an array using many blocks and many threads.
* Every thread accumulates a private partial sum over the elements
* tid, tid + stride, tid + 2 * stride, ... where stride is the total number
* of launched threads (blockDim.x * gridDim.x), then adds that partial sum
* to *pSum with one atomic operation.
* The code is simple, but atomic updates to a single address are serialized,
* so frequent atomics hurt performance. Here one atomic is issued per
* launched thread, i.e. blockDim.x * gridDim.x atomics in total.
*
* @param pData  device pointer to the N input values
* @param pSum   device pointer to the accumulator; the kernel only adds to it,
*               so the caller has to zero it before the launch
* @param N      number of input values
=================================*/
__global__ void sumOfCuda1(float* pData, float* pSum, int N)
{
	const int nTotalThreads = blockDim.x * gridDim.x;
	int nIdx = blockIdx.x * blockDim.x + threadIdx.x;

	// Accumulate in double to limit rounding error in the per-thread partial sum.
	double dPartial = 0.0;
	while (nIdx < N)
	{
		dPartial += pData[nIdx];
		nIdx += nTotalThreads;
	}

	// The accumulator is a float, so the partial sum is narrowed before the atomic add.
	atomicAdd(pSum, static_cast<float>(dPartial));
}

// Number of pixels in the 5120 x 5120 test image.
const int N = 5120 * 5120;
// Threads per block. CUDA devices allow at most 1024 threads per block; the
// previous value of 2048 made every launch fail with an invalid configuration.
// Kept a power of two so halving tree reductions visit every element.
const int THREAD_NUM = 1024;
// Number of blocks in the grid.
const int BLOCK_NUM = 2048;

/*================================
* @brief Sum an array with a per-block shared-memory reduction.
* Each participating thread first accumulates a private partial sum, stores
* it in shared memory, the block then reduces those partial sums, and finally
* one thread per block adds the block total to *pSum with an atomic add.
* A variable declared __shared__ lives in shared memory, which is visible to
* every thread of the block. __syncthreads() is a block-wide barrier: all
* threads of the block must reach it before any of them continues.
* When working with shared memory, keep bank conflicts in mind.
*
* Launch requirements: 1-D grid and 1-D blocks. Any block size is accepted:
* at most THREAD_NUM threads per block take part (extra threads idle but
* still reach every barrier), and the reduction handles block sizes that are
* not a power of two.
*
* @param pfData  device pointer to the N input values
* @param pSum    device pointer to the accumulator; the kernel only adds to
*                it, so the caller has to zero it before the launch
* @param N       number of input values
=================================*/
__global__ void sumOfCuda2(float* pfData, float* pSum, int N)
{
	__shared__ double share_dTemp[THREAD_NUM];

	// Never let more threads participate than the shared array has slots.
	int nActive = static_cast<int>(blockDim.x);
	if (nActive > THREAD_NUM)
	{
		nActive = THREAD_NUM;
	}

	const int nLocal = static_cast<int>(threadIdx.x);
	const int nStep = static_cast<int>(gridDim.x) * nActive;

	if (nLocal < nActive)
	{
		// Per-thread partial sum, accumulated in double to limit rounding error.
		double dTempSum = 0.0;
		for (int i = static_cast<int>(blockIdx.x) * nActive + nLocal; i < N; i += nStep)
		{
			dTempSum += pfData[i];
		}
		share_dTemp[nLocal] = dTempSum;
	}
	__syncthreads(); // every partial sum must be stored before the reduction reads it

	// Tree reduction over the nActive partial sums. The upper half is folded
	// onto the lower half; with an odd count the middle element is carried
	// over unchanged, so no partial sum is lost for non-power-of-two sizes.
	// nLive is identical for all threads, so every thread reaches each barrier.
	for (int nLive = nActive; nLive > 1; )
	{
		const int nHalf = (nLive + 1) / 2;
		if (nLocal + nHalf < nLive)
		{
			share_dTemp[nLocal] += share_dTemp[nLocal + nHalf];
		}
		__syncthreads();
		nLive = nHalf;
	}

	if (0 == nLocal)
	{
		// One atomic per block; the accumulator is a float, so narrow explicitly.
		atomicAdd(pSum, static_cast<float>(share_dTemp[0]));
	}
}


/*
 * Benchmark: sum of all pixels of a 5120 x 5120 float image, once with
 * OpenCV on the CPU and once with the sumOfCuda2 kernel on the GPU.
 * Returns 0 on success and -1 if the image cannot be read or a CUDA call fails.
 */
int main()
{
	cv::Mat matBgrImg = cv::imread("1.jpg");
	if (matBgrImg.empty())
	{
		fprintf(stderr, "failed to read 1.jpg \n");
		return -1;
	}
	cv::resize(matBgrImg, matBgrImg, cv::Size(5120, 5120));
	cv::Mat matGrayIMg;
	cv::cvtColor(matBgrImg, matGrayIMg, cv::COLOR_BGR2GRAY);
	cv::Mat matF32;
	matGrayIMg.convertTo(matF32, CV_32FC1);
	matF32 = matF32 / 255.;
	if (!matF32.isContinuous())
	{
		// The upload below copies the pixels as one flat array.
		matF32 = matF32.clone();
	}
	int nStep = (int)matF32.step;
	printf("matF32 h = %d, w = %d, channel = %d, step = %d \n", matF32.rows, matF32.cols, matF32.channels(), nStep);
	if (matF32.total() != (size_t)N)
	{
		fprintf(stderr, "unexpected pixel count \n");
		return -1;
	}

	int LOOPS = 10000;

	// ---- CPU reference timing ----
	double dCpuSum = 0.0;
	clock_t t0 = clock();
	for (int i = 0; i < LOOPS; i++)
	{
		dCpuSum = cv::sum(matF32)[0];
	}
	clock_t t1 = clock();
	(void)dCpuSum;
	// clock() counts ticks, not milliseconds: convert with CLOCKS_PER_SEC.
	std::cout << "cpu costime is " << (t1 - t0) * 1000.0 / CLOCKS_PER_SEC << "ms" << std::endl;

	// ---- GPU timing ----
	float* pfData_dev = NULL;
	float* pfSum_dev = NULL;
	float fSum = 0.0f;
	int nRet = -1;

	do
	{
		int nDevice = 0;
		int nMaxThreads = 0;
		cudaError_t err = cudaGetDevice(&nDevice);
		if (cudaSuccess == err)
		{
			err = cudaDeviceGetAttribute(&nMaxThreads, cudaDevAttrMaxThreadsPerBlock, nDevice);
		}
		if (cudaSuccess == err)
		{
			err = cudaMalloc((void**)& pfData_dev, N * sizeof(float));
		}
		if (cudaSuccess == err)
		{
			// Upload straight from the Mat; no intermediate host copy is needed.
			err = cudaMemcpy(pfData_dev, matF32.ptr<float>(), N * sizeof(float), cudaMemcpyHostToDevice);
		}
		if (cudaSuccess == err)
		{
			// The kernel accumulates everything into a single float.
			err = cudaMalloc((void**)& pfSum_dev, sizeof(float));
		}
		if (cudaSuccess != err)
		{
			fprintf(stderr, "CUDA setup failed: %s \n", cudaGetErrorString(err));
			break;
		}

		// Largest power-of-two block size that respects both THREAD_NUM (the
		// kernel's shared array size) and the device limit.
		const int nLimit = THREAD_NUM < nMaxThreads ? THREAD_NUM : nMaxThreads;
		int nThreads = 1;
		while (nThreads * 2 <= nLimit)
		{
			nThreads *= 2;
		}
		//printf("block num = %d, thread num = %d \n", BLOCK_NUM, nThreads);

		clock_t start = clock();
		for (int i = 0; i < LOOPS; ++i)
		{
			err = cudaMemset(pfSum_dev, 0, sizeof(float));
			if (cudaSuccess != err)
			{
				break;
			}
			sumOfCuda2 << <BLOCK_NUM, nThreads >> > (pfData_dev, pfSum_dev, N);
			err = cudaGetLastError(); // reports an invalid launch configuration
			if (cudaSuccess != err)
			{
				break;
			}
			// Blocking copy: also waits for the kernel and surfaces its errors.
			err = cudaMemcpy(&fSum, pfSum_dev, sizeof(float), cudaMemcpyDeviceToHost);
			if (cudaSuccess != err)
			{
				break;
			}
		}
		clock_t t2 = clock();

		if (cudaSuccess != err)
		{
			fprintf(stderr, "CUDA sum failed: %s \n", cudaGetErrorString(err));
			break;
		}
		// Time only the GPU loop, not the allocation and upload above.
		std::cout << "costime is " << (t2 - start) * 1000.0 / CLOCKS_PER_SEC << "ms" << std::endl;

		//std::cout << fSum << std::endl;
		nRet = 0;
	} while (false);

	// cudaFree(NULL) is a no-op, so this is safe on every path.
	cudaFree(pfSum_dev);
	cudaFree(pfData_dev);
	return nRet;
}

YUV转BGR

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"

#include <cstdio>
#include <opencv2/opencv.hpp>

/*
 * Loads 1.jpg, converts it to YUV on the host with OpenCV, converts it back
 * to BGR on the GPU with NPP and writes the result to YUV2BGR.jpg.
 * Returns 0 on success and -1 on any failure.
 */
int main()
{
	cv::Mat matBrgImg = cv::imread("1.jpg");
	if (matBrgImg.empty())
	{
		fprintf(stderr, "failed to read 1.jpg \n");
		return -1;
	}
	int nWidth = matBrgImg.cols;
	int nHeight = matBrgImg.rows;
	cv::Mat matYuvImg;
	cv::cvtColor(matBrgImg, matYuvImg, cv::COLOR_BGR2YUV);

	// Bytes of pixel data per row (3 interleaved 8-bit channels, no padding).
	const size_t nRowBytes = (size_t)nWidth * 3 * sizeof(Npp8u);

	Npp8u* pu8YUV_dev = NULL;
	Npp8u* pu8BGR_dev = NULL;
	int nLineStep_npp = 0;
	int nRet = -1;

	do
	{
		// The source buffer on the device is tightly packed: step == nRowBytes.
		cudaError_t err = cudaMalloc((void**)& pu8YUV_dev, nRowBytes * nHeight);
		if (cudaSuccess == err)
		{
			// 2-D copy so a padded host row stride (matYuvImg.step) is honoured.
			err = cudaMemcpy2D(pu8YUV_dev, nRowBytes, matYuvImg.data, matYuvImg.step,
				nRowBytes, nHeight, cudaMemcpyHostToDevice);
		}
		if (cudaSuccess != err)
		{
			fprintf(stderr, "YUV upload failed: %s \n", cudaGetErrorString(err));
			break;
		}

		// nppiMalloc returns a pitched buffer; nLineStep_npp is its row stride
		// in bytes and may be larger than nRowBytes.
		pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp);
		if (NULL == pu8BGR_dev)
		{
			fprintf(stderr, "nppiMalloc_8u_C3 failed \n");
			break;
		}
		printf("nLineStep_npp = %d \n", nLineStep_npp);

		NppiSize nppSize{ nWidth, nHeight };
		// The destination step must be the pitch of the NPP buffer, not the host step.
		NppStatus nppRet = nppiYUVToBGR_8u_C3R(pu8YUV_dev, (int)nRowBytes, pu8BGR_dev, nLineStep_npp, nppSize);
		printf("nppRet = %d \n", nppRet);
		if (nppRet < NPP_NO_ERROR) // negative codes are errors, positive ones warnings
		{
			break;
		}

		// Copy straight into the output image, dropping the pitch padding row by row.
		cv::Mat newimage(nHeight, nWidth, CV_8UC3);
		err = cudaMemcpy2D(newimage.data, newimage.step, pu8BGR_dev, nLineStep_npp,
			nRowBytes, nHeight, cudaMemcpyDeviceToHost);
		if (cudaSuccess != err)
		{
			fprintf(stderr, "BGR download failed: %s \n", cudaGetErrorString(err));
			break;
		}

		if (!cv::imwrite("YUV2BGR.jpg", newimage))
		{
			fprintf(stderr, "failed to write YUV2BGR.jpg \n");
			break;
		}
		nRet = 0;
	} while (false);

	if (NULL != pu8BGR_dev)
	{
		nppiFree(pu8BGR_dev);
		pu8BGR_dev = NULL;
	}

	if (NULL != pu8YUV_dev)
	{
		cudaFree(pu8YUV_dev);
		pu8YUV_dev = NULL;
	}

	return nRet;
}

向量内积(归约)——原标题为“图像缩放”,但下方代码实现的是向量内积的归约求和

#include <iostream>
#include <cstdio>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


// Number of elements in each input vector.
const int N = 2048;
// Threads per block: launch 32 threads. Also sizes the kernels' shared-memory buffers.
const int threadnum = 32;


/*
 * CPU reference implementation of the vector inner product.
 *
 * a, b : input vectors holding n elements each
 * c    : receives sum(a[i] * b[i]); set to 0 when n <= 0
 * n    : number of elements
 *
 * Each product is formed in T and then accumulated in double.
 */
template <typename T>
void dot_cpu(T* a, T* b, T* c, int n)
{
	double dAcc = 0;
	T* pLhs = a;
	T* pRhs = b;
	for (int nLeft = n; nLeft > 0; --nLeft, ++pLhs, ++pRhs)
	{
		dAcc += (*pLhs) * (*pRhs);
	}
	*c = dAcc;
}


/*
 * Single-block inner product using an interleaved ("scattered") reduction.
 *
 * Launch with ONE block; every block would write c[0]. Any 1-D block size
 * works: at most threadnum threads take part (extra threads idle but still
 * reach every barrier), and block sizes below threadnum or not a power of
 * two are reduced correctly.
 *
 * a, b : device pointers to the n-element input vectors
 * c    : device pointer; c[0] receives the inner product
 * n    : number of elements
 */
template <typename T>
__global__ void dot_gpu_1(T* a, T* b, T* c, int n)
{
	__shared__ T tmp[threadnum];
	const int tid = threadIdx.x; // thread index within the block

	// Number of participating threads: never more than the shared array holds.
	int t_n = blockDim.x;
	if (t_n > threadnum)
	{
		t_n = threadnum;
	}

	if (tid < t_n)
	{
		// Each thread accumulates the products at tid, tid + t_n, tid + 2 * t_n, ...
		double dTemp = 0.0;
		for (int nTid = tid; nTid < n; nTid += t_n)
		{
			dTemp += a[nTid] * b[nTid];
		}
		tmp[tid] = dTemp; // publish this thread's partial product sum
	}
	__syncthreads(); // all partial sums must be stored before the reduction

	// Interleaved reduction: in the round with distance j, every thread whose
	// index is a multiple of 2 * j adds the slot j positions to its right.
	// The bound check keeps reads inside the t_n slots that were written.
	for (int j = 1; j < t_n; j *= 2)
	{
		if (tid % (2 * j) == 0 && tid + j < t_n)
		{
			tmp[tid] += tmp[tid + j];
		}
		__syncthreads();
	}
	if (0 == tid)
	{
		c[0] = tmp[0];
	}
}

/*
 * Single-block inner product using a contiguous ("low threads") reduction:
 * in every round the lower part of the active range adds the upper part.
 *
 * Launch with ONE block; every block would write c[0]. Any 1-D block size
 * works: at most threadnum threads take part (extra threads idle but still
 * reach every barrier), and block sizes below threadnum or not a power of
 * two are reduced correctly.
 *
 * a, b : device pointers to the n-element input vectors
 * c    : device pointer; c[0] receives the inner product
 * n    : number of elements
 */
template <typename T>
__global__ void dot_gpu_2(T* a, T* b, T* c, int n)
{
	__shared__ T tmp[threadnum];
	const int nThreadIdX = threadIdx.x; // thread index within the block

	// Number of participating threads: never more than the shared array holds.
	int nActive = blockDim.x;
	if (nActive > threadnum)
	{
		nActive = threadnum;
	}

	if (nThreadIdX < nActive)
	{
		double dTemp = 0.0;
		for (int nTid = nThreadIdX; nTid < n; nTid += nActive)
		{
			dTemp += a[nTid] * b[nTid];
		}
		tmp[nThreadIdX] = dTemp; // publish this thread's partial product sum
	}
	__syncthreads(); // all partial sums must be stored before the reduction

	// Fold the upper half onto the lower half. With an odd count the middle
	// slot is carried into the next round, so nothing is dropped.
	// nLive is the same for every thread, so all threads hit each barrier.
	for (int nLive = nActive; nLive > 1; )
	{
		const int nHalf = (nLive + 1) / 2;
		if (nThreadIdX + nHalf < nLive)
		{
			tmp[nThreadIdX] += tmp[nThreadIdX + nHalf];
		}
		__syncthreads();
		nLive = nHalf;
	}
	if (0 == nThreadIdX)
	{
		c[0] = tmp[0];
	}
}

/*
 * Multi-block inner product. Every block reduces its share of the products
 * in shared memory and writes its block total to c[blockIdx.x]; the caller
 * has to add up the gridDim.x block totals afterwards.
 *
 * Launch with a 1-D grid of 1-D blocks; c must hold gridDim.x elements.
 * Any block size works: at most threadnum threads per block take part
 * (extra threads idle but still reach every barrier), and block sizes below
 * threadnum or not a power of two are reduced correctly.
 *
 * a, b : device pointers to the n-element input vectors
 * c    : device pointer to gridDim.x per-block results
 * n    : number of elements
 */
template <typename T>
__global__ void dot_gpu_3(T* a, T* b, T* c, int n)
{
	__shared__ T aTmp[threadnum];
	const int nThreadIdX = threadIdx.x; // thread index within the block

	// Number of participating threads per block: never more than the shared array holds.
	int nActive = blockDim.x;
	if (nActive > threadnum)
	{
		nActive = threadnum;
	}
	const int nStep = gridDim.x * nActive; // total number of participating threads

	if (nThreadIdX < nActive)
	{
		double dTemp = 0.0;
		// Start at this thread's global index and advance by the total thread count.
		for (int nTidIdx = blockIdx.x * nActive + nThreadIdX; nTidIdx < n; nTidIdx += nStep)
		{
			dTemp += a[nTidIdx] * b[nTidIdx];
		}
		aTmp[nThreadIdX] = dTemp; // publish this thread's partial sum for its block
	}
	__syncthreads(); // all partial sums must be stored before the reduction

	// Fold the upper half onto the lower half. With an odd count the middle
	// slot is carried into the next round, so nothing is dropped.
	// nLive is the same for every thread, so all threads hit each barrier.
	for (int nLive = nActive; nLive > 1; )
	{
		const int nHalf = (nLive + 1) / 2;
		if (nThreadIdX + nHalf < nLive)
		{
			aTmp[nThreadIdX] += aTmp[nThreadIdX + nHalf];
		}
		__syncthreads();
		nLive = nHalf;
	}

	if (0 == nThreadIdX)
	{
		c[blockIdx.x] = aTmp[0];
	}

}


/*
 * Dot-product demo: fills a = {0, 1, 2, ...} and b = {1, 1, 1, ...}, uploads
 * both to the device and prints the inner product computed by dot_cpu.
 * The GPU launches are left commented out, as in the original; enable one
 * of them together with the copy-back to compare against the CPU value.
 * Returns 0 on success and 1 if a CUDA call failed.
 */
int main()
{
	float a[N], b[N];
	float c = 0;
	for (int i = 0; i < N; ++i) // fill the input vectors
	{
		a[i] = static_cast<float>(i);
		b[i] = 1.0f;
	}

	float* d_a = 0, * d_b = 0, * d_c = 0;

	// Run the device setup step by step and stop at the first failure.
	cudaError_t err = cudaMalloc(&d_a, N * sizeof(float));
	if (cudaSuccess == err)
	{
		err = cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
	}
	if (cudaSuccess == err)
	{
		err = cudaMalloc(&d_b, N * sizeof(float));
	}
	if (cudaSuccess == err)
	{
		err = cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
	}
	if (cudaSuccess == err)
	{
		err = cudaMalloc(&d_c, sizeof(float));
	}
	if (cudaSuccess != err)
	{
		std::cerr << "CUDA setup failed: " << cudaGetErrorString(err) << std::endl;
	}

	dot_cpu(a, b, &c, N);
	//dot_gpu_1 << <1, threadnum >> > (d_a, d_b, d_c, N);
	//dot_gpu_2 << <1, threadnum >> > (d_a, d_b, d_c, N);
	//dot_gpu_3<< <1, threadnum >> > (d_a, d_b, d_c, N);
	//cudaMemcpy(&c, d_c, sizeof(float), cudaMemcpyDeviceToHost);
	std::cout << c << std::endl;

	// cudaFree accepts null pointers, so this is safe after a partial setup.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	return cudaSuccess == err ? 0 : 1;
}

你可能感兴趣的:(CUDA,opencv,图像处理,CUDA)