NPP库使用入门

图像处理的过程中经常会使用到Intel的ipp库(Integrated Performance Primitives),IPP库包含了许多底层优化的函数。如果已经安装了IPP库,opencv在运行时也会自动调用。
近年来,随着计算机视觉的快速发展,使用GPU进行图像处理越来越流行,并逐渐展现出优势。NVIDIA公司针对GPU开发了GPU加速的图像、视频、信号处理库NPP(NVIDIA Performance Primitives)。
从取名就可以看出两者之间有着双胞胎一般的相似。事实上,NPP和IPP库函数基本上是一一对应,不同的是一个在CPU上计算一个在GPU上计算。
高度优化的NPP计算单元,执行速度比仅使用CPU要快80倍。
NPP库使用入门_第1张图片

ipp 均值滤波

使用IPP 之前需要安装IPP或者拥有相应的库环境

#include <exception>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <opencv2/opencv.hpp>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "ipp.h"
#include "npp.h"

/*
 * Evaluates an IPP status expression and throws std::runtime_error("run error")
 * when it is anything other than ippStsNoErr (positive warning codes included).
 * Wrapped in do { } while (0) so the macro behaves as a single statement and
 * cannot capture a following `else`. std::runtime_error is used because
 * constructing std::exception from a string is a non-standard MSVC extension;
 * it still derives from std::exception, so existing handlers keep working.
 */
#define check_sts(st) do { if ((st) != ippStsNoErr) throw std::runtime_error("run error"); } while (0)


/*
 * 3x3 box (mean) filter on the CPU using IPP, with replicated borders.
 *
 * src: 8-bit single-channel input image.
 * dst: pre-allocated 8-bit single-channel output image of the same size as src;
 *      cv::Mat copies share pixel storage, so the caller's image is written.
 *
 * Returns the IppStatus as int. Invalid arguments are reported with
 * ippStsNullPtrErr / ippStsDataTypeErr / ippStsSizeErr without touching dst.
 * A failing IPP call throws through check_sts.
 */
int filterBoxBorder_ipp(cv::Mat src, cv::Mat dst)
{
	// Reject inputs the 8u_C1R primitive cannot handle before writing through dst.data.
	if (src.empty() || dst.empty())
		return (int)ippStsNullPtrErr;
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return (int)ippStsDataTypeErr;
	if (src.size() != dst.size())
		return (int)ippStsSizeErr;

	IppStatus status = ippStsNoErr;
	const IppiSize roiSize = { src.cols, src.rows };   /* Size of source/destination ROI in pixels */
	const IppiSize maskSize = { 3, 3 };
	const int numChannels = 1;
	const IppiBorderType borderType = ippBorderRepl;
	// Per the IPP documentation the border value applies only to ippBorderConst;
	// it is passed because the API requires the argument.
	const Ipp8u borderValue = 254;

	const Ipp8u* pSrc = src.data;
	Ipp8u* pDst = dst.data;
	const int srcStep = static_cast<int>(src.step);    /* Row stride of src in bytes */
	const int dstStep = static_cast<int>(dst.step);    /* Row stride of dst in bytes */

	// Ask IPP how much scratch memory the filter needs, then let std::vector own it.
	int iTmpBufSize = 0;
	check_sts(status = ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ipp8u, numChannels, &iTmpBufSize));
	std::vector<Ipp8u> buffer(iTmpBufSize);

	check_sts(status = ippiFilterBoxBorder_8u_C1R(pSrc, srcStep, pDst, dstStep, roiSize, maskSize, borderType, &borderValue, buffer.data()));

	return (int)status;
}

NPP 均值滤波

使用NPP之前需要有cuda开发环境,可参考另一篇博客CUDA10.1安装 +VS2015开发环境搭建


/*
 * 3x3 box (mean) filter on the GPU using NPP, with replicated borders.
 *
 * src: 8-bit single-channel input image (host memory).
 * dst: pre-allocated 8-bit single-channel output image of the same size as src
 *      (host memory); cv::Mat copies share pixel storage, so the caller's image
 *      is written.
 *
 * The image is uploaded to pitched device memory, filtered and downloaded again.
 * Copies are done row by row with cudaMemcpy2D so that non-continuous Mats
 * (ROI views) are neither over-read nor over-written.
 *
 * Returns the NppStatus as int. Invalid arguments yield NPP_NULL_POINTER_ERROR /
 * NPP_BAD_ARGUMENT_ERROR / NPP_SIZE_ERROR; CUDA allocation or copy failures
 * yield NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR. Device memory is released
 * on every path.
 */
int filterBoxBorder_npp(cv::Mat src, cv::Mat dst)
{
	if (src.empty() || dst.empty())
		return int(NPP_NULL_POINTER_ERROR);
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return int(NPP_BAD_ARGUMENT_ERROR);
	if (src.size() != dst.size())
		return int(NPP_SIZE_ERROR);

	const NppiSize oSrcSize = { src.cols, src.rows };
	const NppiPoint oSrcOffset = { 0, 0 };
	const NppiPoint oAnchor = { 1, 1 };          // centre of the 3x3 mask
	const NppiSize oMaskSize = { 3, 3 };
	const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

	const size_t rowBytes = static_cast<size_t>(src.cols);   // one byte per pixel (CV_8UC1)
	const size_t rows = static_cast<size_t>(src.rows);

	Npp8u* pSrc = nullptr;
	Npp8u* pDst = nullptr;
	size_t srcPitch = 0, dstPitch = 0;
	NppStatus status = NPP_NO_ERROR;

	if (cudaMallocPitch(&pSrc, &srcPitch, rowBytes, rows) != cudaSuccess ||
		cudaMallocPitch(&pDst, &dstPitch, rowBytes, rows) != cudaSuccess)
	{
		status = NPP_MEMORY_ALLOCATION_ERR;
	}
	else if (cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
		rowBytes, rows, cudaMemcpyHostToDevice) != cudaSuccess)
	{
		status = NPP_MEMCPY_ERROR;
	}
	else
	{
		status = nppiFilterBoxBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
			pDst, static_cast<Npp32s>(dstPitch), oSrcSize, oMaskSize, oAnchor, eBorderType);

		// Negative NppStatus values are errors; only download a valid result.
		if (status >= NPP_NO_ERROR &&
			cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
				rowBytes, rows, cudaMemcpyDeviceToHost) != cudaSuccess)
		{
			status = NPP_MEMCPY_ERROR;
		}
	}

	// cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
	cudaFree(pSrc);
	cudaFree(pDst);
	return int(status);
}

note: NPP在GPU上计算,所以函数所需的内存都应该在GPU上分配,也就是使用cudaMalloc()分配内存。

对比IPP和NPP计算结果


int main(int argc, char* argv[])
{

	//cv::Mat src = cv::imread(R"(D:\filterPicture_cpu\rotatedImage.bmp)", cv::IMREAD_GRAYSCALE);
	cv::Mat src = cv::imread(R"(C:\WorkSpace\Data\testImage\lena.jpg)", cv::IMREAD_GRAYSCALE);
	cv::Mat dst(src.size(), CV_8UC1);
	cv::Mat dst_d(src.size(), CV_8UC1);

	filterBoxBorder_ipp(src, dst);
	filterBoxBorder_npp(src, dst_d);
	cv::Mat diff = dst - dst_d;
	
	return 0;
}

IPP 与 NPP 计算结果不一致,即CPU和GPU计算结果不一致。
一开始我用NPP结果图减去IPP结果图:cv::Mat diff = dst_d - dst;
差值图像diff 是一张全黑图像,所以我误以为IPP和NPP的均值滤波计算结果是完全一致的。
其实结果并不一致:当我改用IPP结果图减去NPP结果图(cv::Mat diff = dst - dst_d;)时,差值图像是一张只含0和1的图像,说明两者的计算结果并不完全相同。
之所以会产生误解,是因为IPP的部分像素计算结果比NPP的大1,而差值图像diff是无符号(8位)图像,cv::Mat相减时小于0的结果会被饱和截断为0。

同理可以测试高斯滤波函数,任意系数的滤波函数。它们的计算结果都不一致,而且都是Ipp的部分像素计算结果比Npp的大1。

高斯滤波


/*
 * 3x3 Gaussian filter (sigma = 1.0) on the CPU using IPP, with replicated borders.
 *
 * src: 8-bit single-channel input image.
 * dst: pre-allocated 8-bit single-channel output image of the same size as src;
 *      cv::Mat copies share pixel storage, so the caller's image is written.
 *
 * Returns the IppStatus as int. Invalid arguments are reported with
 * ippStsNullPtrErr / ippStsDataTypeErr / ippStsSizeErr without touching dst.
 * A failing IPP call throws through check_sts.
 */
int  gaussFilter_IPP(cv::Mat src, cv::Mat dst)
{
	// The _8u_C1R primitive below only handles 8-bit single-channel data.
	if (src.empty() || dst.empty())
		return (int)ippStsNullPtrErr;
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return (int)ippStsDataTypeErr;
	if (src.size() != dst.size())
		return (int)ippStsSizeErr;

	IppStatus status = ippStsNoErr;
	const IppiSize roiSize = { src.cols, src.rows };  /* Size of source/destination ROI in pixels */
	const Ipp32u kernelSize = 3;
	const Ipp32f sigma = 1.0f;                        /* Standard deviation of the Gaussian */
	const IppiBorderType borderType = ippBorderRepl;
	const Ipp8u borderValue = 0;
	const int numChannels = src.channels();           /* 1 after the type check above */

	const Ipp8u* pSrc = src.data;
	Ipp8u* pDst = dst.data;
	const int srcStep = static_cast<int>(src.step);   /* Row stride of src in bytes */
	const int dstStep = static_cast<int>(dst.step);   /* Row stride of dst in bytes */

	// Query the sizes of the context structure and of the work buffer.
	int iTmpBufSize = 0, iSpecSize = 0;
	check_sts(status = ippiFilterGaussianGetBufferSize(roiSize, kernelSize, ipp8u,
		numChannels, &iSpecSize, &iTmpBufSize));

	// std::vector owns both allocations, so they are released even if check_sts throws.
	std::vector<Ipp8u> spec(iSpecSize);
	std::vector<Ipp8u> buffer(iTmpBufSize);
	IppFilterGaussianSpec* pSpec = reinterpret_cast<IppFilterGaussianSpec*>(spec.data());

	check_sts(status = ippiFilterGaussianInit(roiSize, kernelSize, sigma,
		borderType, ipp8u, numChannels, pSpec, buffer.data()));

	check_sts(status = ippiFilterGaussianBorder_8u_C1R(pSrc, srcStep, pDst, dstStep,
		roiSize, borderValue, pSpec, buffer.data()));

	return (int)status;
}
/*
 * 3x3 Gaussian filter on the GPU using NPP, with replicated borders.
 *
 * src: 8-bit single-channel input image (host memory).
 * dst: pre-allocated 8-bit single-channel output image of the same size as src
 *      (host memory); cv::Mat copies share pixel storage, so the caller's image
 *      is written.
 *
 * The border-aware nppiFilterGaussBorder_8u_C1R is used: the ROI covers the
 * whole image, and the borderless nppiFilterGauss_8u_C1R would sample pixels
 * outside the allocated source buffer along the image edges. The source and
 * destination strides are handled independently, and copies go through
 * cudaMemcpy2D so non-continuous Mats are handled correctly.
 *
 * Returns the NppStatus as int. Invalid arguments yield NPP_NULL_POINTER_ERROR /
 * NPP_BAD_ARGUMENT_ERROR / NPP_SIZE_ERROR; CUDA allocation or copy failures
 * yield NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR. Device memory is released
 * on every path.
 */
int gaussFilter_npp(cv::Mat src, cv::Mat dst)
{
	if (src.empty() || dst.empty())
		return (int)NPP_NULL_POINTER_ERROR;
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return (int)NPP_BAD_ARGUMENT_ERROR;
	if (src.size() != dst.size())
		return (int)NPP_SIZE_ERROR;

	const NppiSize oSrcSize = { src.cols, src.rows };
	const NppiPoint oSrcOffset = { 0, 0 };
	const NppiMaskSize eMaskSize = NPP_MASK_SIZE_3_X_3;
	const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

	const size_t rowBytes = static_cast<size_t>(src.cols);   // one byte per pixel (CV_8UC1)
	const size_t rows = static_cast<size_t>(src.rows);

	Npp8u* pSrc = nullptr;
	Npp8u* pDst = nullptr;
	size_t srcPitch = 0, dstPitch = 0;
	NppStatus status = NPP_NO_ERROR;

	if (cudaMallocPitch(&pSrc, &srcPitch, rowBytes, rows) != cudaSuccess ||
		cudaMallocPitch(&pDst, &dstPitch, rowBytes, rows) != cudaSuccess)
	{
		status = NPP_MEMORY_ALLOCATION_ERR;
	}
	else if (cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
		rowBytes, rows, cudaMemcpyHostToDevice) != cudaSuccess)
	{
		status = NPP_MEMCPY_ERROR;
	}
	else
	{
		status = nppiFilterGaussBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
			pDst, static_cast<Npp32s>(dstPitch), oSrcSize, eMaskSize, eBorderType);

		// Negative NppStatus values are errors; only download a valid result.
		if (status >= NPP_NO_ERROR &&
			cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
				rowBytes, rows, cudaMemcpyDeviceToHost) != cudaSuccess)
		{
			status = NPP_MEMCPY_ERROR;
		}
	}

	// cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
	cudaFree(pSrc);
	cudaFree(pDst);

	return (int)status;
}

任意系数的滤波函数


/*
 * Filters an image with a user-defined 3x3 integer kernel on the CPU using IPP,
 * with replicated borders.
 *
 * src: 8-bit single-channel input image.
 * dst: pre-allocated 8-bit single-channel output image of the same size as src;
 *      cv::Mat copies share pixel storage, so the caller's image is written.
 *
 * Returns the IppStatus as int. Invalid arguments are reported with
 * ippStsNullPtrErr / ippStsDataTypeErr / ippStsSizeErr without touching dst.
 * A failing IPP call throws through check_sts.
 */
int filterBorder_ipp(cv::Mat src, cv::Mat dst)
{
	if (src.empty() || dst.empty())
		return (int)ippStsNullPtrErr;
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return (int)ippStsDataTypeErr;
	if (src.size() != dst.size())
		return (int)ippStsSizeErr;

	// Divisor applied to the weighted sum: the effective coefficients are kernel[i] / divisor.
	const int divisor = 10000;
	const Ipp16s kernel[3 * 3]
		= { 625,1250,625,
		1250,2500,1250,
		625,1250,625 }; // 3x3 Gauss kernel

	IppStatus status = ippStsNoErr;
	const IppiSize roiSize = { src.cols, src.rows };   /* Size of source/destination ROI in pixels */
	const IppiSize kernelSize = { 3, 3 };
	const IppiBorderType borderType = ippBorderRepl;
	const Ipp8u borderValue = 0;
	const int numChannels = 1;

	const Ipp8u* pSrc = src.data;
	Ipp8u* pDst = dst.data;
	const int srcStep = static_cast<int>(src.step);    /* Row stride of src in bytes */
	const int dstStep = static_cast<int>(dst.step);    /* Row stride of dst in bytes */

	// Query the sizes of the context structure and of the work buffer.
	int iTmpBufSize = 0, iSpecSize = 0;
	check_sts(status = ippiFilterBorderGetSize(kernelSize, roiSize, ipp8u, ipp16s, numChannels, &iSpecSize, &iTmpBufSize));

	// std::vector owns both allocations, so they are released even if check_sts throws.
	std::vector<Ipp8u> spec(iSpecSize);
	std::vector<Ipp8u> buffer(iTmpBufSize);
	IppiFilterBorderSpec* pSpec = reinterpret_cast<IppiFilterBorderSpec*>(spec.data());

	check_sts(status = ippiFilterBorderInit_16s(kernel, kernelSize, divisor, ipp8u, numChannels, ippRndNear, pSpec));

	check_sts(status = ippiFilterBorder_8u_C1R(pSrc, srcStep, pDst, dstStep, roiSize, borderType, &borderValue, pSpec, buffer.data()));

	return (int)status;
}
/*
 * Filters an image with a user-defined 3x3 integer kernel on the GPU using NPP,
 * with replicated borders.
 *
 * src: 8-bit single-channel input image (host memory).
 * dst: pre-allocated 8-bit single-channel output image of the same size as src
 *      (host memory); cv::Mat copies share pixel storage, so the caller's image
 *      is written.
 *
 * The kernel coefficients and the image are uploaded to device memory, the
 * image is filtered and the result downloaded. Image copies go through
 * cudaMemcpy2D so non-continuous Mats are handled correctly.
 *
 * Returns the NppStatus as int. Invalid arguments yield NPP_NULL_POINTER_ERROR /
 * NPP_BAD_ARGUMENT_ERROR / NPP_SIZE_ERROR; CUDA allocation or copy failures
 * yield NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR. Device memory is released
 * on every path.
 */
int filterBorder_npp(cv::Mat src, cv::Mat dst)
{
	if (src.empty() || dst.empty())
		return int(NPP_NULL_POINTER_ERROR);
	if (src.type() != CV_8UC1 || dst.type() != CV_8UC1)
		return int(NPP_BAD_ARGUMENT_ERROR);
	if (src.size() != dst.size())
		return int(NPP_SIZE_ERROR);

	const NppiSize oKernelSize = { 3 , 3 };
	// Divisor applied to the weighted sum: the effective coefficients are kernel[i] / nDivisor.
	const Npp32s nDivisor = 10000;
	const Npp32s kernel[] = { 625,1250,625,
		1250,2500,1250,
		625,1250,625 }; // 3x3 Gauss kernel
	const size_t kernelBytes = sizeof(kernel);

	const NppiSize oSrcSize = { src.cols, src.rows };
	const NppiPoint oSrcOffset = { 0, 0 };
	const NppiPoint oAnchor = { 1, 1 };          // centre of the 3x3 kernel
	const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

	const size_t rowBytes = static_cast<size_t>(src.cols);   // one byte per pixel (CV_8UC1)
	const size_t rows = static_cast<size_t>(src.rows);

	Npp32s* pkernel_d = nullptr;
	Npp8u* pSrc = nullptr;
	Npp8u* pDst = nullptr;
	size_t srcPitch = 0, dstPitch = 0;
	NppStatus status = NPP_NO_ERROR;

	if (cudaMalloc(&pkernel_d, kernelBytes) != cudaSuccess ||
		cudaMallocPitch(&pSrc, &srcPitch, rowBytes, rows) != cudaSuccess ||
		cudaMallocPitch(&pDst, &dstPitch, rowBytes, rows) != cudaSuccess)
	{
		status = NPP_MEMORY_ALLOCATION_ERR;
	}
	else if (cudaMemcpy(pkernel_d, kernel, kernelBytes, cudaMemcpyHostToDevice) != cudaSuccess ||
		cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
			rowBytes, rows, cudaMemcpyHostToDevice) != cudaSuccess)
	{
		status = NPP_MEMCPY_ERROR;
	}
	else
	{
		status = nppiFilterBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
			pDst, static_cast<Npp32s>(dstPitch), oSrcSize, pkernel_d, oKernelSize, oAnchor, nDivisor, eBorderType);

		// Negative NppStatus values are errors; only download a valid result.
		if (status >= NPP_NO_ERROR &&
			cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
				rowBytes, rows, cudaMemcpyDeviceToHost) != cudaSuccess)
		{
			status = NPP_MEMCPY_ERROR;
		}
	}

	// cudaFree(nullptr) is a no-op, so this is safe after a partial allocation.
	cudaFree(pSrc);
	cudaFree(pDst);
	cudaFree(pkernel_d);
	return int(status);
}

参考文献

CUDA10.1安装 +VS2015开发环境搭建
IPP库简介
NVIDIA NPP

你可能感兴趣的:(CUDA,图像处理,NPP,cuda,IPP)