图像处理的过程中经常会使用到Intel的ipp库(Integrated Performance Primitives),IPP库包含了许多底层优化的函数。如果已经安装了IPP库,opencv在运行时也会自动调用。
近年来,随着计算机视觉的快速发展,使用GPU进行图像处理越来越流行,并展现出明显的优势。NVIDIA公司针对GPU开发了GPU加速的图像、视频、信号处理库NPP(NVIDIA Performance Primitives)。
从取名就可以看出两者之间有着双胞胎一般的相似。事实上,NPP和IPP库函数基本上是一一对应,不同的是一个在CPU上计算一个在GPU上计算。
按照NVIDIA官方的说法,高度优化的NPP函数的执行速度最高可比仅使用CPU快80倍。
使用IPP 之前需要安装IPP或者拥有相应的库环境
// NOTE(review): the two bare `#include` lines in the original lost their header
// names; <iostream> and <opencv2/opencv.hpp> are assumed here — confirm.
#include <exception>
#include <iostream>
#include <stdexcept>
#include <vector>
#include <opencv2/opencv.hpp>
#include "ipp.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"
/*
 * Evaluates an IPP status expression once and throws std::runtime_error("run error")
 * when it differs from ippStsNoErr. Wrapped in do { } while (0) so the macro behaves
 * as a single statement (safe inside unbraced if/else); call it with a trailing ';'.
 */
#define check_sts(st) do { if ((st) != ippStsNoErr) throw std::runtime_error("run error"); } while (0)
/*
 * Applies a 3x3 box (mean) filter to an 8-bit single-channel image on the CPU
 * with Intel IPP (ippiFilterBoxBorder_8u_C1R) using a replicated border.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns the final IppStatus cast to int (ippStsNoErr on the success path).
 * Throws via check_sts when the arguments are invalid or when an IPP call
 * returns anything other than ippStsNoErr.
 */
int filterBoxBorder_ipp(cv::Mat src, cv::Mat dst)
{
    IppStatus status = ippStsNoErr;

    // Validate before handing raw pointers to IPP: the 8u_C1R primitive would
    // otherwise read/write past the end of a mismatched buffer.
    const bool argsOk = !src.empty()
        && src.type() == CV_8UC1
        && dst.type() == CV_8UC1
        && dst.size() == src.size();
    check_sts(status = argsOk ? ippStsNoErr : ippStsSizeErr);

    const IppiSize roiSize = { src.cols, src.rows };   /* ROI covers the whole image */
    const IppiSize maskSize = { 3, 3 };
    const int numChannels = 1;
    const IppiBorderType borderType = ippBorderRepl;
    Ipp8u borderValue = 254;   /* forwarded to IPP alongside ippBorderRepl */
    int iTmpBufSize = 0;       /* work buffer size in bytes, filled in by IPP */

    check_sts(status = ippiFilterBoxBorderGetBufferSize(roiSize, maskSize, ipp8u, numChannels, &iTmpBufSize));

    // std::vector owns the scratch buffer, so it is released even when check_sts throws.
    std::vector<Ipp8u> buffer(iTmpBufSize);

    check_sts(status = ippiFilterBoxBorder_8u_C1R(src.data, static_cast<int>(src.step),
        dst.data, static_cast<int>(dst.step),
        roiSize, maskSize, borderType, &borderValue, buffer.data()));

    return (int)status;
}
使用NPP之前需要有cuda开发环境,可参考另一篇博客CUDA10.1安装 +VS2015开发环境搭建
/*
 * Applies a 3x3 box (mean) filter to an 8-bit single-channel image on the GPU
 * with NPP (nppiFilterBoxBorder_8u_C1R) using a replicated border.
 *
 * The image is uploaded to pitched device memory, filtered, and downloaded
 * into dst. Row-wise (2D) copies are used so that host matrices whose rows are
 * padded (step > width) are handled correctly.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns an NppStatus cast to int: NPP_NO_ERROR on success, the NPP status on
 * filter failure, NPP_SIZE_ERROR for invalid arguments,
 * NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR when a CUDA call fails.
 */
int filterBoxBorder_npp(cv::Mat src, cv::Mat dst)
{
    if (src.empty() || src.type() != CV_8UC1 || dst.type() != CV_8UC1 || dst.size() != src.size())
        return int(NPP_SIZE_ERROR);

    const NppiSize oSrcSize = { src.cols, src.rows };
    const NppiPoint oSrcOffset = { 0, 0 };
    const NppiSize oMaskSize = { 3, 3 };
    const NppiPoint oAnchor = { 1, 1 };   // centre of the 3x3 mask
    const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

    // size_t arithmetic avoids 32-bit overflow on large images.
    const size_t rowBytes = static_cast<size_t>(src.cols) * sizeof(Npp8u);
    const size_t rowCount = static_cast<size_t>(src.rows);

    Npp8u* pSrc = nullptr;
    Npp8u* pDst = nullptr;
    size_t srcPitch = 0;
    size_t dstPitch = 0;
    NppStatus status = NPP_NO_ERROR;

    if (cudaMallocPitch(reinterpret_cast<void**>(&pSrc), &srcPitch, rowBytes, rowCount) != cudaSuccess
        || cudaMallocPitch(reinterpret_cast<void**>(&pDst), &dstPitch, rowBytes, rowCount) != cudaSuccess)
    {
        status = NPP_MEMORY_ALLOCATION_ERR;
    }
    else if (cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
        rowBytes, rowCount, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        status = NPP_MEMCPY_ERROR;
    }
    else
    {
        status = nppiFilterBoxBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
            pDst, static_cast<Npp32s>(dstPitch), oSrcSize, oMaskSize, oAnchor, eBorderType);

        // Negative NPP codes are errors; only download the result when the filter ran.
        if (status >= NPP_NO_ERROR
            && cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
                rowBytes, rowCount, cudaMemcpyDeviceToHost) != cudaSuccess)
        {
            status = NPP_MEMCPY_ERROR;
        }
    }

    // Pointers still null after a failed allocation are safe to pass to cudaFree.
    cudaFree(pSrc);
    cudaFree(pDst);
    return int(status);
}
note: NPP在GPU上计算,所以函数所需的内存都应该在GPU上分配,也就是使用cudaMalloc()分配内存。
int main(int argc, char* argv[])
{
//cv::Mat src = cv::imread(R"(D:\filterPicture_cpu\rotatedImage.bmp)", cv::IMREAD_GRAYSCALE);
cv::Mat src = cv::imread(R"(C:\WorkSpace\Data\testImage\lena.jpg)", cv::IMREAD_GRAYSCALE);
cv::Mat dst(src.size(), CV_8UC1);
cv::Mat dst_d(src.size(), CV_8UC1);
filterBoxBorder_ipp(src, dst);
filterBoxBorder_npp(src, dst_d);
cv::Mat diff = dst - dst_d;
return 0;
}
IPP 与 NPP 计算结果不一致,即CPU和GPU计算结果不一致。
一开始我用NPP结果图减去IPP结果图:cv::Mat diff = dst_d - dst;
差值图像diff 是一张全黑图像,所以我误以为IPP和NPP的均值滤波计算结果是完全一致的。
其实结果并不一致:当我改用IPP结果图减去NPP结果图(cv::Mat diff = dst - dst_d;)时,就发现差值图像是一张只含0和1的图像,所以它们的计算结果并不一致。
之所以会误解,是因为IPP的计算结果比NPP的大1,而差值图像diff是无符号(8位)图像,cv::Mat的减法会做饱和运算,所以小于0的值被截断为0。要同时看到两个方向的差异,应使用cv::absdiff。
/*
 * Applies a 3x3 Gaussian filter (sigma = 1.0) to an 8-bit single-channel image
 * on the CPU with Intel IPP (ippiFilterGaussianBorder_8u_C1R) using a
 * replicated border.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns the final IppStatus cast to int (ippStsNoErr on the success path).
 * Throws via check_sts when the arguments are invalid or when an IPP call
 * returns anything other than ippStsNoErr.
 */
int gaussFilter_IPP(cv::Mat src, cv::Mat dst)
{
    IppStatus status = ippStsNoErr;

    // The C1R primitive below only handles one 8-bit channel, so reject
    // anything else instead of sizing buffers for a channel count it ignores.
    const bool argsOk = !src.empty()
        && src.type() == CV_8UC1
        && dst.type() == CV_8UC1
        && dst.size() == src.size();
    check_sts(status = argsOk ? ippStsNoErr : ippStsSizeErr);

    const IppiSize roiSize = { src.cols, src.rows };   /* ROI covers the whole image */
    const Ipp32u kernelSize = 3;
    const Ipp32f sigma = 1.0f;                         /* standard deviation of the Gaussian */
    const IppiBorderType borderType = ippBorderRepl;
    const Ipp8u borderValue = 0;
    const int numChannels = 1;
    int iSpecSize = 0;     /* size in bytes of the filter context structure */
    int iTmpBufSize = 0;   /* size in bytes of the work buffer */

    check_sts(status = ippiFilterGaussianGetBufferSize(roiSize, kernelSize, ipp8u,
        numChannels, &iSpecSize, &iTmpBufSize));

    // std::vector owns both allocations, so they are released even when check_sts throws.
    std::vector<Ipp8u> spec(iSpecSize);
    std::vector<Ipp8u> buffer(iTmpBufSize);
    IppFilterGaussianSpec* pSpec = (IppFilterGaussianSpec *)spec.data();

    check_sts(status = ippiFilterGaussianInit(roiSize, kernelSize, sigma,
        borderType, ipp8u, numChannels, pSpec, buffer.data()));
    check_sts(status = ippiFilterGaussianBorder_8u_C1R(src.data, static_cast<int>(src.step),
        dst.data, static_cast<int>(dst.step),
        roiSize, borderValue, pSpec, buffer.data()));

    return (int)status;
}
/*
 * Applies a 3x3 Gaussian filter to an 8-bit single-channel image on the GPU
 * with NPP using a replicated border.
 *
 * The border-aware primitive (nppiFilterGaussBorder_8u_C1R) is used because
 * the ROI covers the whole image: the non-border variant would sample the
 * neighbours of edge pixels from outside the device allocation.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns an NppStatus cast to int: NPP_NO_ERROR on success, the NPP status on
 * filter failure, NPP_SIZE_ERROR for invalid arguments,
 * NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR when a CUDA call fails.
 */
int gaussFilter_npp(cv::Mat src, cv::Mat dst)
{
    if (src.empty() || src.type() != CV_8UC1 || dst.type() != CV_8UC1 || dst.size() != src.size())
        return (int)NPP_SIZE_ERROR;

    const NppiSize oSrcSize = { src.cols, src.rows };
    const NppiPoint oSrcOffset = { 0, 0 };
    const NppiMaskSize eMaskSize = NPP_MASK_SIZE_3_X_3;
    const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

    // size_t arithmetic avoids 32-bit overflow on large images.
    const size_t rowBytes = static_cast<size_t>(src.cols) * sizeof(Npp8u);
    const size_t rowCount = static_cast<size_t>(src.rows);

    Npp8u* pSrc = nullptr;
    Npp8u* pDst = nullptr;
    size_t srcPitch = 0;
    size_t dstPitch = 0;
    NppStatus status = NPP_NO_ERROR;

    if (cudaMallocPitch(reinterpret_cast<void**>(&pSrc), &srcPitch, rowBytes, rowCount) != cudaSuccess
        || cudaMallocPitch(reinterpret_cast<void**>(&pDst), &dstPitch, rowBytes, rowCount) != cudaSuccess)
    {
        status = NPP_MEMORY_ALLOCATION_ERR;
    }
    else if (cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
        rowBytes, rowCount, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        status = NPP_MEMCPY_ERROR;
    }
    else
    {
        status = nppiFilterGaussBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
            pDst, static_cast<Npp32s>(dstPitch), oSrcSize, eMaskSize, eBorderType);

        // Download with dst's own row stride; it need not equal src's.
        if (status >= NPP_NO_ERROR
            && cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
                rowBytes, rowCount, cudaMemcpyDeviceToHost) != cudaSuccess)
        {
            status = NPP_MEMCPY_ERROR;
        }
    }

    // Pointers still null after a failed allocation are safe to pass to cudaFree.
    cudaFree(pSrc);
    cudaFree(pDst);
    return (int)status;
}
/*
 * Convolves an 8-bit single-channel image with a fixed 3x3 integer Gaussian
 * kernel on the CPU with Intel IPP (ippiFilterBorder_8u_C1R) using a
 * replicated border and round-to-nearest.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns the final IppStatus cast to int (ippStsNoErr on the success path).
 * Throws via check_sts when the arguments are invalid or when an IPP call
 * returns anything other than ippStsNoErr.
 */
int filterBorder_ipp(cv::Mat src, cv::Mat dst)
{
    // Effective filter coefficient = kernel value / divisor. The nine integer
    // coefficients sum to 10000, so the filter has unit gain.
    const int divisor = 10000;
    const Ipp16s kernel[3 * 3]
        = { 625, 1250, 625,
            1250, 2500, 1250,
            625, 1250, 625 };   // 3x3 Gaussian kernel

    IppStatus status = ippStsNoErr;

    // Validate before handing raw pointers to IPP.
    const bool argsOk = !src.empty()
        && src.type() == CV_8UC1
        && dst.type() == CV_8UC1
        && dst.size() == src.size();
    check_sts(status = argsOk ? ippStsNoErr : ippStsSizeErr);

    const IppiSize roiSize = { src.cols, src.rows };   /* ROI covers the whole image */
    const IppiSize kernelSize = { 3, 3 };
    const IppiBorderType borderType = ippBorderRepl;
    Ipp8u borderValue = 0;
    const int numChannels = 1;
    int iSpecSize = 0;     /* size in bytes of the filter context structure */
    int iTmpBufSize = 0;   /* size in bytes of the work buffer */

    check_sts(status = ippiFilterBorderGetSize(kernelSize, roiSize, ipp8u, ipp16s, numChannels, &iSpecSize, &iTmpBufSize));

    // std::vector owns both allocations, so they are released even when check_sts throws.
    std::vector<Ipp8u> spec(iSpecSize);
    std::vector<Ipp8u> buffer(iTmpBufSize);
    IppiFilterBorderSpec* pSpec = (IppiFilterBorderSpec *)spec.data();

    check_sts(status = ippiFilterBorderInit_16s(kernel, kernelSize, divisor, ipp8u, numChannels, ippRndNear, pSpec));
    check_sts(status = ippiFilterBorder_8u_C1R(src.data, static_cast<int>(src.step),
        dst.data, static_cast<int>(dst.step),
        roiSize, borderType, &borderValue, pSpec, buffer.data()));

    return (int)status;
}
/*
 * Convolves an 8-bit single-channel image with a fixed 3x3 integer Gaussian
 * kernel on the GPU with NPP (nppiFilterBorder_8u_C1R) using a replicated
 * border.
 *
 * The kernel coefficients and the image are uploaded to device memory, the
 * filter is run, and the result is downloaded into dst. Row-wise (2D) copies
 * are used so that host matrices with padded rows are handled correctly.
 *
 * src  Input image; must be a non-empty CV_8UC1 matrix.
 * dst  Output image; must already be allocated as CV_8UC1 with the same size
 *      as src. The result is written through dst.data.
 *
 * Returns an NppStatus cast to int: NPP_NO_ERROR on success, the NPP status on
 * filter failure, NPP_SIZE_ERROR for invalid arguments,
 * NPP_MEMORY_ALLOCATION_ERR / NPP_MEMCPY_ERROR when a CUDA call fails.
 */
int filterBorder_npp(cv::Mat src, cv::Mat dst)
{
    if (src.empty() || src.type() != CV_8UC1 || dst.type() != CV_8UC1 || dst.size() != src.size())
        return int(NPP_SIZE_ERROR);

    // Effective filter coefficient = kernel value / nDivisor. The nine integer
    // coefficients sum to 10000, so the filter has unit gain.
    const NppiSize oKernelSize = { 3, 3 };
    const Npp32s nDivisor = 10000;
    const Npp32s kernel[] = { 625, 1250, 625,
                              1250, 2500, 1250,
                              625, 1250, 625 };   // 3x3 Gaussian kernel
    const size_t kernelBytes = sizeof(kernel);

    const NppiSize oSrcSize = { src.cols, src.rows };
    const NppiPoint oSrcOffset = { 0, 0 };
    const NppiPoint oAnchor = { 1, 1 };   // centre of the 3x3 kernel
    const NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

    // size_t arithmetic avoids 32-bit overflow on large images.
    const size_t rowBytes = static_cast<size_t>(src.cols) * sizeof(Npp8u);
    const size_t rowCount = static_cast<size_t>(src.rows);

    Npp32s* pkernel_d = nullptr;
    Npp8u* pSrc = nullptr;
    Npp8u* pDst = nullptr;
    size_t srcPitch = 0;
    size_t dstPitch = 0;
    NppStatus status = NPP_NO_ERROR;

    if (cudaMalloc(reinterpret_cast<void**>(&pkernel_d), kernelBytes) != cudaSuccess
        || cudaMallocPitch(reinterpret_cast<void**>(&pSrc), &srcPitch, rowBytes, rowCount) != cudaSuccess
        || cudaMallocPitch(reinterpret_cast<void**>(&pDst), &dstPitch, rowBytes, rowCount) != cudaSuccess)
    {
        status = NPP_MEMORY_ALLOCATION_ERR;
    }
    else if (cudaMemcpy(pkernel_d, kernel, kernelBytes, cudaMemcpyHostToDevice) != cudaSuccess
        || cudaMemcpy2D(pSrc, srcPitch, src.data, static_cast<size_t>(src.step),
            rowBytes, rowCount, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        status = NPP_MEMCPY_ERROR;
    }
    else
    {
        status = nppiFilterBorder_8u_C1R(pSrc, static_cast<Npp32s>(srcPitch), oSrcSize, oSrcOffset,
            pDst, static_cast<Npp32s>(dstPitch), oSrcSize,
            pkernel_d, oKernelSize, oAnchor, nDivisor, eBorderType);

        // Negative NPP codes are errors; only download the result when the filter ran.
        if (status >= NPP_NO_ERROR
            && cudaMemcpy2D(dst.data, static_cast<size_t>(dst.step), pDst, dstPitch,
                rowBytes, rowCount, cudaMemcpyDeviceToHost) != cudaSuccess)
        {
            status = NPP_MEMCPY_ERROR;
        }
    }

    // Pointers still null after a failed allocation are safe to pass to cudaFree.
    cudaFree(pSrc);
    cudaFree(pDst);
    cudaFree(pkernel_d);
    return int(status);
}
CUDA10.1安装 +VS2015开发环境搭建
IPP库简介
NVIDIA NPP