博主在CUDA库之NPP:NVIDIA 2D Image and Signal Processing Performance Primitives中已经详细介绍了NPP是啥,以及如何编译NPP。
这里就以 YUV转BGR为例,来完成NPP中的第一个例子(PS:也是博主的第一个Demo)
nppiYUVToBGR_8u_C3R
函数,将内存从设备端拷贝到主机端,再利用Opencv将图像保存出来。
第一步、利用Opencv读取BGR图像,并转换成YUV数据格式
// Read the BGR source image from disk with OpenCV.
// NOTE(review): the result of cv::imread is not checked here; presumably an
// unreadable path leaves matBrgImg empty — confirm the file exists before running.
cv::Mat matBrgImg = cv::imread("./data/Fig0638(a)(lenna_RGB).jpg");
// Image dimensions in pixels.
int nWidth = matBrgImg.cols;
int nHeight = matBrgImg.rows;
int nStep = matBrgImg.step; // row stride in bytes; here it equals nWidth * 3
// Convert the BGR image to YUV on the host; matYuvImg receives the converted data.
cv::Mat matYuvImg;
cv::cvtColor(matBrgImg, matYuvImg, CV_BGR2YUV);
上图中,左边是原图,右边是YUV数据
第二步,将YUV数据从 host拷贝到 dev端
Npp8u *pu8YUV_dev = NULL;
cudaMalloc((void **)& pu8YUV_dev, nWidth * nHeight * 3 * sizeof(Npp8u));
cudaMemcpy(pu8YUV_dev, (Npp8u *)matYuvImg.data, nWidth * nHeight * 3 * sizeof(Npp8u), cudaMemcpyHostToDevice);
这里的 Npp8u 定义在 nppdefs.h 里,就是一个 unsigned char。
关于npp中更多数据类型的定义,可见本文末尾。
第三步、在Device上创建存放BGR数据的内存
这里推荐使用Npp8u * nppiMalloc_8u_C3(int nWidthPixels, int nHeightPixels, int * pStepBytes);
当然 cudaMalloc 函数也可以在Device上开辟内存空间。
npp这个函数,可以看到一个参数是 pStepBytes,这个参数返回每一行占用的字节数。由于本文采用的是 $512 \times 512$ 的图像,所以这个值返回为 $512 \times 3 = 1536$。
// Allocate the device buffer that will receive the BGR result. NPP reports the
// number of bytes per row of the allocation through nLineStep_npp.
int nLineStep_npp = 0;
Npp8u *pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp);
printf("nLineStep_npp = %d \n", nLineStep_npp);

// Region of interest covering the whole image, plus the status variable used
// by the NPP call that follows.
NppiSize nppSize = {nWidth, nHeight};
NppStatus nppRet = NPP_NO_ERROR;
第四步、利用npp中的 nppiYUVToBGR_8u_C3R 函数将YUV数据转换成BGR
// Convert the YUV image in pu8YUV_dev to BGR in pu8BGR_dev on the device.
// NOTE(review): nStep (the host image row stride) is passed as both the source
// and the destination step, so output rows are written nStep bytes apart and
// nLineStep_npp from nppiMalloc_8u_C3 is not used. The device-to-host copy that
// follows reads the buffer as one contiguous nWidth * nHeight * 3 block, so the
// two must be changed together if the allocation pitch is ever adopted.
nppRet = nppiYUVToBGR_8u_C3R(pu8YUV_dev, nStep, pu8BGR_dev, nStep, nppSize);
// Print the returned status code; 0 (NPP_NO_ERROR) means success.
printf("nppRet = %d \n", nppRet);
第五步、将BGR数据从dev端拷贝到Host端,并验证结果
unsigned char *pu8Bgr_host = NULL;
pu8Bgr_host = (unsigned char *)malloc( nWidth * nHeight * 3);
memset(pu8Bgr_host, 0, nWidth * nHeight * 3);
cudaMemcpy(pu8Bgr_host, pu8BGR_dev, nWidth * nHeight * 3, cudaMemcpyDeviceToHost);
cv::Mat newimage(nHeight, nWidth, CV_8UC3);
memcpy(newimage.data, pu8Bgr_host, nWidth * nHeight * 3);
cv::imwrite("./yuv2BGR.jpg",newimage );
最后,别忘记释放内存空间
// Release every buffer with the deallocator that matches its allocator, and
// reset each pointer afterwards so it cannot be released twice.

// Host staging buffer (malloc).
if (pu8Bgr_host != NULL)
{
    free(pu8Bgr_host);
    pu8Bgr_host = NULL;
}

// Device YUV input buffer (cudaMalloc).
if (pu8YUV_dev != NULL)
{
    cudaFree(pu8YUV_dev);
    pu8YUV_dev = NULL;
}

// Device BGR output buffer (nppiMalloc_8u_C3).
if (pu8BGR_dev != NULL)
{
    nppiFree(pu8BGR_dev);
    pu8BGR_dev = NULL;
}
1、npp中基础数据类型定义,包括int、float等
具体定义可在 nppdefs.h里查下,以下仅摘录关于数据类型定义的一部分。
/* Excerpt of the basic scalar type aliases. Npp8u, the element type used for the
 * image buffers in this article, is an alias for unsigned char. */
/** \defgroup npp_basic_types Basic NPP Data Types
* @{
*/
typedef unsigned char Npp8u; /**< 8-bit unsigned chars */
typedef signed char Npp8s; /**< 8-bit signed chars */
typedef unsigned short Npp16u; /**< 16-bit unsigned integers */
typedef short Npp16s; /**< 16-bit signed integers */
typedef unsigned int Npp32u; /**< 32-bit unsigned integers */
typedef int Npp32s; /**< 32-bit signed integers */
typedef unsigned long long Npp64u; /**< 64-bit unsigned integers */
typedef long long Npp64s; /**< 64-bit signed integers */
typedef float Npp32f; /**< 32-bit (IEEE) floating-point numbers */
typedef double Npp64f; /**< 64-bit floating-point numbers */
/** @} */ /* end of the excerpt */
2、npp中返回码的定义,主要用来判断函数是否出错以及出错的原因
当返回码为0时,表示成功运行
/**
 * Status codes returned by NPP functions.
 * Negative values indicate errors, zero indicates success, and positive values
 * indicate warnings.
 */
typedef enum
{
/* negative return-codes indicate errors */
NPP_NOT_SUPPORTED_MODE_ERROR = -9999,
NPP_INVALID_HOST_POINTER_ERROR = -1032,
NPP_INVALID_DEVICE_POINTER_ERROR = -1031,
NPP_LUT_PALETTE_BITSIZE_ERROR = -1030,
NPP_ZC_MODE_NOT_SUPPORTED_ERROR = -1028, /**< ZeroCrossing mode not supported */
NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY = -1027,
NPP_TEXTURE_BIND_ERROR = -1024,
NPP_WRONG_INTERSECTION_ROI_ERROR = -1020,
NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR = -1006,
NPP_MEMFREE_ERROR = -1005,
NPP_MEMSET_ERROR = -1004,
NPP_MEMCPY_ERROR = -1003,
NPP_ALIGNMENT_ERROR = -1002,
NPP_CUDA_KERNEL_EXECUTION_ERROR = -1000,
NPP_ROUND_MODE_NOT_SUPPORTED_ERROR = -213, /**< Unsupported round mode*/
NPP_QUALITY_INDEX_ERROR = -210, /**< Image pixels are constant for quality index */
NPP_RESIZE_NO_OPERATION_ERROR = -201, /**< One of the output image dimensions is less than 1 pixel */
NPP_OVERFLOW_ERROR = -109, /**< Number overflows the upper or lower limit of the data type */
NPP_NOT_EVEN_STEP_ERROR = -108, /**< Step value is not pixel multiple */
NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR = -107, /**< Number of levels for histogram is less than 2 */
NPP_LUT_NUMBER_OF_LEVELS_ERROR = -106, /**< Number of levels for LUT is less than 2 */
NPP_CORRUPTED_DATA_ERROR = -61, /**< Processed data is corrupted */
NPP_CHANNEL_ORDER_ERROR = -60, /**< Wrong order of the destination channels */
NPP_ZERO_MASK_VALUE_ERROR = -59, /**< All values of the mask are zero */
NPP_QUADRANGLE_ERROR = -58, /**< The quadrangle is nonconvex or degenerates into triangle, line or point */
NPP_RECTANGLE_ERROR = -57, /**< Size of the rectangle region is less than or equal to 1 */
NPP_COEFFICIENT_ERROR = -56, /**< Unallowable values of the transformation coefficients */
NPP_NUMBER_OF_CHANNELS_ERROR = -53, /**< Bad or unsupported number of channels */
NPP_COI_ERROR = -52, /**< Channel of interest is not 1, 2, or 3 */
NPP_DIVISOR_ERROR = -51, /**< Divisor is equal to zero */
NPP_CHANNEL_ERROR = -47, /**< Illegal channel index */
NPP_STRIDE_ERROR = -37, /**< Stride is less than the row length */
NPP_ANCHOR_ERROR = -34, /**< Anchor point is outside mask */
NPP_MASK_SIZE_ERROR = -33, /**< Lower bound is larger than upper bound */
NPP_RESIZE_FACTOR_ERROR = -23,
NPP_INTERPOLATION_ERROR = -22,
NPP_MIRROR_FLIP_ERROR = -21,
NPP_MOMENT_00_ZERO_ERROR = -20,
NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR = -19,
NPP_THRESHOLD_ERROR = -18,
NPP_CONTEXT_MATCH_ERROR = -17,
NPP_FFT_FLAG_ERROR = -16,
NPP_FFT_ORDER_ERROR = -15,
NPP_STEP_ERROR = -14, /**< Step is less or equal zero */
NPP_SCALE_RANGE_ERROR = -13,
NPP_DATA_TYPE_ERROR = -12,
NPP_OUT_OFF_RANGE_ERROR = -11,
NPP_DIVIDE_BY_ZERO_ERROR = -10,
NPP_MEMORY_ALLOCATION_ERR = -9,
NPP_NULL_POINTER_ERROR = -8,
NPP_RANGE_ERROR = -7,
NPP_SIZE_ERROR = -6,
NPP_BAD_ARGUMENT_ERROR = -5,
NPP_NO_MEMORY_ERROR = -4,
NPP_NOT_IMPLEMENTED_ERROR = -3,
NPP_ERROR = -2,
NPP_ERROR_RESERVED = -1,
/* success */
NPP_NO_ERROR = 0, /**< Error free operation */
NPP_SUCCESS = NPP_NO_ERROR, /**< Successful operation (same as NPP_NO_ERROR) */
/* positive return-codes indicate warnings */
NPP_NO_OPERATION_WARNING = 1, /**< Indicates that no operation was performed */
NPP_DIVIDE_BY_ZERO_WARNING = 6, /**< Divisor is zero however does not terminate the execution */
NPP_AFFINE_QUAD_INCORRECT_WARNING = 28, /**< Indicates that the quadrangle passed to one of affine warping functions doesn't have necessary properties. First 3 vertices are used, the fourth vertex discarded. */
NPP_WRONG_INTERSECTION_ROI_WARNING = 29, /**< The given ROI has no intersection with either the source or destination ROI. Thus no operation was performed. */
NPP_WRONG_INTERSECTION_QUAD_WARNING = 30, /**< The given quadrangle has no intersection with either the source or destination ROI. Thus no operation was performed. */
NPP_DOUBLE_SIZE_WARNING = 35, /**< Image size isn't multiple of two. Indicates that in case of 422/411/420 sampling the ROI width/height was modified for proper processing. */
NPP_MISALIGNED_DST_ROI_WARNING = 10000, /**< Speed reduction due to uncoalesced memory accesses warning. */
} NppStatus;
3、nppiMalloc_8u_C3
创建内存
函数定义如下:
/* Allocates device memory for an image of nWidthPixels x nHeightPixels (used in
 * this article for 3-channel BGR data). On return, *pStepBytes holds the number
 * of bytes occupied by each row of the allocation. */
Npp8u *
nppiMalloc_8u_C3(int nWidthPixels, int nHeightPixels, int * pStepBytes);
int * pStepBytes
返回每行占用字节数。由于本文采用的是 $512 \times 512$ 的图像,所以这个值返回为 $512 \times 3 = 1536$。
如果每行的实际字节数(宽 × 3)不是512的整数倍,这个值会向上补齐为512的整数倍(这个512可能和显卡设备有关,笔者实验是1080卡)。
比如,输入图像的宽为400, pStepBytes = 1536;
比如,输入图像的宽为513, pStepBytes = 2048;
可用如下代码实验
#include <stdio.h>

#include "npp.h"
#include "npps_support_functions.h"
int main()
{
int nWidth = 513;
int nHeight = 400;
int nLineStep_npp = 0;
Npp8u *pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp);
printf("nLineStep_npp = %d \n", nLineStep_npp);
printf("hello world \n");
return 0;
}
4、NppStatus nppiYUVToBGR_8u_C3R(const Npp8u * pSrc, int nSrcStep, Npp8u * pDst, int nDstStep, NppiSize oSizeROI);
函数将YUV转换成BGR