GPU architectures have changed quite a bit in recent years; other bloggers have already covered this in detail, so it is not repeated here. For the meaning of CUDA-specific terms, see 《详解CUDA的Context、Stream、Warp、SM、SP、Kernel、Block、Grid》.
For the common NppStatus return codes, see the NPP documentation.
The current module mainly covers addition, subtraction, multiplication, division, abs, square, matrix multiplication, square root, ln, exp, and so on. For functions that share the same calling pattern, one of them is used as the representative example.
AddC adds a constant value to every pixel of an image. MulC, SubC, DivC, AbsDiffC and MulScale follow the same pattern.
The general shape of the interface:
// The two forms differ in that the one with "I" operates in place on the original image buffer, so no extra output copy is needed.
// "Sfs" means the result is scaled by 2^(-nScaleFactor) and then saturated to the valid range of the data type.
NppStatus nppiAddC_[data type]_C[channel count]RSfs[_Ctx when an application-managed stream is used]
NppStatus nppiAddC_[data type]_C[channel count]IRSfs[_Ctx when an application-managed stream is used]
Taking a three-channel uint8_t image as an example:
NppStatus nppiAddC_8u_C3RSfs(const Npp8u * pSrc1,
int nSrc1Step,
const Npp8u aConstants[3],
Npp8u *pDst,
int nDstStep,
NppiSize oSizeROI,
int nScaleFactor);
NppStatus nppiAddC_8u_C3IRSfs(const Npp8u aConstants[3],
Npp8u *pSrcDst,
int nSrcDstStep,
NppiSize oSizeROI,
int nScaleFactor);
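To make the Sfs rule concrete, here is a minimal host-side sketch of the per-pixel computation. This is an illustration only (the helper name addc_sfs_reference is made up here), and the exact rounding NPP applies when nScaleFactor > 0 may differ slightly.
#include <algorithm>
#include <cstdint>
// Host-side reference for the Sfs rule on one 8-bit pixel:
// scale the intermediate sum by 2^(-nScaleFactor), then saturate to [0, 255].
inline uint8_t addc_sfs_reference(uint8_t src, uint8_t constant, int nScaleFactor) {
    int scaled = (static_cast<int>(src) + constant) >> nScaleFactor;
    return static_cast<uint8_t>(std::min(255, std::max(0, scaled)));
}
// addc_sfs_reference(200, 100, 0) == 255  (saturated)
// addc_sfs_reference(200, 100, 1) == 150  (sum 300 halved because nScaleFactor = 1)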
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>
#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }
int main() {
std::string directory = "../";
// =============== load image ===============
cv::Mat image = cv::imread(directory + "dog.png");
if (image.empty()) {
std::cout << "Load image error!" << std::endl;
return -1;
}
int image_width = image.cols;
int image_height = image.rows;
int image_size = image_width * image_height * 3 * sizeof(uint8_t);
std::cout << "Image info : image_width = " << image_width
<< ", image_height = " << image_height << std::endl;
// =============== malloc && cpy ===============
uint8_t *in_ptr, *in_ptr2, *out_ptr, *roi_out_ptr;
cudaMalloc((void**)&in_ptr, image_size);
cudaMalloc((void**)&in_ptr2, image_size);
cudaMalloc((void**)&out_ptr, image_size);
cudaMalloc((void**)&roi_out_ptr, image_size);
cudaMemcpy(in_ptr, image.data, image_size, cudaMemcpyHostToDevice);
cudaMemcpy(in_ptr2, image.data, image_size, cudaMemcpyHostToDevice);
uint8_t host_constant[3] = { (uint8_t)0, (uint8_t)20, (uint8_t)0 };
NppiSize roi1, roi2;
roi1.width = image_width;
roi1.height = image_height;
roi2.width = image_width / 2;
roi2.height = image_height / 2;
// nppiAddC_8u_C3RSfs
cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3);
NppStatus status;
status = nppiAddC_8u_C3RSfs(in_ptr, image_width * 3, host_constant, out_ptr,
image_width * 3, roi1, 0);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddC_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add_constant.jpg", out_image);
status = nppiAddC_8u_C3RSfs(in_ptr, image_width * 3, host_constant, out_ptr, image_width * 3,
roi1, 1);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddC_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add_constant_scale.jpg", out_image);
status = nppiAddC_8u_C3RSfs(in_ptr, image_width * 3, host_constant, out_ptr, image_width * 3,
roi2, 0);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddC_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add_constant_roi.jpg", out_image);
// free
CUDA_FREE(in_ptr)
CUDA_FREE(in_ptr2)
CUDA_FREE(out_ptr)
CUDA_FREE(roi_out_ptr)
}
cmake_minimum_required(VERSION 3.20)
project(test)
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
file(GLOB CUDA_LIBS "/usr/local/cuda/lib64/*.so")
add_executable(test test.cpp)
target_link_libraries(test
${OpenCV_LIBS}
${CUDA_LIBS}
)
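The in-place variant nppiAddC_8u_C3IRSfs writes the result back into the source buffer, so no separate destination is needed. A minimal sketch, reusing in_ptr, host_constant, roi1 and image_width from the example above (called before the buffers are freed); the variable name ip_status is just for illustration:
// Adds host_constant to every pixel of in_ptr, writing the result back into in_ptr.
NppStatus ip_status = nppiAddC_8u_C3IRSfs(host_constant, in_ptr, image_width * 3, roi1, 0);
if (ip_status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiAddC_8u_C3IRSfs failed, status = " << ip_status << std::endl;
}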
Unlike AddC, Add takes two images as input. The same applies to Mul, MulScale, Sub, Div, Div_Round, Abs, AbsDiff, Sqr, Sqrt, Ln and Exp. (Abs, AbsDiff, Sqr, Sqrt, Ln and Exp are not used that often in image processing, so they are not described in detail here.)
Taking a three-channel uint8_t image as an example:
// The naming rules are the same as for nppiAddC*
NppStatus nppiAdd_8u_C3RSfs(const Npp8u * pSrc1,
int nSrc1Step,
const Npp8u *pSrc2,
int nSrc2Step,
Npp8u * pDst,
int nDstStep,
NppiSize oSizeROI,
int nScaleFactor);
NppStatus nppiAdd_8u_C3IRSfs(const Npp8u *pSrc,
int nSrcStep,
Npp8u *pSrcDst,
int nSrcDstStep,
NppiSize oSizeROI,
int nScaleFactor);
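The in-place form accumulates pSrc into pSrcDst. A minimal sketch, reusing the in_ptr/in_ptr2 buffers, roi1 and image_width that the full example below sets up; with nScaleFactor = 1 the result is the per-pixel average of the two images (see the Sfs note above). The variable name ip_status is just for illustration:
// in_ptr2 = saturate((in_ptr2 + in_ptr) >> 1), i.e. the average of the two inputs.
NppStatus ip_status = nppiAdd_8u_C3IRSfs(in_ptr, image_width * 3, in_ptr2, image_width * 3, roi1, 1);
if (ip_status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiAdd_8u_C3IRSfs failed, status = " << ip_status << std::endl;
}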
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>
#define PRINT_VALUE(value) { \
std::cout << "[GPU] " << #value << " = " << value << std::endl; }
#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }
int main() {
std::string directory = "../";
// =============== load image ===============
cv::Mat image = cv::imread(directory + "dog.png");
if (image.empty()) {
std::cout << "Load image error!" << std::endl;
return -1;
}
int image_width = image.cols;
int image_height = image.rows;
int image_size = image_width * image_height * 3 * sizeof(uint8_t);
std::cout << "Image info : image_width = " << image_width
<< ", image_height = " << image_height << std::endl;
// =============== malloc && cpy ===============
uint8_t *in_ptr, *in_ptr2, *out_ptr, *roi_out_ptr;
cudaMalloc((void**)&in_ptr, image_size);
cudaMalloc((void**)&in_ptr2, image_size);
cudaMalloc((void**)&out_ptr, image_size);
cudaMalloc((void**)&roi_out_ptr, image_size);
cudaMemcpy(in_ptr, image.data, image_size, cudaMemcpyHostToDevice);
cudaMemcpy(in_ptr2, image.data, image_size, cudaMemcpyHostToDevice);
NppiSize roi1, roi2;
roi1.width = image_width;
roi1.height = image_height;
roi2.width = image_width / 2;
roi2.height = image_height / 2;
// nppiAdd_8u_C3RSfs
cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_8UC3);
NppStatus status;
status = nppiAdd_8u_C3RSfs(in_ptr, image_width * 3, in_ptr2, image_width * 3, out_ptr,
image_width * 3, roi1, 0);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAdd_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add.jpg", out_image);
status = nppiAdd_8u_C3RSfs(in_ptr, image_width * 3, in_ptr2, image_width * 3, out_ptr,
image_width * 3, roi1, 1);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAdd_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add_scale.jpg", out_image);
status = nppiAdd_8u_C3RSfs(in_ptr, image_width * 3, in_ptr2, image_width * 3, out_ptr,
image_width * 3, roi2, 0);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAdd_8u_C3RSfs failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size, cudaMemcpyDeviceToHost);
cv::imwrite(directory + "add_roi.jpg", out_image);
// free
CUDA_FREE(in_ptr)
CUDA_FREE(in_ptr2)
CUDA_FREE(out_ptr)
CUDA_FREE(roi_out_ptr)
}
The CMakeLists.txt is the same as in the AddC example above.
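The stream variants mentioned in the naming pattern carry a _Ctx suffix and take an NppStreamContext as the last argument. A minimal sketch of nppiAdd_8u_C3RSfs_Ctx, reusing the buffers and roi1 from the example above; it assumes an NPP version new enough to ship the _Ctx entry points and nppGetStreamContext, so check your npp.h. The variable names are just for illustration:
cudaStream_t stream;
cudaStreamCreate(&stream);
NppStreamContext npp_ctx;
nppGetStreamContext(&npp_ctx);   // fills in the properties of the current device
npp_ctx.hStream = stream;        // run the NPP kernel on our own stream
NppStatus ctx_status = nppiAdd_8u_C3RSfs_Ctx(in_ptr, image_width * 3, in_ptr2, image_width * 3,
                                             out_ptr, image_width * 3, roi1, 0, npp_ctx);
cudaStreamSynchronize(stream);   // wait for the result before copying it back
cudaStreamDestroy(stream);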
AddWeighted blends a source image into a destination/accumulator image with weight nAlpha, i.e. pSrcDst = nAlpha * pSrc + (1 - nAlpha) * pSrcDst; the masked variant (IMR) only updates pixels whose mask value is non-zero.
NppStatus nppiAddWeighted_8u32f_C1IR(const Npp8u *pSrc,
int nSrcStep,
Npp32f * pSrcDst,
int nSrcDstStep,
NppiSize oSizeROI,
Npp32f nAlpha);
NppStatus nppiAddWeighted_8u32f_C1IMR(const Npp8u *pSrc,
int nSrcStep,
const Npp8u *pMask,
int nMaskStep,
Npp32f * pSrcDst,
int nSrcDstStep,
NppiSize oSizeROI,
Npp32f nAlpha);
#include <iostream>
#include <string>
#include <cuda_runtime.h>
#include <npp.h>
#include <opencv2/opencv.hpp>
#define PRINT_VALUE(value) { \
std::cout << "[GPU] " << #value << " = " << value << std::endl; }
#define CUDA_FREE(ptr) { if (ptr != nullptr) { cudaFree(ptr); ptr = nullptr; } }
int main() {
std::string directory = "../";
// =============== load image ===============
cv::Mat image = cv::imread(directory + "dog.png");
if (image.empty()) {
std::cout << "Load image error!" << std::endl;
return -1;
}
cv::Mat gray;
cv::cvtColor(image, gray, cv::COLOR_BGR2GRAY);
cv::imwrite(directory + "gray.jpg", gray);
int image_width = gray.cols;
int image_height = gray.rows;
int image_size = image_width * image_height;
std::cout << "Image info : image_width = " << image_width
<< ", image_height = " << image_height << std::endl;
cv::Mat mat_mask = cv::Mat::ones(image_height, image_width, CV_8UC1);
cv::Rect rc_center = cv::Rect(image_width / 4, image_height / 4,
image_width / 2, image_height / 2);
mat_mask(rc_center) = cv::Mat::ones(image_height / 2, image_width / 2, CV_8UC1) * 255;
cv::imwrite(directory + "mask.jpg", mat_mask);
// =============== malloc && cpy ===============
uint8_t *in_ptr, *mask;
cudaMalloc((void**)&in_ptr, image_size * sizeof(uint8_t));
cudaMalloc((void**)&mask, image_size * sizeof(uint8_t));
cudaMemcpy(in_ptr, gray.data, image_size, cudaMemcpyHostToDevice);
cudaMemcpy(mask, mat_mask.data, image_size, cudaMemcpyHostToDevice);
float *out_ptr, *out_ptr1, *out_ptr2;
cudaMalloc((void**)&out_ptr, image_size * sizeof(float));
cudaMalloc((void**)&out_ptr1, image_size * sizeof(float));
cudaMalloc((void**)&out_ptr2, image_size * sizeof(float));
// The in-place AddWeighted calls read pSrcDst as well as write it,
// so the float accumulators need defined contents before the first call.
cudaMemset(out_ptr, 0, image_size * sizeof(float));
cudaMemset(out_ptr1, 0, image_size * sizeof(float));
cudaMemset(out_ptr2, 0, image_size * sizeof(float));
NppiSize roi1, roi2;
roi1.width = image_width;
roi1.height = image_height;
roi2.width = image_width / 2;
roi2.height = image_height / 2;
// nppiAddWeighted_8u32f_C1IMR
cv::Mat out_image = cv::Mat::zeros(image_height, image_width, CV_32FC1);
NppStatus status;
status = nppiAddWeighted_8u32f_C1IMR(in_ptr, image_width * sizeof(uint8_t),
mask, image_width * sizeof(uint8_t),
out_ptr, image_width * sizeof(float),
roi1, 1.0);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddWeighted_8u32f_C1IMR failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr, image_size * sizeof(float), cudaMemcpyDeviceToHost);
cv::Mat out_8u;
out_image.convertTo(out_8u, CV_8U);  // imwrite to JPEG needs 8-bit data
cv::imwrite(directory + "addweight.jpg", out_8u);
status = nppiAddWeighted_8u32f_C1IMR(in_ptr, image_width * sizeof(uint8_t),
mask, image_width * sizeof(uint8_t),
out_ptr1, image_width * sizeof(float),
roi1, 0.5);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddWeighted_8u32f_C1IMR failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr1, image_size * sizeof(float), cudaMemcpyDeviceToHost);
out_image.convertTo(out_8u, CV_8U);
cv::imwrite(directory + "addweight_scale.jpg", out_8u);
status = nppiAddWeighted_8u32f_C1IMR(in_ptr, image_width * sizeof(uint8_t),
mask, image_width * sizeof(uint8_t),
out_ptr2, image_width * sizeof(float),
roi2, 0.5);
if (status != NPP_SUCCESS) {
std::cout << "[GPU] ERROR nppiAddWeighted_8u32f_C1IMR failed, status = " << status << std::endl;
return -1;
}
cudaMemcpy(out_image.data, out_ptr2, image_size * sizeof(float), cudaMemcpyDeviceToHost);
out_image.convertTo(out_8u, CV_8U);
cv::imwrite(directory + "addweight_roi_scale.jpg", out_8u);
// free
CUDA_FREE(in_ptr)
CUDA_FREE(mask)
CUDA_FREE(out_ptr)
CUDA_FREE(out_ptr1)
CUDA_FREE(out_ptr2)
}
The CMakeLists.txt is again the same as in the AddC example.
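The example above only exercises the masked variant. The non-masked nppiAddWeighted_8u32f_C1IR updates every pixel inside the ROI. A minimal sketch, reusing in_ptr, out_ptr, roi1 and image_width from the example above (before the buffers are freed); the variable name nw_status is just for illustration:
// out_ptr = 0.5 * in_ptr + 0.5 * out_ptr over the whole image.
NppStatus nw_status = nppiAddWeighted_8u32f_C1IR(in_ptr, image_width * sizeof(uint8_t),
                                                 out_ptr, image_width * sizeof(float),
                                                 roi1, 0.5f);
if (nw_status != NPP_SUCCESS) {
    std::cout << "[GPU] ERROR nppiAddWeighted_8u32f_C1IR failed, status = " << nw_status << std::endl;
}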
Notes:
1. nAlpha is the weight applied to every source pixel; the mask only controls which destination positions actually get updated.
2. The ROI constrains the region of the input that is processed.
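On note 1: NPP masked functions only touch destination pixels whose mask value is non-zero, so if the blend should be restricted to the center rectangle, the mask must be zero everywhere else. A minimal sketch, reusing rc_center, mask, in_ptr, out_ptr, roi1 and the image dimensions from the example above; the names center_mask and m_status are just for illustration:
// Zero background, 255 inside the center rectangle: only that region gets blended.
cv::Mat center_mask = cv::Mat::zeros(image_height, image_width, CV_8UC1);
center_mask(rc_center).setTo(cv::Scalar(255));
cudaMemcpy(mask, center_mask.data, image_size, cudaMemcpyHostToDevice);
NppStatus m_status = nppiAddWeighted_8u32f_C1IMR(in_ptr, image_width * sizeof(uint8_t),
                                                 mask, image_width * sizeof(uint8_t),
                                                 out_ptr, image_width * sizeof(float),
                                                 roi1, 0.5f);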