NVIDIA® cuDNN is a GPU-accelerated library of primitives for deep neural networks. It provides highly tuned implementations of routines that arise frequently in DNN applications, such as forward and backward convolution, pooling, normalization, and activation layers.
In cuDNN, the forward convolution operation is performed by cudnnConvolutionForward(), whose prototype is:
cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward(
    cudnnHandle_t handle,
    const void *alpha,
    const cudnnTensorDescriptor_t xDesc,
    const void *x,
    const cudnnFilterDescriptor_t wDesc,
    const void *w,
    const cudnnConvolutionDescriptor_t convDesc,
    cudnnConvolutionFwdAlgo_t algo,
    void *workSpace,
    size_t workSpaceSizeInBytes,
    const void *beta,
    const cudnnTensorDescriptor_t yDesc,
    void *y);
Here, x is the device address of the input data, w the address of the convolution kernel (filter), and y the address of the output data. The corresponding xDesc, wDesc and yDesc are descriptors for these three tensors, recording information such as batch size, channels, height and width. alpha scales the convolution result x*w and beta scales the existing contents of the output y, according to

dstValue = alpha[0]*computedValue + beta[0]*priorDstValue

so with alpha = 1 and beta = 0 the output is simply overwritten with the convolution result. workSpace is a pointer to the GPU memory the convolution needs as scratch space, and workSpaceSizeInBytes is the size of that memory. algo specifies which algorithm is used to carry out the convolution. handle is the handle of the library context, which must be initialized with cudnnCreate() before the library can be used.

As an example, the rest of this section uses cudnnConvolutionForward() to apply a Laplacian filter to an image.
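Every cuDNN call returns a cudnnStatus_t. The code below ignores these return values for brevity; in practice a small checking helper is convenient. The following is only a sketch (the CHECK_CUDNN macro is not part of the original sample):

#include <cudnn.h>
#include <cstdio>
#include <cstdlib>

// Abort with a readable message if a cuDNN call fails.
#define CHECK_CUDNN(call)                                                   \
    do {                                                                    \
        cudnnStatus_t status_ = (call);                                     \
        if (status_ != CUDNN_STATUS_SUCCESS) {                              \
            std::fprintf(stderr, "cuDNN error: %s (%s:%d)\n",               \
                         cudnnGetErrorString(status_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)

// Example usage: CHECK_CUDNN(cudnnCreate(&handle));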
// image
auto image = imread("red.png");
auto image_float = Matrix32f(image);
//handle
cudnnHandle_t handle;
cudnnCreate(&handle);
// input
Tensor<float> input({ 1, image.channels(), image.rows, image.cols });
Memory::copy(image_float.count() * sizeof(float), input.gptr(), image_float.ptr());
cudnnTensorDescriptor_t input_descriptor;
cudnnCreateTensorDescriptor(&input_descriptor);
cudnnSetTensor4dDescriptor(input_descriptor,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
input.shape(0), input.shape(1), input.shape(2), input.shape(3));
input_descriptor holds the metadata of the input. CUDNN_TENSOR_NHWC describes how the data is laid out in memory: input is a four-dimensional tensor whose elements are stored in batch (N), height (H), width (W), channels (C) order, which matches the interleaved pixel layout of the loaded image. CUDNN_DATA_FLOAT is the data type used for the computation. Note that the sizes passed to cudnnSetTensor4dDescriptor are always given in N, C, H, W order; the format argument only determines the memory layout.
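As an aside, the same layout can be expressed with explicit strides via cudnnSetTensor4dDescriptorEx. The following sketch is illustrative only and is not used by the sample; it describes the same NHWC input tensor by giving each dimension's stride in elements:

// Illustrative only: the same NHWC input descriptor expressed with explicit strides.
// In NHWC storage, element (n, c, h, w) lives at offset n*H*W*C + h*W*C + w*C + c.
int n = input.shape(0), c = input.shape(1), h = input.shape(2), w = input.shape(3);
cudnnSetTensor4dDescriptorEx(input_descriptor,
                             CUDNN_DATA_FLOAT,
                             n, c, h, w,
                             h * w * c, // nStride
                             1,         // cStride: channels are interleaved
                             w * c,     // hStride
                             c);        // wStride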
// output
Tensor<float> output(input.shape());
vector_set_gpu(output.count(), 0.0f, output.gptr());
cudnnTensorDescriptor_t output_descriptor;
cudnnCreateTensorDescriptor(&output_descriptor);
cudnnSetTensor4dDescriptor(output_descriptor,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
output.shape(0), output.shape(1), output.shape(2), output.shape(3));
// kernel
Tensor<float> kernel({ output.shape(1), input.shape(1), 3, 3 });
auto kernel_size = kernel.count(2, 4);
float kernel_[kernel_size] = { 0, 1, 0, 1, -4, 1, 0, 1, 0 };
for(auto i = 0; i < kernel.count(0, 2); ++i) {
memcpy(kernel.cptr() + i * kernel_size, kernel_, kernel_size * sizeof(float));
}
cudnnFilterDescriptor_t kernel_descriptor;
cudnnCreateFilterDescriptor(&kernel_descriptor);
cudnnSetFilter4dDescriptor(kernel_descriptor,
CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW,
kernel.shape(0), kernel.shape(1), kernel.shape(2), kernel.shape(3));
// convolution descriptor
cudnnConvolutionDescriptor_t conv_descriptor;
cudnnCreateConvolutionDescriptor(&conv_descriptor);
cudnnSetConvolution2dDescriptor(conv_descriptor,
1, 1, // zero-padding
1, 1, // stride
1, 1, // dilation
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
// algorithm
cudnnConvolutionFwdAlgo_t algo;
cudnnGetConvolutionForwardAlgorithm(handle,
input_descriptor,
kernel_descriptor,
conv_descriptor,
output_descriptor,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, // memoryLimitInBytes
&algo);
Passing CUDNN_CONVOLUTION_FWD_PREFER_FASTEST tells cuDNN to prefer the fastest algorithm, and the memory limit of 0 means no restriction is placed on how much workspace it may use.
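cudnnGetConvolutionForwardAlgorithm only applies a heuristic (and was removed in cuDNN 8). If measured performance matters, cudnnFindConvolutionForwardAlgorithm benchmarks the candidate algorithms on the actual descriptors and returns them sorted by measured time. A sketch of how it could replace the call above (not part of the original sample):

// Sketch: pick the algorithm by benchmarking instead of by heuristic.
const int requested = 8;
int returned = 0;
cudnnConvolutionFwdAlgoPerf_t perf[requested];
cudnnFindConvolutionForwardAlgorithm(handle,
    input_descriptor, kernel_descriptor, conv_descriptor, output_descriptor,
    requested, &returned, perf);
algo = perf[0].algo;            // fastest measured algorithm
// perf[0].memory reports the workspace that algorithm needs.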
// workspace size and allocation
size_t workspace_size = 0;
cudnnGetConvolutionForwardWorkspaceSize(handle,
input_descriptor,
kernel_descriptor,
conv_descriptor,
output_descriptor,
algo,
&workspace_size);
void * workspace = nullptr;
cudaMalloc(&workspace, workspace_size);
// convolution
auto alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle,
&alpha, input_descriptor, input.gptr(),
kernel_descriptor, kernel.gptr(),
conv_descriptor, algo,
workspace, workspace_size,
&beta, output_descriptor, output.gptr());
// destroy
cudaFree(workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyConvolutionDescriptor(conv_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroy(handle);
Putting everything together, the complete program reads the image, performs the convolution, copies the result back to the host and displays it:

#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstring>
#include "alchemy.h"
using namespace std;
using namespace alchemy;
int main()
{
// image
auto image = imread("red.png");
auto image_float = Matrix32f(image);
//handle
cudnnHandle_t handle;
cudnnCreate(&handle);
// input
Tensor<float> input({ 1, image.channels(), image.rows, image.cols });
Memory::copy(image_float.count() * sizeof(float), input.gptr(), image_float.ptr());
cudnnTensorDescriptor_t input_descriptor;
cudnnCreateTensorDescriptor(&input_descriptor);
cudnnSetTensor4dDescriptor(input_descriptor,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
input.shape(0), input.shape(1), input.shape(2), input.shape(3));
// output
Tensor<float> output(input.shape());
vector_set_gpu(output.count(), 0.0f, output.gptr());
cudnnTensorDescriptor_t output_descriptor;
cudnnCreateTensorDescriptor(&output_descriptor);
cudnnSetTensor4dDescriptor(output_descriptor,
CUDNN_TENSOR_NHWC,
CUDNN_DATA_FLOAT,
output.shape(0), output.shape(1), output.shape(2), output.shape(3));
// kernel
Tensor<float> kernel({ output.shape(1), input.shape(1), 3, 3 });
auto kernel_size = kernel.count(2, 4);
float kernel_[kernel_size] = { 0, 1, 0, 1, -4, 1, 0, 1, 0 };
for(auto i = 0; i < kernel.count(0, 2); ++i) {
memcpy(kernel.cptr() + i * kernel_size, kernel_, kernel_size * sizeof(float));
}
cudnnFilterDescriptor_t kernel_descriptor;
cudnnCreateFilterDescriptor(&kernel_descriptor);
cudnnSetFilter4dDescriptor(kernel_descriptor,
CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW,
kernel.shape(0), kernel.shape(1), kernel.shape(2), kernel.shape(3));
// convolution descriptor
cudnnConvolutionDescriptor_t conv_descriptor;
cudnnCreateConvolutionDescriptor(&conv_descriptor);
cudnnSetConvolution2dDescriptor(conv_descriptor,
1, 1, // zero-padding
1, 1, // stride
1, 1, // dilation
CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
// algorithm
cudnnConvolutionFwdAlgo_t algo;
cudnnGetConvolutionForwardAlgorithm(handle,
input_descriptor,
kernel_descriptor,
conv_descriptor,
output_descriptor,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0, // memoryLimitInBytes
&algo);
// workspace size and allocation
size_t workspace_size = 0;
cudnnGetConvolutionForwardWorkspaceSize(handle,
input_descriptor,
kernel_descriptor,
conv_descriptor,
output_descriptor,
algo,
&workspace_size);
void * workspace = nullptr;
cudaMalloc(&workspace, workspace_size);
// convolution
auto alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle,
&alpha, input_descriptor, input.gptr(),
kernel_descriptor, kernel.gptr(),
conv_descriptor, algo,
workspace, workspace_size,
&beta, output_descriptor, output.gptr());
// copy the result back to the host
Matrix32f output_image(image.shape());
cudaMemcpy(output_image.ptr(), output.gptr(), image.count() * sizeof(float), cudaMemcpyDeviceToHost);
// destroy
cudaFree(workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyConvolutionDescriptor(conv_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroy(handle);
// show
imshow("original", image);
imshow("output", Matrix(output_image/3.0));
waitKey(0);
return 0;
}