CUDA - cuDNN library-cudnnConvolutionForward 使用

#include 
#include 
#include 
#include 
#include "utils.cuh"

using algo_perf_t = cudnnConvolutionFwdAlgoPerf_t;

// if exit, algo_arr[0] will be best candidate
bool get_valid_best_algo(std::vector& algo_arr) {
    auto it = std::remove_if(algo_arr.begin(), algo_arr.end(), [](algo_perf_t algo_perf){
        return algo_perf.status != CUDNN_STATUS_SUCCESS;
    });
    algo_arr.erase(it, algo_arr.end());
    if(algo_arr.size() == 0) {
        std::runtime_error("Found no valid conv algorithm!");
    } 
    std::sort(algo_arr.begin(), algo_arr.end(), [](algo_perf_t algo1, algo_perf_t algo2){
        return algo1.time < algo2.time;
    });
    return algo_arr.size()>0;
}

void cudnn_conv2d(const Tensor& x_gpu, const Tensor& w_gpu, const Conv2dParam& conv_param, Tensor& y_gpu) {
    cudnnHandle_t h_handle;
    CHECK_CUDNN(cudnnCreate(&h_handle));

    cudnnTensorDescriptor_t x_desc, y_desc;
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&x_desc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&y_desc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(
        x_desc,
        CUDNN_TENSOR_NHWC,
        CUDNN_DATA_FLOAT,
        x_gpu.n,
        x_gpu.c,
        x_gpu.h,
        x_gpu.w
    ));

    // kernel
    cudnnFilterDescriptor_t w_desc;
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&w_desc));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(
        w_desc,
        CUDNN_DATA_FLOAT,
        CUDNN_TENSOR_NHWC,
        w_gpu.n,
        w_gpu.c,
        w_gpu.h,
        w_gpu.w
    ));

    // conv
    cudnnConvolutionDescriptor_t conv_desc;
    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(
        conv_desc,
        conv_param.pad_h,
        conv_param.pad_w,
        conv_param.u,
        conv_param.v,
        conv_param.dilation_h,
        conv_param.dilation_w,
        CUDNN_CROSS_CORRELATION,
        CUDNN_DATA_FLOAT
    ));

    // output
    CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
        conv_desc, 
        x_desc, 
        w_desc, 
        &y_gpu.n, 
        &y_gpu.c, 
        &y_gpu.h, 
        &y_gpu.w
    ));
    y_gpu.alloc_gpu();
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(
        y_desc,
        CUDNN_TENSOR_NHWC,
        CUDNN_DATA_FLOAT,
        y_gpu.n,
        y_gpu.c,
        y_gpu.h,
        y_gpu.w
    ));

    // conv algorithm
    std::vector algo_perf_arr;
    int request_cnt = 0;
    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(h_handle, &request_cnt));
    algo_perf_arr.resize(request_cnt);
    int algo_count = 0;

    CHECK_CUDNN(cudnnSetConvolutionMathType(conv_desc, CUDNN_FMA_MATH));

     // cudnnGetConvolutionForwardAlgorithm_v7
    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm_v7(
        h_handle, 
        x_desc, 
        w_desc, 
        conv_desc, 
        y_desc,
        request_cnt,
        &algo_count,
        algo_perf_arr.data()
        ));

    if(!get_valid_best_algo(algo_perf_arr)) {
        std::runtime_error("Found no valid conv algorithm!");
    }
    cudnnConvolutionFwdAlgo_t best_algo = algo_perf_arr[0].algo;

    size_t ws = 0;
    CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(
        h_handle, 
        x_desc, 
        w_desc, 
        conv_desc, 
        y_desc, 
        best_algo,
        &ws));
    void* workspace = nullptr;
    if(ws > 0) {
        CHECK_CUDA(cudaMalloc(&workspace, ws));
    }

    // Forward
    float alpha = 1.0f;
    float beta = 1.0f;
    float* out_ptr = y_gpu.get_ptr();
    CHECK_CUDNN(cudnnConvolutionForward(
        h_handle,
        &alpha,
        x_desc,
        x_gpu.get_ptr(),
        w_desc,
        w_gpu.get_ptr(),
        conv_desc,
        best_algo,
        workspace,
        ws,
        &beta,
        y_desc,
        out_ptr
    ));

    cudaDeviceSynchronize();
    cudnnDestroyTensorDescriptor(x_desc);
    cudnnDestroyTensorDescriptor(y_desc);
    cudnnDestroyFilterDescriptor(w_desc);
    cudnnDestroyConvolutionDescriptor(conv_desc);
    cudnnDestroy(h_handle);
}


int main() {

    cv::Mat src = cv::imread("./image/lena.jpg");
    cv::Mat src_fp;
    src.convertTo(src_fp, CV_32FC3);

    // Input
    Tensor tensor_x, tensor_w, tensor_y;
    tensor_x.alloc_gpu(1, 3, src.rows, src.cols);
    float* dev_ptr = tensor_x.get_ptr();
    CHECK_CUDA(cudaMemcpy(dev_ptr, src_fp.data, tensor_x.size_byte, cudaMemcpyHostToDevice));

    // kernel & conv
    Conv2dParam param;
    param.pad_h = param.pad_w = 1;
    param.dilation_h = param.dilation_w = 1;
    param.u = param.v = 1;
    make_kernel(tensor_w);

    cudnn_conv2d(tensor_x, tensor_w, param, tensor_y);

    cv::Mat dst_fp(cv::Size2d(tensor_y.w, tensor_y.h), CV_32FC(tensor_y.c));
    cv::Mat dst;
    CHECK_CUDA(cudaMemcpy(dst_fp.data, tensor_y.get_ptr(), tensor_y.size_byte, cudaMemcpyDeviceToHost));
    dst_fp.convertTo(dst, CV_8UC(tensor_y.c));

    cv::imwrite("dst.png", dst);

    // cv::imshow("src", src);
    // cv::imshow("dst", dst);
    // cv::waitKey(0);
}


#include 
#include 
#include 
#include 
#include 


// https://www.codenong.com/6683721/
// https://qa.1r1g.com/sf/ask/467860501/

#define CHECK_CUDA(err) \
    if (err!=cudaSuccess) { \
        std::runtime_error(cudaGetErrorString(err)); \
    }

#define CHECK_CUDNN(s) \
    if (s!=CUDNN_STATUS_SUCCESS) { \
        std::runtime_error(cudnnGetErrorString(s)); \
    }


struct Tensor {
public:
    int n, c, h, w;
    bool is_gpu;
    int size_byte;
public:
    float* ptr;
    bool allocated;

public:
    Tensor() {
        n = c = h = w = size_byte = 0;
        ptr = nullptr;
        is_gpu = true;
        allocated = false;
    }

    ~Tensor() {
        if(is_gpu && allocated && ptr!=nullptr) {
            CHECK_CUDA(cudaFree(ptr));
        }
    }

    void alloc_gpu(int n, int c, int h, int w) {
        this->n = n;
        this->c = c;
        this->h = h;
        this->w = w;
        this->is_gpu = true;
        assert(n>0&&c>0&&h>0&&w>0);
        size_byte = n*c*h*w*sizeof(float);
        is_gpu = true;
        alloc_gpu();
    }

    void alloc_gpu() {
        assert(n>0&&c>0&&h>0&&w>0);
        if(size_byte!=0) {
            assert(size_byte == n*c*h*w*sizeof(float));
        }else {
            size_byte = n*c*h*w*sizeof(float);
        }
        CHECK_CUDA(cudaMalloc((void**)&ptr, size_byte));
        is_gpu = true;
        allocated = true;
    }

    float* get_ptr() const {
        return ptr;
    }

    float* get_ptr() {
        return ptr;
    }

    void set_ptr(float* ptr) {
        this->ptr = ptr;
        this->allocated = false;
    }
};

struct Conv2dParam {
    int pad_h, pad_w;
    int dilation_h, dilation_w;
    int u,v;
};

void make_kernel(Tensor& kernel) {
    // const float kernel_template[3][3] = {
    //     {1, 1, 1},
    //     {1, -8, 1},
    //     {1, 1, 1}
    // };

    float kernel_data[9] = {
        1,1,1,1,-8,1,1,1,1
    };
    kernel.alloc_gpu(3, 3, 3, 3);
    float* ptr = kernel.get_ptr();
    for (int n=0;n
project(TEST_CONV2D)
cmake_minimum_required(VERSION 3.10)

enable_language(CXX CUDA)

# set(OpenCV_DIR "/home/wei/ubuntu/Libs/opencv-4.5.1/INSTALL")
set(CMAKE_PREFIX_PATH "//home/penghuiwei/MyWorkspace/Workspace/DockerEnv/Softwares/opencv/INSTALL")
find_package(OpenCV REQUIRED)

# cudnn
set(CUDNN_INCLUDE_DIRS "/usr/include")
set(CUDNN_LIBS "/usr/lib/x86_64-linux-gnu/libcudnn.so")

# cuda
set(CUDA_INCLUDE_DIRS "/usr/local/cuda/include")
set(CUDA_LIBS "/usr/local/cuda/lib64")

add_executable(main "test.cu")
target_include_directories(main PRIVATE ${OpenCV_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})

输入图像:


image.png

输出结果:


image.png

你可能感兴趣的:(CUDA - cuDNN library-cudnnConvolutionForward 使用)