因为caffe之类的代码很久不更新了,最高只支持到cudnn7.x,在安装了cudnn8的环境下编译caffe或video-caffe时,会在src/caffe/layers/cudnn_conv_layer.cpp等文件里出错:
error: identifier "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT" is undefined
error: identifier "cudnnGetConvolutionForwardAlgorithm" is undefined
这是因为cudnn8里已经没有cudnnGetConvolutionForwardAlgorithm()这个函数了,需要改用cudnnGetConvolutionForwardAlgorithm_v7();CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT这个枚举值(属于cudnnConvolutionFwdPreference_t)也一并被移除了。这些不兼容带来的麻烦很恼火,但是NVIDIA已声明cudnn8不再支持这些旧接口,caffe的代码也没人去更新了,所以不能指望NVIDIA或者Berkeley,只能自己琢磨去解决。
参考了网上的解决caffe的编译问题的方案后,实验解决了video-caffe的编译问题,对video-caffe的代码做如下修改:
1.修改cmake/Cuda.cmake,将里面的"cudnn.h"全部用"cudnn_version.h"代替(cudnn8把CUDNN_MAJOR等版本号宏从cudnn.h移到了cudnn_version.h,不改的话cmake检测不到cudnn的版本);
2.修改下面三个源码文件里的代码,增加针对性代码,当cudnn版本是8以上时,改成调用cudnnGetConvolutionForwardAlgorithm_v7(),否则仍保持原来的cudnnGetConvolutionForwardAlgorithm()调用不变:
1) video-caffe/src/caffe/layers/cudnn_ndconv_layer.cu:
...
// Forward pass of the cuDNN N-d convolution layer. For every bottom/top pair
// and every group it selects a forward algorithm, makes sure the workspace is
// large enough, runs cudnnConvolutionForward and, when bias_term_ is set,
// adds the bias. On cuDNN >= 8 the algorithm candidates come from
// cudnnGetConvolutionForwardAlgorithm_v7.
// NOTE(review): this listing appears to have lost the text between '<' and
// '>' in several places (template parameter list, vector element types and
// most of the algorithm-selection loop below) — consult the repository for
// the complete code before compiling.
template
void CudnnNdConvolutionLayer::Forward_gpu(
const vector*>& bottom, const vector*>& top) {
#if CUDNN_VERSION_MIN(8, 0, 0)
// Scratch variables used only by the cuDNN 8 algorithm query.
int RetCnt;
bool found_conv_algorithm;
size_t free_memory, total_memory;
// Receives up to 4 candidate forward algorithms from the _v7 query.
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
//cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];
//get memory sizes
cudaMemGetInfo(&free_memory, &total_memory);
#endif
for (int i = 0; i < bottom.size(); ++i) {
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* top_data = top[i]->mutable_gpu_data();
const Dtype* weight = this->blobs_[0]->gpu_data();
// Workspace limit = channels_ * sizeof(int) * product(kernel_shape_) + 1.
size_t workspace_limit_bytes = this->channels_*sizeof(int);
for (int j = 0; j < this->kernel_shape_.size(); ++j) {
workspace_limit_bytes *= kernel_shape_[j];
}
++workspace_limit_bytes;
// Forward through cuDNN in parallel over groups.
for (int g = 0; g < this->group_; g++) {
cudnnConvolutionFwdAlgo_t algo;
#if CUDNN_VERSION_MIN(8, 0, 0)
// choose forward algorithm for filter
// in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
// Ask cuDNN for at most 4 candidates; RetCnt receives how many were
// returned. Note the query uses handle_[0], not handle_[g].
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
4,
&RetCnt,
fwd_algo_pref_));
found_conv_algorithm = false;
// NOTE(review): the next line looks truncated in this listing — the loop
// over the returned candidates, the matching #else/#endif and the
// declaration of workspaceSizeInBytes_temp seem to be missing; only the
// tail of a "needed size > workspaceSizeInBytes" test survives.
for(int n=0;n workspaceSizeInBytes) {
workspaceSizeInBytes = workspaceSizeInBytes_temp;
// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspace_data_);
cudaError_t err = cudaMalloc(&(this->workspace_data_),
workspaceSizeInBytes);
if (err != cudaSuccess) {
// force zero memory path
// Allocation failed: fall back to IMPLICIT_GEMM with no workspace.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
workspace_data_ = NULL;
workspaceSizeInBytes = 0;
}
}
// Filters.
// Each group works on its own slice of bottom, weight and top, selected
// through the *_offset_ * g pointer arithmetic, and uses handle_[g].
CUDNN_CHECK(cudnnConvolutionForward(handle_[g],
cudnn::dataType::one,
bottom_descs_[i], bottom_data + bottom_offset_ * g,
filter_desc_, weight + weight_offset_ * g,
conv_descs_[i],
algo, workspace_data_, workspaceSizeInBytes,
cudnn::dataType::zero,
top_descs_[i], top_data + top_offset_ * g));
// Bias.
if (this->bias_term_) {
const Dtype* bias_data = this->blobs_[1]->gpu_data();
// cuDNN >= 5 uses cudnnAddTensor; older versions use cudnnAddTensor_v3.
#if CUDNN_VERSION_MIN(5, 0, 0)
CUDNN_CHECK(cudnnAddTensor(handle_[g],
cudnn::dataType::one,
bias_desc_, bias_data + bias_offset_ * g,
cudnn::dataType::one,
top_descs_[i], top_data + top_offset_ * g));
#else
CUDNN_CHECK(cudnnAddTensor_v3(handle_[g],
cudnn::dataType::one,
bias_desc_, bias_data + bias_offset_ * g,
cudnn::dataType::one,
top_descs_[i], top_data + top_offset_ * g));
#endif
}
}
// Synchronize the work across groups, each of which went into its own
// stream, by launching an empty kernel into the default (null) stream.
// NOLINT_NEXT_LINE(whitespace/operators)
sync_ndconv_groups<<<1, 1>>>();
}
}
...
2) src/caffe/layers/cudnn_conv_layer.cpp:
// Reshape for the cuDNN 2-D convolution layer. After the base-class Reshape
// it rebuilds the bottom/top tensor descriptors and the convolution
// descriptor for every bottom blob, selects algorithms (on cuDNN >= 8 via
// cudnnGetConvolutionForwardAlgorithm_v7), and grows the single workspace
// buffer that is split across group_ * CUDNN_STREAMS_PER_GROUP slots.
// NOTE(review): this listing appears to have lost the text between '<' and
// '>' in several places (template parameter list, vector element types, the
// reinterpret_cast target type and most of the algorithm-selection code) —
// consult the repository for the complete code before compiling.
template
void CuDNNConvolutionLayer::Reshape(
const vector*>& bottom, const vector*>& top) {
ConvolutionLayer::Reshape(bottom, top);
// NOTE(review): CHECK_LE(2, n) accepts any n >= 2, while the message says
// "must have 2 spatial axes"; the deconvolution layer uses CHECK_EQ — confirm
// which is intended.
CHECK_LE(2, this->num_spatial_axes_)
<< "CuDNNConvolution input must have 2 spatial axes "
<< "(e.g., height and width). "
<< "Use 'engine: CAFFE' for general ND convolution.";
bottom_offset_ = this->bottom_dim_ / this->group_;
top_offset_ = this->top_dim_ / this->group_;
// forced_3d (bool, so 0 or 1) shifts the height/width axis index by one.
const bool forced_3d = this->forced_3d_;
const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
const int* pad_data = this->pad_.cpu_data();
const int pad_h = pad_data[0];
const int pad_w = pad_data[1];
const int* stride_data = this->stride_.cpu_data();
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
// Scratch variables used only by the cuDNN 8 algorithm queries; each
// *_algo_pref_ array receives up to 4 candidates.
int RetCnt;
bool found_conv_algorithm;
size_t free_memory, total_memory;
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];
//get memory sizes
cudaMemGetInfo(&free_memory, &total_memory);
#else
// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory management
size_t workspace_limit_bytes = 8*1024*1024;
#endif
for (int i = 0; i < bottom.size(); i++) {
// Bottom/top descriptors describe one group's channels; the strides are
// those of the full blob (all channels).
cudnn::setTensor4dDesc(&bottom_descs_[i],
this->num_,
this->channels_ / this->group_, height, width,
this->channels_ * height * width,
height * width, width, 1);
cudnn::setTensor4dDesc(&top_descs_[i],
this->num_,
this->num_output_ / this->group_, height_out, width_out,
this->num_output_ * this->out_spatial_dim_,
this->out_spatial_dim_, width_out, 1);
cudnn::setConvolutionDesc(&conv_descs_[i], bottom_descs_[i],
filter_desc_, pad_h, pad_w,
stride_h, stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
// choose forward algorithm for filter
// in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
// Ask cuDNN for at most 4 candidates; RetCnt receives how many were returned.
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
4,
&RetCnt,
fwd_algo_pref_));
found_conv_algorithm = false;
// NOTE(review): the next line looks truncated in this listing — the rest of
// the candidate loop, the backward-algorithm selection, the #else/#endif,
// the end of the per-bottom loop and the computation of max_workspace /
// total_max_workspace seem to be missing; only the tail
// "group_ * CUDNN_STREAMS_PER_GROUP);" survives.
for(int n=0;ngroup_ * CUDNN_STREAMS_PER_GROUP);
// this is the total amount of storage needed over all groups + streams
if (total_max_workspace > workspaceSizeInBytes) {
DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
workspaceSizeInBytes = total_max_workspace;
// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspaceData);
cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
if (err != cudaSuccess) {
// force zero memory path
// Allocation failed: zero every recorded workspace size and switch every
// bottom to IMPLICIT_GEMM / ALGO_0.
for (int i = 0; i < bottom.size(); i++) {
workspace_fwd_sizes_[i] = 0;
workspace_bwd_filter_sizes_[i] = 0;
workspace_bwd_data_sizes_[i] = 0;
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
// NULL out all workspace pointers
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = NULL;
}
// NULL out underlying data
workspaceData = NULL;
workspaceSizeInBytes = 0;
}
// if we succeed in the allocation, set pointer aliases for workspaces
// NOTE(review): this loop is outside the error branch, so it also runs after
// a failed allocation and overwrites the NULLs set above with offsets from a
// NULL base — confirm this is intended.
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace;
}
}
// Tensor descriptor for bias.
if (this->bias_term_) {
cudnn::setTensor4dDesc(&bias_desc_,
1, this->num_output_ / this->group_, 1, 1);
}
}
3) src/caffe/layers/cudnn_deconv_layer.cpp:
// Reshape for the cuDNN 2-D deconvolution layer. After the base-class Reshape
// it rebuilds the bottom/top tensor descriptors and the convolution
// descriptor for every bottom blob, selects forward / backward-filter /
// backward-data algorithms and their workspace sizes, and grows the single
// workspace buffer that is split across group_ * CUDNN_STREAMS_PER_GROUP
// slots. Compared with the convolution layer the roles of top and bottom are
// swapped in the cuDNN calls (top is passed where the input goes).
// NOTE(review): this listing appears to have lost the text between '<' and
// '>' in several places (template parameter list, vector element types, the
// reinterpret_cast target type and most of the cuDNN 8 algorithm-selection
// code) — consult the repository for the complete code before compiling.
template
void CuDNNDeconvolutionLayer::Reshape(
const vector*>& bottom, const vector*>& top) {
DeconvolutionLayer::Reshape(bottom, top);
CHECK_EQ(2, this->num_spatial_axes_)
<< "CuDNNDeconvolutionLayer input must have 2 spatial axes "
<< "(e.g., height and width). "
<< "Use 'engine: CAFFE' for general ND convolution.";
bottom_offset_ = this->bottom_dim_ / this->group_;
top_offset_ = this->top_dim_ / this->group_;
// forced_3d (bool, so 0 or 1) shifts the height/width axis index by one.
const bool forced_3d = this->forced_3d_;
const int height = bottom[0]->shape(this->channel_axis_ + 1 + forced_3d);
const int width = bottom[0]->shape(this->channel_axis_ + 2 + forced_3d);
const int height_out = top[0]->shape(this->channel_axis_ + 1 + forced_3d);
const int width_out = top[0]->shape(this->channel_axis_ + 2 + forced_3d);
const int* pad_data = this->pad_.cpu_data();
const int pad_h = pad_data[0];
const int pad_w = pad_data[1];
const int* stride_data = this->stride_.cpu_data();
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
// Scratch variables used only by the cuDNN 8 algorithm queries; each
// *_algo_pref_ array receives up to 4 candidates.
int RetCnt;
bool found_conv_algorithm;
size_t free_memory, total_memory;
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];
//get memory sizes
cudaMemGetInfo(&free_memory, &total_memory);
#else
// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory management
size_t workspace_limit_bytes = 8*1024*1024;
#endif
for (int i = 0; i < bottom.size(); i++) {
// Bottom/top descriptors describe one group's channels; the strides are
// those of the full blob (all channels).
cudnn::setTensor4dDesc(&bottom_descs_[i],
this->num_,
this->channels_ / this->group_,
height,
width,
this->channels_ * height * width,
height * width,
width,
1);
cudnn::setTensor4dDesc(&top_descs_[i],
this->num_,
this->num_output_ / this->group_,
height_out,
width_out,
this->num_output_ * height_out * width_out,
height_out * width_out,
width_out,
1);
// The convolution descriptor is built against the top descriptor here
// (the convolution layer passes the bottom descriptor instead).
cudnn::setConvolutionDesc(&conv_descs_[i],
top_descs_[i],
filter_desc_,
pad_h,
pad_w,
stride_h,
stride_w);
#if CUDNN_VERSION_MIN(8, 0, 0)
// choose forward algorithm for filter
// in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
// Ask cuDNN for at most 4 candidates, with top as the input tensor and
// bottom as the output tensor; RetCnt receives how many were returned.
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
4,
&RetCnt,
fwd_algo_pref_));
found_conv_algorithm = false;
// NOTE(review): the next line looks truncated in this listing — the rest of
// the cuDNN 8 selection code and the #else seem to be missing. What follows
// uses workspace_limit_bytes, which is only declared in the #else branch
// above, so it is presumably the pre-cuDNN-8 path — confirm against the
// repository.
for(int n=0;n= workspace_limit_bytes) {
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} else {
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
}
}
// Workspace size required by the forward algorithm chosen above.
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
handle_[0],
top_descs_[i],
filter_desc_,
conv_descs_[i],
bottom_descs_[i],
fwd_algo_[i],
&(workspace_fwd_sizes_[i])));
// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_filter_algo_[i]));
// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
handle_[0],
top_descs_[i],
bottom_descs_[i],
conv_descs_[i],
filter_desc_,
bwd_filter_algo_[i],
&workspace_bwd_filter_sizes_[i]));
// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&bwd_data_algo_[i]));
// get workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
handle_[0],
filter_desc_,
bottom_descs_[i],
conv_descs_[i],
top_descs_[i],
bwd_data_algo_[i],
&workspace_bwd_data_sizes_[i]));
#endif
}
// reduce over all workspace sizes to get a maximum to allocate / reallocate
// (despite the "total_" names these hold the maximum over all bottoms)
size_t total_workspace_fwd = 0;
size_t total_workspace_bwd_data = 0;
size_t total_workspace_bwd_filter = 0;
for (size_t i = 0; i < bottom.size(); i++) {
total_workspace_fwd = std::max(total_workspace_fwd,
workspace_fwd_sizes_[i]);
total_workspace_bwd_data = std::max(total_workspace_bwd_data,
workspace_bwd_data_sizes_[i]);
total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
workspace_bwd_filter_sizes_[i]);
}
// get max over all operations
size_t max_workspace = std::max(total_workspace_fwd,
total_workspace_bwd_data);
max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
// ensure all groups have enough workspace
size_t total_max_workspace = max_workspace *
(this->group_ * CUDNN_STREAMS_PER_GROUP);
// this is the total amount of storage needed over all groups + streams
if (total_max_workspace > workspaceSizeInBytes) {
DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
workspaceSizeInBytes = total_max_workspace;
// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspaceData);
cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
if (err != cudaSuccess) {
// force zero memory path
// NOTE(review): the forward fallback here is FFT_TILING, while the
// convolution layer falls back to IMPLICIT_GEMM in the same situation;
// verify that FFT_TILING can run with a zero-byte workspace.
for (int i = 0; i < bottom.size(); i++) {
workspace_fwd_sizes_[i] = 0;
workspace_bwd_filter_sizes_[i] = 0;
workspace_bwd_data_sizes_[i] = 0;
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
// NULL out all workspace pointers
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = NULL;
}
// NULL out underlying data
workspaceData = NULL;
workspaceSizeInBytes = 0;
}
// if we succeed in the allocation, set pointer aliases for workspaces
// NOTE(review): this loop is outside the error branch, so it also runs after
// a failed allocation and overwrites the NULLs set above with offsets from a
// NULL base — confirm this is intended.
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace;
}
}
// Tensor descriptor for bias.
if (this->bias_term_) {
cudnn::setTensor4dDesc(
&bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
}
}
上面针对cudnn8的改动全部使用 "#if CUDNN_VERSION_MIN(8, 0, 0)"包含。
对于caffe使用cudnn8编译出错的问题就更简单了,只需像上面那样修改cmake/Cuda.cmake配置文件和修改src/caffe/layers/cudnn_conv_layer.cpp和src/caffe/layers/cudnn_deconv_layer.cpp的代码即可解决。
相关代码我已提交在我的github项目上: https://github.com/arnoldfychen/video-caffe 和https://github.com/arnoldfychen/caffe 详细说明参考README
如果同一环境下安装过多个版本的cudnn,编译前要检查一下确认cudnn8正确安装了(否则会导致cudnn找不到而被disable,因而涉及到c3d的卷积层NdConvolution都不会被编译!),例如,存在 /usr/include/cudnn_version.h,且/usr/lib/aarch64-linux-gnu/libcudnn.so是指向libcudnn.so.8.0.0:
/usr/lib/aarch64-linux-gnu/libcudnn.so -> /etc/alternatives/libcudnn_so
/etc/alternatives/libcudnn_so -> /usr/lib/aarch64-linux-gnu/libcudnn.so.8
/usr/lib/aarch64-linux-gnu/libcudnn.so.8 -> libcudnn.so.8.0.0
如果不存在,使用命令安装或者修复:
apt-get install --reinstall libcudnn8-dev
补充一个提示:如果你的caffe源码已经对cudnn_conv_layer.cpp和cudnn_deconv_layer.cpp做了上面的修改,编译时却仍然报类似下面的错误:
include/caffe/util/cudnn.hpp: In function ‘void libdnn::cudnn::setConvolutionDesc(cudnnConvolutionStruct**, cudnnTensorDescriptor_t, cudnnFilterDescriptor_t, int, int, int, int)’:
include/caffe/util/cudnn.hpp:109:70: error: too few arguments to function ‘cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t)’
pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
这说明你用的caffe的代码版本太老了,需要到 https://github.com/BVLC/caffe clone最新版本的代码,Makefile和Makefile.config可能也得相应换成最新的,然后根据项目情况修改一下。