Conv2DOp in TensorFlow

2D convolution in TensorFlow relies mainly on external libraries such as cuDNN, cuBLAS, ROCm (MIOpen), and hfp/libxsmm; only DeepConv2D is implemented directly in the TensorFlow source tree.

Conv2DOp

The Conv2DOp kernel inherits from BinaryOp.

InitConv2DParameters reads the attributes from OpKernelConstruction into Conv2DParameters and validates them.
CudnnUseAutotune indicates whether autotuning is enabled.

template <typename Device, typename T>
class Conv2DOp : public BinaryOp<T> {
 public:
  explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));

    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    cudnn_use_autotune_ = CudnnUseAutotune();
  }

Conv2DOp::Compute

Conv2DOp::Compute first tries LaunchXsmmConvOp and LaunchDeepConvOp, and otherwise falls back to LaunchConv2DOp.

ComputeConv2DDimension validates the inputs and fills in the dimension information for the 2D convolution.
ShapeFromFormat builds the output shape for the given data format.
OpKernelContext::allocate_output allocates the output tensor with that shape.
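As a quick illustration of ShapeFromFormat, a standalone sketch (not the TensorFlow implementation, which lives in tensor_format.h): the shape is simply the four dimensions ordered according to the data format.

#include <array>
#include <cstdint>
#include <iostream>

enum class Format { NHWC, NCHW };

// Order (batch, rows, cols, depth) according to the data format.
std::array<int64_t, 4> ShapeFromFormatSketch(Format format, int64_t n,
                                             int64_t h, int64_t w, int64_t c) {
  if (format == Format::NCHW) return {n, c, h, w};
  return {n, h, w, c};  // NHWC
}

int main() {
  auto shape = ShapeFromFormatSketch(Format::NCHW, 32, 28, 28, 64);
  for (int64_t d : shape) std::cout << d << ' ';  // prints: 32 64 28 28
  std::cout << '\n';
}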

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth]
    const Tensor& filter = context->input(1);

    Conv2DDimensions dimensions;
    OP_REQUIRES_OK(context,
                   ComputeConv2DDimension(params_, input, filter, &dimensions));

    TensorShape out_shape = ShapeFromFormat(
        params_.data_format, dimensions.batch, dimensions.out_rows,
        dimensions.out_cols, dimensions.out_depth);

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "Conv2D: in_depth = " << dimensions.in_depth
            << ", patch_depth = " << dimensions.patch_depth
            << ", input_cols = " << dimensions.input_cols
            << ", filter_cols = " << dimensions.filter_cols
            << ", input_rows = " << dimensions.input_rows
            << ", filter_rows = " << dimensions.filter_rows
            << ", stride_rows = " << dimensions.stride_rows
            << ", stride_cols = " << dimensions.stride_cols
            << ", dilation_rows = " << dimensions.dilation_rows
            << ", dilation_cols = " << dimensions.dilation_cols
            << ", out_depth = " << dimensions.out_depth;

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

If the hfp/libxsmm library is enabled and the padding mode is not EXPLICIT, LaunchXsmmConvOp::Run is tried first.
It only takes effect in CPU builds; the GPU variant returns false.

#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
    if (params_.padding != EXPLICIT &&
        LaunchXsmmConvOp<Device, T>::Run(
            context, input, filter, dimensions.batch, dimensions.input_rows,
            dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
            dimensions.filter_cols, dimensions.pad_rows_before,
            dimensions.pad_cols_before, dimensions.out_rows,
            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
            dimensions.dilation_cols, dimensions.stride_rows,
            dimensions.stride_cols, output, params_.data_format)) {
      return;
    }
#endif

For non-explicit padding, LaunchDeepConvOp::Run is tried next. The default LaunchDeepConvOp returns false; only the CPU float specialization is implemented.

    if (params_.padding != EXPLICIT &&
        LaunchDeepConvOp<Device, T>::Run(
            context, input, filter, dimensions.batch, dimensions.input_rows,
            dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
            dimensions.filter_cols, dimensions.pad_rows_before,
            dimensions.pad_cols_before, dimensions.out_rows,
            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
            dimensions.dilation_cols, dimensions.stride_rows,
            dimensions.stride_cols, output, params_.data_format)) {
      return;
    }
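This dispatch relies on template specialization: the primary template's Run simply returns false so that Compute falls through to the next launcher, and only the CPU float specialization does real work. A minimal standalone sketch of the pattern (illustrative names, not the actual deep_conv2d code):

#include <iostream>

struct CPUDevice {};
struct GPUDevice {};

// Primary template: no specialized implementation, report "not handled".
template <typename Device, typename T>
struct LaunchDeepConvOpSketch {
  static bool Run(/* context, tensors, dims ... */) { return false; }
};

// Only the <CPUDevice, float> specialization actually computes something.
template <>
struct LaunchDeepConvOpSketch<CPUDevice, float> {
  static bool Run(/* context, tensors, dims ... */) {
    // A Winograd-style deep convolution would run here.
    return true;
  }
};

int main() {
  std::cout << LaunchDeepConvOpSketch<GPUDevice, float>::Run() << '\n';  // 0: fall through
  std::cout << LaunchDeepConvOpSketch<CPUDevice, float>::Run() << '\n';  // 1: handled
}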

Otherwise launcher_, i.e. LaunchConv2DOp::operator(), is invoked; its generic GPU version is implemented with cuDNN.

    launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
              dimensions.dilation_rows, dimensions.dilation_cols,
              dimensions.stride_rows, dimensions.stride_cols, params_.padding,
              params_.explicit_paddings, output, params_.data_format);
  }

The LaunchConv2DOp member launcher_ is the actual executor.

 private:
  Conv2DParameters params_;
  bool use_cudnn_;
  bool cudnn_use_autotune_;

  LaunchConv2DOp<Device, T> launcher_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};

InitConv2DParameters

InitConv2DParameters reads the attributes from OpKernelConstruction into Conv2DParameters and validates them.
ShapeFromFormat builds a shape for the given data format.
GetTensorDim retrieves a dimension value by its character tag ('N', 'H', 'W', 'C').
CheckValidPadding validates the padding values.

Status InitConv2DParameters(const OpKernelConstruction* context,
                            Conv2DParameters* params) {
  TF_RETURN_IF_ERROR(context->GetAttr("dilations", &params->dilations));
  TF_RETURN_IF_ERROR(context->GetAttr("strides", &params->strides));
  TF_RETURN_IF_ERROR(context->GetAttr("padding", &params->padding));
  if (context->HasAttr("explicit_paddings")) {
    TF_RETURN_IF_ERROR(
        context->GetAttr("explicit_paddings", &params->explicit_paddings));
  }
  string data_format_string;
  TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
  TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
              errors::InvalidArgument("Invalid data format"));

  const auto& strides = params->strides;
  const auto& dilations = params->dilations;
  const auto& data_format = params->data_format;

  TF_REQUIRES(dilations.size() == 4,
              errors::InvalidArgument("Sliding window dilations field must "
                                      "specify 4 dimensions"));
  TF_REQUIRES(strides.size() == 4,
              errors::InvalidArgument("Sliding window strides field must "
                                      "specify 4 dimensions"));
  const int64_t stride_n = GetTensorDim(strides, data_format, 'N');
  const int64_t stride_c = GetTensorDim(strides, data_format, 'C');
  const int64_t stride_h = GetTensorDim(strides, data_format, 'H');
  const int64_t stride_w = GetTensorDim(strides, data_format, 'W');
  TF_REQUIRES(
      stride_n == 1 && stride_c == 1,
      errors::Unimplemented("Current implementation does not yet support "
                            "strides in the batch and depth dimensions."));
  TF_REQUIRES(stride_h > 0 && stride_w > 0,
              errors::InvalidArgument(
                  "Row and column strides should be larger than 0."));

  const int64_t dilation_n = GetTensorDim(dilations, data_format, 'N');
  const int64_t dilation_c = GetTensorDim(dilations, data_format, 'C');
  const int64_t dilation_h = GetTensorDim(dilations, data_format, 'H');
  const int64_t dilation_w = GetTensorDim(dilations, data_format, 'W');
  TF_REQUIRES(
      dilation_n == 1 && dilation_c == 1,
      errors::Unimplemented("Current implementation does not yet support "
                            "dilations in the batch and depth dimensions."));
  TF_REQUIRES(
      dilation_h > 0 && dilation_w > 0,
      errors::InvalidArgument("Dilated rates should be larger than 0."));

  TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
                                       params->explicit_paddings,
                                       /*num_dims=*/4, data_format));

  return Status::OK();
}
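The GetTensorDim calls above index the 4-element strides/dilations attributes by the position that the character ('N', 'H', 'W', 'C') occupies in the data format. A standalone sketch of the mapping (the real helper lives in tensor_format.h):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

enum class Format { NHWC, NCHW };

// Index of dimension `dim` ('N', 'H', 'W' or 'C') for the given format.
int DimIndexSketch(Format format, char dim) {
  if (format == Format::NHWC) {
    switch (dim) { case 'N': return 0; case 'H': return 1;
                   case 'W': return 2; case 'C': return 3; }
  } else {  // NCHW
    switch (dim) { case 'N': return 0; case 'C': return 1;
                   case 'H': return 2; case 'W': return 3; }
  }
  assert(false && "unknown dimension");
  return -1;
}

int64_t GetTensorDimSketch(const std::vector<int64_t>& attr, Format format, char dim) {
  return attr[DimIndexSketch(format, dim)];
}

int main() {
  std::vector<int64_t> strides = {1, 2, 2, 1};  // written down in NHWC order
  std::cout << GetTensorDimSketch(strides, Format::NHWC, 'H') << '\n';  // 2
  std::cout << GetTensorDimSketch(strides, Format::NHWC, 'C') << '\n';  // 1
}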

LaunchConv2DOp

Both CUDA and ROCm use the LaunchConv2DOp<Eigen::GpuDevice, T> specialization.

template <typename Device, typename T>
struct LaunchConv2DOp {
  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& input, const Tensor& filter, int row_dilation,
                  int col_dilation, int row_stride, int col_stride,
                  const Padding& padding,
                  const std::vector<int64_t>& explicit_paddings, Tensor* output,
                  TensorFormat data_format);
};

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct LaunchConv2DOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& input, const Tensor& filter, int row_dilation,
                  int col_dilation, int row_stride, int col_stride,
                  const Padding& padding,
                  const std::vector<int64_t>& explicit_paddings, Tensor* output,
                  TensorFormat data_format);
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

LaunchConv2DOp::operator()

[Flowchart: LaunchConv2DOp::operator() — ctx, input_param, filter → 1x1 or full-size filter? → Stream::ThenBlasGemm → End; otherwise ComputeInNhwcEnabled → explicit padding? → GetExplicitPaddingForDim → GetWindowedOutputSizeVerboseV2 → symmetric padding? → allocate_temp / PadInput → compute in NCHW? → NHWCToNCHW → transform_filter → GetDnnWorkspaceLimitOrDefault → AutotuneUnfusedConv → LaunchAutotunedConv → NCHWToNHWC]

The generic GPU version is implemented with cuDNN.

template <typename T>
void LaunchConv2DOp<GPUDevice, T>::operator()(
    OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
    const Tensor& input_param, const Tensor& filter, int row_dilation,
    int col_dilation, int row_stride, int col_stride, const Padding& padding,
    const std::vector<int64_t>& explicit_paddings, Tensor* output,
    TensorFormat data_format) {
  using se::dnn::AlgorithmConfig;
  using se::dnn::AlgorithmDesc;
  using se::dnn::ProfileResult;
  auto* stream = ctx->op_device_context()->stream();
  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

  if (!use_cudnn) {
    ctx->SetStatus(
        errors::Unimplemented("Conv2D for GPU is not currently supported "
                              "without cudnn"));
    return;
  }

The input dimension information is read as int64.

  Tensor input = input_param;
  const int64_t in_batch = GetTensorDim(input, data_format, 'N');
  int64_t in_rows = GetTensorDim(input, data_format, 'H');
  int64_t in_cols = GetTensorDim(input, data_format, 'W');
  const int64_t in_depths = GetTensorDim(input, data_format, 'C');
  const int64_t patch_rows = filter.dim_size(0);
  const int64_t patch_cols = filter.dim_size(1);
  const int64_t patch_depths = filter.dim_size(2);

  OP_REQUIRES(
      ctx, filter.NumElements() > 0,
      errors::InvalidArgument("filter must not have zero elements "
                              "(i.e. all dimensions must be non-zero)"));

If the filter depth (patch_depths) is 1 and smaller than the input depth, this is a depthwise convolution; more generally, if the filter depth divides but is smaller than the input depth, it is a grouped convolution.
For a 1x1 convolution in NHWC format (with unit strides and dilations and no grouping), Stream::ThenBlasGemm is called directly.
AsDeviceMemory maps a tensor to a DeviceMemory object that wraps a buffer of the given type.
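To see why the 1x1 case can go straight to cuBLAS: in NHWC every spatial position's channel vector is multiplied by the same in_depth x out_depth filter matrix, so the whole convolution is a single GEMM with m = batch * rows * cols, k = in_depth, n = out_depth. A naive standalone sketch of that equivalence (plain loops, illustrative only):

#include <cstdint>
#include <iostream>
#include <vector>

// output[p, o] = sum_c input[p, c] * filter[c, o], where p runs over every
// (batch, row, col) position: exactly C = A * B with
// m = batch*rows*cols, k = in_depth, n = out_depth.
std::vector<float> Conv1x1AsGemm(const std::vector<float>& input,   // [m, k] row-major (flattened NHWC)
                                 const std::vector<float>& filter,  // [k, n] row-major (HWIO with H=W=1)
                                 int64_t m, int64_t k, int64_t n) {
  std::vector<float> output(m * n, 0.0f);
  for (int64_t p = 0; p < m; ++p)
    for (int64_t c = 0; c < k; ++c)
      for (int64_t o = 0; o < n; ++o)
        output[p * n + o] += input[p * k + c] * filter[c * n + o];
  return output;
}

int main() {
  const int64_t batch = 1, rows = 2, cols = 2, in_depth = 3, out_depth = 4;
  std::vector<float> input(batch * rows * cols * in_depth, 1.0f);
  std::vector<float> filter(in_depth * out_depth, 0.5f);
  auto out = Conv1x1AsGemm(input, filter, batch * rows * cols, in_depth, out_depth);
  std::cout << out[0] << '\n';  // 0.5 summed over 3 input channels = 1.5
}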

  // If the filter in-depth (patch_depths) is 1 and smaller than the input
  // depth, it's a depthwise convolution. More generally, if the filter in-depth
  // divides but is smaller than the input depth, it is a grouped convolution.
  bool is_grouped_convolution = patch_depths != in_depths;
  if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
      row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
      col_stride == 1 && data_format == FORMAT_NHWC &&
      (padding == VALID || padding == SAME)) {
    // 1x1 filter, so call cublas directly.
    const uint64 m = in_batch * in_rows * in_cols;
    const uint64 k = patch_depths;
    const uint64 n = filter.dim_size(3);

    auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                input.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                output->template flat<T>().size());

    auto no_transpose = se::blas::Transpose::kNoTranspose;
    OP_REQUIRES_OK(
        ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n,
                                  a_ptr, k, &c_ptr, n,
                                  se::blas::kDefaultComputePrecision));
    return;

If the filter has exactly the same height and width as the input, with VALID padding and NHWC data format, Stream::ThenBlasGemm is likewise called directly.

  } else if (patch_rows == in_rows && patch_cols == in_cols &&
             !is_grouped_convolution && row_dilation == 1 &&
             col_dilation == 1 && padding == VALID &&
             data_format == FORMAT_NHWC) {
    // The input data and filter have the same height/width, so call cublas
    // directly.
    const uint64 m = in_batch;
    const uint64 k = patch_rows * patch_cols * patch_depths;
    const uint64 n = filter.dim_size(3);

    auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                input.template flat<T>().size());
    auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                filter.template flat<T>().size());
    auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                output->template flat<T>().size());

    auto no_transpose = se::blas::Transpose::kNoTranspose;
    OP_REQUIRES_OK(
        ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n,
                                  a_ptr, k, &c_ptr, n,
                                  se::blas::kDefaultComputePrecision));
    return;
  }

ComputeInNhwcEnabled decides based on the data type, the GPU compute capability, and the cuDNN version.
Tensor Cores support efficient convolution in the NHWC layout for fp16 on NVIDIA Volta+ GPUs and for tf32 on Ampere+ GPUs. In all other configurations it is more efficient to run the computation in the NCHW data format.
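A hedged sketch of the decision just described (the real ComputeInNhwcEnabled also consults the cuDNN version and environment overrides; the compute-capability thresholds below simply restate the Volta/Ampere rule):

#include <iostream>

enum class DType { kHalf, kFloat };

// NHWC is preferred only when Tensor Cores can be used for it:
// fp16 on Volta (sm_70) and newer, tf32 (float) on Ampere (sm_80) and newer.
bool ComputeInNhwcSketch(DType dtype, int cc_major, bool tf32_allowed) {
  const bool fp16_tensor_cores = (dtype == DType::kHalf && cc_major >= 7);
  const bool tf32_tensor_cores =
      (dtype == DType::kFloat && tf32_allowed && cc_major >= 8);
  return fp16_tensor_cores || tf32_tensor_cores;
}

int main() {
  std::cout << ComputeInNhwcSketch(DType::kHalf, 7, true) << '\n';   // 1: Volta fp16
  std::cout << ComputeInNhwcSketch(DType::kFloat, 7, true) << '\n';  // 0: pre-Ampere float -> NCHW
}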

#if GOOGLE_CUDA
  const bool compute_in_nhwc = ComputeInNhwcEnabled(DataTypeToEnum<T>::value,
                                                    stream, /*is_conv2d=*/true);
#else
  // fast NHWC implementation is a CUDA only feature
  const bool compute_in_nhwc = false;
#endif

  // We only do one directional conversion: NHWC->NCHW. We never convert in the
  // other direction. Grappler layout optimizer selects preferred layout and
  // adds necessary annotations to the graph.
  // TODO(ezhulenev): Convert in other direction for fp16?
  const TensorFormat compute_data_format =
      (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
                                                      : FORMAT_NCHW;

  VLOG(3) << "Compute Conv2D with cuDNN:"
          << " data_format=" << ToString(data_format)
          << " compute_data_format=" << ToString(compute_data_format);

Next, the output dimension information is obtained.
GetExplicitPaddingForDim reads the per-dimension padding values from explicit_paddings.
GetWindowedOutputSizeVerboseV2 computes the output size and the padding values.

  const int64_t out_batch = GetTensorDim(*output, data_format, 'N');
  const int64_t out_rows = GetTensorDim(*output, data_format, 'H');
  const int64_t out_cols = GetTensorDim(*output, data_format, 'W');
  const int64_t out_depths = GetTensorDim(*output, data_format, 'C');
  int64_t padding_top = -1, padding_bottom = -1;
  int64_t padding_left = -1, padding_right = -1;
  if (padding == EXPLICIT) {
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
                             &padding_bottom);
    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
                             &padding_right);
  }
  int64_t out_rows_check, out_cols_check;
  Status status = GetWindowedOutputSizeVerboseV2(
      in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
      &padding_top, &padding_bottom);
  // The status is guaranteed to be OK because we checked the output and padding
  // was valid earlier.
  TF_CHECK_OK(status);
  DCHECK_EQ(out_rows, out_rows_check);
  status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
                                          col_stride, padding, &out_cols_check,
                                          &padding_left, &padding_right);
  TF_CHECK_OK(status);
  DCHECK_EQ(out_cols, out_cols_check);
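For reference, a standalone sketch of the size and padding arithmetic that GetWindowedOutputSizeVerboseV2 performs for SAME and VALID padding (following the usual TensorFlow formulas; EXPLICIT padding just plugs in the supplied amounts):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Returns the output size and fills pad_before/pad_after for SAME/VALID.
int64_t WindowedOutputSizeSketch(int64_t input, int64_t filter, int64_t dilation,
                                 int64_t stride, bool same_padding,
                                 int64_t* pad_before, int64_t* pad_after) {
  const int64_t effective_filter = (filter - 1) * dilation + 1;
  if (!same_padding) {  // VALID: no padding at all
    *pad_before = *pad_after = 0;
    return (input - effective_filter + stride) / stride;
  }
  const int64_t output = (input + stride - 1) / stride;  // ceil(input / stride)
  const int64_t needed =
      std::max<int64_t>(0, (output - 1) * stride + effective_filter - input);
  *pad_before = needed / 2;           // note: possibly asymmetric
  *pad_after = needed - *pad_before;
  return output;
}

int main() {
  int64_t before, after;
  int64_t out = WindowedOutputSizeSketch(4, 3, 1, 2, /*same_padding=*/true, &before, &after);
  std::cout << out << " " << before << " " << after << '\n';  // 2 0 1
}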

cuDNN only supports symmetric padding, so OpKernelContext::allocate_temp allocates a temporary buffer transformed_input.
The extra amounts needed on the four sides to make the padding symmetric are computed as input_pad_top, input_pad_bottom, input_pad_left, and input_pad_right.
PadInput::operator() pads input_param to produce transformed_input.

  const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
  const int64_t common_padding_cols = std::min(padding_left, padding_right);
  if (padding_top != padding_bottom || padding_left != padding_right) {
    // cuDNN only supports padding the same amount on the left and right sides,
    // and on the top and bottom sides. So we manually create a new padded
    // input tensor such that we can pass it to cuDNN.
    VLOG(4) << "Pad input tensor:"
            << " padding_top=" << padding_top
            << " padding_bottom=" << padding_bottom
            << " padding_left=" << padding_left
            << " padding_right=" << padding_right;

    // TODO(reedwm): In some cases, we can avoid an allocation even if the two
    // padding sides are different. For example, if the input is 2x2, the filter
    // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
    // equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
    // such a way would allow us to avoid the allocation.
    Tensor transformed_input;
    const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top);
    const int64_t padding_cols_diff = std::abs(padding_right - padding_left);
    const int64_t new_in_rows = in_rows + padding_rows_diff;
    const int64_t new_in_cols = in_cols + padding_cols_diff;
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                            DataTypeToEnum<T>::value,
                            ShapeFromFormat(data_format, in_batch, new_in_rows,
                                            new_in_cols, in_depths),
                            &transformed_input));

    const int64_t input_pad_top = padding_top - common_padding_rows;
    const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
    const int64_t input_pad_left = padding_left - common_padding_cols;
    const int64_t input_pad_right = padding_right - common_padding_cols;
    bool in_bounds =
        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
    if (!in_bounds) {
      ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
      return;
    }
    functor::PadInput<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
        {{static_cast<int>(input_pad_bottom),
          static_cast<int>(input_pad_right)}},
        To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});

    input = transformed_input;
    in_rows = new_in_rows;
    in_cols = new_in_cols;
  }
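A small worked example of this adjustment (standalone sketch): with in_rows = 4, a 3x3 filter, stride 2 and SAME padding, the row padding is top = 0, bottom = 1, which cuDNN cannot express. The common part is min(0, 1) = 0, so one extra row is padded into the input manually and cuDNN then runs with zero vertical padding:

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>

int main() {
  const int64_t in_rows = 4, padding_top = 0, padding_bottom = 1;

  // Part of the padding cuDNN can express (symmetric)...
  const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
  // ...and the remainder that has to be baked into the input tensor.
  const int64_t input_pad_top = padding_top - common_padding_rows;
  const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
  const int64_t new_in_rows = in_rows + std::abs(padding_bottom - padding_top);

  std::cout << "cuDNN zero_padding_height = " << common_padding_rows << '\n';  // 0
  std::cout << "manual pad (top, bottom)  = (" << input_pad_top << ", "
            << input_pad_bottom << ")\n";                                      // (0, 1)
  std::cout << "padded in_rows            = " << new_in_rows << '\n';          // 5
}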

If the input format is NHWC but the compute format is NCHW, NHWCToNCHW is called to convert input.

  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the input tensor from NHWC to NCHW.";

    TensorShape nchw_shape =
        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
    if (in_depths > 1) {
      Tensor transformed_input;
      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                             nchw_shape, &transformed_input));
      functor::NHWCToNCHW<GPUDevice, T, 4>()(
          ctx->eigen_device<GPUDevice>(),
          const_cast<const Tensor&>(input).tensor<T, 4>(),
          transformed_input.tensor<T, 4>());
      input = transformed_input;
    } else {
      // If depth <= 1, then just reshape.
      CHECK(input.CopyFrom(input, nchw_shape));
    }
  } else {
    CHECK(data_format == compute_data_format)  // Crash OK
        << "Illegal data and compute format pair:"
        << " data_format=" << ToString(data_format)
        << " compute_data_format=" << ToString(compute_data_format);
  }

  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
      << "Negative row or col paddings: (" << common_padding_rows << ", "
      << common_padding_cols << ")";

  constexpr auto kComputeInNHWC =
      std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
                      se::dnn::FilterLayout::kOutputYXInput);
  constexpr auto kComputeInNCHW =
      std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
                      se::dnn::FilterLayout::kOutputInputYX);

  se::dnn::DataLayout compute_data_layout;
  se::dnn::FilterLayout filter_layout;

  std::tie(compute_data_layout, filter_layout) =
      compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;

  se::dnn::BatchDescriptor input_desc;
  input_desc.set_count(in_batch)
      .set_feature_map_count(in_depths)
      .set_height(in_rows)
      .set_width(in_cols)
      .set_layout(compute_data_layout);
  se::dnn::BatchDescriptor output_desc;
  output_desc.set_count(out_batch)
      .set_height(out_rows)
      .set_width(out_cols)
      .set_feature_map_count(out_depths)
      .set_layout(compute_data_layout);
  se::dnn::FilterDescriptor filter_desc;
  filter_desc.set_input_filter_height(patch_rows)
      .set_input_filter_width(patch_cols)
      .set_input_feature_map_count(patch_depths)
      .set_output_feature_map_count(filter.dim_size(3))
      .set_layout(filter_layout);
  se::dnn::ConvolutionDescriptor conv_desc;
  conv_desc.set_vertical_dilation_rate(row_dilation)
      .set_horizontal_dilation_rate(col_dilation)
      .set_vertical_filter_stride(row_stride)
      .set_horizontal_filter_stride(col_stride)
      .set_zero_padding_height(common_padding_rows)
      .set_zero_padding_width(common_padding_cols)
      .set_group_count(in_depths / patch_depths);

The filter tensor is then transformed from HWIO into the layout required by the compute format.

  Tensor transformed_filter;

  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
    VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
            << " to " << ToString(dst_format);

    TensorShape dst_shape =
        dst_format == FORMAT_OIHW
            ? TensorShape({filter.dim_size(3), filter.dim_size(2),
                           filter.dim_size(0), filter.dim_size(1)})
            : TensorShape({filter.dim_size(3), filter.dim_size(0),
                           filter.dim_size(1), filter.dim_size(2)});

    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
                                          &transformed_filter));
    functor::TransformFilter<GPUDevice, T, int, 4>()(
        ctx->eigen_device<GPUDevice>(), dst_format,
        To32Bit(filter.tensor<T, 4>()),
        To32Bit(transformed_filter.tensor<T, 4>()));

    return Status::OK();
  };

  if (compute_data_format == FORMAT_NCHW) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
  } else if (compute_data_format == FORMAT_NHWC) {
    OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
  } else {
    ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
                                           ToString(compute_data_format)));
    return;
  }
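The index mapping performed by TransformFilter is a plain 4-D transpose of the HWIO filter into OIHW (for NCHW compute) or OHWI (for NHWC compute). A standalone sketch of the OIHW case:

#include <cstdint>
#include <iostream>
#include <vector>

// dst[o][i][h][w] = src[h][w][i][o]; both buffers are dense row-major.
std::vector<float> HwioToOihw(const std::vector<float>& src,
                              int64_t H, int64_t W, int64_t I, int64_t O) {
  std::vector<float> dst(H * W * I * O);
  for (int64_t h = 0; h < H; ++h)
    for (int64_t w = 0; w < W; ++w)
      for (int64_t i = 0; i < I; ++i)
        for (int64_t o = 0; o < O; ++o)
          dst[((o * I + i) * H + h) * W + w] = src[((h * W + w) * I + i) * O + o];
  return dst;
}

int main() {
  // 2x2 filter, 1 input channel, 3 output channels.
  std::vector<float> hwio(2 * 2 * 1 * 3);
  for (size_t n = 0; n < hwio.size(); ++n) hwio[n] = static_cast<float>(n);
  auto oihw = HwioToOihw(hwio, 2, 2, 1, 3);
  std::cout << oihw[0] << " " << oihw[1] << '\n';  // (o=0,h=0,w=0)=0, (o=0,h=0,w=1)=3
}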

If the output format differs from the compute format, a temporary buffer transformed_output is allocated.

  Tensor transformed_output;
  if (data_format != compute_data_format) {
    VLOG(4) << "Allocate temporary memory for output in compute data format";
    OP_REQUIRES_OK(
        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                ShapeFromFormat(compute_data_format, out_batch,
                                                out_rows, out_cols, out_depths),
                                &transformed_output));
  } else {
    transformed_output = *output;
  }

GetDnnWorkspaceLimit reads the workspace size limit from an environment variable.
ConvAutotuneMap is an AutotuneSingleton; AutotuneSingleton::GetInstance returns the AutotuneMap object.
AutotuneUnfusedConv uses cuDNN autotuning to obtain an AutotuneEntry.
LaunchAutotunedConv executes the ConvRunner.
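ConvAutotuneMap itself is a process-wide singleton around an AutotuneMap keyed by ConvParameters. A simplified standalone sketch of the singleton pattern (hypothetical simplified types; the real AutotuneSingleton lives in gpu_utils.h):

#include <iostream>
#include <map>
#include <string>

// One lazily created, never destroyed map per "group" type.
template <class Group, class Parameters, class Config>
class AutotuneSingletonSketch {
 public:
  static std::map<Parameters, Config>* GetInstance() {
    static auto* instance = new std::map<Parameters, Config>();  // leaked on purpose
    return instance;
  }
};

struct ConvAutotuneGroup {};
using ConvAutotuneMapSketch =
    AutotuneSingletonSketch<ConvAutotuneGroup, std::string, int>;

int main() {
  (*ConvAutotuneMapSketch::GetInstance())["conv_3x3"] = 42;
  std::cout << ConvAutotuneMapSketch::GetInstance()->at("conv_3x3") << '\n';  // 42
}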

  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                  input.template flat<T>().size());
  auto filter_ptr =
      AsDeviceMemory(transformed_filter.template flat<T>().data(),
                     transformed_filter.template flat<T>().size());
  auto output_ptr =
      AsDeviceMemory(transformed_output.template flat<T>().data(),
                     transformed_output.template flat<T>().size());

  static int64_t ConvolveScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
  );

  int device_id = stream->parent()->device_ordinal();
  DataType dtype = input.dtype();
  ConvParameters conv_parameters = {in_batch,             // batch
                                    in_depths,            // in_depths
                                    {{in_rows,            // in_rows
                                      in_cols}},          // in_cols
                                    compute_data_format,  // compute_data_format
                                    out_depths,           // out_depths
                                    {{patch_rows,         // filter_rows
                                      patch_cols,         // filter_cols
                                      patch_depths}},     // filter_depths
                                    {{row_dilation,       // dilation_rows
                                      col_dilation}},     // dilation_cols
                                    {{row_stride,         // stride_rows
                                      col_stride}},       // stride_cols
                                    {{common_padding_rows,    // padding_rows
                                      common_padding_cols}},  // padding_cols
                                    dtype,                    // tensor datatype
                                    device_id,                // device_id
                                    conv_desc.group_count()};

  auto entry_or = AutotuneUnfusedConv(
      cudnn_use_autotune, ConvAutotuneMap::GetInstance(), conv_parameters, ctx,
      se::dnn::ConvolutionKind::FORWARD, input_desc, input_ptr, filter_desc,
      filter_ptr, conv_desc, output_desc, output_ptr, ConvolveScratchSize);
  OP_REQUIRES_OK(ctx, entry_or.status());
  auto autotune_entry = entry_or.ConsumeValueOrDie();

  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
  Status cudnn_launch_status = LaunchAutotunedConv(
      autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
      stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
      output_desc, output_ptr);
  if (!cudnn_launch_status.ok()) {
    ctx->SetStatus(cudnn_launch_status);
    return;
  }

  if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
    VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
    functor::NCHWToNHWC<GPUDevice, T, 4>()(
        ctx->eigen_device<GPUDevice>(),
        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
        output->tensor<T, 4>());
  }
}

AutotuneUnfusedConv

[Flowchart: AutotuneUnfusedConv — autotune_map, conv_parameters, ctx → AutotuneMap::Find hit? yes → return autotune_entry; no → WrapRedzoneBestEffort → AutotuneConvImpl → LogConvAutotuneResults → BestCudnnConvAlgorithm → AutotuneMap::Insert]

se::dnn::ConvOp implements the concept that LazyOpRunner requires for ConvRunner.
AutotuneMap is a helper class that looks up the best autotuned config from parameters.
BatchDescriptor describes the dimensions a layer consumes or produces.
FilterDescriptor does the same for the filter.
AutotuneEntry is an autotuning-map entry that supports the cuDNN frontend API; ROCm still uses the legacy API and needs an AlgorithmConfig.
ConvParameters uniquely identifies a convolution operation running on a particular device model.
AutotuneMap::Find looks up a config keyed by the parameters.
ScopedAnnotation annotates all activities during the instance's lifetime via the currently registered TraceCollector.

template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit) {
  AutotuneEntry<se::dnn::ConvOp> autotune_entry;

  auto* stream = ctx->op_device_context()->stream();

  if (!autotune_map->Find(conv_parameters, &autotune_entry)) {
    profiler::ScopedAnnotation annotation("cudnn_autotuning");

TfAllocatorAdapter is an adapter class that wraps a TensorFlow allocator.
RedzoneAllocator allocates a little extra memory at the start and end of each allocation and can check that this memory was not modified.
WrapRedzoneBestEffort calls RedzoneAllocator::AllocateBytes to allocate the DeviceMemory.

#if GOOGLE_CUDA
    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    // TODO(awpr): second-guess whether it's okay that this profiles
    // convolutions on uninitialized memory.
    switch (kind) {
      case se::dnn::ConvolutionKind::FORWARD:
      case se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION:
        output_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, output_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_DATA:
        input_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, input_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_FILTER:
        filter_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, filter_ptr));
        break;
      default:
        return errors::InvalidArgument(
            absl::StrFormat("Unknown ConvolutionKind %d", kind));
    }

launch_func invokes the se::dnn::ConvRunner backend.
AutotuneConvImpl executes the passed-in launch_func and yields a set of AutotuneResults.
LogConvAutotuneResults records them in an AutotuningLog.

    const auto element_type = se::dnn::ToDataType<T>::value;
    std::vector<std::unique_ptr<const se::dnn::ConvRunner>> runners;
    TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
        CudnnUseFrontend(), kind, element_type, element_type, stream,
        input_desc, input_ptr, filter_desc, filter_ptr, output_desc, output_ptr,
        conv_desc, /*use_fallback=*/false, &rz_allocator, &runners));
    auto launch_func =
        [&](se::ScratchAllocator* allocator_used,
            const std::unique_ptr<const se::dnn::ConvRunner>& runner,
            se::dnn::ProfileResult* profile_result) -> Status {
      TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                            runner->GetWorkspaceSize()));
      return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
                       output_ptr);
    };
    SE_ASSIGN_OR_RETURN(
        auto results,
        AutotuneConvImpl(ctx, runners, cudnn_use_autotune, launch_func,
                         scratch_size_limit, rz_allocator));

    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

Two-level autotuning: the cuDNN frontend provides two engine lists, heuristics and fallback. Heuristics engines are normally faster, so to reduce autotuning time the fallback engines are only evaluated when none of the heuristics engines work.
BestCudnnConvAlgorithm creates an AutotuneEntry from the best result.

    // Two-level autotuning: Cudnn frontend supports two engine lists:
    // heuristics and fallback. Heuristics engines are normally faster.
    // To reduce autotuning time, we evaluate the fallback engines only when
    // none of the heuristics engines work.
    bool found_working_engine = false;
    for (auto& result : results) {
      if (!result.has_failure()) {
        found_working_engine = true;
        break;
      }
    }

    if (!CudnnUseFrontend() || found_working_engine) {
      SE_ASSIGN_OR_RETURN(
          autotune_entry,
          BestCudnnConvAlgorithm<se::dnn::ConvOp>(results, std::move(runners)));
    } else {
      LOG(WARNING)
          << "None of the algorithms provided by cuDNN frontend heuristics "
             "worked; trying fallback algorithms.  Conv: "
          << conv_parameters.ToString();
      std::vector<std::unique_ptr<const se::dnn::ConvRunner>> fallback_runners;
      TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
          CudnnUseFrontend(), kind, element_type, element_type, stream,
          input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
          output_ptr, conv_desc, /*use_fallback=*/true, &rz_allocator,
          &fallback_runners));

      SE_ASSIGN_OR_RETURN(
          auto fallback_results,
          AutotuneConvImpl(ctx, fallback_runners, cudnn_use_autotune,
                           launch_func, scratch_size_limit, rz_allocator));

      LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                             filter_ptr, output_ptr, input_desc, filter_desc,
                             output_desc, conv_desc, stream->parent(),
                             fallback_results);

      SE_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::ConvOp>(
                              fallback_results, std::move(fallback_runners)));
    }

The ROCm implementation:

#elif TENSORFLOW_USE_ROCM
    DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);

    std::vector<se::dnn::ProfileResult> algorithms;
    if (!stream->parent()->GetMIOpenConvolveAlgorithms(
            kind, se::dnn::ToDataType<T>::value, stream, input_desc, input_ptr,
            filter_desc, filter_ptr, output_desc, output_ptr, conv_desc,
            &scratch_allocator, &algorithms)) {
      return errors::Unknown(
          "Failed to get convolution algorithm. This is probably "
          "because MIOpen failed to initialize, so try looking to "
          "see if a warning log message was printed above.");
    }

    std::vector<tensorflow::AutotuneResult> results;
    if (algorithms.size() == 1) {
      auto profile_result = algorithms[0];
      results.emplace_back();
      auto& result = results.back();
      *result.mutable_algorithm() = profile_result.algorithm().ToProto();

      result.set_scratch_bytes(profile_result.scratch_size());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    } else {
      for (auto miopen_algorithm : algorithms) {
        auto profile_algorithm = miopen_algorithm.algorithm();
        se::dnn::ProfileResult profile_result;
        auto miopen_launch_status = stream->ConvolveWithAlgorithm(
            kind, input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
            output_ptr, conv_desc, &scratch_allocator,
            se::dnn::AlgorithmConfig(profile_algorithm,
                                     miopen_algorithm.scratch_size()),
            &profile_result);
        if (miopen_launch_status.ok() && profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          *result.mutable_algorithm() = profile_algorithm.ToProto();

          result.set_scratch_bytes(scratch_allocator.TotalByteSize());
          *result.mutable_run_time() = proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

    SE_ASSIGN_OR_RETURN(auto algo_desc, BestCudnnConvAlgorithm(results));
    autotune_entry = AutotuneEntry<se::dnn::ConvOp>(algo_desc);
#endif

AutotuneMap::Insert inserts the convolution parameters and the corresponding AutotuneEntry into the AutotuneMap.

    autotune_map->Insert(conv_parameters, autotune_entry);
  }

  return autotune_entry;
}

AutotuneMap


// A helper class that looks up the best autotuned config from parameters.
// Due to the noisy nature of autotune, especially with multiple devices, it
// only accepts a config if its margin exceeds a threshold.
// For the same shape configs, if a new best config matches the previous best,
// they get promoted; otherwise, the winner gets demoted. This process stops
// when the winner's score exceeds the threshold.
// In a bad case when two configs are very close to each other and flips
// back and forth randomly, the expected number of experiments before autotune
// settles is O(threshold ^ 2). So we recommend that number of warmup runs
// for any benchmarks.
template <typename Parameters, typename Config>
class AutotuneMap {
 private:
  // Retrieves the hash code of Parameters class.
  struct Hasher {
    std::size_t operator()(const Parameters& parameter) const {
      return parameter.hash();
    }
  };

AutotuneMap::Find

If the score is below the minimum threshold and the maximum autotune count has not yet been reached, Find returns false. Under this mechanism the number of autotuning runs is not fixed.

 public:
  bool Find(const Parameters& params, Config* config) const {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    if (iter == params_config_map_.end() ||
        (iter->second.score < min_score_threshold_ &&
         iter->second.count <= max_autotune_count_)) {
      return false;
    }
    *config = iter->second.config;
    return true;
  }

AutotuneMap::Insert

The score is an internally defined mechanism: a new entry starts with a score of 1, and min_score_threshold_ defaults to 1, so does that mean only the old entry is kept? (With the default threshold of 1, a freshly created entry is in fact accepted immediately by Find.)
First, params_config_map_ is checked for the parameters.
If there is no entry yet, one is created with a default score of 1;
otherwise, if the existing score is below the minimum threshold and the autotune count has not reached its limit, a different config demotes the current winner and an identical config promotes it.
If min_score_threshold_ is 2, only stable values are retained: a new config starts at score 1, a matching follow-up insertion promotes it to 2 and it is accepted, while a mismatching one demotes it to 0 and erases the entry.

  void Insert(const Parameters& params, const Config& config) {
    mutex_lock lock(mu_);
    auto iter = params_config_map_.find(params);
    int new_score = 0;
    if (iter == params_config_map_.end()) {
      // Create a new entry if params is new.
      VLOG(1) << GetActionSummary("creates", params, config);
      params_config_map_.insert(
          std::make_pair(params, ValueType{config, 1, 1}));
      new_score = 1;
    } else if (iter->second.score < min_score_threshold_ &&
               iter->second.count <= max_autotune_count_) {
      DCHECK_GT(iter->second.score, 0);
      if (iter->second.config != config) {
        // If it is different from the current winner, demotes the winner.
        VLOG(1) << GetActionSummary("demotes", params, config);
        new_score = --iter->second.score;
        ++iter->second.count;
        if (new_score <= 0) {
          VLOG(1) << GetActionSummary("erases", params, config);
          params_config_map_.erase(iter);
        }
      } else {
        // If it is the same as the current winner, promotes the winner.
        VLOG(1) << GetActionSummary("promotes", params, config);
        new_score = ++iter->second.score;
        ++iter->second.count;
      }
    }

If new_score is below the minimum threshold but the global autotune count has already exceeded its limit, the current config (or the winner already present in the map) is accepted and its score is set to min_score_threshold_.

    if (new_score >= min_score_threshold_) {
      VLOG(1) << GetActionSummary("accepts", params, config);
    } else if (autotune_global_count_ >= max_autotune_global_count_) {
      // The autotuning exceeds the max iteration threshold and we accept the
      // the winner if it exists in the map, otherwise we accept the current
      // winner.
      auto winner = params_config_map_.find(params);
      if (winner == params_config_map_.end()) {
        VLOG(1) << GetActionSummary("creates", params, config);
        for (int i = 0; i < min_score_threshold_; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        params_config_map_.insert(
            std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
      } else {
        int promotes_times = min_score_threshold_ - winner->second.score;
        for (int i = 0; i < promotes_times; ++i) {
          VLOG(1) << GetActionSummary("promotes", params, config);
        }
        winner->second.score = min_score_threshold_;
      }
      VLOG(1) << GetActionSummary("accepts", params, config);
    }
    autotune_global_count_++;
  }
  std::unordered_map<Parameters, Config, Hasher> GetMap() const {
    mutex_lock lock(mu_);
    std::unordered_map<Parameters, Config, Hasher> map;
    for (const auto& entry : params_config_map_) {
      map.insert(std::make_pair(entry.first, entry.second.config));
    }
    return map;
  }
  // Only for testing
  void ClearMap() {
    mutex_lock lock(mu_);
    params_config_map_.clear();
  }

 private:
  // Underlying data structure of values in the map.
  struct ValueType {
    Config config;
    int32 score;
    int32 count;
  };

If min_score_threshold_ is left at its default, max_autotune_count_ is at least min_warmup_iterations (it is max(5 * threshold^2, min_warmup_iterations)), and max_autotune_global_count_ is twice max_autotune_count_.

  AutotuneMap(const std::string& name) : name_(name) {
    min_score_threshold_ = 1;
    int min_warmup_iterations = 10;
    const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
    if (threshold_str != nullptr) {
      VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str;
      strings::safe_strto32(threshold_str, &min_score_threshold_);
    }
    const char* min_warmup_iteration_str =
        getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
    if (min_warmup_iteration_str != nullptr) {
      strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
    }
    min_score_threshold_ = std::max(min_score_threshold_, 1);
    max_autotune_count_ = std::max(
        5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
    max_autotune_global_count_ = 2 * max_autotune_count_;
    autotune_global_count_ = 0;
  }

  template <class Group, class Params, class Cfg>
  friend class AutotuneSingleton;

  std::string GetActionSummary(StringPiece action, const Parameters& params,
                               const Config& config) {
    return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
                           string(action).c_str(), params.ToString().c_str(),
                           config.ToString().c_str());
  }

  mutable mutex mu_;

  std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
      TF_GUARDED_BY(mu_);
  std::string name_;
  int32 min_score_threshold_;
  int32 max_autotune_count_;
  int32 max_autotune_global_count_;
  int32 autotune_global_count_;

  TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
};
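To make the scoring concrete, here is a small standalone simulation of the promote/demote path described above (the global-count escape hatch is omitted; the types are hypothetical, with a string standing in for the config):

#include <iostream>
#include <map>
#include <string>

struct Value { std::string config; int score; int count; };

class ScoreMapSketch {
 public:
  ScoreMapSketch(int threshold, int max_count)
      : threshold_(threshold), max_count_(max_count) {}

  // Fails while the entry's score is still below the threshold.
  bool Find(const std::string& key, std::string* config) const {
    auto it = map_.find(key);
    if (it == map_.end() ||
        (it->second.score < threshold_ && it->second.count <= max_count_))
      return false;
    *config = it->second.config;
    return true;
  }

  void Insert(const std::string& key, const std::string& config) {
    auto it = map_.find(key);
    if (it == map_.end()) {
      map_[key] = Value{config, 1, 1};            // new entry starts at score 1
    } else if (it->second.score < threshold_ && it->second.count <= max_count_) {
      if (it->second.config != config) {
        int new_score = --it->second.score;       // different winner: demote
        ++it->second.count;
        if (new_score <= 0) map_.erase(it);
      } else {
        ++it->second.score;                       // same winner: promote
        ++it->second.count;
      }
    }
  }

 private:
  std::map<std::string, Value> map_;
  int threshold_, max_count_;
};

int main() {
  ScoreMapSketch m(/*threshold=*/2, /*max_count=*/10);
  std::string best;
  m.Insert("conv_3x3", "algo_A");
  std::cout << m.Find("conv_3x3", &best) << '\n';  // 0: score 1 < threshold 2
  m.Insert("conv_3x3", "algo_A");
  std::cout << m.Find("conv_3x3", &best) << '\n';  // 1: promoted to 2, accepted
}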

AutotuneConvImpl

A device may subclass DeviceContext to pass device-specific context to OpKernels.
se::TfAllocatorAdapter is an adapter class that wraps a TensorFlow allocator.

template <typename LaunchFunc, typename Sig>
StatusOr<std::vector<tensorflow::AutotuneResult>> AutotuneConvImpl(
    OpKernelContext* ctx,
    std::vector<std::unique_ptr<const se::dnn::OpRunner<Sig>>>& runners,
    bool actually_do_autotune, const LaunchFunc& launch_func,
    size_t scratch_size_limit, const se::RedzoneAllocator& rz_allocator) {
  auto* stream = ctx->op_device_context()->stream();

  se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                              stream);

se::dnn::OpRunner is an abstract class that owns cached state for a particular op configuration. Its main motivation is that cuDNN backend execution plans (ExecutionPlan) are expensive to recreate; every OpRunner must outlive its parent Stream.
RedzoneAllocator is the redzone-checking allocator described above.
CudnnLegacyConvRunner::ToAlgorithmDesc calls CudnnLegacyConvRunner::MakeAlgorithmDesc to create a dnn::AlgorithmDesc object.
se::dnn::ProfileResult describes the result of a perf experiment.
If autotuning is actually requested, launch_func is called; otherwise the fields of profile_result are filled in manually.

  std::vector<tensorflow::AutotuneResult> results;
  // TODO(reedwm): Warn if determinism is enabled after autotune is run
  for (auto& runner : runners) {
    // TODO(zhengxq): profile each algorithm multiple times to better
    // accuracy.
    se::RedzoneAllocator rz_scratch_allocator(
        stream, &tf_allocator_adapter, se::GpuAsmOpts(),
        /*memory_limit=*/scratch_size_limit);
    DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
    se::ScratchAllocator* allocator_used =
        !RedzoneCheckDisabled()
            ? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
            : static_cast<se::ScratchAllocator*>(&scratch_allocator);

    SE_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc());
    se::dnn::ProfileResult profile_result;
    Status cudnn_launch_status =
        actually_do_autotune
            ? launch_func(allocator_used, runner, &profile_result)
            : OkStatus();
    if (!actually_do_autotune) {
      // Make the result valid according to `is_valid`.
      profile_result.set_algorithm(desc);
      profile_result.set_elapsed_time_in_ms(0);
    }

Can the runner fail at run time?
ProfileResult::is_valid checks that an AlgorithmDesc is present and the elapsed time is sane.
RedzoneCheckDisabled reads the TF_DISABLE_RZ_CHECK environment variable.
RedzoneAllocator::TotalAllocatedBytesExcludingRedzones returns the number of bytes allocated.
DnnScratchAllocator::TotalByteSize returns the total scratch size.
proto_utils::ToDurationProto converts an absl::Duration into a google::protobuf::Duration.

    // We need to make sure the profiling results are one-to-one with the
    // "runners". So, we insert dummy results when the execution fails.
    results.emplace_back();
    auto& result = results.back();
    *result.mutable_algorithm() = desc.ToProto();
    if (cudnn_launch_status.ok() && profile_result.is_valid()) {
      result.set_scratch_bytes(
          !RedzoneCheckDisabled()
              ? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones()
              : scratch_allocator.TotalByteSize());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));

      CheckRedzones(rz_scratch_allocator, &result);
      CheckRedzones(rz_allocator, &result);
    } else {
      result.mutable_failure()->set_kind(AutotuneResult::UNKNOWN);
      result.mutable_failure()->set_msg(
          absl::StrCat("Profiling failure on CUDNN engine ", desc.ToString(),
                       ": ", cudnn_launch_status.ToString()));
    }
  }

  return results;
}

LogConvAutotuneResults

AutotuningLog contains the AutotuneResults together with software and hardware information.
ConvolutionProto records the convolution information.

void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
                            se::dnn::DataType element_type,
                            se::DeviceMemoryBase input_buffer,
                            se::DeviceMemoryBase filter_buffer,
                            se::DeviceMemoryBase output_buffer,
                            const se::dnn::BatchDescriptor& input_desc,
                            const se::dnn::FilterDescriptor& filter_desc,
                            const se::dnn::BatchDescriptor& output_desc,
                            const se::dnn::ConvolutionDescriptor& conv_desc,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results) {
  AutotuningLog log;
  {
    ConvolutionProto instr;
    instr.set_kind(kind);
    *instr.mutable_input() = input_desc.ToProto(element_type);
    *instr.mutable_filter() = filter_desc.ToProto(element_type);
    *instr.mutable_output() = output_desc.ToProto(element_type);
    *instr.mutable_conv_desc() = conv_desc.ToProto();
    instr.set_conv_scale(1);
    instr.set_side_value_scale(0);
    instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
    instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
    instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
    log.mutable_instr()->PackFrom(std::move(instr));
  }

GetCudnnVersion and GetComputeCapability query the cuDNN version and the GPU compute capability from the StreamExecutor.

  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
  log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
  {
    string blas_version;
    if (auto* blas = stream_exec->AsBlas()) {
      if (blas->GetVersion(&blas_version).ok()) {
        log.set_blas_version(blas_version);
      }
    }
  }
  for (const auto& result : results) {
    *log.add_results() = result;
  }
  VLOG(2) << log.DebugString();
  Logger::GetSingleton()->LogProto(log);
}

BestCudnnConvAlgorithm

BestCudnnConvAlgorithmIndices finds the index of the fastest algorithm.
AutotuneEntry::FromOpRunners initializes an entry from pre-cached OpRunners, e.g. during autotuning.
TF_ASSIGN_OR_RETURN evaluates an expression and assigns the value to a variable on success; otherwise it returns the error status.

template <typename Op>
StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
        runners) {
  if (runners.size() != results.size()) {
    return errors::Internal(
        "Mismatched size of autotune results and runners vectors.");
  }
  int idx;
  int idx_no_scratch;
  TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
                      BestCudnnConvAlgorithmIndices(results));
  VLOG(2) << "fastest algorithm: "
          << proto_utils::FromDurationProto(results[idx].run_time())
          << " with algo " << runners[idx]->ToString() << ", workspace bytes "
          << results[idx].scratch_bytes();
  return AutotuneEntry<Op>::FromOpRunners(
      std::move(runners[idx]), idx_no_scratch == -1 || idx_no_scratch == idx
                                   ? nullptr
                                   : std::move(runners[idx_no_scratch]));
}

BestCudnnConvAlgorithmIndices

compare_run_times compares the run times of two results.

StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
    absl::Span<const AutotuneResult> results) {
  auto compare_run_times = [](const AutotuneResult& lhs,
                              const AutotuneResult& rhs) {
    return proto_utils::FromDurationProto(lhs.run_time()) <
           proto_utils::FromDurationProto(rhs.run_time());
  };

Iterate over the results and find the index of the fastest one.

  int idx = -1;
  int idx_no_scratch = -1;
  for (int i = 0; i < results.size(); i++) {
    if (!results[i].has_failure()) {
      if (OpDeterminismRequired()) {
        // When determinism is enabled, choose first working algorithm, and
        // don't choose a no_scratch algorithm.
        idx = i;
        break;
      }
      if (idx == -1 || compare_run_times(results[i], results[idx])) {
        idx = i;
      }
      if (results[i].scratch_bytes() == 0 &&
          (idx_no_scratch == -1 ||
           compare_run_times(results[i], results[idx_no_scratch]))) {
        idx_no_scratch = i;
      }
    }
  }

If no working algorithm was found, return an error.

  if (idx == -1) {
    std::ostringstream msg;
    msg << "No algorithm worked!  Error messages:";
    // TODO(awpr): identify the algorithm as part of this error message, too.
    for (const auto& result : results) {
      msg << "\n  " << result.failure().msg();
    }
    return errors::NotFound(msg.str());
  }

  return std::make_tuple(idx, idx_no_scratch);
}

LaunchAutotunedConv

[Flowchart: LaunchAutotunedConv — autotune_entry, scratch_allocator, kind, stream → is_algorithm_config? yes → ConvolveWithAlgorithm → End; no → AutotuneEntry::GetOpRunners → LazyOpRunner::GetOrCreateRunner → CudnnExecutionPlanRunner::operator()]

AutotuneEntry::is_algorithm_config checks whether an AlgorithmConfig is used.
AutotuneEntry::GetOpRunners returns the AutotuneEntry::OpRunners struct.
se::dnn::ConvOp::Config holds the convolution's data types and descriptors.
LazyOpRunner::GetOrCreateRunner returns an already-initialized OpRunner if one is available, otherwise creates one.
AllocateScratchOrFallback returns a pointer to the primary OpRunner of runners with scratch memory allocated when possible; otherwise it returns a pointer to the no-scratch fallback runner together with an empty DeviceMemoryBase.
ConvRunner is the runner signature with three inputs.
CudnnExecutionPlanRunner::operator() invokes the cuDNN frontend operation.

template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                           DnnScratchAllocator* scratch_allocator,
                           se::dnn::ConvolutionKind kind, se::Stream* stream,
                           const se::dnn::BatchDescriptor& input_desc,
                           se::DeviceMemory<T> in_ptr,
                           const se::dnn::FilterDescriptor& filter_desc,
                           se::DeviceMemory<T> filter_ptr,
                           const se::dnn::ConvolutionDescriptor& conv_desc,
                           const se::dnn::BatchDescriptor& output_desc,
                           se::DeviceMemory<T> out_ptr) {
  if (!autotune_entry.is_algorithm_config()) {
    const auto& runners = autotune_entry.GetOpRunners();
    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
    se::dnn::ConvOp::Config config{kind,       element_type, element_type,
                                   input_desc, filter_desc,  output_desc,
                                   conv_desc};
    TF_ASSIGN_OR_RETURN(auto* primary,
                        runners.primary->GetOrCreateRunner(config, stream));

    const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
    if (runners.no_scratch_fallback) {
      TF_ASSIGN_OR_RETURN(
          no_scratch_fallback,
          runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
    }

    TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
                        AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                            scratch_allocator, primary, no_scratch_fallback));
    auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
    return runner(stream, nullptr,
                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
                  filter_ptr, out_ptr);

Otherwise Stream::ConvolveWithAlgorithm is called.

  } else {
    return stream->ConvolveWithAlgorithm(
        kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
        conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
        nullptr);
  }
}

