A First Look at TensorRT (3): explicit_batch vs implicit_batch

Explicit batch vs. implicit batch

  • implicit batch demo
  • explicit batch demo
  • Comparison summary

Official documentation reference for explicit_batch and implicit_batch:
Explicit vs Implicit Batch

In short, per the official documentation: with implicit batch, tensors carry no batch-dimension information and all dimensions must be build-time constants. TensorRT retains implicit batch only for backward compatibility, so it is not recommended for new code.
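
The difference boils down to how the network is created and where the batch dimension appears in tensor shapes. A condensed side-by-side sketch (assuming a builder created with createInferBuilder as in the demos below; variable names are illustrative):

// Implicit batch (legacy): the batch dimension is NOT part of the tensor shape;
// it is bounded by setMaxBatchSize at build time and chosen per call via execute(batch, ...).
INetworkDefinition* net_implicit = builder->createNetworkV2(0);
net_implicit->addInput("input", DataType::kFLOAT, Dims3{3, 4, 4});    // CHW only

// Explicit batch (recommended): the batch dimension is the first dimension of every tensor.
uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* net_explicit = builder->createNetworkV2(flag);
net_explicit->addInput("input", DataType::kFLOAT, Dims4{3, 3, 4, 4}); // NCHW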

implicit batch demo

Below is a demo showing how implicit batch is used. The network contains a single layer, a conv2d; the complete code is as follows:

#include "NvInfer.h"
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(size_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(size_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  INetworkDefinition* network = builder->createNetworkV2(0); // implicit batch
  builder->setMaxBatchSize(3);
  
  Dims3 input_shape{3, 4, 4};
  Dims4 filter_shape{1, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  const int exec_batch = 3;
  Dims4 output_shape {exec_batch, 1, 3, 3};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float>filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())}; // count is int64_t; cast avoids narrowing in aggregate init
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");

  {
    std::cout << "conv2d input tensor dims : [";
    for(size_t i = 0; i < conv2d->getInput(0)->getDimensions().nbDims; i++ ) {
      std::cout << conv2d->getInput(0)->getDimensions().d[i] << " "; 
    }
    std::cout << "]" << std::endl;

    std::cout << "conv2d output tensor dims : [";
    for(size_t i = 0; i < conv2d->getOutput(0)->getDimensions().nbDims; i++ ) {
      std::cout << conv2d->getOutput(0)->getDimensions().d[i] << " "; 
    }
    std::cout << "]" << std::endl;
  }

  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape)*exec_batch, DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->execute(3, buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
}
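
Since every input element and every filter weight equals DEFAULT_VALUE (1.0), each output element should be the sum over one 3 × 2 × 2 convolution window, i.e. 12, and the output buffer should hold 3 × 1 × 3 × 3 = 27 such values.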

Key API usage:

INetworkDefinition* network = builder->createNetworkV2(0); // implicit batch
builder->setMaxBatchSize(3);
context->execute(3, buffers);

With implicit batch, the input shape carries no batch information (Dims3 input_shape{3, 4, 4}); the batch size is supplied at execution time via execute(3, buffers).
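
One practical consequence: because the batch is not baked into the engine, the same implicit-batch engine can run with any batch size up to the maxBatchSize set at build time. A sketch reusing the engine, context, and buffers from the demo above (the batch value here is illustrative):

// Any batch <= the maxBatchSize (3) set at build time is valid;
// with batch 2, only the first 2 * ProductOfDims(input_shape) input elements are read.
const int runtime_batch = 2;
context->execute(runtime_batch, buffers);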

explicit batch demo

For comparison, the complete explicit batch demo is as follows:

#include "NvInfer.h"
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(size_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(size_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag); // explicit_batch
  
  Dims4 input_shape{3, 3, 4, 4};
  Dims4 filter_shape{1, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  Dims4 output_shape {3, 1, 3, 3};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float>filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())}; // count is int64_t; cast avoids narrowing in aggregate init
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");

  {
    std::cout << "conv2d input tensor dims : [";
    for(size_t i = 0; i < conv2d->getInput(0)->getDimensions().nbDims; i++ ) {
      std::cout << conv2d->getInput(0)->getDimensions().d[i] << " "; 
    }
    std::cout << "]" << std::endl;

    std::cout << "conv2d output tensor dims : [";
    for(size_t i = 0; i < conv2d->getOutput(0)->getDimensions().nbDims; i++ ) {
      std::cout << conv2d->getOutput(0)->getDimensions().d[i] << " "; 
    }
    std::cout << "]" << std::endl;
  }

  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->executeV2(buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
}

With explicit batch, the input must be specified with four dimensions, Dims4 input_shape{3, 3, 4, 4}, and inference is run with executeV2(buffers).
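
In this demo the batch size is fixed to 3 at build time. If a runtime-variable batch is needed under explicit batch, TensorRT supports it through dynamic shapes and optimization profiles. A minimal sketch of the idea (not a complete program; it assumes the same builder, config, network, and context variables as the demo above):

// Build time: leave the batch dimension dynamic (-1) and describe its allowed range.
auto input = network->addInput("input", DataType::kFLOAT, Dims4{-1, 3, 4, 4});
IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("input", OptProfileSelector::kMIN, Dims4{1, 3, 4, 4});
profile->setDimensions("input", OptProfileSelector::kOPT, Dims4{3, 3, 4, 4});
profile->setDimensions("input", OptProfileSelector::kMAX, Dims4{8, 3, 4, 4});
config->addOptimizationProfile(profile);

// Run time: pick the actual batch for this execution before calling executeV2.
context->setBindingDimensions(0, Dims4{2, 3, 4, 4}); // binding index 0 is "input" here
context->executeV2(buffers);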

Comparison summary

|             | explicit batch                                     | implicit batch                                       |
|-------------|----------------------------------------------------|------------------------------------------------------|
| How created | createNetworkV2 with the kEXPLICIT_BATCH flag (1)  | createNetworkV2(0) plus builder->setMaxBatchSize(3)  |
| Input shape | 4-D, batch is the first dimension                  | 3-D, no batch information                            |
| Execution   | executeV2(buffers)                                 | execute(batch, buffers)                              |
