Reference link to the official documentation on explicit_batch and implicit_batch:
Explicit vs Implicit Batch
The relevant part of the official documentation can be summarized as follows: with implicit batch, a tensor carries no batch-dimension information and all dimensions must be build-time constants. TensorRT keeps implicit batch only for backward compatibility, so it is not recommended for new code.
Below is a demo showing implicit-batch usage. The network contains a single layer, a conv2d. The complete code is as follows:
#include "NvInfer.h"
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
class Logger : public ILogger
{
public:
void log(Severity severity, const char* msg) noexcept override
{
// suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
};
size_t ProductOfDims(Dims dims) {
size_t result = 1;
for(size_t i = 0; i < dims.nbDims; i++) {
result *= dims.d[i];
}
return result;
}
std::string DimsToStr(Dims dims) {
std::stringstream ss;
for(size_t i = 0; i < dims.nbDims; i++) {
ss << dims.d[i] << " ";
}
return ss.str();
}
int main() {
Logger logger;
// Create a Network Definition
IBuilder* builder = createInferBuilder(logger);
INetworkDefinition* network = builder->createNetworkV2(0); // implicit batch
builder->setMaxBatchSize(3);
Dims3 input_shape{3, 4, 4};
Dims4 filter_shape{1, 3, 2, 2};
DimsHW kernel_size{2, 2};
DimsHW stride{1, 1};
const int exec_batch = 3;
Dims4 output_shape {exec_batch, 1, 3, 3};
// Add the Input layer to the network
auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
// Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
std::vector<float>filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
Weights filter_w{DataType::kFLOAT, filter.data(), filter.size()};
Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
int32_t output_channel = filter_shape.d[0];
auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
conv2d->setStrideNd(stride);
// Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
conv2d->getOutput(0)->setName("output");
{
std::cout << "conv2d input tensor dims : [";
for(size_t i = 0; i < conv2d->getInput(0)->getDimensions().nbDims; i++ ) {
std::cout << conv2d->getInput(0)->getDimensions().d[i] << " ";
}
std::cout << "]" << std::endl;
std::cout << "conv2d output tensor dims : [";
for(size_t i = 0; i < conv2d->getOutput(0)->getDimensions().nbDims; i++ ) {
std::cout << conv2d->getOutput(0)->getDimensions().d[i] << " ";
}
std::cout << "]" << std::endl;
}
// Mark it as the output of the entire network:
network->markOutput(*conv2d->getOutput(0));
// Building an Engine(optimize the network)
IBuilderConfig* config = builder->createBuilderConfig();
IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
IRuntime* runtime = createInferRuntime(logger);
ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
// Prepare input_data
int32_t inputIndex = engine->getBindingIndex("input");
int32_t outputIndex = engine->getBindingIndex("output");
std::vector<float> input(ProductOfDims(input_shape)*exec_batch, DEFAULT_VALUE);
std::vector<float> output(ProductOfDims(output_shape));
void *GPU_input_Buffer_ptr; // a host ptr pointing to a GPU buffer
void *GPU_output_Buffer_ptr; // a host ptr pointing to a GPU buffer
void* buffers[2];
cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);
// Performing Inference
IExecutionContext *context = engine->createExecutionContext();
context->execute(3, buffers);
// copy result data from gpu to cpu
cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost);
// display output
std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
std::cout << "output data : \n";
for(auto i : output)
std::cout << i << " ";
std::cout << std::endl;
}
Key API usage:
INetworkDefinition* network = builder->createNetworkV2(0); // implicit batch
builder->setMaxBatchSize(3);
context->execute(3, buffers);
With implicit batch, the input shape carries no batch information, Dims3 input_shape{3, 4, 4}; the batch size is specified at execution time via execute(3, buffers).
For comparison, the complete explicit-batch demo is as follows:
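One detail worth keeping in mind with implicit batch is buffer sizing: the engine's binding dimensions do not include the batch, so device buffers must be sized as batch × volume of the binding dims (which is why the demo multiplies input.size() by exec_batch). Below is a minimal sketch of such a helper, assuming kFLOAT bindings and the same pre-8.5 binding API used in the demo; the name BindingBytes is only for illustration:

#include "NvInfer.h"
#include <cstddef>

// For an implicit-batch engine, getBindingDimensions() returns the dims
// without the batch dimension, so the runtime batch is multiplied in here.
size_t BindingBytes(const nvinfer1::ICudaEngine& engine, int32_t bindingIndex, int32_t batch) {
    nvinfer1::Dims dims = engine.getBindingDimensions(bindingIndex);
    size_t volume = 1;
    for (int32_t i = 0; i < dims.nbDims; i++)
        volume *= dims.d[i];
    return static_cast<size_t>(batch) * volume * sizeof(float); // assumes DataType::kFLOAT
}

In the demo above, BindingBytes(*engine, inputIndex, 3) would match the sizeof(float)*input.size() passed to cudaMalloc.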
#include "NvInfer.h"
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace nvinfer1;
#define DEFAULT_VALUE 1.0
class Logger : public ILogger
{
public:
void log(Severity severity, const char* msg) noexcept override
{
// suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
};
size_t ProductOfDims(Dims dims) {
size_t result = 1;
for(size_t i = 0; i < dims.nbDims; i++) {
result *= dims.d[i];
}
return result;
}
std::string DimsToStr(Dims dims) {
std::stringstream ss;
for(size_t i = 0; i < dims.nbDims; i++) {
ss << dims.d[i] << " ";
}
return ss.str();
}
int main() {
Logger logger;
// Create a Network Definition
IBuilder* builder = createInferBuilder(logger);
uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(flag); // explicit_batch
Dims4 input_shape{3, 3, 4, 4};
Dims4 filter_shape{1, 3, 2, 2};
DimsHW kernel_size{2, 2};
DimsHW stride{1, 1};
Dims4 output_shape {3, 1, 3, 3};
// Add the Input layer to the network
auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
// Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
std::vector<float>filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
Weights filter_w{DataType::kFLOAT, filter.data(), filter.size()};
Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
int32_t output_channel = filter_shape.d[0];
auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
conv2d->setStrideNd(stride);
// Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
conv2d->getOutput(0)->setName("output");
{
std::cout << "conv2d input tensor dims : [";
for(size_t i = 0; i < conv2d->getInput(0)->getDimensions().nbDims; i++ ) {
std::cout << conv2d->getInput(0)->getDimensions().d[i] << " ";
}
std::cout << "]" << std::endl;
std::cout << "conv2d output tensor dims : [";
for(size_t i = 0; i < conv2d->getOutput(0)->getDimensions().nbDims; i++ ) {
std::cout << conv2d->getOutput(0)->getDimensions().d[i] << " ";
}
std::cout << "]" << std::endl;
}
// Mark it as the output of the entire network:
network->markOutput(*conv2d->getOutput(0));
// Building an Engine(optimize the network)
IBuilderConfig* config = builder->createBuilderConfig();
IHostMemory* serializedModel = builder->buildSerializedNetwork(*network, *config);
IRuntime* runtime = createInferRuntime(logger);
ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());
// Prepare input_data
int32_t inputIndex = engine->getBindingIndex("input");
int32_t outputIndex = engine->getBindingIndex("output");
std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
std::vector<float> output(ProductOfDims(output_shape));
void *GPU_input_Buffer_ptr; // a host ptr pointing to a GPU buffer
void *GPU_output_Buffer_ptr; // a host ptr pointing to a GPU buffer
void* buffers[2];
cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);
// Performing Inference
IExecutionContext *context = engine->createExecutionContext();
context->executeV2(buffers);
// copy result data from gpu to cpu
cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost);
// display output
std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
std::cout << "output data : \n";
for(auto i : output)
std::cout << i << " ";
std::cout << std::endl;
}
With explicit batch, the input must be specified as four-dimensional, Dims4 input_shape{3, 3, 4, 4}, and inference is run with executeV2(buffers).
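Explicit batch is also what enables dynamic shapes, the recommended replacement for setMaxBatchSize in new code: the batch dimension can be declared as -1 and constrained with an optimization profile. Below is a minimal sketch of the parts that would change in the demo above; the kMIN/kOPT/kMAX values are arbitrary choices for illustration, not from the original demo:

// Declare the batch dimension as dynamic instead of a fixed 3.
auto input_data = network->addInput("input", DataType::kFLOAT, Dims4{-1, 3, 4, 4});

// Constrain the dynamic dimension with an optimization profile before building.
IOptimizationProfile* profile = builder->createOptimizationProfile();
profile->setDimensions("input", OptProfileSelector::kMIN, Dims4{1, 3, 4, 4});
profile->setDimensions("input", OptProfileSelector::kOPT, Dims4{3, 3, 4, 4});
profile->setDimensions("input", OptProfileSelector::kMAX, Dims4{8, 3, 4, 4});
config->addOptimizationProfile(profile);

// At run time, tell the context the actual batch before calling executeV2().
context->setBindingDimensions(inputIndex, Dims4{3, 3, 4, 4});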
| | explicit batch | implicit batch |
|---|---|---|
| Creation | pass the kEXPLICIT_BATCH flag (value 1) to createNetworkV2 | pass 0 to createNetworkV2 and call builder->setMaxBatchSize(3) |
| Input shape | four-dimensional, batch included | three-dimensional, no batch information |
| Execution | executeV2(buffers) | execute(batch, buffers) |
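As a quick sanity check on both demos: every input element and every weight is DEFAULT_VALUE (1.0), so each output element of the convolution should equal kernel_h * kernel_w * in_channels = 2 * 2 * 3 = 12, and each demo should print 3 * 1 * 3 * 3 = 27 values of 12. A small helper one could call after the final cudaMemcpy to verify this; the name CheckAllOnesConvOutput and the 12.0f expectation are specific to these all-ones demos:

#include <vector>

// Every input and weight in the demos is 1.0, so each conv output element
// should equal kernel_h * kernel_w * in_channels = 2 * 2 * 3 = 12 exactly.
bool CheckAllOnesConvOutput(const std::vector<float>& output, float expected = 12.0f) {
    for (float v : output)
        if (v != expected)
            return false;
    return !output.empty();
}

Calling CheckAllOnesConvOutput(output) in either demo should return true.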