Once a deep learning model has been trained, the next step is to deploy it to different devices for testing. Conversion between platforms is usually done through the intermediate format ONNX, which serves as a common exchange format. Starting from a model that has already been exported to ONNX, this article walks through the entire pipeline of converting ONNX to a TensorRT engine and running inference with it.
The overall flow of serializing an ONNX model into a TRT engine is illustrated in the figure below.
When developing with the C++ API, the headers NvInfer.h and NvOnnxParser.h need to be included. The C++ interfaces are all defined through interface classes whose names start with I, such as ILogger and IBuilder.
#include "NvInfer.h"
#include "NvOnnxParser.h"
using namespace nvonnxparser;
using namespace nvinfer1;
Before creating the builder, there are two ways to obtain an ILogger instance:
1. Include logging.h from tensorrtx and use the Logger defined there:
#include "logging.h"
static Logger gLogger;
IBuilder* builder = createInferBuilder(gLogger);
2. Derive from ILogger and instantiate the interface yourself:
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Only print messages with severity WARNING or higher
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} logger;
IBuilder* builder = createInferBuilder(logger);
After the builder has been created, a network definition is needed so the model can be optimized:
INetworkDefinition* network = builder->createNetworkV2(0U); // whether to pass 0U or 1U << NetworkDefinitionCreationFlag::kEXPLICIT_BATCH depends on the case; recent versions of the ONNX parser require the explicit-batch flag
Create an ONNX parser to populate the network definition, read the model file, and check for parsing errors:
IParser* parser = createParser(*network, gLogger);
bool parsed = parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
for (int32_t i = 0; i < parser->getNbErrors(); ++i)
{
    std::cout << parser->getError(i)->desc() << std::endl;
}
if (parsed)
    std::cout << "successfully parsed the onnx model" << std::endl;
Next, create the builder config, set the maximum batch size and workspace, and define an optimization profile for the input batch dimension:
IBuilderConfig* config = builder->createBuilderConfig();
builder->setMaxBatchSize(maxBatchSize);   // maximum batch size used by the builder
config->setMaxWorkspaceSize(1 << 20);     // scratch memory TensorRT may use during optimization, here 1 MiB
auto profile = builder->createOptimizationProfile();
auto input_tensor = network->getInput(0);
auto input_dims = input_tensor->getDimensions();
// Batch size 1 for the kMIN and kOPT shapes
input_dims.d[0] = 1;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
// Largest expected batch size for the kMAX shape
input_dims.d[0] = batchSize;
profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
config->addOptimizationProfile(profile);
#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);
#endif
#ifdef USE_INT8
config->setFlag(BuilderFlag::kINT8);
#endif
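Note that setting BuilderFlag::kINT8 alone is normally not enough: unless dynamic ranges are set per tensor, an INT8 calibrator also has to be attached to the config. A minimal sketch, assuming a hypothetical calibrator object that implements IInt8EntropyCalibrator2:
#ifdef USE_INT8
// 'calibrator' is a hypothetical user-defined IInt8EntropyCalibrator2 instance
config->setInt8Calibrator(calibrator);
#endif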
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);
// Serialize the engine into a host-memory blob
IHostMemory* modelStream = engine->serialize();
assert(modelStream != nullptr);
// Write the serialized engine (the TensorRT "plan") to disk
std::ofstream p(engine_path, std::ios::binary);
if (!p)
{
    std::cerr << "could not open plan output file" << std::endl;
    return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
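After the plan file has been written to disk, the build-phase objects are no longer needed. A minimal cleanup sketch, assuming the destroy()-style API used in the rest of this article (TensorRT 8 and later replace destroy() with plain delete):
// Release build-phase objects once the serialized plan has been saved
engine->destroy();
parser->destroy();
network->destroy();
config->destroy();
builder->destroy();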
Once the ONNX model has been converted to an engine and serialized, the time spent building and optimizing the model can be saved on subsequent runs. As shown in the figure below, the inference process starts from reading the serialized engine.
Read the serialized model and store it in trtModelStream.
char* trtModelStream{ nullptr };
size_t size{ 0 };
std::ifstream file(engine_path, std::ios::binary);
if (file.good()) {
    // Determine the file size, then read the whole plan file into memory
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    trtModelStream = new char[size];
    assert(trtModelStream);
    file.read(trtModelStream, size);
    file.close();
}
Create the runtime through the logger:
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
Deserialize trtModelStream through the runtime to create the engine:
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;   // the host copy of the plan is no longer needed once the engine is deserialized
runtime->destroy();        // note: newer TensorRT versions expect the runtime to outlive the engine, so this call is often deferred
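The output_size used by the inference code below is not derived anywhere in the original snippets. One way to compute it (a sketch, assuming a single output binding named OUTPUT_BLOB_NAME) is to query the engine's binding dimensions:
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
Dims out_dims = engine->getBindingDimensions(outputIndex);
int output_size = 1;
for (int i = 0; i < out_dims.nbDims; ++i)
    output_size *= out_dims.d[i];   // product of the output dimensions; a dynamic (-1) dimension would need special handling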
Preprocessing
// Assumes each image has already been resized to input_w x input_h.
// Converts BGR HWC uint8 pixels to RGB CHW float values in [0, 1].
int ImgCount = InputImage.size();
float* input_data = (float*)malloc(ImgCount * 3 * input_h * input_w * sizeof(float));
for (int b = 0; b < ImgCount; b++) {
    cv::Mat img = InputImage.at(b);
    int i = 0;
    for (int row = 0; row < input_h; ++row) {
        uchar* uc_pixel = img.data + row * img.step;
        for (int col = 0; col < input_w; ++col) {
            // Swap channel order from BGR (OpenCV) to RGB while filling the CHW planes
            input_data[b * 3 * input_h * input_w + i] = (float)uc_pixel[2] / 255.0;
            input_data[b * 3 * input_h * input_w + i + input_h * input_w] = (float)uc_pixel[1] / 255.0;
            input_data[b * 3 * input_h * input_w + i + 2 * input_h * input_w] = (float)uc_pixel[0] / 255.0;
            uc_pixel += 3;
            ++i;
        }
    }
}
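The loop above assumes every image in InputImage has already been resized to the network input resolution input_w x input_h. A minimal resize step (a sketch, not part of the original code) could be run beforehand:
// Force each image to the network input resolution before the CHW conversion
for (auto& img : InputImage) {
    if (img.cols != input_w || img.rows != input_h)
        cv::resize(img, img, cv::Size(input_w, input_h));
}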
Forward inference
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    //assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    //const int inputIndex = 0;
    //const int outputIndex = 1;
    // Create GPU buffers on device
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * input_h * input_w * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * output_size * sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * input_h * input_w * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
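The CHECK macro used above is not defined in the snippets shown here. A minimal definition (a sketch of the usual CUDA error-checking idiom, not taken from the original code) plus an example call to doInference could look like this:
#include <cstdlib>
#include <iostream>
#include <cuda_runtime_api.h>

// Abort with a readable message if a CUDA runtime call fails
#define CHECK(status)                                                     \
    do {                                                                  \
        cudaError_t err = (status);                                       \
        if (err != cudaSuccess) {                                         \
            std::cerr << "CUDA error: " << cudaGetErrorString(err)        \
                      << std::endl;                                       \
            std::abort();                                                 \
        }                                                                 \
    } while (0)

// Example call, using the execution context, the preprocessed input_data, and an output
// buffer prob sized as batchSize * output_size floats (both assumed allocated by the caller)
doInference(*context, input_data, prob, batchSize);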
Postprocessing
Taking LPRNet as an example: the output prob holds scores for 68 character classes at each of 18 sequence positions. Decoding takes the argmax class at each position and then performs a greedy CTC-style collapse, dropping repeated characters and the blank class:
std::vector<int> preds;
// Greedy decode: pick the highest-scoring class at each of the 18 sequence positions
for (int i = 0; i < 18; i++) {
    int maxj = 0;
    for (int j = 0; j < 68; j++) {
        if (prob[i + 18 * j] > prob[i + 18 * maxj]) maxj = j;
    }
    preds.push_back(maxj);
}
// CTC-style collapse: drop repeated characters and the blank class (index 67)
int pre_c = preds[0];
std::vector<int> no_repeat_blank_label;
for (auto c : preds) {
    if (c == pre_c || c == 68 - 1) {
        if (c == 68 - 1) pre_c = c;
        continue;
    }
    no_repeat_blank_label.push_back(c);
    pre_c = c;
}
// Map the remaining class indices to characters
std::string str;
for (auto v : no_repeat_blank_label) {
    str += alphabet[v];
}
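The alphabet table indexed above is assumed to hold the 68 LPRNet character classes, with the CTC blank as the last entry (index 67, matching the 68 - 1 check in the decode loop). A placeholder declaration, with the actual characters omitted, might look like:
// Hypothetical character table: 68 entries in the model's class order, CTC blank last
static const std::string alphabet[68] = { /* plate characters in model order, blank last */ };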
The above is a complete walkthrough of using the TensorRT C++ API to build a TRT engine from an ONNX model and run inference with it. Essentially every ONNX-to-TensorRT inference workflow follows the steps covered above; this post records them for reference.
–END–