The MNIST samples currently ship pre-trained models in at least the Caffe and ONNX formats. TensorRT optimizes the model into an engine, which is then used for inference; loading the two model types differs slightly, so this post gives a brief walkthrough of the APIs involved and finally tries to implement the whole pipeline with as little code as possible.
The main API functions used here live in "NvInfer.h"; depending on which third-party model format is fed in, you additionally include NvCaffeParser.h or NvOnnxParser.h.
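In code that amounts to includes along these lines (a minimal sketch; only pull in the parser header for the format you actually use):
#include "NvInfer.h"        // core TensorRT API
#include "NvCaffeParser.h"  // Caffe parser (namespace nvcaffeparser1)
#include "NvOnnxParser.h"   // ONNX parser (namespace nvonnxparser)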
The main objects involved are the core types nvinfer1::ILogger, nvinfer1::IBuilder, nvinfer1::INetworkDefinition and nvinfer1::IBuilderConfig; the model parsers nvcaffeparser1::ICaffeParser / nvonnxparser::IParser; and, on the runtime side, nvinfer1::IRuntime, nvinfer1::ICudaEngine and nvinfer1::IExecutionContext. Other related basic data structures are not listed here.
In addition, for convenience, the samples' common directory provides a number of helper files used by the tests, for example:
(1) Logger object
Normal use requires passing an instance of a class derived from ILogger, which can be implemented as follows:
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};
To simplify this, the samples just use sample::gLogger.getTRTLogger().
(2) std::unique_ptr objects
Resources are managed through smart pointers. With raw pointers, correct usage would look like this:
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger());
delete builder; // release when done
With the helper wrapper this simplifies to:
auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(sample::gLogger.getTRTLogger()));
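SampleUniquePtr itself comes from the samples' common headers; a rough sketch of what the alias might look like (the exact definition differs between TensorRT releases, and older ones attach a deleter that calls destroy()):
#include <memory>

// Recent samples: TensorRT objects have public destructors, so a plain unique_ptr is enough
template <typename T>
using SampleUniquePtr = std::unique_ptr<T>;

// Older samples roughly did:
// struct InferDeleter { template <typename T> void operator()(T* obj) const { if (obj) obj->destroy(); } };
// template <typename T> using SampleUniquePtr = std::unique_ptr<T, InferDeleter>;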
Taking the caffe model as the example here, first the sample class:
class SampleMNIST
{
public:
    SampleMNIST(const samplesCommon::CaffeSampleParams& params)
        : mParams(params)
    {
    }
    //! Build the optimized network engine
    bool build();
    //! Run inference with the engine
    bool infer();
    //! Release all resources
    bool teardown();

private:
    //! Parse the mnist network with the caffe parser and mark the output tensors
    bool constructNetwork(
        SampleUniquePtr<nvcaffeparser1::ICaffeParser>& parser, SampleUniquePtr<nvinfer1::INetworkDefinition>& network);
    //! Read the input and mean data, preprocess it, and store the result in the managed buffer
    bool processInput(
        const samplesCommon::BufferManager& buffers, const std::string& inputTensorName, int inputFileIdx) const;
    //! Verify that the output is correct and print it
    bool verifyOutput(
        const samplesCommon::BufferManager& buffers, const std::string& outputTensorName, int groundTruthDigit) const;

    std::shared_ptr<nvinfer1::ICudaEngine> mEngine{nullptr}; //!< The engine used to run the network
    samplesCommon::CaffeSampleParams mParams;                //!< Parameters for this sample
    nvinfer1::Dims mInputDims;                               //!< Dimensions of the network input
    SampleUniquePtr<nvcaffeparser1::IBinaryProtoBlob> mMeanBlob; //!< The mean blob read from the binaryproto file
};
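teardown() is not shown in the walkthrough below; in the caffe sample it essentially just shuts down the protobuf library once parsing is finished. A sketch (the exact body may differ between sample versions):
bool SampleMNIST::teardown()
{
    // Clean up protobuf state used by the caffe parser
    nvcaffeparser1::shutdownProtobufLibrary();
    return true;
}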
The code run by the main function is:
int main(int argc, char** argv)
{
    // Parse command-line arguments
    samplesCommon::Args args;
    bool argsOK = samplesCommon::parseArgs(args, argc, argv);
    if (!argsOK){
        sample::gLogError << "Invalid arguments" << std::endl;
        printHelpInfo();
        return EXIT_FAILURE;
    }
    if (args.help){
        printHelpInfo();
        return EXIT_SUCCESS;
    }
    auto sampleTest = sample::gLogger.defineTest(gSampleName, argc, argv);
    sample::gLogger.reportTestStart(sampleTest);
    // The sample object under test
    SampleMNIST sample(initializeSampleParams(args));
    sample::gLogInfo << "Building and running a GPU inference engine for MNIST" << std::endl;
    // Build phase
    if (!sample.build()){
        return sample::gLogger.reportFail(sampleTest);
    }
    // Inference phase
    if (!sample.infer()){
        return sample::gLogger.reportFail(sampleTest);
    }
    // Release resources
    if (!sample.teardown()){
        return sample::gLogger.reportFail(sampleTest);
    }
    return sample::gLogger.reportPass(sampleTest);
}
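initializeSampleParams() is another sample helper; a hedged sketch of what it might fill in for the caffe variant (the file and tensor names below are the ones shipped with the sample data and should be treated as assumptions here):
samplesCommon::CaffeSampleParams initializeSampleParams(const samplesCommon::Args& args)
{
    samplesCommon::CaffeSampleParams params;
    // Default search paths for the model/data files, overridable on the command line
    params.dataDirs = args.dataDirs.empty()
        ? std::vector<std::string>{"data/mnist/", "data/samples/mnist/"}
        : args.dataDirs;
    params.prototxtFileName = "mnist.prototxt";          // network definition
    params.weightsFileName = "mnist.caffemodel";         // trained weights
    params.meanFileName = "mnist_mean.binaryproto";      // mean image for preprocessing
    params.inputTensorNames.push_back("data");
    params.outputTensorNames.push_back("prob");
    params.batchSize = 1;
    params.dlaCore = args.useDLACore;
    params.int8 = args.runInInt8;
    params.fp16 = args.runInFp16;
    return params;
}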
(1) Create the builder
To create a builder, an ILogger instance is needed as the argument; in C++ the returned builder is usually held in a smart pointer:
auto& logger = sample::gLogger.getTRTLogger();
auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
(2) Parse the network model
When creating the network instance you must specify whether it runs in explicit batch mode or implicit batch mode. Explicit batch mode is the more flexible option, and it is mandatory when the model is in ONNX format.
// caffe: the implicit-batch flag (0) may be used
auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0));
// onnx: the explicit-batch flag is mandatory
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
The trained model is then loaded and parsed so that TensorRT can quantize it and optimize it for inference. Parsers for the common model formats are integrated (Caffe, ONNX, and UFF for TensorFlow; PyTorch models are normally exported to ONNX first). Each parser lives in a different header, but the basic API is the same.
// caffe
auto parser = SampleUniquePtr<nvcaffeparser1::ICaffeParser>(nvcaffeparser1::createCaffeParser());
// onnx
auto parser = SampleUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, sample::gLogger.getTRTLogger()));
Next, load and parse the model files from disk:
// caffe
const nvcaffeparser1::IBlobNameToTensor* blobNameToTensor = parser->parse(
mParams.prototxtFileName.c_str(), mParams.weightsFileName.c_str(), *network, nvinfer1::DataType::kFLOAT);
for (auto& s : mParams.outputTensorNames){
network->markOutput(*blobNameToTensor->find(s.c_str()));
}
// onnx
auto parsed = parser->parseFromFile(
locateFile(mParams.onnxFileName, mParams.dataDirs).c_str(),
static_cast<int>(sample::gLogger.getReportableSeverity()));
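parseFromFile returns false on failure, so a check right after it is worthwhile; a small sketch using the error-reporting calls on nvonnxparser::IParser:
if (!parsed)
{
    // Print whatever diagnostics the ONNX parser collected
    for (int i = 0; i < parser->getNbErrors(); ++i)
    {
        sample::gLogError << parser->getError(i)->desc() << std::endl;
    }
    return false;
}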
For the caffe model there is an extra step: read the mean image and splice a mean-subtraction stage onto the front of the network:
// add mean subtraction to the beginning of the network
nvinfer1::Dims inputDims = network->getInput(0)->getDimensions();
mMeanBlob
= SampleUniquePtr<nvcaffeparser1::IBinaryProtoBlob>(parser->parseBinaryProto(mParams.meanFileName.c_str()));
nvinfer1::Weights meanWeights{nvinfer1::DataType::kFLOAT, mMeanBlob->getData(), inputDims.d[1] * inputDims.d[2]};
// For this sample, a large range based on the mean data is chosen and applied to the head of the network.
// After the mean subtraction occurs, the range is expected to be between -127 and 127, so the rest of the network
// is given a generic range.
// The preferred method is use scales computed based on a representative data set
// and apply each one individually based on the tensor. The range here is large enough for the
// network, but is chosen for example purposes only.
float maxMean
= samplesCommon::getMaxValue(static_cast<const float*>(meanWeights.values), samplesCommon::volume(inputDims));
//
auto mean = network->addConstant(nvinfer1::Dims3(1, inputDims.d[1], inputDims.d[2]), meanWeights);
if (!mean->getOutput(0)->setDynamicRange(-maxMean, maxMean)){
return false;
}
if (!network->getInput(0)->setDynamicRange(-maxMean, maxMean)){
return false;
}
auto meanSub = network->addElementWise(*network->getInput(0), *mean->getOutput(0), ElementWiseOperation::kSUB);
if (!meanSub->getOutput(0)->setDynamicRange(-maxMean, maxMean)){
return false;
}
network->getLayer(0)->setInput(0, *meanSub->getOutput(0));
samplesCommon::setAllDynamicRanges(network.get(), 127.0f, 127.0f);
(3) Build the engine
Create a config that specifies how the engine should be optimized and built:
auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
This interface exposes many properties that control the optimization. One important property is the maximum workspace size, which limits how much scratch memory layer implementations may use, for example:
config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1U << 20);
// other configuration options
// CUDA stream used for profiling by the builder.
auto profileStream = samplesCommon::makeCudaStream();
config->setProfileStream(*profileStream);
builder->setMaxBatchSize(mParams.batchSize);
config->setFlag(BuilderFlag::kGPU_FALLBACK);
if (mParams.fp16){
config->setFlag(BuilderFlag::kFP16);
}
if (mParams.int8){
config->setFlag(BuilderFlag::kINT8);
}
Once the config is set up, the engine can be built; the serialized engine is returned in host memory (IHostMemory) and can then be written to a local file:
SampleUniquePtr<IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
// write engine to disk file
std::ofstream ofs("engine.trt", std::ios::binary);
ofs.write(static_cast<const char*>(plan->data()), plan->size());
Once the engine has been serialized, all of the objects used to build it can be released.
Note: a serialized engine is not portable across platforms or across TensorRT versions; it is only valid on the specific platform, TensorRT version and GPU it was built with.
(1) Deserialization
Once the optimized engine has been saved, later use only requires deserializing it and running inference. This is where the runtime API comes in.
// Create an IRuntime from an ILogger
SampleUniquePtr<IRuntime> runtime{createInferRuntime(logger)};
// Deserialize the engine from the in-memory plan
mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
runtime->deserializeCudaEngine(plan->data(), plan->size()), samplesCommon::InferDeleter());
(2) Execution context and buffer management
From the deserialized mEngine, create an IExecutionContext, and initialize a BufferManager object that synchronizes input data and results between host and device.
auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
// Create RAII buffer manager object
samplesCommon::BufferManager buffers(mEngine, mParams.batchSize);
(3) Input
The input is a grayscale image of size (inputH, inputW), which is first read into host memory:
std::vector<uint8_t> fileData(inputH * inputW);
readPGMFile(locateFile(std::to_string(inputFileIdx) + ".pgm", mParams.dataDirs), fileData.data(), inputH, inputW);
Next, copy the image data into the BufferManager's host input buffer. Note that the data types differ: the values are converted from uint8_t to float:
float* hostInputBuffer = static_cast<float*>(buffers.getHostBuffer(inputTensorName));
for (int i = 0; i < inputH * inputW; i++){
hostInputBuffer[i] = float(fileData[i]);
}
Then copy the host data to the device asynchronously:
// Create CUDA stream for the execution of this inference.
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// Asynchronously copy data from host input buffers to device input buffers
buffers.copyInputToDeviceAsync(stream);
(4) Inference
Use the enqueue call to process the input asynchronously:
// Asynchronously enqueue the inference work
if (!context->enqueue(mParams.batchSize, buffers.getDeviceBindings().data(), stream, nullptr)){
return false;
}
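The call above is the implicit-batch entry point used by the caffe sample. For the explicit-batch (ONNX) path the batch size is part of the network itself, so the V2 call is used instead, roughly:
// Explicit-batch networks go through enqueueV2 (async) or executeV2 (sync)
if (!context->enqueueV2(buffers.getDeviceBindings().data(), stream, nullptr))
{
    return false;
}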
(5) Output
The result lives in device memory and must be copied back to the host:
// Asynchronously copy data from device output buffers to host output buffers
buffers.copyOutputToHostAsync(stream);
// Wait for the work in the stream to complete
CHECK(cudaStreamSynchronize(stream));
// Release stream
CHECK(cudaStreamDestroy(stream));
const float* prob = static_cast<const float*>(buffers.getHostBuffer(outputTensorName));
The prob pointer now addresses the host memory region holding the probabilities of the 10 digits.
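A minimal sketch of how the predicted digit could then be read off that buffer (the shipped sample does this, with extra printing, inside verifyOutput):
// The index of the largest of the 10 probabilities is the predicted digit
const int predicted = static_cast<int>(std::distance(prob, std::max_element(prob, prob + 10)));
sample::gLogInfo << "Predicted digit: " << predicted << std::endl;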
Finally, taking the ONNX model as the example, the code is split into two parts: (1) model optimization and serialization, and (2) engine deserialization and inference. The first part only needs to run once; subsequent deployment only needs the second part.
int simple_test()
{
// 1. Build the optimized model and serialize it -----------------------------------------
/// 1.1 Basic parameters
samplesCommon::OnnxSampleParams params;
params.dataDirs.push_back("data/mnist/");
params.dataDirs.push_back("data/samples/mnist/");
params.onnxFileName = "mnist.onnx";
params.inputTensorNames.push_back("Input3");
params.outputTensorNames.push_back("Plus214_Output_0");
params.dlaCore = -1;
params.int8 = true;
params.fp16 = true;
auto& logger = sample::gLogger.getTRTLogger();
/// 1.2 Temporary objects used only for building
// builder
auto builder = SampleUniquePtr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
// network
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
auto network = SampleUniquePtr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));
// onnx parser
auto parser = SampleUniquePtr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger));
auto parsed = parser->parseFromFile(locateFile(params.onnxFileName, params.dataDirs).c_str(),
static_cast<int>(sample::gLogger.getReportableSeverity()));
// config
auto config = SampleUniquePtr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
if(params.fp16)
config->setFlag(BuilderFlag::kFP16);
if(params.int8) {
config->setFlag(BuilderFlag::kINT8);
samplesCommon::setAllDynamicRanges(network.get(), 127.0f, 127.0f);
}
samplesCommon::enableDLA(builder.get(), config.get(), params.dlaCore);
auto profileStream = samplesCommon::makeCudaStream();
config->setProfileStream(*profileStream);
/// 1.3 Build the optimized engine and serialize it
// serialize
SampleUniquePtr<IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};
if(!plan) {
return -1; // build failed (returning false would convert to 0, i.e. "success")
}
std::ofstream ofs("engine.trt", std::ostream::binary);
ofs.write(static_cast<const char*>(plan->data()), plan->size());
ofs.close();
// 2. Deserialize and run inference (deployment only needs the steps from here on) --------
/// 2.1 Load the engine file into memory
std::ifstream ifs("engine.trt", std::istream::binary);
ifs.seekg(0,std::ios_base::end);
auto buflen = ifs.tellg();
ifs.seekg(0);
std::vector<char> buf(buflen);
ifs.read(buf.data(), buf.size());
/// 2.2 Deserialize the engine
SampleUniquePtr<IRuntime> runtime{createInferRuntime(logger)};
auto mEngine = std::shared_ptr<nvinfer1::ICudaEngine>(
runtime->deserializeCudaEngine(buf.data(), buf.size()), samplesCommon::InferDeleter());
// Inference execution context
auto context = SampleUniquePtr<nvinfer1::IExecutionContext>(mEngine->createExecutionContext());
samplesCommon::BufferManager buffers(mEngine);
// Network input/output dimensions
//auto mInputDims = network->getInput(0)->getDimensions(); // [1,1,28,28]
//auto mOutputDims = network->getOutput(0)->getDimensions(); // [1,10]
auto mInputDims = mEngine->getBindingDimensions(0); // query from the engine at deployment time
auto mOutputDims = mEngine->getBindingDimensions(1); // query from the engine at deployment time
int inputH = mInputDims.d[2];
int inputW = mInputDims.d[3];
// Load a random test image
srand(unsigned(time(nullptr)));
std::vector<uint8_t> fileData(inputH * inputW);
int mNumber = rand() % 10;
readPGMFile(locateFile(std::to_string(mNumber) + ".pgm", params.dataDirs), fileData.data(), inputH, inputW);
sample::gLogInfo << "Input:" << std::endl;
for(int i = 0; i < inputH * inputW; i++) {
sample::gLogInfo << (" .:-=+*#%@"[fileData[i] / 26]) << (((i + 1) % inputW) ? "" : "\n");
}
sample::gLogInfo << std::endl;
// Fill the host input buffer (normalize and invert the pixels), then copy it to the device
float* hostDataBuffer = static_cast<float*>(buffers.getHostBuffer(params.inputTensorNames[0]));
for(int i = 0; i < inputH * inputW; i++) {
hostDataBuffer[i] = 1.0 - float(fileData[i] / 255.0);
}
buffers.copyInputToDevice();
// Execute the inference
bool status = context->executeV2(buffers.getDeviceBindings().data());
// Copy the results from device back to host
buffers.copyOutputToHost();
/// 2.3 Post-process the inference output
float* output = static_cast<float*>(buffers.getHostBuffer(params.outputTensorNames[0]));
// softmax
std::vector<float> pred(output, output + samplesCommon::volume(mOutputDims));
std::transform(pred.begin(), pred.end(), pred.begin(), [](float d) { return exp(d); });
auto sum = std::accumulate(pred.begin(), pred.end(), 0.f);
std::transform(pred.begin(), pred.end(), pred.begin(), [sum](float d) { return d / sum; });
// argmax
auto idx = std::distance(pred.begin(), std::max_element(pred.begin(), pred.end()));
std::cout << "========= Output: class " << idx << ", conf " << std::setprecision(3) << pred[idx]*100 << "%" << std::endl;
return 0;
}
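For completeness, a hedged sketch of the includes and entry point the function above relies on (the helper header names are assumptions based on the samples' common directory layout):
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "buffers.h"   // samplesCommon::BufferManager (assumed sample helper header)
#include "common.h"    // locateFile, readPGMFile, SampleUniquePtr, ... (assumed sample helper header)
#include "logger.h"    // sample::gLogger
#include <algorithm>
#include <fstream>
#include <numeric>

int main()
{
    return simple_test();
}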