Since TensorRT 3.0, developers have both a C++ and a Python API; the C++ API is the main choice in scenarios with strict performance and application-safety requirements. The samples shipped with the TensorRT installation contain many example programs; below, sampleGoogleNet is used to briefly analyze and annotate the objects created when running inference with TensorRT.
#include <cassert>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>
#include <utility>
#include <algorithm>
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"
static Logger gLogger; // Global Logger (defined in common.h) that is passed into the TensorRT factory calls; see the sketch after the listing
using namespace nvinfer1;
using namespace nvcaffeparser1; // namespaces declared by the two TensorRT headers (NvInfer.h and NvCaffeParser.h)
// stuff we know about the network and the caffe input/output blobs
static const int BATCH_SIZE = 4;
static const int TIMING_ITERATIONS = 1000;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
std::string locateFile(const std::string& input)
{
std::vector<std::string> dirs{"data/samples/googlenet/", "data/googlenet/"};
return locateFile(input, dirs);
}
struct Profiler : public IProfiler
{
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms)
{
auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
void printLayerTimes()
{
float totalTime = 0;
for (size_t i = 0; i < mProfile.size(); i++)
{
printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / TIMING_ITERATIONS);
totalTime += mProfile[i].second;
}
printf("Time over all layers: %4.3f\n", totalTime / TIMING_ITERATIONS);
}
} gProfiler;
void caffeToTRTModel(const std::string& deployFile, // name for caffe prototxt
const std::string& modelFile, // name for model
const std::vector<std::string>& outputs, // network outputs
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory *&trtModelStream)
{
// create API root class - must span the lifetime of the engine usage
IBuilder* builder = createInferBuilder(gLogger); // Create the IBuilder from the Logger; the builder constructs the optimized network/engine
INetworkDefinition* network = builder->createNetwork(); // Create an empty TensorRT network definition that will be populated below
// parse the caffe model to populate the network, then set the outputs
ICaffeParser* parser = createCaffeParser(); // Create the caffe parser; TensorRT currently provides three main parser types:
/*
CaffeParser: for Caffe models
UffParser: for TensorFlow models (UFF format)
OnnxParser: for models from other frameworks exported to ONNX
*/
bool useFp16 = builder->platformHasFastFp16(); // Check whether the GPU on the current platform supports fast FP16 math
DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; // create a 16-bit model if it's natively supported
const IBlobNameToTensor *blobNameToTensor =
parser->parse(locateFile(deployFile).c_str(), // caffe deploy file
locateFile(modelFile).c_str(), // caffe model file
*network, // network definition that the parser will populate
modelDataType); // Parse the caffe model and populate the TensorRT network definition
assert(blobNameToTensor != nullptr);
// the caffe file has no notion of outputs, so we need to manually say which tensors the engine should generate
for (auto& s : outputs)
network->markOutput(*blobNameToTensor->find(s.c_str())); // Mark the network's output tensors; this step is mandatory
// Build the engine
builder->setMaxBatchSize(maxBatchSize); // Set the maximum batch size the engine will support
builder->setMaxWorkspaceSize(16 << 20); // Maximum scratch workspace memory (16 MB) available at build/run time
// set up the network for paired-fp16 format if available
if(useFp16)
builder->setFp16Mode(true);
ICudaEngine* engine = builder->buildCudaEngine(*network); // Build the TensorRT engine
assert(engine);
// we don't need the network any more, and we can destroy the parser
network->destroy();
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
// Serialize the engine for later inference. The sample omits writing it to disk; because building an engine is time-consuming, it is usually built once, stored, and then deserialized directly at inference time (see the persistence sketch after the listing)
engine->destroy();
builder->destroy();
shutdownProtobufLibrary();
}
void timeInference(ICudaEngine* engine, int batchSize)
{
// input and output buffer pointers that we pass to the engine - the engine requires exactly ICudaEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine->getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than ICudaEngine::getNbBindings()
int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME), outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
// allocate GPU buffers
Dims3 inputDims = static_cast<Dims3&&>(engine->getBindingDimensions(inputIndex)), outputDims = static_cast<Dims3&&>(engine->getBindingDimensions(outputIndex));
size_t inputSize = batchSize * inputDims.d[0] * inputDims.d[1] * inputDims.d[2] * sizeof(float);
size_t outputSize = batchSize * outputDims.d[0] * outputDims.d[1] * outputDims.d[2] * sizeof(float);
CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
CHECK(cudaMalloc(&buffers[outputIndex], outputSize));
IExecutionContext* context = engine->createExecutionContext(); // Create the GPU execution context, which holds the network definition and parameters taken from the engine
context->setProfiler(&gProfiler);
// zero the input buffer
CHECK(cudaMemset(buffers[inputIndex], 0, inputSize));
for (int i = 0; i < TIMING_ITERATIONS;i++)
context->execute(batchSize, buffers); // Run inference
// release the context and buffers
context->destroy(); // Release the execution context
CHECK(cudaFree(buffers[inputIndex])); // Free the allocated GPU memory buffers
CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv)
{
std::cout << "Building and running a GPU inference engine for GoogleNet, N=4..." << std::endl;
// parse the caffe model and build the serialized engine
IHostMemory *trtModelStream{nullptr}; // Host-memory object that will receive the serialized engine
caffeToTRTModel("googlenet.prototxt", "googlenet.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, BATCH_SIZE, trtModelStream); // Parse the caffe model and serialize the optimized engine
assert(trtModelStream != nullptr);
// create an engine
IRuntime* infer = createInferRuntime(gLogger); // Create the runtime object used to deserialize the engine
assert(infer != nullptr);
ICudaEngine* engine = infer->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr); // Deserialize the TensorRT engine; the nullptr argument is the plugin factory, only needed when the developer added custom layers
assert(engine != nullptr);
printf("Bindings after deserializing:\n"); // 回去网络输入输出tensor的信息
for (int bi = 0; bi < engine->getNbBindings(); bi++) {
if (engine->bindingIsInput(bi) == true) {
printf("Binding %d (%s): Input.\n", bi, engine->getBindingName(bi));
} else {
printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
}
}
// run inference with null data to time network performance
timeInference(engine, BATCH_SIZE); // do inference
engine->destroy();
infer->destroy();
trtModelStream->destroy(); // Release the remaining allocated objects
gProfiler.printLayerTimes();
std::cout << "Done." << std::endl;
return 0;
}
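The listing relies on two helpers from common.h in the samples directory: the Logger class handed to createInferBuilder/createInferRuntime, and the CHECK macro wrapped around the CUDA calls. A minimal sketch of what they provide, simplified here for illustration (not the exact sample code) and assuming the TensorRT 3/4 ILogger interface, could look like this:

#include <cstdlib>
#include <iostream>
#include "NvInfer.h"

// Simplified stand-in for the Logger defined in common.h:
// forward TensorRT's internal messages, dropping plain informational output.
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override
    {
        if (severity <= Severity::kWARNING) // kINTERNAL_ERROR, kERROR, kWARNING
            std::cerr << msg << std::endl;
    }
};

// Simplified stand-in for the CHECK macro: abort on any non-zero CUDA status code.
#define CHECK(status)                                          \
    do                                                         \
    {                                                          \
        auto ret = (status);                                   \
        if (ret != 0)                                          \
        {                                                      \
            std::cerr << "Cuda failure: " << ret << std::endl; \
            std::abort();                                      \
        }                                                      \
    } while (0)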
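As noted inside caffeToTRTModel, the sample never stores the serialized engine on disk. Since building the engine is the expensive step, a common pattern is to write the bytes returned by engine->serialize() to a file once and deserialize that file on every later run. A minimal sketch under that assumption follows; the helper names and the engine file path are made up for illustration:

#include <fstream>
#include <iterator>
#include <string>
#include <vector>
#include "NvInfer.h"

// Write the serialized engine bytes to a file, e.g. "googlenet.engine".
void saveEngine(nvinfer1::IHostMemory* trtModelStream, const std::string& path)
{
    std::ofstream out(path, std::ios::binary);
    out.write(static_cast<const char*>(trtModelStream->data()), trtModelStream->size());
}

// Read the bytes back and rebuild the engine; the nullptr is the plugin factory
// that would only be needed if the network contained custom layers.
nvinfer1::ICudaEngine* loadEngine(nvinfer1::IRuntime* runtime, const std::string& path)
{
    std::ifstream in(path, std::ios::binary);
    std::vector<char> blob((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    return runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
}

With the engine file in place, main() could skip caffeToTRTModel entirely and call loadEngine with the IRuntime it already creates.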
Set up the environment by following the official TensorRT installation guide; I configured CUDA 9.0 + cuDNN 7 + TensorRT 4 inside a docker container.
Then compile the individual samples and run them to verify the TensorRT installation.