TensorRT Learning Notes (2)

Since TensorRT 3.0, developers have both a C++ and a Python API; the C++ API is the usual choice when performance and application safety matter most. The TensorRT installation ships a set of sample programs, and this post walks through sampleGoogleNet, annotating the objects TensorRT creates when running inference.

  • Run GoogleNet with a TensorRT engine
#include <assert.h>
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <cuda_runtime_api.h>

#include "NvInfer.h"
#include "NvCaffeParser.h"
#include "common.h"

static Logger gLogger; // global Logger instance, passed to the TensorRT API calls that require one
using namespace nvinfer1;
using namespace nvcaffeparser1; // namespaces defined by the two TensorRT headers above

// stuff we know about the network and the caffe input/output blobs

static const int BATCH_SIZE = 4;
static const int TIMING_ITERATIONS = 1000;

const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";


std::string locateFile(const std::string& input)
{
    std::vector<std::string> dirs{"data/samples/googlenet/", "data/googlenet/"};
    return locateFile(input, dirs);
}

struct Profiler : public IProfiler
{
    typedef std::pair<std::string, float> Record;
    std::vector<Record> mProfile;

    virtual void reportLayerTime(const char* layerName, float ms)
    {
        auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; });
        if (record == mProfile.end())
            mProfile.push_back(std::make_pair(layerName, ms));
        else
            record->second += ms;
    }

    void printLayerTimes()
    {
        float totalTime = 0;
        for (size_t i = 0; i < mProfile.size(); i++)
        {
            printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / TIMING_ITERATIONS);
            totalTime += mProfile[i].second;
        }
        printf("Time over all layers: %4.3f\n", totalTime / TIMING_ITERATIONS);
    }

} gProfiler;

void caffeToTRTModel(const std::string& deployFile,             // name for caffe prototxt
                     const std::string& modelFile,              // name for model
                     const std::vector<std::string>& outputs,   // network outputs
                     unsigned int maxBatchSize,                 // batch size - NB must be at least as large as the batch we want to run with
                     IHostMemory *&trtModelStream)
{
    // create API root class - must span the lifetime of the engine usage
    IBuilder* builder = createInferBuilder(gLogger); // create an IBuilder from the Logger; it is used to build the optimized network/engine
    INetworkDefinition* network = builder->createNetwork(); // create an empty TensorRT network definition, populated by the parser below

    // parse the caffe model to populate the network, then set the outputs
    ICaffeParser* parser = createCaffeParser(); // create the Caffe parser; TensorRT currently ships three main parsers:
    /*
    CaffeParser  for Caffe models
    UffParser    for TensorFlow models exported to UFF
    OnnxParser   for models from other frameworks exported to ONNX
    */

    bool useFp16 = builder->platformHasFastFp16(); // check whether the GPU on this platform has fast native FP16 support

    DataType modelDataType = useFp16 ? DataType::kHALF : DataType::kFLOAT; // create a 16-bit model if it's natively supported
    const IBlobNameToTensor *blobNameToTensor =
        parser->parse(locateFile(deployFile).c_str(),               // caffe deploy file
                                 locateFile(modelFile).c_str(),     // caffe model file
                                 *network,                          // network definition that the parser will populate
                                 modelDataType);                    // parse the caffemodel and populate the TensorRT network

    assert(blobNameToTensor != nullptr);
    // the caffe file has no notion of outputs, so we need to manually say which tensors the engine should generate
    for (auto& s : outputs)
        network->markOutput(*blobNameToTensor->find(s.c_str())); // mark each requested tensor as a network output; this step is required

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize); // maximum batch size the engine must support
    builder->setMaxWorkspaceSize(16 << 20); // maximum scratch workspace for the builder/runtime: 16 << 20 bytes = 16 MB

    // set up the network for paired-fp16 format if available
    if(useFp16)
        builder->setFp16Mode(true);

    ICudaEngine* engine = builder->buildCudaEngine(*network); // build the optimized TensorRT engine
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    trtModelStream = engine->serialize(); 
    // serialize the engine for later inference. Writing the stream to disk is omitted in this sample;
    // because building an engine is time-consuming, it is usually built once, saved, and then simply
    // deserialized for every subsequent inference run (see the sketch after this listing)
    engine->destroy();
    builder->destroy();
    shutdownProtobufLibrary();
}

void timeInference(ICudaEngine* engine, int batchSize)
{
    // input and output buffer pointers that we pass to the engine - the engine requires exactly
    // ICudaEngine::getNbBindings() of these, but in this case we know that there is exactly one input and one output.
    assert(engine->getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than ICudaEngine::getNbBindings()
    int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME), outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);

    // allocate GPU buffers
    Dims3 inputDims = static_cast<Dims3&&>(engine->getBindingDimensions(inputIndex)), outputDims = static_cast<Dims3&&>(engine->getBindingDimensions(outputIndex));
    size_t inputSize = batchSize * inputDims.d[0] * inputDims.d[1] * inputDims.d[2] * sizeof(float);
    size_t outputSize = batchSize * outputDims.d[0] * outputDims.d[1] * outputDims.d[2] * sizeof(float);

    CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
    CHECK(cudaMalloc(&buffers[outputIndex], outputSize));

    IExecutionContext* context = engine->createExecutionContext(); // create an execution context on the GPU; it holds the state needed to run inference with this engine
    context->setProfiler(&gProfiler);

    // zero the input buffer
    CHECK(cudaMemset(buffers[inputIndex], 0, inputSize));
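    // NOTE: this sample only times the network on zeroed input. For real inference one would
    // copy host data in before execute() and copy results out afterwards, roughly:
    //   CHECK(cudaMemcpy(buffers[inputIndex], hostInput, inputSize, cudaMemcpyHostToDevice));
    //   context->execute(batchSize, buffers);
    //   CHECK(cudaMemcpy(hostOutput, buffers[outputIndex], outputSize, cudaMemcpyDeviceToHost));
    // (hostInput / hostOutput are hypothetical host-side buffers, not part of this sample.)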

    for (int i = 0; i < TIMING_ITERATIONS;i++)
        context->execute(batchSize, buffers); // run synchronous inference

    // release the context and buffers
    context->destroy();                   // release the execution context
    CHECK(cudaFree(buffers[inputIndex])); // free the GPU buffers allocated above
    CHECK(cudaFree(buffers[outputIndex]));
}


int main(int argc, char** argv)
{
    std::cout << "Building and running a GPU inference engine for GoogleNet, N=4..." << std::endl;

    // parse the caffe model and build a serialized engine
    IHostMemory *trtModelStream{nullptr}; // host memory object that will receive the serialized engine
    caffeToTRTModel("googlenet.prototxt", "googlenet.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, BATCH_SIZE, trtModelStream); // parse the caffemodel and serialize the engine
    assert(trtModelStream != nullptr);

    // create an engine
    IRuntime* infer = createInferRuntime(gLogger); // create the runtime object used to deserialize the engine
    assert(infer != nullptr);
    ICudaEngine* engine = infer->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr); // deserialize the TensorRT engine; the nullptr is the IPluginFactory argument, only needed when the model contains custom layers
    assert(engine != nullptr);

        printf("Bindings after deserializing:\n"); // 回去网络输入输出tensor的信息
        for (int bi = 0; bi < engine->getNbBindings(); bi++) {
               if (engine->bindingIsInput(bi) == true) {
        printf("Binding %d (%s): Input.\n",  bi, engine->getBindingName(bi));
               } else {
        printf("Binding %d (%s): Output.\n", bi, engine->getBindingName(bi));
               }
           }

    // run inference with null data to time network performance
    timeInference(engine, BATCH_SIZE); // do inference

    engine->destroy();
    infer->destroy();
    trtModelStream->destroy(); // release the serialized engine stream and the other objects created above

    gProfiler.printLayerTimes();

    std::cout << "Done." << std::endl;

    return 0;
}
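
As noted in caffeToTRTModel, this sample never writes the serialized engine to disk. Below is a minimal sketch of how that persistence step could look; the file name "googlenet.engine" and the helper names saveEngine/loadEngine are hypothetical, not part of the sample.

// Hypothetical helpers (not part of sampleGoogleNet): persist and reload a serialized engine.
#include <fstream>
#include <iterator>
#include <vector>
#include "NvInfer.h"

void saveEngine(nvinfer1::IHostMemory* trtModelStream, const char* path)
{
    std::ofstream out(path, std::ios::binary);
    out.write(static_cast<const char*>(trtModelStream->data()), trtModelStream->size());
}

nvinfer1::ICudaEngine* loadEngine(nvinfer1::IRuntime* runtime, const char* path)
{
    std::ifstream in(path, std::ios::binary);
    std::vector<char> blob((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
    // nullptr: no IPluginFactory, i.e. the network has no custom layers
    return runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
}

With these, caffeToTRTModel could call saveEngine(trtModelStream, "googlenet.engine") once after serialize(), and later runs could skip the parse/build step entirely and go straight to loadEngine().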
  • Compile and run the TensorRT samples
Follow the official TensorRT installation guide to set up the environment; I configured CUDA 9.0 + cuDNN 7 + TensorRT 4 inside a Docker container.
Then build the samples and run them to verify that TensorRT works.
