
Deploying a Caffe Model with TensorRT (Part 1)

无眠栀 2019-05-10
Reference code
samples/sampleMNIST/sampleMNIST.cpp in the TensorRT installation package

1. Set the GPU ID to use; if it is not set, device 0 is used by default.
cudaSetDevice(3); //set device id
2. Define the model's input/output dimensions and the logger.
static const int INPUT_H = 299;      // input image height
static const int INPUT_W = 299;      // input image width
static const int CHANNELS = 3;       // input image channels
static const int OUTPUT_SIZE = 1536; // output feature dimension
static Logger gLogger;

const char* INPUT_BLOB_NAME = "data";          // input layer name defined in the deploy prototxt
const char* OUTPUT_BLOB_NAME = "pool_8x8_s2";  // output layer name defined in the deploy prototxt
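The Logger class used above comes from the common code bundled with the TensorRT samples. If you build outside the samples tree, a minimal stand-in implementing nvinfer1::ILogger (a sketch, not the samples' exact class) could look like this:
#include "NvInfer.h"
#include <iostream>

// Minimal TensorRT logger: print warnings and errors, suppress INFO messages.
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) override
    {
        if (severity != Severity::kINFO)
            std::cerr << "[TensorRT] " << msg << std::endl;
    }
};
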
3. Declare the GIE (TensorRT engine) stream and convert the trained Caffe model into a serialized GIE model.
// create a GIE model from the caffe model and serialize it to a stream
IHostMemory *gieModelStream{nullptr};
caffeToGIEModel("deploy.prototxt", "inceptionv4.caffemodel", std::vector < std::string > { OUTPUT_BLOB_NAME }, 1, gieModelStream);
4. Prepare the input image. It can be read with OpenCV or any other method; write the preprocessing to match your model, and finally store the result in a float* buffer (an alternative OpenCV-based sketch follows the loop below).
    float data[INPUT_H*INPUT_W*CHANNELS];

    cv::Mat im = cv::imread("gap.jpg");
    cv::resize(im, im, cv::Size(INPUT_W, INPUT_H));
    int mean_data[] = {104, 117, 123}; // per-channel (BGR) mean values
    float *pdata = data;
    // subtract the mean and write the pixels in CHW (channel-planar) order, as TensorRT expects
    for(int c = 0; c < CHANNELS; ++c)
    {
        for(int h = 0; h < INPUT_H; ++h)
        {
            for(int w = 0; w < INPUT_W; ++w)
            {
                *pdata++ = float(im.at<cv::Vec3b>(h, w)[c] - mean_data[c]);
            }
        }
    }
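For reference, the same mean subtraction and HWC-to-CHW packing can also be written with OpenCV primitives; this is just an alternative sketch using the same constants and data buffer as above (std::memcpy needs <cstring>):
    cv::Mat im = cv::imread("gap.jpg");
    cv::resize(im, im, cv::Size(INPUT_W, INPUT_H));

    cv::Mat imf;
    im.convertTo(imf, CV_32FC3);                        // uchar BGR -> float
    cv::subtract(imf, cv::Scalar(104, 117, 123), imf);  // subtract per-channel mean

    std::vector<cv::Mat> planes;
    cv::split(imf, planes);                             // separate B, G, R planes
    for (int c = 0; c < CHANNELS; ++c)
        std::memcpy(data + c * INPUT_H * INPUT_W,
                    planes[c].ptr<float>(0),
                    INPUT_H * INPUT_W * sizeof(float)); // pack planes into the CHW float buffer
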
5. Deserialize the inference engine.
    // deserialize the engine
    IRuntime* runtime = createInferRuntime(gLogger);
    ICudaEngine* engine = runtime->deserializeCudaEngine(gieModelStream->data(), gieModelStream->size(), nullptr);
    if (gieModelStream) gieModelStream->destroy();
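Building the engine from the Caffe model is relatively slow, so it is common practice to serialize the engine to disk once and simply reload it on later runs. A minimal sketch, assuming a hypothetical file name inceptionv4.engine (requires <fstream>, <vector>, and <iterator>):
    // First run: dump the serialized engine to disk.
    std::ofstream ofs("inceptionv4.engine", std::ios::binary);
    ofs.write(static_cast<const char*>(gieModelStream->data()), gieModelStream->size());
    ofs.close();

    // Later runs: read the blob back and deserialize it, skipping caffeToGIEModel entirely.
    std::ifstream ifs("inceptionv4.engine", std::ios::binary);
    std::vector<char> blob((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
    ICudaEngine* engine = runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
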
6. Run the forward inference (CProTimer is just a simple timing helper; a possible implementation is sketched after this code).
    IExecutionContext *context = engine->createExecutionContext();

    std::cout << "begin inference\n";
    // run inference
    CProTimer timet;
    float prob[OUTPUT_SIZE];
    doInference(*context, data, prob, 1);

    std::cout << "end inference " << timet.GetTime(true) << "\n";
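CProTimer is not part of TensorRT or of the sample code; a minimal std::chrono-based stand-in matching the GetTime(true) usage above (an assumption about its interface, returning elapsed milliseconds) might look like:
#include <chrono>

// Simple wall-clock timer; GetTime(true) returns elapsed milliseconds and restarts the timer.
class CProTimer
{
public:
    CProTimer() : start_(std::chrono::steady_clock::now()) {}

    double GetTime(bool reset = false)
    {
        auto now = std::chrono::steady_clock::now();
        double ms = std::chrono::duration<double, std::milli>(now - start_).count();
        if (reset)
            start_ = now;
        return ms;
    }

private:
    std::chrono::steady_clock::time_point start_;
};
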
7. Release resources and output the results.
    // destroy the engine
    context->destroy();
    engine->destroy();
    runtime->destroy();

    // print the output feature vector
    std::cout << "\n\n";
    for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
    {
        std::cout << prob[i] << " ";
    }
    std::cout << std::endl;
The implementations of caffeToGIEModel and doInference below follow the sample .cpp referenced at the beginning; sketches of the CHECK macro and the locateFile helper they rely on are given after the listing.
void caffeToGIEModel(const std::string& deployFile,             // name for caffe prototxt
                     const std::string& modelFile,              // name for model
                     const std::vector<std::string>& outputs,   // network outputs
                     unsigned int maxBatchSize,                 // batch size - NB must be at least as large as the batch we want to run with
                     IHostMemory *&gieModelStream)    // output buffer for the GIE model
{
    // create the builder
    IBuilder* builder = createInferBuilder(gLogger);

    // parse the caffe model to populate the network, then set the outputs
    INetworkDefinition* network = builder->createNetwork();
    ICaffeParser* parser = createCaffeParser();
    const IBlobNameToTensor* blobNameToTensor = parser->parse(locateFile(deployFile, directories).c_str(),
                                                              locateFile(modelFile, directories).c_str(),
                                                              *network,
                                                              nvinfer1::DataType::kFLOAT);

    // specify which tensors are outputs
    for (auto& s : outputs)
        network->markOutput(*blobNameToTensor->find(s.c_str()));

    // Build the engine
    builder->setMaxBatchSize(maxBatchSize);
    builder->setMaxWorkspaceSize(1 << 20);

    ICudaEngine* engine = builder->buildCudaEngine(*network);
    assert(engine);

    // we don't need the network any more, and we can destroy the parser
    network->destroy();
    parser->destroy();

    // serialize the engine, then close everything down
    gieModelStream = engine->serialize();
    engine->destroy();
    builder->destroy();
    shutdownProtobufLibrary();
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
    const ICudaEngine& engine = context.getEngine();
    // input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
    // of these, but in this case we know that there is exactly one input and one output.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // note that indices are guaranteed to be less than IEngine::getNbBindings()
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);

    // create GPU buffers and a stream
    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * CHANNELS * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU,  execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * CHANNELS * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(batchSize, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE*sizeof(float), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
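CHECK and locateFile (together with the directories vector it searches) come from the common helpers shipped with the TensorRT samples. If you build outside the samples tree, minimal stand-ins could look like the following sketch; the actual sample implementations differ in detail:
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <cuda_runtime_api.h>

// Abort on any CUDA runtime error returned by the wrapped call.
#define CHECK(status)                                                       \
    do                                                                      \
    {                                                                       \
        cudaError_t err = (status);                                         \
        if (err != cudaSuccess)                                             \
        {                                                                   \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n"; \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Directories searched for the deploy/model files; adjust to your layout.
static const std::vector<std::string> directories{"./", "./data/"};

// Return the first existing path formed by prepending a directory to the file name.
std::string locateFile(const std::string& file, const std::vector<std::string>& dirs)
{
    for (const auto& dir : dirs)
    {
        std::string path = dir + file;
        std::ifstream f(path);
        if (f.good())
            return path;
    }
    return file; // fall back to the bare file name
}
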
8. Write the Makefile and compile.
The code depends on the CUDA, cuDNN, and TensorRT libraries and requires gcc 5.3 or newer; paths to the other libraries can be set according to your own environment.

OPENCV_INC_DIR="/user/3rdparty/opencv-3.1.0/include/"
OPENCV_LIB_DIR="/user/3rdparty/opencv-3.1.0/lib/"
CUDA_INC_DIR="/user/3rdparty/cuda/include/"
CUDA_LIB_DIR="/user/3rdparty/cuda/lib64/"
CUDNN_INC_DIR="/user/3rdparty/cudnn_7.0.5/include/"
CUDNN_LIB_DIR="/user/3rdparty/cudnn_7.0.5/lib64/"
TENSORRT_INC_DIR="/user/3rdparty/TensorRT-4.0.0.3/include/"
TENSORRT_LIB_DIR="/user/3rdparty/TensorRT-4.0.0.3/lib/"

export PATH=/user/3rdparty/gcc-5.3.0/bin:$PATH

INCLUFLAGS = -I${OPENCV_INC_DIR} \
             -I${CUDA_INC_DIR} -I${CUDNN_INC_DIR}\
             -I../common/ \
             -I${TENSORRT_INC_DIR}

LIBFLAGS = -L${OPENCV_LIB_DIR} -lopencv_imgcodecs -lopencv_imgproc -lopencv_core -lopencv_highgui \
           -L${CUDA_LIB_DIR} -L${CUDNN_LIB_DIR} -lcudnn -lcublas -lcudart_static -lnvToolsExt -lcudart \
           -L${TENSORRT_LIB_DIR} -lnvinfer -lnvparsers -lnvinfer_plugin

LIBFLAGS += -lrt -ldl -lpthread

SOURCES = main.cpp  

CXXFLAGS = -Wall -std=c++11 

EXE = inceptionv4_tensorrt

OBJECTS = $(subst .c,.o,$(SOURCES:%.cpp=%.o))

# NOTE: recipe lines below must start with a real tab character, not spaces
all:
    g++ -o $(EXE) $(SOURCES) $(CXXFLAGS) $(INCLUFLAGS) $(LIBFLAGS)
clean:
    rm -f $(OBJECTS) $(EXE)
9. Accuracy and speed comparison
The TensorRT float32 engine is essentially identical in accuracy to the original Caffe model, but much faster: at batch size 1 the average GPU forward pass is roughly 4-5x faster than the original Caffe model, so the optimization is well worth it.
————————————————
Copyright notice: This is an original article by CSDN blogger 无眠栀, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/may0324/article/details/90083988
