There are really very few resources for learning TensorRT. Apart from the official material, the most-starred project on GitHub is tensorrtx.

The main content of this article:

1. Run python lenet5.py && python inference.py and record the model's inference results.
2. Copy the generated weight file (lenet5.wts) into /path/to/tensorrtx/lenet.
3. Build the example: cd /path/to/tensorrtx/lenet && mkdir build && cd build && cmake .. && make
4. Run sudo ./lenet -s to build and serialize the model, then run inference with sudo ./lenet -d and record the results so they can be compared with the Python output.

The lenet example contains two source files:

- lenet.cpp: covers both model building and model inference; this is what the rest of the article walks through.
- logging.h: appears to come from the official TensorRT samples and mainly provides logging utilities, so it is not covered in detail.

The main functionality of lenet.cpp is as follows.

The ./lenet -s command builds the model through the TensorRT API, loads the weights, and serializes the model to disk. loadWeights is the entry point for reading the weight file; it returns a std::map<std::string, Weights> object.
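lenet.cpp does not show loadWeights here, so below is only a rough sketch of what such a loader can look like. It assumes the tensorrtx .wts text convention (first line: number of entries; each following line: tensor name, element count, then the float values as hex-encoded bit patterns); the function name and error handling are illustrative, not the exact code from the repository.

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <map>
#include <string>
#include "NvInfer.h"

using namespace nvinfer1;

// Hedged sketch of a .wts loader (assumed format: <count>\n then "<name> <size> <hex values...>" per line)
std::map<std::string, Weights> loadWeightsSketch(const std::string& file)
{
    std::map<std::string, Weights> weightMap;
    std::ifstream input(file);
    assert(input.is_open() && "unable to open weight file");

    int32_t count = 0;
    input >> count;                                   // number of weight tensors in the file
    while (count--) {
        Weights wt{DataType::kFLOAT, nullptr, 0};
        std::string name;
        uint32_t size = 0;
        input >> name >> std::dec >> size;

        // Values are stored as the hex bit pattern of each float.
        // Allocated with malloc so it can be released with free(), matching the cleanup in createLenetEngine.
        uint32_t* val = static_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
        for (uint32_t x = 0; x < size; ++x) {
            input >> std::hex >> val[x];
        }
        wt.values = val;
        wt.count  = size;
        weightMap[name] = wt;
    }
    return weightMap;
}

Whatever the exact implementation, the important point for the rest of the code is that entries such as weightMap["conv1.weight"] hand TensorRT raw float data that must stay alive until the engine has been built. The network itself is assembled in createLenetEngine: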
// Create the engine using only the API and not any parser.
ICudaEngine* createLenetEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt)
{
// First create an empty network, i.e. an INetworkDefinition object
INetworkDefinition* network = builder->createNetworkV2(0U);
// Add the network input, specifying its name, data type, and shape
// Create input tensor of shape { 1, 32, 32 } with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{1, INPUT_H, INPUT_W});
assert(data);
// Load all weights from the weight file
std::map<std::string, Weights> weightMap = loadWeights("../lenet5.wts");
// Add a convolution layer; arguments: input tensor, number of output channels, kernel size, kernel weights, bias weights
// Note that stride, padding, and other parameters are not specified here
// Add convolution layer with 6 outputs and a 5x5 filter.
IConvolutionLayer* conv1 = network->addConvolutionNd(*data, 6, DimsHW{5, 5}, weightMap["conv1.weight"], weightMap["conv1.bias"]);
assert(conv1);
// Set the stride
conv1->setStrideNd(DimsHW{1, 1});
// Add a ReLU: just specify the input tensor (the previous convolution's output) and the activation type
// Add activation layer using the ReLU algorithm.
IActivationLayer* relu1 = network->addActivation(*conv1->getOutput(0), ActivationType::kRELU);
assert(relu1);
// Add a pooling layer; arguments: input tensor, pooling type, window size
// Note that stride, padding, and other parameters are not set here
// Add average pooling layer with stride of 2x2 and kernel size of 2x2.
IPoolingLayer* pool1 = network->addPoolingNd(*relu1->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
assert(pool1);
// Set the stride
pool1->setStrideNd(DimsHW{2, 2});
// Add second convolution layer with 16 outputs and a 5x5 filter.
IConvolutionLayer* conv2 = network->addConvolutionNd(*pool1->getOutput(0), 16, DimsHW{5, 5}, weightMap["conv2.weight"], weightMap["conv2.bias"]);
assert(conv2);
conv2->setStrideNd(DimsHW{1, 1});
// Add activation layer using the ReLU algorithm.
IActivationLayer* relu2 = network->addActivation(*conv2->getOutput(0), ActivationType::kRELU);
assert(relu2);
// Add second average pooling layer with stride of 2x2 and kernel size of 2x2.
IPoolingLayer* pool2 = network->addPoolingNd(*relu2->getOutput(0), PoolingType::kAVERAGE, DimsHW{2, 2});
assert(pool2);
pool2->setStrideNd(DimsHW{2, 2});
// Add a fully connected layer; arguments: input tensor, number of output neurons, kernel weights, bias weights
// Add fully connected layer
IFullyConnectedLayer* fc1 = network->addFullyConnected(*pool2->getOutput(0), 120, weightMap["fc1.weight"], weightMap["fc1.bias"]);
assert(fc1);
// Add activation layer using the ReLU algorithm.
IActivationLayer* relu3 = network->addActivation(*fc1->getOutput(0), ActivationType::kRELU);
assert(relu3);
// Add second fully connected layer
IFullyConnectedLayer* fc2 = network->addFullyConnected(*relu3->getOutput(0), 84, weightMap["fc2.weight"], weightMap["fc2.bias"]);
assert(fc2);
// Add activation layer using the ReLU algorithm.
IActivationLayer* relu4 = network->addActivation(*fc2->getOutput(0), ActivationType::kRELU);
assert(relu4);
// Add third fully connected layer
IFullyConnectedLayer* fc3 = network->addFullyConnected(*relu4->getOutput(0), OUTPUT_SIZE, weightMap["fc3.weight"], weightMap["fc3.bias"]);
assert(fc3);
// Add a softmax layer; its input is simply the previous layer's output
// Add softmax layer to determine the probability.
ISoftMaxLayer* prob = network->addSoftMax(*fc3->getOutput(0));
assert(prob);
// Name the output tensor; this name is needed later when running the engine
prob->getOutput(0)->setName(OUTPUT_BLOB_NAME);
// Mark this tensor as the output of the INetworkDefinition
network->markOutput(*prob->getOutput(0));
// Build engine
// Set the max batch size; nothing in the network definition above depends on the batch size
builder->setMaxBatchSize(maxBatchSize);
// See https://developer.nvidia.com/blog/speeding-up-deep-learning-inference-using-tensorrt/
config->setMaxWorkspaceSize(1 << 20);
// Build the engine from the INetworkDefinition and the IBuilderConfig
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
// The network was only needed to build the engine, so release it now
// Don't need the network any more
network->destroy();
// The weights are no longer needed either
// Release host memory
for (auto& mem : weightMap)
{
free((void*) (mem.second.values));
}
return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream)
{
// Create the builder and the builder config
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
// This is the createLenetEngine function described above
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createLenetEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);
// Serialize the engine and store the result in modelStream
// Serialize the engine
(*modelStream) = engine->serialize();
// Close everything down
engine->destroy();
builder->destroy();
}
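APIToModel only fills modelStream; in the -s branch the serialized bytes still have to be written to a file. Below is a minimal sketch of that step under my own assumptions: the function name and the output file name lenet5.engine are illustrative, not necessarily what lenet.cpp uses.

#include <cassert>
#include <fstream>
#include "NvInfer.h"

// Hedged sketch of the "-s" branch: serialize LeNet-5 and write the engine to disk.
void serializeToFile()
{
    nvinfer1::IHostMemory* modelStream{nullptr};
    APIToModel(1, &modelStream);   // maxBatchSize = 1, matching the rest of the example
    assert(modelStream != nullptr);

    std::ofstream p("lenet5.engine", std::ios::binary);
    assert(p.good() && "could not open the output file");
    // IHostMemory exposes the serialized engine as a raw byte buffer via data()/size()
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());

    modelStream->destroy();        // release the serialized buffer
}

The -d branch then reads exactly these bytes back before deserialization.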
Model inference corresponds to the ./lenet -d command.
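The code below uses trtModelStream and size without showing where they come from; presumably the -d branch first reads the serialized engine back from disk. A hedged sketch of that step (the helper name and the file name lenet5.engine are again my assumptions):

#include <cassert>
#include <fstream>
#include <string>

// Hedged sketch: read the serialized engine produced by "-s" back into host memory.
char* readEngineFile(const std::string& path, size_t& size)
{
    std::ifstream file(path, std::ios::binary);
    assert(file.good() && "could not open the engine file");
    file.seekg(0, file.end);
    size = file.tellg();
    file.seekg(0, file.beg);
    char* trtModelStream = new char[size];  // caller frees with delete[] after deserialization
    file.read(trtModelStream, size);
    return trtModelStream;
}

trtModelStream and size are then handed to runtime->deserializeCudaEngine below; the buffer can be freed once the engine has been created.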
// Prepare the input data
// The comment below says to subtract the mean from the image, but here the input is simply a fixed constant
// Subtract mean from image
float data[INPUT_H * INPUT_W];
for (int i = 0; i < INPUT_H * INPUT_W; i++)
data[i] = 1.0;
// Create the objects needed for inference
// IRuntime deserializes a serialized engine back into an ICudaEngine
// ICudaEngine is the optimized model (parsed from the engine file); the raw engine bytes are held in trtModelStream
// IExecutionContext is the inference context that manages the actual inference
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
// Run the actual inference
// The model output ends up in the prob array
// The remaining lines just measure the inference time
// Run inference
float prob[OUTPUT_SIZE];
for (int i = 0; i < 1000; i++) {
auto start = std::chrono::system_clock::now();
doInference(*context, data, prob, 1);
auto end = std::chrono::system_clock::now();
//std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
// Inference is done; release resources
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
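The sample only stores the raw softmax scores in prob. As a small, hedged addition (not part of lenet.cpp), the predicted digit can be recovered with a simple argmax over those scores:

#include <algorithm>
#include <iterator>

// Hedged sketch: pick the class with the highest softmax score.
static int argmax(const float* scores, int n)
{
    return static_cast<int>(std::distance(scores, std::max_element(scores, scores + n)));
}

// Usage after doInference has filled prob:
//   std::cout << "predicted digit: " << argmax(prob, OUTPUT_SIZE) << std::endl;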
The actual data movement and execution happen in the doInference function:
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
// First get the engine (the optimized plan) that this execution context was created from
const ICudaEngine& engine = context.getEngine();
// getNbBindings() is the number of input/output binding slots of the engine; this network has exactly one input and one output, so it must be 2
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// Look up the binding indices for the input and output buffers
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
// Allocate device memory for the input and output buffers used below
// Create GPU buffers on device
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
// Create a CUDA stream: an ordered queue on which the copies and the kernel launches below run asynchronously
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// Copy the input from host to device: input is the data in host memory, buffers[inputIndex] is the device buffer
// cudaMemcpyHostToDevice means exactly that: host (CPU) memory to device (GPU) memory
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
// Enqueue the inference; this launches the CUDA kernels on the stream
context.enqueue(batchSize, buffers, stream, nullptr);
// Copy the result back to host memory
// cudaMemcpyDeviceToHost is the reverse direction: device memory to host memory
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
// Wait for the asynchronous copies and the inference on this stream to finish before the output is used
cudaStreamSynchronize(stream);
// Release stream and buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}