TensorRT Official Documentation
As a newcomer, I don't find the TensorRT documentation particularly beginner-friendly.
The official documentation is organized into the following seven parts.
Tip: after opening a documentation page, note the upper-right corner of the page, where the PDF version can be downloaded.
Quick Start Guide (documentation, plus the corresponding Jupyter notebooks): essentially a walkthrough of the overall TensorRT workflow, with a high-level overview of what TensorRT can do.
The overall TensorRT workflow
Every TensorRT deployment involves converting the model into an optimized representation, which TensorRT calls an Engine.
The basic TensorRT workflow consists of: exporting the model, selecting a batch size, selecting the model precision (Float32/Float16/Int8, etc.), converting the model (i.e., building the Engine), and deploying the model (i.e., running inference on the built Engine through the Runtime API).
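As a concrete illustration of the first step, exporting the model, here is a minimal sketch of exporting a PyTorch model to ONNX. The specific model (a torchvision ResNet-50), file name, input shape, and opset version are illustrative assumptions, not anything prescribed by the TensorRT docs.
Python
import torch
import torchvision.models as models

# Illustrative example: export a torchvision ResNet-50 to ONNX.
# In a real workflow you would load your own trained weights first.
model = models.resnet50()
model.eval()
dummy_input = torch.randn(1, 3, 224, 224)  # batch of 1, 3x224x224 image

torch.onnx.export(
    model,
    dummy_input,
    "resnet50.onnx",            # placeholder output path
    input_names=["input"],
    output_names=["output"],
    opset_version=13,
)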
Model conversion and deployment options
Choosing a workflow
Main topics: converting an ONNX model and deploying it with Python.
Conversion uses the trtexec command: --onnx=path/to/model.onnx specifies the input ONNX model, --saveEngine=path/to/model.trt specifies where to save the resulting engine, and --explicitBatch uses a fixed (explicit) batch size, e.g. trtexec --onnx=path/to/model.onnx --saveEngine=path/to/model.trt --explicitBatch.
Deployment in Python then goes through the ONNXClassifierWrapper helper, sketched below.
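A rough usage sketch of ONNXClassifierWrapper follows (it ships with the quickstart notebooks, not with the tensorrt package). The constructor arguments shown here, engine path, output shape, and target dtype, as well as the predict method, are my assumptions about the quickstart helper and should be checked against its actual code.
Python
import numpy as np
# Assumption: ONNXClassifierWrapper comes from the quickstart's onnx_helper.py.
from onnx_helper import ONNXClassifierWrapper

BATCH_SIZE = 1
N_CLASSES = 1000  # assumed ImageNet-style classifier output size

# Assumed constructor: engine path, output shape, numpy dtype of the output.
trt_model = ONNXClassifierWrapper("path/to/model.trt",
                                  [BATCH_SIZE, N_CLASSES],
                                  target_dtype=np.float32)

dummy_batch = np.random.rand(BATCH_SIZE, 3, 224, 224).astype(np.float32)
predictions = trt_model.predict(dummy_batch)  # assumed inference entry point
print(predictions.shape)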
Advanced model conversion
Two approaches are covered: the trtexec tool and the TensorRT API (the latter does not automatically save a .trt file, so the serialized engine has to be written out explicitly; see the sketch below). The code for this section is here.
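Since the API route does not save an engine file for you, a minimal sketch (assuming TensorRT 8+ and placeholder file paths) of parsing an ONNX model, building the engine, and serializing it to disk could look like this:
Python
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, TRT_LOGGER)

# Parse the ONNX model (placeholder path).
with open("path/to/model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse the ONNX model")

config = builder.create_builder_config()
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)  # optionally enable FP16

# build_serialized_network (TensorRT 8+) returns the serialized engine;
# writing it to disk is up to the caller, hence the explicit save below.
serialized_engine = builder.build_serialized_network(network, config)
with open("path/to/model.trt", "wb") as f:
    f.write(serialized_engine)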
C++
// 1. Read the serialized engine file and deserialize it into an ICudaEngine
// (engineFile is an std::ifstream opened on the .trt file, fsize its size in bytes)
std::vector<char> engineData(fsize);
engineFile.read(engineData.data(), fsize);
util::UniquePtr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(sample::gLogger.getTRTLogger())};
util::UniquePtr<nvinfer1::ICudaEngine> mEngine(runtime->deserializeCudaEngine(engineData.data(), fsize, nullptr));
// 2. Create the execution context and set up the engine's input/output bindings
auto context = util::UniquePtr<nvinfer1::IExecutionContext>{mEngine->createExecutionContext()};
// The engine input is named "input", with dtype float and shape (1, 3, height, width)
auto input_idx = mEngine->getBindingIndex("input");
assert(mEngine->getBindingDataType(input_idx) == nvinfer1::DataType::kFLOAT);
auto input_dims = nvinfer1::Dims4{1, 3 /* channels */, height, width};
context->setBindingDimensions(input_idx, input_dims);
auto input_size = util::getMemorySize(input_dims, sizeof(float));
// The engine output is named "output", with dtype int32; its shape is queried from the context
auto output_idx = mEngine->getBindingIndex("output");
assert(mEngine->getBindingDataType(output_idx) == nvinfer1::DataType::kINT32);
auto output_dims = context->getBindingDimensions(output_idx);
auto output_size = util::getMemorySize(output_dims, sizeof(int32_t));
// 3. Prepare for inference
// Create a CUDA stream and allocate device memory for the input and output
cudaStream_t stream;
cudaStreamCreate(&stream);
void* input_mem{nullptr};
cudaMalloc(&input_mem, input_size);
void* output_mem{nullptr};
cudaMalloc(&output_mem, output_size);
// Image normalization parameters (per-channel mean / stddev) applied when reading the input
const std::vector<float> mean{0.485f, 0.456f, 0.406f};
const std::vector<float> stddev{0.229f, 0.224f, 0.225f};
auto input_image{util::RGBImageReader(input_filename, input_dims, mean, stddev)};
input_image.read();
auto input_buffer = input_image.process();
// Copy the preprocessed image from host memory into device memory
cudaMemcpyAsync(input_mem, input_buffer.get(), input_size, cudaMemcpyHostToDevice, stream);
// 4. Run inference
// Launch the actual execution through executeV2 or enqueueV2
void* bindings[] = {input_mem, output_mem};
bool status = context->enqueueV2(bindings, stream, nullptr);
// Copy the prediction back to the host
auto output_buffer = std::unique_ptr<int>{new int[output_size]};
cudaMemcpyAsync(output_buffer.get(), output_mem, output_size, cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// Release the device buffers and the stream
cudaFree(input_mem);
cudaFree(output_mem);
cudaStreamDestroy(stream);
// 5. Write out the prediction result as an image
const int num_classes{21};
const std::vector<int> palette{
(0x1 << 25) - 1, (0x1 << 15) - 1, (0x1 << 21) - 1};
auto output_image{util::ArgmaxImageWriter(output_filename, output_dims, palette, num_classes)};
output_image.process(output_buffer.get());
output_image.write();
Python
# Imports needed by the code below (preprocess/postprocess are the image
# pre-/post-processing helpers defined elsewhere in the quickstart notebook)
import os

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
from PIL import Image

TRT_LOGGER = trt.Logger()

# Load (deserialize) the engine file
def load_engine(engine_file_path):
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

# The basic inference pipeline
def infer(engine, input_file, output_file):
    # Read and preprocess the input image
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height

    # Create the execution context and initialize it
    with engine.create_execution_context() as context:
        # Set the input shape based on the image dimensions
        context.set_binding_shape(engine.get_binding_index("input"), (1, 3, image_height, image_width))
        # Allocate host and device buffers:
        # CUDA device memory for the input and output, plus page-locked host
        # memory to efficiently copy the output back.
        bindings = []
        for binding in engine:
            binding_idx = engine.get_binding_index(binding)
            size = trt.volume(context.get_binding_shape(binding_idx))
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            if engine.binding_is_input(binding):
                input_buffer = np.ascontiguousarray(input_image)
                input_memory = cuda.mem_alloc(input_image.nbytes)
                bindings.append(int(input_memory))
            else:
                output_buffer = cuda.pagelocked_empty(size, dtype)
                output_memory = cuda.mem_alloc(output_buffer.nbytes)
                bindings.append(int(output_memory))

        stream = cuda.Stream()
        # Transfer the preprocessed image to the GPU (asynchronous host-to-device copy)
        cuda.memcpy_htod_async(input_memory, input_buffer, stream)
        # Run inference using the asynchronous execute API
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Transfer the prediction back into page-locked host memory (device-to-host copy)
        cuda.memcpy_dtoh_async(output_buffer, output_memory, stream)
        # Synchronize the stream to ensure all transfers and the inference have completed
        stream.synchronize()

        # Finally, post-process the raw output (to a UINT8 RGB image) and write it out
        with postprocess(np.reshape(output_buffer, (image_height, image_width))) as img:
            print("Writing output image to file {}".format(output_file))
            img.convert('RGB').save(output_file, "PPM")

# How the two functions above are invoked
print("Running TensorRT inference for FCN-ResNet101")
with load_engine(engine_file) as engine:
    infer(engine, input_file, output_file)
The two key objects in the runtime API are IExecutionContext (used to run inference) and ICudaEngine (the engine itself).
Quality-of-service measurements such as latency and throughput are a good way to relate user experience to system performance, and trtexec provides tooling for measuring them. trtexec's --best option enables all supported precisions so that the fastest implementation can be chosen (e.g. trtexec --onnx=path/to/model.onnx --best).
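For a quick, rough look at these quality-of-service numbers from Python (trtexec measures them far more thoroughly), one can simply time repeated executions of an execution context. The sketch below assumes that context, bindings, and stream have already been set up as in the inference code above.
Python
import time

N_RUNS = 100

# Warm-up run so one-time initialization does not skew the numbers.
context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
stream.synchronize()

start = time.perf_counter()
for _ in range(N_RUNS):
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
stream.synchronize()
elapsed = time.perf_counter() - start

print("mean latency: {:.3f} ms".format(1000.0 * elapsed / N_RUNS))
print("throughput:   {:.1f} inferences/s".format(N_RUNS / elapsed))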