python mmdeploy-main/tools/deploy.py mmdeploy-main/configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth mmdetection/demo/demo.jpg --work-dir mmdeploy_model/faster-rcnn --device cuda --dump-info
CUDA Toolkit 11.1
cuDNN 8.2.1.0
TensorRT 8.2.3.0 (python包 + 环境变量)
pip install pycuda -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install -U openmim -i https://pypi.tuna.tsinghua.edu.cn/simple
mim install "mmcv>=2.0.0rc2"
# 1. 安装 MMDeploy 模型转换工具(含trt/ort自定义算子)
pip install mmdeploy==1.2.0
# 2. 安装 MMDeploy SDK推理工具
# 根据是否需要GPU推理可任选其一进行下载安装
# 2.1 支持 onnxruntime 推理
pip install mmdeploy-runtime==1.2.0
# 2.2 支持 onnxruntime-gpu tensorrt 推理
pip install mmdeploy-runtime-gpu==1.2.0
# 3. 安装推理引擎
# 3.1 安装推理引擎 TensorRT
pip install TensorRT-8.2.3.0/python/tensorrt-8.2.3.0-cp38-none-linux_x86_64.whl
pip install onnxruntime-gpu==1.8.1
在准备工作就绪后,我们可以使用 MMDeploy 中的工具 tools/deploy.py,将 OpenMMLab 的 PyTorch 模型转换成推理后端支持的格式。
以 MMDetection 中的 Faster R-CNN 为例,我们可以使用如下命令,将 PyTorch 模型转换为 TensorRT 模型,从而部署到 NVIDIA GPU 上。
# 克隆 mmdeploy 仓库。转换时,需要使用 mmdeploy 仓库中的配置文件,建立转换流水线, `--recursive` 不是必须的
git clone -b main --recursive https://github.com/open-mmlab/mmdeploy.git
# 安装 mmdetection。转换时,需要使用 mmdetection 仓库中的模型配置文件,构建 PyTorch nn module
git clone -b 3.x https://github.com/open-mmlab/mmdetection.git
cd mmdetection
mim install -v -e .
cd ..
# 下载 Faster R-CNN 模型权重
wget -P checkpoints https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth
# 执行转换命令,实现端到端的转换
python mmdeploy/tools/deploy.py \
mmdeploy/configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py \
mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py \
checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth \
mmdetection/demo/demo.jpg \
--work-dir mmdeploy_model/faster-rcnn \
--device cuda \
--dump-info
pip install -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install mmdeploy==1.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
# onnx
pip install mmdeploy-runtime==1.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
# tensorRt
pip install mmdeploy-runtime-gpu==1.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install onnxruntime==1.8.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install onnxruntime-gpu==1.8.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install mmdeploy==1.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install mmdeploy-runtime-gpu==1.2.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
我们推荐用户使用预编译包安装和体验 MMDeploy 功能。目前提供模型转换(trt/ort)以及 SDK 推理的 pypi 预编译包,SDK 的 C/C++ 库可从 MMDeploy 的 GitHub Releases 页面(https://github.com/open-mmlab/mmdeploy/releases)选择最新版本下载并安装。
目前,MMDeploy 的预编译包支持的平台和设备矩阵如下:
from mmdeploy.apis import torch2onnx
from mmdeploy.backend.sdk.export_info import export2SDK

# Model definition and trained weights.
model_cfg = 'mmpretrain/configs/resnet/resnet18_8xb32_in1k.py'
model_checkpoint = 'resnet18_8xb32_in1k_20210831-fbbb1da6.pth'

# Deployment config for the target backend (ONNX Runtime, per its file name).
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py'

# Sample input image, output directory and output file name.
img = 'mmpretrain/demo/demo.JPEG'
work_dir = 'work_dir/onnx/resnet'
save_file = 'end2end.onnx'
device = 'cpu'

# Step 1: convert the PyTorch model to ONNX.
torch2onnx(
    img,
    work_dir,
    save_file,
    deploy_cfg,
    model_cfg,
    model_checkpoint,
    device,
)

# Step 2: extract the pipeline info for SDK use (same role as --dump-info).
export2SDK(
    deploy_cfg,
    model_cfg,
    work_dir,
    pth=model_checkpoint,
    device=device,
)
import os

from mmdeploy.apis import torch2onnx
from mmdeploy.apis.tensorrt import onnx2tensorrt
from mmdeploy.backend.sdk.export_info import export2SDK

# Inputs shared by all three steps.
img = 'mmpretrain/demo/demo.JPEG'
work_dir = 'work_dir/trt/resnet'
deploy_cfg = 'mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py'
model_cfg = 'mmpretrain/configs/resnet/resnet18_8xb32_in1k.py'
model_checkpoint = 'resnet18_8xb32_in1k_20210831-fbbb1da6.pth'

# One name per artifact and per device instead of rebinding
# save_file / device halfway through the script.
onnx_file = 'end2end.onnx'
engine_file = 'end2end.engine'
export_device = 'cpu'
engine_device = 'cuda'

# Step 1: convert the PyTorch model to the intermediate representation (ONNX).
torch2onnx(
    img,
    work_dir,
    onnx_file,
    deploy_cfg,
    model_cfg,
    model_checkpoint,
    export_device,
)

# Step 2: convert the ONNX file written in step 1 to a TensorRT engine.
onnx_model = os.path.join(work_dir, onnx_file)
model_id = 0
onnx2tensorrt(
    work_dir,
    engine_file,
    model_id,
    deploy_cfg,
    onnx_model,
    engine_device,
)

# Step 3: extract the pipeline info for SDK use (same role as --dump-info).
export2SDK(
    deploy_cfg,
    model_cfg,
    work_dir,
    pth=model_checkpoint,
    device=engine_device,
)
from mmdeploy.apis import inference_model

# Run the converted ONNX model on the demo image through the converter API.
result = inference_model(
    model_cfg='mmpretrain/configs/resnet/resnet18_8xb32_in1k.py',
    deploy_cfg='mmdeploy/configs/mmpretrain/classification_onnxruntime_dynamic.py',
    backend_files=['work_dir/onnx/resnet/end2end.onnx'],
    img='mmpretrain/demo/demo.JPEG',
    device='cpu',
)
from mmdeploy.apis import inference_model

# Run the converted TensorRT engine on the demo image through the converter API.
result = inference_model(
    model_cfg='mmpretrain/configs/resnet/resnet18_8xb32_in1k.py',
    deploy_cfg='mmdeploy/configs/mmpretrain/classification_tensorrt_static-224x224.py',
    backend_files=['work_dir/trt/resnet/end2end.engine'],
    img='mmpretrain/demo/demo.JPEG',
    device='cuda',
)
这里介绍如何使用SDK的Python API进行推理
ONNXRuntime
推理代码
python .\mmdeploy\demo\python\image_classification.py cpu .\work_dir\onnx\resnet\ .\mmpretrain\demo\demo.JPEG
TensorRT
推理代码
python .\mmdeploy\demo\python\image_classification.py cuda .\work_dir\trt\resnet\ .\mmpretrain\demo\demo.JPEG
import os

from mmdeploy.apis import torch2onnx
from mmdeploy.apis.tensorrt import onnx2tensorrt
from mmdeploy.backend.sdk.export_info import export2SDK

# Same pipeline as the TensorRT conversion notes, but using the "-main"
# checkouts of mmdeploy / mmpretrain and exporting ONNX on the GPU.
img = 'mmpretrain-main/demo/demo.JPEG'
work_dir = 'work_dir/trt/resnet'
deploy_cfg = 'mmdeploy-main/configs/mmpretrain/classification_tensorrt_static-224x224.py'
model_cfg = 'mmpretrain-main/configs/resnet/resnet18_8xb32_in1k.py'
model_checkpoint = 'resnet18_8xb32_in1k_20210831-fbbb1da6.pth'

onnx_file = 'end2end.onnx'
engine_file = 'end2end.engine'

# Step 1: convert the PyTorch model to the intermediate representation (ONNX).
torch2onnx(
    img,
    work_dir,
    onnx_file,
    deploy_cfg,
    model_cfg,
    model_checkpoint,
    'cuda:0',
)

# Step 2: convert the ONNX file written in step 1 to a TensorRT engine.
onnx_model = os.path.join(work_dir, onnx_file)
model_id = 0
device = 'cuda'
onnx2tensorrt(
    work_dir,
    engine_file,
    model_id,
    deploy_cfg,
    onnx_model,
    device,
)

# Step 3: extract the pipeline info for SDK use (same role as --dump-info).
export2SDK(
    deploy_cfg,
    model_cfg,
    work_dir,
    pth=model_checkpoint,
    device=device,
)
from mmdeploy.apis import inference_model

# Run the TensorRT engine built from the "-main" checkouts on the demo image.
result = inference_model(
    model_cfg='mmpretrain-main/configs/resnet/resnet18_8xb32_in1k.py',
    deploy_cfg='mmdeploy-main/configs/mmpretrain/classification_tensorrt_static-224x224.py',
    backend_files=['work_dir/trt/resnet/end2end.engine'],
    img='mmpretrain-main/demo/demo.JPEG',
    device='cuda:0',
)
import cv2
from mmdeploy_runtime import Detector

# Detections scoring below this value are not drawn.
SCORE_THRESHOLD = 0.3

# Read the input image. cv2.imread returns None instead of raising when the
# file is missing or unreadable, so fail early with a clear message.
image_path = 'mmdetection/demo/demo.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f'cannot read image: {image_path}')

# Create the detector. model_path must be the directory given as --work-dir
# to tools/deploy.py (mmdeploy_model/faster-rcnn in the conversion command).
detector = Detector(
    model_path='mmdeploy_model/faster-rcnn',
    device_name='cuda',
    device_id=0,
)

# Run inference.
bboxes, labels, _ = detector(img)

# Filter the results by score and draw the surviving boxes on the image.
# Each bbox row is read as [left, top, right, bottom, score].
for bbox in bboxes:
    left, top, right, bottom = (int(value) for value in bbox[0:4])
    score = bbox[4]
    if score < SCORE_THRESHOLD:
        continue
    cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0))

cv2.imwrite('output_detection.png', img)
# python mmdeploy/tools/deploy.py mmdeploy/configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py mmdetection/configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py checkpoints/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth mmdetection/demo/demo.jpg --work-dir mmdeploy_model/faster-rcnn --device cuda:0 --dump-info
import datetime

import cv2
from mmdeploy_runtime import Detector
from PIL import Image

# Alternative kept for reference: run the engine through the converter API
# instead of the SDK runtime.
# from mmdeploy.apis import inference_model
# result = inference_model(
#     model_cfg=r'D:\workplace\python\TestMMDeploy\mmdetection\configs\faster_rcnn\faster-rcnn_r50_fpn_1x_coco.py',
#     deploy_cfg=r'./mmdeploy/configs/mmdet/detection/detection_tensorrt_dynamic-320x320-1344x1344.py',
#     backend_files=['mmdeploy_model/faster-rcnn/end2end.engine'],
#     img='mmdetection/demo/demo.jpg',
#     device='cuda:0')
# print(result)

# Number of timed inference runs and the score cut-off used when drawing.
NUM_RUNS = 99
SCORE_THRESHOLD = 0.3

# Read the input image. cv2.imread returns None instead of raising when the
# file is missing or unreadable, so fail early with a clear message.
image_path = 'mmdetection/demo/demo.jpg'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(f'cannot read image: {image_path}')

# Create the detector once, outside the timing loop, so that only the
# inference call itself is measured.
detector = Detector(
    model_path=r'D:\workplace\python\TestMMDeploy\mmdeploy_model\faster-rcnn',
    device_name='cuda',
    device_id=0,
)

for _run in range(NUM_RUNS):
    start_time = datetime.datetime.now()
    # Run inference.
    bboxes, labels, _ = detector(img)
    elapsed = datetime.datetime.now() - start_time
    # total_seconds() covers the whole timedelta, unlike summing the
    # .seconds and .microseconds fields by hand.
    print('推理-----时间:%dms' % (elapsed.total_seconds() * 1000))

# Filter the results of the last run by score and draw them on the image.
# Drawing happens after the loop so every timed run sees the same,
# unmodified input. Each bbox row is read as [left, top, right, bottom, score].
for bbox in bboxes:
    left, top, right, bottom = (int(value) for value in bbox[0:4])
    score = bbox[4]
    if score < SCORE_THRESHOLD:
        continue
    cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0))

# OpenCV stores channels as BGR while PIL interprets arrays as RGB; convert
# first, otherwise red and blue are swapped in the displayed image.
img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
img.show()
mim install -v -e . -i https://pypi.tuna.tsinghua.edu.cn/simple
import cv2
import numpy as np
from mmdeploy_runtime import Segmentor

img = cv2.imread('./demo/resources/cityscapes.png')

# Create the segmentor from the converted model directory.
segmentor = Segmentor(
    model_path='./mmdeploy_models/mmseg/ort',
    device_name='cpu',
    device_id=0,
)

# Run inference; seg is used below as a 2-D map of label ids.
seg = segmentor(img)

# Visualize the result: paint every label id with a random color taken from
# a 256x3 palette.
palette = np.random.randint(0, 256, size=(256, 3))
height, width = seg.shape[0], seg.shape[1]
color_seg = np.zeros((height, width, 3), dtype=np.uint8)
for label in range(len(palette)):
    color_seg[seg == label, :] = palette[label]

# Reverse the channel axis (the original note calls this "convert to BGR").
color_seg = color_seg[..., ::-1]

# Blend the input image and the color mask 50/50 and write the result.
img = (img * 0.5 + color_seg * 0.5).astype(np.uint8)
cv2.imwrite('output_segmentation.png', img)
onnx
# convert mmpretrain model to onnxruntime model with dynamic shape
python tools/deploy.py configs/mmpretrain/classification_onnxruntime_dynamic.py resnet18_8xb32_in1k.py resnet18_8xb32_in1k_20210831-fbbb1da6.pth tests/data/tiger.jpeg --work-dir mmdeploy_models/mmpretrain/ort --device cuda --show --dump-info
tensorrt
python tools/deploy.py configs/mmpretrain/classification_tensorrt-fp16_dynamic-224x224-224x224.py resnet18_8xb32_in1k.py resnet18_8xb32_in1k_20210831-fbbb1da6.pth tests/data/tiger.jpeg --work-dir mmdeploy_models/mmpretrain/ort --device cuda --show --dump-info
import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('tests/data/tiger.jpeg')

# Create the classifier from the converted model directory (CPU).
classifier = Classifier(
    model_path='./mmdeploy_models/mmpretrain/ort',
    device_name='cpu',
    device_id=0,
)

# Run inference.
result = classifier(img)

# Print every (label id, score) pair of the result.
for class_id, confidence in result:
    print(class_id, confidence)
import datetime

import cv2
from mmdeploy_runtime import Classifier

img = cv2.imread('tests/data/tiger.jpeg')

# Create the classifier from the converted model directory (GPU).
classifier = Classifier(
    model_path='./mmdeploy_models/mmpretrain/ort',
    device_name='cuda',
    device_id=0,
)

# Time the inference call only: the stopwatch is stopped before the results
# are printed so that console I/O is not counted as inference time.
start_time = datetime.datetime.now()
result = classifier(img)
elapsed = datetime.datetime.now() - start_time

# Print every (label id, score) pair of the result.
for label_id, score in result:
    print(label_id, score)

# total_seconds() covers the whole timedelta, unlike summing the
# .seconds and .microseconds fields by hand.
print('推理-----时间:%dms' % (elapsed.total_seconds() * 1000))