【模型部署】Pytorch->ONNX->trt

1.电脑环境

主要注意cuda、cudnn、pytorch、tensorrt、onnx、onnxruntime的版本。

absl-py=1.2.0=pypi_0
addict=2.4.0=pypi_0
appdirs=1.4.4=pypi_0
backcall=0.2.0=pypi_0
blas=1.0=mkl
brotlipy=0.7.0=py37h2bbff1b_1003
ca-certificates=2022.6.15=h5b45459_0
cachetools=5.2.0=pypi_0
certifi=2022.6.15=pyhd8ed1ab_1
cffi=1.15.1=py37h2bbff1b_0
charset-normalizer=2.0.4=pyhd3eb1b0_0
colorama=0.4.5=pypi_0
coloredlogs=15.0.1=pypi_0
cryptography=37.0.1=py37h21b164f_0
cudatoolkit=11.3.1=h59b6b97_2
cudnn=8.2.1=cuda11.3_0
cycler=0.11.0=pypi_0
decorator=5.1.1=pypi_0
ffmpeg=4.3.1=ha925a31_0
flatbuffers=2.0.7=pypi_0
fonttools=4.37.1=pypi_0
freetype=2.10.4=hd328e21_0
google-auth=2.11.0=pypi_0
google-auth-oauthlib=0.4.6=pypi_0
graphsurgeon=0.4.6=pypi_0
grpcio=1.48.1=pypi_0
humanfriendly=10.0=pypi_0
idna=3.3=pyhd3eb1b0_0
importlib-metadata=4.12.0=pypi_0
intel-openmp=2021.4.0=haa95532_3556
ipython=7.34.0=pypi_0
jedi=0.18.1=pypi_0
jpeg=9e=h2bbff1b_0
kiwisolver=1.4.4=pypi_0
lerc=3.0=hd77b12b_0
libdeflate=1.8=h2bbff1b_5
libpng=1.6.37=h2a8f88b_0
libtiff=4.4.0=h8a3f274_0
libuv=1.40.0=he774522_0
libwebp=1.2.2=h2bbff1b_0
lz4-c=1.9.3=h2bbff1b_1
mako=1.2.2=pypi_0
markdown=3.4.1=pypi_0
markupsafe=2.1.1=pypi_0
matplotlib=3.5.3=pypi_0
matplotlib-inline=0.1.6=pypi_0
mkl=2021.4.0=haa95532_640
mkl-service=2.4.0=py37h2bbff1b_0
mkl_fft=1.3.1=py37h277e83a_0
mkl_random=1.2.2=py37hf11a4ad_0
mpmath=1.2.1=pypi_0
numpy=1.21.5=py37h7a0a035_3
numpy-base=1.21.5=py37hca35cd5_3
oauthlib=3.2.0=pypi_0
onnx=1.12.0=pypi_0
onnx-graphsurgeon=0.3.12=pypi_0
onnxruntime=1.12.1=pypi_0
onnxruntime-gpu=1.12.1=pypi_0
opencv-python=4.6.0.66=pypi_0
openssl=1.1.1q=h8ffe710_0
packaging=21.3=pypi_0
pandas=1.3.5=pypi_0
parso=0.8.3=pypi_0
pickleshare=0.7.5=pypi_0
pillow=9.2.0=py37hdc2b20a_1
pip=22.1.2=py37haa95532_0
platformdirs=2.5.2=pypi_0
prompt-toolkit=3.0.31=pypi_0
protobuf=3.19.4=pypi_0
psutil=5.9.2=pypi_0
pyasn1=0.4.8=pypi_0
pyasn1-modules=0.2.8=pypi_0
pycparser=2.21=pyhd3eb1b0_0
pycuda=2022.1=pypi_0
pygments=2.13.0=pypi_0
pyopenssl=22.0.0=pyhd3eb1b0_0
pyparsing=3.0.9=pypi_0
pyreadline=2.1=pypi_0
pysocks=1.7.1=py37_1
python=3.7.13=h6244533_0
python-dateutil=2.8.2=pypi_0
pytools=2022.1.12=pypi_0
pytorch=1.11.0=py3.7_cuda11.3_cudnn8_0
pytorch-mutex=1.0=cuda
pytz=2022.2.1=pypi_0
pyyaml=6.0=pypi_0
requests=2.28.1=py37haa95532_0
requests-oauthlib=1.3.1=pypi_0
rsa=4.9=pypi_0
scipy=1.7.3=pypi_0
seaborn=0.12.0=pypi_0
setuptools=63.4.1=py37haa95532_0
six=1.16.0=pyhd3eb1b0_1
sqlite=3.39.2=h2bbff1b_0
sympy=1.10.1=pypi_0
tensorboard=2.10.0=pypi_0
tensorboard-data-server=0.6.1=pypi_0
tensorboard-plugin-wit=1.8.1=pypi_0
tensorrt=8.4.3.1=pypi_0
thop=0.1.1-2207130030=pypi_0
tk=8.6.12=h2bbff1b_0
torchaudio=0.11.0=py37_cu113
torchvision=0.12.0=py37_cu113
tqdm=4.64.1=pypi_0
traitlets=5.3.0=pypi_0
typing_extensions=4.3.0=py37haa95532_0
uff=0.6.9=pypi_0
urllib3=1.26.11=py37haa95532_0
vc=14.2=h21ff451_1
vs2015_runtime=14.27.29016=h5e58377_2
wcwidth=0.2.5=pypi_0
werkzeug=2.2.2=pypi_0
wheel=0.37.1=pyhd3eb1b0_0
win_inet_pton=1.1.0=py37haa95532_0
wincertstore=0.2=py37haa95532_2
xz=5.2.5=h8cc25b3_1
zipp=3.8.1=pypi_0
zlib=1.2.12=h8cc25b3_2
zstd=1.5.2=h19a0ad4_0

2.Pytorch转onnx

from pyexpat import model
import torch
import numpy as np
from torchvision.models.resnet import resnet50

TORCH_WEIGHT_PATH = r"H:\pytorch-onnx-tensorrt\new_test\resnet50-0676ba61.pth"
ONNX_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"

def get_numpy_data():
    """Create a dummy all-ones input batch of shape (N, 3, 224, 224), float32."""
    shape = (1, 3, 224, 224)  # one RGB 224x224 image
    return np.ones(shape, dtype=np.float32)

def get_torch_model():
    """Instantiate the network architecture; weights are loaded by the caller."""
    return resnet50()

def torch2onnx(img_input, onnx_model_path, device_id=0):
    """Export the PyTorch model to an ONNX file.

    Args:
        img_input: numpy array used as the dummy export input,
            shape (N, 3, 224, 224) — TODO confirm against the network.
        onnx_model_path: destination path for the .onnx file.
        device_id: CUDA device index; a negative value selects the CPU.
    """
    torch_model = get_torch_model()  # Network define
    device = 'cpu' if device_id < 0 else f'cuda:{device_id}'
    torch_model.to(device)
    # map_location keeps the load working even when the checkpoint was
    # saved on a different device (e.g. GPU checkpoint, CPU export).
    torch_weights = torch.load(TORCH_WEIGHT_PATH, map_location=device)
    torch_model.load_state_dict(torch_weights)
    torch_model.eval()  # inference mode: freeze dropout/batchnorm behaviour
    # from_numpy shares memory and preserves the float32 dtype exactly.
    dummy_img = torch.from_numpy(img_input).to(device)
    torch.onnx.export(
        torch_model,
        (dummy_img,),
        onnx_model_path,
        input_names=['input'],
        output_names=['output'],
        export_params=True,
        verbose=False,
        do_constant_folding=False,  # or True
        opset_version=12,
    )
    print("Generate ONNX file over!")

if __name__ == "__main__":
    # Export the model with a dummy all-ones input.
    dummy_input = get_numpy_data()
    torch2onnx(dummy_input, ONNX_MODEL_PATH)

简化ONNX模型结构(可选)

生成的ONNX结构可能还有简化的空间,可以使用onnx-simplifier工具进一步优化。

import onnx
from onnxsim import simplify

ONNX_MODEL_PATH = 'net_bs8_v1.onnx'
ONNX_SIM_MODEL_PATH = 'net_bs8_v1_simple.onnx'

if __name__ == "__main__":
    # Load, simplify with onnx-simplifier, validate, and save the new graph.
    model_proto = onnx.load(ONNX_MODEL_PATH)
    simplified_model, ok = simplify(model_proto)
    assert ok, "Simplified ONNX model could not be validated"
    onnx.save(simplified_model, ONNX_SIM_MODEL_PATH)
    print('ONNX file simplified!')

3.测试pytorch模型和onnx模型预测的误差

import time
import torch
import onnxruntime
import numpy as np
from torchvision.models.resnet import resnet50

TORCH_WEIGHT_PATH = r"H:\pytorch-onnx-tensorrt\new_test\resnet50-0676ba61.pth"
ONNX_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"

def get_numpy_data():
    """Build a dummy all-ones float32 batch shaped (N, 3, 224, 224)."""
    n = 1  # batch size
    return np.ones((n, 3, 224, 224), dtype=np.float32)

def get_torch_model():
    """Return a fresh (untrained) network instance; weights load separately."""
    return resnet50()

def test_torch(img_input, device_id=0, loop=100):
    """Benchmark the PyTorch model and return its output as a numpy array.

    Args:
        img_input: numpy input batch, shape (N, 3, 224, 224).
        device_id: CUDA device index; negative selects the CPU.
        loop: number of timed inference iterations.

    Returns:
        numpy array holding the output of the last iteration.
    """
    torch_model = get_torch_model()
    device = 'cpu' if device_id < 0 else f'cuda:{device_id}'
    torch_model.to(device)
    # map_location allows loading a GPU checkpoint on CPU and vice versa.
    torch_weights = torch.load(TORCH_WEIGHT_PATH, map_location=device)
    torch_model.load_state_dict(torch_weights)
    torch_model.eval()
    dummy_img = torch.Tensor(img_input).to(device)
    batch_size = img_input.shape[0]  # was hard-coded to 1
    # Warm-up pass so one-time CUDA/cuDNN setup is excluded from the timing.
    with torch.no_grad():
        torch_model(dummy_img)
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        with torch.no_grad():
            out_img = torch_model(dummy_img)
            out_img_numpy = out_img.detach().cpu().numpy()
        time_bs2 = time.time()
        # Report the actual batch size (old message claimed "bs8" regardless).
        print(f'PyTorch use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_pt = time2 - time1
    print(f'PyTorch use time {time_use_pt} for loop {loop}, FPS={loop*batch_size//time_use_pt}')
    return out_img_numpy

def test_onnx(inputs, loop=100):
    """Benchmark the ONNX model with onnxruntime and return its raw outputs.

    Args:
        inputs: numpy input batch, shape (N, 3, 224, 224); cast to float32.
        loop: number of timed inference iterations.

    Returns:
        List of output arrays from the last ``sess.run`` call.
    """
    inputs = inputs.astype(np.float32)
    print(onnxruntime.get_device())
    # Provider list is a preference order: TensorRT, then CUDA, then CPU.
    sess = onnxruntime.InferenceSession(
        ONNX_MODEL_PATH,
        providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
    batch_size = inputs.shape[0]  # was hard-coded to 1
    input_name = sess.get_inputs()[0].name  # hoisted out of the timed loop
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        out_ort_img = sess.run(None, {input_name: inputs})
        time_bs2 = time.time()
        # Report the actual batch size (old message claimed "bs8" regardless).
        print(f'ONNX use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_onnx = time2 - time1
    print(f'ONNX use time {time_use_onnx} for loop {loop}, FPS={loop*batch_size//time_use_onnx}')
    return out_ort_img

if __name__ == "__main__":
    img_input = get_numpy_data()
    onnx_result = test_onnx(img_input, loop=100)[0]
    torch_result = test_torch(img_input, loop=100)
    # Some precision loss is expected; typically the first 5 decimal
    # places agree between the two backends.
    mse = np.square(np.subtract(onnx_result, torch_result)).mean()
    print('mse between pytorch and onnx result: ', mse)

4.onnx转tensorrt

import os
import tensorrt as trt

ONNX_SIM_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
TENSORRT_ENGINE_PATH_PY = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1_fp16_py.engine"

def build_engine(onnx_file_path, engine_file_path, flop=16):
    """Parse an ONNX model and build + serialize a TensorRT engine.

    Args:
        onnx_file_path: path to the input .onnx model.
        engine_file_path: destination path for the serialized .engine file.
        flop: requested precision; 16 enables FP16 where supported.

    Returns:
        The built ICudaEngine, or None on parse/build failure.
    """
    trt_logger = trt.Logger(trt.Logger.VERBOSE)  # trt.Logger.ERROR for quieter output
    builder = trt.Builder(trt_logger)
    # The ONNX parser requires an explicit-batch network definition.
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )

    parser = trt.OnnxParser(network, trt_logger)
    # parse ONNX
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    print("Completed parsing ONNX file")

    # Remove any stale engine file so a failed build cannot leave an old one.
    if os.path.isfile(engine_file_path):
        try:
            os.remove(engine_file_path)
        except Exception:
            print("Cannot remove existing file: ",
                engine_file_path)

    print("Creating Tensorrt Engine")

    config = builder.create_builder_config()
    # config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS))
    config.max_workspace_size = 2 << 30  # 2 GiB of tactic scratch space
    # NOTE: builder.fp16_mode was removed in TensorRT 8 — precision is now
    # requested on the builder config, and only when the caller asked for
    # FP16 and the platform actually supports fast FP16.
    if flop == 16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    engine = builder.build_engine(network, config)
    if engine is None:
        print("ERROR: Failed to build the TensorRT engine.")
        return None
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())
    print("Serialized Engine Saved at: ", engine_file_path)
    return engine

if __name__ == "__main__":
    # Build the serialized TensorRT engine from the ONNX model on disk.
    build_engine(ONNX_SIM_MODEL_PATH, TENSORRT_ENGINE_PATH_PY)

5.测试onnx模型和tensorrt模型预测之间的误差

import os
import time
import onnxruntime
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np

ONNX_SIM_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
TENSORRT_ENGINE_PATH_PY = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1_fp16_py.engine"

def get_numpy_data():
    """Produce the dummy all-ones float32 input, shape (N, 3, 224, 224)."""
    batch = 1
    return np.ones((batch, 3, 224, 224), dtype=np.float32)

def test_onnx(inputs, loop=100):
    """Benchmark the (simplified) ONNX model and return its raw outputs.

    Args:
        inputs: numpy input batch, shape (N, 3, 224, 224); cast to float32.
        loop: number of timed inference iterations.

    Returns:
        List of output arrays from the last ``sess.run`` call.
    """
    inputs = inputs.astype(np.float32)
    print(onnxruntime.get_device())
    # Provider list is a preference order: TensorRT, then CUDA, then CPU.
    sess = onnxruntime.InferenceSession(
        ONNX_SIM_MODEL_PATH,
        providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
    batch_size = inputs.shape[0]  # was hard-coded to 1
    input_name = sess.get_inputs()[0].name  # hoisted out of the timed loop
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        out_ort_img = sess.run(None, {input_name: inputs})
        time_bs2 = time.time()
        # Report the actual batch size (old message claimed "bs8" regardless).
        print(f'ONNX use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_onnx = time2 - time1
    print(f'ONNX use time {time_use_onnx} for loop {loop}, FPS={loop*batch_size//time_use_onnx}')
    return out_ort_img

class HostDeviceMem(object):
    """Pairs a host-side buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        # host: page-locked numpy array; device: pycuda DeviceAllocation
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}"

    def __repr__(self):
        return str(self)


def _load_engine(engine_file_path):
    """Deserialize a TensorRT engine from a file on disk and return it."""
    trt_logger = trt.Logger(trt.Logger.ERROR)
    with open(engine_file_path, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        print('_load_engine ok.')
    return engine


def _allocate_buffer(engine):
    """Allocate pinned host and device buffers for every engine binding.

    Args:
        engine: a deserialized TensorRT ICudaEngine.

    Returns:
        (inputs, outputs, bindings, stream): inputs/outputs are lists of
        HostDeviceMem; bindings is the list of device pointers in
        binding-index order as required by execute_async_v2.
    """
    # Enumerate bindings via num_bindings instead of probing indices up to
    # an arbitrary limit of 100 — get_binding_name misbehaves on
    # out-of-range indices on some TensorRT versions.
    binding_names = [engine.get_binding_name(i)
                     for i in range(engine.num_bindings)]

    inputs = []
    outputs = []
    bindings = [None] * len(binding_names)
    stream = cuda.Stream()

    for binding in binding_names:
        binding_idx = engine[binding]
        if binding_idx == -1:
            print("Error Binding Names!")
            continue

        # For explicit-batch engines the binding shape already includes the
        # batch dimension (max_batch_size is 1 there, so this is a no-op).
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)  # pinned for async copies
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings[binding_idx] = int(device_mem)

        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def _infer_once(context, bindings, input_bufs, output_bufs, stream, data_input):
    """Run one async H2D copy -> inference -> D2H copy cycle; return host output copy."""
    input_bufs[0].host = data_input
    cuda.memcpy_htod_async(
        input_bufs[0].device,
        input_bufs[0].host,
        stream
    )
    context.execute_async_v2(
        bindings=bindings,
        stream_handle=stream.handle
    )
    cuda.memcpy_dtoh_async(
        output_bufs[0].host,
        output_bufs[0].device,
        stream
    )
    stream.synchronize()
    return [output_bufs[0].host.copy()]


def _test_engine(engine_file_path, data_input, num_times=100):
    """Benchmark a serialized TensorRT engine and return its last outputs.

    Args:
        engine_file_path: path to the serialized .engine file.
        data_input: numpy input batch, shape (N, 3, 224, 224).
        num_times: number of timed inference iterations.

    Returns:
        List containing a host-side copy of the first output buffer.
    """
    # Code from blog.csdn.net/TracelessLe
    engine = _load_engine(engine_file_path)
    input_bufs, output_bufs, bindings, stream = _allocate_buffer(engine)
    batch_size = data_input.shape[0]  # was hard-coded to 1
    context = engine.create_execution_context()
    # Warm-up inference: excluded from timing (lazy CUDA/TensorRT init).
    trt_outputs = _infer_once(context, bindings, input_bufs, output_bufs,
                              stream, data_input)
    start = time.time()
    for _ in range(num_times):
        time_bs1 = time.time()
        trt_outputs = _infer_once(context, bindings, input_bufs, output_bufs,
                                  stream, data_input)
        time_bs2 = time.time()
        # Report the actual batch size (old message claimed "bs8" regardless).
        print(f'TRT use time {time_bs2 - time_bs1} for bs{batch_size}')

    end = time.time()
    time_use_trt = end - start
    print(f"TRT use time {time_use_trt} for loop {num_times}, FPS={num_times*batch_size//time_use_trt}")
    return trt_outputs

def test_engine(data_input, loop=100):
    """Create a CUDA context, run the TensorRT benchmark, and clean up."""
    cuda.init()
    cuda_ctx = cuda.Device(0).make_context()
    trt_outputs = None
    try:
        trt_outputs = _test_engine(TENSORRT_ENGINE_PATH_PY, data_input, loop)
    finally:
        # Always release the context, even if inference raised.
        cuda_ctx.pop()
    return trt_outputs

if __name__ == "__main__":
    img_input = get_numpy_data()

    # TensorRT prediction, reshaped to (batch, classes) for comparison.
    trt_outputs = test_engine(img_input, 100)
    trt_outputs = trt_outputs[0].reshape((1, 1000))

    # ONNX Runtime prediction on the same dummy input.
    out_ort_img = test_onnx(img_input, loop=100)[0]

    mse = np.square(np.subtract(out_ort_img, trt_outputs)).mean()
    print('mse between onnx and trt result: ', mse)

参考

PyTorch模型转ONNX格式_TracelessLe的博客-CSDN博客_onnx转pytorch

你可能感兴趣的:(机器学习,pytorch,python,人工智能)