主要注意cuda、cudnn、pytorch、tensorrt、onnx、onnxruntime的版本。
absl-py=1.2.0=pypi_0
addict=2.4.0=pypi_0
appdirs=1.4.4=pypi_0
backcall=0.2.0=pypi_0
blas=1.0=mkl
brotlipy=0.7.0=py37h2bbff1b_1003
ca-certificates=2022.6.15=h5b45459_0
cachetools=5.2.0=pypi_0
certifi=2022.6.15=pyhd8ed1ab_1
cffi=1.15.1=py37h2bbff1b_0
charset-normalizer=2.0.4=pyhd3eb1b0_0
colorama=0.4.5=pypi_0
coloredlogs=15.0.1=pypi_0
cryptography=37.0.1=py37h21b164f_0
cudatoolkit=11.3.1=h59b6b97_2
cudnn=8.2.1=cuda11.3_0
cycler=0.11.0=pypi_0
decorator=5.1.1=pypi_0
ffmpeg=4.3.1=ha925a31_0
flatbuffers=2.0.7=pypi_0
fonttools=4.37.1=pypi_0
freetype=2.10.4=hd328e21_0
google-auth=2.11.0=pypi_0
google-auth-oauthlib=0.4.6=pypi_0
graphsurgeon=0.4.6=pypi_0
grpcio=1.48.1=pypi_0
humanfriendly=10.0=pypi_0
idna=3.3=pyhd3eb1b0_0
importlib-metadata=4.12.0=pypi_0
intel-openmp=2021.4.0=haa95532_3556
ipython=7.34.0=pypi_0
jedi=0.18.1=pypi_0
jpeg=9e=h2bbff1b_0
kiwisolver=1.4.4=pypi_0
lerc=3.0=hd77b12b_0
libdeflate=1.8=h2bbff1b_5
libpng=1.6.37=h2a8f88b_0
libtiff=4.4.0=h8a3f274_0
libuv=1.40.0=he774522_0
libwebp=1.2.2=h2bbff1b_0
lz4-c=1.9.3=h2bbff1b_1
mako=1.2.2=pypi_0
markdown=3.4.1=pypi_0
markupsafe=2.1.1=pypi_0
matplotlib=3.5.3=pypi_0
matplotlib-inline=0.1.6=pypi_0
mkl=2021.4.0=haa95532_640
mkl-service=2.4.0=py37h2bbff1b_0
mkl_fft=1.3.1=py37h277e83a_0
mkl_random=1.2.2=py37hf11a4ad_0
mpmath=1.2.1=pypi_0
numpy=1.21.5=py37h7a0a035_3
numpy-base=1.21.5=py37hca35cd5_3
oauthlib=3.2.0=pypi_0
onnx=1.12.0=pypi_0
onnx-graphsurgeon=0.3.12=pypi_0
onnxruntime=1.12.1=pypi_0
onnxruntime-gpu=1.12.1=pypi_0
opencv-python=4.6.0.66=pypi_0
openssl=1.1.1q=h8ffe710_0
packaging=21.3=pypi_0
pandas=1.3.5=pypi_0
parso=0.8.3=pypi_0
pickleshare=0.7.5=pypi_0
pillow=9.2.0=py37hdc2b20a_1
pip=22.1.2=py37haa95532_0
platformdirs=2.5.2=pypi_0
prompt-toolkit=3.0.31=pypi_0
protobuf=3.19.4=pypi_0
psutil=5.9.2=pypi_0
pyasn1=0.4.8=pypi_0
pyasn1-modules=0.2.8=pypi_0
pycparser=2.21=pyhd3eb1b0_0
pycuda=2022.1=pypi_0
pygments=2.13.0=pypi_0
pyopenssl=22.0.0=pyhd3eb1b0_0
pyparsing=3.0.9=pypi_0
pyreadline=2.1=pypi_0
pysocks=1.7.1=py37_1
python=3.7.13=h6244533_0
python-dateutil=2.8.2=pypi_0
pytools=2022.1.12=pypi_0
pytorch=1.11.0=py3.7_cuda11.3_cudnn8_0
pytorch-mutex=1.0=cuda
pytz=2022.2.1=pypi_0
pyyaml=6.0=pypi_0
requests=2.28.1=py37haa95532_0
requests-oauthlib=1.3.1=pypi_0
rsa=4.9=pypi_0
scipy=1.7.3=pypi_0
seaborn=0.12.0=pypi_0
setuptools=63.4.1=py37haa95532_0
six=1.16.0=pyhd3eb1b0_1
sqlite=3.39.2=h2bbff1b_0
sympy=1.10.1=pypi_0
tensorboard=2.10.0=pypi_0
tensorboard-data-server=0.6.1=pypi_0
tensorboard-plugin-wit=1.8.1=pypi_0
tensorrt=8.4.3.1=pypi_0
thop=0.1.1-2207130030=pypi_0
tk=8.6.12=h2bbff1b_0
torchaudio=0.11.0=py37_cu113
torchvision=0.12.0=py37_cu113
tqdm=4.64.1=pypi_0
traitlets=5.3.0=pypi_0
typing_extensions=4.3.0=py37haa95532_0
uff=0.6.9=pypi_0
urllib3=1.26.11=py37haa95532_0
vc=14.2=h21ff451_1
vs2015_runtime=14.27.29016=h5e58377_2
wcwidth=0.2.5=pypi_0
werkzeug=2.2.2=pypi_0
wheel=0.37.1=pyhd3eb1b0_0
win_inet_pton=1.1.0=py37haa95532_0
wincertstore=0.2=py37haa95532_2
xz=5.2.5=h8cc25b3_1
zipp=3.8.1=pypi_0
zlib=1.2.12=h8cc25b3_2
zstd=1.5.2=h19a0ad4_0
from pyexpat import model
import torch
import numpy as np
from torchvision.models.resnet import resnet50
TORCH_WEIGHT_PATH = r"H:\pytorch-onnx-tensorrt\new_test\resnet50-0676ba61.pth"
ONNX_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
def get_numpy_data():
    """Return a dummy all-ones input batch of shape (1, 3, 224, 224), float32."""
    return np.ones((1, 3, 224, 224), dtype=np.float32)
def get_torch_model():
    """Instantiate the network to export (ResNet-50; weights loaded by caller)."""
    return resnet50()
def torch2onnx(img_input, onnx_model_path, device_id=0):
    """Export the PyTorch model to an ONNX file.

    Args:
        img_input: numpy array used as the dummy input (fixes the input shape).
        onnx_model_path: destination path of the generated .onnx file.
        device_id: CUDA device index; a negative value selects the CPU.
    """
    torch_model = get_torch_model()  # Network define
    device = 'cpu' if device_id < 0 else f'cuda:{device_id}'
    torch_model.to(device)
    # map_location keeps the load working even when the checkpoint was saved
    # on a different device than the one used for export.
    torch_weights = torch.load(TORCH_WEIGHT_PATH, map_location=device)
    torch_model.load_state_dict(torch_weights)
    torch_model.eval()
    # from_numpy preserves the float32 dtype of the dummy input exactly.
    dummy_img = torch.from_numpy(img_input).to(device)
    torch.onnx.export(
        torch_model,
        (dummy_img,),
        onnx_model_path,
        input_names=['input'],
        output_names=['output'],
        export_params=True,
        verbose=False,
        do_constant_folding=False,  # or True
        opset_version=12,
    )
    print("Generate ONNX file over!")
if __name__ == "__main__":
    # Export the network using a fixed-shape dummy input.
    dummy_input = get_numpy_data()
    torch2onnx(dummy_input, ONNX_MODEL_PATH)
简化ONNX模型结构(可选)
生成的ONNX结构可能还有简化的空间,可以使用onnx-simplifier工具进一步优化。
import onnx
from onnxsim import simplify

ONNX_MODEL_PATH = 'net_bs8_v1.onnx'
ONNX_SIM_MODEL_PATH = 'net_bs8_v1_simple.onnx'

if __name__ == "__main__":
    # Load the exported model, simplify its graph, and save the result.
    onnx_model = onnx.load(ONNX_MODEL_PATH)
    onnx_sim_model, check = simplify(onnx_model)
    assert check, "Simplified ONNX model could not be validated"
    onnx.save(onnx_sim_model, ONNX_SIM_MODEL_PATH)
    print('ONNX file simplified!')
import time
import torch
import onnxruntime
import numpy as np
from torchvision.models.resnet import resnet50
TORCH_WEIGHT_PATH = r"H:\pytorch-onnx-tensorrt\new_test\resnet50-0676ba61.pth"
ONNX_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
def get_numpy_data():
    """Build a constant dummy input: a batch of one 3x224x224 image of ones."""
    shape = (1, 3, 224, 224)
    dummy = np.full(shape, 1.0, dtype=np.float32)
    return dummy
def get_torch_model():
    # Network definition only; pretrained weights are loaded by the caller.
    net = resnet50()
    return net
def test_torch(img_input, device_id=0, loop=100):
    """Benchmark the PyTorch model and return the final output as numpy.

    Args:
        img_input: numpy input batch fed on every iteration.
        device_id: CUDA device index; a negative value selects the CPU.
        loop: number of timed forward passes.

    Returns:
        numpy array holding the network output of the last iteration.
    """
    torch_model = get_torch_model()
    device = 'cpu' if device_id < 0 else f'cuda:{device_id}'
    torch_model.to(device)
    # map_location avoids device-mismatch errors when loading the checkpoint.
    torch_weights = torch.load(TORCH_WEIGHT_PATH, map_location=device)
    torch_model.load_state_dict(torch_weights)
    torch_model.eval()
    dummy_img = torch.from_numpy(img_input).to(device)
    # Take the real batch size from the input instead of hard-coding it;
    # the old per-iteration message claimed "bs8" while the batch was 1.
    batch_size = img_input.shape[0]
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        with torch.no_grad():
            out_img = torch_model(dummy_img)
        out_img_numpy = out_img.detach().cpu().numpy()
        time_bs2 = time.time()
        print(f'PyTorch use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_pt = time2 - time1
    print(f'PyTorch use time {time_use_pt} for loop {loop}, FPS={loop*batch_size//time_use_pt}')
    return out_img_numpy
def test_onnx(inputs, loop=100):
    """Benchmark the ONNX model with onnxruntime and return the last outputs.

    Args:
        inputs: numpy input batch (cast to float32 before inference).
        loop: number of timed inference calls.

    Returns:
        list of output arrays from the final session run.
    """
    inputs = inputs.astype(np.float32)
    print(onnxruntime.get_device())
    # Provider order is a preference list: TensorRT, then CUDA, then CPU.
    sess = onnxruntime.InferenceSession(
        ONNX_MODEL_PATH,
        providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
    # Real batch size from the input; the old message hard-coded "bs8".
    batch_size = inputs.shape[0]
    input_name = sess.get_inputs()[0].name  # hoisted out of the timed loop
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        out_ort_img = sess.run(None, {input_name: inputs})
        time_bs2 = time.time()
        print(f'ONNX use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_onnx = time2 - time1
    print(f'ONNX use time {time_use_onnx} for loop {loop}, FPS={loop*batch_size//time_use_onnx}')
    return out_ort_img
if __name__ == "__main__":
    dummy = get_numpy_data()
    onnx_out = test_onnx(dummy, loop=100)[0]
    torch_out = test_torch(dummy, loop=100)
    # Some precision loss is expected; typically the first 5 decimal
    # places agree between the two backends.
    result_mse = np.square(np.subtract(onnx_out, torch_out)).mean()
    print('mse between pytorch and onnx result: ', result_mse)
import os
import tensorrt as trt
ONNX_SIM_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
TENSORRT_ENGINE_PATH_PY = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1_fp16_py.engine"
def build_engine(onnx_file_path, engine_file_path, flop=16):
    """Parse an ONNX file and build + serialize a TensorRT engine.

    Args:
        onnx_file_path: path of the ONNX model to parse.
        engine_file_path: destination path of the serialized .engine file.
        flop: requested precision; 16 enables FP16 when the GPU supports it.

    Returns:
        The built ICudaEngine, or None if parsing/building failed.
    """
    trt_logger = trt.Logger(trt.Logger.VERBOSE)  # trt.Logger.ERROR
    builder = trt.Builder(trt_logger)
    # TensorRT 7+ requires an explicit-batch network for ONNX models.
    network = builder.create_network(
        1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, trt_logger)
    # parse ONNX
    with open(onnx_file_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    print("Completed parsing ONNX file")
    if os.path.isfile(engine_file_path):
        try:
            os.remove(engine_file_path)
        except Exception:
            print("Cannot remove existing file: ",
                  engine_file_path)
    print("Creating Tensorrt Engine")
    config = builder.create_builder_config()
    # config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS))
    config.max_workspace_size = 2 << 30  # 2 GiB of tactic scratch space
    # NOTE: builder.fp16_mode was removed in TensorRT 8 (this env pins
    # tensorrt 8.4.3.1, where assigning it raises AttributeError); precision
    # is controlled only through the builder-config flag, and only when
    # requested AND supported by the hardware.
    if flop == 16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
    engine = builder.build_engine(network, config)
    if engine is None:
        print('ERROR: Failed to build the TensorRT engine.')
        return None
    with open(engine_file_path, "wb") as f:
        f.write(engine.serialize())
    print("Serialized Engine Saved at: ", engine_file_path)
    return engine
if __name__ == "__main__":
    # Build the TensorRT engine from the (optionally simplified) ONNX model.
    build_engine(ONNX_SIM_MODEL_PATH, TENSORRT_ENGINE_PATH_PY)
import os
import time
import onnxruntime
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np
ONNX_SIM_MODEL_PATH = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1.onnx"
TENSORRT_ENGINE_PATH_PY = r"H:\pytorch-onnx-tensorrt\new_test\net_bs8_v1_fp16_py.engine"
def get_numpy_data():
    # Dummy NCHW input: one 3-channel 224x224 image filled with ones.
    n, c, h, w = 1, 3, 224, 224
    return np.ones((n, c, h, w), np.float32)
def test_onnx(inputs, loop=100):
    """Benchmark the (simplified) ONNX model and return the last outputs.

    Args:
        inputs: numpy input batch (cast to float32 before inference).
        loop: number of timed inference calls.

    Returns:
        list of output arrays from the final session run.
    """
    inputs = inputs.astype(np.float32)
    print(onnxruntime.get_device())
    # Provider order is a preference list: TensorRT, then CUDA, then CPU.
    sess = onnxruntime.InferenceSession(
        ONNX_SIM_MODEL_PATH,
        providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
    # Real batch size from the input; the old message hard-coded "bs8".
    batch_size = inputs.shape[0]
    input_name = sess.get_inputs()[0].name  # hoisted out of the timed loop
    time1 = time.time()
    for _ in range(loop):
        time_bs1 = time.time()
        out_ort_img = sess.run(None, {input_name: inputs})
        time_bs2 = time.time()
        print(f'ONNX use time {time_bs2 - time_bs1} for bs{batch_size}')
    time2 = time.time()
    time_use_onnx = time2 - time1
    print(f'ONNX use time {time_use_onnx} for loop {loop}, FPS={loop*batch_size//time_use_onnx}')
    return out_ort_img
class HostDeviceMem(object):
    """Pair of a host (pinned) buffer and its corresponding device buffer."""

    def __init__(self, host_mem, device_mem):
        # Host side is a pagelocked numpy array; device side is a GPU pointer.
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}"

    __repr__ = __str__
def _load_engine(engine_file_path):
    """Deserialize a TensorRT engine from disk and return it."""
    trt_logger = trt.Logger(trt.Logger.ERROR)
    with open(engine_file_path, 'rb') as f, trt.Runtime(trt_logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        print('_load_engine ok.')
        return engine
def _allocate_buffer(engine):
    """Allocate pinned host and device buffers for every engine binding.

    Args:
        engine: deserialized ICudaEngine.

    Returns:
        (inputs, outputs, bindings, stream): HostDeviceMem lists for inputs
        and outputs, the device-pointer list ordered by binding index, and
        a fresh CUDA stream.
    """
    # Use the engine's own binding count instead of blindly probing the
    # first 100 indices (out-of-range indices are undefined behavior).
    binding_names = [engine.get_binding_name(i) for i in range(engine.num_bindings)]
    inputs = []
    outputs = []
    bindings = [None] * len(binding_names)
    stream = cuda.Stream()
    for binding in binding_names:
        binding_idx = engine[binding]
        if binding_idx == -1:
            print("Error Binding Names!")
            continue
        # Element count for this binding; max_batch_size is 1 for
        # explicit-batch engines, so this matches the binding shape.
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings[binding_idx] = int(device_mem)
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def _test_engine(engine_file_path, data_input, num_times=100):
    """Run the TensorRT engine repeatedly and report per-run / total timing.

    Args:
        engine_file_path: path to the serialized engine file.
        data_input: numpy input copied into the input binding each run.
        num_times: number of timed inference runs (one warm-up run precedes).

    Returns:
        list with one numpy array: the output of the last run.
    """
    # Code from blog.csdn.net/TracelessLe
    engine = _load_engine(engine_file_path)
    input_bufs, output_bufs, bindings, stream = _allocate_buffer(engine)
    batch_size = 1
    context = engine.create_execution_context()

    def _infer_once():
        # H2D copy, async execution, D2H copy, then sync — all on one stream.
        input_bufs[0].host = data_input
        cuda.memcpy_htod_async(input_bufs[0].device, input_bufs[0].host, stream)
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(output_bufs[0].host, output_bufs[0].device, stream)
        stream.synchronize()
        return [output_bufs[0].host.copy()]

    # Warm-up run, excluded from the timed loop.
    trt_outputs = _infer_once()
    start = time.time()
    for _ in range(num_times):
        time_bs1 = time.time()
        trt_outputs = _infer_once()
        time_bs2 = time.time()
        # The old message hard-coded "bs8" although the batch size is 1.
        print(f'TRT use time {time_bs2 - time_bs1} for bs{batch_size}')
    end = time.time()
    time_use_trt = end - start
    print(f"TRT use time {time_use_trt} for loop {num_times}, FPS={num_times*batch_size//time_use_trt}")
    return trt_outputs
def test_engine(data_input, loop=100):
    """Run the serialized engine under a dedicated CUDA context."""
    cuda.init()
    ctx = cuda.Device(0).make_context()
    result = None
    try:
        result = _test_engine(TENSORRT_ENGINE_PATH_PY, data_input, loop)
    finally:
        # Always pop the context, even if inference raised.
        ctx.pop()
    return result
if __name__ == "__main__":
    dummy = get_numpy_data()
    trt_outputs = test_engine(dummy, 100)
    # Flatten the single output binding back to (batch, classes).
    trt_outputs = trt_outputs[0].reshape((1, 1000))
    out_ort_img = test_onnx(dummy, loop=100)[0]
    # Mean squared error between the two backends' raw logits.
    mse = np.square(np.subtract(out_ort_img, trt_outputs)).mean()
    print('mse between onnx and trt result: ', mse)
参考:《PyTorch模型转ONNX格式》,TracelessLe 的 CSDN 博客(PyTorch → ONNX → TensorRT 流程)