TensorRT is NVIDIA's tool for accelerating inference on its GPUs.
Using the super-resolution model Real-ESRGAN as an example, this article walks through three ways to convert a PyTorch model into a TensorRT-optimized one and benchmarks the results.
If the installed CUDA is the dev (full toolkit) version, check it with:
nvcc --version
or with
ls -all /usr/local/
which shows which CUDA version the cuda symlink points to.
To check the cuDNN version:
cat /usr/local/cuda/include/cudnn_version.h
To check the versions PyTorch actually uses, inspect torch.version.cuda and torch.backends.cudnn.version().
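The same checks can be done from Python in one go (a minimal sketch, assuming PyTorch is already installed):

import torch

print(torch.version.cuda)               # CUDA version this torch build was compiled against
print(torch.backends.cudnn.version())   # e.g. 8200 for cuDNN 8.2.0
print(torch.cuda.get_device_name(0))    # confirm the GPU is visible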
If you installed the runtime-only version, there is no /usr/local/cuda directory; I have not tested whether TensorRT works in that case.
The above is the environment I tested with; in your own environment, make sure the CUDA and cuDNN versions match the GPU driver.
Also make sure the relevant environment variables are set:
vim ~/.bashrc
export PATH=/usr/local/cuda-11.3/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH
source ~/.bashrc
Extract the TensorRT archive:
tar -xzvf TensorRT-8.2.1.8.Linux.x86_64-gnu.cuda-11.4.cudnn8.2.tar.gz
Add TensorRT's lib directory to the environment variables:
vim ~/.bashrc
export LD_LIBRARY_PATH=/root/TensorRT-8.2.1.8/lib:$LD_LIBRARY_PATH
source ~/.bashrc
Install the tensorrt Python package: in the TensorRT extraction directory, run
pip install python/tensorrt-8.2.1.8-cp38-none-linux_x86_64.whl
(I use Python 3.8, hence the cp38 wheel.) Then install graphsurgeon:
pip install graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl
Real-ESRGAN inference mainly uses RRDB-Net (basicsr/archs/rrdbnet_arch.py). When scale=2 (the case used throughout this article), its pixel_unshuffle call breaks the ONNX export, as reported in this issue. My workaround is to move pixel_unshuffle out of forward entirely; all it does is fold spatial resolution into the channel dimension, e.g. [b, 3, h, w] -> [b, 3*4, h/2, w/2].
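A quick sketch verifying that the view/permute/reshape sequence used later in prepare reproduces F.pixel_unshuffle (assumes PyTorch >= 1.8, which provides F.pixel_unshuffle):

import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)
b, c, h, w = x.size()
# fold each 2x2 spatial block into the channel dimension: [b, 3, h, w] -> [b, 12, h/2, w/2]
manual = x.view(b, c, h // 2, 2, w // 2, 2).permute(0, 1, 3, 5, 2, 4).reshape(b, c * 4, h // 2, w // 2)
print(torch.allclose(manual, F.pixel_unshuffle(x, downscale_factor=2)))  # True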
The modified RRDB-Net (create a new file test/rrdb_net.py and import from it):
import torch
from torch import nn as nn
from torch.nn import functional as F
from basicsr.archs.arch_util import default_init_weights, make_layer


class ResidualDenseBlock(nn.Module):
    def __init__(self, num_feat=64, num_grow_ch=32):
        super(ResidualDenseBlock, self).__init__()
        self.conv1 = nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
        self.conv2 = nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv3 = nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv4 = nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
        # initialization
        default_init_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1)

    def forward(self, x):
        x1 = self.lrelu(self.conv1(x))
        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
        # Empirically, we use 0.2 to scale the residual for better performance
        return x5 * 0.2 + x


class RRDB(nn.Module):
    def __init__(self, num_feat, num_grow_ch=32):
        super(RRDB, self).__init__()
        self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
        self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
        self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)

    def forward(self, x):
        out = self.rdb1(x)
        out = self.rdb2(out)
        out = self.rdb3(out)
        # Empirically, we use 0.2 to scale the residual for better performance
        return out * 0.2 + x


class RRDBNet(nn.Module):
    def __init__(self, num_in_ch, num_out_ch, num_feat=64, num_block=23, num_grow_ch=32):
        super(RRDBNet, self).__init__()
        num_in_ch = num_in_ch * 4  # the input arrives already pixel-unshuffled
        self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
        self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch)
        self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        # upsample
        self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        feat = self.conv_first(x)
        body_feat = self.conv_body(self.body(feat))
        feat = feat + body_feat
        # upsample
        feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
        feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
        out = self.conv_last(self.lrelu(self.conv_hr(feat)))
        return out
The pixel_unshuffle step therefore has to be applied to the input before it enters the network, which is done in prepare below. The same file also provides some shared helpers such as benchmarking and image display (create a new file test/common.py):
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch

from test.rrdb_net import RRDBNet


def get_sr_model():
    model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32)
    load_net = torch.load('src/pretrained_models/real_esrgan/RealESRGAN_x2plus.pth')
    model.load_state_dict(load_net['params_ema'], strict=True)
    model.eval()
    model = model.to('cuda').half()
    return model


def show(x: torch.Tensor):
    print(x.shape)
    x = x[0].clip(0, 1).cpu().detach().numpy().transpose(1, 2, 0).astype(np.float32)
    plt.imshow(x)
    plt.show()


@torch.no_grad()
def benchmark(model, x, warm_up=2, runs=10):
    print("Warm up ...")
    for _ in range(warm_up):
        features = model(x)
    torch.cuda.synchronize()
    print("Start timing ...")
    timings = []
    for i in range(1, runs + 1):
        start_time = time.time()
        features = model(x)
        torch.cuda.synchronize()
        end_time = time.time()
        timings.append(end_time - start_time)
        if i % 10 == 0:
            print('Iteration %d/%d, ave batch time %.2f ms' % (i, runs, np.mean(timings[i - 10: i]) * 1000))
    print("Input shape:", x.shape)
    print("Output features size:", features.shape)
    print('Average batch time: %.2f ms' % (np.mean(timings) * 1000))


def test(x, model, name):
    torch.cuda.empty_cache()
    s = time.time()
    with torch.no_grad():
        y = model(x)
        torch.cuda.synchronize()
    print(f'Testing {name} used {time.time() - s:.2f}s')
    show(y)
    return y


def prepare(side=512):
    model = get_sr_model().eval()
    # create example data: BGR -> RGB, HWC -> CHW, [0, 255] -> [0, 1]
    x = cv2.resize(cv2.imread('../src/inputs/sr/0014.jpeg'),
                   (side, side))[..., ::-1].transpose(2, 0, 1) / 255.0
    x = torch.from_numpy(x).cuda().half().unsqueeze(0)
    show(x)
    # pixel_unshuffle moved out of the network: [b, 3, h, w] -> [b, 12, h/2, w/2]
    b, c, h, w = x.size()
    h = h // 2
    w = w // 2
    x = x.view(b, c, h, 2, w, 2).permute(0, 1, 3, 5, 2, 4).reshape(b, 12, h, w)
    return model, x
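As a sanity check, the modified network plus the external unshuffle should reproduce the original basicsr RRDBNet. A sketch (assuming basicsr's RRDBNet accepts scale=2 and the same checkpoint path used in get_sr_model):

import torch
from basicsr.archs.rrdbnet_arch import RRDBNet as OrigRRDBNet
from test.common import get_sr_model

orig = OrigRRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
state = torch.load('src/pretrained_models/real_esrgan/RealESRGAN_x2plus.pth')
orig.load_state_dict(state['params_ema'], strict=True)
orig = orig.eval().cuda().half()

model = get_sr_model()
x = torch.rand(1, 3, 64, 64).cuda().half()
b, c, h, w = x.size()
x_un = x.view(b, c, h // 2, 2, w // 2, 2).permute(0, 1, 3, 5, 2, 4).reshape(b, c * 4, h // 2, w // 2)
with torch.no_grad():
    diff = torch.max(torch.abs(orig(x) - model(x_un)))
print(diff)  # should be (near) zero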
The tools themselves are described below.
torch2trt currently supports only a single fixed input size; see torch2trt_dynamic for multi-size support.
Project: https://github.com/NVIDIA-AI-IOT/torch2trt
Documentation: https://nvidia-ai-iot.github.io/torch2trt/v0.3.0/
Since the latest torch2trt 0.3 already implements the torch.nn.functional.interpolate operation, the plugin-free installation is enough:
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install
For reference, here is the plugin installation as well (needed when some PyTorch ops are not implemented in torch2trt and have to be supplied manually as plugins):
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install --plugins
Error: NvInfer.h: No such file or directory
Fix: edit setup.py:
include_dirs=[
    trt_inc_dir(),
    'your/path/TensorRT-8.2.1.8/include'  # add include directories
],
library_dirs=[
    trt_lib_dir(),
    'your/path/TensorRT-8.2.1.8/lib'  # add link directories
],
Error: error: invalid new-expression of abstract class type ‘torch2trt::GroupNormPlugin’
No fix yet; possibly a mismatch between the TensorRT version and torch2trt, but I have not tested this.
Test with the example from the torch2trt documentation:
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet
# create some regular pytorch model...
model = alexnet(pretrained=True).eval().cuda()
# create example data
x = torch.ones((1, 3, 224, 224)).cuda()
# convert to TensorRT feeding sample data as input
model_trt = torch2trt(model, [x])
y = model(x)
y_trt = model_trt(x)
# check the output against PyTorch
print(torch.max(torch.abs(y - y_trt)))
If this runs without errors, the setup works. Test output:
tensor(1.0729e-06, device='cuda:0', grad_fn=<MaxBackward1>)
The original model runs in FP16, so TensorRT is set to FP16 as well:
import time

import torch
from torch2trt import torch2trt

from test.common import prepare, test

side = 512
model, x = prepare(side)
torch.cuda.empty_cache()
with torch.no_grad():
    print('Converting trt...')
    s = time.time()
    model_trt = torch2trt(model, [x], fp16_mode=True)
    print(f'Converted! used time {time.time() - s:.2f}s')
    torch.save(model_trt.state_dict(), f'src/torch2trt_fp16_{side}.trt')
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("Max error", torch.max(torch.abs(y - y_trt)))
Result:
torch.Size([1, 3, 512, 512])
Converting trt...
Converted! used time 239.82s
Testing model used 0.38s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.20s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1553, device='cuda:0', dtype=torch.float16)
Benchmark the optimization:
import torch
from torch2trt import TRTModule
from test.common import prepare, test, benchmark
side = 512
model, x = prepare(side)
torch.cuda.empty_cache()
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(f'src/torch2trt_fp16_{side}.trt'))
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("Max error", torch.max(torch.abs(y - y_trt)))
print('*'*100)
torch.cuda.empty_cache()
benchmark(model, x, warm_up=5, runs=50)
print('*'*100)
torch.cuda.empty_cache()
benchmark(model_trt, x, warm_up=5, runs=50)
Result:
torch.Size([1, 3, 512, 512])
[12/10/2021-15:06:46] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
[12/10/2021-15:06:47] [TRT] [W] TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
[12/10/2021-15:06:47] [TRT] [W] TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
[12/10/2021-15:06:47] [TRT] [W] TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Testing model used 0.40s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1553, device='cuda:0', dtype=torch.float16)
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 314.55 ms
Iteration 20/50, ave batch time 314.76 ms
Iteration 30/50, ave batch time 315.70 ms
Iteration 40/50, ave batch time 316.29 ms
Iteration 50/50, ave batch time 317.29 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 315.72 ms
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 200.73 ms
Iteration 20/50, ave batch time 201.76 ms
Iteration 30/50, ave batch time 202.16 ms
Iteration 40/50, ave batch time 202.45 ms
Iteration 50/50, ave batch time 200.82 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 201.59 ms
TensorRT cuts the time by roughly one third; on a 3090 the time was roughly halved.
Torch-TensorRT can be installed directly with pip:
pip install torch-tensorrt -f https://github.com/NVIDIA/Torch-TensorRT/releases
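A quick check that the installed wheel imports cleanly (sketch):

import torch_tensorrt

print(torch_tensorrt.__version__)  # should match your torch/CUDA combination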
The Torch-TensorRT documentation is fairly thorough and covers C++ deployment as well. Torch-TensorRT supports dynamic input sizes; you specify min_shape, opt_shape, and max_shape.
import time

import torch
import torch_tensorrt

from test.common import prepare, test

side = 512
model, x = prepare(side)
compile_settings = {
    "inputs": [
        torch_tensorrt.Input(
            (1, 12, side // 2, side // 2),
            dtype=torch.half)
    ],
    "truncate_long_and_double": True,
    "enabled_precisions": {torch.half}  # Run with FP16
}
with torch.no_grad():
    torch.cuda.empty_cache()
    traced_model = torch.jit.trace(model, x)
    torch.cuda.empty_cache()
    print('Converting trt...')
    s = time.time()
    model_trt = torch_tensorrt.compile(traced_model, **compile_settings)
    print(f'Converted! used time {time.time() - s:.2f}s')
    print("Saving ...")
    torch.jit.save(model_trt, f'src/torch-tensortrt_fp16_{side}.trt')
    time.sleep(20)
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print(f"Max error: {torch.max(torch.abs(y - y_trt))}")
Result:
torch.Size([1, 3, 512, 512])
Converting trt...
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
...
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Converted! used time 198.42s
Saving ...
Testing model used 0.38s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error: 0.1142578125
Benchmark the optimization:
import torch
import torch_tensorrt
from test.common import prepare, test, benchmark
side = 512
model, x = prepare(side)
torch.cuda.empty_cache()
model_trt = torch.jit.load(f'src/torch-tensortrt_fp16_{side}.trt')
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("Max error", torch.max(torch.abs(y - y_trt)))
print('*'*100)
torch.cuda.empty_cache()
benchmark(model, x, warm_up=5, runs=50)
print('*'*100)
torch.cuda.empty_cache()
benchmark(model_trt, x, warm_up=5, runs=50)
Result:
torch.Size([1, 3, 512, 512])
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Testing model used 0.46s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.25s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1143, device='cuda:0', dtype=torch.float16)
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 315.06 ms
Iteration 20/50, ave batch time 315.99 ms
Iteration 30/50, ave batch time 316.72 ms
Iteration 40/50, ave batch time 318.04 ms
Iteration 50/50, ave batch time 319.05 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 316.97 ms
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 203.72 ms
Iteration 20/50, ave batch time 204.43 ms
Iteration 30/50, ave batch time 205.57 ms
Iteration 40/50, ave batch time 204.77 ms
Iteration 50/50, ave batch time 206.92 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 205.08 ms
The speedup is similar to torch2trt's. Next, compile with a dynamic input range:
import time

import torch
import torch_tensorrt

from test.common import prepare, test

# Dynamic input shape
min_side = 128
opt_side = 256
max_side = 512
side = opt_side
model, x = prepare(side)
compile_settings = {
    "inputs": [
        torch_tensorrt.Input(
            min_shape=[1, 12, min_side // 2, min_side // 2],
            opt_shape=[1, 12, opt_side // 2, opt_side // 2],
            max_shape=[1, 12, max_side // 2, max_side // 2],
            dtype=torch.half)
    ],
    "truncate_long_and_double": True,
    "enabled_precisions": {torch.half}  # Run with FP16
}
with torch.no_grad():
    torch.cuda.empty_cache()
    traced_model = torch.jit.trace(model, x)
    torch.cuda.empty_cache()
    print('Converting trt...')
    s = time.time()
    model_trt = torch_tensorrt.compile(traced_model, **compile_settings)
    print(f'Converted! used time {time.time() - s:.2f}s')
    print("Saving ...")
    torch.jit.save(model_trt, f'src/torch-tensortrt_fp16_{min_side}-{max_side}.trt')
    time.sleep(20)
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print(f"Max error: {torch.max(torch.abs(y - y_trt))}")
Result:
torch.Size([1, 3, 256, 256])
Converting trt...
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
...
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT TorchScript Conversion Context] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Converted! used time 160.27s
Saving ...
Testing model used 0.13s
torch.Size([1, 3, 512, 512])
Testing model_trt used 0.08s
torch.Size([1, 3, 512, 512])
Max error: 0.05126953125
The dynamic range covers the fixed 512 size, yet conversion is faster (160.27 s vs. 198.42 s) and the model file is smaller (136.1 MB vs. 153 MB).
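Since the engine was compiled with a shape range, the same saved module should accept any side within [min_side, max_side]. A sketch (intermediate sizes are also legal for a dynamic engine):

import torch
import torch_tensorrt  # registers the TensorRT ops needed by torch.jit.load

model_trt = torch.jit.load('src/torch-tensortrt_fp16_128-512.trt').cuda()
for side in (128, 256, 384, 512):
    x = torch.rand(1, 12, side // 2, side // 2).cuda().half()  # already pixel-unshuffled input
    with torch.no_grad():
        y = model_trt(x)
    print(side, y.shape)  # expect [1, 3, side * 2, side * 2]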
Now benchmark the timing:
import torch
import torch_tensorrt
from test.common import prepare, test, benchmark
# min_side = int(5376 / 16 + 8) # sc
# opt_side = int(6272 / 16 + 8) # z1
# max_side = int(8000 / 16 + 8) # xi_xun
# Dynamic input shape
min_side = 128
opt_side = 256
max_side = 512
side = max_side
model, x = prepare(side)
torch.cuda.empty_cache()
model_trt = torch.jit.load(f'../src/pretrained_models/real_esrgan/torch-tensortrt_fp16_{min_side}-{max_side}.trt')
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("Max error", torch.max(torch.abs(y - y_trt)))
print('*'*100)
torch.cuda.empty_cache()
benchmark(model, x, warm_up=5, runs=50)
print('*'*100)
torch.cuda.empty_cache()
benchmark(model_trt, x, warm_up=5, runs=50)
Result:
torch.Size([1, 3, 512, 512])
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Testing model used 0.43s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.21s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1343, device='cuda:0', dtype=torch.float16)
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 346.04 ms
Iteration 20/50, ave batch time 346.11 ms
Iteration 30/50, ave batch time 347.46 ms
Iteration 40/50, ave batch time 348.45 ms
Iteration 50/50, ave batch time 349.24 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 347.46 ms
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 215.32 ms
Iteration 20/50, ave batch time 213.66 ms
Iteration 30/50, ave batch time 214.31 ms
Iteration 40/50, ave batch time 216.50 ms
Iteration 50/50, ave batch time 217.51 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 215.46 ms
The speedup is about the same as the fixed-size case: 215.46/347.46 = 0.62 vs. 205.08/316.97 = 0.65.
For installing onnx-tensorrt, follow this tutorial; it is already detailed, so I won't repeat it here.
PyTorch -> ONNX
import torch

from test.common import prepare, test, show

side = 512
model, x = prepare(side)
torch.cuda.empty_cache()
with torch.no_grad():
    print('getting onnx...')
    y_onnx = torch.onnx._export(model, x, f'src/onnx_{side}.onnx', opset_version=11, export_params=True)
show(y_onnx)
y = test(x, model, "model")
print("error", torch.max(torch.abs(y - y_onnx)))
ONNX -> TensorRT
Using the onnx2trt command-line tool:
onnx2trt src/onnx_512.onnx -o src/onnx_512.trt -b 1 -d 16
where -d 16 selects FP16 precision.
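If you prefer not to use the onnx2trt binary, the TensorRT 8.x Python API builds an equivalent engine; a sketch using the same paths as above:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open('src/onnx_512.onnx', 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('ONNX parse failed')
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)  # same effect as onnx2trt's -d 16
config.max_workspace_size = 1 << 30    # 1 GiB of scratch space (TensorRT 8.2 API)
engine = builder.build_serialized_network(network, config)
with open('src/onnx_512.trt', 'wb') as f:
    f.write(engine)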
Benchmark the optimization:
import time

import onnx
import numpy as np
import onnx_tensorrt.backend as backend
import tensorrt as trt
import torch

from test.common import prepare, test, show, benchmark

side = 512
model, x = prepare(side)
logger = trt.Logger()
with open(f'src/onnx_{side}.trt', "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
engine = backend.Engine(engine)
input_data = x.cpu().numpy()  # x already has a batch dimension from prepare()
print(input_data.shape)
s = time.time()
y_trt = engine.run(input_data)[0]
print('trt', time.time() - s)
y_trt = torch.from_numpy(np.array(y_trt)).cuda()
show(y_trt)
y = test(x, model, "model")
print("Max error", torch.max(torch.abs(y - y_trt)))
print('*' * 100)
torch.cuda.empty_cache()
benchmark(model, x, warm_up=5, runs=50)
print('*' * 100)
torch.cuda.empty_cache()
benchmark(engine.run, input_data, warm_up=5, runs=50)
Result:
torch.Size([1, 3, 512, 512])
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
WARNING: [Torch-TensorRT] - TensorRT was linked against cuBLAS/cuBLAS LT 11.6.3 but loaded cuBLAS/cuBLAS LT 11.5.1
WARNING: [Torch-TensorRT] - TensorRT was linked against cuDNN 8.2.1 but loaded cuDNN 8.2.0
Testing model used 0.40s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1143, device='cuda:0', dtype=torch.float16)
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 308.90 ms
Iteration 20/50, ave batch time 308.78 ms
Iteration 30/50, ave batch time 309.86 ms
Iteration 40/50, ave batch time 310.72 ms
Iteration 50/50, ave batch time 311.41 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 309.94 ms
****************************************************************************************************
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 194.15 ms
Iteration 20/50, ave batch time 195.11 ms
Iteration 30/50, ave batch time 195.09 ms
Iteration 40/50, ave batch time 195.97 ms
Iteration 50/50, ave batch time 196.25 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 195.31 ms
The speedup is similar to torch2trt and Torch-TensorRT.
After briefly testing all three tools: for PyTorch models, Torch-TensorRT is the most convenient, and it supports multi-resolution input and output.