这里以超分辨率模型 Real-ESRGAN 为例介绍3种方式将PyTorch模型转为TensorRT优化的模型并测试性能。
nvcc --version
或者ls -all /usr/local/
cat /usr/local/cuda/include/cudnn_version.h
vim ~/.bashrc
export PATH=/usr/local/cuda-11.3/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64:$LD_LIBRARY_PATH
source ~/.bashrc
tar -xzvf TensorRT-
TensorRT解压目录下的lib可添加到系统变量里: vim ~/.bashrc
,最后source ~/.bashrc
: 到tensorrt解压目录下,pip install python/tensorrt-
,pip install graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl
如[b, 3, h, w] -> [b, 3*4, h/2, w/2]
import torch
from torch import nn as nn
from torch.nn import functional as F
from basicsr.archs.arch_util import default_init_weights, make_layer
class ResidualDenseBlock(nn.Module):
    """Densely connected block of five 3x3 convolutions with a scaled residual.

    Every convolution after the first receives the block input concatenated
    (along dim 1) with the activated outputs of all earlier convolutions.
    The fifth convolution maps the full concatenation back to ``num_feat``
    channels and has no activation.

    Args:
        num_feat: Channel count of the block input and output.
        num_grow_ch: Channel count produced by each of the first four
            convolutions.
    """

    def __init__(self, num_feat=64, num_grow_ch=32):
        super().__init__()
        # conv1..conv4 each see `num_feat` plus everything grown so far.
        for idx in range(4):
            in_ch = num_feat + idx * num_grow_ch
            setattr(self, f'conv{idx + 1}',
                    nn.Conv2d(in_ch, num_grow_ch, kernel_size=3, stride=1, padding=1))
        self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat,
                               kernel_size=3, stride=1, padding=1)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

        # Initialization; 0.1 is handed to the helper as-is
        # (presumably a weight scale -- confirm in basicsr.archs.arch_util).
        convs = [self.conv1, self.conv2, self.conv3, self.conv4, self.conv5]
        default_init_weights(convs, 0.1)

    def forward(self, x):
        """Return ``conv5(dense features) * 0.2 + x``."""
        dense = [x, self.lrelu(self.conv1(x))]
        for conv in (self.conv2, self.conv3, self.conv4):
            dense.append(self.lrelu(conv(torch.cat(dense, 1))))
        fused = self.conv5(torch.cat(dense, 1))
        # Empirically, the residual is scaled by 0.2 for better performance.
        return fused * 0.2 + x
class RRDB(nn.Module):
    """Three chained ``ResidualDenseBlock`` modules with a scaled residual.

    Args:
        num_feat: Channel count of the input and output feature maps.
        num_grow_ch: Growth channels forwarded to each dense block.
    """

    def __init__(self, num_feat, num_grow_ch=32):
        super().__init__()
        for attr in ('rdb1', 'rdb2', 'rdb3'):
            setattr(self, attr, ResidualDenseBlock(num_feat, num_grow_ch))

    def forward(self, x):
        """Run the three dense blocks in sequence and add the input back."""
        out = x
        for block in (self.rdb1, self.rdb2, self.rdb3):
            out = block(out)
        # Empirically, the residual is scaled by 0.2 for better performance.
        return out * 0.2 + x
class RRDBNet(nn.Module):
    """RRDB-based network operating on a 2x pixel-unshuffled input.

    The first convolution expects ``num_in_ch * 4`` input channels, i.e. an
    image whose 2x2 spatial neighbourhoods were folded into channels
    (e.g. ``[b, 3, h, w] -> [b, 12, h/2, w/2]``). Two nearest-neighbour 2x
    upsampling stages follow the trunk, so the output is 4x the spatial size
    of the tensor passed to ``forward``.

    Args:
        num_in_ch: Channel count of the image *before* pixel-unshuffle.
        num_out_ch: Channel count of the output image.
        num_feat: Channel count of the intermediate feature maps.
        num_block: Number of RRDB blocks requested from ``make_layer``.
        num_grow_ch: Growth channels inside every RRDB.
    """

    def __init__(self, num_in_ch, num_out_ch, num_feat=64, num_block=23, num_grow_ch=32):
        super().__init__()
        unshuffled_ch = num_in_ch * 4  # 2x2 pixel-unshuffle multiplies channels by 4

        def conv3x3(in_ch, out_ch):
            return nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1)

        self.conv_first = conv3x3(unshuffled_ch, num_feat)
        self.body = make_layer(RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch)
        self.conv_body = conv3x3(num_feat, num_feat)
        self.conv_up1 = conv3x3(num_feat, num_feat)
        self.conv_up2 = conv3x3(num_feat, num_feat)
        self.conv_hr = conv3x3(num_feat, num_feat)
        self.conv_last = conv3x3(num_feat, num_out_ch)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        """Map a pixel-unshuffled tensor to the upscaled output image."""
        shallow = self.conv_first(x)
        # Long skip connection around the RRDB trunk.
        feat = shallow + self.conv_body(self.body(shallow))
        # Two nearest-neighbour 2x upsampling stages.
        for conv_up in (self.conv_up1, self.conv_up2):
            upsampled = F.interpolate(feat, scale_factor=2, mode='nearest')
            feat = self.lrelu(conv_up(upsampled))
        return self.conv_last(self.lrelu(self.conv_hr(feat)))
import time
import torch
import cv2
import numpy as np
from test.rrdb_net import RRDBNet
import matplotlib.pyplot as plt
def get_sr_model():
    """Build the RRDBNet, load the x2plus checkpoint and move it to CUDA in FP16.

    Returns:
        The network with its ``params_ema`` weights loaded (strict key
        matching), placed on the ``'cuda'`` device and converted to half
        precision. The module is not switched to eval mode here.
    """
    net = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32)
    checkpoint = torch.load('src/pretrained_models/real_esrgan/RealESRGAN_x2plus.pth')
    # The checkpoint stores the weights under the 'params_ema' key.
    net.load_state_dict(checkpoint['params_ema'], strict=True)
    net = net.to('cuda')
    return net.half()
def show(x: torch.Tensor):
    """Display the first image of a batch with matplotlib.

    The original body built the displayable array and then discarded it, so
    calling ``show`` had no visible effect. The array is now passed to
    ``plt.imshow`` / ``plt.show`` (``plt`` is the module-level
    ``matplotlib.pyplot`` import).

    Args:
        x: Batch tensor; ``x[0]`` must be 3-D and is transposed from
            (dim0, dim1, dim2) to (dim1, dim2, dim0) -- presumably
            channels-first to channels-last; confirm against callers.
            Values are clipped to [0, 1] before display.
    """
    # float32 copy on the host: imshow cannot consume CUDA or half tensors.
    img = x[0].clip(0, 1).cpu().detach().numpy().transpose(1, 2, 0).astype(np.float32)
    plt.imshow(img)
    plt.show()
def benchmark(model, x, warm_up=2, runs=10):
    """Time repeated calls of ``model(x)`` and print per-batch latency.

    Improvements over the previous version:
      * The CUDA device is synchronized before each clock read. CUDA kernels
        are launched asynchronously, so without a barrier the timer can stop
        before the work has finished.
      * ``time.perf_counter`` (monotonic, high resolution) replaces
        ``time.time``.
      * Callables that return a list/tuple of outputs (no ``.shape``
        attribute) no longer crash the final report.
      * ``warm_up=0, runs=0`` no longer raises ``NameError``.

    Args:
        model: Any callable taking ``x``.
        x: Input passed unchanged to ``model``; must expose ``.shape``.
        warm_up: Number of untimed calls made first.
        runs: Number of timed calls.
    """
    def _sync():
        # Wait for queued GPU work so wall-clock time covers the full call.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    features = None
    print("Warm up ...")
    with torch.no_grad():
        for _ in range(warm_up):
            features = model(x)
    _sync()

    print("Start timing ...")
    timings = []
    with torch.no_grad():
        for i in range(1, runs + 1):
            start_time = time.perf_counter()
            features = model(x)
            _sync()
            timings.append(time.perf_counter() - start_time)
            if i % 10 == 0:
                # Mean over the most recent ten iterations.
                print('Iteration %d/%d, ave batch time %.2f ms' % (i, runs, np.mean(timings[i-10: i]) * 1000))

    out_shape = getattr(features, "shape", None)
    if out_shape is None and isinstance(features, (list, tuple)):
        out_shape = [getattr(item, "shape", None) for item in features]

    print("Input shape:", x.shape)
    print("Output features size:", out_shape)
    if timings:
        print('Average batch time: %.2f ms' % (np.mean(timings) * 1000))
def test(x, model, name):
s = time.time()
with torch.no_grad():
y = model(x)
print(name, time.time() - s)
return y
def prepare(side=512, image_path='../src/inputs/sr/0014.jpeg'):
    """Load the SR model and build a pixel-unshuffled FP16 example input.

    The image is read with OpenCV, resized to ``side x side``, reordered from
    BGR to RGB, scaled to [0, 1], moved to CUDA as half precision, and then
    2x pixel-unshuffled: ``[1, c, side, side] -> [1, c * 4, side/2, side/2]``.

    Args:
        side: Target side length in pixels; must be even so the 2x
            pixel-unshuffle divides exactly.
        image_path: Image file to read. Defaults to the sample image the
            function previously hard-coded.

    Returns:
        ``(model, x)``: the model in eval mode and the prepared input tensor.

    Raises:
        ValueError: If ``side`` is odd.
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    # Fail fast on bad arguments before the expensive model load.
    if side % 2:
        raise ValueError(f"side must be even for 2x pixel-unshuffle, got {side}")
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"cannot read image: {image_path}")

    model = get_sr_model().eval()

    # create example data: BGR -> RGB, HWC -> CHW, [0, 255] -> [0, 1]
    x = cv2.resize(img, (side, side))[..., ::-1].transpose(2, 0, 1) / 255.0
    x = torch.from_numpy(x).cuda().half().unsqueeze(0)

    # 2x pixel-unshuffle: fold every 2x2 spatial patch into the channel axis.
    b, c, h, w = x.size()
    h = h // 2
    w = w // 2
    x = x.view(b, c, h, 2, w, 2).permute(0, 1, 3, 5, 2, 4).reshape(b, c * 4, h, w)
    return model, x
torch2trt 目前只能固定一种尺寸的输入,可以参考这个
torch2trt_dynamic 实现多尺寸。
项目地址: https://github.com/NVIDIA-AI-IOT/torch2trt
文档地址: https://nvidia-ai-iot.github.io/torch2trt/v0.3.0/
由于在最新的torch2trt 0.3
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install
git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install --plugins
报错: NvInfer.h: No such file or directory
'your/path/TensorRT-' # add include directories
'your/path/TensorRT-' # add link directories
报错: error: invalid new-expression of abstract class type ‘torch2trt::GroupNormPlugin’
使用torch2trt文档中的示例 测试:
import torch
from torch2trt import torch2trt
from torchvision.models.alexnet import alexnet
# Pretrained AlexNet in eval mode on the GPU.
net = alexnet(pretrained=True).eval().cuda()

# Example input of ones with shape (1, 3, 224, 224) on the GPU.
sample = torch.ones((1, 3, 224, 224)).cuda()

# Convert to TensorRT, feeding the sample as the example input.
net_trt = torch2trt(net, [sample])

# Run both versions on the same input ...
ref_out = net(sample)
trt_out = net_trt(sample)

# ... and print the largest absolute difference between them.
print((ref_out - trt_out).abs().max())
tensor(1.0729e-06, device='cuda:0', grad_fn=<MaxBackward1>)
import torch
from torch2trt import torch2trt
from test.common import prepare, test
# Side length of the square test image. 512 matches the recorded console
# output for this step (input torch.Size([1, 3, 512, 512])) and the `side`
# used by the follow-up loading snippet; it was previously 1024.
side = 512
model, x = prepare(side)

with torch.no_grad():
    print('converting trt...')
    # Build a fixed-shape FP16 TensorRT module from the example input.
    model_trt = torch2trt(model, [x], fp16_mode=True)

# Persist the converted module so it can be reloaded without rebuilding.
torch.save(model_trt.state_dict(), f't2trt_fp16_{side}.trt')

# Compare the PyTorch and TensorRT outputs on the same input.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("error", torch.max(torch.abs(y - y_trt)))
torch.Size([1, 3, 512, 512])
Converting trt...
Converted! used time 239.82s
Testing model used 0.38s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.20s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1553, device='cuda:0', dtype=torch.float16)
import torch
from torch2trt import TRTModule
from test.common import prepare, test, benchmark
side = 512
model, x = prepare(side)

# A freshly constructed TRTModule holds no engine; restore the state dict
# written by the conversion step before running inference. (This load call
# was missing, so model_trt could not be evaluated.)
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(f't2trt_fp16_{side}.trt'))

# Single-pass comparison of PyTorch vs. TensorRT output.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print("Max error", torch.max(torch.abs(y - y_trt)))

# Latency benchmark of both models on the same input.
benchmark(model, x, warm_up=5, runs=50)
benchmark(model_trt, x, warm_up=5, runs=50)
torch.Size([1, 3, 512, 512])
Testing model used 0.40s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1553, device='cuda:0', dtype=torch.float16)
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 314.55 ms
Iteration 20/50, ave batch time 314.76 ms
Iteration 30/50, ave batch time 315.70 ms
Iteration 40/50, ave batch time 316.29 ms
Iteration 50/50, ave batch time 317.29 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 315.72 ms
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 200.73 ms
Iteration 20/50, ave batch time 201.76 ms
Iteration 30/50, ave batch time 202.16 ms
Iteration 40/50, ave batch time 202.45 ms
Iteration 50/50, ave batch time 200.82 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 201.59 ms
python直接使用pip安装Torch-TensorRT 。
pip install torch-tensorrt -f https://github.com/NVIDIA/Torch-TensorRT/releases
Torch-TensorRT 的文档比较详细,其中包括c++部署。Torch-TensorRT 支持动态输入尺寸,需要设置min_shape、opt_shape、max_shape。
import time
import torch_tensorrt
import torch
from test.common import prepare, test
side = 512
model, x = prepare(side)

# The settings literal was syntactically incomplete (the Input(...) wrapper
# and closing brackets were missing); it is reconstructed here.
compile_settings = {
    "inputs": [
        torch_tensorrt.Input(
            # Fixed input shape: prepare() pixel-unshuffles the image to
            # [1, 12, side/2, side/2].
            (1, 12, side // 2, side // 2),
            # prepare() returns a half-precision tensor, so declare FP16.
            dtype=torch.half,
        )
    ],
    "truncate_long_and_double": True,
    "enabled_precisions": {torch.half},  # Run with FP16
}

with torch.no_grad():
    # Torch-TensorRT compiles TorchScript, so trace the eager model first.
    traced_model = torch.jit.trace(model, x)
    print('Converting trt...')
    s = time.time()
    model_trt = torch_tensorrt.compile(traced_model, **compile_settings)
    print(f'Converted! used time {time.time() - s:.2f}s')

print("Saving ...")
torch.jit.save(model_trt, f'src/torch-tensortrt_fp16_{side}.trt')

# Compare the PyTorch and TensorRT outputs on the same input.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print(f"Max error: {torch.max(torch.abs(y - y_trt))}")
torch.Size([1, 3, 512, 512])
Converting trt...
Converted! used time 198.42s
Saving ...
Testing model used 0.38s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error: 0.1142578125
import torch
import torch_tensorrt
from test.common import prepare, test, benchmark
side = 512
model, x = prepare(side)

# Reload the compiled TorchScript module saved by the conversion step.
engine_path = f'src/torch-tensortrt_fp16_{side}.trt'
model_trt = torch.jit.load(engine_path)

# One forward pass through each model, then the largest absolute difference.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
max_abs_diff = torch.max(torch.abs(y - y_trt))
print("Max error", max_abs_diff)

# Latency benchmark: PyTorch first, then the TensorRT module.
for candidate in (model, model_trt):
    benchmark(candidate, x, warm_up=5, runs=50)
torch.Size([1, 3, 512, 512])
Testing model used 0.46s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.25s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1143, device='cuda:0', dtype=torch.float16)
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 315.06 ms
Iteration 20/50, ave batch time 315.99 ms
Iteration 30/50, ave batch time 316.72 ms
Iteration 40/50, ave batch time 318.04 ms
Iteration 50/50, ave batch time 319.05 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 316.97 ms
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 203.72 ms
Iteration 20/50, ave batch time 204.43 ms
Iteration 30/50, ave batch time 205.57 ms
Iteration 40/50, ave batch time 204.77 ms
Iteration 50/50, ave batch time 206.92 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 205.08 ms
import time
import torch_tensorrt
import torch
from test.common import prepare, test
# Dynamic input shape: image side lengths covered by the engine.
min_side = 128
opt_side = 256
max_side = 512
side = opt_side
model, x = prepare(side)

# The settings literal was syntactically incomplete (the Input(...) wrapper
# and closing brackets were missing); it is reconstructed here.
compile_settings = {
    "inputs": [
        torch_tensorrt.Input(
            # Shapes are halved because prepare() pixel-unshuffles the image
            # to [1, 12, side/2, side/2].
            min_shape=[1, 12, min_side//2, min_side//2],
            opt_shape=[1, 12, opt_side//2, opt_side//2],
            max_shape=[1, 12, max_side//2, max_side//2],
            # prepare() returns a half-precision tensor, so declare FP16.
            dtype=torch.half,
        )
    ],
    "truncate_long_and_double": True,
    "enabled_precisions": {torch.half},  # Run with FP16
}

with torch.no_grad():
    # Torch-TensorRT compiles TorchScript, so trace the eager model first.
    traced_model = torch.jit.trace(model, x)
    print('Converting trt...')
    s = time.time()
    model_trt = torch_tensorrt.compile(traced_model, **compile_settings)
    print(f'Converted! used time {time.time() - s:.2f}s')

print("Saving ...")
torch.jit.save(model_trt, f'src/torch-tensortrt_fp16_{min_side}-{max_side}.trt')

# Compare the PyTorch and TensorRT outputs on the same input.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
print(f"Max error: {torch.max(torch.abs(y - y_trt))}")
ssh://root@ -u /root/projects/imageenhance/test/torchrt.py
torch.Size([1, 3, 256, 256])
Converting trt...
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
WARNING: [Torch-TensorRT] - Truncating weight (constant in the graph) from Float64 to Float32
Converted! used time 160.27s
Saving ...
Testing model used 0.13s
torch.Size([1, 3, 512, 512])
Testing model_trt used 0.08s
torch.Size([1, 3, 512, 512])
Max error: 0.05126953125
import torch
import torch_tensorrt
from test.common import prepare, test, benchmark
# Alternative side lengths kept from the original for reference:
# min_side = int(5376 / 16 + 8) # sc
# opt_side = int(6272 / 16 + 8) # z1
# max_side = int(8000 / 16 + 8) # xi_xun

# Dynamic input shape: range of image side lengths the engine was built for.
min_side, opt_side, max_side = 128, 256, 512

# Evaluate at the largest supported size.
side = max_side
model, x = prepare(side)

# Reload the dynamic-shape TorchScript module.
engine_path = f'../src/pretrained_models/real_esrgan/torch-tensortrt_fp16_{min_side}-{max_side}.trt'
model_trt = torch.jit.load(engine_path)

# One forward pass through each model, then the largest absolute difference.
y = test(x, model, "model")
y_trt = test(x, model_trt, "model_trt")
max_abs_diff = torch.max(torch.abs(y - y_trt))
print("Max error", max_abs_diff)

# Latency benchmark: PyTorch first, then the TensorRT module.
for candidate in (model, model_trt):
    benchmark(candidate, x, warm_up=5, runs=50)
torch.Size([1, 3, 512, 512])
Testing model used 0.43s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.21s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1343, device='cuda:0', dtype=torch.float16)
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 346.04 ms
Iteration 20/50, ave batch time 346.11 ms
Iteration 30/50, ave batch time 347.46 ms
Iteration 40/50, ave batch time 348.45 ms
Iteration 50/50, ave batch time 349.24 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 347.46 ms
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 215.32 ms
Iteration 20/50, ave batch time 213.66 ms
Iteration 30/50, ave batch time 214.31 ms
Iteration 40/50, ave batch time 216.50 ms
Iteration 50/50, ave batch time 217.51 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 215.46 ms
加速效果和固定尺寸差不多: 215.46/347.46=0.62 205.08/316.97=0.65。
安装步骤参考这个教程 ,已经很详细了,不再赘述。
PyTorch -> ONNX
import torch
from test.common import prepare, test, show
side = 512
model, x = prepare(side)
onnx_path = f'src/onnx_{side}.onnx'

with torch.no_grad():
    print('getting onnx...')
    # NOTE(review): torch.onnx._export is a private API; its return value is
    # used below as a reference output -- presumably the traced PyTorch
    # result, confirm for the installed torch version.
    y_onnx = torch.onnx._export(
        model,
        x,
        onnx_path,
        opset_version=11,
        export_params=True,
    )

# Compare a fresh forward pass against the value returned by the exporter.
y = test(x, model, "model")
max_abs_diff = torch.max(torch.abs(y - y_onnx))
print("error", max_abs_diff)
ONNX -> TensorRT
onnx2trt src/onnx_512.onnx -o src/onnx_512.trt -b 1 -d 16
其中 -d 16 表示使用16位(FP16)精度。
import time
import onnx
import torch
import onnx_tensorrt.backend as backend
import numpy as np
import tensorrt as trt
from test.common import prepare, test, show, benchmark
side = 512
model, x = prepare(side)

# Deserialize the TensorRT engine file produced by onnx2trt.
trt_logger = trt.Logger()
engine_path = f'src/onnx_{side}.trt'
with open(engine_path, "rb") as engine_file, trt.Runtime(trt_logger) as runtime:
    cuda_engine = runtime.deserialize_cuda_engine(engine_file.read())

# Wrap the raw engine in the onnx_tensorrt backend helper, which exposes run().
engine = backend.Engine(cuda_engine)

# Host-side NumPy copy of the input with one extra leading axis.
# NOTE(review): x already has a batch dimension, so this array is 5-D --
# confirm that the engine expects this layout.
input_data = x[None].cpu().numpy()

# Time a single TensorRT inference and keep its first output.
t0 = time.time()
first_output = engine.run(input_data)[0]
print('trt', time.time() - t0)
y_trt = torch.from_numpy(np.array(first_output)).cuda()

# Reference PyTorch pass and largest absolute difference.
y = test(x, model, "model")
print("Max error", torch.max(torch.abs(y - y_trt)))

# Latency benchmark: PyTorch model, then the TensorRT engine.
benchmark(model, x, warm_up=5, runs=50)
benchmark(engine.run, input_data, warm_up=5, runs=50)
torch.Size([1, 3, 512, 512])
Testing model used 0.40s
torch.Size([1, 3, 1024, 1024])
Testing model_trt used 0.19s
torch.Size([1, 3, 1024, 1024])
Max error tensor(0.1143, device='cuda:0', dtype=torch.float16)
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 308.90 ms
Iteration 20/50, ave batch time 308.78 ms
Iteration 30/50, ave batch time 309.86 ms
Iteration 40/50, ave batch time 310.72 ms
Iteration 50/50, ave batch time 311.41 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 309.94 ms
Warm up ...
Start timing ...
Iteration 10/50, ave batch time 194.15 ms
Iteration 20/50, ave batch time 195.11 ms
Iteration 30/50, ave batch time 195.09 ms
Iteration 40/50, ave batch time 195.97 ms
Iteration 50/50, ave batch time 196.25 ms
Input shape: torch.Size([1, 12, 256, 256])
Output features size: torch.Size([1, 3, 1024, 1024])
Average batch time: 195.31 ms
简单测试使用3种工具,发现对于PyTorch使用Torch-TensorRT 是最方便快捷的,支持多分辨率的输入输出。