TensorRT acceleration for PyTorch models: .pth -> ONNX -> TRT, then benchmarking inference speed with the TRT model

First, two packages need to be installed: tensorrt and torch2trt. For TensorRT, download the tar archive from the official NVIDIA site (the tar install is recommended); I used version 7.2.3. torch2trt can be cloned from GitHub.
My environment (TensorRT currently seems to work only on Linux; I never got it running on Windows 10, as the tar archive contains no matching whl files):
Ubuntu 18.04
Python 3.8
PyTorch 1.8 + CUDA 11
TensorRT 7.2.3
pycuda 2021

Extract and install TensorRT

tar -zxvf TensorRT-7.2.3.4.Ubuntu-18.04.x86_64-gnu.cuda-11.1.cudnn8.1.tar.gz
cd TensorRT-7.2.3.4
# This directory contains four subdirectories (python, uff, graphsurgeon, onnx_graphsurgeon);
# enter each one and install the corresponding whl file
pip install xxxxx.whl
# Configure the environment variables
vim /etc/profile
# Append the lines below, replacing `pwd` with the absolute path of the directory that
# contains TensorRT-7.2.3.4. Note that the lib folder may be a symlink; I hit an error
# because of this, so use the real path of lib
export TRT_RELEASE=`pwd`/TensorRT-7.2.3.4
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$TRT_RELEASE/lib

# Reload the profile so the change takes effect immediately
source /etc/profile

Install torch2trt

git clone https://github.com/NVIDIA-AI-IOT/torch2trt
cd torch2trt
python setup.py install
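
With both packages installed, a quick sanity check saves debugging time later (a minimal sketch that only verifies the imports and the CUDA setup):

import tensorrt as trt
import torch
from torch2trt import torch2trt   # just checking that the import works

print(trt.__version__)            # expect 7.2.3.x
print(torch.cuda.is_available())  # expect True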

Once both are installed we can convert a model. First we need a .pth file, i.e. a model trained with PyTorch. I trained and tested on the MNIST dataset; the training code is below.

import torch
import torch.nn as nn
from torch import optim
from torch.nn import Module
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

batch_size = 64
learning_rate = 1e-2
num_epochs = 10

# Load the MNIST dataset and build the train/test iterators
data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# CNN network
class SimpleCnn(Module):
    def __init__(self):
        super(SimpleCnn, self).__init__()   # input: (b, 1, 28, 28) for MNIST
        self.layer1 = nn.Sequential()
        self.layer1.add_module('conv1', nn.Conv2d(1, 32, 3, 1, padding=1))
        self.layer1.add_module('relu', nn.ReLU(True))
        self.layer1.add_module('pool', nn.MaxPool2d(2,2))
        
        self.layer2 = nn.Sequential()
        self.layer2.add_module('conv2', nn.Conv2d(32, 64, 3, 1, padding=1))
        self.layer2.add_module('relu', nn.ReLU(True))
        self.layer2.add_module('pool', nn.MaxPool2d(2,2))
        
        self.layer3 = nn.Sequential()
        self.layer3.add_module('fc1', nn.Linear(3136, 64))
        self.layer3.add_module('relu', nn.ReLU(True))
        self.layer3.add_module('fc_out', nn.Linear(64, 10))
        
    def forward(self, x):
        conv1 = self.layer1(x)
        conv2 = self.layer2(conv1)
        fc_input = conv2.view(conv2.size(0), -1)
        fc_out = self.layer3(fc_input)
        return fc_out
# Loss function and optimizer
model_cnn = SimpleCnn()
model_cnn = model_cnn.cuda()
criterion_cnn = nn.CrossEntropyLoss()
optimizer_cnn = optim.SGD(model_cnn.parameters(), lr=learning_rate)
# train
for epoch in range(num_epochs):
    acc = 0
    loss = 0
    i = 0
    for data in train_loader:
        img, label = data
        x_train = img.cuda()    # Variable() is deprecated; tensors can be moved directly
        y_train = label.cuda()
        out = model_cnn(x_train)
        loss_batch = criterion_cnn(out, y_train)
        loss += loss_batch.item()   # .item() detaches the scalar, avoiding graph buildup
        _, pred = torch.max(out, 1)
        num_correct = (pred == y_train).sum()
        acc += num_correct.item()
        optimizer_cnn.zero_grad()
        loss_batch.backward()   # backpropagate the batch loss
        optimizer_cnn.step()    # update parameters
        if i % 10 == 0:
            print("Train Loss: {:.6f}, Acc: {:.6f}".format(loss / len(train_dataset), acc / len(train_dataset)))
        i += 1
# Final training accuracy is about 0.98
Train Loss: 0.000756, Acc: 0.985350
# test
model_cnn.eval()   # switch to evaluation mode
eval_loss = 0
eval_acc = 0
i = 0
with torch.no_grad():   # volatile=True was removed from PyTorch; no_grad() is the replacement
    for data in test_loader:
        img, label = data
        img = img.cuda()      # the model lives on the GPU, so the inputs must too
        label = label.cuda()
        out = model_cnn(img)
        loss = criterion_cnn(out, label)
        eval_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)
        num_correct = (pred == label).sum()
        eval_acc += num_correct.item()
        if i % 10 == 0:
            print("Test Loss: {:.6f}, Acc: {:.6f}".format(eval_loss / len(test_dataset), eval_acc / len(test_dataset)))
        i += 1
# Test accuracy is about 0.98
Test Loss: 0.043465, Acc: 0.985400
# Save the whole model as a .pth file
torch.save(model_cnn, 'torch_mnist.pth')
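
A side note: torch.save(model_cnn, ...) pickles the entire model object, so the SimpleCnn class must be importable wherever torch.load is later called. Saving only the state_dict is the more portable alternative; a minimal sketch:

# Alternative: save only the weights and rebuild the architecture at load time
torch.save(model_cnn.state_dict(), 'torch_mnist_state.pth')

# later, in a script that defines or imports SimpleCnn:
model = SimpleCnn()
model.load_state_dict(torch.load('torch_mnist_state.pth'))
model.eval()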

Now the .pth file is in place. There are two ways to turn a .pth file into a .trt file. One is the torch2trt method from the TensorRT GitHub; I tried it, failed, and ran into a lot of problems, so we will use the second method: the .pth -> .onnx -> .trt conversion.
Let's start with the .pth -> .onnx step.

# First install onnx
pip install onnx

Run the conversion:

import torch

batch_size = 1    # batch_size is 1 here; training used 64, but the export fixes the inference batch size
model_path = './torch_mnist.pth'

# SimpleCnn must be importable here, because torch.save pickled the whole model object
dummy_input = torch.randn(batch_size, 1, 28, 28, device='cuda')
model = torch.load(model_path)
torch.onnx.export(model, dummy_input, "torch_mnist.onnx", verbose=False)

A couple of points to explain here. Since batch_size is 1, later inference must also run with batch size 1. The export also needs an example input, but only so the exporter can trace the model and record the input shape; a random tensor is fine as long as its shape matches the training data. If a fixed batch size of 1 is too restrictive, see the dynamic-axes sketch below.
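
If you would rather not bake the batch dimension into the ONNX file, torch.onnx.export accepts dynamic_axes; here is a sketch (the tensor names "input" and "output" are my own labels, not from the original export):

torch.onnx.export(
    model, dummy_input, "torch_mnist_dynamic.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},   # leave batch symbolic
    verbose=False,
)

Note that trtexec then needs explicit shape ranges when building the engine, e.g. --minShapes=input:1x1x28x28 --optShapes=input:1x1x28x28 --maxShapes=input:64x1x28x28.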
When the script finishes, a .onnx file appears in the working directory, which means this step succeeded. Before converting further, it is worth validating the exported graph, as sketched below.
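
A quick structural check with the onnx package (a minimal sketch):

import onnx

onnx_model = onnx.load("torch_mnist.onnx")
onnx.checker.check_model(onnx_model)   # raises an exception if the graph is malformed
print(onnx.helper.printable_graph(onnx_model.graph))   # human-readable graph dump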
Next step: .onnx -> .trt.
The bin directory of the extracted TensorRT archive contains a trtexec binary, which can do the conversion directly:

./trtexec --onnx=torch_mnist.onnx --saveEngine=torch_mnist.trt
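
If the GPU has fast FP16, trtexec can also build a half-precision engine for an extra speedup (--fp16 is a standard trtexec flag; the output filename here is my own choice):

./trtexec --onnx=torch_mnist.onnx --saveEngine=torch_mnist_fp16.trt --fp16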

When the command finishes, a .trt file is generated; if trtexec errors out, try the onnx-tensorrt module. If you would rather stay in Python, the engine can also be built with TensorRT's own API, as sketched below.
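
An alternative to trtexec: build the engine from Python with TensorRT's builder and ONNX parser. This is a sketch written against the TensorRT 7 Python API (the function name and file paths are my own):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path, engine_path):
    # ONNX models require explicit-batch mode in TensorRT 7
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(flag) as network, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        with open(onnx_path, 'rb') as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError('ONNX parse failed')
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 28   # 256 MiB of builder scratch space
        engine = builder.build_engine(network, config)
        with open(engine_path, 'wb') as f:
            f.write(engine.serialize())

build_engine('torch_mnist.onnx', 'torch_mnist.trt')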
At this point the model conversion is done. All that is left is to verify whether the model actually got faster and to see how to load it. On to the code!

import torch
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit   # initializes the CUDA context as a side effect
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import numpy as np


batch_size = 1    # must match the batch size used when exporting/converting the model


trt_model_name = "./torch_mnist.trt"


# Move data through the device buffers and run the engine; this function is generic
def infer(context, input_img, output_size, batch_size):
    # Convert the input to float32; skipping this conversion causes a pile of errors
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory (input_img.nbytes already covers the whole batch, so the
    # batch_size factor only over-allocates; harmless at batch size 1)
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute the model (if the engine was built in explicit-batch mode and this call
    # fails, context.execute_async_v2(bindings, stream.handle) is the replacement)
    context.execute_async(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    stream.synchronize()

    # Return predictions
    return output


# Run the whole test set through the engine
def do_test(context):
    # Load the MNIST test set (the training set isn't needed here)
    data_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
    test_dataset = datasets.MNIST(root='./data', train=False, transform=data_tf, download=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    print("mnist data load successful!!!")
    accuracy = 0
    start_time = time.time()
    for data in test_loader:    # start testing
        img, label = data
        img = img.numpy()    # the engine consumes numpy arrays, not torch.Tensor
        output = infer(context, img, 10, 1)
        # print(output)
        conf, pred = torch.max(torch.Tensor(output), -1)
        num_count = (pred == label).sum()
        accuracy += num_count.item()

        print("Test Acc is {:.6f}".format(accuracy / len(test_dataset)))

    return accuracy / len(test_dataset), time.time() - start_time


def trt_infer():
    # Load the .trt file
    def loadEngine2TensorRT(filepath):
        G_LOGGER = trt.Logger(trt.Logger.WARNING)
        # Deserialize the engine
        with open(filepath, "rb") as f, trt.Runtime(G_LOGGER) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
            return engine

    engine = loadEngine2TensorRT(trt_model_name)
    # Create an execution context
    context = engine.create_execution_context()

    print("Start TensorRT Test...")
    acc, times = do_test(context)
    # note: the original print said "INT8", but trtexec builds an FP32 engine by default
    print('TensorRT acc: {}, need time: {}'.format(acc, times))


if __name__ == '__main__':

    trt_infer()
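
One caveat about infer() above: it allocates fresh device buffers on every call, which inflates the measured time. For a fairer benchmark the buffers can be allocated once and reused; a minimal sketch (the TRTInferencer class is my own illustration, not part of the original script):

class TRTInferencer:
    """Allocate device buffers once and reuse them for every batch."""
    def __init__(self, context, input_nbytes, output_size):
        self.context = context
        self.output = np.empty(output_size, dtype=np.float32)
        self.d_input = cuda.mem_alloc(input_nbytes)
        self.d_output = cuda.mem_alloc(self.output.nbytes)
        self.bindings = [int(self.d_input), int(self.d_output)]
        self.stream = cuda.Stream()

    def __call__(self, input_img):
        img = np.ascontiguousarray(input_img, dtype=np.float32)
        cuda.memcpy_htod_async(self.d_input, img, self.stream)
        self.context.execute_async(1, self.bindings, self.stream.handle, None)
        cuda.memcpy_dtoh_async(self.output, self.d_output, self.stream)
        self.stream.synchronize()
        return self.output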

Then just run it and wait for the results.
My results (over the 10,000 MNIST test images):
TRT model result:

TensorRT acc: 0.9855999946594238, need time: 4.600905656814575

pth model result:

Test Acc is 0.985600
run success! time is 20.035008
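
For reference, the .pth baseline above can be timed roughly like this (a sketch; the original post does not show its timing code, and this assumes the test_loader and test_dataset built in the training script are in scope):

import time
import torch

model = torch.load('torch_mnist.pth').cuda().eval()
start = time.time()
correct = 0
with torch.no_grad():
    for img, label in test_loader:
        out = model(img.cuda())
        correct += (out.argmax(1).cpu() == label).sum().item()
print("Test Acc is {:.6f}".format(correct / len(test_dataset)))
print("run success! time is {:.6f}".format(time.time() - start))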

The results show the TRT model is roughly 4 to 5 times faster (about 4.6 s versus 20 s). Not bad at all!
