Here is my own code for converting an ONNX model to TensorRT. A few notes:

For an engine with multiple inputs, each input buffer is assigned separately:

inputs[0].host = data[0]
inputs[1].host = data[1]
inputs[2].host = data[2]
inputs[3].host = data[3]

A quick alternative way to obtain the engine file is trtexec, which ships with TensorRT; see:
https://zhuanlan.zhihu.com/p/158199822
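For reference, a command along these lines builds the engine (a sketch; exact flag spellings vary across TensorRT versions — --explicitBatch and --workspace below are the 7.x trtexec forms):

trtexec --onnx=./ConVNet.onnx --saveEngine=./ConVNet.engine --explicitBatch --workspace=1024
trtexec --onnx=./ConVNet.onnx --saveEngine=./ConVNet.engine --explicitBatch --fp16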
# -*- coding: utf-8 -*-
import os
import random
import time

import h5py
import numpy as np
import pycuda.autoinit  # initializes the CUDA context
import pycuda.driver as cuda
import tensorrt as trt
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset

import model1  # local module that defines ConVNet
class NTUData(Dataset):
    def __init__(self, cv_path, transform=None):
        # Initialize the file path and the list of dataset names.
        self.cv_path = cv_path
        self.transform = transform
        with h5py.File(self.cv_path, 'r') as file:
            self.group_name_list = [name for name in file]  # data names in the h5 dataset
            # shape[1] of each group is the frame count of that skeleton sequence
            self.num_per_frame = None
            self.data_len = len(file.keys())  # number of skeleton sequences

    def __getitem__(self, index):
        # 1. Read one sample from the file.
        # 2. Preprocess it.
        # 3. Return a data/label pair.
        # Note that step 1 reads exactly one sample.
        index_30 = self.get_frame30_index_list(index)
        frame30_body_0, frame30_body_1, label_action = self.get_frame30_data(index, index_30)  # segment data and label
        diff_body_0, diff_body_1 = self.get_diff_data(frame30_body_0, frame30_body_1)  # skeleton motion stream
        frame30_body_0 = Variable(torch.FloatTensor(frame30_body_0))
        frame30_body_1 = Variable(torch.FloatTensor(frame30_body_1))
        diff_body_0 = Variable(torch.FloatTensor(diff_body_0))
        diff_body_1 = Variable(torch.FloatTensor(diff_body_1))
        # the stored label is one-hot; convert it to a class index
        label_action = torch.LongTensor(label_action).squeeze().numpy().tolist().index(1)
        return frame30_body_0, frame30_body_1, diff_body_0, diff_body_1, label_action

    def __len__(self):
        return self.data_len
    def get_frame30_index_list(self, index):
        with h5py.File(self.cv_path, 'r') as file:
            self.num_per_frame = file[self.group_name_list[index]].shape[1]  # total frame count of this skeleton sequence
            # 31 boundary points give 30 sub-intervals
            random_list = np.linspace(0, self.num_per_frame, 31, dtype=int)
            # the -1 keeps each draw inside its own sub-interval (random.randint is
            # inclusive on both ends), so no frame is sampled twice and the
            # out-of-range index num_per_frame itself is never drawn
            frame30_list = [random.randint(random_list[i], random_list[i + 1] - 1) for i in range(30)]
            # frame30_list = sorted(random.sample(range(0, self.num_per_frame), 30))  # alternative: plain random sampling without replacement
            # frame30_index_list = sorted(np.arange(0, self.num_per_frame, 3))  # alternative: fixed stride-3 sampling
        return frame30_list
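    # For example (hypothetical sequence length): with num_per_frame = 90,
    # np.linspace(0, 90, 31, dtype=int) gives the boundaries [0, 3, 6, ..., 90],
    # and one frame index is drawn uniformly from each of the 30 sub-intervals
    # [0, 2], [3, 5], ..., [87, 89] (random.randint is inclusive on both ends).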
    def get_frame30_data(self, index, frame30_list):
        with h5py.File(self.cv_path, 'r') as file:
            frame30_body_0 = file[self.group_name_list[index]][0, frame30_list, :, :]  # 30 frames of body 0, shape (30, 25, 3)
            frame30_body_1 = file[self.group_name_list[index]][1, frame30_list, :, :]  # 30 frames of body 1
            label_action = file[self.group_name_list[index]].attrs['label']  # one-hot label, shape (60, 1)
        return frame30_body_0, frame30_body_1, label_action
    def get_diff_data(self, body0, body1):
        # first-order temporal difference (the motion stream), shape (29, 25, 3)
        diff_body_0 = np.diff(body0, n=1, axis=0)
        diff_body_1 = np.diff(body1, n=1, axis=0)
        # append a zero frame to restore shape (30, 25, 3)
        diff_zero = np.zeros((1, 25, 3))
        diff_body_0 = np.r_[diff_body_0, diff_zero]
        diff_body_1 = np.r_[diff_body_1, diff_zero]
        return diff_body_0, diff_body_1
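# A quick sanity check of the dataset (a sketch, assuming the hdf5 layout above):
# each sample should yield four (30, 25, 3) float tensors plus an integer class index.
#     ds = NTUData('../cv_tst.hdf5')
#     f0, f1, d0, d1, y = ds[0]
#     print(f0.shape, d0.shape, y)  # torch.Size([30, 25, 3]) torch.Size([30, 25, 3]) e.g. 12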
cv_tst_path = '../cv_tst.hdf5'
test_data = NTUData(cv_tst_path)
net = model1.ConVNet().cuda()
net.eval()

max_batch_size = 1
onnx_model_path = "./ConVNet.onnx"
TRT_LOGGER = trt.Logger()
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """
        host_mem: cpu (pagelocked) memory
        device_mem: gpu memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:
        print("the binding's name:", binding)  # name of the bound input/output tensor
        print('binding shape: ', engine.get_binding_shape(binding))  # get_binding_shape gives the tensor's dimensions
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        # trt.volume computes the number of elements in an iterable of dims
        # size = trt.volume(engine.get_binding_shape(binding))  # use this line if the onnx has a fixed batch size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # get_binding_dtype returns the binding's data type;
        # trt.nptype maps it to the equivalent numpy dtype
        # allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)  # pagelocked (pinned) host memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)  # device-side allocation
        # print(int(device_mem))  # buffer address of this binding in the graph
        bindings.append(int(device_mem))
        # append to the appropriate list
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
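# Note on feeding inputs: assigning inputs[i].host = arr (as done below) rebinds
# .host from the pagelocked buffer to an ordinary ndarray, which still works but
# gives up truly async host-to-device copies. A sketch of the alternative that
# keeps the pinned memory alive (batch0 is a hypothetical numpy batch for binding 0):
#     np.copyto(inputs[0].host, batch0.ravel())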
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", fp16_mode=False, save_engine=False):
    """
    params max_batch_size: fixed in advance so device memory can be allocated
    params onnx_file_path: path to the onnx file
    params engine_file_path: path of the serialized engine file to save/load
    params fp16_mode: whether to build in FP16
    params save_engine: whether to save the engine
    returns: ICudaEngine
    """
    # if a serialized engine already exists, deserialize it directly into an ICudaEngine
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    # else: build the ICudaEngine from the onnx file
    #
    #     # the logger creates a builder, and the builder creates a
    #     # computation graph (INetworkDefinition)
    #     # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode,
    #     # meaning that your network definition must be created with the
    #     # explicitBatch flag set. For more information, see Working With Dynamic Shapes.
    #     explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    #
    #     with trt.Builder(TRT_LOGGER) as builder, \
    #             builder.create_network(explicit_batch) as network, \
    #             trt.OnnxParser(network, TRT_LOGGER) as parser:  # the onnx parser populates the graph as it parses
    #         builder.max_workspace_size = 1 << 30  # pre-allocated workspace: the most GPU memory the engine may need at execution
    #         builder.max_batch_size = max_batch_size  # largest batch size usable at execution time
    #         builder.fp16_mode = fp16_mode
    #
    #         # parse the onnx file and populate the graph
    #         if not os.path.exists(onnx_file_path):
    #             quit("ONNX file {} not found!".format(onnx_file_path))
    #         print('loading onnx file from path {} ...'.format(onnx_file_path))
    #         with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
    #             print("Beginning onnx file parsing")
    #             parser.parse(model.read())  # parse the onnx file
    #         # parser.parse_from_file(onnx_file_path)  # the parser can also parse onnx straight from a file
    #
    #         print("Completed parsing of onnx file")
    #         # once the graph is populated, the builder turns it into an ICudaEngine
    #         print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
    #
    #         print(network.get_layer(network.num_layers - 1).get_output(0).shape)
    #         last_layer = network.get_layer(network.num_layers - 1)
    #         network.mark_output(last_layer.get_output(0))
    #         engine = builder.build_cuda_engine(network)  # network is the populated INetworkDefinition
    #         print("Completed creating Engine")
    #         if save_engine:  # save the engine so it can be deserialized later
    #             with open(engine_file_path, 'wb') as f:
    #                 f.write(engine.serialize())  # serialize
    #         return engine
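# Note: builder.fp16_mode and builder.max_workspace_size (used in the commented
# block above) are deprecated on newer TensorRT releases. From 7.x on, the same
# build can go through an IBuilderConfig instead; a minimal sketch, assuming the
# builder/network/parser setup above:
#
#     config = builder.create_builder_config()
#     config.max_workspace_size = 1 << 30
#     if fp16_mode:
#         config.set_flag(trt.BuilderFlag.FP16)
#     engine = builder.build_engine(network, config)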
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the CPU (host) to the GPU (device).
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # use execute_async_v2 when the network was created with an explicit batch
    # dimension, and execute_async otherwise
    # Transfer predictions back from the GPU to the CPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    # TensorRT returns flat host buffers; reshape them to the logical output shape
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
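# For the engine here, for example, the flat output buffer of 64 * 4096 elements
# is restored to (64, 4096) logits:
#     feat = postprocess_the_outputs(trt_outputs[0], (64, 4096))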
Lr = 0.0002
Epochs = 300
Batch_Size = 64
# drop_last=True keeps every batch at exactly Batch_Size, matching the fixed
# batch dimension the onnx model was exported with
test_loader = DataLoader(dataset=test_data, batch_size=Batch_Size, shuffle=False, drop_last=True)
# whether FP16 pays off depends on the hardware
fp16_mode = False
trt_engine_path = "./ConVNet.engine"
# Build (or deserialize) the ICudaEngine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode)
# with the engine created, set up an execution context on the target device
context = engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(engine)  # inputs/outputs: host/device buffer pairs; bindings: device addresses
shape_of_output = (64, 4096)
# t_row and t_rt accumulate the PyTorch and TensorRT inference times
t_row = 0
t_rt = 0
# TensorRT consumes numpy inputs; to time it fairly, first pull all batches out
# of the DataLoader so that data loading is not included in the measurement
l = []
for data in test_loader:
    l.append([data[0].numpy(), data[1].numpy(), data[2].numpy(), data[3].numpy()])
t1 = time.time()
for data in l:
    # one assignment per input binding; inputs[1].host = ... etc. for multiple inputs
    inputs[0].host = data[0]
    inputs[1].host = data[1]
    inputs[2].host = data[2]
    inputs[3].host = data[3]
    trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # flat numpy outputs
    # feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
t2 = time.time()
t_rt += (t2 - t1)
t3 = time.time()
with torch.no_grad():  # no autograd bookkeeping during timing
    for data in test_loader:
        output = net(data[0].cuda(), data[1].cuda(), data[2].cuda(), data[3].cuda())
torch.cuda.synchronize()  # CUDA launches are asynchronous; wait before reading the clock
t4 = time.time()
t_row += t4 - t3
print('total inference time in pytorch:', t_row)
print('total inference time in trt:', t_rt)
print('TensorRT ok')
# mse = np.mean((feat - feat_2) ** 2)
# print('MSE Error = {}'.format(mse))
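# To wire up the commented-out MSE check above, keep the final batch outputs of
# the two timing loops (a sketch; feat and feat_2 are names assumed here):
#     feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)  # in the TRT loop
#     feat_2 = output.cpu().numpy()                                    # in the PyTorch loop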
print('All completed!')