import copy

import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem:
    """Pairs a page-locked host buffer with its device-side counterpart."""
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def do_inference(context, bindings, inputs, outputs, stream, ctx, batch_size=1):
    '''
    Push/pop the CUDA context around the actual inference to avoid the error:
        [TensorRT] ERROR: ../rtSafe/cuda/reformat.cu (925) - Cuda Error in NCHWToNCHHW2: 400
        (invalid resource handle)
    Solution ref.: https://blog.csdn.net/yiyayi1/article/details/111314520
    ```
    ctx.push()
    {your inference code}
    ctx.pop()
    ```
    '''
    ctx.push()
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    ctx.pop()
    # Return only the host outputs.
    return [out.host for out in outputs]
def inf_trt(engine_path, data_loader):
    # Initialize CUDA and create a context on GPU 0.
    cuda.init()
    ctx = cuda.Device(0).make_context()
    # Load the serialized engine.
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    runtime = trt.Runtime(TRT_LOGGER)
    with open(engine_path, mode='rb') as f:
        engine_bytes = f.read()
    engine = runtime.deserialize_cuda_engine(engine_bytes)
    # Allocate input/output/stream buffers.
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    # Create the execution context.
    context = engine.create_execution_context()
    dataset = data_loader.dataset
    # TensorRT inference.
    results = []
    for data_idx, data in enumerate(data_loader):
        # NOTE: copy the preprocessed batch into the input host buffer (inputs[0].host)
        # before calling do_inference(); the exact preprocessing is model-specific.
        result_buffer = do_inference(context, bindings, inputs, outputs, stream, ctx)
        # The host buffers are reused on every iteration, so keep a deep copy.
        result = copy.deepcopy(result_buffer)
        results.append(result)
    # Release the engine first, then pop the context (see the note below).
    del engine
    ctx.pop()
    del context
    del stream
    del inputs
    del outputs
    return results
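For reference, a minimal usage sketch. The engine path and the toy loader below are placeholders of my own, not part of the original code; any iterable with a .dataset attribute (e.g. a torch DataLoader) works the same way:

# Hypothetical usage sketch: './model.trt' and ToyLoader are assumptions, not the original setup.
import numpy as np

class ToyLoader:
    """Minimal stand-in for a DataLoader: an iterable with a .dataset attribute."""
    def __init__(self, samples):
        self.dataset = samples

    def __iter__(self):
        return iter(self.dataset)

if __name__ == '__main__':
    samples = [np.random.rand(1, 3, 224, 224).astype(np.float32) for _ in range(4)]
    results = inf_trt('./model.trt', ToyLoader(samples))
    print('collected', len(results), 'result buffers')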
Reference: https://github.com/NVIDIA/TensorRT/issues/1107. The pattern recommended there is to initialize CUDA and create the context once, then push/pop that context around each inference call:

# Initialize
cuda.init()
ctx = cuda.Device(0).make_context()
...
# Inference
ctx.push()
{your inference code}
ctx.pop()
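A small convenience on top of that pattern (my own sketch, not something the issue prescribes): wrap the push/pop pair in try/finally so the pop still runs when the inference code raises, which keeps the context stack balanced:

# Sketch (assumption): guarantee ctx.pop() even if the wrapped inference call raises.
def run_with_context(ctx, infer_fn, *args, **kwargs):
    ctx.push()
    try:
        return infer_fn(*args, **kwargs)
    finally:
        ctx.pop()

# e.g.: result = run_with_context(ctx, my_inference_fn, batch)   # my_inference_fn is hypothetical

Note that do_inference() above already does its own push/pop, so this wrapper is meant for inference code that does not.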
If the context created by make_context() is never popped before the process exits, PyCUDA aborts during cleanup with:

PyCUDA ERROR: The context stack was not empty upon module cleanup.
-------------------------------------------------------------------
A context was still active when the context stack was being
cleaned up. At this point in our execution, CUDA may already
have been deinitialized, so there is no way we can finish
cleanly. The program will be aborted now.
Use Context.pop() to avoid this problem.
Per the same issue (https://github.com/NVIDIA/TensorRT/issues/1107), add ctx.pop() after del engine during teardown, otherwise the error above appears:

del engine
ctx.pop()
del context
del stream
del inputs
del outputs
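Going one step further (my own sketch, not from the issue): keeping the final ctx.pop() in a finally block releases the context even when engine loading or inference fails part-way, so the cleanup abort above cannot occur:

# Sketch (assumption): same flow as inf_trt() above, but the pop is guaranteed by finally.
def inf_trt_guarded(engine_path, data_loader):
    cuda.init()
    ctx = cuda.Device(0).make_context()
    try:
        logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(logger, '')
        runtime = trt.Runtime(logger)
        with open(engine_path, 'rb') as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        context = engine.create_execution_context()
        results = []
        for data in data_loader:
            results.append(copy.deepcopy(
                do_inference(context, bindings, inputs, outputs, stream, ctx)))
        del context, engine   # release TensorRT objects before the context is popped
        return results
    finally:
        ctx.pop()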
A word of praise for TensorRT 8.x: the speedup on transformer modules is simply off the charts.
More notes to be added later…