bool build_model(){
    TRTLogger logger;
    // Building an engine requires the following components
    auto builder = make_nvshared(nvinfer1::createInferBuilder(logger));
    auto config  = make_nvshared(builder->createBuilderConfig());
    auto network = make_nvshared(builder->createNetworkV2(1));
    // The onnxparser fills the parsed layers into network, much as if they had been added manually via addConv-style calls
    auto parser = make_nvshared(nvonnxparser::createParser(*network, logger));
    if(!parser->parseFromFile("classifier.onnx", 1)){
        printf("Failed to parse classifier.onnx\n");
        return false;
    }
    int maxBatchSize = 10;
    // Workspace size
    config->setMaxWorkspaceSize(1 << 28);
    // If the model has multiple inputs, the profile must be configured for every input
    auto profile = builder->createOptimizationProfile();
    auto input_tensor = network->getInput(0);
    auto input_dims = input_tensor->getDimensions();
    // Configure the minimum, optimal and maximum shape range
    input_dims.d[0] = 1;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
    input_dims.d[0] = maxBatchSize;
    profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
    config->addOptimizationProfile(profile);
    auto engine = make_nvshared(builder->buildEngineWithConfig(*network, *config));
    if(engine == nullptr){
        printf("Build engine failed.\n");
        return false;
    }
    // Serialize the model and save it to a file
    auto model_data = make_nvshared(engine->serialize());
    FILE* f = fopen("engine.engine", "wb");
    fwrite(model_data->data(), 1, model_data->size(), f);
    fclose(f);
    // Teardown happens in the reverse order of construction
    printf("Done.\n");
    return true;
}
import tensorrt as trt
import os
import common

def build_engine(onnx_file_path, engine_file_path, input_shape, TRT_LOGGER, max_batch_size, max_workspace_size):
    # builder, network, parser and config all appear here as well, consistent with the C++ version
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(common.EXPLICIT_BATCH) as network, \
            builder.create_builder_config() as config, \
            trt.OnnxParser(network, TRT_LOGGER) as parser, \
            trt.Runtime(TRT_LOGGER) as runtime:
        config.max_workspace_size = 1 << max_workspace_size  # e.g. 1 << 30 = 1 GiB
        builder.max_batch_size = max_batch_size
        profile = builder.create_optimization_profile()
        config.add_optimization_profile(profile)
        if not os.path.exists(onnx_file_path):
            print(f"{onnx_file_path} does not exist")
            exit(0)
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        # Set the input shape
        network.get_input(0).shape = input_shape
        print('Building an engine ... please wait for a while...')
        plan = builder.build_serialized_network(network, config)
        # Save the serialized engine to a file
        with open(engine_file_path, "wb") as f:
            f.write(plan)
        print("Completed creating Engine")
        return

if __name__ == '__main__':
    input_shape = [1, 3, 512, 512]
    onnx_path = 'centernet.onnx'
    pyengine_path = 'centernet.pyengine'
    max_batch_size = 1
    max_workspace_size = 30  # 1 << 30 = 1 GiB
    TRT_LOGGER = trt.Logger()
    build_engine(onnx_path, pyengine_path, input_shape, TRT_LOGGER, max_batch_size, max_workspace_size)
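After the script above finishes, it is worth sanity-checking the generated centernet.pyengine before wiring up inference. The following is a minimal sketch under the same file-name assumption: it deserializes the plan with the runtime and lists every binding's name, shape and dtype so the input/output layout can be confirmed.

import tensorrt as trt

TRT_LOGGER = trt.Logger()
runtime = trt.Runtime(TRT_LOGGER)
with open('centernet.pyengine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# List every binding so the input/output layout can be confirmed
for i in range(engine.num_bindings):
    print(engine.get_binding_name(i),
          engine.get_binding_shape(i),
          engine.get_binding_dtype(i),
          'input' if engine.binding_is_input(i) else 'output')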
cudaStream_t stream = nullptr;
checkRuntime(cudaStreamCreate(&stream));
auto execution_context = make_nvshared(engine->createExecutionContext());
int input_batch = 1;
int input_channel = 3;
int input_height = 512;
int input_width = 512;
int input_numel = input_batch * input_channel * input_height * input_width;
float* input_data_host = nullptr;
float* input_data_device = nullptr;
// Allocate pinned host memory for the input
checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
// Allocate device memory for the input
checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));
// Allocate space for the output
float output_data_host[num_classes];
float* output_data_device = nullptr;
// Allocate device memory for the output
checkRuntime(cudaMalloc(&output_data_device, sizeof(output_data_host)));
// Bind the input and output buffers
// (preprocessing and the host-to-device copy of input_data_host are omitted in this excerpt)
float* bindings[] = {input_data_device, output_data_device};
// Run inference (enqueue on the stream)
bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);
// Copy the result back to the host
checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(output_data_host), cudaMemcpyDeviceToHost, stream));
checkRuntime(cudaStreamSynchronize(stream));
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context
# HostDeviceMem is assumed to be the small host/device buffer-pair helper from TensorRT's common.py sample

# Allocate buffers
def allocate_buffers(engine, max_batch_size=16):
    # Inputs
    inputs = []
    # Outputs
    outputs = []
    bindings = []
    # Create a CUDA stream
    stream = cuda.Stream()
    for binding in engine:
        dims = engine.get_binding_shape(binding)
        # print(dims)
        if dims[0] == -1:
            assert (max_batch_size is not None)
            dims[0] = max_batch_size  # resolve the dynamic batch dimension
        # Compute the required number of elements
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate pinned host memory
        host_mem = cuda.pagelocked_empty(size, dtype)
        # Allocate device memory
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Record the device pointer in the bindings list
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# Inference
def do_inference_v2(context, bindings, inputs, outputs, stream):
    # Copy the input data to the device
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Copy the results back to the host
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    return [out.host for out in outputs]
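To show how allocate_buffers and do_inference_v2 are meant to be combined, here is a minimal end-to-end sketch. The engine file name, input resolution and the random stand-in image are assumptions carried over from the earlier CenterNet example; in practice the preprocessed image is flattened into inputs[0].host before do_inference_v2 is called.

import numpy as np
import tensorrt as trt

TRT_LOGGER = trt.Logger()
runtime = trt.Runtime(TRT_LOGGER)
with open('centernet.pyengine', 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

inputs, outputs, bindings, stream = allocate_buffers(engine, max_batch_size=1)
# Stand-in for a preprocessed image: NCHW float32 matching the engine input
image = np.random.rand(1, 3, 512, 512).astype(np.float32)
np.copyto(inputs[0].host, image.ravel())
results = do_inference_v2(context, bindings, inputs, outputs, stream)
print([r.shape for r in results])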
from cuda import cudart
import tensorrt as trt
import numpy as np

class Infer_bacis():
    def __init__(self, engine_file_path, batch_size):
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        # Deserialize the engine
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        # Create the execution context
        self.context = engine.create_execution_context()
        # Create a CUDA stream
        _, self.stream = cudart.cudaStreamCreate()
        # Input/output buffer bookkeeping
        self.host_inputs = []
        self.cuda_inputs = []
        self.host_outputs = []
        self.cuda_outputs = []
        self.bindings = []
        self.batch_size = batch_size
        # self.context.set_binding_shape(0, (2, 3, 512, 512))
        # assert self.batch_size <= engine.max_batch_size
        for binding in engine:
            # Compute the buffer size; abs() handles a -1 dynamic batch dimension
            size = abs(trt.volume(engine.get_binding_shape(binding))) * self.batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = np.empty(size, dtype=dtype)
            # Allocate device memory
            _, cuda_mem = cudart.cudaMallocAsync(host_mem.nbytes, self.stream)
            self.bindings.append(int(cuda_mem))
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                self.host_inputs.append(host_mem)
                self.cuda_inputs.append(cuda_mem)
            else:
                self.host_outputs.append(host_mem)
                self.cuda_outputs.append(cuda_mem)

    def detect(self, image):
        batch_input_image = np.ascontiguousarray(image)
        np.copyto(self.host_inputs[0], batch_input_image.ravel())
        # Copy the input to the device
        cudart.cudaMemcpyAsync(self.cuda_inputs[0], self.host_inputs[0].ctypes.data, self.host_inputs[0].nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, self.stream)
        # Run inference
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream)
        # Copy the result back to the host
        cudart.cudaMemcpyAsync(self.host_outputs[0].ctypes.data, self.cuda_outputs[0], self.host_outputs[0].nbytes,
                               cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, self.stream)
        cudart.cudaStreamSynchronize(self.stream)
        output = self.host_outputs[0]
        return output
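Finally, a small usage sketch for Infer_bacis. The engine path and batch size are assumptions taken from the build step above, and the random tensor is only a placeholder for a properly preprocessed image.

import numpy as np

infer = Infer_bacis('centernet.pyengine', batch_size=1)
# Stand-in for a preprocessed image: NCHW float32 matching the engine's input binding
dummy = np.random.rand(1, 3, infer.input_h, infer.input_w).astype(np.float32)
output = infer.detect(dummy)
print(output.shape, output.dtype)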