import os
import tensorflow as tf
import tensorflow.contrib.tensorrt as trt  # TF-TRT lives in tf.contrib in TensorFlow 1.12

def TF_TRT_frozen_graph_12():
    frozen_graph_path = '*.pb'
    save_frozen_graph_path = ''  # directory to save the optimized graph
    if not os.path.isdir(save_frozen_graph_path):
        os.mkdir(save_frozen_graph_path)
    save_frozen_graph_path += '*.pb'
    # Read the frozen model
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    calib_graph = trt.create_inference_graph(
        # The graph to convert
        input_graph_def=graph_def,
        # Names of the output nodes; these were fixed when the model was trained,
        # so use whatever names you gave them
        outputs=[' '],
        # Precision mode
        precision_mode='FP16'
    )
    # Save the optimized graph
    with open(save_frozen_graph_path, 'wb') as wh:
        wh.write(calib_graph.SerializeToString())
from tensorflow.python.compiler.tensorrt import trt_convert as trt  # TF-TRT location in TensorFlow 1.14

def TF_TRT_frozen_graph_14():
    frozen_graph_path = '*.pb'
    save_frozen_graph_path = ''  # directory to save the optimized graph
    if not os.path.isdir(save_frozen_graph_path):
        os.mkdir(save_frozen_graph_path)
    save_frozen_graph_path += '*.pb'
    # Read the frozen model
    graph_def = tf.compat.v1.GraphDef()
    with tf.io.gfile.GFile(frozen_graph_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    converter = trt.TrtGraphConverter(
        # The graph to convert
        input_graph_def=graph_def,
        # Names of the output nodes
        nodes_blacklist=[' '],
        # Precision mode
        precision_mode='FP16'
    )
    frozen_graph = converter.convert()
    # To save as a SavedModel instead:
    # converter.save(save_frozen_graph_path)
    # Save the optimized graph
    with open(save_frozen_graph_path, 'wb') as wh:
        wh.write(frozen_graph.SerializeToString())
The build flow using the C++ API:
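The C++ snippets below use a gLogger object and a CHECK macro without showing their definitions, and they assume the usual TensorRT/CUDA headers and using-directives. Here is a minimal sketch of those helpers, modeled on the ones shipped with the TensorRT samples; the exact names and behavior are assumptions, not part of the original code.

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <cuda_runtime_api.h>
#include "NvInfer.h"
#include "NvUffParser.h"

// The original snippets use unqualified TensorRT and standard-library names.
using namespace nvinfer1;
using namespace std;

// Minimal stand-in for the samples' logger; newer TensorRT versions also
// require the noexcept qualifier on log().
class Logger : public nvinfer1::ILogger
{
public:
    nvinfer1::ILogger& getTRTLogger() { return *this; }
    void log(Severity severity, const char* msg) override
    {
        // Only print warnings and errors
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;

// Simple CUDA error check, equivalent in spirit to the samples' CHECK macro.
#define CHECK(status)                                                       \
    do                                                                      \
    {                                                                       \
        cudaError_t err = (status);                                         \
        if (err != cudaSuccess)                                             \
        {                                                                   \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n"; \
            std::abort();                                                   \
        }                                                                   \
    } while (0)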
IBuilder* builder = nvinfer1::createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
//Determined by external input: the batch size you intend to run with
builder->setMaxBatchSize(batch_size);
//Upper bound on the scratch memory TensorRT may use while building; adjust to your needs
int MAX_WORKSPACE = 300000000;
builder->setMaxWorkspaceSize(MAX_WORKSPACE);
nvinfer1::INetworkDefinition* network = builder->createNetwork();
//Create the UFF parser
auto parser = nvuffparser::createUffParser();
//Register the model's inputs and outputs recorded in the UFF file; adjust the input dimensions and names to match your own model
parser->registerInput("input_node", nvinfer1::Dims3(channel_size, 1, width), nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput("myoutputnode"); //myoutputnode is the name of the output node
if (!parser->parseBuffer(model, model_len, *network, nvinfer1::DataType::kFLOAT)) //kFLOAT: 32-bit, kHALF: 16-bit
{
printf("Failure while parsing UFF file\n");
return 0;
}
ICudaEngine* engine = builder->buildCudaEngine(*network);
if (!engine)
{
printf("Unable to create engine\n");
return 0;
}
//Serialize the engine and save it to disk
IHostMemory *serializedModel = engine->serialize();
ofstream ofs(save_path, ios::out | ios::binary);
ofs.write((char*)(serializedModel->data()), serializedModel->size());
ofs.close();
serializedModel->destroy();
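Cleanup is not shown above; with the pre-TensorRT-8 API used in this snippet, the build-time objects are released explicitly once the engine has been serialized. A sketch of the usual teardown (an addition, not part of the original):

// Release the remaining build-time objects (TensorRT < 8 uses destroy())
engine->destroy();
parser->destroy();
network->destroy();
builder->destroy();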
Loading and using the serialized model:
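The deserialization call below expects the serialized engine to already be in host memory as model / model_len. One way to get it there, assuming save_path is the same file written in the build step, is simply to read the file back into a buffer (a sketch, not part of the original):

#include <iterator>
#include <vector>

// Read the serialized engine file back into host memory
std::ifstream ifs(save_path, std::ios::in | std::ios::binary);
std::vector<char> engine_buf((std::istreambuf_iterator<char>(ifs)),
                             std::istreambuf_iterator<char>());
const char* model = engine_buf.data();
size_t model_len  = engine_buf.size();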
IRuntime* runtime = nvinfer1::createInferRuntime(gLogger.getTRTLogger());
ICudaEngine* engine = runtime->deserializeCudaEngine(model, model_len, nullptr);
// Create an execution context from the deserialized engine
IExecutionContext* context = engine->createExecutionContext();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine->getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
int inputIndex = engine->getBindingIndex(input_layer);
int outputIndex = engine->getBindingIndex(output_layer);
// Host-side buffer that will receive the network output
std::unique_ptr<float[]> output(new float[batch_size * num_class]);
// Create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batch_size * 1 * width * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batch_size * num_class * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA the result back
CHECK(cudaMemcpyAsync(buffers[inputIndex], input + data_offset , batch_size * 1 * width * sizeof(float), cudaMemcpyHostToDevice, stream));
context->enqueue(batch_size, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output.get(), buffers[outputIndex], batch_size * num_class * sizeof(float), cudaMemcpyDeviceToHost, stream));
// Wait for the asynchronous copies and the kernel to finish before using the output
CHECK(cudaStreamSynchronize(stream));
// Release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
// Pick the highest-scoring class for each sample in the batch
for (int sample_ind = 0; sample_ind < batch_size; sample_ind++)
{
float score{ 0.0f };
int label_idx{ 0 };
for (unsigned int i = 0; i < num_class; i++)
{
float cur_score = output[sample_ind * num_class + i];
score = std::max(score, cur_score);
if (score == cur_score) label_idx = i;
}
out_label[label_offset + sample_ind] = label_idx;
out_score[label_offset + sample_ind] = score;
}
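As with the build step, the original stops here; under the same pre-TensorRT-8 API the inference-side objects created above would be released like this (again an added sketch):

// Release the inference-side objects
context->destroy();
engine->destroy();
runtime->destroy();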
Note: the device the inference engine is built on, the device the model is converted on, and the device it is deployed on must match; otherwise you will run into performance problems (this has happened when mixing an RTX 2080 and a Jetson Nano).