pip install onnxruntime
import cv2
import onnxruntime as ort
ort_session = ort.InferenceSession('xxx.onnx')
img = cv2.imread('test.jpg')
net_input = preprocess(img) # your own preprocessing: resize/pad to the network input size, normalize, and lay out as NCHW float32
outputs = ort_session.run(None, {ort_session.get_inputs()[0].name: net_input})
print(outputs)
bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory*& trtModelStream) // output buffer for the TensorRT model
{
// 1. create the builder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// Create an empty network object (note: createNetworkV2() takes network-creation flags, not a batch size); the parser fills it in below
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(maxBatchSize);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
// Create an ONNX parser bound to the network
auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
// parseFromFile() populates the network, mapping the ONNX tensors to TensorRT tensors
// locateFile() is a helper from the samples' common.cpp
if (!parser->parseFromFile(locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity())))
{
gLogError << "Failure while parsing ONNX file" << std::endl;
return false;
}
// Set the workspace size
builder->setMaxWorkspaceSize(1_GiB);
config->setMaxWorkspaceSize(1_GiB);
// Set the precision mode
builder->setFp16Mode(gArgs.runInFp16);
//builder->setInt8Mode(gArgs.runInInt8);
if (gArgs.runInInt8)
{
config->setFlag(BuilderFlag::kINT8);
samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
}
samplesCommon::enableDLA(builder, config, gArgs.useDLACore);
// 2. build engine
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we can destroy the parser
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
engine->destroy();
network->destroy();
builder->destroy();
// Serialize the engine to disk so it can be reloaded later
std::ofstream ofs(trtModelName.c_str(), std::ios::out | std::ios::binary);
ofs.write((char*)(trtModelStream->data()), trtModelStream->size());
ofs.close();
trtModelStream->destroy();
DebugP("Trt model save success!");
return true;
}
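Note that buildCudaEngine() predates IBuilderConfig and ignores it, so the workspace size and INT8 flag set on config above only take effect if the engine is built through the config. On TensorRT 6/7 the config-aware call is buildEngineWithConfig(); a minimal sketch of that variant (assuming TensorRT >= 6):
// Build through the IBuilderConfig so the workspace size and precision flags
// set on `config` are actually honored (TensorRT 6 and later).
config->setMaxWorkspaceSize(1_GiB);
if (gArgs.runInFp16)
{
config->setFlag(BuilderFlag::kFP16);
}
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine != nullptr);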
struct TensorRT {
IExecutionContext* context;
ICudaEngine* engine;
IRuntime* runtime;
};
TensorRT* LoadNet(const char* trtFileName)
{
std::ifstream t(trtFileName, std::ios::in | std::ios::binary);
std::stringstream tempStream;
tempStream << t.rdbuf();
t.close();
DebugP("TRT File Loaded");
tempStream.seekg(0, std::ios::end);
const int modelSize = tempStream.tellg();
tempStream.seekg(0, std::ios::beg);
void* modelMem = malloc(modelSize);
tempStream.read((char*)modelMem, modelSize);
IRuntime* runtime = createInferRuntime(gLogger);
if (runtime == nullptr)
{
DebugP("Build Runtime Failure");
return 0;
}
if (gArgs.useDLACore >= 0)
{
runtime->setDLACore(gArgs.useDLACore);
}
ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
if (engine == nullptr)
{
DebugP("Build Engine Failure");
return 0;
}
IExecutionContext* context = engine->createExecutionContext();
if (context == nullptr)
{
DebugP("Build Context Failure");
return 0;
}
TensorRT* trt = new TensorRT();
trt->context = context;
trt->engine = engine;
trt->runtime = runtime;
DebugP("Build trt Model Success!");
return trt;
}
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_LOC_NAME = "output";
const char* OUTPUT_CONF_NAME = "3612";
void doInference(IExecutionContext& context, float* input, float** output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
// of these, and in this case we know that there is exactly one input and two outputs (loc and conf).
assert(engine.getNbBindings() == 3);
// device buffer pointers; the GPU memory is allocated below
void* buffers[3];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputLocIndex = engine.getBindingIndex(OUTPUT_LOC_NAME);
const int outputConfIndex = engine.getBindingIndex(OUTPUT_CONF_NAME);
DebugP(inputIndex); DebugP(outputLocIndex); DebugP(outputConfIndex);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputLocIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputConfIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output[0], buffers[outputLocIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(output[1], buffers[outputConfIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputLocIndex]));
CHECK(cudaFree(buffers[outputConfIndex]));
}
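The binding names hard-coded above (including the auto-generated "3612" for the confidence output) come from the exported ONNX graph, so they change whenever the export changes. If you are unsure which names your model actually exposes, they can be listed from the engine itself; a small helper sketch using the same pre-8.x binding API as the rest of this code:
// Print every engine binding with its index and direction, which is handy for
// filling in INPUT_BLOB_NAME / OUTPUT_LOC_NAME / OUTPUT_CONF_NAME correctly.
void printBindings(const ICudaEngine& engine)
{
for (int i = 0; i < engine.getNbBindings(); ++i)
{
std::cout << "binding " << i << ": " << engine.getBindingName(i)
<< (engine.bindingIsInput(i) ? " (input)" : " (output)") << std::endl;
}
}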
#include <assert.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <io.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
#include "image.h"
#include "tools.h"
#define DebugP(x) std::cout << "Line" << __LINE__ << " " << #x << "=" << x << std::endl
using namespace nvinfer1;
struct TensorRT {
IExecutionContext* context;
ICudaEngine* engine;
IRuntime* runtime;
};
static const int INPUT_H = 300;
static const int INPUT_W = 300;
static const int INPUT_C = 3;
const int OUTPUT_LEN = 5329;
const int CLASS_NUM = 4;
static const int OUTPUT_SIZE = OUTPUT_LEN * CLASS_NUM;
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_LOC_NAME = "output";
const char* OUTPUT_CONF_NAME = "3612";
const float conf_threshold = 0.8;
const float nms_threshold = 0.4;
const std::string gSampleName = "TensorRT.object_detect";
const std::string trtModelName = "trt_model.pb";
const std::string onnxModeName = "";
samplesCommon::Args gArgs;
bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory*& trtModelStream) // output buffer for the TensorRT model
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(maxBatchSize);
nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
if (!parser->parseFromFile(locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity())))
{
gLogError << "Failure while parsing ONNX file" << std::endl;
return false;
}
// Build the engine
//builder->setMaxWorkspaceSize(1 << 20);
builder->setMaxWorkspaceSize(1_GiB);
config->setMaxWorkspaceSize(1_GiB);
builder->setFp16Mode(gArgs.runInFp16);
//builder->setInt8Mode(gArgs.runInInt8);
if (gArgs.runInInt8)
{
config->setFlag(BuilderFlag::kINT8);
samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
}
samplesCommon::enableDLA(builder, config, gArgs.useDLACore);
ICudaEngine* engine = builder->buildCudaEngine(*network);
assert(engine);
// we can destroy the parser
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
engine->destroy();
network->destroy();
builder->destroy();
std::ofstream ofs(trtModelName.c_str(), std::ios::out | std::ios::binary);
ofs.write((char*)(trtModelStream->data()), trtModelStream->size());
ofs.close();
trtModelStream->destroy();
DebugP("Trt model save success!");
return true;
}
TensorRT* LoadNet(const char* trtFileName)
{
std::ifstream t(trtFileName, std::ios::in | std::ios::binary);
std::stringstream tempStream;
tempStream << t.rdbuf();
t.close();
DebugP("TRT File Loaded");
tempStream.seekg(0, std::ios::end);
const int modelSize = tempStream.tellg();
tempStream.seekg(0, std::ios::beg);
void* modelMem = malloc(modelSize);
tempStream.read((char*)modelMem, modelSize);
IRuntime* runtime = createInferRuntime(gLogger);
if (runtime == nullptr)
{
DebugP("Build Runtime Failure");
return 0;
}
if (gArgs.useDLACore >= 0)
{
runtime->setDLACore(gArgs.useDLACore);
}
ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
if (engine == nullptr)
{
DebugP("Build Engine Failure");
return 0;
}
IExecutionContext* context = engine->createExecutionContext();
if (context == nullptr)
{
DebugP("Build Context Failure");
return 0;
}
TensorRT* trt = new TensorRT();
trt->context = context;
trt->engine = engine;
trt->runtime = runtime;
DebugP("Build trt Model Success!");
return trt;
}
void doInference(IExecutionContext& context, float* input, float** output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings()
// of these, and in this case we know that there is exactly one input and two outputs (loc and conf).
assert(engine.getNbBindings() == 3);
void* buffers[3];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputLocIndex = engine.getBindingIndex(OUTPUT_LOC_NAME);
const int outputConfIndex = engine.getBindingIndex(OUTPUT_CONF_NAME);
DebugP(inputIndex); DebugP(outputLocIndex); DebugP(outputConfIndex);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputLocIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputConfIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output[0], buffers[outputLocIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(output[1], buffers[outputConfIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputLocIndex]));
CHECK(cudaFree(buffers[outputConfIndex]));
}
// Reshape a flat, row-major buffer of row_num x col_num values into an array of
// row pointers; each row is allocated with new[] and must be released by the caller.
void Flaten2Matrix(float* src, int row_num, int col_num, float** out_mat)
{
for (int i = 0; i < row_num; i++)
{
out_mat[i] = new float[col_num];
for (int j = 0; j < col_num; j++)
{
out_mat[i][j] = src[i * col_num + j];
}
}
}
inline bool cmp(ObjInfo a, ObjInfo b)
{
return a.prob > b.prob;
}
void PostProcessing(float** loc, float** conf, ImagePaddingInfo &img_pad_info, std::vector<ObjInfo> &obj)
{
std::vector<box> anchor;
CreateAnchor(anchor, INPUT_W, INPUT_H);
std::vector<ObjInfo> total_box;
// get NMS result
box tmp, tmp1;
for (int i = 0; i < anchor.size(); ++i)
{
const float *boxes = loc[i];
const float *scores = conf[i];
for (int class_id = 1; class_id < CLASS_NUM; class_id++)
{
if (scores[class_id] > conf_threshold)
{
tmp = anchor[i];
ObjInfo result;
tmp1.cx = tmp.cx + boxes[0] * 0.1 * tmp.sx;
tmp1.cy = tmp.cy + boxes[1] * 0.1 * tmp.sy;
tmp1.sx = tmp.sx * exp(boxes[2] * 0.2);
tmp1.sy = tmp.sy * exp(boxes[3] * 0.2);
result.x1 = ((tmp1.cx - tmp1.sx / 2) * INPUT_W - img_pad_info.left) / img_pad_info.scale;
result.y1 = ((tmp1.cy - tmp1.sy / 2) * INPUT_H - img_pad_info.top) / img_pad_info.scale;
result.x2 = ((tmp1.cx + tmp1.sx / 2) * INPUT_W - img_pad_info.left) / img_pad_info.scale;
result.y2 = ((tmp1.cy + tmp1.sy / 2) * INPUT_H - img_pad_info.top) / img_pad_info.scale;
result.prob = scores[class_id];
result.class_id = class_id;
total_box.push_back(result);
}
}
}
std::sort(total_box.begin(), total_box.end(), cmp);
NMS(total_box, nms_threshold);
for (int j = 0; j < total_box.size(); ++j) {
obj.push_back(total_box[j]);
}
}
int main(int argc, char** argv)
{
// create a TensorRT model from the onnx model and serialize it to a stream
IHostMemory* trtModelStream{ nullptr };
TensorRT* ptensor_rt;
IExecutionContext* context = nullptr;
IRuntime* runtime = nullptr;
ICudaEngine* engine = nullptr;
// if a serialized engine already exists on disk, load it; otherwise build one from the ONNX model
if (_access(trtModelName.c_str(), 0) != -1)
{
ptensor_rt = LoadNet(trtModelName.c_str());
context = ptensor_rt->context;
runtime = ptensor_rt->runtime;
engine = ptensor_rt->engine;
}
else
{
if (!onnxToTRTModel(onnxModeName, 1, trtModelStream))
return 1;
assert(trtModelStream != nullptr);
std::cout << "Successfully parsed ONNX file!!!!" << std::endl;
// deserialize the engine
runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gArgs.useDLACore >= 0)
{
runtime->setDLACore(gArgs.useDLACore);
}
engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
context = engine->createExecutionContext();
assert(context != nullptr);
}
// Preprocess the input image
std::cout << "Start reading the input image!!!!" << std::endl;
cv::Mat image = cv::imread(locateFile("test.jpg", gArgs.dataDirs), cv::IMREAD_COLOR);
cv::Mat dst;
ImagePaddingInfo img_pad_info;
ImageScaleAndPadding(image, dst, INPUT_H, img_pad_info);
DebugP(dst.size());
float* net_input = normal(dst);
float* out[2];
out[0] = new float[OUTPUT_LEN * 4];
out[1] = new float[OUTPUT_LEN * CLASS_NUM];
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
double total = 0.0;
// run inference and cout time
auto t0 = Time::now();
doInference(*context, net_input, out, 1);
auto t1 = Time::now();
fsec fs = t1 - t0;
ms d = std::chrono::duration_cast<ms>(fs);
total += d.count();
// Post-process the network outputs
float* loc[OUTPUT_LEN];
float* conf[OUTPUT_LEN];
std::vector<ObjInfo> obj;
Flaten2Matrix(out[0], OUTPUT_LEN, 4, loc);
Flaten2Matrix(out[1], OUTPUT_LEN, CLASS_NUM, conf);
PostProcessing(loc, conf, img_pad_info, obj);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
std::cout << std::endl << "Running time of one image is:" << total << "ms" << std::endl;
return 0;
}
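Note that the host-side buffers are never released: modelMem allocated in LoadNet (which could be freed right after deserializeCudaEngine() returns), the out[] arrays, the net_input buffer from normal(), and the per-row arrays created by Flaten2Matrix all leak. For a single image this is harmless, but if you run inference in a loop a cleanup step along these lines helps (a sketch matching the buffer sizes used above):
// Free the host output buffers and the row pointers built by Flaten2Matrix.
for (int i = 0; i < OUTPUT_LEN; ++i)
{
delete[] loc[i];
delete[] conf[i];
}
delete[] out[0];
delete[] out[1];
free(net_input); // allocated with calloc() inside normal()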
image.h
#pragma once
#include <opencv2/opencv.hpp>
typedef struct {
int w;
int h;
int c;
float *data;
} image;
typedef struct {
int top;
int left;
float scale;
}ImagePaddingInfo;
float* normal(cv::Mat img);
int ImageScaleAndPadding(cv::Mat src_img, cv::Mat &dst_img, int target_size, ImagePaddingInfo &img_pad_info);
image.cpp
#include "image.h"
static const float kMean[3] = { 0.485f, 0.456f, 0.406f };
static const float kStdDev[3] = { 0.229f, 0.224f, 0.225f };
float* normal(cv::Mat img) {
//cv::Mat image(img.rows, img.cols, CV_32FC3);
float * data;
data = (float*)calloc(img.rows*img.cols * 3, sizeof(float));
for (int c = 0; c < 3; ++c)
{
for (int i = 0; i < img.rows; ++i)
{ // pointer to the first pixel of row i
cv::Vec3b *p1 = img.ptr<cv::Vec3b>(i);
//cv::Vec3b *p2 = image.ptr(i);
for (int j = 0; j < img.cols; ++j)
{
data[c * img.cols * img.rows + i * img.cols + j] = (p1[j][c] / 255.0f - kMean[c]) / kStdDev[c];
}
}
}
return data;
}
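One thing to double-check here: cv::imread returns pixels in BGR order, so the channel index c above iterates B, G, R, while kMean and kStdDev are the usual ImageNet statistics quoted in RGB order. Whether a swap is needed depends on how the model was trained; make sure the channel order and normalization match the training pipeline, and do the same in the Python preprocess used for the onnxruntime check.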
int ImageScaleAndPadding(cv::Mat src_img, cv::Mat &dst_img, int target_size, ImagePaddingInfo &img_pad_info)
{
int w = src_img.cols;
int h = src_img.rows;
int im_max_size = w > h ? w : h;
float ratio = target_size * 1.0 / im_max_size;
cv::Mat resize_img;
cv::resize(src_img, resize_img, cv::Size(), ratio, ratio, cv::INTER_LINEAR);
int resize_w = resize_img.cols;
int resize_h = resize_img.rows;
int dh = target_size - resize_h;
int dw = target_size - resize_w;
int top = dh / 2;
int bottom = dh - top;
int left = dw / 2;
int right = dw - left;
cv::copyMakeBorder(resize_img, dst_img, top, bottom, left, right, cv::BORDER_CONSTANT, 0);
img_pad_info.left = left;
img_pad_info.top = top;
img_pad_info.scale = ratio;
return 0;
}
tools.h
#pragma once
#include <vector>
#include <algorithm>
#include <cmath>
struct bbox {
float x1;
float y1;
float x2;
float y2;
float score;
};
struct box {
float cx;
float cy;
float sx;
float sy;
};
struct ObjInfo {
float x1; // left edge of the bbox
float y1; // top edge of the bbox
float x2; // right edge of the bbox
float y2; // bottom edge of the bbox
float prob; // confidence score
int class_id;
};
void NMS(std::vector<ObjInfo> &input_boxes, float NMS_THRESH);
void CreateAnchor(std::vector<box> &anchor, int w, int h);
tools.cpp
#include "tools.h"
void CreateAnchor(std::vector<box> &anchor, int w, int h)
{
anchor.clear();
std::vector<std::vector<int> > feature_map(4), min_sizes(4);
float steps[] = { 8, 16, 32, 64 };
for (int i = 0; i < feature_map.size(); ++i) {
feature_map[i].push_back(ceil(h / steps[i]));
feature_map[i].push_back(ceil(w / steps[i]));
}
std::vector<int> minsize1 = { 10, 16, 24 };
min_sizes[0] = minsize1;
std::vector<int> minsize2 = { 32, 48 };
min_sizes[1] = minsize2;
std::vector<int> minsize3 = { 64, 96 };
min_sizes[2] = minsize3;
std::vector<int> minsize4 = { 128, 192, 256 };
min_sizes[3] = minsize4;
for (int k = 0; k < feature_map.size(); ++k)
{
std::vector<int> min_size = min_sizes[k];
for (int i = 0; i < feature_map[k][0]; ++i)
{
for (int j = 0; j < feature_map[k][1]; ++j)
{
for (int l = 0; l < min_size.size(); ++l)
{
float s_kx = min_size[l] * 1.0 / w;
float s_ky = min_size[l] * 1.0 / h;
float cx = (j + 0.5) * steps[k] / w;
float cy = (i + 0.5) * steps[k] / h;
box axil = { cx, cy, s_kx, s_ky };
anchor.push_back(axil);
}
}
}
}
}
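For the 300x300 input used here, CreateAnchor yields ceil(300/8)^2 * 3 + ceil(300/16)^2 * 2 + ceil(300/32)^2 * 2 + ceil(300/64)^2 * 3 = 38*38*3 + 19*19*2 + 10*10*2 + 5*5*3 = 4332 + 722 + 200 + 75 = 5329 anchors, which is exactly the OUTPUT_LEN = 5329 constant in main.cpp; each anchor then carries 4 location offsets and CLASS_NUM confidence scores.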
void NMS(std::vector<ObjInfo> &input_boxes, float NMS_THRESH)
{
std::vector<float>vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i)
{
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
for (int i = 0; i < int(input_boxes.size()); ++i)
{
for (int j = i + 1; j < int(input_boxes.size());)
{
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= NMS_THRESH)
{
input_boxes.erase(input_boxes.begin() + j);
vArea.erase(vArea.begin() + j);
}
else
{
j++;
}
}
}
}