杜老师推出的 tensorRT从零起步高性能部署 课程,之前有看过一遍,但是没有做笔记,很多东西也忘了。这次重新撸一遍,顺便记记笔记。
本次课程学习 tensorRT 高级-使用 openvino 进行 onnx 的模型推理过程
课程大纲可看下面的思维导图
这节课程我们学习 openvino 案例
1. OpenVINO 是 Intel 开发的推理引擎,面向 Intel 自家的计算设备(例如 Intel x86 架构的 CPU)
2. 它可以利用 CPU 发挥出最好的性能,还能使用到新款 CPU 提供的 NN 运算能力
3. 它还可以利用外置的计算棒实现更好的推理性能
我们来看案例,二话不说先去执行下 make run,如下所示:
推理成功了,起码说明代码和 openvino 库没问题
推理效果如下图所示:
我们来看下代码是怎么工作的,完整的代码如下所示:
#include "openvino/openvino.hpp"
// system include
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
// Class names of the COCO dataset (see https://cocodataset.org/#home).
// inference() indexes this table with the predicted class id to build box captions.
static const char* cocolabels[] = {
"person", "bicycle", "car", "motorcycle", "airplane",
"bus", "train", "truck", "boat", "traffic light", "fire hydrant",
"stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
"sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
"umbrella", "handbag", "tie", "suitcase", "frisbee", "skis",
"snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
"skateboard", "surfboard", "tennis racket", "bottle", "wine glass",
"cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
"orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
"chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
"laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
"oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
"scissors", "teddy bear", "hair drier", "toothbrush"
};
// Converts an HSV colour to 8-bit BGR.
//   h, s, v : hue, saturation and value; the sector arithmetic below treats
//             h * 6 as a position on a six-sector colour wheel.
//   returns : (blue, green, red), each channel scaled by 255 and truncated.
// A hue whose sector index falls outside 0..5 (e.g. h == 1) yields white.
static std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v){
    const int sector = static_cast<int>(h * 6);
    const float frac = h * 6 - sector;               // position inside the sector
    const float floor_level = v * (1 - s);
    const float falling = v * (1 - frac*s);
    const float rising = v * (1 - (1 - frac) * s);

    // Start from the fallback (white) and overwrite for the six valid sectors.
    float red = 1, green = 1, blue = 1;
    if (sector == 0)      { red = v;           green = rising;      blue = floor_level; }
    else if (sector == 1) { red = falling;     green = v;           blue = floor_level; }
    else if (sector == 2) { red = floor_level; green = v;           blue = rising; }
    else if (sector == 3) { red = floor_level; green = falling;     blue = v; }
    else if (sector == 4) { red = rising;      green = floor_level; blue = v; }
    else if (sector == 5) { red = v;           green = floor_level; blue = falling; }

    const auto to_byte = [](float channel){ return static_cast<uint8_t>(channel * 255); };
    return std::make_tuple(to_byte(blue), to_byte(green), to_byte(red));
}
// Derives a deterministic pseudo-random BGR colour from an integer id.
// The id is bit-mixed into a hue and a saturation, each in [0, 0.99];
// the value component is fixed at 1.
static std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id){
    const unsigned int key = static_cast<unsigned int>(id);
    const unsigned int hue_percent = ((key << 2) ^ 0x937151) % 100;
    const unsigned int sat_percent = ((key << 3) ^ 0x315793) % 100;
    const float hue = hue_percent / 100.0f;
    const float saturation = sat_percent / 100.0f;
    return hsv2bgr(hue, saturation, 1);
}
void inference(){
size_t input_batch = 1;
size_t input_channel = 3;
size_t input_height = 640;
size_t input_width = 640;
ov::Core core;
auto model = core.compile_model("yolov5s.onnx");
auto iq = model.create_infer_request();
auto input = iq.get_input_tensor(0);
auto output = iq.get_output_tensor(0);
input.set_shape({input_batch, input_channel, input_height, input_width});
float* input_data_host = input.data<float>();
///
// letter box
auto image = cv::imread("car.jpg");
// 通过双线性插值对图像进行resize
float scale_x = input_width / (float)image.cols;
float scale_y = input_height / (float)image.rows;
float scale = std::min(scale_x, scale_y);
float i2d[6], d2i[6];
// resize图像,源图像和目标图像几何中心的对齐
i2d[0] = scale; i2d[1] = 0; i2d[2] = (-scale * image.cols + input_width + scale - 1) * 0.5;
i2d[3] = 0; i2d[4] = scale; i2d[5] = (-scale * image.rows + input_height + scale - 1) * 0.5;
cv::Mat m2x3_i2d(2, 3, CV_32F, i2d); // image to dst(network), 2x3 matrix
cv::Mat m2x3_d2i(2, 3, CV_32F, d2i); // dst to image, 2x3 matrix
cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); // 计算一个反仿射变换
cv::Mat input_image(input_height, input_width, CV_8UC3);
cv::warpAffine(image, input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114)); // 对图像做平移缩放旋转变换,可逆
cv::imwrite("input-image.jpg", input_image);
int image_area = input_image.cols * input_image.rows;
unsigned char* pimage = input_image.data;
float* phost_b = input_data_host + image_area * 0;
float* phost_g = input_data_host + image_area * 1;
float* phost_r = input_data_host + image_area * 2;
for(int i = 0; i < image_area; ++i, pimage += 3){
// 注意这里的顺序rgb调换了
*phost_r++ = pimage[0] / 255.0f;
*phost_g++ = pimage[1] / 255.0f;
*phost_b++ = pimage[2] / 255.0f;
}
///
iq.infer();
int output_numbox = output.get_shape()[1];
int output_numprob = output.get_shape()[2];
int num_classes = output_numprob - 5;
float* output_data_host = output.data<float>();
// decode box:从不同尺度下的预测狂还原到原输入图上(包括:预测框,类被概率,置信度)
vector<vector<float>> bboxes;
float confidence_threshold = 0.25;
float nms_threshold = 0.5;
for(int i = 0; i < output_numbox; ++i){
float* ptr = output_data_host + i * output_numprob;
float objness = ptr[4];
if(objness < confidence_threshold)
continue;
float* pclass = ptr + 5;
int label = std::max_element(pclass, pclass + num_classes) - pclass;
float prob = pclass[label];
float confidence = prob * objness;
if(confidence < confidence_threshold)
continue;
// 中心点、宽、高
float cx = ptr[0];
float cy = ptr[1];
float width = ptr[2];
float height = ptr[3];
// 预测框
float left = cx - width * 0.5;
float top = cy - height * 0.5;
float right = cx + width * 0.5;
float bottom = cy + height * 0.5;
// 对应图上的位置
float image_base_left = d2i[0] * left + d2i[2];
float image_base_right = d2i[0] * right + d2i[2];
float image_base_top = d2i[0] * top + d2i[5];
float image_base_bottom = d2i[0] * bottom + d2i[5];
bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
}
printf("decoded bboxes.size = %d\n", bboxes.size());
// nms非极大抑制
std::sort(bboxes.begin(), bboxes.end(), [](vector<float>& a, vector<float>& b){return a[5] > b[5];});
std::vector<bool> remove_flags(bboxes.size());
std::vector<vector<float>> box_result;
box_result.reserve(bboxes.size());
auto iou = [](const vector<float>& a, const vector<float>& b){
float cross_left = std::max(a[0], b[0]);
float cross_top = std::max(a[1], b[1]);
float cross_right = std::min(a[2], b[2]);
float cross_bottom = std::min(a[3], b[3]);
float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
float union_area = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1])
+ std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]) - cross_area;
if(cross_area == 0 || union_area == 0) return 0.0f;
return cross_area / union_area;
};
for(int i = 0; i < bboxes.size(); ++i){
if(remove_flags[i]) continue;
auto& ibox = bboxes[i];
box_result.emplace_back(ibox);
for(int j = i + 1; j < bboxes.size(); ++j){
if(remove_flags[j]) continue;
auto& jbox = bboxes[j];
if(ibox[4] == jbox[4]){
// class matched
if(iou(ibox, jbox) >= nms_threshold)
remove_flags[j] = true;
}
}
}
printf("box_result.size = %d\n", box_result.size());
for(int i = 0; i < box_result.size(); ++i){
auto& ibox = box_result[i];
float left = ibox[0];
float top = ibox[1];
float right = ibox[2];
float bottom = ibox[3];
int class_label = ibox[4];
float confidence = ibox[5];
cv::Scalar color;
tie(color[0], color[1], color[2]) = random_color(class_label);
cv::rectangle(image, cv::Point(left, top), cv::Point(right, bottom), color, 3);
auto name = cocolabels[class_label];
auto caption = cv::format("%s %.2f", name, confidence);
int text_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width + 10;
cv::rectangle(image, cv::Point(left-3, top-33), cv::Point(left + text_width, top), color, -1);
cv::putText(image, caption, cv::Point(left, top-5), 0, 1, cv::Scalar::all(0), 2, 16);
}
cv::imwrite("image-draw.jpg", image);
}
// Program entry point: runs the detection demo once, then exits with status 0.
int main(){
    inference();
    return 0;
}
在 inference 函数中,一上来就定义了 ov::Core,然后通过 core.compile_model 编译拿到模型,接着创建推理请求,由于是动态 batch,因此需要 set_shape 指定一下推理时的 shape 大小,然后对输入图像进行预处理,和之前依旧一模一样,将预处理后的图像塞到 model 中,通过 iq.infer() 非常简单的语句即可完成推理,推理完成后从 output_data 中进行后处理即可恢复成框
上述就是 openvino 的整个流程,相对来说还是比较简单的
大家会发现做部署这块无非就是一个数据输入、推理、数据输出解码,所以你会发现看了这么多框架,无论是 tensorRT、onnxruntime 还是 openvino,本质上来说其流程都是一样的,只是说我中间推理的东西换一换,所以说大家无论是上到嵌入式上设备也好,其它平台也好,它本质上来讲都是这么回事,所以说你要学的核心是怎样把你的图像高效的做一个预处理,怎么去高效的描述你这个预处理过程,怎么高效的去得到你的结果。比如你在 CPU 上走的是 CPU 代码,你在 GPU 上你走的就是 CUDA 核函数
这是我们针对部署时的一些思考,还是需要我们大家多去动手实践。
关于 openvino 推理步骤如下:
本次课程我们学习了 openvino 推理,回顾我们之前学习的 tensorRT 和 onnxruntime 框架,大家或许会发现这些框架其实大差不差,无非是准备数据塞到框架中,拿到推理后的结果进行后处理,我们能把握的其实就是数据的预处理和后处理部分,至于中间的推理部分我们其实是没办法控制的,因此为了实现高性能,我们就要想办法把数据的预处理和后处理部分尽可能的高效处理,这才是我们需要学习的核心。