9月份我在极市平台参加视觉AI工程项目实训周,完成了两个项目开发,其中有个项目已完成封装部署,并且一次验收通过。
本文主要讲解极市平台算法封装开发的步骤(默认模型开发阶段已完成训练并得出分数),并默认使用 yolov5 训练出来的模型。
导出 onnx 模型:需要在模型开发实例中执行导出命令,生成 onnx 结构。
pip install onnx==1.9.0
pip install onnx-simplifier==0.4.1
# 切换到导出目录下
cd /project/train/src_repo/v5/export
# 运行导出onnx 命令。
python export.py --data data/data.yaml --weights /project/train/models/exp/weights/best.pt --simplify --include onnx
可视化软件 https://netron.app 。也可下载 https://github.com/lutzroeder/netron/releases
想验证导出是否正确,可在本地电脑上用上述可视化工具打开生成的 onnx,检查其是否只有一个输入、一个输出。
先下载模板代码路径: https://gitee.com/cvmart/ev_sdk_demo4.0_pedestrian_intrusion_yolov5.git
然后上传到平台
vscode
编辑器快捷键 (ctrl + ` ) 打开命令行(cmd)终端
在 vscode 命令行终端中,执行以下命令
wget -c https://extremevision-js-userfile.oss-cn-hangzhou.aliyuncs.com/user-29090-files/0ba628c3-9051-4659-95fd-e5fce36399c3/ev_sdk_demo4.0.zip
# 先解压下载好的压缩包,再拷贝到 ev_sdk 目录
unzip ev_sdk_demo4.0.zip
cp -r ev_sdk_demo4.0/* ./ev_sdk/
config/algo_config.json
注意这里的类别顺序,要与模型训练(pt)时的类别顺序保持一致。
src/Configuration.hpp
在该文件中,将容器 targetRectTextMap 的内容修改为所参加比赛的类别。
src/SampleAlgorithm.cpp
修改模型 onnx 路径。
src/SampleDetector.cpp
模型推理。这里我没用官方的,采用了修改过后的TensorRT 推理代码,比官方快,精度高。
源代码CPP 具体实现。
#include <sys/stat.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

#include "SampleDetector.h"
#include "opencv2/imgproc.hpp"
#include "opencv2/imgcodecs.hpp"
#include "ji_utils.h"
#include "./logging.h"
// Presumably the input/output tensor names of the exported ONNX model.
// NOTE(review): neither macro is referenced in the visible part of this file,
// which addresses the bindings by index (0 and 1) instead - confirm whether
// they are still needed.
#define INPUT_NAME "images"
#define OUTPUT_NAME "output"
// Makes the TensorRT types used below (IBuilder, IRuntime, ILogger, ...)
// available without the nvinfer1:: prefix.
using namespace nvinfer1;
// Wraps a CUDA runtime call so that a failure is reported together with the
// text of the call expression and the file/line of the call site.
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)

/// Reports a failed CUDA runtime call on stdout.
/// @param code  status value returned by the CUDA call
/// @param op    text of the expression that produced `code`
/// @param file  source file of the call site
/// @param line  source line of the call site
/// @return true when `code` is cudaSuccess; false otherwise (after printing
///         the error name and message).
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code == cudaSuccess){
        return true;
    }

    const char* failure_name = cudaGetErrorName(code);
    const char* failure_text = cudaGetErrorString(code);
    printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n", file, line, op, failure_name, failure_text);
    return false;
}
/// Tells whether `FileName` can be stat()-ed.
/// @param FileName  path to probe
/// @return true when stat() succeeds on the path, false otherwise.
static bool ifFileExists(const char *FileName)
{
    struct stat path_info;
    const int status = stat(FileName, &path_info);
    return status == 0;
}
/// Creates a detector that owns no GPU resources yet; Init() acquires them.
SampleDetector::SampleDetector()
{
    // The destructor calls UnInit(), which releases everything unless
    // m_bUninit is true. Until Init() has run there is nothing to release, so
    // mark the object as "already uninitialised" to keep the destructor from
    // freeing handles that were never created.
    m_bUninit = true;
}
// 加载onnx ,转换trt.
void SampleDetector::loadOnnx(const std::string strModelName)
{
Logger gLogger;
//根据tensorrt pipeline 构建网络
IBuilder* builder = createInferBuilder(gLogger);
builder->setMaxBatchSize(1);
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
parser->parseFromFile(strModelName.c_str(), static_cast<int>(ILogger::Severity::kWARNING));
IBuilderConfig* config = builder->createBuilderConfig();
config->setMaxWorkspaceSize(1ULL << 28);
m_CudaEngine = builder->buildEngineWithConfig(*network, *config);
std::string strTrtName = strModelName;
size_t sep_pos = strTrtName.find_last_of(".");
strTrtName = strTrtName.substr(0, sep_pos) + ".trt";
IHostMemory *gieModelStream = m_CudaEngine->serialize();
std::string serialize_str;
std::ofstream serialize_output_stream;
serialize_str.resize(gieModelStream->size());
memcpy((void*)serialize_str.data(),gieModelStream->data(),gieModelStream->size());
serialize_output_stream.open(strTrtName.c_str());
serialize_output_stream<<serialize_str;
serialize_output_stream.close();
m_CudaContext = m_CudaEngine->createExecutionContext();
parser->destroy();
network->destroy();
config->destroy();
builder->destroy();
}
// 加载TRT
void SampleDetector::loadTrt(const std::string strName)
{
Logger gLogger;
IRuntime* runtime = createInferRuntime(gLogger);
std::ifstream fin(strName);
std::string cached_engine = "";
while (fin.peek() != EOF)
{
std::stringstream buffer;
buffer << fin.rdbuf();
cached_engine.append(buffer.str());
}
fin.close();
m_CudaEngine = runtime->deserializeCudaEngine(cached_engine.data(), cached_engine.size(), nullptr);
m_CudaContext = m_CudaEngine->createExecutionContext();
runtime->destroy();
}
// 初始化
bool SampleDetector::Init(const std::string& strModelName, float thresh)
{
mThresh = thresh;
std::string strTrtName = strModelName;
size_t sep_pos = strTrtName.find_last_of(".");
strTrtName = strTrtName.substr(0, sep_pos) + ".trt";
if(ifFileExists(strTrtName.c_str()))
{
loadTrt(strTrtName);
}
else
{
loadOnnx(strModelName);
}
// 输入
int input_batch = 1;
int input_channel = 3;
m_input_height = 640;
m_input_width = 640;
m_input_numel = input_batch * input_channel * m_input_height * m_input_width;
checkRuntime(cudaMallocHost(&m_input_data_host, m_input_numel * sizeof(float)));
checkRuntime(cudaMalloc(&m_input_data_device, m_input_numel * sizeof(float)));
// 输出
auto output_dims = m_CudaEngine->getBindingDimensions(1);
m_output_numbox = output_dims.d[1];
m_output_numprob = output_dims.d[2];
m_num_classes = m_output_numprob - 5;
m_output_numel = input_batch * m_output_numbox * m_output_numprob;
checkRuntime(cudaMallocHost(&m_output_data_host, sizeof(float) * m_output_numel));
checkRuntime(cudaMalloc(&m_output_data_device, sizeof(float) * m_output_numel));
cudaStreamCreate(&m_CudaStream);
m_bUninit = false;
cv::Mat frame = cv::Mat::ones(cv::Size(m_input_width, m_input_height), CV_8UC3);
// 预加载
std::vector<BoxInfo> DetObjs;
ProcessImage(frame, DetObjs, 0.1);
}
/// Releases the CUDA stream, the host/device I/O buffers, the execution
/// context and the engine.
/// @return false when the detector was already uninitialised (nothing is
///         done), true after the resources have been released.
bool SampleDetector::UnInit()
{
    if(m_bUninit == true)
    {
        return false;
    }

    checkRuntime(cudaStreamDestroy(m_CudaStream));
    checkRuntime(cudaFreeHost(m_input_data_host));
    checkRuntime(cudaFreeHost(m_output_data_host));
    checkRuntime(cudaFree(m_input_data_device));
    checkRuntime(cudaFree(m_output_data_device));
    // Drop the stale addresses so nothing can reuse freed memory.
    m_input_data_host = nullptr;
    m_output_data_host = nullptr;
    m_input_data_device = nullptr;
    m_output_data_device = nullptr;

    // The context is destroyed before the engine it was created from.
    if (m_CudaContext != nullptr)
    {
        m_CudaContext->destroy();
        m_CudaContext = nullptr;
    }
    if (m_CudaEngine != nullptr)
    {
        m_CudaEngine->destroy();
        m_CudaEngine = nullptr;
    }

    m_bUninit = true;
    return true;
}
/// Releases the detector's resources through UnInit(); its result is not used.
SampleDetector::~SampleDetector()
{
    static_cast<void>(UnInit());
}
/// Runs the detector on one image.
///
/// @param image    input image; must be a non-empty CV_8UC3 matrix because
///                 ProImg() reads three bytes per pixel.
/// @param DetObjs  receives the detections (cleared first), in the
///                 coordinates of `image`.
/// @param thresh   stored in mThresh and used both as the confidence
///                 threshold and as the NMS IoU threshold.
/// @return true on success; false when the image is unusable or a
///         CUDA/TensorRT call failed (DetObjs is then empty).
bool SampleDetector::ProcessImage(const cv::Mat& image, std::vector<BoxInfo>& DetObjs, float thresh)
{
    mThresh = thresh;
    DetObjs.clear();

    if (image.empty() || image.type() != CV_8UC3)
    {
        printf("ProcessImage: expected a non-empty 8-bit 3-channel image\n");
        return false;
    }

    // Fill the pinned host input buffer, then upload it.
    ProImg(image);
    bool ok = checkRuntime(cudaMemcpyAsync(m_input_data_device, m_input_data_host, m_input_numel * sizeof(float), cudaMemcpyHostToDevice, m_CudaStream));

    if (ok)
    {
        // Tell the context the input size used for this inference (batch = 1).
        auto input_dims = m_CudaEngine->getBindingDimensions(0);
        input_dims.d[0] = 1;
        m_CudaContext->setBindingDimensions(0, input_dims);

        void* bindings[] = {m_input_data_device, m_output_data_device};
        ok = m_CudaContext->enqueueV2(bindings, m_CudaStream, nullptr);
        if (!ok)
        {
            printf("ProcessImage: TensorRT enqueueV2 failed\n");
        }
    }
    if (ok)
    {
        ok = checkRuntime(cudaMemcpyAsync(m_output_data_host, m_output_data_device, sizeof(float) * m_output_numel, cudaMemcpyDeviceToHost, m_CudaStream));
    }
    // Always drain the stream, even after a failure, so no transfer is still
    // using the host buffers when this function returns.
    ok = checkRuntime(cudaStreamSynchronize(m_CudaStream)) && ok;
    if (!ok)
    {
        return false;
    }

    decode_outputs(DetObjs, thresh);
    // NOTE(review): the same value serves as confidence threshold and as IoU
    // threshold for NMS - confirm that this is intended.
    runNms(DetObjs, thresh);
    return true;
}
/// Letterboxes `image` into the network input and writes it, normalised to
/// [0, 1], into m_input_data_host as three consecutive planes. Also stores the
/// inverse (network -> image) affine transform in m_d2i.
/// @param image  source image; three interleaved 8-bit channels are read per
///               pixel (assumes OpenCV's default BGR order - TODO confirm).
void SampleDetector::ProImg(const cv::Mat &image)
{
    // One scale for both axes, chosen so the whole image fits into the input.
    const float ratio_w = m_input_width / (float)image.cols;
    const float ratio_h = m_input_height / (float)image.rows;
    const float scale = std::min(ratio_w, ratio_h);

    // Offsets that centre the scaled image inside the network input.
    const float offset_x = (-scale * image.cols + m_input_width + scale - 1) * 0.5;
    const float offset_y = (-scale * image.rows + m_input_height + scale - 1) * 0.5;

    // Image -> network affine matrix (row major, 2x3).
    float i2d[6] = {scale, 0.0f, offset_x,
                    0.0f, scale, offset_y};
    cv::Mat forward(2, 3, CV_32F, i2d);
    // This header wraps m_d2i, so the inverse is written straight into the member.
    cv::Mat backward(2, 3, CV_32F, m_d2i);
    cv::invertAffineTransform(forward, backward);

    // Resize and pad in one step; uncovered pixels are filled with 114.
    cv::Mat letterboxed(m_input_height, m_input_width, CV_8UC3);
    cv::warpAffine(image, letterboxed, forward, letterboxed.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));

    const int pixel_count = letterboxed.cols * letterboxed.rows;
    const unsigned char* pixels = letterboxed.data;
    float* plane0 = m_input_data_host;
    float* plane1 = m_input_data_host + pixel_count;
    float* plane2 = m_input_data_host + pixel_count * 2;
    for(int idx = 0; idx < pixel_count; ++idx){
        const unsigned char* px = pixels + idx * 3;
        // Channel order is reversed: source channel 0 goes to the last plane,
        // source channel 2 to the first one.
        plane2[idx] = px[0] / 255.0f;
        plane1[idx] = px[1] / 255.0f;
        plane0[idx] = px[2] / 255.0f;
    }
}
void SampleDetector::runNms(std::vector<BoxInfo>& DetObjs, float thresh)
{
std::sort(DetObjs.begin(), DetObjs.end(), [](BoxInfo& a, BoxInfo& b){return a.score > b.score;});
std::vector<bool> remove_flags(DetObjs.size());
std::vector<BoxInfo> box_result;
box_result.reserve(DetObjs.size());
auto iou = [](const BoxInfo& a, const BoxInfo& b){
float cross_left = std::max(a.x1, b.x1);
float cross_top = std::max(a.y1, b.y1);
float cross_right = std::min(a.x2, b.x2);
float cross_bottom = std::min(a.y2, b.y2);
float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
float union_area = std::max(0.0f, a.x2 - a.x1) * std::max(0.0f, a.y2 - a.y1)
+ std::max(0.0f, b.x2 - b.x1) * std::max(0.0f, b.y2 - b.y1) - cross_area;
if(cross_area == 0 || union_area == 0) return 0.0f;
return cross_area / union_area;
};
for(int i = 0; i < DetObjs.size(); ++i){
if(remove_flags[i]) continue;
auto& ibox = DetObjs[i];
box_result.emplace_back(ibox);
for(int j = i + 1; j < DetObjs.size(); ++j){
if(remove_flags[j]) continue;
auto& jbox = DetObjs[j];
if(ibox.label == jbox.label){
if(iou(ibox, jbox) >= thresh)
remove_flags[j] = true;
}
}
}
DetObjs = box_result;
}
/// Converts the raw network output in m_output_data_host into boxes in
/// original-image coordinates.
///
/// Each of the m_output_numbox rows holds m_output_numprob floats, read as
/// cx, cy, width, height, objectness, then m_num_classes class scores.
///
/// @param DetObjs  receives one entry per row whose objectness and
///                 objectness * best class score both reach `thresh`
///                 (entries are appended).
/// @param thresh   confidence threshold.
void SampleDetector::decode_outputs(std::vector<BoxInfo>& DetObjs, float thresh)
{
    // Without class scores there is nothing to pick a label from.
    if(m_num_classes <= 0)
        return;

    const float confidence_threshold = thresh;
    for(int i = 0; i < m_output_numbox; ++i){
        const float* ptr = m_output_data_host + i * m_output_numprob;
        const float objness = ptr[4];
        if(objness < confidence_threshold)
            continue;

        // Best class for this row. The row ends right after the class scores
        // (m_num_classes == m_output_numprob - 5), so nothing past
        // pclass + m_num_classes may be read.
        const float* pclass = ptr + 5;
        const int label = static_cast<int>(std::max_element(pclass, pclass + m_num_classes) - pclass);
        const float prob = pclass[label];
        const float confidence = prob * objness;
        if(confidence < confidence_threshold)
            continue;

        // Centre/size -> corner coordinates, still in network input space.
        const float cx = ptr[0];
        const float cy = ptr[1];
        const float width = ptr[2];
        const float height = ptr[3];
        const float left = cx - width * 0.5;
        const float top = cy - height * 0.5;
        const float right = cx + width * 0.5;
        const float bottom = cy + height * 0.5;

        // Map back to the original image with the inverse transform stored by
        // ProImg(). m_d2i[0] is applied to both axes; ProImg() builds the
        // transform with a single scale factor.
        const float image_base_left = m_d2i[0] * left + m_d2i[2];
        const float image_base_right = m_d2i[0] * right + m_d2i[2];
        const float image_base_top = m_d2i[0] * top + m_d2i[5];
        const float image_base_bottom = m_d2i[0] * bottom + m_d2i[5];
        DetObjs.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, confidence, label});
    }
}
至此,已修改好推理代码。
#编译SDK库
mkdir -p /usr/local/ev_sdk/build
cd /usr/local/ev_sdk/build
cmake ..
make install
#编译测试工具
mkdir -p /usr/local/ev_sdk/test/build
cd /usr/local/ev_sdk/test/build
cmake ..
make install
# 调试
cd /usr/local/ev_sdk/bin/
# 这里路径,对应修改为上面的数据名称。
./test-ji-api -f 1 -i /project/inputs/kouzhao1.mp4 -o result.mp4
然后发起调试,执行命令 bash xxx/debug_gpu.sh
这是一个能学到算法开发、模型部署等一系列流程的真实项目,群里有大佬指导,又不用花钱。而且完成上分要求,还有奖金拿,这极市,爱了爱了。
在进行算法开发、模型部署时,一定要仔细看官方文档和 demo 源代码教程(有些坑已经描述了),最好先从官方提供的源代码例子开始,熟悉整个算法训练、模型部署的流程。熟悉之后,可以按照自己的想法进行修改。