一直陷在物体检测的坑里出不来了。谁让这坑如此之深 ! 继续加点深度 ! 自己训练一个yolov5 object detect 模型 再跑跑 tensorRT 加速吧。
技术主题:
使用 yolov5s 训练人手检测模型,并用 TensorRT 加速。
https://www.robots.ox.ac.uk/~vgg/data/hands/
参考:
https://github.com/coldlarry/YOLOv3-complete-pruning/blob/master/data/converter.py
这个数据集下载下来只有 5000 多张图片,训练显得有点少。怎么多快好省地步入共产主义? 写个脚本把图片镜像一下吧,1w 张的数据集就有了。
为训练准备 yolov5/data/hand.yaml
train: /workspace/data/hand_dataset/images/train/
val: /workspace/data/hand_dataset/images/validation/
# number of classes
nc: 1
# class names
names: ['hand']
我是先到了一个终点栽了坑,又反向传播回到这里,建议:
jetson nano 下 跑训练的模型 img-size 就用 416 (为了检测速度快)
yolov5 为使用 3.1 版本 (为了能成功导出 tensorRT)
yolov5 下载:
https://github.com/ultralytics/yolov5/tags
同时下载 yolov5s.pt
yolov5/ 下执行:
python3 train.py --img 416 --batch 16 --epochs 300 --data hand.yaml --weights yolov5s.pt --cfg yolov5s.yaml --cache-images --single-cls
最后的训练结果:
P->0.8322 , Recall->0.9663 , map->0.6981
294/299 1.82G 0.02242 0.0224 0 0.04482 74 416 0.8299 0.9553 0.9667 0.6961 0.02342 0.02203 0
295/299 1.82G 0.02225 0.02221 0 0.04446 68 416 0.83 0.9553 0.9668 0.6966 0.02339 0.02202 0
296/299 1.82G 0.0224 0.02195 0 0.04435 50 416 0.8312 0.9553 0.9669 0.6975 0.02336 0.022 0
297/299 1.82G 0.02225 0.02205 0 0.0443 65 416 0.8326 0.9551 0.9668 0.6982 0.02334 0.02199 0
298/299 1.82G 0.02235 0.02194 0 0.04429 69 416 0.8325 0.9551 0.9667 0.6981 0.02331 0.02198 0
299/299 1.82G 0.02256 0.02214 0 0.0447 52 416 0.8322 0.9549 0.9663 0.6981 0.02329 0.02197 0
python3 detect.py --source 0 --weights runs/exp10/weights/best.pt
VID
把训练好的 模型 best.pt 放到 jetson nano下可跟开发环境同样验证。
测试速度 : 640x360@8fps
python3 detect.py --source test_dir/ --weights runs/exp10/weights/best.pt
python3 models/export.py --weights runs/exp10/weights/best.pt --img-size 416
得到 onnx 模型用 onnxruntime 运行. onnxruntime 编译了四个多小时 , 检测速度没多少提升呢。
# 部分代码
def main():
    """Run one ONNX Runtime inference on /tmp/in.jpg and print per-stage timings.

    Stages timed: image preprocessing, model loading, inference, and box
    extraction + drawing.

    Raises:
        FileNotFoundError: if /tmp/in.jpg cannot be read by OpenCV.
    """
    img_raw = cv2.imread('/tmp/in.jpg')
    if img_raw is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError("cannot read image: /tmp/in.jpg")
    print(img_raw.shape)

    t0 = time.time()
    img, TestData = process_image_raw(img_raw)
    t1 = time.time()
    print("process img raw cost = %.1f ms " %( 1000 * (t1 - t0)))

    session = onnxruntime.InferenceSession("hand_best_300epoch.onnx")
    t2 = time.time()
    print("load onnx models cost = %.1f ms " %( 1000 * (t2 - t1)))

    inname = session.get_inputs()[0].name
    outname = [output.name for output in session.get_outputs()]
    print("inputs name:",inname,"outputs name:",outname)

    # Start the inference clock here so the name lookup and the print above
    # are not counted as inference time.
    t_run = time.time()
    prediction = session.run(outname, {inname:TestData})
    t3 = time.time()
    print("infer cost = %.1f ms " %( 1000 * (t3 - t_run)))

    # NOTE(review): 0.25 / 0.6 are presumably the confidence and NMS
    # thresholds -- confirm against getBoxes().
    boxes = getBoxes(prediction,0.25,0.6)
    drawBox(boxes,img)
    t4 = time.time()
    print("draw box cost = %.1f ms " %( 1000 * (t4 - t3)))


if __name__ == "__main__":
    main()
yolov5s.pt -> yolov5s.wts -> yolov5s.engine
参考这里:
https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5
划重点:
yololayer.h 里:
把 CLASS_NUM, INPUT_W, INPUT_H 都改了
--- a/yolov5/yololayer.h
+++ b/yolov5/yololayer.h
@@ -16,9 +16,9 @@ namespace Yolo
float anchors[CHECK_COUNT * 2];
};
static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
- static constexpr int CLASS_NUM = 80;
- static constexpr int INPUT_H = 608;
- static constexpr int INPUT_W = 608;
+ static constexpr int CLASS_NUM = 1;
+ static constexpr int INPUT_H = 416;
+ static constexpr int INPUT_W = 416;
终于, 编译出来的 可执行程序 yolov5 -d 测试可以到 40ms 一帧的速度
可是。。 我需要的是 库。
可是。。我想在python 下用。
有个 yolov5_trt.py, 一运行 内存吃没了。。 卡死不得其解。
自己包装吧
//yolov5_lib.h
// C-linkage entry points of the YOLOv5 TensorRT wrapper library.
// NOTE(review): this header names cv::Mat but includes no OpenCV header
// itself; it appears to rely on the including file to have done so -- confirm.
#pragma once
#ifdef __cplusplus
extern "C"
{
#endif
// Loads the serialized TensorRT engine stored in the file `engine_name` and
// returns an opaque handle to it. Returns NULL when the file cannot be opened.
void * yolov5_trt_create(const char * engine_name);
// Runs detection on `img` with handle `h`, passing `threshold` to the NMS
// step as the confidence threshold. Returns a text description of the result;
// the string lives in a buffer inside the handle, so it is overwritten by the
// next call on the same handle and must not be freed by the caller.
const char * yolov5_trt_detect(void *h, cv::Mat &img, float threshold);
// Releases the handle returned by yolov5_trt_create() and everything it owns.
void yolov5_trt_destroy(void *h);
#ifdef __cplusplus
}
#endif
~
//yolov5_lib.cpp
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include "yolov5_lib.h"
// Build-time configuration of the wrapper library.
#define USE_FP16 // comment out this if want to use FP32
#define DEVICE 0 // GPU id
// IoU threshold handed to nms() by yolov5_trt_detect().
#define NMS_THRESH 0.4
// NOTE(review): not referenced in the visible part of this file --
// yolov5_trt_detect() takes its confidence threshold as a parameter instead.
#define CONF_THRESH 0.5
// Number of images per inference call; used to size every host/device buffer.
#define BATCH_SIZE 1
// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1; // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
// Binding names looked up in the deserialized engine by yolov5_trt_create().
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
// Logger instance passed to createInferRuntime().
static Logger gLogger;
// Copies one input batch to the device, enqueues inference on `stream`,
// copies the output back to `output`, and blocks until the stream has drained.
// buffers[0] is used as the device input and buffers[1] as the device output.
static void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    const size_t inputBytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const size_t outputBytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Host -> device transfer of the preprocessed image data.
    CHECK(cudaMemcpyAsync(buffers[0], input, inputBytes, cudaMemcpyHostToDevice, stream));

    // Asynchronous inference on the same stream.
    context.enqueue(batchSize, buffers, stream, nullptr);

    // Device -> host transfer of the raw detection output.
    CHECK(cudaMemcpyAsync(output, buffers[1], outputBytes, cudaMemcpyDeviceToHost, stream));

    // Wait for all three queued operations to finish before returning.
    cudaStreamSynchronize(stream);
}
// Per-engine state behind the opaque handle of the yolov5_trt_* functions.
struct Yolov5TRTContext
{
    // Host input buffer: BATCH_SIZE * 3 * INPUT_H * INPUT_W floats.
    float *data;
    // Host output buffer: BATCH_SIZE * OUTPUT_SIZE floats.
    float *prob;

    // TensorRT objects, created in yolov5_trt_create().
    IRuntime *runtime;
    ICudaEngine *engine;
    IExecutionContext *exe_context;

    // Device buffers, indexed by inputIndex / outputIndex.
    void* buffers[2];
    // CUDA stream used for the copies and the inference.
    cudaStream_t cuda_stream;
    // Engine binding indices of the input and output blobs.
    int inputIndex;
    int outputIndex;

    // Storage for the result string returned by yolov5_trt_detect().
    char result_json_str[16384];
};
oid * yolov5_trt_create(const char * engine_name)
{
size_t size = 0;
char *trtModelStream = NULL;
Yolov5TRTContext * trt_ctx = NULL;
trt_ctx = new Yolov5TRTContext();
std::ifstream file(engine_name, std::ios::binary);
printf("yolov5_trt_create ... \n");
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}else
return NULL;
trt_ctx->data = new float[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
trt_ctx->prob = new float[BATCH_SIZE * OUTPUT_SIZE];
trt_ctx->runtime = createInferRuntime(gLogger);
assert(trt_ctx->runtime != nullptr);
printf("yolov5_trt_create cuda engine... \n");
trt_ctx->engine = trt_ctx->runtime->deserializeCudaEngine(trtModelStream, size);
assert(trt_ctx->engine != nullptr);
trt_ctx->exe_context = trt_ctx->engine->createExecutionContext();
delete[] trtModelStream;
assert(trt_ctx->engine->getNbBindings() == 2);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
trt_ctx->inputIndex = trt_ctx->engine->getBindingIndex(INPUT_BLOB_NAME);
trt_ctx->outputIndex = trt_ctx->engine->getBindingIndex(OUTPUT_BLOB_NAME);
assert(trt_ctx->inputIndex == 0);
assert(trt_ctx->outputIndex == 1);
// Create GPU buffers on device
printf("yolov5_trt_create buffer ... \n");
CHECK(cudaMalloc(&trt_ctx->buffers[trt_ctx->inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&trt_ctx->buffers[trt_ctx->outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
// Create stream
printf("yolov5_trt_create stream ... \n");
CHECK(cudaStreamCreate(&trt_ctx->cuda_stream));
printf("yolov5_trt_create done ... \n");
return (void *)trt_ctx;
}
const char * yolov5_trt_detect(void *h, cv::Mat &img, float threshold)
{
Yolov5TRTContext *trt_ctx;
int i;
int delay_preprocess;
int delay_infer;
trt_ctx = (Yolov5TRTContext *)h;
trt_ctx->result_json_str[0] = 0;
if (img.empty()) return trt_ctx->result_json_str;
auto start0 = std::chrono::system_clock::now();
//printf("yolov5_trt_detect start preprocess img \n");
cv::Mat pr_img = preprocess_img(img);
//printf("yolov5_trt_detect start convert img to float\n");
// letterbox BGR to RGB
i = 0;
for (int row = 0; row < INPUT_H; ++row) {
uchar* uc_pixel = pr_img.data + row * pr_img.step;
for (int col = 0; col < INPUT_W; ++col) {
trt_ctx->data[i] = (float)uc_pixel[2] / 255.0;
trt_ctx->data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
trt_ctx->data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
auto end0 = std::chrono::system_clock::now();
delay_preprocess = std::chrono::duration_cast(end0 - start0).count();
// Run inference
//printf("yolov5_trt_detect start do inference\n");
auto start = std::chrono::system_clock::now();
doInference(*trt_ctx->exe_context, trt_ctx->cuda_stream, trt_ctx->buffers, trt_ctx->data, trt_ctx->prob, BATCH_SIZE);
auto end = std::chrono::system_clock::now();
delay_infer = std::chrono::duration_cast(end - start).count();
std::cout <<"delay_proress:" << delay_preprocess << "ms, " << "delay_infer:" << delay_infer << "ms" << std::endl;
//printf("yolov5_trt_detect start do process infer result \n");
int fcount = 1;
int str_len;
std::vector> batch_res(1);
auto& res = batch_res[0];
nms(res, &trt_ctx->prob[0], threshold, NMS_THRESH);
sprintf(trt_ctx->result_json_str,
"{\"delay_preprocess\": %d,"
"\"delay_infer\": %d,"
"\"num_det\":%d, \"objects\":[", delay_preprocess, delay_infer, (int) res.size());
str_len = strlen(trt_ctx->result_json_str);
i = 0;
for(i = 0 ; i < res.size(); i++){
int x1, y1, x2, y2;
int class_id;
cv::Rect r = get_rect(img, res[i].bbox);
x1 = r.x;
y1 = r.y;
x2 = r.x + r.width;
y2 = r.y + r.height;
class_id = (int)res[i].class_id;
if (0 == i){
sprintf(trt_ctx->result_json_str + str_len, "(%d,%d,%d,%d,%d)", class_id, x1, y1, x2, y2);
}else {
sprintf(trt_ctx->result_json_str + str_len, ",(%d,%d,%d,%d,%d)", class_id, x1, y1, x2, y2);
}
str_len = strlen(trt_ctx->result_json_str);
if (str_len >= 16300)
break;
}
sprintf(trt_ctx->result_json_str + str_len, "]}");
return trt_ctx->result_json_str;
}
void yolov5_trt_destroy(void *h)
{
Yolov5TRTContext *trt_ctx;
trt_ctx = (Yolov5TRTContext *)h;
// Release stream and buffers
cudaStreamDestroy(trt_ctx->cuda_stream);
CHECK(cudaFree(trt_ctx->buffers[trt_ctx->inputIndex]));
CHECK(cudaFree(trt_ctx->buffers[trt_ctx->outputIndex]));
// Destroy the engine
trt_ctx->exe_context->destroy();
trt_ctx->engine->destroy();
trt_ctx->runtime->destroy();
delete trt_ctx->data;
delete trt_ctx->prob;
delete trt_ctx;
}
修改 CMakeLists.txt
diff --git a/yolov5/CMakeLists.txt b/yolov5/CMakeLists.txt
index f40e006..be0f7b1 100644
--- a/yolov5/CMakeLists.txt
+++ b/yolov5/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_BUILD_TYPE Debug)
find_package(CUDA REQUIRED)
-set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
+set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_53;code=sm_53)
include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
@@ -23,8 +23,8 @@ link_directories(/usr/lib/x86_64-linux-gnu/)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
-cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
-target_link_libraries(myplugins nvinfer cudart)
+cuda_add_library(yolov5_trt SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/yolov5_lib.cpp )
+target_link_libraries(yolov5_trt nvinfer cudart)
find_package(OpenCV)
include_directories(OpenCV_INCLUDE_DIRS)
@@ -32,7 +32,7 @@ include_directories(OpenCV_INCLUDE_DIRS)
add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp)
target_link_libraries(yolov5 nvinfer)
target_link_libraries(yolov5 cudart)
-target_link_libraries(yolov5 myplugins)
+target_link_libraries(yolov5 yolov5_trt)
target_link_libraries(yolov5 ${OpenCV_LIBS})
编译得到 libyolov5_trt.so
python modules , 参考:
https://github.com/walletiger/tensorrt_retinaface_with_python/tree/main/python
python wrap
#include
#include
#include
#include
#include
#include
#include "../yolov5_lib.h"
#include "pyboostcvconverter/pyboostcvconverter.hpp"
#include
using namespace cv;
using namespace boost::python;
// Python: create(engine_path) -> int
// Loads the TensorRT engine file and returns its handle as an integer.
// The result is 0 when yolov5_trt_create() returned NULL.
// Raises TypeError (set by PyArg_ParseTuple) when the argument is not a string.
static PyObject * mpyCreate(PyObject *self, PyObject *args)
{
    const char *engine_path = NULL;

    // On a parse failure the exception is already set; the only valid return
    // is NULL. Returning a value here makes the interpreter raise SystemError.
    if (!PyArg_ParseTuple(args, "s", &engine_path))
        return NULL;

    void *trt_engine = yolov5_trt_create(engine_path);
    printf("create yolov5-trt , instance = %p\n", trt_engine);

    // The pointer travels through Python as an unsigned 64-bit integer.
    return Py_BuildValue("K", reinterpret_cast<unsigned long long>(trt_engine));
}
// Python: detect(handle, image, conf_thresh) -> str
// Runs detection on a numpy image with the engine behind `handle` and returns
// the result string produced by yolov5_trt_detect().
// Raises TypeError on bad arguments and ValueError when the handle is 0.
static PyObject *mpyDetect(PyObject *self, PyObject *args)
{
    PyObject *ndArray = NULL;
    float conf_thresh = 0.45;
    unsigned long long v = 0;

    // The exception is already set on a parse failure, so return NULL.
    if (!PyArg_ParseTuple(args, "KOf", &v, &ndArray, &conf_thresh))
        return NULL;

    // A 0 handle is what create() returns on failure; passing it on would
    // dereference a null pointer in the library.
    if (v == 0) {
        PyErr_SetString(PyExc_ValueError, "invalid engine handle (0)");
        return NULL;
    }

    Mat mat = pbcvt::fromNDArrayToMat(ndArray);
    void *trt_engine = reinterpret_cast<void *>(v);
    const char *ret = yolov5_trt_detect(trt_engine, mat, conf_thresh);
    return Py_BuildValue("s", ret);
}
// Python: destroy(handle) -> None
// Releases the engine behind `handle`. A handle of 0 is ignored.
// Raises TypeError (set by PyArg_ParseTuple) when the argument is not an int.
static PyObject * mPyDestroy(PyObject *self, PyObject *args)
{
    unsigned long long v = 0;

    // The exception is already set on a parse failure, so return NULL.
    if (!PyArg_ParseTuple(args, "K", &v))
        return NULL;

    // %llu is the matching conversion for unsigned long long.
    printf(" destroy engine , engine = %llu\n", v);
    if (v != 0)
        yolov5_trt_destroy(reinterpret_cast<void *>(v));

    // Py_BuildValue("O", NULL) returns NULL with an error set, which made
    // every call raise; return a proper None instead.
    Py_RETURN_NONE;
}
// Method table of the TRTYolov5 module: maps the Python-visible names
// create / detect / destroy to their C implementations. Every entry takes
// positional arguments only (METH_VARARGS).
static PyMethodDef TRTYolov5MeThods[] = {
{"create", mpyCreate, METH_VARARGS, "Create the engine."},
{"detect", mpyDetect, METH_VARARGS, "use the engine to detect image"},
{"destroy", mPyDestroy, METH_VARARGS, "destroy the engine"},
// Sentinel entry that terminates the table.
{NULL, NULL, 0, NULL}
};
// Module definition handed to PyModule_Create() in PyInit_TRTYolov5().
static struct PyModuleDef TRTYolov5Module = {
PyModuleDef_HEAD_INIT,
"TRTYolov5", /* name of module */
"", /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */
TRTYolov5MeThods
};
// Module entry point called by the interpreter on `import TRTYolov5`.
// Logs a short message and returns the module built from TRTYolov5Module.
PyMODINIT_FUNC PyInit_TRTYolov5(void) {
    printf("init module ... \n");
    PyObject *module = PyModule_Create(&TRTYolov5Module);
    return module;
}
终于, python 下可以快速运行 yolov5s 的 TensorRT 模型了
import cv2
import TRTYolov5 as t

# Load the serialized TensorRT engine; create() returns an integer handle.
engine = t.create('../yolov5s.engine')

# Run detection on one test image with a confidence threshold of 0.45.
frame = cv2.imread('/workspace/data/x3.jpg')
detections = t.detect(engine, frame, 0.45)
#t.destroy(engine)

# detect() returns its result as a string.
print(detections)
最后看下 jetson nano 下 实时运行效果: 可以 40ms 一帧的速度来执行检测应用。
yolov5 jetson nano tensorRT model for hand detect
python bindings github:
https://github.com/walletiger/yolov5_tensorrtx_python