最近在使用yolov5,网上找了些资料,发现libtorch用来部署yolov5相对更方便,效率更高,
github上也有调用libtorch的代码,但是后处理时间都花费过多,经过我的一番研究后,已经彻底解决了(测试结果为前处理(不包括图片从硬盘加载)8ms,推理15ms(2080ti),后处理7ms),也在github上的libtorch-yolov5项目的issue下提交了我的回复,
yolov5源码中export.py 导出的是cpu版本的模型,要改成gpu版本的。github上的参考链接:https://github.com/yasenh/libtorch-yolov5
下面是我原始的代码:
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>
#include <string>
#include <vector>

#include <io.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>

#include <torch/script.h>
#include <torch/torch.h>
using namespace std;
using namespace cv;
using namespace dnn;
struct files {// record for one file discovered by fileSearch()
String filepath;  // full path to the file (OpenCV String)
time_t time;      // last-access time (st_atime) reported by stat()
string filename;  // bare file name, without the directory part
};
// Scan `path` (non-recursively, Windows _findfirst/_findnext API) and return
// one `files` record per image file whose name contains "png" or "jpg".
// Returns an empty vector (after printing an error) when the directory
// cannot be opened.
vector<files> fileSearch(string path)
{
    vector<files> allfile;
    struct _finddata_t fileInfo;
    string pattern = path + "\\*";
    // _findfirst returns intptr_t; -1 means the pattern matched nothing.
    intptr_t hFile = _findfirst(pattern.c_str(), &fileInfo);
    if (hFile == -1) {
        cout << "错误" << endl;
        return allfile; // BUG FIX: original kept iterating on an invalid handle
    }
    do {
        // Skip the "." and ".." pseudo-entries.
        if (strcmp(fileInfo.name, ".") == 0 || strcmp(fileInfo.name, "..") == 0)
            continue;
        cout << path + "\\" + fileInfo.name << ":time:" << fileInfo.time_create << endl;
        // Keep only images; substring match preserves the original behavior
        // (a name merely containing "png"/"jpg" also passes).
        if (strstr(fileInfo.name, "png") == NULL && strstr(fileInfo.name, "jpg") == NULL)
            continue; // in a do-while, continue still evaluates the condition -> _findnext runs
        string pathName = path + "\\" + fileInfo.name;
        struct stat buf;
        if (stat(pathName.c_str(), &buf) != 0) {
            perror("显示文件状态信息出错");
        }
        else {
            files filed;
            filed.filepath = pathName;
            filed.time = buf.st_atime; // last-access time
            filed.filename = fileInfo.name;
            allfile.push_back(filed);
        }
    } while (_findnext(hFile, &fileInfo) == 0);
    _findclose(hFile);
    return allfile;
}
int main()
{
// Loading Module
torch::jit::script::Module module = torch::jit::load(R"(E:\deeplearning\yolov5\runs\temp\best3.torchscript.pt)");//torchscript
torch::DeviceType device_type;
device_type = torch::kCUDA;
torch::Device device0(device_type);
module.to(device0);
module.to(torch::kHalf);
module.eval();
Mat frame, img;
vector allf = fileSearch(R"(D:\图片)");//文件夹路径
for (int n = 0; n < allf.size(); n++) {
clock_t start = clock();
frame = imread(allf[n].filepath);
cout << clock() - start << "ms-read" << endl;
if (frame.empty())
{
std::cout << "Read frame failed!" << std::endl;
break;
}
// Preparing input tensor
resize(frame, img, Size(640, 640));
cvtColor(img, img, COLOR_BGR2RGB);
img.convertTo(img, CV_32FC3, 1.0f / 255.0f); // normalization 1/255
auto tensor_img = torch::from_blob(img.data, { 1, img.rows, img.cols, img.channels() }).to(device0);
tensor_img = tensor_img.permute({ 0, 3, 1, 2 }).contiguous(); // BHWC -> BCHW (Batch, Channel, Height, Width)
tensor_img = tensor_img.to(torch::kHalf);
std::vector inputs;
inputs.emplace_back(tensor_img);
torch::jit::IValue output = module.forward(inputs);
auto detections = output.toTuple()->elements()[0].toTensor();
auto conf_mask = detections.select(2, 4).ge(0.5).unsqueeze(2);//过滤了score为0.5以下的
detections = torch::masked_select(detections[0], conf_mask[0]).view({ -1, 85 });//类别+5,这里类别有80个
detections = detections.to(torch::kFloat);
detections = detections.to(torch::kCPU);//因为结果是在gpu上的,先拷到cpu,减少通信时间
clock_t starrrr = clock();
vector boxes;
vector confidences;
float* ptr = (float*)detections.data_ptr();//主要是通过这里的指针取值,缩短了后处理时间
for (int i = 0; i < (int)detections.size(0); ++i, ptr += 85)//类别+5,这里80个类别
{
float confidence = ptr[4];
int centerX = (int)((ptr[0] / 640) * frame.cols);
int centerY = (int)((ptr[1] / 640) * frame.rows);
int width = (int)((ptr[2] / 640) * frame.cols);
int height = (int)((ptr[3] / 640) * frame.rows);
int left = (centerX - width / 2);
int top = (centerY - height / 2);
confidences.push_back(confidence);
boxes.push_back(Rect(max(left, 0), max(top, 0), min(width, frame.cols - left), min(height, frame.rows - top)));
}
vector indices;
NMSBoxes(boxes, confidences, 0.5, 0.2, indices);
for (size_t i = 0; i < indices.size(); ++i)
{
int idx = indices[i];
Rect box = boxes[idx];
rectangle(frame, box, Scalar(0, 0, 250), 2, 8, 0);
}
return 0;
}