Starting with OpenCV 3.4.2, you can run a YOLOv3 model directly for image recognition. At least in my tests it works without problems on Windows + OpenCV 4.0.1 and on Ubuntu 18.04 + OpenCV 3.4.3.
Training on a GPU and then also running inference on a GPU is commonplace. In practice, however, there are many scenarios where a model trained on a GPU must run inference on a different CPU platform, such as ARM or an Intel CPU. Taking Intel CPUs as the example, here are two approaches:
1) Download the darknet C source, build it into a library, and call that library's interface for image recognition.
2) Install OpenCV 3.4.3 or later and call its DNN API for image recognition.
The advantages of approach 2 are obvious:
a) Official darknet only ships a Makefile for building on Linux; getting it to build on Windows is up to you.
b) More importantly, OpenCV has extensive instruction-set optimizations for Intel CPUs, so for inference on the same YOLOv3 model, approach 1 takes about 6 s while the OpenCV approach takes only about 600 ms.
One more caveat: posts online claim that inference through the OpenCV API takes only about 220 ms, yet my experiments took about 600 ms. There are roughly two reasons:
(1) The network input size in those posts is 416x416 while mine is 608x608, roughly 1.5 x 1.5 = 2.25 times as much computation.
(2) Different CPUs: theirs was a 6-core i7, mine only a 4-core i5, with considerably less cache as well.
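All the C++ fragments below assume the usual OpenCV DNN headers and namespaces, roughly as follows (adjust to your own install layout):

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <fstream>

using namespace cv;
using namespace cv::dnn;
using namespace std;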
1. Prepare the inputs: the network config file, the weights file, the image to recognize, and the class-name list
// Remember to set the paths correctly
String model_config = "xxx/yolov3_2019_01.cfg";
String model_weights = "xxx/yolov3-2019_01_final.weights";
String image_file = "xxx/img20190108_000544.jpg";
vector<string> class_names = get_class_names("xxx/class_2019_01.names");
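The post does not show get_class_names; a minimal sketch of it, assuming the .names file holds one class name per line, could be:

// Sketch of get_class_names: read one class name per line from the file.
vector<string> get_class_names(const string& path)
{
    vector<string> names;
    ifstream ifs(path.c_str());
    string line;
    while (getline(ifs, line))
        names.push_back(line);
    return names;
}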
2. Load the network model with the OpenCV API
// Load the network
Net net = readNetFromDarknet(model_config, model_weights);
if (net.empty())
{
printf("loading network fails \n");
return 1;
}
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(DNN_TARGET_CPU);
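Since YOLOv3 support requires OpenCV 3.4.2 or later, it can be worth confirming which version your program is actually linked against:

// CV_VERSION is defined by the OpenCV headers.
printf("OpenCV version: %s\n", CV_VERSION);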
3. Read the image and convert it into a 4D blob tensor, resized to the network input size
// Create a 4D blob from a frame.
Mat frame, blob;
frame = imread(image_file);
if (frame.empty())
{
printf("reading image fails \n");
return 1;
}
int network_width = 608;
int network_height = 608;
// Scale pixels to [0,1], resize to the network input size, and pass
// swapRB=true because imread loads BGR while darknet expects RGB.
blobFromImage(frame, blob, 1 / 255.0, Size(network_width, network_height), Scalar(0, 0, 0), true, false);
4. Feed the blob into the model and run the forward pass (the most time-consuming part)
//Sets the input to the network
net.setInput(blob);
// Runs the forward pass to get output of the output layers
vector<Mat> outs;
net.forward(outs, getOutputsNames(net));
The outs that come back are 3 Mats. Why 3? Because darknet has 3 YOLO detection output layers. With a 608x608 input, the three Mats have 19x19x3, 38x38x3, and 76x76x3 rows respectively (one row per candidate box) and 4+1+5 columns each: 4 box coordinates, 1 objectness confidence, and one score per class (my model has 5 classes).
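Since the forward pass dominates the runtime (the ~600 ms mentioned above), it can be measured directly with OpenCV's built-in profiler; a minimal sketch:

// getPerfProfile returns the total time of the last forward() in ticks.
vector<double> layersTimes;
double freq = getTickFrequency() / 1000.0; // ticks per millisecond
double t = net.getPerfProfile(layersTimes) / freq;
printf("inference time: %.1f ms\n", t);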
5. From outs, keep the boxes whose confidence exceeds the threshold (0.5 by default), then filter them further with NMS
// Remove the bounding boxes with low confidence
postprocess(class_names, frame, outs);
// The helper functions it calls are listed below
// Initialize the parameters
float confThreshold = 0.5; // Confidence threshold
float nmsThreshold = 0.4; // Non-maximum suppression threshold
// Draw the predicted bounding box
void drawPred(const vector<string>& classes, int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
{
//Draw a rectangle displaying the bounding box
rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 0, 255));
//Get the label for the class name and its confidence
string label = format("%.2f", conf); // confidence; the class name is prepended below
// If class labels were loaded, prepend this box's class name
if (!classes.empty())
{
CV_Assert(classId < (int)classes.size());
label = classes[classId] + ":" + label;
}
//Display the label at the top of the bounding box
int baseLine;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
top = max(top, labelSize.height);
putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(255, 255, 255));
}
// Remove the bounding boxes with low confidence using non-maxima suppression
void postprocess(const vector<string>& classes, Mat& frame, const vector<Mat>& outs)
{
vector<int> classIds;
vector<float> confidences;
vector<Rect> boxes;
for (size_t i = 0; i < outs.size(); ++i)
{
// Scan through all the bounding boxes output from the network and keep only the
// ones with high confidence scores. Assign the box's class label as the class
// with the highest score for the box.
float* data = (float*)outs[i].data;
for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
{
// Each row of outs[i] is one candidate box: center x, center y, width,
// height, objectness, then one score per class (2+2+1+5 = 10 columns
// for my 5-class model).
Mat scores = outs[i].row(j).colRange(5, outs[i].cols); // per-class scores for this box
Point classIdPoint;
double confidence;
// Get the value and location of the maximum score
minMaxLoc(scores, 0, &confidence, 0, &classIdPoint); // find the highest-scoring class
if (confidence > confThreshold) // keep only confident boxes
{
int centerX = (int)(data[0] * frame.cols);
int centerY = (int)(data[1] * frame.rows);
int width = (int)(data[2] * frame.cols);
int height = (int)(data[3] * frame.rows);
int left = centerX - width / 2;
int top = centerY - height / 2;
classIds.push_back(classIdPoint.x);
confidences.push_back((float)confidence);
boxes.push_back(Rect(left, top, width, height));
}
}
}
// Perform non maximum suppression to eliminate redundant overlapping boxes with
// lower confidences
vector<int> indices;
NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices); // boxes, scores, score threshold, NMS threshold, output indices
for (size_t i = 0; i < indices.size(); ++i)
{
int idx = indices[i]; // index of a kept box
Rect box = boxes[idx]; // its rectangle in the frame
drawPred(classes, classIds[idx], confidences[idx], box.x, box.y,
box.x + box.width, box.y + box.height, frame);
}
}
// Get the names of the output layers
vector<String> getOutputsNames(const Net& net)
{
static vector<String> names;
if (names.empty())
{
//Get the indices of the output layers, i.e. the layers with unconnected outputs
vector<int> outLayers = net.getUnconnectedOutLayers();
//get the names of all the layers in the network
vector<String> layersNames = net.getLayerNames();
// Get the names of the output layers in names
names.resize(outLayers.size());
for (size_t i = 0; i < outLayers.size(); ++i)
names[i] = layersNames[outLayers[i] - 1]; // layer ids are 1-based, hence the -1
}
return names;
}
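For reference, a minimal main() tying the steps together might look like the sketch below; the xxx paths are placeholders, and get_class_names is the helper sketched in step 1:

int main()
{
    String model_config = "xxx/yolov3_2019_01.cfg";
    String model_weights = "xxx/yolov3-2019_01_final.weights";
    String image_file = "xxx/img20190108_000544.jpg";
    vector<string> class_names = get_class_names("xxx/class_2019_01.names");

    // Steps 1-2: load the network and pin it to the CPU backend.
    Net net = readNetFromDarknet(model_config, model_weights);
    if (net.empty())
        return 1;
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);

    // Step 3: read the image and build the 608x608 input blob.
    Mat frame = imread(image_file);
    if (frame.empty())
        return 1;
    Mat blob;
    blobFromImage(frame, blob, 1 / 255.0, Size(608, 608), Scalar(0, 0, 0), true, false);

    // Step 4: run the forward pass over the three YOLO output layers.
    net.setInput(blob);
    vector<Mat> outs;
    net.forward(outs, getOutputsNames(net));

    // Step 5: threshold + NMS, then save the annotated image.
    postprocess(class_names, frame, outs);
    imwrite("result.jpg", frame);
    return 0;
}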