学习OpenCV4很好的一个英文博客和代码示例:https://github.com/spmallick/learnopencv
本文使用的模型文件、数据等均可以在上面下载得到。
1、代码配置
使用VS2015和OpenCV4.0.0实现,仅使用CPU(也可以使用GPU,根据电脑配置决定)。如果条件允许,最好在Linux系统下编译,并使用GPU加速。
2、代码实现
该代码是在上面网址所给代码的基础上做的简单整理。
mask_rcnn.h:
#pragma once
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <opencv2/dnn.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/videoio.hpp>
using namespace cv;
using namespace std;
class mask_rcnn
{
public:
mask_rcnn(float confThreshold = 0.5, float maskThreshold = 0.3);
/** @brief Draw the predicted bounding box, colorize and show the mask on the image
* @param[in] image or image frame in video stream
* @param[in] the index value corresponding to the category
* @param[in] bounding box
* @param[in] mask
* @returns NULL.
*/
void drawBox(cv::Mat& frame, int classId, float conf, cv::Rect box, cv::Mat& objectMask);
/** @brief Postprocess the neural network's output for each frame.For each frame, extract the bounding box and mask for each detected object
* @param[in] image or image frame in video stream
* @param[out] Output size of masks is NxCxHxW where
* N - number of detected boxes
* C - number of classes (excluding background)
* HxW - segmentation shape
* @returns NULL.
*/
void postprocess(cv::Mat& frame, const std::vector& outs);
/** @brief detect object and mask in the image
* @param[in] Load names of classes
* @param[in] Load the colors
* @param[in] Give the configuration
* @param[in] Give weight files for the model
* @returns NULL.
*/
void detect_image(std::string classesFile, std::string colorsFile, String textGraph, String modelWeights, std::string imagepath, std::string& outputFile);
/** @brief detect object and mask in the video
* @param[in] Load names of classes
* @param[in] Load the colors
* @param[in] Give the configuration
* @param[in] Give weight files for the model
* @returns NULL.
*/
void detect_video(std::string classesFile, std::string colorsFile, String textGraph, String modelWeights, std::string videopath, std::string& outputFile);
private:
// Initialize the parameters
float mfConfThreshold; // Confidence threshold
float mfMaskThreshold; // Mask threshold
std::vector mvClasses; // Recognizable object categories
std::vector mvColors; // Recognition box color set
};
mask_rcnn.cpp:
#include "mask_rcnn.h"
// Remember the confidence and mask thresholds used by the detection methods.
mask_rcnn::mask_rcnn(float confThreshold, float maskThreshold)
    : mfConfThreshold(confThreshold),
      mfMaskThreshold(maskThreshold)
{
}
void mask_rcnn::detect_image(std::string classesFile, std::string colorsFile, String textGraph, String modelWeights, std::string imagepath, std::string& outputFile)
{
// Load names of classes
std::string line;
ifstream ifs(classesFile.c_str());
while (getline(ifs, line)) mvClasses.push_back(line);
// Load the colors
ifstream colorFptr(colorsFile.c_str());
while (getline(colorFptr, line))
{
char* pEnd;
double r, g, b;
r = strtod(line.c_str(), &pEnd);
g = strtod(pEnd, NULL);
b = strtod(pEnd, NULL);
Scalar color = Scalar(r, g, b, 255.0);
mvColors.push_back(Scalar(r, g, b, 255.0));
}
// Create a window
static const string kWinName = "Deep learning object detection in OpenCV";
namedWindow(kWinName, WINDOW_NORMAL);
// Load the network
dnn::Net net = dnn::readNetFromTensorflow(modelWeights, textGraph);
net.setPreferableBackend(dnn::DNN_BACKEND_OPENCV);
net.setPreferableTarget(dnn::DNN_TARGET_CPU);
// Create a 4D blob from a frame.
cv::Mat blob;
cv::Mat frame = cv::imread(imagepath);
// Create a 4D blob from a frame.
dnn::blobFromImage(frame, blob, 1.0, Size(frame.cols, frame.rows), Scalar(), true, false);
//Sets the input to the network
net.setInput(blob);
// Runs the forward pass to get output from the output layers
std::vector outNames(2);
outNames[0] = "detection_out_final";
outNames[1] = "detection_masks";
vector outs;
net.forward(outs, outNames);
// Extract the bounding box and mask for each of the detected objects
postprocess(frame, outs);
// Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
vector layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
string label = format("Mask-RCNN on 2.2 GHz Intel Core E5 CPU, Inference time for a frame : %0.0f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
// Write the frame with the detection boxes
imshow(kWinName, frame);
cv::imwrite(outputFile, frame);
}
void mask_rcnn::detect_video(std::string classesFile, std::string colorsFile, String textGraph, String modelWeights, std::string videopath, std::string& outputFile)
{
// Load names of classes
std::string line;
ifstream ifs(classesFile.c_str());
while (getline(ifs, line)) mvClasses.push_back(line);
// Load the colors
ifstream colorFptr(colorsFile.c_str());
while (getline(colorFptr, line))
{
char* pEnd;
double r, g, b;
r = strtod(line.c_str(), &pEnd);
g = strtod(pEnd, NULL);
b = strtod(pEnd, NULL);
Scalar color = Scalar(r, g, b, 255.0);
mvColors.push_back(Scalar(r, g, b, 255.0));
}
// Load the network
dnn::Net net = dnn::readNetFromTensorflow(modelWeights, textGraph);
net.setPreferableBackend(dnn::DNN_BACKEND_OPENCV);
net.setPreferableTarget(dnn::DNN_TARGET_CPU);
// Open a video file or an image file or a camera stream.
VideoCapture cap;
VideoWriter video;
Mat frame, blob;
try {
// Open the video file
ifstream ifile(videopath);
if (!ifile) throw("error");
cap.open(videopath);
}
catch (...) {
cout << "Could not open the input image/video stream" << endl;
return;
}
// Get the video writer initialized to save the output video
video.open(outputFile, VideoWriter::fourcc('M', 'J', 'P', 'G'), 28, Size(cap.get(CAP_PROP_FRAME_WIDTH), cap.get(CAP_PROP_FRAME_HEIGHT)));
// Create a window
static const string kWinName = "Deep learning object detection in OpenCV";
namedWindow(kWinName, WINDOW_NORMAL);
// Process frames.
while (waitKey(1) < 0)
{
// get frame from the video
cap >> frame;
// Stop the program if reached end of video
if (frame.empty()) {
cout << "Done processing !!!" << endl;
cout << "Output file is stored as " << outputFile << endl;
waitKey(3000);
break;
}
// Create a 4D blob from a frame.
dnn::blobFromImage(frame, blob, 1.0, Size(frame.cols, frame.rows), Scalar(), true, false);
//Sets the input to the network
net.setInput(blob);
// Runs the forward pass to get output from the output layers
std::vector outNames(2);
outNames[0] = "detection_out_final";
outNames[1] = "detection_masks";
vector outs;
net.forward(outs, outNames);
// Extract the bounding box and mask for each of the detected objects
postprocess(frame, outs);
// Put efficiency information. The function getPerfProfile returns the overall time for inference(t) and the timings for each of the layers(in layersTimes)
vector layersTimes;
double freq = getTickFrequency() / 1000;
double t = net.getPerfProfile(layersTimes) / freq;
string label = format("Mask-RCNN on 2.2 GHz Intel Core E5 CPU, Inference time for a frame : %0.0f ms", t);
putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 0, 0));
// Write the frame with the detection boxes
Mat detectedFrame;
frame.convertTo(detectedFrame, CV_8U);
video.write(detectedFrame);
imshow(kWinName, frame);
}
cap.release();
video.release();
}
/** @brief Draw every detection whose score is above the confidence threshold onto the frame.
 * @param[in,out] frame image the boxes, labels and masks are drawn on
 * @param[in] outs network outputs: outs[0] are the detections, outs[1] the masks (NxCxHxW)
 *
 * The detections are viewed as rows of 7 floats; this function reads column 1 as
 * the class id, column 2 as the score and columns 3..6 as left, top, right, bottom
 * relative to the frame width / height.
 */
void mask_rcnn::postprocess(cv::Mat& frame, const std::vector<cv::Mat>& outs)
{
    // Nothing to do without both outputs or without an image to draw on
    if (outs.size() < 2 || frame.empty()) return;
    Mat outDetections = outs[0];
    Mat outMasks = outs[1];
    // The indexing below needs size[2] of the detections and size[1..3] of the masks
    if (outDetections.dims < 3 || outMasks.dims < 4) return;
    //size[i]:The number of elements in each dimension
    const int numDetections = outDetections.size[2];
    const int numClasses = outMasks.size[1];
    /* Mat reshape(int cn, int rows = 0) const;
    * @param cn New number of channels. If the parameter is 0, the number of channels remains the same.
    * @param rows New number of rows.If the parameter is 0, the number of rows remains the same.
    */
    outDetections = outDetections.reshape(1, static_cast<int>(outDetections.total() / 7));
    for (int i = 0; i < numDetections; ++i)
    {
        const float score = outDetections.at<float>(i, 2);
        if (score <= mfConfThreshold) continue;
        // A class id outside the mask tensor would read memory that does not belong to it
        const int classId = static_cast<int>(outDetections.at<float>(i, 1));
        if (classId < 0 || classId >= numClasses) continue;
        // Extract the bounding box and clamp it to the frame
        int left = static_cast<int>(frame.cols * outDetections.at<float>(i, 3));
        int top = static_cast<int>(frame.rows * outDetections.at<float>(i, 4));
        int right = static_cast<int>(frame.cols * outDetections.at<float>(i, 5));
        int bottom = static_cast<int>(frame.rows * outDetections.at<float>(i, 6));
        left = std::max(0, std::min(left, frame.cols - 1));
        top = std::max(0, std::min(top, frame.rows - 1));
        right = std::max(0, std::min(right, frame.cols - 1));
        bottom = std::max(0, std::min(bottom, frame.rows - 1));
        // An inverted box would give a non-positive size, which cannot be used as a ROI
        if (right < left || bottom < top) continue;
        Rect box = Rect(left, top, right - left + 1, bottom - top + 1);
        // Extract the mask for the object; the Mat header refers to the data of outMasks
        Mat objectMask(outMasks.size[2], outMasks.size[3], CV_32F, outMasks.ptr<float>(i, classId));
        // Draw bounding box, colorize and show the mask on the image
        drawBox(frame, classId, score, box, objectMask);
    }
}
/** @brief Draw one detection: bounding box, "class:score" label and the blended instance mask.
 * @param[in,out] frame image that is drawn on in place
 * @param[in] classId index into the class-name and color tables
 * @param[in] conf confidence score printed in the label
 * @param[in] box bounding box in frame pixel coordinates
 * @param[in,out] objectMask floating-point mask; it is resized in place to the box size
 */
void mask_rcnn::drawBox(cv::Mat& frame, int classId, float conf, cv::Rect box, cv::Mat& objectMask)
{
    //Draw a rectangle displaying the bounding box
    rectangle(frame, Point(box.x, box.y), Point(box.x + box.width, box.y + box.height), Scalar(255, 178, 50), 3);
    //Get the label for the class name and its confidence
    string label = format("%.2f", conf);
    if (!mvClasses.empty())
    {
        CV_Assert(classId >= 0 && classId < (int)mvClasses.size());
        label = mvClasses[classId] + ":" + label;
    }
    //Display the label at the top of the bounding box.
    //The label position is kept in its own variable: moving box.y itself would
    //also move the region the mask is blended into further below.
    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
    const int labelY = max(box.y, labelSize.height);
    rectangle(frame,
              Point(box.x, labelY - static_cast<int>(round(1.5*labelSize.height))),
              Point(box.x + static_cast<int>(round(1.5*labelSize.width)), labelY + baseLine),
              Scalar(255, 255, 255), FILLED);
    putText(frame, label, Point(box.x, labelY), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 0, 0), 1);
    // Pick the class color; without a color table fall back to the box color
    // instead of taking a modulo by zero.
    Scalar color(255, 178, 50);
    if (!mvColors.empty() && classId >= 0)
    {
        color = mvColors[classId % mvColors.size()];
    }
    // Only the part of the box that lies inside the frame can be used as a ROI
    const Rect roi = box & Rect(0, 0, frame.cols, frame.rows);
    if (box.width <= 0 || box.height <= 0 || roi.width <= 0 || roi.height <= 0) return;
    // Resize the mask, threshold, color and apply it on the image
    resize(objectMask, objectMask, Size(box.width, box.height));
    Mat mask = (objectMask > mfMaskThreshold);
    // Keep the mask aligned with the box when the box had to be clipped
    mask = mask(Rect(roi.x - box.x, roi.y - box.y, roi.width, roi.height)).clone();
    Mat coloredRoi = (0.3 * color + 0.7 * frame(roi));
    coloredRoi.convertTo(coloredRoi, CV_8UC3);
    // Draw the contours on the image
    vector<Mat> contours;
    Mat hierarchy;
    mask.convertTo(mask, CV_8U);
    findContours(mask, contours, hierarchy, RETR_CCOMP, CHAIN_APPROX_SIMPLE);
    drawContours(coloredRoi, contours, -1, color, 5, LINE_8, hierarchy, 100);
    coloredRoi.copyTo(frame(roi), mask);
}
main.cpp:
#include
#include "mask_rcnn.h"
int main(int argc, char** argv)
{
// Give the classes and colors files for the model
string classesFile = "../model//mscoco_labels.names";
string colorsFile = "../model//colors.txt";
// Give the configuration and weight files for the model
String modelWeights = "../model//frozen_inference_graph.pb";
String textGraph = "../model//mask_rcnn_inception_v2_coco_2018_01_28.pbtxt";
// Enter an image or video
string imagepath = "../data//cars.jpg";
string videopath = "../data//cars.mp4";
// Output path settings
std::string image_outputFile = "../result//mask_rcnn.jpg";
std::string video_outputFile = "../result//mask_rcnn_out.avi";
// Confidence threshold and Mask threshold
mask_rcnn Mask_RCNN(0.5, 0.3);
Mask_RCNN.detect_image(classesFile, colorsFile, textGraph, modelWeights, imagepath, image_outputFile);
//Mask_RCNN.detect_video(classesFile, colorsFile, textGraph, modelWeights, videopath, video_outputFile);
cv::waitKey(0);
return 0;
}
3、实验结果图
效果还可以,就是仅使用CPU速度太慢了。此外,即使使用GPU,速度也不是很快。