OpenCV object_detection.cpp




#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

#ifdef CV_CXX11
#include <atomic>
#include <chrono>
#include <mutex>
#include <queue>
#include <thread>
#endif

#include "common.hpp"

// Command-line option table in cv::CommandLineParser format:
// "{ name [short] | default | description }". Model-specific options are
// appended to this string at run time before the final parse.
std::string keys =
    "{ help  h     | | Print help message. }"
    "{ @alias      | | An alias name of model to extract preprocessing parameters from models.yml file. }"
    "{ zoo         | models.yml | An optional path to file with preprocessing parameters }"
    "{ device      |  0 | camera device number. }"
    "{ input i     | | Path to input image or video file. Skip this argument to capture frames from a camera. }"
    "{ framework f | | Optional name of an origin framework of the model. Detect it automatically if it does not set. }"
    "{ classes     | | Optional path to a text file with names of classes to label detected objects. }"
    "{ thr         | .5 | Confidence threshold. }"
    "{ nms         | .4 | Non-maximum suppression threshold. }"
    "{ backend     |  0 | Choose one of computation backends: "
                         "0: automatically (by default), "
                         "1: Halide language (http://halide-lang.org/), "
                         "2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                         "3: OpenCV implementation }"
    "{ target      | 0 | Choose one of target computation devices: "
                         "0: CPU target (by default), "
                         "1: OpenCL, "
                         "2: OpenCL fp16 (half-float precision), "
                         "3: VPU }"
    "{ async       | 0 | Number of asynchronous forwards at the same time. "
                        "Choose 0 for synchronous mode }";

using namespace cv;
using namespace dnn;

// Detection thresholds shared by main(), postprocess() and the trackbar callback.
float confThreshold, nmsThreshold;
// Class names, one per class id; left empty when no classes file is supplied.
std::vector<std::string> classes;

// Builds the input blob from `frame` and binds it (plus "im_info" when the
// network exposes that input) to `net`.
inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB);

// Decodes raw network outputs, applies NMS and draws the surviving boxes on `frame`.
void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net);

// Draws a single bounding box with its label on `frame`.
void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

// Trackbar callback: updates the confidence threshold from the slider position.
void callback(int pos, void* userdata);

#ifdef CV_CXX11
// A std::queue whose push/get/clear operations are serialized by a mutex and
// which counts pushed entries so that a throughput (entries per second) can
// be reported.
//
// NOTE(review): empty()/size() are inherited from std::queue and are NOT
// guarded by the mutex; callers in this file poll empty() from other threads.
template <typename T>
class QueueFPS : public std::queue<T>
{
public:
    QueueFPS() : counter(0) {}

    // Appends `entry` and bumps the counter. The timer is (re)started on the
    // first entry so that warm-up time is excluded from the FPS estimate.
    void push(const T& entry)
    {
        std::lock_guard<std::mutex> lock(mutex);

        std::queue<T>::push(entry);
        counter += 1;
        if (counter == 1)
        {
            // Start counting from a second frame (warmup).
            tm.reset();
            tm.start();
        }
    }

    // Removes and returns the oldest entry. The queue must not be empty.
    T get()
    {
        std::lock_guard<std::mutex> lock(mutex);
        T entry = this->front();
        this->pop();
        return entry;
    }

    // Returns the number of pushed entries divided by the elapsed time in seconds.
    float getFPS()
    {
        // TickMeter accumulates elapsed time on stop(); restart it right after reading.
        tm.stop();
        double fps = counter / tm.getTimeSec();
        tm.start();
        return static_cast<float>(fps);
    }

    // Drops every queued entry; the counter is left untouched.
    void clear()
    {
        std::lock_guard<std::mutex> lock(mutex);
        while (!this->empty())
            this->pop();
    }

    // Total number of entries pushed so far (read directly by main()).
    unsigned int counter;

private:
    TickMeter tm;
    std::mutex mutex;
};
#endif  // CV_CXX11

int main(int argc, char** argv)
    CommandLineParser parser(argc, argv, keys);

    const std::string modelName = parser.get("@alias");
    const std::string zooFile = parser.get("zoo");

    keys += genPreprocArguments(modelName, zooFile);

    parser = CommandLineParser(argc, argv, keys);
    parser.about("Use this script to run object detection deep learning networks using OpenCV.");
    if (argc == 1 || parser.has("help"))
        return 0;

    confThreshold = parser.get("thr");
    nmsThreshold = parser.get("nms");
    float scale = parser.get("scale");
    Scalar mean = parser.get("mean");
    bool swapRB = parser.get("rgb");
    int inpWidth = parser.get("width");
    int inpHeight = parser.get("height");
    size_t asyncNumReq = parser.get("async");
    std::string modelPath = findFile(parser.get("model"));
    std::string configPath = findFile(parser.get("config"));

    // Open file with classes names.
    if (parser.has("classes"))
        std::string file = parser.get("classes");
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))

    // Load a model.
    Net net = readNet(modelPath, configPath, parser.get("framework"));
    std::vector outNames = net.getUnconnectedOutLayersNames();

    // Create a window
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    int initialConf = (int)(confThreshold * 100);
    createTrackbar("Confidence threshold, %", kWinName, &initialConf, 99, callback);

    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
    if (parser.has("input"))"input"));

#ifdef CV_CXX11
    bool process = true;

    // Frames capturing thread
    QueueFPS framesQueue;
    std::thread framesThread([&](){
        Mat frame;
        while (process)
            cap >> frame;
            if (!frame.empty())

    // Frames processing thread
    QueueFPS processedFramesQueue;
    QueueFPS > predictionsQueue;
    std::thread processingThread([&](){
        std::queue futureOutputs;
        Mat blob;
        while (process)
            // Get a next frame
            Mat frame;
                if (!framesQueue.empty())
                    frame = framesQueue.get();
                    if (asyncNumReq)
                        if (futureOutputs.size() == asyncNumReq)
                            frame = Mat();
                        framesQueue.clear();  // Skip the rest of frames

            // Process the frame
            if (!frame.empty())
                preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);

                if (asyncNumReq)
                    std::vector outs;
                    net.forward(outs, outNames);

            while (!futureOutputs.empty() &&
                AsyncArray async_out = futureOutputs.front();
                Mat out;

    // Postprocessing and rendering loop
    while (waitKey(1) < 0)
        if (predictionsQueue.empty())

        std::vector outs = predictionsQueue.get();
        Mat frame = processedFramesQueue.get();

        postprocess(frame, outs, net);

        if (predictionsQueue.counter > 1)
            std::string label = format("Camera: %.2f FPS", framesQueue.getFPS());
            putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Network: %.2f FPS", predictionsQueue.getFPS());
            putText(frame, label, Point(0, 30), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

            label = format("Skipped frames: %d", framesQueue.counter - predictionsQueue.counter);
            putText(frame, label, Point(0, 45), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));
        imshow(kWinName, frame);

    process = false;

#else  // CV_CXX11
    if (asyncNumReq)
        CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with Inference Engine backend.");

    // Process frames.
    Mat frame, blob;
    while (waitKey(1) < 0)
        cap >> frame;
        if (frame.empty())

        preprocess(frame, net, Size(inpWidth, inpHeight), scale, mean, swapRB);

        std::vector outs;
        net.forward(outs, outNames);

        postprocess(frame, outs, net);

        // Put efficiency information.
        std::vector layersTimes;
        double freq = getTickFrequency() / 1000;
        double t = net.getPerfProfile(layersTimes) / freq;
        std::string label = format("Inference time: %.2f ms", t);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
#endif  // CV_CXX11
    return 0;

inline void preprocess(const Mat& frame, Net& net, Size inpSize, float scale,
                       const Scalar& mean, bool swapRB)
    static Mat blob;
    // Create a 4D blob from a frame.
    if (inpSize.width <= 0) inpSize.width = frame.cols;
    if (inpSize.height <= 0) inpSize.height = frame.rows;
    blobFromImage(frame, blob, 1.0, inpSize, Scalar(), swapRB, false, CV_8U);

    // Run a model.
    net.setInput(blob, "", scale, mean);
    if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
        resize(frame, frame, inpSize);
        Mat imInfo = (Mat_(1, 3) << inpSize.height, inpSize.width, 1.6f);
        net.setInput(imInfo, "im_info");

// Decodes the raw network outputs into boxes, runs non-maximum suppression
// and draws every surviving detection on `frame`.
//
// frame  image to draw on; its size is used to scale relative coordinates.
// outs   output blobs of the network's unconnected output layers.
// net    network that produced `outs`; queried once for the output layer type.
//
// Raises Error::StsNotImplemented when the output layer type is neither
// "DetectionOutput" nor "Region".
void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
{
    // Looked up once: the statics are initialized from the first `net` passed in.
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<Rect> boxes;
    if (outLayerType == "DetectionOutput")
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and an every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        CV_Assert(outs.size() > 0);
        for (size_t k = 0; k < outs.size(); k++)
        {
            const float* data = outs[k].ptr<float>();
            const size_t total = outs[k].total();
            // Only complete 7-value records are read.
            for (size_t i = 0; i + 7 <= total; i += 7)
            {
                float confidence = data[i + 2];
                if (confidence > confThreshold)
                {
                    int left   = (int)data[i + 3];
                    int top    = (int)data[i + 4];
                    int right  = (int)data[i + 5];
                    int bottom = (int)data[i + 6];
                    int width  = right - left + 1;
                    int height = bottom - top + 1;
                    // A degenerate box after truncation means the coordinates
                    // are relative to the frame size rather than absolute pixels.
                    if (width <= 2 || height <= 2)
                    {
                        left   = (int)(data[i + 3] * frame.cols);
                        top    = (int)(data[i + 4] * frame.rows);
                        right  = (int)(data[i + 5] * frame.cols);
                        bottom = (int)(data[i + 6] * frame.rows);
                        width  = right - left + 1;
                        height = bottom - top + 1;
                    }
                    // The three vectors must stay index-aligned for NMSBoxes and drawing.
                    classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
                    boxes.push_back(Rect(left, top, width, height));
                    confidences.push_back(confidence);
                }
            }
        }
    }
    else if (outLayerType == "Region")
    {
        for (size_t i = 0; i < outs.size(); ++i)
        {
            // Network produces output blob with a shape NxC where N is a number of
            // detected objects. Columns 0..3 are [center_x, center_y, width, height]
            // relative to the frame size and the per-class scores start at column 5
            // (column 4 is presumably the box objectness score - TODO confirm).
            const float* data = outs[i].ptr<float>();
            for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
            {
                Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
                Point classIdPoint;
                double confidence;
                minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                if (confidence > confThreshold)
                {
                    int centerX = (int)(data[0] * frame.cols);
                    int centerY = (int)(data[1] * frame.rows);
                    int width = (int)(data[2] * frame.cols);
                    int height = (int)(data[3] * frame.rows);
                    int left = centerX - width / 2;
                    int top = centerY - height / 2;

                    // The best-scoring column index is the class id.
                    classIds.push_back(classIdPoint.x);
                    confidences.push_back((float)confidence);
                    boxes.push_back(Rect(left, top, width, height));
                }
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);

    std::vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i)
    {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
                 box.x + box.width, box.y + box.height, frame);
    }
}

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));

    std::string label = format("%.2f", conf);
    if (!classes.empty())
        CV_Assert(classId < (int)classes.size());
        label = classes[classId] + ": " + label;

    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height),
              Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());

void callback(int pos, void*)
    confThreshold = pos * 0.01f;
Use this script to run object detection deep learning networks using OpenCV.
Usage: test1.exe [params] alias

        --async (value:0)
                Number of asynchronous forwards at the same time. Choose 0 for synchronous mode
        --backend (value:0)
                Choose one of computation backends: 0: automatically (by default), 1: Halide language (http://halide-lang.org/), 2: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), 3: OpenCV implementation
        -c, --config
                Path to a text file of model contains network configuration. It could be a file with extensions .prototxt (Caffe), .pbtxt (TensorFlow), .cfg (Darknet), .xml (OpenVINO).
        --classes
                Optional path to a text file with names of classes to label detected objects.
        --device (value:0)
                camera device number.
        -f, --framework
                Optional name of an origin framework of the model. Detect it automatically if it does not set.
        -h, --help
                Print help message.
        --height (value:-1)
                Preprocess input image by resizing to a specific height.
        -i, --input
                Path to input image or video file. Skip this argument to capture frames from a camera.
        -m, --model
                Path to a binary file of model contains trained weights. It could be a file with extensions .caffemodel (Caffe), .pb (TensorFlow), .t7 or .net (Torch), .weights (Darknet), .bin (OpenVINO).
        --mean
                Preprocess input image by subtracting mean values. Mean values should be in BGR order and delimited by spaces.
        --nms (value:.4)
                Non-maximum suppression threshold.
        --rgb
                Indicate that model works with RGB input images instead BGR ones.
        --scale (value:1.0)
                Preprocess input image by multiplying on a scale factor.
        --target (value:0)
                Choose one of target computation devices: 0: CPU target (by default), 1: OpenCL, 2: OpenCL fp16 (half-float precision), 3: VPU
        --thr (value:.5)
                Confidence threshold.
        --width (value:-1)
                Preprocess input image by resizing to a specific width.
        --zoo (value:models.yml)
                An optional path to file with preprocessing parameters

        alias
                An alias name of model to extract preprocessing parameters from models.yml file.

