树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理

一. 引言

最近在玩树莓派,想在树莓派上不是一个目标检测算法,大致看了一下,目前开源的大家都在使用yolov5-Lite,使用ncnn去推理加速,于是自己也尝试部署,在此记录一下,个人踩的坑。

二. 版本选择

这个非常重要,非常重要,非常重要

1. ncnn版本

一定要下载这个版本的ncnn
本人ncnn版本是20210525版本的,其他版本试过几个都不行,强烈推荐这个版本。ncnn版本连接https://github.com/Tencent/ncnn/tree/20210525

下载

git clone https://github.com/Tencent/ncnn.git

安装依赖

sudo apt-get install -y gfortran
sudo apt-get install -y libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
sudo apt-get install --no-install-recommends libboost-all-dev
sudo apt-get install -y libgflags-dev libgoogle-glog-dev liblmdb-dev libatlas-base-dev

编译ncnn

cd ~/ncnn
mkdir build
cmake ..
make -j4
make install

2. yolov5-Lite版本

由于作者版本更新,我目前使用最新的版本时候,有一些bug在ncnn上,没来及修改,于是我使用了之前的版本。推荐这个版本 yolov5-Lite版本 https://github.com/ppogg/YOLOv5-Lite/tree/v1.4

下载

git clone https://github.com/ppogg/YOLOv5-Lite.git

安装依赖

建议按照作者提供的requirements.txt,尤其是pytorch版本一定要低于1.10版本,不然在模型导出,训练时候会报错,pytorch错误。

# pip install -r requirements.txt

# base ----------------------------------------
matplotlib>=3.2.2
numpy>=1.18.5
opencv-python>=4.1.2
Pillow
PyYAML>=5.3.1
scipy>=1.4.1
torch>=1.8.0
torchvision>=0.9.0
tqdm>=4.41.0

# logging -------------------------------------
tensorboard>=2.4.1
# wandb

# plotting ------------------------------------
seaborn>=0.11.0
pandas

# export --------------------------------------
# coremltools>=4.1
# onnx>=1.9.1
# scikit-learn==0.19.2  # for coreml quantization

# extras --------------------------------------
thop  # FLOPS computation
pycocotools>=2.0  # COCO mAP

pytorch版本过高于1.10,会出现如下错误
树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第1张图片

三.模型转换

1. pt 转 onnx

cd ~/yolov5-Lite-1.4/
python3 export --weights "./best.pt" --batch-size 1 --img-size 320

使用onnx-simplifier对onnx进行简化

pip3 install onnx-simplifier
python -m onnxsim best.onnx best-sim.onnx

2. ncnn 模型转换

cd ~/ncnn/build/tools
./onnx2ncnn ./best-sim.onnx ./best-sim.param ./best-sim.bin
转化成fp16
./ncnnoptimize ./best-sim.param ./best-sim.bin ./best-sim-fp16.param ./best-sim-fp16.bin 65536
其中65536 是设置模型转为f16开关

65536来自源码 vim ~/ncnn/tools/ncnnoptimize.cpp
树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第2张图片

3. 修改best-sim.param文件

树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第3张图片
三个输出层的Reshape维度需要修改成-1

修改前:
Reshape  Reshape_468 1 1 632 650 0=6400 1=85 2=3
Reshape  Reshape_484 1 1 652 670 0=1600 1=85 2=3
Reshape  Reshape_500 1 1 672 690 0=400 1=85 2=3
修改后:
Reshape  Reshape_468 1 1 632 650 0=-1 1=85 2=3
Reshape  Reshape_484 1 1 652 670 0=-1 1=85 2=3
Reshape  Reshape_500 1 1 672 690 0=-1 1=85 2=3
632 表示输入层名称 650 表示输出层名称 0表示为第1维度个数 1表示第2维度个数 2表示第3个数

4.修改v5lite-s.cpp源码

#include "layer.h"
#include "net.h"
 
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include 
#include 
#include 
#endif
#include 
#include 
#include 
#include 
 
#include   
#include   
#include   
 
using namespace std;  
using namespace cv;  
using namespace std::chrono;  
 
// 0 : FP16
// 1 : INT8
#define USE_INT8 0
 
// 0 : Image
// 1 : Camera
#define USE_CAMERA 0
 
struct Object
{
    cv::Rect_<float> rect;
    int label;
    float prob;
};
 
static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}
 
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    int i = left;
    int j = right;
    float p = faceobjects[(left + right) / 2].prob;
 
    while (i <= j)
    {
        while (faceobjects[i].prob > p)
            i++;
 
        while (faceobjects[j].prob < p)
            j--;
 
        if (i <= j)
        {
            // swap
            std::swap(faceobjects[i], faceobjects[j]);
 
            i++;
            j--;
        }
    }
 
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < j) qsort_descent_inplace(faceobjects, left, j);
        }
        #pragma omp section
        {
            if (i < right) qsort_descent_inplace(faceobjects, i, right);
        }
    }
}
 
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    if (faceobjects.empty())
        return;
 
    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}
 
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
    picked.clear();
 
    const int n = faceobjects.size();
 
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }
 
    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];
 
        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];
 
            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }
 
        if (keep)
            picked.push_back(i);
    }
}
 
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (1.f + exp(-x)));
}
 
// unsigmoid
static inline float unsigmoid(float y) {
    return static_cast<float>(-1.0 * (log((1.0 / y) - 1.0)));
}
 
static void generate_proposals(const ncnn::Mat &anchors, int stride, const ncnn::Mat &in_pad,
                               const ncnn::Mat &feat_blob, float prob_threshold,
                               std::vector <Object> &objects) {
    const int num_grid = feat_blob.h;
    float unsig_pro = 0;
    if (prob_threshold > 0.6)
        unsig_pro = unsigmoid(prob_threshold);
 
    int num_grid_x;
    int num_grid_y;
    if (in_pad.w > in_pad.h) {
        num_grid_x = in_pad.w / stride;
        num_grid_y = num_grid / num_grid_x;
    } else {
        num_grid_y = in_pad.h / stride;
        num_grid_x = num_grid / num_grid_y;
    }
 
    const int num_class = feat_blob.w - 5;
 
    const int num_anchors = anchors.w / 2;
 
    for (int q = 0; q < num_anchors; q++) {
        const float anchor_w = anchors[q * 2];
        const float anchor_h = anchors[q * 2 + 1];
 
        const ncnn::Mat feat = feat_blob.channel(q);
 
        for (int i = 0; i < num_grid_y; i++) {
            for (int j = 0; j < num_grid_x; j++) {
                const float *featptr = feat.row(i * num_grid_x + j);
 
                // find class index with max class score
                int class_index = 0;
                float class_score = -FLT_MAX;
                float box_score = featptr[4];
                if (prob_threshold > 0.6) {
                    // while prob_threshold > 0.6, unsigmoid better than sigmoid
                    if (box_score > unsig_pro) {
                        for (int k = 0; k < num_class; k++) {
                            float score = featptr[5 + k];
                            if (score > class_score) {
                                class_index = k;
                                class_score = score;
                            }
                        }
 
                        float confidence = sigmoid(box_score) * sigmoid(class_score);
 
                        if (confidence >= prob_threshold) {
 
                            float dx = sigmoid(featptr[0]);
                            float dy = sigmoid(featptr[1]);
                            float dw = sigmoid(featptr[2]);
                            float dh = sigmoid(featptr[3]);
 
                            float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                            float pb_cy = (dy * 2.f - 0.5f + i) * stride;
 
                            float pb_w = pow(dw * 2.f, 2) * anchor_w;
                            float pb_h = pow(dh * 2.f, 2) * anchor_h;
 
                            float x0 = pb_cx - pb_w * 0.5f;
                            float y0 = pb_cy - pb_h * 0.5f;
                            float x1 = pb_cx + pb_w * 0.5f;
                            float y1 = pb_cy + pb_h * 0.5f;
 
                            Object obj;
                            obj.rect.x = x0;
                            obj.rect.y = y0;
                            obj.rect.width = x1 - x0;
                            obj.rect.height = y1 - y0;
                            obj.label = class_index;
                            obj.prob = confidence;
 
                            objects.push_back(obj);
                        }
                    } else {
                        for (int k = 0; k < num_class; k++) {
                            float score = featptr[5 + k];
                            if (score > class_score) {
                                class_index = k;
                                class_score = score;
                            }
                        }
                        float confidence = sigmoid(box_score) * sigmoid(class_score);
 
                        if (confidence >= prob_threshold) {
                            float dx = sigmoid(featptr[0]);
                            float dy = sigmoid(featptr[1]);
                            float dw = sigmoid(featptr[2]);
                            float dh = sigmoid(featptr[3]);
 
                            float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                            float pb_cy = (dy * 2.f - 0.5f + i) * stride;
 
                            float pb_w = pow(dw * 2.f, 2) * anchor_w;
                            float pb_h = pow(dh * 2.f, 2) * anchor_h;
 
                            float x0 = pb_cx - pb_w * 0.5f;
                            float y0 = pb_cy - pb_h * 0.5f;
                            float x1 = pb_cx + pb_w * 0.5f;
                            float y1 = pb_cy + pb_h * 0.5f;
 
                            Object obj;
                            obj.rect.x = x0;
                            obj.rect.y = y0;
                            obj.rect.width = x1 - x0;
                            obj.rect.height = y1 - y0;
                            obj.label = class_index;
                            obj.prob = confidence;
 
                            objects.push_back(obj);
                        }
                    }
                }
            }
        }
    }
}
 
static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov5;
 
#if USE_INT8
    yolov5.opt.use_int8_inference=true;
#else
    yolov5.opt.use_vulkan_compute = true;
    yolov5.opt.use_bf16_storage = true;
#endif
 
    // original pretrained model from https://github.com/ultralytics/yolov5
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
 
#if USE_INT8
    yolov5.load_param("~/ncnn/build/best-sim-int8.param");
    yolov5.load_model("~/ncnn/build/best-sim-int8.bin");
#else
    yolov5.load_param("~/ncnn/build/best-sim-fp16.param");
    yolov5.load_model("~/ncnn/build/best-sim-fp16.bin");
#endif
 
    const int target_size = 320;
    const float prob_threshold = 0.60f;
    const float nms_threshold = 0.60f;
 
    int img_w = bgr.cols;
    int img_h = bgr.rows;
 
    // letterbox pad to multiple of 32
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }
 
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
 
    // pad to target_size rectangle
    // yolov5/utils/datasets.py letterbox
    int wpad = (w + 31) / 32 * 32 - w;
    int hpad = (h + 31) / 32 * 32 - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
 
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);
 
    ncnn::Extractor ex = yolov5.create_extractor();
 
    ex.input("images", in_pad);
 
    std::vector<Object> proposals;
 
    // stride 8
    {
        ncnn::Mat out;
        ex.extract("output", out);
 
        ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;
 
        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
 
        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }
    // stride 16
    {
        ncnn::Mat out;
        ex.extract("671", out);
 
 
        ncnn::Mat anchors(6);
        anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;
 
        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
 
        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }
    // stride 32
    {
        ncnn::Mat out;
        ex.extract("691", out);
 
 
        ncnn::Mat anchors(6);
        anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;
 
        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
 
        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }
 
    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);
 
    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);
 
    int count = picked.size();
 
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];
 
        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
 
        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
 
        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }
 
    return 0;
}
 
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
 
    cv::Mat image = bgr.clone();
 
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
 
        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
 
        cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0));
 
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
 
        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
 
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;
 
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);
 
        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); 
                    
       // cv::putText(image, to_string(fps), cv::Point(100, 100),             //FPS
                    //cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); 
        
    }
#if USE_CAMERA
    imshow("camera", image);
    cv::waitKey(1);
#else
    cv::imwrite("result.jpg", image);
#endif
}
 
#if USE_CAMERA
int main(int argc, char** argv)
{
    cv::VideoCapture capture;
    capture.open(0);  //修改这个参数可以选择打开想要用的摄像头
 
    cv::Mat frame;
    
    //111
    int FPS = 0;  
    int total_frames = 0;  
    high_resolution_clock::time_point t1, t2; 
    
    
    while (true)
    {
        capture >> frame;
        cv::Mat m = frame;
        cv::Mat f = frame;
 
        std::vector<Object> objects;
        
        auto start_time = std::chrono::high_resolution_clock::now();  // 记录开始时间 
        
        detect_yolov5(frame, objects);
 
        auto end_time = std::chrono::high_resolution_clock::now();  // 记录结束时间  
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);  // 计算执行时间 
        
        float fps = (float)(1000)/duration.count();
 
        draw_objects(m, objects);
        cout << "FPS: " << fps << endl;  
        
        //int fps = 1000/duration.count();
        //int x = m.cols-50;
        //int y = m.rows-50;
        //cv::putText(f, to_string(fps), cv::Point(100, 100), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); 
        
        //if (cv::waitKey(30) >= 0)
            //break;
            
    }
}
#else
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }
 
    const char* imagepath = argv[1];
 
    struct timespec begin, end;
    long time;
    clock_gettime(CLOCK_MONOTONIC, &begin);
 
    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }
 
    std::vector<Object> objects;
    detect_yolov5(m, objects);
 
    clock_gettime(CLOCK_MONOTONIC, &end);
    time = (end.tv_sec - begin.tv_sec) + (end.tv_nsec - begin.tv_nsec);
    printf(">> Time : %lf ms\n", (double)time/1000000);
 
    draw_objects(m, objects);
 
    return 0;
}
#endif

修改代码中一些参数
树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第4张图片

ex.extract("output", out)
其中“output”对应best-sim-fl16.param中如下
Permute Transpose_469 1 1 650 output 0=1

树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第5张图片

ex.extract("671", out)
其中“671”对应best-sim-fl16.param中如下
Permute Transpose_485 1 1 670 671 0=1

树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第6张图片

ex.extract("691", out)
其中“691”对应best-sim-fl16.param中如下
Permute Transpose_501 1 1 690 691 0=1

由于anchors是官方训练的,可以不用修改,但是自己训练自己数据集一定要修改anchors,修改anchors,修改anchors

ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;

anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;

anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;

四.编译v5lite-s.cpp

设置ncnn_DIR路径
export ncnn_DIR=~/ncnn/build/install/lib/cmake/ncnn

创建test目录

cd ~/ncnn
mdkir test

编写CMakeLists.txt

project(YOLOv5s)

cmake_minimum_required(VERSION 3.10.2)

# set(ncnn_DIR "/xxx/path/to/ncnn/xxx/lib/cmake/ncnn")

find_package(ncnn REQUIRED)
find_package(OpenCV REQUIRED)

add_executable(ncnnv5lite v5lite-s.cpp)

target_link_libraries(ncnnv5lite ncnn ${OpenCV_LIBS})

把刚刚编写好的v5lite-s.cpp放在test目录中

cp ~/ncnn/v5lite-s.cpp ~/ncnn/test/

编译

mkdir build
cmake ..
make -j4

五. 推理结果


推理耗时 247ms 包含前后处理
树莓派4B使用ncnn部署yolov5-Lite,推理耗时 247ms 包含前后处理_第7张图片

你可能感兴趣的:(ncnn,yolov5-lite,YOLO)