Real-Time Semantic Segmentation with BiSeNet: RV1126 NPU Inference


Notes on implementing BiSeNet network inference on the RV1126 NPU.

https://github.com/CoinCheung/BiSeNet


ONNX

Generate the ONNX model

python tools/export_onnx.py --config configs/bisenetv2_city.py --weight-path ./checkpoints/model_final_v2_city.pth --outpath ./checkpoints/model_final_v2_city.onnx --no-onnxsim
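
Before converting, it is worth a quick sanity check of the exported model. A minimal sketch, assuming onnxruntime is installed; the output node should be named preds, which the RKNN conversion script below relies on:

    import onnx
    import onnxruntime as ort
    import numpy as np

    model_path = './checkpoints/model_final_v2_city.onnx'
    onnx.checker.check_model(onnx.load(model_path))

    sess = ort.InferenceSession(model_path)
    for t in sess.get_inputs():
        print('input :', t.name, t.shape)
    for t in sess.get_outputs():
        print('output:', t.name, t.shape)   # expect 'preds' for this repo

    # Dummy forward pass at the resolution used on the board later (1024x512).
    dummy = np.random.rand(1, 3, 512, 1024).astype(np.float32)
    out = sess.run(None, {sess.get_inputs()[0].name: dummy})
    print('preds shape:', out[0].shape)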

Convert to an RKNN model

  1. For a custom dataset, compute the mean/std from your own data rather than reusing the values below.
  2. datasets.txt is the image list used for quantization; datasets_ans.txt is used for accuracy analysis (see the sketch after the conversion script below).
  3. Quantization dtype: asymmetric_affine-u8.
    from rknn.api import RKNN
     
    ONNX_MODEL = './model/model_final_v2_city.onnx'
    RKNN_MODEL = './model/model_final_v2_city_u8.rknn'
    
    QUANTIZE_ON = True
    _force_builtin_perm = False
    _acc_analysis_output_dir = './output_dir'
    _acc_analysis_dataset = './images/city/datasets_ans.txt'
    _qua_dataset = './images/city/datasets.txt'
    
    if __name__ == '__main__':
     
        # Create RKNN object
        rknn = RKNN(verbose=True)
     
        # pre-process config
        # asymmetric_affine-u8, dynamic_fixed_point-i8, dynamic_fixed_point-i16
        print('--> config model')
        rknn.config(
                    reorder_channel='0 1 2',
                    mean_values=[[83.0535, 94.095, 82.1865]],
                    std_values=[[53.856, 54.774, 75.786]],  
                    optimization_level=3,
                    target_platform = 'rv1126',
                    quantize_input_node= QUANTIZE_ON,
                    quantized_dtype='asymmetric_affine-u8',
                    batch_size=32,
                    output_optimize=1,
                    force_builtin_perm=_force_builtin_perm)               
        print('done')
     
        print('--> Loading model')
        # ret = rknn.load_onnx(model=ONNX_MODEL, outputs=['output0', 'output1'])
        ret = rknn.load_onnx(model=ONNX_MODEL, outputs=['preds'])
        if ret != 0:
            print('Load model failed!')
            exit(ret)
        print('done')
     
        # Build model
        print('--> Building model')
        ret = rknn.build(do_quantization=QUANTIZE_ON, dataset=_qua_dataset, pre_compile=True) 
        if ret != 0:
            print('Build model failed!')
            exit(ret)
        print('done')
     
        # Export rknn model
        print('--> Export RKNN model')
        ret = rknn.export_rknn(RKNN_MODEL)
        if ret != 0:
            print('Export  failed!')
            exit(ret)
        print('done')
        
        print('--> Accuracy analysis')
        ret = rknn.accuracy_analysis(inputs=_acc_analysis_dataset, output_dir=_acc_analysis_output_dir)
        if ret != 0:
            print('accuracy_analysis failed!')
            exit(ret)
        print('done')
     
        rknn.release()
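
For reference, both list files and the mean/std values can be produced with a short script. A sketch, assuming the calibration images sit under ./images/city/ (the glob pattern and paths are illustrative):

    import glob
    import cv2
    import numpy as np

    image_paths = sorted(glob.glob('./images/city/*.png'))

    # One image path per line -- the format rknn-toolkit expects for `dataset`.
    with open('./images/city/datasets.txt', 'w') as f:
        f.write('\n'.join(image_paths) + '\n')
    # accuracy_analysis uses the same format; one representative image is enough.
    with open('./images/city/datasets_ans.txt', 'w') as f:
        f.write(image_paths[0] + '\n')

    # Per-channel mean/std in RGB order, on the 0-255 scale used by rknn.config.
    pixels = np.concatenate([
        cv2.cvtColor(cv2.imread(p), cv2.COLOR_BGR2RGB).reshape(-1, 3)
        for p in image_paths
    ]).astype(np.float64)
    print('mean:', pixels.mean(axis=0), 'std:', pixels.std(axis=0))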
    

NPU Inference

  1. Mainly based on the ncnn/TensorRT demos provided by the repo author (a Python runtime-validation sketch follows this list).
    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <vector>
    #include <string>
    #include <random>
    #include <chrono>
    #include <algorithm>
    #include "rknn_api.h"
    #include "opencv2/opencv.hpp"
    #include "opencv2/core/core.hpp"
    #include "opencv2/imgproc/imgproc.hpp"
    #include "opencv2/highgui/highgui.hpp"
    
    using namespace std;
    using namespace cv;
    
    void printRKNNTensor(rknn_tensor_attr *attr)
    {
        printf("index=%d name=%s n_dims=%d dims=[%d %d %d %d] n_elems=%d size=%d "
               "fmt=%d type=%d qnt_type=%d fl=%d zp=%d scale=%f\n",
               attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2],
               attr->dims[1], attr->dims[0], attr->n_elems, attr->size, 0, attr->type,
               attr->qnt_type, attr->fl, attr->zp, attr->scale);
    }
    
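    // Fixed-seed random palette: maps each class id (0-255) to a stable color.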
    vector<vector<uint8_t>> get_color_map()
    {
        vector<vector<uint8_t>> color_map(256, vector<uint8_t>(3));
        std::minstd_rand rand_eng(123);
        std::uniform_int_distribution<int> u(0, 255);
        for (int i{0}; i < 256; ++i)
        {
            for (int j{0}; j < 3; ++j)
            {
                color_map[i][j] = static_cast<uint8_t>(u(rand_eng));
            }
        }
        return color_map;
    }
    
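    // Optional letterbox helper: keeps aspect ratio and pads with (114,114,114).
    // Note: main() below uses a plain cv::resize instead of this helper.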
    cv::Mat static_resize(cv::Mat &img, int INPUT_W, int INPUT_H)
    {
        float r = std::min(INPUT_W / (img.cols * 1.0), INPUT_H / (img.rows * 1.0));
        // r = std::min(r, 1.0f);
        int unpad_w = r * img.cols;
        int unpad_h = r * img.rows;
        cv::Mat re(unpad_h, unpad_w, CV_8UC3);
        cv::resize(img, re, re.size());
        cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
        re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
        return out;
    }
    
    int main(int argc, char *argv[])
    {
    
        std::string model_path = std::string(argv[1]);
        // std::string imagepath = std::string(argv[2]);
        std::string folder_path = std::string(argv[2]);
        int input_width_ = std::atoi(argv[3]);
        int input_height_ = std::atoi(argv[4]);
    
        std::vector<cv::String> file_names;
        cv::glob(folder_path, file_names);
    
        int oH{input_height_}, oW{input_width_}, n_classes{2};
    
        // Load model
        FILE *fp = fopen(model_path.c_str(), "rb");
        if (fp == NULL)
        {
            printf("fopen %s fail!\n", model_path);
            return -1;
        }
        fseek(fp, 0, SEEK_END);
        int model_len = ftell(fp);
        void *model = malloc(model_len);
        fseek(fp, 0, SEEK_SET);
        if (model_len != fread(model, 1, model_len, fp))
        {
            printf("fread %s fail!\n", model_path);
            free(model);
            return -1;
        }
    
        rknn_context ctx = 0;
    
        int ret = rknn_init(&ctx, model, model_len, 0);
        if (ret < 0)
        {
            printf("rknn_init fail! ret=%d\n", ret);
            return -1;
        }
    
        /* Query sdk version */
        rknn_sdk_version version;
        ret = rknn_query(ctx, RKNN_QUERY_SDK_VERSION, &version,
                         sizeof(rknn_sdk_version));
        if (ret < 0)
        {
            printf("rknn_init error ret=%d\n", ret);
            return -1;
        }
        printf("sdk version: %s driver version: %s\n", version.api_version,
               version.drv_version);
    
        /* Get input,output attr */
        rknn_input_output_num io_num;
        ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
        if (ret < 0)
        {
            printf("rknn_init error ret=%d\n", ret);
            return -1;
        }
        printf("model input num: %d, output num: %d\n", io_num.n_input,
               io_num.n_output);
    
        rknn_tensor_attr input_attrs[io_num.n_input];
        memset(input_attrs, 0, sizeof(input_attrs));
        for (int i = 0; i < io_num.n_input; i++)
        {
            input_attrs[i].index = i;
            ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]),
                             sizeof(rknn_tensor_attr));
            if (ret < 0)
            {
                printf("rknn_init error ret=%d\n", ret);
                return -1;
            }
            printRKNNTensor(&(input_attrs[i]));
        }
    
        rknn_tensor_attr output_attrs[io_num.n_output];
        memset(output_attrs, 0, sizeof(output_attrs));
        for (int i = 0; i < io_num.n_output; i++)
        {
            output_attrs[i].index = i;
            ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]),
                             sizeof(rknn_tensor_attr));
            printRKNNTensor(&(output_attrs[i]));
        }
    
        int input_channel = 3;
        int input_width = 0;
        int input_height = 0;
        if (input_attrs[0].fmt == RKNN_TENSOR_NCHW)
        {
            printf("model is NCHW input fmt\n");
            input_width = input_attrs[0].dims[0];
            input_height = input_attrs[0].dims[1];
            printf("input_width=%d input_height=%d\n", input_width, input_height);
        }
        else
        {
            printf("model is NHWC input fmt\n");
            input_width = input_attrs[0].dims[1];
            input_height = input_attrs[0].dims[2];
            printf("input_width=%d input_height=%d\n", input_width, input_height);
        }
    
        printf("model input height=%d, width=%d, channel=%d\n", input_height, input_width,
               input_channel);
    
        // Build the color palette once, outside the per-image loop.
        vector<vector<uint8_t>> color_map = get_color_map();

        for (size_t i = 0; i < file_names.size(); i++)
        {
            cv::Mat im = cv::imread(file_names[i]);
            auto t1 = std::chrono::steady_clock::now();
            Mat pr_img;
            cv::resize(im, pr_img, cv::Size(oW, oH));
            cv::cvtColor(pr_img, pr_img, cv::COLOR_BGR2RGB);
            /* Init input tensor */
            rknn_input inputs[1];
            memset(inputs, 0, sizeof(inputs));
            inputs[0].index = 0;
            inputs[0].buf = pr_img.data;
            inputs[0].type = RKNN_TENSOR_UINT8;
            inputs[0].size = input_width * input_height * input_channel;
            inputs[0].fmt = RKNN_TENSOR_NHWC;
            inputs[0].pass_through = 0;
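            // pass_through=0: the runtime converts the raw RGB u8 buffer itself,
            // applying the mean/std baked in at model-conversion time.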
    
            /* Init output tensor */
            rknn_output outputs[io_num.n_output];
            memset(outputs, 0, sizeof(outputs));
            for (int i = 0; i < io_num.n_output; i++)
            {
                outputs[i].want_float = 1;
            }
            rknn_inputs_set(ctx, io_num.n_input, inputs);
            ret = rknn_run(ctx, NULL);
            if (ret < 0)
            {
                printf("ctx error ret=%d\n", ret);
                return -1;
            }
            ret = rknn_outputs_get(ctx, io_num.n_output, outputs, NULL);
            if (ret < 0)
            {
                printf("outputs error ret=%d\n", ret);
                return -1;
            }
            vector<vector<uint8_t>> color_map = get_color_map();
            cv::Mat pred(cv::Size(oW, oH), CV_8UC3);
            int o_size = input_width * input_height * 4;
            float *prob = new float[o_size];
            memcpy(prob, (float *)outputs[0].buf, o_size);
            int idx{0};
            for (int i{0}; i < oH; ++i)
            {
                uint8_t *ptr = pred.ptr<uint8_t>(i);
                for (int j{0}; j < oW; ++j)
                {
                    int cls = static_cast<int>(prob[idx]);
                    ptr[0] = color_map[cls][0];
                    ptr[1] = color_map[cls][1];
                    ptr[2] = color_map[cls][2];
                    ptr += 3;
                    ++idx;
                }
            }
            // resize back to the original image size and save
            cv::resize(pred, pred, im.size(), 0, 0, cv::INTER_CUBIC);
            cv::imwrite(cv::format("./out/%d.jpg", (int)i), pred);
            ret = rknn_outputs_release(ctx, io_num.n_output, outputs);
            if (ret < 0)
            {
                printf("rknn_query fail! ret=%d\n", ret);
                goto Error;
            }
        }
    
    Error:
        if (ctx > 0)
            rknn_destroy(ctx);
        if (model)
            free(model);
        if (fp)
            fclose(fp);
        return 0;
    }
    
  2. Use adb to copy the binary to the board and run it:
    ./bisenet_seg_npu_sample ./model/model_final_v2_city_u8.rknn ./images 1024 512
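
Before moving to C++, the toolkit runtime can also validate the converted model from the PC over adb. A minimal sketch, assuming a connected board and an illustrative test-image path; note that a model built with pre_compile=True runs only on the device target, not in the simulator:

    import cv2
    import numpy as np
    from rknn.api import RKNN

    rknn = RKNN()
    assert rknn.load_rknn('./model/model_final_v2_city_u8.rknn') == 0
    # target='rv1126' runs inference on the board through adb.
    assert rknn.init_runtime(target='rv1126') == 0

    img = cv2.imread('./images/city/test.png')          # illustrative path
    img = cv2.cvtColor(cv2.resize(img, (1024, 512)), cv2.COLOR_BGR2RGB)

    outputs = rknn.inference(inputs=[img])
    preds = outputs[0].reshape(512, 1024)               # per-pixel class ids
    print('classes present:', np.unique(preds))
    rknn.release()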

(Result images: colorized segmentation outputs from the board run.)

Hybrid Quantization

  1. If the quantized model's accuracy is unsatisfactory, try hybrid quantization to find a workable trade-off between speed and accuracy.

  2. hybrid_quantization_step1.py

    from rknn.api import RKNN
    
    ONNX_MODEL = './model/model_final_v2_city.onnx'
    RKNN_MODEL = './model/model_final_v2_city_u8.rknn'
    
    QUANTIZE_ON = True
    
    _qua_dataset = './images/city/datasets.txt'
    
    _force_builtin_perm = False
    
    if __name__ == '__main__':
    
        # Create RKNN object
        rknn = RKNN()
        
        # model config
        print('--> Config model')
        rknn.config(reorder_channel='0 1 2',
                    mean_values=[[83.0535, 94.095, 82.1865]],
                    std_values=[[53.856, 54.774, 75.786]], 
                    optimization_level=3,
                    target_platform='rv1126',
                    output_optimize=1,
                    quantized_dtype='asymmetric_affine-u8',
                    quantize_input_node= QUANTIZE_ON,  
                    batch_size=32,
                    force_builtin_perm=False
                )   
        print('done')
    
        # Load onnx model
        print('--> Loading model')
        ret = rknn.load_onnx(model=ONNX_MODEL)
        if ret != 0:
            print('Load model failed!')
            exit(ret)
        print('done')
    
        # Hybrid quantization step1
        print('--> hybrid_quantization_step1')
        ret = rknn.hybrid_quantization_step1(dataset=_qua_dataset)
        if ret != 0:
            print('hybrid_quantization_step1 failed!')
            exit(ret)
        print('done')
        print('==================================================================================================')
    
        rknn.release()
    
  3. hybrid_quantization_step2.py. Based on the accuracy-analysis results, edit torchjitexport.quantization.cfg to switch the layers with large quantization error to a higher-precision type such as float32 or dynamic_fixed_point-i16 (a cfg sketch follows the script below).

    from rknn.api import RKNN
    
    ONNX_MODEL = './model/model_final_v2_city.onnx'
    RKNN_MODEL = './model/model_final_v2_city_u8_hyqua.rknn'
    
    QUANTIZE_ON = True
    _force_builtin_perm = False
    
    _qua_dataset = './images/city/datasets.txt'
    
    if __name__ == '__main__':
    
        # Create RKNN object
        rknn = RKNN()
        
        # Set model config
        print('--> config model')
        rknn.config(reorder_channel='0 1 2',
                    mean_values=[[83.0535, 94.095, 82.1865]],
                    std_values=[[53.856, 54.774, 75.786]], 
                    optimization_level=3,
                    target_platform='rv1126',
                    output_optimize=1,
                    quantized_dtype='asymmetric_affine-u8',
                    quantize_input_node= QUANTIZE_ON,  
                    batch_size=32,
                    force_builtin_perm=False
                )   
        print('done')
    
        # Hybrid quantization step2
        print('--> hybrid_quantization_step2')
        ret = rknn.hybrid_quantization_step2(model_input='./torchjitexport.json',
                                             data_input='./torchjitexport.data',
                                             model_quantization_cfg='./torchjitexport.quantization.cfg',
                                             dataset=_qua_dataset, pre_compile=True)
        if ret != 0:
            print('hybrid_quantization_step2 failed!')
            exit(ret)
        print('done')
    
        # Export RKNN model
        print('--> Export RKNN model')
        ret = rknn.export_rknn(RKNN_MODEL)
        if ret != 0:
            print('Export RKNN model failed!')
            exit(ret)
        print('done')
    
        rknn.release()
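
For orientation, the cfg written by step 1 is a YAML-style file with a customized_quantize_layers section; a structure sketch, where the layer names are purely illustrative stand-ins for whatever your accuracy analysis flags:

    # torchjitexport.quantization.cfg -- structure sketch; layer names illustrative.
    # Move high-error layers into this section; keep everything else as generated.
    customized_quantize_layers:
        ConvolutionReluPoolingLayer2_3: dynamic_fixed_point-i16
        ConvolutionLayer_57: float32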
    
    

END

  1. That is roughly the whole inference workflow. Corrections are welcome wherever something is off.
