[RKNN] 1. 入门介绍
[RKNN] 2. 模型转换和推理–API介绍&以yolox为例
[RKNN] 3. 零拷贝接口推理
[RKNN] 4. 基于零拷贝接口封装
Rknn提供了两套C的API,分别是通用接口和零拷贝接口,上一篇文章使用通用接口进行推理,本文主要采用零拷贝接口进行推理。同时兄弟们也可以注意到百度的FastDeploy在rknpu上也是采用的零拷贝接口,应该是相对性能会好一点。
整个推理过程和通用接口基本一致,主要是在输入输出内存的设置部分不同,我先把这部分的对比放出来,然后把整个代码贴上去。当然也可以直接去Github查看所有的代码。不过因为不是通用数据集,我没有上传我的权重文件,建议只参考代码流程。
在初始化时,通用API需要rknn_input
,rknn_output
结构体实现输入输出数据初始化。零拷贝则是创建rknn_tensor_mem
类型的结构体变量申请内存,并绑定到ctx的输入输出中。
通用API
// General API: describe the single model input with a zero-initialized rknn_input.
rknn_input inputs[1];
memset(inputs, 0, sizeof(inputs));
inputs[0].index = 0;
inputs[0].type = RKNN_TENSOR_UINT8;
inputs[0].size = width * height * channel;
inputs[0].fmt = RKNN_TENSOR_NHWC;
inputs[0].pass_through = 0;
// General API: one zero-initialized rknn_output per model output.
rknn_output outputs[io_num.n_output];
memset(outputs, 0, sizeof(outputs));
for (int i = 0; i < io_num.n_output; i++) {
outputs[i].want_float = false; // author's note: the output is u8; true would convert it to float internally before returning it
}
零拷贝API
// Zero-copy API: allocate the input/output buffers through the runtime.
// The input buffer is sized by size_with_stride, the output buffer by one int8 per element.
rknn_tensor_mem* input_mems[1];
rknn_tensor_mem* output_mems[1];
input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride);
output_mems[0] = rknn_create_mem(ctx, output_attrs[0].n_elems * sizeof(int8_t));
// Set the input/output data types before binding.
input_attrs[0].type = RKNN_TENSOR_UINT8;
output_attrs[0].type = RKNN_TENSOR_INT8;
// Bind each buffer to the context together with its tensor attributes.
CHECK_RKNN(rknn_set_io_mem(ctx, input_mems[0], &input_attrs[0]));
CHECK_RKNN(rknn_set_io_mem(ctx, output_mems[0], &output_attrs[0]));
通用API需要利用rknn_inputs_set
设置输入,零拷贝API则直接将数据拷贝到之前申请好的内存即可。
通用API
// General API: point the input descriptor at the preprocessed image, then pass it to the runtime.
inputs[0].buf = (void*)img_out.data;
CHECK_RKNN(rknn_inputs_set(ctx, io_num.n_input, inputs));
零拷贝API
memcpy(input_mems[0]->virt_addr, img_out.data, input_attrs[0].size_with_stride);
通用API需要利用rknn_outputs_get
获取输出,零拷贝API则直接读取之前申请好的输出部分的内存即可。
通用API
// General API: fetch the outputs from the runtime, then read the first output buffer as int8.
CHECK_RKNN(rknn_outputs_get(ctx, io_num.n_output, outputs, NULL));
auto result = (int8_t *)outputs->buf;
零拷贝API
// Zero-copy API: the result is read directly from the output buffer bound earlier.
auto result = (int8_t *)output_mems[0]->virt_addr;
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "rga.h"
#include "im2d.h"
#include "rknn_api.h"
#include "opencv2/opencv.hpp"
#include "tools.hpp"
#include "postprocess.hpp"
// Prints the fields of one tensor attribute record (index, name, shape, sizes, stride,
// format, type, quantization parameters) as a single line on stdout.
static void dump_tensor_attr(rknn_tensor_attr* attr){
    // Join the dimensions as "d0, d1, ..."; with no dimensions the text stays empty.
    std::string dims_text;
    for (int d = 0; d < attr->n_dims; ++d) {
        if (d != 0) {
            dims_text += ", ";
        }
        dims_text += std::to_string(attr->dims[d]);
    }

    printf(" index=%d, name=%s, n_dims=%d, dims=[%s], n_elems=%d, size=%d, w_stride = %d, size_with_stride=%d, fmt=%s, "
           "type=%s, qnt_type=%s, "
           "zp=%d, scale=%f\n",
           attr->index,
           attr->name,
           attr->n_dims,
           dims_text.c_str(),
           attr->n_elems,
           attr->size,
           attr->w_stride,
           attr->size_with_stride,
           get_format_string(attr->fmt),
           get_type_string(attr->type),
           get_qnt_type_string(attr->qnt_type),
           attr->zp,
           attr->scale);
}
// Letterbox resize: scales img uniformly so it fits inside input_w x input_h, then pastes it
// at the top-left corner of a canvas of that size filled with the value 114 in every channel.
// Returns the canvas; img itself is not modified.
cv::Mat static_resize(cv::Mat& img, int input_w, int input_h) {
    // Pick the smaller of the two axis ratios so that both sides fit.
    const double ratio_w = input_w / (img.cols * 1.0);
    const double ratio_h = input_h / (img.rows * 1.0);
    const float ratio = std::min(ratio_w, ratio_h);
    // The ratio is not clamped to 1.0, so small images are scaled up.

    // Size of the scaled image before padding (fractions are truncated).
    const int scaled_w = ratio * img.cols;
    const int scaled_h = ratio * img.rows;

    cv::Mat scaled(scaled_h, scaled_w, CV_8UC3);
    cv::resize(img, scaled, scaled.size());

    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114));
    const cv::Rect paste_area(0, 0, scaled.cols, scaled.rows);
    scaled.copyTo(canvas(paste_area));
    return canvas;
}
int main(){
std::string model_name = "../../1convert/yolox_relu_nodecode.rknn";
std::string image_name = "../img/1.jpg";
const float nms_threshold = 0.65;
const float box_conf_threshold = 0.45;
rknn_context ctx;
std::vector<rknn_tensor_attr> input_attrs;
std::vector<rknn_tensor_attr> output_attrs;
// 反量化参数
std::vector<float> out_scales;
std::vector<int32_t> out_zps;
// 加载文件
int model_data_size = 0;
unsigned char* model_data = load_model(model_name.c_str(), &model_data_size);
// 初始化
CHECK_RKNN(rknn_init(&ctx, model_data, model_data_size, 0, NULL));
// 指定npu核
// rknn_core_mask core_mask = RKNN_NPU_CORE_0_1_2;
// CHECK_RKNN(rknn_set_core_mask(ctx, core_mask));
// 获取&打印版本信息
rknn_sdk_version version;
CHECK_RKNN(rknn_query(ctx, RKNN_QUERY_SDK_VERSION, &version, sizeof(rknn_sdk_version)));
printf("sdk version: %s driver version: %s\n", version.api_version, version.drv_version);
// 获取&打印输入输出数量
rknn_input_output_num io_num;
CHECK_RKNN(rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)));
printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);
// 获取&打印input信息
input_attrs.resize(io_num.n_input);
// memset(input_attrs, 0, sizeof(input_attrs));
for (int i = 0; i < io_num.n_input; i++) {
input_attrs[i].index = i;
CHECK_RKNN(rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)));
printf("input information: ");
dump_tensor_attr(&(input_attrs[i]));
}
// 获取&打印output信息
output_attrs.resize(io_num.n_output);
// memset(output_attrs, 0, sizeof(output_attrs));
for (int i = 0; i < io_num.n_output; i++) {
output_attrs[i].index = i;
CHECK_RKNN(rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)));
printf("output information:");
dump_tensor_attr(&(output_attrs[i]));
}
int channel = 3;
int width = 0;
int height = 0;
if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) {
printf("model is NCHW input fmt\n");
channel = input_attrs[0].dims[1];
height = input_attrs[0].dims[2];
width = input_attrs[0].dims[3];
} else {
printf("model is NHWC input fmt\n");
height = input_attrs[0].dims[1];
width = input_attrs[0].dims[2];
channel = input_attrs[0].dims[3];
}
// 初始化后处理类
for (int i = 0; i < io_num.n_output; ++i) {
out_scales.push_back(output_attrs[i].scale);
out_zps.push_back(output_attrs[i].zp);
}
std::shared_ptr<YoloxPostProcess> post_process = std::make_shared<YoloxPostProcess>(height, box_conf_threshold, nms_threshold, output_attrs);
// 读取图片
cv::Mat img = cv::imread(image_name, 1);
// cv::cvtColor(orig_img, img, cv::COLOR_BGR2RGB);
// 预处理
float scale = std::min(width / (img.cols*1.0), height / (img.rows*1.0));
auto img_out = static_resize(img, width, height);
// 输入输出内存
rknn_tensor_mem* input_mems[1];
rknn_tensor_mem* output_mems[1];
input_mems[0] = rknn_create_mem(ctx, input_attrs[0].size_with_stride);
output_mems[0] = rknn_create_mem(ctx, output_attrs[0].n_elems * sizeof(int8_t));
// 输入输出类型
input_attrs[0].type = RKNN_TENSOR_UINT8;
output_attrs[0].type = RKNN_TENSOR_INT8;
// 绑定
CHECK_RKNN(rknn_set_io_mem(ctx, input_mems[0], &input_attrs[0]));
CHECK_RKNN(rknn_set_io_mem(ctx, output_mems[0], &output_attrs[0]));
memcpy(input_mems[0]->virt_addr, img_out.data, input_attrs[0].size_with_stride);
// 推理
CHECK_RKNN(rknn_run(ctx, NULL));
// 后处理
auto res = post_process->process((int8_t *)output_mems[0]->virt_addr, out_zps, out_scales);
// 打印结果
printf("res size: %ld\n", res.size());
for (auto a : res) {
a.x1 /= scale;
a.y1 /= scale;
a.x2 /= scale;
a.y2 /= scale;
std::cout<<a.x1<<" "<<a.y1<<" "<<a.x2<<" "<<a.y2 <<" "<<a.score<<" "<<a.category<< std::endl;
cv::rectangle(img, cv::Point(a.x1, a.y1), cv::Point(a.x2, a.y2), cv::Scalar(255, 0, 0, 255), 3);
cv::putText(img, std::to_string(a.category), cv::Point(a.x1, a.y1 + 12), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
}
cv::imwrite("./out.jpg", img);
// 测速
int test_count = 200;
// warmup
for (int i = 0; i < 50; ++i) {
auto img_out = static_resize(img, width, height);
memcpy(input_mems[0]->virt_addr, img_out.data, input_attrs[0].size_with_stride);
CHECK_RKNN(rknn_run(ctx, NULL));
auto res = post_process->process((int8_t *)output_mems[0]->virt_addr, out_zps, out_scales);
}
auto start = std::chrono::system_clock::now();
for (int i = 0; i < test_count; ++i) {
auto img_out = static_resize(img, width, height);
memcpy(input_mems[0]->virt_addr, img_out.data, input_attrs[0].size_with_stride);
CHECK_RKNN(rknn_run(ctx, NULL));
auto res = post_process->process((int8_t *)output_mems[0]->virt_addr, out_zps, out_scales);
}
auto end = std::chrono::system_clock::now();
float infer_time = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() ;
printf("运行 %d 次,平均耗时 %f ms\n", test_count, infer_time / (float)test_count);
// release
CHECK_RKNN(rknn_destroy_mem(ctx, input_mems[0]));
CHECK_RKNN(rknn_destroy_mem(ctx, output_mems[0]));
CHECK_RKNN(rknn_destroy(ctx));
if (model_data) {
free(model_data);
}
return 0;
}
warmup轮数50,循环推理1000次(注意:上面贴出的代码中test_count为200),计算平均耗时,包括预处理和后处理的总时间。npu只用了一个核心。
型号 | 速度 |
---|---|
通用API | 30-32ms |
零拷贝API | 30-31ms |
我发现重复运行每次的时间都不太一致,我记得之前看文档,说固定一些频率会更稳定,去试试。
下面操作均在root
用户下。不过我固定了cpu和npu频率后还是会有波动,不算大。
找到一个脚本
# Pins the CPU clusters and the NPU to fixed frequencies to reduce timing jitter.
# Can be run as root or as a user with sudo rights.

# Writes value $1 to the root-owned sysfs node $2.
# `sudo echo X > file` does not work for a non-root caller: the redirection is performed by
# the calling shell, not by sudo. Piping through `sudo tee` performs the write as root.
write_sysfs() {
    echo "$1" | sudo tee "$2" > /dev/null
}

# Pin the CPU frequencies
echo "CPU0-3 可用频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy0/scaling_available_frequencies
write_sysfs userspace /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
write_sysfs 1800000 /sys/devices/system/cpu/cpufreq/policy0/scaling_setspeed
echo "CPU0-3 当前频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy0/cpuinfo_cur_freq
echo "CPU4-5 可用频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy4/scaling_available_frequencies
write_sysfs userspace /sys/devices/system/cpu/cpufreq/policy4/scaling_governor
write_sysfs 2400000 /sys/devices/system/cpu/cpufreq/policy4/scaling_setspeed
echo "CPU4-5 当前频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy4/cpuinfo_cur_freq
echo "CPU6-7 可用频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy6/scaling_available_frequencies
write_sysfs userspace /sys/devices/system/cpu/cpufreq/policy6/scaling_governor
write_sysfs 2400000 /sys/devices/system/cpu/cpufreq/policy6/scaling_setspeed
echo "CPU6-7 当前频率:"
sudo cat /sys/devices/system/cpu/cpufreq/policy6/cpuinfo_cur_freq
# Pin the NPU frequency
echo "NPU 可用频率:"
sudo cat /sys/class/devfreq/fdab0000.npu/available_frequencies
write_sysfs userspace /sys/class/devfreq/fdab0000.npu/governor
write_sysfs 1000000000 /sys/class/devfreq/fdab0000.npu/userspace/set_freq
echo "NPU 当前频率:"
sudo cat /sys/class/devfreq/fdab0000.npu/cur_freq
# Show the current CPU frequency
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq
# 1800000
# List all available frequencies
cat /sys/devices/system/cpu/cpufreq/policy0/scaling_available_frequencies
# 408000 600000 816000 1008000 1200000 1416000 1608000 1800000
# Set the frequency (switch to the userspace governor first, then write the target speed)
echo userspace > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
echo 1800000 > /sys/devices/system/cpu/cpufreq/policy0/scaling_setspeed
# Show the NPU clock
cat /sys/kernel/debug/clk/clk_summary | grep clk_npu_dsu0
# clk_npu_dsu0 3 6 0 250000000 0 0 50000
# List the available NPU frequencies
cat /sys/class/devfreq/fdab0000.npu/available_frequencies
# 300000000 400000000 500000000 600000000 700000000 800000000 900000000 1000000000
# Set the NPU frequency, e.g. 1 GHz
echo 1000000000 > /sys/kernel/debug/clk/clk_npu_dsu0/clk_rate
我看文档是可以调整的,但是不知道为什么我的板子里没有。
# List the available DDR frequencies
cat /sys/class/devfreq/dmc/available_frequencies
# Set the DDR frequency, e.g. 1560 MHz (switch to the userspace governor first)
echo userspace > /sys/class/devfreq/dmc/governor
echo 1560000000 > /sys/class/devfreq/dmc/userspace/set_freq