最近在研究ocr模型(包括文本检测和文本识别)在安卓端的部署,由于工作中用到的算法是基于百度研发的PPOCR算法,最终需要在安卓端落地应用,部署框架使用的是ncnn框架,中间涉及模型转换和部署的问题,所以特意在此做一个记录,本文主要讲一下模型部署的问题,关于模型转换的讲解详见我的另一篇文章:安卓端部署PPOCR的ncnn模型——模型转换
说到模型部署,顾名思义,就是通过部署框架编写相关代码使模型能够在终端应用里推理和调用,本文主要讲的是安卓端的OCR模型部署。
以下是ncnn的官方介绍:
在部署以前需要下载和编译相关库并将其放入安卓工程中,关于ncnn部署库的编译与使用可以详见我的另一篇文章:手写数字识别从训练到部署全流程详解——模型在Android端的部署
下面开始正题
/**
 * Scales a bitmap so its longer side is at most maxLength, then snaps both
 * dimensions DOWN to the nearest multiple of step (never below one step).
 * Note: the snapping can slightly change the aspect ratio of the image.
 *
 * @param bitmap    source image
 * @param maxLength upper bound for the longer side before snapping
 * @param step      required divisor of the output width/height (e.g. 32)
 * @return a new bitmap whose sides are positive multiples of step
 */
public static Bitmap resizeWithStep(Bitmap bitmap, int maxLength, int step) {
    final int srcW = bitmap.getWidth();
    final int srcH = bitmap.getHeight();
    final int longSide = Math.max(srcW, srcH);

    int targetW = srcW;
    int targetH = srcH;
    if (longSide > maxLength) {
        // Uniform down-scale so the longer side fits within maxLength.
        final float shrink = maxLength * 1.0f / longSide;
        targetW = (int) Math.floor(shrink * srcW);
        targetH = (int) Math.floor(shrink * srcH);
    }

    // Snap each side down to a multiple of step; never collapse to zero.
    targetW -= targetW % step;
    if (targetW == 0) {
        targetW = step;
    }
    targetH -= targetH % step;
    if (targetH == 0) {
        targetH = step;
    }
    return Bitmap.createScaledBitmap(bitmap, targetW, targetH, true);
}
以上变换方式是目前最通用的方式,虽然比较简单,但有个缺点就是可能会稍微改变原图比例,使原图内容产生一定形变,尤其在原图本身尺寸比较小的情况,从而影响文本区域检测精度,目前我还研究了一种通过边缘扩展的方式以保持原图的比例。
关键代码如下:
// Resizes by PADDING to a step-multiple white canvas instead of stretching,
// so the original aspect ratio is preserved (stretching small images deforms
// text and hurts detection accuracy). The source bitmap is drawn at the
// top-left of the canvas; if the padded canvas would exceed maxLength, the
// whole thing is scaled down by one uniform ratio instead.
// Returns a ResizeInfoEntity holding the original bitmap, the padded/scaled
// bitmap, and the effective scale ratio (needed to map boxes back to the
// original coordinates).
public static ResizeInfoEntity resizeWithBorder(Bitmap currentBitmap, int maxLength, int step){
ResizeInfoEntity resizeInfoEntity = new ResizeInfoEntity();
resizeInfoEntity.setOriginBitmap(currentBitmap);
int width = currentBitmap.getWidth();
int height = currentBitmap.getHeight();
int newWidth = width;
int newHeight = height;
float newWidthRatio = 1.0f;
float newHeightRatio = 1.0f;
float ratio = 1.0f;
// Round each side UP to the next multiple of `step` (e.g. 32) to build the
// background canvas size; extra area will be white padding, not stretch.
if (newWidth % step != 0) {
newWidth = (newWidth / step + 1) * step;
newWidth = Math.max(newWidth, step);
}
// If the padded side exceeds maxLength, remember how much it must shrink.
if(newWidth>maxLength){
newWidthRatio = maxLength * 1.0f / newWidth;
}
if (newHeight % step != 0) {
newHeight = (newHeight / step + 1) * step;
newHeight = Math.max(newHeight, step);
}
if(newHeight>maxLength){
newHeightRatio = maxLength * 1.0f / newHeight;
}
Bitmap targetBitmap = null;
Canvas tgCanvas = null;
// Keep the pre-shrink canvas size to recompute exact ratios after snapping.
int newWidthTemp = newWidth;
int newHeightTemp = newHeight;
// Use the smaller shrink factor so BOTH sides fit within maxLength.
ratio = Math.min(newWidthRatio, newHeightRatio);
if(ratio<1.0f){
// Canvas must shrink: scale it, then snap each side DOWN to a step
// multiple, and recompute the true ratio from the snapped sizes.
newWidth = (int) Math.floor(ratio * newWidth);
newHeight = (int) Math.floor(ratio * newHeight);
if (newWidth % step != 0) {
newWidth = (newWidth / step) * step;
newWidth = Math.max(newWidth, step);
}
newWidthRatio = newWidth * 1.0f / newWidthTemp;
if (newHeight % step != 0) {
newHeight = (newHeight / step) * step;
newHeight = Math.max(newHeight, step);
}
newHeightRatio = newHeight * 1.0f / newHeightTemp;
ratio = Math.min(newWidthRatio, newHeightRatio);
// White background, then draw the uniformly-scaled source at top-left.
targetBitmap = Bitmap.createBitmap(newWidth, newHeight, Bitmap.Config.ARGB_8888);
tgCanvas = new Canvas(targetBitmap);
tgCanvas.drawARGB(255, 255, 255, 255);
tgCanvas.drawBitmap(Bitmap.createScaledBitmap(currentBitmap, (int) Math.floor(currentBitmap.getWidth()*ratio), (int) Math.floor(currentBitmap.getHeight()*ratio), true), 0, 0, null);
resizeInfoEntity.setTargetBitmap(targetBitmap);
}else{
// No shrinking needed: just pad the original onto the white canvas.
targetBitmap = Bitmap.createBitmap(newWidth, newHeight, Bitmap.Config.ARGB_8888);
tgCanvas = new Canvas(targetBitmap);
tgCanvas.drawARGB(255, 255, 255, 255);
tgCanvas.drawBitmap(currentBitmap, 0, 0, null);
resizeInfoEntity.setTargetBitmap(targetBitmap);
}
resizeInfoEntity.setRatio(ratio);
return resizeInfoEntity;
}
输入节点:x
运行推理
输出节点:save_infer_model/scale_0.tmp_1
关键代码
// Wrap the Android bitmap into an ncnn tensor; PIXEL_BGR2RGB swaps the
// channel order during the copy.
ncnn::Mat input = ncnn::Mat::from_android_bitmap(env, inputBitmap, ncnn::Mat::PIXEL_BGR2RGB);
ncnn::Extractor extractor = net->create_extractor();
ncnn::Mat out;
// Normalize pixels to [-1, 1]: (x - 127.5) * (1 / 127.5), in place.
const float meanValues[3] = {127.5f, 127.5f, 127.5f};
const float normValues[3] = {1.0f / 127.5f, 1.0f / 127.5f, 1.0f / 127.5f};
input.substract_mean_normalize(meanValues, normValues);
// Run detection: feed input node "x", read the probability map from output
// node "save_infer_model/scale_0.tmp_1" (names fixed by the converted model).
extractor.input("x", input);
extractor.extract("save_infer_model/scale_0.tmp_1", out);
// Copy the raw detector output into a single-channel float probability map.
// NOTE(review): assumes pred_size == pred_height * pred_width — confirm at
// the call site before the memcpy.
cv::Mat pred_map = cv::Mat::zeros(pred_height, pred_width, CV_32FC1);
memcpy(pred_map.data, pred, pred_size * sizeof(float));
cv::Mat norfMapMat;
// Binarize: pixels above boxThresh become 255, the rest 0 (CV_8U mask).
norfMapMat = pred_map > boxThresh;
cv::Mat cbuf_map;
norfMapMat.convertTo(cbuf_map, CV_8UC1);
cv::Mat mask_map;
// Dilate with a 2x2 all-ones kernel to slightly expand text regions before
// contour extraction (merges near-touching fragments).
cv::Mat dilation_kernel = (cv::Mat_<uint8_t>(2,2) << 1, 1, 1, 1);
cv::dilate(cbuf_map, mask_map, dilation_kernel);
// Turn the mask into scored, unclipped text box candidates.
std::vector<TextBox> boxes = boxes_from_bitmap(pred_map, mask_map, boxScoreThresh, unClipRatio);
// Skew correction: warp each detected quadrilateral into an upright crop.
std::vector<cv::Mat> partImages;
for (const auto &box : textBoxes) {
    partImages.emplace_back(getRotateCropImage(src, box.boxPoint));
}
return partImages;
//尺寸变换
float scale = (float) 32 / (float) src.rows;
float dstWidth = int((float) src.cols * scale / (float) 16 + 0.5f) * 16;
cv::Mat srcResize;
cv::resize(src, srcResize, cv::Size(dstWidth, 32));
注:在精度测试中,将词条图像宽度调整为16的倍数会造成识别精度下降,虽然这会影响性能发挥,但还是应优先保证精度;性能问题主要是最后Softmax_0网络层耗时不稳定所致,正等ncnn作者解决。故词条宽度的调整应根据原始宽高比例来,关键代码如下
float scale = (float)src.cols / (float)src.rows;
int dstWidth = (int)((float)32 * scale);
cv::Mat srcResize;
cv::resize(src, srcResize, cv::Size(dstWidth, 32));
// Wrap the resized BGR crop into an ncnn tensor, swapping to RGB in the copy.
// NOTE(review): from_pixels assumes the cv::Mat data is continuous — true
// for the output of cv::resize above.
ncnn::Mat input = ncnn::Mat::from_pixels(srcResize.data, ncnn::Mat::PIXEL_BGR2RGB, srcResize.cols, srcResize.rows);
// Same normalization as the detector: map pixels to [-1, 1].
const float meanValues[3] = {127.5f, 127.5f, 127.5f};
const float normValues[3] = {1.0f / 127.5f, 1.0f / 127.5f, 1.0f / 127.5f};
input.substract_mean_normalize(meanValues, normValues);
ncnn::Mat out;
ncnn::Extractor extractor = net->create_extractor();
// The recognition model uses the same node names as the detector.
extractor.input("x", input);
extractor.extract("save_infer_model/scale_0.tmp_1", out);
// -- dictionary parsing --
// Load the character dictionary from assets, one key per line.
// readKeysFromAssets() returns a malloc'd buffer (hence free(), not delete[]).
char *buffer = readKeysFromAssets(mgr);
if (buffer != nullptr) {
    std::istringstream inStr(buffer);
    std::string line;
    int size = 0;
    while (getline(inStr, line)) {
        // Strip a trailing '\r' so dictionaries saved with CRLF line endings
        // don't leave a stray carriage return inside every decoded key.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        keys.emplace_back(line);
        size++;
    }
    free(buffer);
    LOGI("keys size(%d)", size);
}
// CTC greedy decode: per time step take the argmax class, then collapse
// consecutive repeats and drop blanks to assemble the recognized string.
int keySize = keys.size();
std::string strRes;
std::vector<float> scores;
int lastIndex = 0;
int maxIndex;
float maxValue;
// h = number of time steps, w = number of classes per step.
// NOTE(review): assumes outputData is row-major [h][w] — confirm upstream.
for (int i = 0; i < h; i++) {
maxIndex = 0;
maxValue = -1000.f;
// Argmax over the class axis for this time step.
for (int j = 0; j < w; j++) {
if (outputData[i * w + j] > maxValue) {
maxValue = outputData[i * w + j];
maxIndex = j;
}
}
// Class 0 is the CTC blank; also skip when this step repeats the previous
// step's class (standard CTC collapse).
// NOTE(review): the bound `maxIndex < keySize` makes keys[keySize - 1]
// unreachable (the lookup below uses maxIndex - 1) — verify whether it
// should be `<=` for this dictionary layout.
if (maxIndex > 0 && maxIndex < keySize && (!(i > 0 && maxIndex == lastIndex))) {
scores.emplace_back(maxValue);
strRes.append(keys[maxIndex - 1]);
}
lastIndex = maxIndex;
}
return {strRes, scores};
...
// input params
fprintf(stderr, "loop_count = %d\n", g_loop_count);//100
fprintf(stderr, "num_threads = %d\n", num_threads);//4
fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());//0
fprintf(stderr, "gpu_device = %d\n", gpu_device);//-1
fprintf(stderr, "cooling_down = %d\n", (int)g_enable_cooling_down);//1
// run det benchmark
benchmark("det_model", ncnn::Mat(320, 320, 3), opt);
benchmark("det_model", ncnn::Mat(480, 480, 3), opt);
benchmark("det_model", ncnn::Mat(640, 640, 3), opt);
// run rec benchmark
benchmark("rec_model", ncnn::Mat(272, 32, 3), opt);
benchmark("rec_model", ncnn::Mat(592, 32, 3), opt);
benchmark("rec_model", ncnn::Mat(816, 32, 3), opt);