我们使用mmdetection训练了一个以mobilenet-v2为backbone的SSD模型,将其从Pytorch转化为onnx,并使用TVM运行。当前TVM的文档中仅有一个基于mxnet的SSD模型的tutorial,与我们期望的不符。因此我们自己实现了从onnx到TVM的程序代码,并使用了TVM中的INT-8量化,将其部署到树莓派4B上。
完整代码
import time
import cv2
import numpy as np
import onnx
import tvm
import tvm.contrib.graph_runtime as runtime
import tvm.relay as relay
# preprocess
image_path = '9331584514251_.pic_hd.jpg'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None.
    raise FileNotFoundError('failed to read image: {}'.format(image_path))
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
resize_shape = (300, 300)  # (width, height), the order cv2.resize expects
img = cv2.resize(img, resize_shape, interpolation=cv2.INTER_LINEAR)
mean = np.array([123.675, 116.28, 103.53]).reshape(1, -1)
std = np.array([1., 1., 1.]).reshape(1, -1)
img = img.astype(np.float32)
img = cv2.subtract(img, mean)
# Normalisation is (img - mean) / std, so scale by the reciprocal of std.
# (Identical to the previous behaviour while std is all ones.)
img = cv2.multiply(img, 1 / std)
img = img.transpose(2, 0, 1)  # HWC -> CHW

# load onnx model and build tvm runtime
target = 'llvm'
ctx = tvm.context(target)
dtype = 'float32'
mssd = onnx.load('mssd.onnx')
input_blob = mssd.graph.input[0]
input_shape = tuple(
    dim.dim_value for dim in input_blob.type.tensor_type.shape.dim)
shape_dict = {input_blob.name: input_shape}
mod, params = relay.frontend.from_onnx(mssd, shape_dict)
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, target, params=params)
module = runtime.create(graph, lib, ctx)

# run
module.set_input(**params)
module.set_input(input_blob.name, tvm.nd.array(img))
module.run()

# postprocess
# generate anchor
from anchor import gen_anchors
mlvl_anchors = gen_anchors()
img_shape = image.shape  # (height, width, channels)
# resize_shape is (width, height); index it accordingly so that a
# non-square network input would still be rescaled correctly.
resize_w, resize_h = resize_shape
scale_factor = [img_shape[1] / resize_w, img_shape[0] / resize_h]  # x_scale, y_scale
from bbox_utils import get_bboxes_single
from easydict import EasyDict
cfg = dict(
    nms=dict(iou_thr=0.45),
    min_bbox_size=0,
    score_thr=0.6,
    max_per_img=200
)
cfg = EasyDict(cfg)

# get output: the first six outputs are read as class scores and the
# next six as box regressions, one pair per anchor level.
cls_score_list = [module.get_output(i).asnumpy()[0] for i in range(6)]
bbox_pred_list = [module.get_output(i + 6).asnumpy()[0] for i in range(6)]

# recover bbox; the clipping shape is passed as (height, width)
proposals = get_bboxes_single(cls_score_list, bbox_pred_list, mlvl_anchors,
                              (resize_h, resize_w), scale_factor, cfg,
                              rescale=True)
from vis_bbox import imshow_det_bboxes
bboxes, labels = proposals
imshow_det_bboxes(image, bboxes, labels, score_thr=0.9, out_file='out.png')
mmdetection自带有目标检测的后处理代码,但是由于mmdetection是用Pytorch实现的,而Pytorch在树莓派上的安装十分麻烦,因此我们使用Numpy,基于mmdetection的后处理代码,重新实现了SSD后处理。
# generate anchor
from anchor import gen_anchors
mlvl_anchors = gen_anchors()
img_shape = image.shape  # (height, width, channels)
# resize_shape is the (width, height) tuple given to cv2.resize; index it
# accordingly so a non-square network input is rescaled correctly.
resize_w, resize_h = resize_shape
scale_factor = [img_shape[1] / resize_w, img_shape[0] / resize_h]  # x_scale, y_scale
from bbox_utils import get_bboxes_single
from easydict import EasyDict
cfg = dict(
    nms=dict(type='nms', iou_thr=0.45),
    min_bbox_size=0,
    score_thr=0.6,
    max_per_img=200
)
cfg = EasyDict(cfg)

# get output (timed): outputs 0-5 are read as class scores and 6-11 as
# box regressions, one pair per anchor level.
post_start = time.time()
cls_score_list = [module.get_output(i).asnumpy()[0] for i in range(6)]
bbox_pred_list = [module.get_output(i + 6).asnumpy()[0] for i in range(6)]

# recover bbox; the clipping shape is passed as (height, width)
proposals = get_bboxes_single(cls_score_list, bbox_pred_list, mlvl_anchors,
                              (resize_h, resize_w), scale_factor, cfg,
                              rescale=True)
post_end = time.time()
from vis_bbox import imshow_det_bboxes
bboxes, labels = proposals
imshow_det_bboxes(image, bboxes, labels, score_thr=0.9, out_file='out.png')
后处理需要的文件如下:
# ./anchor.py
import numpy as np
from anchor_generator import AnchorGenerator
# generate anchor
def gen_anchors():
    """Build the anchor boxes of every feature level for a 300x300 SSD.

    All settings (strides, aspect ratios, feature-map sizes, size range)
    are hard-coded in the function body.

    Returns:
        list: one array per feature level, each being the result of
            ``AnchorGenerator.grid_anchors`` for that level.
    """
    basesize_ratio_range = (0.2, 0.9)
    in_channels = (32, 96, 320, 512, 256, 256)
    input_size = 300
    anchor_strides = (8, 16, 32, 64, 100, 300)
    anchor_ratios = ([2], [2, 3], [2, 3], [2, 3], [2], [2])
    feature_size = ((38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1))

    # Anchor sizes are expressed as integer percentages of the input size.
    low_pct = int(basesize_ratio_range[0] * 100)
    high_pct = int(basesize_ratio_range[1] * 100)
    step = int(np.floor(high_pct - low_pct) / (len(in_channels) - 2))
    percents = range(low_pct, high_pct + 1, step)

    # The first level uses a fixed 10%-20% range, prepended to the rest.
    min_sizes = [int(input_size * 10 / 100)]
    min_sizes += [int(input_size * pct / 100) for pct in percents]
    max_sizes = [int(input_size * 20 / 100)]
    max_sizes += [int(input_size * (pct + step) / 100) for pct in percents]

    mlvl_anchors = []
    for level, stride in enumerate(anchor_strides):
        min_size = min_sizes[level]
        max_size = max_sizes[level]
        half = (stride - 1) / 2.

        ratios = [1.]
        for r in anchor_ratios[level]:
            ratios.extend((1 / r, r))  # 4 or 6 ratio

        generator = AnchorGenerator(
            min_size,
            [1., np.sqrt(max_size / min_size)],
            ratios,
            scale_major=False,
            ctr=(half, half))

        # Keep one anchor per ratio at the first scale, and move the
        # anchor at index len(ratios) (first anchor of the second scale)
        # to position 1; every other base anchor is dropped.
        order = list(range(len(ratios)))
        order.insert(1, len(order))
        generator.base_anchors = generator.base_anchors[order]

        mlvl_anchors.append(
            generator.grid_anchors(feature_size[level], stride))
    return mlvl_anchors
# ./anchor_generator.py
import numpy as np
class AnchorGenerator(object):
    """Generate base anchors and tile them over a feature map.

    Args:
        base_size: side length of the square base box.
        scales: sequence of scale multipliers.
        ratios: sequence of height/width aspect ratios.
        scale_major (bool): if True the flattened anchor order iterates
            ratios in the outer dimension and scales in the inner one;
            otherwise scales are outer and ratios inner.
        ctr: optional ``(x, y)`` center of the base anchors; defaults to
            the center of the base box.
    """

    def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None):
        self.base_size = base_size
        self.scales = np.array(scales)
        self.ratios = np.array(ratios)
        self.scale_major = scale_major
        self.ctr = ctr
        self.base_anchors = self.gen_base_anchors()

    @property
    def num_base_anchors(self):
        """Number of rows in ``base_anchors``."""
        return self.base_anchors.shape[0]

    def gen_base_anchors(self):
        """Return the rounded ``(x1, y1, x2, y2)`` base anchors, one per row."""
        side = self.base_size
        if self.ctr is not None:
            x_ctr, y_ctr = self.ctr
        else:
            x_ctr = 0.5 * (side - 1)
            y_ctr = 0.5 * (side - 1)

        h_ratios = np.sqrt(self.ratios)
        w_ratios = 1 / h_ratios
        if self.scale_major:
            ws = (side * w_ratios[:, None] * self.scales[None, :]).flatten()
            hs = (side * h_ratios[:, None] * self.scales[None, :]).flatten()
        else:
            ws = (side * self.scales[:, None] * w_ratios[None, :]).flatten()
            hs = (side * self.scales[:, None] * h_ratios[None, :]).flatten()

        half_w = 0.5 * (ws - 1)
        half_h = 0.5 * (hs - 1)
        corners = [
            x_ctr - half_w, y_ctr - half_h,
            x_ctr + half_w, y_ctr + half_h,
        ]
        return np.stack(corners, axis=-1).round()

    @staticmethod
    def _meshgrid(x, y, row_major=True):
        """Return flattened grid coordinates built from 1-D ``x`` and ``y``.

        ``x`` is repeated as a whole ``len(y)`` times; each element of
        ``y`` is repeated ``len(x)`` times. With ``row_major=False`` the
        two results are returned in swapped order.
        """
        xx = np.tile(x, len(y))
        yy = np.repeat(y, len(x))
        return (xx, yy) if row_major else (yy, xx)

    def grid_anchors(self, featmap_size, stride=16):
        """Shift the base anchors to every cell of a feature map.

        Args:
            featmap_size: ``(feat_h, feat_w)``.
            stride: pixel distance between neighbouring cells.

        Returns:
            ndarray: ``(feat_h * feat_w * A, 4)`` anchors, where ``A`` is
                the number of base anchors. The first ``A`` rows belong
                to cell (0, 0), then (0, 1), (0, 2), ...
        """
        feat_h, feat_w = featmap_size
        xs = np.arange(0, feat_w) * stride
        ys = np.arange(0, feat_h) * stride
        grid_x, grid_y = self._meshgrid(xs, ys)
        shifts = np.stack([grid_x, grid_y, grid_x, grid_y], axis=-1)
        shifts = shifts.astype(self.base_anchors.dtype)
        # (1, A, 4) base anchors + (K, 1, 4) shifts -> (K, A, 4)
        shifted = self.base_anchors[None, :, :] + shifts[:, None, :]
        return shifted.reshape(-1, 4)
# ./bbox_utils.py
import numpy as np
from utils import sigmoid, softmax, addcmul, topk
from bbox_nms import multiclass_nms
def delta2bbox(rois,
               deltas,
               means=None,
               stds=None,
               max_shape=None,
               wh_ratio_clip=16 / 1000):
    """Apply regression deltas to reference boxes.

    Args:
        rois (ndarray): ``(n, 4)`` reference boxes as ``x1, y1, x2, y2``.
        deltas (ndarray): ``(n, 4 * k)`` encoded offsets laid out as
            repeating ``dx, dy, dw, dh`` groups.
        means: 4 values added to the deltas after scaling (default zeros).
        stds: 4 values the deltas are multiplied by (default ones).
        max_shape: optional ``(height, width, ...)``; when given, the
            decoded coordinates are clipped to ``[0, width - 1]`` and
            ``[0, height - 1]``.
        wh_ratio_clip (float): bounds ``dw``/``dh`` to
            ``+-abs(log(wh_ratio_clip))`` before exponentiation.

    Returns:
        ndarray: decoded boxes with the same shape as ``deltas``.
    """
    if means is None:
        means = [0, 0, 0, 0]
    if stds is None:
        stds = [1, 1, 1, 1]

    # Repeat the 4 statistics so they line up with every dx/dy/dw/dh group.
    reps = (1, deltas.shape[1] // 4)
    means = np.tile(np.array(means, dtype=deltas.dtype), reps)
    stds = np.tile(np.array(stds, dtype=deltas.dtype), reps)
    denorm = deltas * stds + means
    dx, dy, dw, dh = (denorm[:, offset::4] for offset in range(4))

    limit = np.abs(np.log(wh_ratio_clip))
    dw = np.clip(dw, -limit, limit)
    dh = np.clip(dh, -limit, limit)

    # Center and size of every reference box, as column vectors.
    px = ((rois[:, 0] + rois[:, 2]) * 0.5)[:, None]
    py = ((rois[:, 1] + rois[:, 3]) * 0.5)[:, None]
    pw = (rois[:, 2] - rois[:, 0] + 1.0)[:, None]
    ph = (rois[:, 3] - rois[:, 1] + 1.0)[:, None]

    # Scale the size exponentially and shift the center linearly.
    gw = pw * np.exp(dw)
    gh = ph * np.exp(dh)
    gx = addcmul(px, pw, dx)  # gx = px + pw * dx
    gy = addcmul(py, ph, dy)  # gy = py + ph * dy

    # Center/size -> top-left and bottom-right corners.
    x1 = gx - gw * 0.5 + 0.5
    y1 = gy - gh * 0.5 + 0.5
    x2 = gx + gw * 0.5 - 0.5
    y2 = gy + gh * 0.5 - 0.5

    if max_shape is not None:
        x_max = max_shape[1] - 1
        y_max = max_shape[0] - 1
        x1 = np.clip(x1, 0, x_max)
        y1 = np.clip(y1, 0, y_max)
        x2 = np.clip(x2, 0, x_max)
        y2 = np.clip(y2, 0, y_max)

    return np.stack([x1, y1, x2, y2], axis=-1).reshape(deltas.shape)
def get_bboxes_single(cls_score_list,
                      bbox_pred_list,
                      mlvl_anchors,
                      img_shape,
                      scale_factor,
                      cfg,
                      rescale=False):
    """Decode the per-level network outputs of one image into detections.

    Args:
        cls_score_list (list[ndarray]): per-level class scores; each is
            transposed with ``(1, 2, 0)`` and flattened to
            ``(-1, cls_out_channels)``.
        bbox_pred_list (list[ndarray]): per-level box deltas; each is
            transposed with ``(1, 2, 0)`` and flattened to ``(-1, 4)``.
        mlvl_anchors (list[ndarray]): per-level ``(n, 4)`` anchors.
        img_shape: ``(height, width, ...)`` used to clip decoded boxes.
        scale_factor: ``[x_scale, y_scale]`` multiplied onto the boxes
            when ``rescale`` is true.
        cfg: config exposing ``get('nms_pre', -1)`` plus the attributes
            ``score_thr``, ``nms`` and ``max_per_img``.
        rescale (bool): whether to apply ``scale_factor`` to the result.

    Returns:
        tuple: ``(det_bboxes, det_labels)`` as returned by
            ``multiclass_nms``.
    """
    assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_anchors)
    mlvl_bboxes = []
    mlvl_scores = []
    # Model-specific settings, hard-coded in this port.
    use_sigmoid_cls = False
    cls_out_channels = 2
    target_means = (.0, .0, .0, .0)
    target_stds = (0.1, 0.1, 0.2, 0.2)

    for cls_score, bbox_pred, anchors in zip(cls_score_list,
                                             bbox_pred_list, mlvl_anchors):
        assert cls_score.shape[-2:] == bbox_pred.shape[-2:]
        cls_score = np.transpose(cls_score, (1, 2, 0)).reshape(
            -1, cls_out_channels)
        if use_sigmoid_cls:
            scores = sigmoid(cls_score)
        else:
            scores = softmax(cls_score)
        bbox_pred = np.transpose(bbox_pred, (1, 2, 0)).reshape(-1, 4)

        nms_pre = cfg.get('nms_pre', -1)
        if 0 < nms_pre < scores.shape[0]:
            # Keep only the nms_pre anchors with the best foreground score.
            # ndarray.max returns just the values (no indices), so it must
            # not be tuple-unpacked.
            if use_sigmoid_cls:
                max_scores = scores.max(axis=1)
            else:
                max_scores = scores[:, 1:].max(axis=1)
            # max_scores is 1-D: rank it directly, best score first.
            topk_inds = np.argsort(-max_scores)[:nms_pre]
            anchors = anchors[topk_inds, :]
            bbox_pred = bbox_pred[topk_inds, :]
            scores = scores[topk_inds, :]

        bboxes = delta2bbox(anchors, bbox_pred, target_means,
                            target_stds, img_shape)
        mlvl_bboxes.append(bboxes)
        mlvl_scores.append(scores)

    mlvl_bboxes = np.concatenate(mlvl_bboxes)
    mlvl_scores = np.concatenate(mlvl_scores)
    if use_sigmoid_cls:
        # Add a dummy background class to the front when using sigmoid
        padding = np.zeros((mlvl_scores.shape[0], 1), dtype=mlvl_scores.dtype)
        mlvl_scores = np.concatenate([padding, mlvl_scores], axis=1)

    det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
                                            cfg.score_thr, cfg.nms,
                                            cfg.max_per_img)
    if rescale:
        # x coordinates use the x scale, y coordinates the y scale.
        det_bboxes[:, 0] *= scale_factor[0]
        det_bboxes[:, 1] *= scale_factor[1]
        det_bboxes[:, 2] *= scale_factor[0]
        det_bboxes[:, 3] *= scale_factor[1]
    return det_bboxes, det_labels
# ./bbox_nms.py
import numpy as np
from utils import nms
def multiclass_nms(multi_bboxes,
                   multi_scores,
                   score_thr,
                   nms_cfg,
                   max_num=-1,
                   score_factors=None):
    """NMS for multi-class bboxes.

    Args:
        multi_bboxes (ndarray): shape (n, #class*4) or (n, 4)
        multi_scores (ndarray): shape (n, #class), where the 0th column
            contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        nms_cfg (dict): NMS settings; ``nms_cfg['iou_thr']`` is the IoU
            threshold passed to ``nms``.
        max_num (int): if positive and there are more than max_num bboxes
            after NMS, only the top max_num will be kept. Non-positive
            values (the default) keep every box.
        score_factors (ndarray): The factors multiplied to scores before
            applying NMS

    Returns:
        tuple: (bboxes, labels), arrays of shape (k, 5) and (k,). Labels
            are 0-based.
    """
    num_classes = multi_scores.shape[1] - 1
    # exclude background category
    if multi_bboxes.shape[1] > 4:
        bboxes = multi_bboxes.reshape(multi_scores.shape[0], -1, 4)[:, 1:]
    else:
        # Class-agnostic boxes: repeat every box once per foreground class
        # so that the (n, num_classes) mask below can index it.
        bboxes = np.broadcast_to(
            multi_bboxes[:, None],
            (multi_scores.shape[0], num_classes, 4))
    scores = multi_scores[:, 1:]

    # filter out boxes with low scores
    valid_mask = scores > score_thr
    bboxes = bboxes[valid_mask]
    if score_factors is not None:
        scores = scores * score_factors[:, None]
    scores = scores[valid_mask]
    labels = valid_mask.nonzero()[1]

    if bboxes.size == 0:
        bboxes = np.zeros((0, 5), dtype=multi_bboxes.dtype)
        # np.long was removed from NumPy; use an explicit integer type.
        labels = np.zeros((0, ), dtype=np.int64)
        return bboxes, labels

    # Modified from https://github.com/pytorch/vision/blob
    # /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
    # strategy: in order to perform NMS independently per class.
    # we add an offset to all the boxes. The offset is dependent
    # only on the class idx, and is large enough so that boxes
    # from different classes do not overlap
    max_coordinate = bboxes.max()
    offsets = labels.astype(bboxes.dtype) * (max_coordinate + 1)
    bboxes_for_nms = bboxes + offsets[:, None]
    scores = scores.astype(np.float64)
    bboxes_for_nms = bboxes_for_nms.astype(np.float64)

    keep = nms(bboxes_for_nms, scores, nms_cfg.get('iou_thr', None))
    bboxes = bboxes[keep]
    scores = scores[keep]
    labels = labels[keep]

    # Only truncate for a positive limit: with max_num=-1 a plain
    # ``inds[:max_num]`` would silently drop the last detection.
    if max_num > 0 and len(keep) > max_num:
        inds = scores.argsort()[::-1]
        inds = inds[:max_num]
        bboxes = bboxes[inds]
        scores = scores[inds]
        labels = labels[inds]

    return np.concatenate([bboxes, scores[:, None]], 1), labels
# ./utils.py
import numpy as np
def sigmoid(x: np.ndarray):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def softmax(x: np.ndarray):
    """Return the softmax of a 2-D array, normalised along axis 1.

    The row maximum is subtracted before exponentiation for numerical
    stability. The input array is left untouched: the previous in-place
    ``x -= ...`` modified the caller's data as a side effect.

    Args:
        x (ndarray): 2-D array of scores, one row per sample.

    Returns:
        ndarray: array of the same shape whose rows sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)
def addcmul(px: np.ndarray, pw: np.ndarray, dx: np.ndarray):
    """Return ``px + pw * dx`` computed element-wise."""
    scaled = pw * dx
    return px + scaled
def topk(x: np.ndarray, k: int, axis=1):
    """Return the indices of the ``k`` largest entries along ``axis``.

    The previous implementation sliced ``part[..., k + 1:]`` and so
    returned ``n - k - 1`` indices instead of ``k``, reversed the wrong
    axis for ``axis=0`` and failed on 1-D input.

    Args:
        x (ndarray): 1-D or 2-D array of values.
        k (int): number of indices to return; clamped to the range
            ``[0, x.shape[axis]]``.
        axis (int): axis to rank along. Ignored for 1-D input, which is
            always ranked along its only axis.

    Returns:
        ndarray: integer indices ordered from the largest value to the
            smallest, with length ``k`` along ``axis``.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        axis = 0
    k = max(0, min(int(k), x.shape[axis]))
    # Sort ascending, then reverse so the largest values come first.
    descending = np.flip(np.argsort(x, axis=axis, kind='stable'), axis=axis)
    return np.take(descending, np.arange(k), axis=axis)
def nms(dets, scores, prob_threshold):
    """Greedy non-maximum suppression.

    Boxes are visited from the highest score to the lowest; a box is kept
    and every remaining box whose IoU with it exceeds ``prob_threshold``
    is discarded. Box extents are treated as inclusive (``+ 1``).

    Args:
        dets (ndarray): ``(n, >=4)`` boxes, columns ``x1, y1, x2, y2``.
        scores (ndarray): ``(n,)`` score of every box.
        prob_threshold (float): IoU threshold above which a box is
            suppressed.

    Returns:
        list: indices of the kept boxes, in descending score order.
    """
    left, top, right, bottom = (dets[:, col] for col in range(4))
    box_area = (right - left + 1) * (bottom - top + 1)
    remaining = scores.argsort()[::-1]
    kept = []
    while remaining.size > 0:
        best = remaining[0]
        rest = remaining[1:]
        kept.append(best)

        # Intersection rectangle between the best box and all the others.
        inter_x1 = np.maximum(left[best], left[rest])
        inter_y1 = np.maximum(top[best], top[rest])
        inter_x2 = np.minimum(right[best], right[rest])
        inter_y2 = np.minimum(bottom[best], bottom[rest])
        inter_w = np.maximum(0.0, inter_x2 - inter_x1 + 1)
        inter_h = np.maximum(0.0, inter_y2 - inter_y1 + 1)
        intersection = inter_w * inter_h

        iou = intersection / (box_area[best] + box_area[rest] - intersection)
        remaining = rest[iou <= prob_threshold]
    return kept
# ./vis_bbox.py
from enum import Enum
import numpy as np
import cv2
class Color(Enum):
    """Common colors as 3-tuples of 0-255 channel values.

    Members: red, green, blue, cyan, yellow, magenta, white and black.
    The tuples are in blue-green-red order (``red`` is ``(0, 0, 255)``).
    """

    # primary colors
    red = (0, 0, 255)
    green = (0, 255, 0)
    blue = (255, 0, 0)
    # secondary colors
    cyan = (255, 255, 0)
    yellow = (0, 255, 255)
    magenta = (255, 0, 255)
    # extremes
    white = (255, 255, 255)
    black = (0, 0, 0)
def imshow_det_bboxes(img,
                      bboxes,
                      labels,
                      class_names=None,
                      score_thr=0,
                      bbox_color='green',
                      text_color='green',
                      thickness=1,
                      font_scale=0.5,
                      show=False,
                      win_name='',
                      wait_time=0,
                      out_file=None):
    """Draw bboxes and class labels (with scores) on an image.

    The boxes are drawn directly onto ``img`` (it is modified in place).

    Args:
        img (ndarray): The image to draw on.
        bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or
            (n, 5).
        labels (ndarray): Labels of bboxes, shaped (n,).
        class_names (list[str]): Names of each classes; when None the
            label text is ``'cls <label>'``.
        score_thr (float): Minimum score of bboxes to be shown. A positive
            value requires (n, 5) bboxes.
        bbox_color (str): Name of a :obj:`Color` member for bbox lines.
        text_color (str): Name of a :obj:`Color` member for texts.
        thickness (int): Thickness of lines.
        font_scale (float): Font scales of texts.
        show (bool): Whether to show the image.
        win_name (str): The window name.
        wait_time (int): Value of waitKey param.
        out_file (str or None): The filename to write the image.
    """
    assert bboxes.ndim == 2
    assert labels.ndim == 1
    assert bboxes.shape[0] == labels.shape[0]
    assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5

    if score_thr > 0:
        assert bboxes.shape[1] == 5
        scores = bboxes[:, -1]
        inds = scores > score_thr
        bboxes = bboxes[inds, :]
        labels = labels[inds]

    bbox_color = Color[bbox_color].value
    text_color = Color[text_color].value

    for bbox, label in zip(bboxes, labels):
        # Plain Python ints: some OpenCV builds reject NumPy scalars as
        # point coordinates.
        x1, y1, x2, y2 = (int(v) for v in bbox[:4].astype(np.int32))
        cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness)
        label_text = class_names[
            label] if class_names is not None else 'cls {}'.format(label)
        if len(bbox) > 4:
            label_text += '|{:.02f}'.format(bbox[-1])
        cv2.putText(img, label_text, (x1, y1 - 2),
                    cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color)

    if show:
        # cv2.imshow takes the window name first, then the image; the
        # window is only refreshed once waitKey runs.
        cv2.imshow(win_name, img)
        cv2.waitKey(wait_time)
    if out_file is not None:
        cv2.imwrite(out_file, img)
预处理使用numpy和opencv-python。
# preprocess
image_path = '9331584514251_.pic_hd.jpg'
image = cv2.imread(image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None.
    raise FileNotFoundError('failed to read image: {}'.format(image_path))
img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
resize_shape = (300, 300)  # (width, height), the order cv2.resize expects
img = cv2.resize(img, resize_shape, interpolation=cv2.INTER_LINEAR)
mean = np.array([123.675, 116.28, 103.53]).reshape(1, -1)
std = np.array([1., 1., 1.]).reshape(1, -1)
img = img.astype(np.float32)
img = cv2.subtract(img, mean)
# Normalisation is (img - mean) / std, so scale by the reciprocal of std.
# (Identical to the previous behaviour while std is all ones.)
img = cv2.multiply(img, 1 / std)
img = img.transpose(2, 0, 1)  # HWC -> CHW

# load onnx model and build tvm runtime
target = 'llvm'
ctx = tvm.context(target)
mssd = onnx.load('mssd.onnx')
# get input blob name and shape
input_blob = mssd.graph.input[0]
input_shape = tuple(
    dim.dim_value for dim in input_blob.type.tensor_type.shape.dim)
shape_dict = {input_blob.name: input_shape}
mod, params = relay.frontend.from_onnx(mssd, shape_dict)
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, target, params=params)
module = runtime.create(graph, lib, ctx)

# run
module.set_input(**params)
module.set_input(input_blob.name, tvm.nd.array(img))
module.run()
使用TVM自带的quantize模块,可简单实现模型的INT-8离线量化。在量化过程中,需要校准数据集。
受运算能力的限制,我们在树莓派上仅编译了runtime,因此在树莓派上运行代码有两种方式:(1)通过RPC,由电脑远程调用树莓派运行模型;(2)在电脑上交叉编译模型,再将编译结果拷贝到树莓派上运行。
我们在这里使用第二种方式。此时需要对模型载入编译部分做改动,并删除电脑上的模型运行部分。
import os

# load onnx model and build it for the aarch64 target
target = 'llvm -target=aarch64-linux-gnu -mattr=+neon'
ctx = tvm.context(target)
mssd = onnx.load('mssd.onnx')
# get input blob name and shape
input_blob = mssd.graph.input[0]
input_shape = tuple(
    dim.dim_value for dim in input_blob.type.tensor_type.shape.dim)
shape_dict = {input_blob.name: input_shape}
mod, params = relay.frontend.from_onnx(mssd, shape_dict)
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, target, params=params)

######## export lib ########
path = 'model/'
# Create the output directory up front; the writes below fail when it
# does not exist.
os.makedirs(path, exist_ok=True)
path_lib = path + "deploy_lib.tar"
path_graph = path + "deploy_graph.json"
path_params = path + "deploy_param.params"
lib.export_library(path_lib)
with open(path_graph, "w") as fo:
    fo.write(graph)
with open(path_params, "wb") as fo:
    fo.write(relay.save_param_dict(params))
将编译后的三个文件发送到树莓派上,并在树莓派上运行该模型。
树莓派上的模型无上述载入编译部分。
######## load lib ########
# load the module back.
path = 'model/'
path_lib = path + "deploy_lib.tar"
path_graph = path + "deploy_graph.json"
path_params = path + "deploy_param.params"
# Read the exported artifacts with context managers so the file handles
# are closed deterministically.
with open(path_graph) as fi:
    graph = fi.read()
lib = tvm.runtime.load_module(path_lib)
with open(path_params, "rb") as fi:
    params = bytearray(fi.read())
# The build section that defined ctx is not executed on the device, so
# create the CPU context here.
ctx = tvm.cpu(0)
module = runtime.create(graph, lib, ctx)

# run
module.load_params(params)  # pay attention to the difference
module.set_input('input.1', tvm.nd.array(img))  # hardcode the input blob name
# Execute the graph; without this call the outputs are never computed.
module.run()
其他部分与原代码相同。
可以看到,量化后的模型精度有所下降,而运算速度有一定提升。
下表是不同平台下模型运行的时间测试。