Background
TensorRT is NVIDIA's model inference framework and achieves a fairly high speedup for models from the various training frameworks. However, TensorRT can only load Caffe and ONNX models directly (officially, TensorFlow models are supposed to go through UFF, but in practice ONNX is the usual choice there too). So the first step is to save the model in a suitable format; here we convert an MXNet model to ONNX.
Although MXNet ships an API for exporting straight to ONNX, it frequently hits missing ops or produces inference results that do not match the original model. This post collects the problems encountered during conversion and their fixes (hardly anyone maintains MXNet any more....)
Conversion
The conversion itself is simple: export through the built-in API. The only extra work is to fix a few problems that can come up during conversion first, and finally to make the batch dimension dynamic (if the input spatial size is dynamic as well, it is set the same way).
The conversion script:
import onnx
import numpy as np
import mxnet as mx
from mxnet.contrib import onnx as onnx_mx


def mxnet_model_fix(input_symbol_path, input_params_path, rewrite=True):
    # Placeholder: apply the fixes described below (op renames, fix_gamma, ...)
    # to the symbol/params files before exporting.
    pass


def export_onnx(input_symbol_path, input_params_path, input_shape, precision, export_onnx_path):
    # mxnet_model_fix(input_symbol_path, input_params_path, rewrite=True)
    onnx_mx.export_model(input_symbol_path, input_params_path, [input_shape],
                         precision, export_onnx_path, verbose=True)
    onnx.checker.check_model(export_onnx_path)
    # set the batch dimension to be dynamic
    model = onnx.load(export_onnx_path)
    model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = "?"
    onnx.save(model, export_onnx_path)


if __name__ == "__main__":
    sym_path = "./model-symbol.json"
    params_path = "./model-0000.params"
    precision = np.float32
    input_shape = (1, 3, 224, 224)
    export_onnx_path = "./model.onnx"
    export_onnx(sym_path, params_path, input_shape, precision, export_onnx_path)
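A quick check that the batch dimension really became dynamic (a small sketch reusing the paths from the script above):

import onnx

model = onnx.load("./model.onnx")
dim0 = model.graph.input[0].type.tensor_type.shape.dim[0]
print(dim0.dim_param)  # should print "?" rather than a fixed batch size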
Inference
To sanity-check the export, run the same image through MXNet and ONNX Runtime and compare the outputs:
import cv2
import numpy as np
import mxnet as mx
import onnxruntime as ort
mxnet_model_path = "./model"
onnx_model_path = "./model.onnx"
image_path = "./image.jpg"
# format input
img = cv2.imread(image_path)
# img = cv2.resize(img, (224, 224))  # uncomment if the image size differs from the export input_shape
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.transpose(img, (2, 0, 1))
input_blob = np.expand_dims(img, axis=0).astype(np.float32) # NCHW
# mxnet runtime
sym, params, aux_params = mx.model.load_checkpoint(mxnet_model_path, 0)
model = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names=None)
model.bind(data_shapes=[('data', input_blob.shape)])
model.set_params(params, aux_params)
# mxnet input
mx_data = mx.nd.array(input_blob)
mx_db = mx.io.DataBatch(data=(mx_data,))
# mxnet predict
model.forward(mx_db, is_train=False)
mxnet_result = model.get_outputs()[0].asnumpy()
# onnx runtime
ort_session = ort.InferenceSession(onnx_model_path)
onnx_input_name = ort_session.get_inputs()[0].name
onnx_output_name = ort_session.get_outputs()[0].name
# set input and predict
onnx_result = ort_session.run([onnx_output_name], input_feed={onnx_input_name: input_blob})
onnx_result = onnx_result[0]
print("######mxnet result#########")
print(mxnet_result)
print("######onnx result##########")
print(onnx_result)
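Rather than eyeballing the printouts, the outputs can also be compared numerically; this continues the script above (the tolerances are common float32 defaults, not values from the original workflow):

# Continues the script above: compare the two runtimes numerically.
print("max abs diff:", np.abs(mxnet_result - onnx_result).max())
np.testing.assert_allclose(mxnet_result, onnx_result, rtol=1e-3, atol=1e-5)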
Possible problems
Different op names
- SoftmaxActivation
The official MXNet docs mark this op as deprecated, but softmax computes the same thing, so we can substitute it directly.
Fix: in symbol.json, rename SoftmaxActivation to softmax and adjust its attrs accordingly, as in the sketch below:
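A minimal sketch of such a rewrite, in the spirit of the mxnet_model_fix placeholder from the export script (the mode-to-axis mapping follows MXNet's deprecation note for SoftmaxActivation; the attrs handling is an assumption, so verify it against your own symbol.json):

import json

def fix_softmax_activation(symbol_path):
    """Rename SoftmaxActivation nodes to softmax in a symbol.json file (sketch)."""
    with open(symbol_path) as f:
        sym = json.load(f)
    for n in sym["nodes"]:
        if n["op"] == "SoftmaxActivation":
            n["op"] = "softmax"
            # SoftmaxActivation(mode='channel') corresponds to softmax(axis=1);
            # 'instance' mode matches softmax over the last axis for 2-D input.
            mode = n.get("attrs", {}).get("mode", "instance")
            n["attrs"] = {"axis": "1" if mode == "channel" else "-1"}
    with open(symbol_path, "w") as f:
        json.dump(sym, f, indent=2)

fix_softmax_activation("./model-symbol.json")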
Missing ops
- UpSampling
The MXNet→ONNX exporter has no conversion registered for this op. Implementing it with a deconvolution can introduce small numerical diffs, so here we map it to ONNX Resize instead.
Fix: add the following implementation to mxnet/contrib/onnx/mx2onnx/_op_translations.py in the installed mxnet package:
def create_helper_tensor_node(input_vals, output_name, kwargs):
    """Create an extra tensor node (with initializer) from numpy values."""
    data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[input_vals.dtype]
    tensor_node = onnx.helper.make_tensor_value_info(
        name=output_name,
        elem_type=data_type,
        shape=input_vals.shape
    )
    kwargs["initializer"].append(
        onnx.helper.make_tensor(
            name=output_name,
            data_type=data_type,
            dims=input_vals.shape,
            vals=input_vals.flatten().tolist(),
            raw=False,
        )
    )
    return tensor_node


@mx_op.register("UpSampling")
def convert_upsample(node, **kwargs):
    """Map MXNet's UpSampling operator attributes to onnx's Resize operator
    and return the created nodes.
    """
    name, input_nodes, attrs = get_inputs(node, kwargs)

    sample_type = attrs.get('sample_type', 'nearest')
    sample_type = 'linear' if sample_type == 'bilinear' else sample_type
    scale = convert_string_to_list(attrs.get('scale'))
    scaleh = scalew = float(scale[0])
    if len(scale) > 1:
        scaleh = float(scale[0])
        scalew = float(scale[1])
    # Resize scales per axis: leave N and C alone, scale H and W
    scale = np.array([1.0, 1.0, scaleh, scalew], dtype=np.float32)
    roi = np.array([], dtype=np.float32)

    node_roi = create_helper_tensor_node(roi, name + 'roi', kwargs)
    node_sca = create_helper_tensor_node(scale, name + 'scale', kwargs)

    node = onnx.helper.make_node(
        'Resize',
        inputs=[input_nodes[0], name + 'roi', name + 'scale'],
        outputs=[name],
        coordinate_transformation_mode='asymmetric',
        mode=sample_type,
        nearest_mode='floor',
        name=name
    )
    return [node_roi, node_sca, node]
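The attribute choices matter here: coordinate_transformation_mode='asymmetric' together with nearest_mode='floor' reproduces MXNet's nearest UpSampling indexing, out[y, x] = in[floor(y / scale), floor(x / scale)]. After patching and re-exporting, a quick sanity check (using the model path from the export script) is to confirm the UpSampling nodes now appear as Resize:

import onnx

model = onnx.load("./model.onnx")
# count should be > 0 if the patch was picked up during export
print([n.op_type for n in model.graph.node].count("Resize"))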
- Crop
The official MXNet docs suggest replacing it with slice (which raises the question of why that mapping was never registered in the converter...).
Fix: add the following implementation to mxnet/contrib/onnx/mx2onnx/_op_translations.py in the installed mxnet package:
def create_helper_shape_node(input_node, node_name):
    """Create an extra Shape node that outputs the runtime shape of input_node."""
    trans_node = onnx.helper.make_node(
        'Shape',
        inputs=[input_node],
        outputs=[node_name],
        name=node_name
    )
    return trans_node


@mx_op.register("Crop")
def convert_crop(node, **kwargs):
    """Map MXNet's Crop operator to an onnx Slice node
    and return the created nodes.
    """
    name, inputs, attrs = get_inputs(node, kwargs)

    start = np.array([0, 0, 0, 0], dtype=np.int64)  # Slice indices must be int64
    start_node = create_helper_tensor_node(start, name + '__starts', kwargs)
    shape_node = create_helper_shape_node(inputs[1], inputs[1] + '__shape')

    crop_node = onnx.helper.make_node(
        "Slice",
        inputs=[inputs[0], name + '__starts', inputs[1] + '__shape'],  # data, starts, ends
        outputs=[name],
        name=name
    )
    logging.warning(
        "Using an experimental ONNX operator: Crop. "
        "Its definition can change.")
    return [start_node, shape_node, crop_node]
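Note the converter always slices from the origin (starts are all zero) and uses the shape of the second input as the end indices, so MXNet Crop's offset/center_crop attributes are ignored by this mapping. In numpy terms the emitted Slice is equivalent to:

import numpy as np

data = np.arange(2 * 3 * 6 * 6).reshape(2, 3, 6, 6)  # first input
ref = np.zeros((2, 3, 4, 4))                         # second (reference) input
# Slice(data, starts=[0, 0, 0, 0], ends=Shape(ref)):
cropped = data[:ref.shape[0], :ref.shape[1], :ref.shape[2], :ref.shape[3]]
print(cropped.shape)  # (2, 3, 4, 4)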
Ops with mismatched results
- softmax
ONNX's Softmax gives different results on multi-dimensional (NCHW) input: before opset 13, ONNX Softmax coerces its input into a 2-D matrix around the given axis, so softmax with axis=1 normalizes over C*H*W at once, whereas MXNet's softmax(axis=1) normalizes over the channel axis only.
Fix: replace convert_softmax in mxnet/contrib/onnx/mx2onnx/_op_translations.py with the implementation below (and comment out the original op):
@mx_op.register("softmax")
def convert_softmax(node, **kwargs):
"""Map MXNet's softmax operator attributes to onnx's Softmax operator
and return the created node.
"""
name, input_nodes, attrs = get_inputs(node, kwargs)
axis = int(attrs.get("axis", -1))
c_softmax_node = []
axis=-1
transpose_node1 = onnx.helper.make_node(
"Transpose",
inputs=input_nodes,
perm=(0,2,3,1), #NCHW--NHWC--(NHW,C)
name=name+'_tr1',
outputs=[name+'_tr1']
)
softmax_node = onnx.helper.make_node(
"Softmax",
inputs=[name+'_tr1'],
axis=axis,
name=name+'',
outputs=[name+'']
)
transpose_node2 = onnx.helper.make_node(
"Transpose",
inputs=[name+''],
perm=(0,3,1,2), #NHWC--NCHW
name=name+'_tr2',
outputs=[name+'_tr2']
)
c_softmax_node.append(transpose_node1)
c_softmax_node.append(softmax_node)
c_softmax_node.append(transpose_node2)
return c_softmax_node
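A small numpy sketch of the two behaviours this works around (synthetic data; the "flattened" variant models ONNX Softmax before opset 13):

import numpy as np

x = np.random.rand(1, 3, 4, 4).astype(np.float32)

# MXNet softmax(axis=1): normalizes over the 3 channel values only.
mx_like = np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)

# Old ONNX Softmax with axis=1: coerces input to (N, C*H*W) first,
# normalizing over all 48 values at once.
flat = x.reshape(x.shape[0], -1)
onnx_like = (np.exp(flat) / np.exp(flat).sum(axis=1, keepdims=True)).reshape(x.shape)

print(np.abs(mx_like - onnx_like).max())  # clearly non-zero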
- BatchNorm
In MXNet's BatchNorm, gamma and beta are both learnable parameters; when fix_gamma is True, gamma is forced to 1 and its gradient to 0.
If a BatchNorm op has fix_gamma=True but the saved gamma values are not 1, MXNet still uses gamma=1 at inference time, so nothing breaks there; the exported ONNX model, however, uses the gamma values actually stored in the file, so the results diverge.
Fix: in the MXNet .params file, set gamma to 1 for every BatchNorm whose fix_gamma is True, as sketched below.
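A minimal sketch of that fix, assuming the file names from the export script. Beware that symbol.json may store node attributes under "attrs" or "attr" depending on the MXNet version, and that fix_gamma defaults to True when the attribute is absent:

import json
import mxnet as mx

symbol_path = "./model-symbol.json"
params_path = "./model-0000.params"

with open(symbol_path) as f:
    sym = json.load(f)

save_dict = mx.nd.load(params_path)  # keys look like "arg:bn0_gamma"
for node in sym["nodes"]:
    if node["op"] != "BatchNorm":
        continue
    attrs = node.get("attrs", node.get("attr", {}))
    if attrs.get("fix_gamma", "True").lower() == "true":  # fix_gamma defaults to True
        key = "arg:" + node["name"] + "_gamma"
        if key in save_dict:
            save_dict[key][:] = 1.0  # force gamma to 1, matching MXNet inference
mx.nd.save(params_path, save_dict)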
References
- Insightface中Retinaface MxNet2ONNX踩坑 (pitfalls of converting InsightFace's RetinaFace from MXNet to ONNX)