Supported Operators
Operators supported by ONNX export: https://pytorch.org/docs/stable/onnx.html?highlight=onnx%20runtime
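As a quick way to check whether a particular operator is part of the ONNX standard at all, and from which opset it is available, the sketch below queries the local ONNX operator registry. This sketch is not part of the original notebook; it assumes the onnx Python package is installed, and the operator names passed in are only examples.
import onnx.defs

def describe_op(op_type, domain=""):
    # Collect every registered schema revision for this operator name.
    schemas = [s for s in onnx.defs.get_all_schemas_with_history()
               if s.name == op_type and s.domain == domain]
    if not schemas:
        print("{}: not defined in the ONNX standard".format(op_type))
        return
    first = min(s.since_version for s in schemas)
    print("{}: defined, first introduced in opset {}".format(op_type, first))

print("Highest opset known to this onnx install:", onnx.defs.onnx_opset_version())
describe_op("Einsum")      # used by XLNet-style models; only defined from opset 12 onwards
describe_op("FancyOp123")  # an operator name the standard does not define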
Supported Models
Models supported by ONNX: a bug-free demo follows.
Bert_squad_onnxruntime acceleration example: import the data
import os
cache_dir = "./squad"
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)
predict_file_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
predict_file = os.path.join(cache_dir, "dev-v1.1.json")
if not os.path.exists(predict_file):
    import wget
    print("Start downloading predict file.")
    wget.download(predict_file_url, predict_file)
    print("Predict file downloaded.")
Set the data paths and preprocess the data
# Whether to allow overwriting the existing ONNX model and downloading the latest script from GitHub
enable_overwrite = True
# Total samples to run inference on, so that we can get the average latency
total_samples = 1000
# ONNX opset version
opset_version=11
model_name_or_path = "bert-base-uncased"
max_seq_length = 128
doc_stride = 128
max_query_length = 64
cache_dir = r'D:\pretrain_model\bert_base_uncased'
config_dir = r'D:\pretrain_model\bert_base_uncased\config.json'
vocab_dir = r'D:\pretrain_model\bert_base_uncased\vocab.txt'
model_dir = r'D:\pretrain_model\bert_base_uncased\pytorch_model.bin'
# The following code is adapted from HuggingFace transformers
# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py
from transformers import (BertConfig, BertForQuestionAnswering, BertTokenizer,
                          XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer)
# Load pretrained model and tokenizer
config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer)
config = config_class.from_json_file(config_dir)
tokenizer = tokenizer_class.from_pretrained(vocab_dir, do_lower_case=True)
model = model_class.from_pretrained(model_dir,
                                    from_tf=False,
                                    config=config)
# load some examples
from transformers.data.processors.squad import SquadV1Processor
processor = SquadV1Processor()
examples = processor.get_dev_examples(None, filename=predict_file)
from transformers import squad_convert_examples_to_features
features, dataset = squad_convert_examples_to_features(
    examples=examples[:total_samples],  # convert enough examples for this notebook
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=False,
    return_dataset='pt'
)
Export the model in ONNX format
output_dir = "./onnx"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
export_model_path = os.path.join(output_dir, 'bert-base-uncased-squad_opset{}.onnx'.format(opset_version))
# export_model_path = os.path.join(output_dir, 'xlnet-base-cased-squad_opset{}.onnx'.format(opset_version))
import torch
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
# Get the first example data to run the model and export it to ONNX
data = dataset[0]
inputs = {
    'input_ids': data[0].to(device).reshape(1, max_seq_length),
    'attention_mask': data[1].to(device).reshape(1, max_seq_length),
    'token_type_ids': data[2].to(device).reshape(1, max_seq_length)
}
# Set model to inference mode, which is required before exporting the model because some operators behave differently in
# inference and training mode.
model.eval()
model.to(device)
if enable_overwrite or not os.path.exists(export_model_path):
    with torch.no_grad():
        symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
        torch.onnx.export(model,                                 # model being run
                          args=tuple(inputs.values()),           # model input (or a tuple for multiple inputs)
                          f=export_model_path,                   # where to save the model (can be a file or file-like object)
                          opset_version=opset_version,           # the ONNX version to export the model to
                          verbose=True,
                          do_constant_folding=True,              # whether to execute constant folding for optimization
                          input_names=['input_ids',              # the model's input names
                                       'input_mask',
                                       'segment_ids'],
                          output_names=['start', 'end'],         # the model's output names
                          dynamic_axes={'input_ids': symbolic_names,   # variable length axes
                                        'input_mask': symbolic_names,
                                        'segment_ids': symbolic_names,
                                        'start': symbolic_names,
                                        'end': symbolic_names})
    print("Model exported at ", export_model_path)
Run inference with the PyTorch framework
import time
import torch
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
# Measure the latency. Measurements are not accurate inside a Jupyter Notebook; a standalone python script is recommended.
latency = []
with torch.no_grad():
    for i in range(total_samples):
        data = dataset[i]
        inputs = {
            'input_ids': data[0].to(device).reshape(1, max_seq_length),
            'attention_mask': data[1].to(device).reshape(1, max_seq_length),
            'token_type_ids': data[2].to(device).reshape(1, max_seq_length)
        }
        start = time.time()
        outputs = model(**inputs)
        latency.append(time.time() - start)
print("PyTorch {} Inference time = {} ms".format(device.type, format(sum(latency) * 1000 / len(latency), '.2f')))
Run inference with the ONNX Runtime framework
import psutil
import onnxruntime
import numpy
# assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()
device_name = 'cpu'
sess_options = onnxruntime.SessionOptions()
# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.
# Note that this will increase session creation time so enable it for debugging only.
sess_options.optimized_model_filepath = os.path.join(output_dir, "optimized_model_{}.onnx".format(device_name))
# Please change the value according to best setting in Performance Test Tool result.
sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)
session = onnxruntime.InferenceSession(export_model_path, sess_options)
latency = []
for i in range(total_samples):
    data = dataset[i]
    # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.
    ort_inputs = {
        'input_ids': data[0].cpu().reshape(1, max_seq_length).numpy(),
        'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),
        'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()
    }
    start = time.time()
    ort_outputs = session.run(None, ort_inputs)
    latency.append(time.time() - start)
print("OnnxRuntime {} Inference time = {} ms".format(device_name, format(sum(latency) * 1000 / len(latency), '.2f')))
Compare the inference speed of the two frameworks and verify that their outputs match
print("***** Verifying correctness *****")
for i in range(2):
    print('PyTorch and ONNX Runtime output {} are close:'.format(i), numpy.allclose(ort_outputs[i], outputs[i].cpu(), rtol=1e-02, atol=1e-02))
    diff = ort_outputs[i] - outputs[i].cpu().numpy()
    max_diff = numpy.max(numpy.abs(diff))
    avg_diff = numpy.average(numpy.abs(diff))
    print(f'maximum_diff={max_diff} average_diff={avg_diff}')
Supported Operators
Operators supported by ONNX Runtime, reference link: https://github.com/microsoft/onnxruntime/blob/master/docs/OperatorKernels.md
In the table at that link, Op Name is the name of the supported operator, Parameters lists the operator's parameters, and Opset Version is the opset version range in which the operator is supported.
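If an exported model contains an operator that ONNX Runtime has no kernel for, session creation fails. The sketch below is only an illustration (it reuses export_model_path from the export cell above) of how such errors surface in practice.
import onnxruntime

def try_load(model_path):
    try:
        sess = onnxruntime.InferenceSession(model_path)
        print("Loaded OK. Inputs:", [i.name for i in sess.get_inputs()])
    except Exception as e:
        # Unsupported operators typically show up here as NOT_IMPLEMENTED /
        # "Could not find an implementation for ..." errors.
        print("Failed to load {}: {}".format(model_path, e))

try_load(export_model_path)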
Current Problem
The current problem: the first model in our own project uses XLNet, and many of its operators are not supported by ONNX; support for them was added in opset version 12 and in onnxruntime 1.4.0. Several operators were likewise not supported by onnxruntime itself; these, too, are covered by opset version 12 and onnxruntime 1.4.0. In addition, the PyTorch model exported to ONNX could not be used in onnxruntime, and most of the errors were of the unsupported-operator type.
Possible fixes:
1. Modify the operators in the model that ONNX or onnxruntime does not support.
2. Upgrade the ONNX/onnxruntime framework so that it supports the missing operators.
Final solution:
Upgrade onnxruntime to 1.4.0 and export with onnx opset_version=12; this resolves the unsupported-operator errors, as sketched below.
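A minimal sketch of that final fix, reusing the model, inputs and output_dir variables from the cells above; the output file name here is illustrative, not from the original notebook.
import os
import torch
import onnxruntime

print("onnxruntime version:", onnxruntime.__version__)  # should be 1.4.0 or later

export_model_path_12 = os.path.join(output_dir, "model_opset12.onnx")
symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
with torch.no_grad():
    torch.onnx.export(model,
                      args=tuple(inputs.values()),
                      f=export_model_path_12,
                      opset_version=12,        # opset 12 covers the previously unsupported operators
                      do_constant_folding=True,
                      input_names=['input_ids', 'input_mask', 'segment_ids'],
                      output_names=['start', 'end'],
                      dynamic_axes={name: symbolic_names
                                    for name in ['input_ids', 'input_mask',
                                                 'segment_ids', 'start', 'end']})
print("Model exported at ", export_model_path_12)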
Closed Issue
Problems encountered:
1. On Linux, PyTorch ships with its own ONNX export tool, called through torch.onnx. With torch 1.3.0 (onnx 1.7.0), saving the model produced many unsupported-operation errors, yet on my own PC the model could be saved; my PC's torch version turned out to be 1.5.0, so the torch on the server was upgraded to 1.5.0+cuda10.
2. The onnx opset version should be set to 12, and onnxruntime needs to be upgraded to 1.4.0 or later; otherwise some operators are unsupported.
3. When saving the model in ONNX format, the inputs must be shaped as a batch (batch size 1, as in the reshape calls above); otherwise the data gets scrambled during inference.
4. BERT receives further optimization under the onnxruntime framework, in how the data is stored and how inputs and outputs are handled; other models, such as the XLNet I am currently using, do not get this input/output optimization.
5. While debugging BERT I found that, for now, vanilla onnxruntime only supports requesting the maximum acceleration memory for sequence lengths of up to 256; beyond that length the speedup gradually falls off, up to a length of 512. The net effect is the cost of trading space for time; see the latency sketch after this list.
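To observe the effect described in item 5, one can time onnxruntime at several sequence lengths using random dummy inputs, relying on the dynamic axes set at export time. The sketch below is only an illustration: the session variable comes from the onnxruntime cell above, and the vocabulary size and run count are assumptions.
import time
import numpy

def time_length(session, seq_len, runs=50):
    # Random dummy inputs shaped as one batch of the requested sequence length.
    dummy = {
        'input_ids': numpy.random.randint(0, 30522, size=(1, seq_len), dtype=numpy.int64),
        'input_mask': numpy.ones((1, seq_len), dtype=numpy.int64),
        'segment_ids': numpy.zeros((1, seq_len), dtype=numpy.int64),
    }
    session.run(None, dummy)  # warm-up run
    start = time.time()
    for _ in range(runs):
        session.run(None, dummy)
    return (time.time() - start) * 1000 / runs

for seq_len in (128, 256, 384, 512):
    print("seq_len={:3d}  latency={:.2f} ms".format(seq_len, time_length(session, seq_len)))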
Unclosed Issue
Problem not yet solved: the onnxruntime server feature has not been put to use, because it is still in the development stage.