The mainstream deep-learning frameworks are TensorFlow and PyTorch, and models from either frequently have to be converted to an intermediate format such as ONNX. In addition, during tokenization the [unusedX] tokens should be kept intact, but by default the tokenizer splits them into word pieces. This post records how to handle both issues.
There are many ways to convert a torch model to ONNX; two of them are shown here.
# model_path is the directory of the saved torch model, onnx_path is the output ONNX file path
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

def lower_level(model_path, onnx_path="bert_std.onnx"):
    # load model and tokenizer; register [unused0]..[unused99] as special tokens
    # so they are not split into word pieces
    added_token = ["[unused%s]" % i for i in range(100)]
    print("added_token:", added_token[:10])
    tokenizer = AutoTokenizer.from_pretrained(model_path, additional_special_tokens=added_token)
    dummy_model_input = tokenizer("hello bert", return_tensors="pt")
    unused_input = tokenizer("hello bert[unused17]", return_tensors="pt")
    print("dummy_model_input:", dummy_model_input)
    print("unused_input:", unused_input)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    # export; pass the tensors explicitly so their order matches input_names
    # (a BERT tokenizer also returns token_type_ids, so tuple(dummy_model_input.values())
    # would feed token_type_ids into the attention_mask slot)
    torch.onnx.export(
        model,
        (dummy_model_input["input_ids"], dummy_model_input["attention_mask"]),
        f=onnx_path,
        input_names=['input_ids', 'attention_mask'],
        output_names=['logits'],
        dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                      'attention_mask': {0: 'batch_size', 1: 'sequence'},
                      'logits': {0: 'batch_size', 1: 'sequence'}},
        do_constant_folding=True,
        opset_version=13,
    )
    print("over")
def middle_level(model_path, onnx_path="bert_std.onnx"):
    from pathlib import Path
    import transformers
    from transformers.onnx import FeaturesManager
    from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
    # load model and tokenizer
    feature = "sequence-classification"
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # load config
    model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature=feature)
    onnx_config = model_onnx_config(model.config)
    # export
    onnx_inputs, onnx_outputs = transformers.onnx.export(
        preprocessor=tokenizer,
        model=model,
        config=onnx_config,
        opset=13,
        output=Path(onnx_path)
    )
    print("onnx_inputs:", onnx_inputs)
    print("onnx_outputs:", onnx_outputs)
    print("over")
To keep the [unusedX] tokens intact with a Hugging Face tokenizer, pass the additional_special_tokens argument to AutoTokenizer.from_pretrained, for example:
added_token = ["[unused%s]" % i for i in range(100)]
print("added_token:", added_token[:10])
tokenizer = AutoTokenizer.from_pretrained(model_path, additional_special_tokens=added_token)
The complete code is as follows:
def lower_level(model_path, onnx_path="bert_std.onnx"):
    # load model and tokenizer
    added_token = ["[unused%s]" % i for i in range(100)]
    print("added_token:", added_token[:10])
    tokenizer = AutoTokenizer.from_pretrained(model_path, additional_special_tokens=added_token)
    dummy_model_input = tokenizer("hello bert", return_tensors="pt")
    unused_input = tokenizer("hello bert[unused17]", return_tensors="pt")
    print("dummy_model_input:", dummy_model_input)
    print("unused_input:", unused_input)
    model = AutoModelForMaskedLM.from_pretrained(model_path)
    print("over")
On the TensorFlow side, the same effect is achieved by passing preserve_unused_token=True to the BERT tokenizer:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_models as tfm
from transformers import TFAutoModel

# bert_preprocess_path, vocab_path, seq_length and checkpoint_dir are user-provided values
preprocessor = hub.load(bert_preprocess_path)
tokenize = tfm.nlp.layers.BertTokenizer(vocab_file=vocab_path, lower_case=True,
                                        tokenizer_kwargs=dict(preserve_unused_token=True, token_out_type=tf.int32))
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=seq_length))  # Optional argument.
encoder = TFAutoModel.from_pretrained(checkpoint_dir, from_pt=True)
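A minimal usage sketch of the pieces above; the sample string is illustrative, and the packed-input keys follow the standard TF Hub BERT preprocessor interface, which is an assumption about the particular preprocess model loaded:

# with preserve_unused_token=True, "[unused17]" should map to its own vocabulary id
# instead of being split into word pieces
sample = tf.constant(["hello bert [unused17]"])
token_ids = tokenize(sample)              # ragged tensor of wordpiece ids
packed = bert_pack_inputs([token_ids])    # dict of input_word_ids / input_mask / input_type_ids
print(token_ids)
print(packed["input_word_ids"])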