Using the model files from the earlier article tensorflow(8)将h5文件转化为pb文件并利用tensorflow/serving实现模型部署 as an example, this article shows how to serve a model efficiently with tensorflow/serving.
When deploying with the tensorflow/serving Docker image, a typical command looks like this:
docker run -t --rm -p 8561:8501 -v "$path/example_ner:/models/example_ner" -e MODEL_NAME=example_ner tensorflow/serving:1.14.0
However, tensorflow/serving also provides a set of mechanisms for efficient model serving: at deployment time, you can adjust its parameters to enable concurrent and batched calls. For documentation on the deployment parameters, see https://github.com/tensorflow/serving/blob/master/tensorflow_serving/g3doc/serving_config.md and https://github.com/tensorflow/serving/blob/master/tensorflow_serving/model_servers/main.cc#L59.
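If you want to confirm which flags the exact serving version you run supports, the server binary can also print its flag list directly; a quick check, assuming the same Docker image as above:
docker run -t --rm tensorflow/serving:1.14.0 --help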
This article takes the parameters rest_api_num_threads and enable_batching as examples of how to use tensorflow/serving efficiently. rest_api_num_threads is the number of threads used by the HTTP/REST API; if unset, it defaults to the number of CPUs. enable_batching controls whether batched calls are supported, and defaults to false. We redeploy using these parameters, setting rest_api_num_threads to 300 and enable_batching to true, with the following command:
docker run -t --rm -p 8561:8501 -v "$path/example_ner:/models/example_ner" -e MODEL_NAME=example_ner tensorflow/serving:1.14.0 --rest_api_num_threads=300 --enable_batching=true
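Once the container is up, you can verify that the model loaded successfully through the REST API's model status endpoint (the host and port here assume the deployment above runs locally; adjust them to your environment):
curl http://localhost:8561/v1/models/example_ner
A healthy deployment reports the loaded version with state AVAILABLE.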
Next, we compare the time cost of different invocation patterns. The test samples are in the file tf_test_sample.txt, 1080 sentences in total.
Single thread, one sample per request. The prediction script is as follows:
# -*- coding: utf-8 -*-
import time
import json
import requests
import numpy as np
from keras_bert import Tokenizer

# load the label2id dictionary
with open("../example_label2id.json", "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())
id_label_dict = {v: k for k, v in label_id_dict.items()}

# load the BERT vocabulary
dict_path = '../chinese_L-12_H-768_A-12/vocab.txt'
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    # character-level tokenization; unknown characters map to [UNK]
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            else:
                R.append('[UNK]')
        return R


# convert a BIO tag sequence into JSON format
def bio_to_json(string, tags):
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    iCount = 0
    entity_tag = ""
    for c_idx in range(min(len(string), len(tags))):
        c, tag = string[c_idx], tags[c_idx]
        if c_idx < len(tags) - 1:
            tag_next = tags[c_idx + 1]
        else:
            tag_next = ''
        if tag[0] == 'B':
            entity_tag = tag[2:]
            entity_name = c
            entity_start = iCount
            if tag_next[2:] != entity_tag:
                item["entities"].append(
                    {"word": c, "start": iCount, "end": iCount + 1, "type": tag[2:]})
        elif tag[0] == "I":
            # an I tag that does not continue the previous entity is reset to O
            if tag[2:] != tags[c_idx - 1][2:] or tags[c_idx - 1][2:] == 'O':
                tags[c_idx] = 'O'
            else:
                entity_name = entity_name + c
                if tag_next[2:] != entity_tag:
                    item["entities"].append(
                        {"word": entity_name, "start": entity_start, "end": iCount + 1, "type": entity_tag})
                    entity_name = ''
        iCount += 1
    return item


tokenizer = OurTokenizer(token_dict)

# read the test samples
with open("tf_test_sample.txt", "r", encoding="utf-8") as f:
    content = [_.strip() for _ in f.readlines()]

# model prediction: one sentence per request
s_time = time.time()
for i, line in enumerate(content):
    token_ids, segment_ids = tokenizer.encode(line, max_len=128)
    tensor = {"instances": [{"input_1": token_ids, "input_2": segment_ids}]}
    url = "http://192.168.1.193:8561/v1/models/example_ner:predict"
    req = requests.post(url, json=tensor)
    if req.status_code == 200:
        t = np.asarray(req.json()['predictions'][0]).argmax(axis=1)
        tags = [id_label_dict[_] for _ in t]
        print(i)
e_time = time.time()
print("avg cost time: {}".format((e_time - s_time) / len(content)))
The output is:
avg cost time: 0.3065736322491257
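One caveat about this baseline: every requests.post call above sets up a fresh HTTP connection. Reusing connections via requests.Session typically reduces per-request overhead; a minimal sketch, using the same url and payload shape as the script above:
import requests

session = requests.Session()  # keeps the underlying TCP connection alive across requests
url = "http://192.168.1.193:8561/v1/models/example_ner:predict"

def predict(tensor):
    # same POST as in the loop above, but over a pooled, reused connection
    req = session.post(url, json=tensor)
    return req.json()['predictions'] if req.status_code == 200 else None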
Multi-threaded, one sample per request. The prediction script is as follows:
# -*- coding: utf-8 -*-
import time
import json
import requests
import numpy as np
from keras_bert import Tokenizer
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

# load the label2id dictionary
with open("../example_label2id.json", "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())
id_label_dict = {v: k for k, v in label_id_dict.items()}

# load the BERT vocabulary
dict_path = '../chinese_L-12_H-768_A-12/vocab.txt'
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    # character-level tokenization; unknown characters map to [UNK]
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            else:
                R.append('[UNK]')
        return R


# convert a BIO tag sequence into JSON format
def bio_to_json(string, tags):
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    iCount = 0
    entity_tag = ""
    for c_idx in range(min(len(string), len(tags))):
        c, tag = string[c_idx], tags[c_idx]
        if c_idx < len(tags) - 1:
            tag_next = tags[c_idx + 1]
        else:
            tag_next = ''
        if tag[0] == 'B':
            entity_tag = tag[2:]
            entity_name = c
            entity_start = iCount
            if tag_next[2:] != entity_tag:
                item["entities"].append(
                    {"word": c, "start": iCount, "end": iCount + 1, "type": tag[2:]})
        elif tag[0] == "I":
            # an I tag that does not continue the previous entity is reset to O
            if tag[2:] != tags[c_idx - 1][2:] or tags[c_idx - 1][2:] == 'O':
                tags[c_idx] = 'O'
            else:
                entity_name = entity_name + c
                if tag_next[2:] != entity_tag:
                    item["entities"].append(
                        {"word": entity_name, "start": entity_start, "end": iCount + 1, "type": entity_tag})
                    entity_name = ''
        iCount += 1
    return item


tokenizer = OurTokenizer(token_dict)

# read the test samples
with open("tf_test_sample.txt", "r", encoding="utf-8") as f:
    content = [_.strip() for _ in f.readlines()]

start_time = time.time()


# measure the HTTP response time for one sentence
def get_predict(i, sentence):
    token_ids, segment_ids = tokenizer.encode(sentence, max_len=128)
    tensor = {"instances": [{"input_1": token_ids, "input_2": segment_ids}]}
    url = "http://192.168.1.193:8561/v1/models/example_ner:predict"
    req = requests.post(url, json=tensor)
    if req.status_code == 200:
        t = np.asarray(req.json()['predictions'][0]).argmax(axis=1)
        tags = [id_label_dict[_] for _ in t]
        print("predict {} sample".format(i))


# call the service from multiple threads
executor = ThreadPoolExecutor(max_workers=10)  # adjust max_workers, i.e. the number of threads, as needed
# submit() takes the function first, followed by its arguments (any number of them)
future_tasks = [executor.submit(get_predict, i, sent) for i, sent in enumerate(content)]
# block until all threads have finished before moving on
wait(future_tasks, return_when=ALL_COMPLETED)
end_time = time.time()
print("avg cost time: {}".format((end_time - start_time) / len(content)))
The output is:
avg cost time: 0.09153805485478154
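Note that get_predict above only prints and discards the tags. If you need the per-sentence results back, the futures preserve submission order, so they can be read out after the wait; a sketch that assumes get_predict is modified to end with return tags:
# future_tasks[i] was submitted for content[i], so results line up with the input
results = [task.result() for task in future_tasks]
assert len(results) == len(content)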
Single thread, batched requests. The prediction script is as follows:
# -*- coding: utf-8 -*-
# @Time : 2021/1/9 11:18
# @Author : Jclian91
# @File : tf_serving_normal_predict_test.py
# @Place : Yangpu, Shanghai
import json
import requests
import numpy as np
import time
from keras_bert import Tokenizer

# load the label2id dictionary
with open("../example_label2id.json", "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())
id_label_dict = {v: k for k, v in label_id_dict.items()}

# load the BERT vocabulary
dict_path = '../chinese_L-12_H-768_A-12/vocab.txt'
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    # character-level tokenization; unknown characters map to [UNK]
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            else:
                R.append('[UNK]')
        return R


# convert a BIO tag sequence into JSON format
def bio_to_json(string, tags):
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    iCount = 0
    entity_tag = ""
    for c_idx in range(min(len(string), len(tags))):
        c, tag = string[c_idx], tags[c_idx]
        if c_idx < len(tags) - 1:
            tag_next = tags[c_idx + 1]
        else:
            tag_next = ''
        if tag[0] == 'B':
            entity_tag = tag[2:]
            entity_name = c
            entity_start = iCount
            if tag_next[2:] != entity_tag:
                item["entities"].append(
                    {"word": c, "start": iCount, "end": iCount + 1, "type": tag[2:]})
        elif tag[0] == "I":
            # an I tag that does not continue the previous entity is reset to O
            if tag[2:] != tags[c_idx - 1][2:] or tags[c_idx - 1][2:] == 'O':
                tags[c_idx] = 'O'
            else:
                entity_name = entity_name + c
                if tag_next[2:] != entity_tag:
                    item["entities"].append(
                        {"word": entity_name, "start": entity_start, "end": iCount + 1, "type": entity_tag})
                    entity_name = ''
        iCount += 1
    return item


tokenizer = OurTokenizer(token_dict)

# read the test samples
with open("tf_test_sample.txt", "r", encoding="utf-8") as f:
    content = [_.strip() for _ in f.readlines()]

batch_size = 10
start_time = time.time()
predict_result = []


# measure the HTTP response time for one batch of sentences
def get_predict(i, sentence_list):
    tensor = {"instances": []}
    for sentence in sentence_list:
        token_ids, segment_ids = tokenizer.encode(sentence, max_len=128)
        tensor["instances"].append({"input_1": token_ids, "input_2": segment_ids})
    url = "http://192.168.1.193:8561/v1/models/example_ner:predict"
    req = requests.post(url, json=tensor)
    if req.status_code == 200:
        for j in range(len(req.json()['predictions'])):
            t = np.asarray(req.json()['predictions'][j]).argmax(axis=1)
            tags = [id_label_dict[_] for _ in t]
            print("predict {} batch, batch no {}".format(i, j))
            # tags[1:-1] strips the [CLS] and [SEP] positions
            predict_result.append(bio_to_json(sentence_list[j], tags[1:-1]))


# batched invocation test
for i in range(len(content) // batch_size):
    get_predict(i, content[batch_size * i:batch_size * (i + 1)])
end_time = time.time()
print("avg cost time: {}".format((end_time - start_time) / len(content)))

with open("batch_predict.json", "w", encoding="utf-8") as g:
    g.write(json.dumps(predict_result, ensure_ascii=False, indent=2))
The output is:
avg cost time: 0.12037382854355706
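One detail to watch in the loop above: range(len(content) // batch_size) silently drops the tail whenever the sample count is not a multiple of batch_size (with 1080 sentences and batch_size 10, nothing is lost here). A stepped range covers the remainder as well:
# iterate in steps of batch_size; the final slice may be shorter than batch_size
for start in range(0, len(content), batch_size):
    get_predict(start // batch_size, content[start:start + batch_size])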
Multi-threaded, batched requests. The prediction script is as follows:
# -*- coding: utf-8 -*-
import json
import requests
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from keras_bert import Tokenizer

# load the label2id dictionary
with open("../example_label2id.json", "r", encoding="utf-8") as h:
    label_id_dict = json.loads(h.read())
id_label_dict = {v: k for k, v in label_id_dict.items()}

# load the BERT vocabulary
dict_path = '../chinese_L-12_H-768_A-12/vocab.txt'
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)


class OurTokenizer(Tokenizer):
    # character-level tokenization; unknown characters map to [UNK]
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            else:
                R.append('[UNK]')
        return R


# convert a BIO tag sequence into JSON format
def bio_to_json(string, tags):
    item = {"string": string, "entities": []}
    entity_name = ""
    entity_start = 0
    iCount = 0
    entity_tag = ""
    for c_idx in range(min(len(string), len(tags))):
        c, tag = string[c_idx], tags[c_idx]
        if c_idx < len(tags) - 1:
            tag_next = tags[c_idx + 1]
        else:
            tag_next = ''
        if tag[0] == 'B':
            entity_tag = tag[2:]
            entity_name = c
            entity_start = iCount
            if tag_next[2:] != entity_tag:
                item["entities"].append(
                    {"word": c, "start": iCount, "end": iCount + 1, "type": tag[2:]})
        elif tag[0] == "I":
            # an I tag that does not continue the previous entity is reset to O
            if tag[2:] != tags[c_idx - 1][2:] or tags[c_idx - 1][2:] == 'O':
                tags[c_idx] = 'O'
            else:
                entity_name = entity_name + c
                if tag_next[2:] != entity_tag:
                    item["entities"].append(
                        {"word": entity_name, "start": entity_start, "end": iCount + 1, "type": entity_tag})
                    entity_name = ''
        iCount += 1
    return item


tokenizer = OurTokenizer(token_dict)

# read the test samples
with open("tf_test_sample.txt", "r", encoding="utf-8") as f:
    content = [_.strip() for _ in f.readlines()]

batch_size = 10
start_time = time.time()
predict_result = []


# measure the HTTP response time for one batch of sentences
def get_predict(i, sentence_list):
    tensor = {"instances": []}
    for sentence in sentence_list:
        token_ids, segment_ids = tokenizer.encode(sentence, max_len=128)
        tensor["instances"].append({"input_1": token_ids, "input_2": segment_ids})
    url = "http://192.168.1.193:8561/v1/models/example_ner:predict"
    req = requests.post(url, json=tensor)
    if req.status_code == 200:
        for j in range(len(req.json()['predictions'])):
            t = np.asarray(req.json()['predictions'][j]).argmax(axis=1)
            tags = [id_label_dict[_] for _ in t]
            print("predict {} batch, batch no {}".format(i, j))
            # tags[1:-1] strips the [CLS] and [SEP] positions
            predict_result.append(bio_to_json(sentence_list[j], tags[1:-1]))


# call the service from multiple threads
executor = ThreadPoolExecutor(max_workers=10)  # adjust max_workers, i.e. the number of threads, as needed
# submit() takes the function first, followed by its arguments (any number of them)
future_tasks = [executor.submit(get_predict, i, content[batch_size * i:batch_size * (i + 1)]) for i in range(len(content) // batch_size)]
# block until all threads have finished before moving on
wait(future_tasks, return_when=ALL_COMPLETED)
end_time = time.time()
print("avg cost time: {}".format((end_time - start_time) / len(content)))

with open("batch_multi_thread_predict.json", "w", encoding="utf-8") as g:
    g.write(json.dumps(predict_result, ensure_ascii=False, indent=2))
The output is:
avg cost time: 0.06627912256452773
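A word on the shared predict_result list: under CPython, list.append is atomic, so the threads above do not corrupt the list, but the order of batches in batch_multi_thread_predict.json is nondeterministic. If stable output order matters, one option is to key results by batch index under a lock; a sketch, where record is a hypothetical helper you would call from get_predict:
import threading

lock = threading.Lock()
results_by_batch = {}

def record(batch_no, batch_items):
    # keyed by batch index, so the output can be reassembled in submission order
    with lock:
        results_by_batch[batch_no] = batch_items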
That completes the invocation tests. The four measurements above are summarized below:
invocation pattern                          avg cost time (seconds/sentence)
single thread, one sample per request       0.3066
10 threads, one sample per request          0.0915
single thread, batches of 10                0.1204
10 threads, batches of 10                   0.0663
Summary of the test results: tensorflow/serving supports concurrency natively. Setting rest_api_num_threads and enable_batching on the server, and using multi-threaded or batched calls on the client, all improve prediction efficiency; multi-threading combined with batching is the fastest.
Note: these results depend on server hardware, client hardware, the number of test sentences, their length, and other factors, so the request times above should be taken as reference values only.
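If you go further with enable_batching, tensorflow/serving also accepts a --batching_parameters_file flag pointing at a text-format protobuf holding the server-side batching knobs. The file name and values below are illustrative assumptions, not tuned recommendations; see the batching guide in the serving repo for the field semantics:
# batching.config -- example values only
max_batch_size { value: 32 }
batch_timeout_micros { value: 1000 }
num_batch_threads { value: 8 }
max_enqueued_batches { value: 100 }
Mount the file next to the model and append --batching_parameters_file=/models/batching.config (a hypothetical container path) to the docker run command above.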
The scripts used in this article have been posted to Github at https://github.com/percent4/keras_bert_sequence_labeling .
Thanks for reading!
Written in Pudong, Shanghai, January 16, 2021