文本分类是自然语言处理的基本任务之一。大部分文本分类是多分类(multi-class),即每条数据只属于多个类别中的某一个;而实际工作或项目中常会遇到多标签(multi-label)文本,即一条文本可以同时属于多个标签。笔者利用hugging face的Transformers实现多标签文本分类。笔者的tensorflow版本为2.4.0,transformers的版本为4.2.0。
利用transformers中的BertTokenizer对数据进行Tokenize(分词并转换为id序列)。代码如下:
def get_model_data(data, labels, max_seq_len=128):
    """Tokenize sentences with BERT and build padded, fixed-length model inputs.

    Args:
        data: list of raw text sentences.
        labels: label vectors aligned with ``data`` (for multi-label
            classification, each entry is presumably a multi-hot list —
            TODO confirm against the training pipeline).
        max_seq_len: fixed sequence length; longer sentences are truncated,
            shorter ones right-padded with 0.

    Returns:
        Tuple ``(x, y)`` where ``x`` is ``[input_ids, attention_mask]``
        (each a numpy array of shape ``(n, max_seq_len)``) and ``y`` is the
        numpy label array.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese", do_lower_case=True)
    assert len(data) == len(labels)
    dataset_dict = {
        "input_ids": [],
        "attention_mask": [],
        "label": [],
    }
    for sentence, label in zip(data, labels):
        input_ids = tokenizer.encode(
            sentence,
            add_special_tokens=True,  # add '[CLS]' and '[SEP]'
            max_length=max_seq_len,
            # BUG FIX: in transformers >= 3.x, max_length alone does NOT
            # truncate; without truncation=True a long sentence yields
            # len(input_ids) > max_seq_len and the mask/padding math below
            # produces rows longer than max_seq_len.
            truncation=True,
        )
        sentence_length = len(input_ids)
        pad_len = max_seq_len - sentence_length
        # Right-pad ids with 0; mask marks real tokens with 1, padding with 0.
        dataset_dict["input_ids"].append(input_ids + [0] * pad_len)
        dataset_dict["attention_mask"].append([1] * sentence_length + [0] * pad_len)
        dataset_dict["label"].append(label)
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["attention_mask"],
    ]
    y = dataset_dict["label"]
    return x, y
利用Transformers搭建多标签分类模型。多标签分类在模型最后一层全连接层的激活函数是sigmoid(各标签独立输出概率),而多分类的激活函数为softmax(各类别概率之和为1)。因此多标签分类的损失函数为BinaryCrossentropy。代码如下:
class BertMultiClassifier(object):
    """Multi-label text classifier built on a pretrained BERT encoder.

    The final Dense layer uses a sigmoid activation so each of the
    ``label_num`` outputs is an independent probability (multi-label),
    as opposed to softmax for single-label multi-class.
    """

    def __init__(self, bert_model_name, label_num):
        # Pretrained checkpoint name and number of output labels.
        self.bert_model_name = bert_model_name
        self.label_num = label_num

    def get_model(self):
        """Build and return the (uncompiled) Keras model."""
        encoder = TFBertModel.from_pretrained(self.bert_model_name)
        ids_in = Input(shape=(None,), dtype=tf.int32, name="input_ids")
        mask_in = Input(shape=(None,), dtype=tf.int32, name="attention_mask")
        # Index [1] of the BERT outputs is the pooled [CLS] representation.
        pooled = encoder(ids_in, attention_mask=mask_in)[1]
        probs = Dense(self.label_num, activation='sigmoid')(pooled)
        return Model(
            inputs=[ids_in, mask_in],
            outputs=[probs])
def create_model(bert_model_name, label_nums):
    """Build and compile the multi-label BERT classifier.

    Args:
        bert_model_name: pretrained checkpoint name passed to TFBertModel.
        label_nums: number of independent output labels.

    Returns:
        A compiled tf.keras Model ready for ``fit``/``predict``.
    """
    model = BertMultiClassifier(bert_model_name, label_nums).get_model()
    # BUG FIX: `lr` is the deprecated alias; the supported keyword is
    # `learning_rate` (tf.keras.optimizers.Adam).
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    # Sigmoid outputs are already probabilities, hence from_logits=False.
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer=optimizer, loss=loss_object,
                  metrics=['accuracy', tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall(),
                           tf.keras.metrics.AUC()])
    return model
利用tensorflow中的高阶API Keras训练模型,训练结束后将模型权重保存为h5文件,同时将完整模型导出为pb(SavedModel)格式以供Tensorflow Serving部署。训练代码如下:
# Train with Keras, then persist the weights (h5) and export a SavedModel (pb).
model.fit(
    train_x,
    train_y,
    epochs=args["epoch"],
    verbose=1,
    batch_size=args["batch_size"],
    callbacks=callbacks,
    validation_data=(val_x, val_y),
    validation_batch_size=args["batch_size"],
)
# Weights-only checkpoint for later load_weights().
model_path = os.path.join("./output/model/", "mulclassifition.h5")
model.save_weights(model_path)
# Full SavedModel export for TensorFlow Serving.
tf.keras.models.save_model(model, args["pbmodel_path"], save_format="tf", overwrite=True)
训练好的模型一般有两种使用方式:直接加载模型做预测,或者用Tensorflow Serving部署提供HTTP服务。笔者分别介绍这两种方式。直接加载模型做预测的代码如下:
def predict(test_data, args, label_num):
    """Load trained weights and run multi-label prediction on raw sentences.

    Args:
        test_data: list of raw text sentences.
        args: dict with 'max_length', 'bert_model_name', 'batch_size'.
        label_num: number of output labels of the trained model.

    Returns:
        List of multi-hot prediction vectors (one per input sentence),
        thresholded at 0.5 per label.
    """
    # BUG FIX: get_model_data's second parameter is `labels`, not a
    # tokenizer (it builds its own tokenizer internally); pass placeholder
    # labels at inference time and keep only the model inputs.
    test_x, _ = get_model_data(test_data, [0] * len(test_data), args["max_length"])
    model = create_model(args['bert_model_name'], label_num)
    model.load_weights("./output/model/mulclassifition.h5")
    # BUG FIX: predict must receive the input list [input_ids, attention_mask],
    # not the whole (x, y) tuple returned by get_model_data.
    pred_logits = model.predict(test_x, batch_size=args["batch_size"])
    # Threshold each sigmoid output independently (multi-label).
    pred = np.where(pred_logits >= 0.5, 1, 0).tolist()
    return pred
利用Tensorflow serving和Flask提供HTTP服务。代码如下:
@app.route("/multiclassfier", methods=['POST'])
def multiclassifier_pred():
    """HTTP endpoint: tokenize the posted sentences, call the TensorFlow
    Serving REST API, and return thresholded multi-label predictions.

    Expects a JSON body with key "sent" (presumably a list of sentences —
    TODO confirm against the client).
    """
    # BUG FIX: json.loads no longer accepts `encoding` (removed in Py3.9);
    # decode the raw body explicitly instead.
    data_para = json.loads(request.get_data().decode("utf-8"))
    sentence = data_para["sent"]
    print("sentence: ", sentence)
    # BUG FIX: get_model_data(data, labels, max_seq_len) takes labels as the
    # second argument, not a tokenizer; pass dummy labels at inference time
    # and unpack the (x, y) return value instead of indexing the tuple.
    test_x, _ = get_model_data(sentence, [0] * len(sentence), 256)
    # numpy arrays are not JSON-serializable; convert to nested lists.
    input_ids = test_x[0].tolist()
    attention_mask = test_x[1].tolist()
    data = json.dumps({"signature_name": "serving_default",
                       "inputs": {"input_ids": input_ids,
                                  "attention_mask": attention_mask}})
    headers = {"content-type": "application/json"}
    result = requests.post("http://ip:port/v1/models/multiclass:predict", data=data, headers=headers)
    if result.status_code == 200:
        result = json.loads(result.text)
        pred_logits = np.array(result["outputs"])
        # Threshold each sigmoid output independently (multi-label).
        pred = np.where(pred_logits >= 0.5, 1, 0).tolist()
        pred_encoder = label_encoder(pred, label)
        return_result = {"code": 200, "sent": sentence, "label": pred_encoder[0]}
        return jsonify(return_result)
    else:
        # BUG FIX: traceback.format_exc() outside an except block returns
        # "NoneType: None"; report the serving response body instead.
        return jsonify({"code": result.status_code,
                        "message": result.text})
代码中的 http://ip:port/v1/models/multiclass:predict 是Tensorflow Serving加载模型后提供预测的服务地址(其中ip和port需替换为实际部署的地址和端口)。笔者采用docker部署Tensorflow Serving服务。