Pretrained model download: Albert_Large_zh
Dataset download: accident/disaster multi-class text classification dataset (collected by a web crawler; corrections are welcome)
Put the pretrained model under the pretraining_model folder.
Split the dataset into a training set, test set, and validation set (train.txt, test.txt, and dev.txt, typically in a 7:2:1 ratio) and put them under the datasets folder; a sketch of such a split follows.
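A minimal sketch of the 7:2:1 file split, assuming the raw crawled data sits in a single "label\ttext"-per-line file (the name datasets/raw.txt is an assumption):

import random

# Hypothetical raw file: one "label\ttext" sample per line.
with open("datasets/raw.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
random.seed(42)
random.shuffle(lines)

# 7:2:1 split into the three files the project expects.
n = len(lines)
splits = {
    "datasets/train.txt": lines[:int(n * 0.7)],
    "datasets/test.txt": lines[int(n * 0.7):int(n * 0.9)],
    "datasets/dev.txt": lines[int(n * 0.9):],
}
for path, part in splits.items():
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(part)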
Project structure
Keras_Bert_Class/
|-- datasets/
| |-- train.txt
| |-- test.txt
| |-- dev.txt
|
|-- model/
|
|-- pretraining_model/
| |-- albert_large/
|
|-- main.py
|-- requirements.txt
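For reference, a minimal requirements.txt sketch covering the imports used below (version pins are omitted on purpose; bert4keras runs on both TF 1.x and 2.x, so pin to your own environment):

bert4keras
tensorflow
keras
scikit-learn
numpy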
The imports are as follows:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from bert4keras.tokenizers import Tokenizer
from bert4keras.backend import keras, set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.snippets import DataGenerator, sequence_padding
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from keras.layers import Lambda, Dense
from contextlib import redirect_stdout
The configuration code is as follows:
# Configuration
class Config:
    def __init__(self):
        # Pretrained model name
        self.model_name = "bert"
        # Dataset
        self.train_path = "datasets/train.txt"
        # List of categories
        self.class_list = self.read_class()
        # Number of classes
        self.num_classes = len(self.class_list)
        # Number of epochs
        self.epochs = 10
        # Mini-batch size
        self.batch_size = 8
        # Per-sentence length (pad short sentences, truncate long ones)
        self.pad_size = 128
        # Learning rate
        self.learning_rate = 1e-5
        # Pretrained model paths
        self.config_path = "./pretraining_model/albert_large/albert_config.json"
        self.checkpoint_path = "./pretraining_model/albert_large/albert_model.ckpt"
        self.dict_path = "./pretraining_model/albert_large/vocab.txt"
        # Tokenizer (lower-casing matches the bert4keras reference example)
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        # Label dictionaries
        self.label2id, self.id2label = self.label_dict()

    def read_class(self):
        # Collect the distinct labels (first tab-separated field) from the training set
        class_list = []
        with open(self.train_path, 'r', encoding='utf-8') as f:
            for line in f:
                label = line.split('\t')[0]
                if label not in class_list:
                    class_list.append(label)
        return class_list

    def label_dict(self):
        # Build label <-> id mappings in order of first appearance
        label2id, id2label = {}, {}
        with open(self.train_path, 'r', encoding="utf-8") as data:
            for line in data:
                label = line.split('\t')[0]
                if label not in label2id:
                    label2id[label] = len(label2id)
                    id2label[label2id[label]] = label
        return label2id, id2label
config = Config()
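For reference, train.txt holds one sample per line in "label\ttext" form; with hypothetical category names, the label dictionaries come out like this:

# Hypothetical train.txt lines (tab-separated label, then text):
#   火灾\t某地一仓库突发火灾……
#   地震\t某地发生4.2级地震……
# After Config() runs:
print(config.label2id)  # e.g. {'火灾': 0, '地震': 1, ...}
print(config.id2label)  # e.g. {0: '火灾', 1: '地震', ...}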
The data-splitting and generator code is as follows:
def split_data(ratio=0.2, transmit_data=config.train_path):
    # Read "label\ttext" lines and map each label to its id for training
    x, y = [], []
    with open(transmit_data, 'r', encoding='utf-8') as f:
        for line in f:
            label, text = line.replace("\n", "").split('\t', 1)
            x.append(text)
            y.append(config.label2id[label])
    # Stratified split: hold out `ratio` of the data, then halve it into test/dev
    train_x, test_val_x, train_y, test_val_y = train_test_split(x, y, test_size=ratio, stratify=y, random_state=42)
    test_x, val_x, test_y, val_y = train_test_split(test_val_x, test_val_y, test_size=0.5, stratify=test_val_y, random_state=42)
    train_data = [(x, y) for x, y in zip(train_x, train_y)]
    test_data = [(x, y) for x, y in zip(test_x, test_y)]
    val_data = [(x, y) for x, y in zip(val_x, val_y)]
    return train_data, test_data, val_data

train_data, test_data, dev_data = split_data(ratio=0.2, transmit_data=config.train_path)
class data_generator(DataGenerator):
    """Data generator"""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == config.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

train_generator = data_generator(train_data, config.batch_size)
test_generator = data_generator(test_data, config.batch_size)
dev_generator = data_generator(dev_data, config.batch_size)
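To sanity-check the generator before training, you can peek at one batch (a quick sketch):

# Token ids and segment ids are padded to the longest sequence in the
# batch (capped at config.pad_size); labels have shape (batch_size, 1).
[token_ids, segment_ids], labels = next(iter(train_generator))
print(token_ids.shape, segment_ids.shape, labels.shape)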
Use Su Jianlin's bert4keras to load the pretrained model. The code is as follows:
bert = build_transformer_model(
    config_path=config.config_path,
    checkpoint_path=config.checkpoint_path,
    model="albert",
    return_keras_model=False
)
output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(
    units=config.num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)
model = keras.models.Model(bert.model.input, output)

# Write the model structure to a txt file
with open('model/modelsummary.txt', 'w+') as f:
    with redirect_stdout(f):
        model.summary()

AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(learning_rate=config.learning_rate),
    metrics=['accuracy'])
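AdamLR as compiled above falls back to a constant schedule; the bert4keras reference example also passes an explicit piecewise-linear warmup, e.g.:

# Optional (values from the bert4keras reference example): warm up linearly
# to the full learning rate over the first 1000 steps, then decay to 10%
# of it by step 2000.
optimizer = AdamLR(learning_rate=config.learning_rate,
                   lr_schedule={1000: 1, 2000: 0.1})

Pass this optimizer to model.compile in place of the one above if you want warmup.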
The training code is as follows:
# Compute accuracy
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total

class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        # Save the weights whenever validation accuracy improves
        val_acc = evaluate(dev_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )

evaluator = Evaluator()
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=config.epochs,
    callbacks=[evaluator]
)
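Since Evaluator saved the best checkpoint, it is worth restoring it before generating the final report; otherwise the last epoch's weights get evaluated:

# Restore the best checkpoint saved by Evaluator during training.
model.load_weights('best_model.weights')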
The classification-report code is as follows:
# Generate a classification report
def evaluate1(data):
    y1, y2 = [], []
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y1 += y_pred.tolist()
        y2 += y_true[:, 0].tolist()
    y1, y2 = np.array(y1), np.array(y2)
    categories = list(config.label2id.keys())
    print(categories)
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y2, y1, target_names=categories))

evaluate1(test_generator)
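Finally, a minimal inference sketch for a single sentence (the helper name and the sample text are made up):

def predict_one(text):
    # Encode one sentence and map the argmax back to its label name.
    token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
    pred = model.predict([np.array([token_ids]), np.array([segment_ids])])
    return config.id2label[int(pred.argmax(axis=1)[0])]

print(predict_one("某地一化工厂发生爆炸事故"))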
The model performs well overall. Because a few categories in the dataset have very few samples, the classes are imbalanced and those categories are recognized poorly. If you gather additional data by whatever means and balance the samples, accuracy hovers around 97%.
Thanks to '喀拉布喀' in the comments for the pointers.
References:
https://github.com/bojone/bert4keras/blob/master/examples/task_sentiment_albert.py
科学空间 (Scientific Spaces): https://kexue.fm