BERT Text Classification (training, saving, loading, and single-text prediction with keras-bert)

BERT Pretrained Model Preparation

References: Chinese pretrained model download · "When BERT Meets Keras: Possibly the Simplest Way to Use BERT" · keras-bert

Files after downloading and extracting:

[Figure: the downloaded and extracted pretrained model files]

Text Classification with the BERT Model

1. Data preparation

A hotel review dataset is available on GitHub: https://github.com/Hejp5665/bert_keras_nlp

Training set: 5,888 samples (2,940 positive, 2,948 negative)

Test set: 101 samples (50 positive, 51 negative)
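The loaders below assume each Excel file has two columns: contents (the review text) and labels (a binary class id, 0 or 1). A minimal sketch of that layout, with made-up rows, for anyone preparing their own data:

import pandas as pd

demo = pd.DataFrame({
    'contents': ['房间干净,服务热情', '隔音太差,不会再来'],  # illustrative reviews only
    'labels':   [1, 0],                                       # binary class ids
})
demo.to_excel(r'data\demo_train.xlsx', index=False)  # same layout as data_train.xlsx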

2. Code implementation


'''
Note: limited GPU memory may only fit the base BERT pretrained model; if you hit an
OOM error, reduce batch_size and/or maxlen accordingly.
This code was run with numpy==1.16.4; other versions may produce warnings.
'''

import pandas as pd
import codecs, gc
import numpy as np
from sklearn.model_selection import KFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.metrics import top_k_categorical_accuracy
from keras.layers import *
from keras.callbacks import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn import metrics
# for splitting the training data into train/validation sets
from sklearn.model_selection import train_test_split

# Hyperparameters
maxlen      = 100   # sequence length; BERT supports at most 512 tokens
Batch_size  = 16    # number of samples per batch
Epoch       = 1     # number of training epochs

def get_train_test_data():
    train_df = pd.read_excel(r'data\data_train.xlsx' ).astype(str)
    test_df = pd.read_excel(r'data\data_test.xlsx').astype(str)

    # convert the training/test texts and labels into the model's input format
    DATA_LIST = []
    for data_row in train_df.iloc[:].itertuples():
        DATA_LIST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST = np.array(DATA_LIST)

    DATA_LIST_TEST = []
    for data_row in test_df.iloc[:].itertuples():
        DATA_LIST_TEST.append((data_row.contents, to_categorical(data_row.labels, 2)))
    DATA_LIST_TEST = np.array(DATA_LIST_TEST)

    data = DATA_LIST
    data_test = DATA_LIST_TEST

    X_train, X_valid = train_test_split(data, test_size=0.2, random_state=0)
    return X_train,X_valid,data_test

# Pretrained model: roberta_wwm_ext_large
# config_path     = r'roberta_wwm_ext_large\bert_config.json'
# checkpoint_path = r'roberta_wwm_ext_large\bert_model.ckpt'
# dict_path       = r'roberta_wwm_ext_large\vocab.txt'

# Pretrained model: BERT base
config_path     = r'bert\bert_config.json'  # model configuration
checkpoint_path = r'bert\bert_model.ckpt'   # model weights
dict_path       = r'bert\vocab.txt'         # vocabulary


def get_token_dict():
    """
    # 将词表中的字编号转换为字典
    :return: 返回自编码字典
    """
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    return token_dict
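# Each token maps to its row number in vocab.txt; in the standard Google-released
# Chinese vocabulary, for example, token_dict['[PAD]'] == 0.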

# Override the tokenizer so every character is kept (spaces and unknown characters included)
class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map whitespace characters to [unused1]
            else:
                R.append('[UNK]')  # out-of-vocabulary characters become [UNK] (unknown)
        return R

# instantiate the customized tokenizer
tokenizer = OurTokenizer(get_token_dict())
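# A quick illustration of what the tokenizer produces: tokenizer.encode(first=text)
# returns (token_ids, segment_ids); for a single sentence the segment ids are all 0
# and the ids are wrapped with [CLS] at the start and [SEP] at the end, so
# len(token_ids) == number of characters + 2.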


def seq_padding(X, padding=0):
    """
    :param X: 文本列表
    :param padding: 填充为0
    :return: 让每条文本的长度相同,用0填充
    """
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
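# e.g. seq_padding([[1, 2, 3], [4, 5]]) -> array([[1, 2, 3], [4, 5, 0]])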


class data_generator:
    """
    data_generator只是一种为了节约内存的数据方式
    """
    def __init__(self, data, batch_size=Batch_size, shuffle=True):
        """
        :param data: 训练的文本列表
        :param batch_size:  每次训练的个数
        :param shuffle: 文本是否打乱
        """
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))

            if self.shuffle:
                np.random.shuffle(idxs)

            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y[:, 0, :]
                    X1, X2, Y = [], [], []
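# Each yielded batch is ([token_ids, segment_ids], labels): two (batch_size, batch_max_len)
# integer matrices padded to the longest sequence in the batch, plus a (batch_size, 2)
# one-hot label matrix.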



def acc_top2(y_true, y_pred):
    """
    :param y_true: 真实值
    :param y_pred: 训练值
    :return: # 计算top-k正确率,当预测值的前k个值中存在目标类别即认为预测正确
    """
    return top_k_categorical_accuracy(y_true, y_pred, k=2)
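# Note: with only two classes, top-2 accuracy is trivially 1.0, so this metric becomes
# informative only if the model is extended to more than two classes.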


# Build the BERT classification model
def build_bert(nclass):
    """
    :param nclass: number of target classes
    :return: the compiled BERT classification model
    """
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)  # load the pretrained model

    for l in bert_model.layers:
        l.trainable = True  # fine-tune all BERT layers
    # assemble the classifier
    x1_in = Input(shape=(None,))  # token ids
    x2_in = Input(shape=(None,))  # segment ids

    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, 0])(x)  # take the vector at [CLS] as the sentence representation
    p = Dense(nclass, activation='softmax')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(1e-5),  # a learning rate small enough for fine-tuning
                  metrics=['accuracy', acc_top2])
    print(model.summary())
    return model


def run_kb():
    """
    训练模型
    :return: 验证预测集,测试预测集,训练好的模型
    """
    # 搭建模型参数
    print('正在加载模型,请耐心等待....')
    model = build_bert(2)  # 二分类模型
    print('模型加载成功,开始训练....')
    early_stopping = EarlyStopping(monitor='val_acc', patience=3)  # 早停法,防止过拟合
    plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=2)  # 当评价指标不在提升时,减少学习率
    checkpoint = ModelCheckpoint(r'C:\Users\ChuangLan\PycharmProjects\bert_porject\use_bert\bert_dump1.hdf5', monitor='val_acc', verbose=2,
                                 save_best_only=True, mode='max', save_weights_only=True)  # 保存最好的模型
    # 获取数据并文本序列化
    X_train, X_valid, data_test = get_train_test_data()
    train_D = data_generator(X_train, shuffle=True)
    valid_D = data_generator(X_valid, shuffle=True)
    test_D = data_generator(data_test, shuffle=False)

    # train the model
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=Epoch,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=[early_stopping, plateau, checkpoint],
        )
    # predict on the validation and test sets; the validation generator is re-created
    # with shuffle=False so that predictions line up with the labels
    valid_D = data_generator(X_valid, shuffle=False)
    train_model_pred = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=1)
    test_model_pred  = model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)
    # convert predicted probabilities into class labels
    train_pred = [np.argmax(x) for x in train_model_pred]
    test_pred = [np.argmax(x) for x in test_model_pred]
    y_true = [np.argmax(x) for x in X_valid[:, 1]]

    return train_pred, test_pred, y_true, model, data_test


def bk_metrics(y_true, y_pred, tag='metrics'):
    """
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param tag: name printed with the report
    :return: prints the evaluation metrics
    """
    print(tag, '...')
    print(metrics.confusion_matrix(y_true, y_pred))
    print('Accuracy:', metrics.accuracy_score(y_true, y_pred))
    print('Per-class precision:', metrics.precision_score(y_true, y_pred, average=None))  # no averaging
    print('Macro-averaged precision:', metrics.precision_score(y_true, y_pred, average='macro'))
    print('Micro-averaged recall:', metrics.recall_score(y_true, y_pred, average='micro'))
    print('Weighted-average F1:', metrics.f1_score(y_true, y_pred, average='weighted'))

if __name__ == '__main__':

    # train and predict
    train_pred, test_pred, y_true, model, data_test = run_kb()

    # evaluate on the validation set (note: ground truth first, predictions second)
    bk_metrics(y_true, train_pred, tag='train metrics')
    # evaluate on the test set
    bk_metrics([np.argmax(x) for x in data_test[:, 1]], test_pred, tag='test metrics')
    # save the model
    model_path = r'use_bert\bertkeras_model.h5'
    model.save(model_path)


    # load the saved model; keras-bert's custom layers and our custom metric must be
    # passed via custom_objects, or deserialization will fail
    from keras_bert import get_custom_objects
    from keras.models import load_model
    custom_objects = get_custom_objects()          # keras-bert's custom layers
    custom_objects.update({'acc_top2': acc_top2})  # plus our custom metric
    model = load_model(model_path, custom_objects=custom_objects)


    # predict the class of a single text
    text = '这家餐厅的菜味道可以'
    DATA_text = []
    DATA_text.append((text, to_categorical(0, 2)))  # the label here is only a placeholder and is ignored at inference
    DATA_text = np.array(DATA_text)
    text_D = data_generator(DATA_text, shuffle=False)
    test_model_pred = model.predict_generator(text_D.__iter__(), steps=len(text_D), verbose=1)
    print('Prediction:', test_model_pred)
    print(np.argmax(test_model_pred))


    del model          # delete the model to free memory
    gc.collect()       # force garbage collection
    K.clear_session()  # clear the Keras/TensorFlow session
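For repeated scoring, the single-text steps above can be wrapped into a small helper. A minimal sketch that reuses the same data_generator (call it before the model is deleted; the dummy label passed to to_categorical only satisfies the generator's expected data format and plays no role at inference time):

def predict_single(model, text):
    data = np.array([(text, to_categorical(0, 2))])  # dummy label, ignored at inference
    gen = data_generator(data, shuffle=False)
    pred = model.predict_generator(gen.__iter__(), steps=len(gen), verbose=0)
    return np.argmax(pred)  # the predicted class id (0 or 1)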

Results after 10 training epochs:

[[48  3]
 [ 6 44]]
Accuracy: 0.9108910891089109
Per-class precision: [0.88888889 0.93617021]
Macro-averaged precision: 0.9125295508274232
Micro-averaged recall: 0.9108910891089109
Weighted-average F1: 0.9107861007013809

Results after 20 training epochs:

[[48  3]
 [ 5 45]]
Accuracy: 0.9207920792079208
Per-class precision: [0.90566038 0.9375    ]
Macro-averaged precision: 0.9215801886792453
Micro-averaged recall: 0.9207920792079208
Weighted-average F1: 0.9207454497412065

 
