NLP: Text Classification with bert4keras

Text classification with bert4keras + textCNN

  • Preface
  • 1. Data process
  • 2. Model
  • 3. Train
  • 4. Predict


Preface

[I wrote this as a study note. If anything here is wrong, please let me know. Thanks.]
Text classification with the bert4keras library and textCNN.
bert4keras: official GitHub link; it lists the required keras and tensorflow versions.
Sometimes the code also runs with versions other than the officially pinned ones; if something breaks, just search for the error. Installing the libraries is a minor issue.
BERT pre-trained model download: GitHub link
Bilibili: KBQA hands-on project, episode 9 – intent recognition and text classification with bert + textcnn
Below is the code, adapted to my own use case, with an added part that uses the trained model to predict labels.

1. Data process

The pred_data function is newly added.

import json
import pandas as pd

def gen_training_data(raw_data_path):
    label_list = [line.strip() for line in open('./github_bert_inten_recognition/label.txt','r',encoding='utf-8')]
    print(label_list)
    label2id = {label:idx for idx,label in enumerate(label_list)}

    data = []
    with open('./github_bert_inten_recognition/CMID_datasets.json','r',encoding='utf-8') as f:
        origin_data = f.read()
        origin_data = eval(origin_data)  # [{},{},...]

    label_set = set()
    for item in origin_data: # item is a dict
        text = item['originalText']

        label_class = item['label_4class'][0].strip("'")
        if label_class=='其他':
            data.append([text,label_class,label2id[label_class]])
            continue

        label_class = item['label_36class'][0].strip("'")
        label_set.add(label_class)
        if label_class not in label_list: # only keep samples whose 36-class label is listed in label.txt; skip the rest
            # label_class = '其他'
            continue
        data.append([text,label_class,label2id[label_class]])

    print('Found',len(label_set),'distinct 36-class labels in the dataset:',label_set)

    data = pd.DataFrame(data,columns=['text','label_class','label'])

    print('Number of samples per class')
    print(data['label_class'].value_counts()) # count how many samples each label has

    data['text_len'] = data['text'].map(lambda x:len(x)) # record the raw text lengths; they help choose maxlen for training
    print(data['text_len'].describe())
    import matplotlib.pyplot as plt
    plt.hist(data['text_len'],bins=30,rwidth=0.9,density=True)
    plt.show()

    del data['text_len']

    # DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)
    data = data.sample(frac=1.0) # frac=1.0 returns all rows in random order, i.e. shuffles the whole DataFrame
    train_num = int(0.9*len(data))
    train,test = data[:train_num],data[train_num:]
    train.to_csv('train.csv',index=False)
    test.to_csv('test.csv',index=False)

def load_data(filename):
    '''
    Load the data.
    One sample per row: (text, label id)
    '''
    df = pd.read_csv(filename,header=0)
    return df[['text','label']].values

# newly added: prepare a file for label prediction
def pred_data(file):
    '''
    Prepare a CSV so the trained model can predict labels for it
    '''
    df = pd.read_csv(file)

    # add a placeholder 'label' column of all zeros and save it back;
    # after prediction this column is replaced with the predicted labels
    df['label'] = 0
    df.to_csv(file,index=False,encoding='utf-8_sig')


if __name__ == '__main__':
    data_path = './github_bert_inten_recognition/CMID_datasets.json'
    gen_training_data(data_path)
    pred_data('待预测标签的文本') # CSV of texts whose labels are to be predicted
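
Note that the CSV passed to pred_data only needs a text column; pred_data then writes the placeholder label column so that load_data can read the file later. A minimal sketch of preparing such a file (the file name is the same placeholder used above, and the sentences are made up):

import pandas as pd

# hypothetical example: build the CSV that pred_data() expects
# it only needs a 'text' column; pred_data() adds the placeholder 'label' column
texts = ['待分类的句子1','待分类的句子2'] # placeholder sentences
pd.DataFrame({'text':texts}).to_csv('待预测标签的文本',index=False,encoding='utf-8_sig')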

2. Model

You can adapt the network to your task, e.g. swap the CNN for an RNN or add/remove layers (a hedged BiLSTM sketch follows the code below). Watch the tensor shapes between layers; print them if unsure. As for the hyperparameters, just tune them patiently.

# declare the source file encoding
#! -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2' # silence TensorFlow info/warning logs
from bert4keras.backend import keras,set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam

set_gelu('tanh')

def textcnn(inputs,kernel_initializer):
    # three parallel conv branches with kernel sizes 3, 4, 5
    cnn1 = keras.layers.Conv1D(
        256,
        3,
        strides=1,
        padding='same',
        activation='relu',
        kernel_initializer=kernel_initializer
    )(inputs) # shape=[batch_size,maxlen-2,256]
    cnn1 = keras.layers.GlobalMaxPooling1D()(cnn1) # shape=[batch_size,256]

    cnn2 = keras.layers.Conv1D(
        256,
        4,
        strides=1,
        padding='same',
        activation='relu',
        kernel_initializer=kernel_initializer
    )(inputs)
    cnn2 = keras.layers.GlobalMaxPooling1D()(cnn2)

    cnn3 = keras.layers.Conv1D(
        256,
        5,
        strides=1,
        padding='same',
        activation='relu', # added for consistency with the other two branches
        kernel_initializer=kernel_initializer
    )(inputs)
    cnn3 = keras.layers.GlobalMaxPooling1D()(cnn3)

    cnn = keras.layers.concatenate([cnn1,cnn2,cnn3],axis=-1)
    output = keras.layers.Dropout(0.2)(cnn)

    return output


# config_path: BERT config file, checkpoint_path: pre-trained checkpoint, class_nums: number of classes
def build_bert_model(config_path,checkpoint_path,class_nums):
    # load the pre-trained BERT model
    bert = build_transformer_model(config_path=config_path,checkpoint_path=checkpoint_path,
                            model='bert',return_keras_model=False)
    # BERT's input is [CLS] token1 token2 token3 ... [SEP]
    # we take the CLS vector from the output; BERT outputs a 768-dim vector per position
    # a Lambda layer picks the first position of every sequence, where CLS sits;
    # without the textCNN branch, this CLS vector could go straight into a dense classifier
    cls_features = keras.layers.Lambda(
        lambda x:x[:,0],
        name='cls_token'
    )(bert.model.output) # shape=[batch_size,768]

    # drop CLS and SEP (keep the first to the second-to-last column) and take the remaining token embeddings,
    # i.e. the embedding matrix of the input after the encoder; this matrix is passed to the textCNN
    all_token_embedding = keras.layers.Lambda(
        lambda x:x[:,1:-1],
        name='all_token'
    )(bert.model.output) # shape=[batch_size,maxlen-2,768]

    cnn_features = textcnn(all_token_embedding,bert.initializer) # shape=[batch_size,cnn_output_dim]
    # after the CNN extracts features, concatenate them with the CLS feature and feed a dense classifier
    concat_features = keras.layers.concatenate([cls_features,cnn_features],axis=-1) # concatenate along the feature (last) dimension

    dense = keras.layers.Dense(
        units=512,
        activation='relu',
        kernel_initializer=bert.initializer
    )(concat_features)

    output = keras.layers.Dense(
        units=class_nums,
        activation='softmax',
        kernel_initializer=bert.initializer
    )(dense)

    model = keras.models.Model(bert.model.input,output)
    model.summary()

    return model

if __name__ == '__main__':
    config_path = './bert_weights/rbt3/bert_config_rbt3.json'
    checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
    class_nums = 13  # adjust the number of classes for your task

    build_bert_model(config_path, checkpoint_path, class_nums)
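
If you would rather use an RNN than the textCNN branch, as mentioned above, the sketch below shows a minimal BiLSTM feature extractor; the 128 hidden units and 0.2 dropout are assumptions rather than tuned values. It consumes the same all_token_embedding tensor and can replace the textcnn call inside build_bert_model.

def bilstm_features(inputs,kernel_initializer):
    '''
    Sketch of an RNN alternative to textcnn()
    inputs: shape=[batch_size,maxlen-2,768] (the all_token_embedding tensor)
    returns: shape=[batch_size,256]
    '''
    rnn = keras.layers.Bidirectional(
        keras.layers.LSTM(
            128, # hidden units per direction (assumed value, tune for your task)
            return_sequences=False, # keep only the final states as the sentence feature
            kernel_initializer=kernel_initializer
        )
    )(inputs) # shape=[batch_size,256]
    output = keras.layers.Dropout(0.2)(rnn)
    return output

# usage inside build_bert_model, replacing the textcnn call:
# cnn_features = bilstm_features(all_token_embedding,bert.initializer)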

3. Train

Training code from the Bilibili tutorial.

#! -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import json
import pandas as pd
import numpy as np

from bert4keras.backend import keras
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding,DataGenerator
from sklearn.metrics import classification_report
from bert4keras.optimizers import Adam

from BERT_textCNN import build_bert_model
from data_process_CMID import load_data

# hyperparameters and config paths
class_nums = 13 # adjust the number of classes for your task
maxlen = 128 # chosen from the text-length distribution of the data
batch_size = 32

config_path = './bert_weights/rbt3/bert_config_rbt3.json'
checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
dict_path = './bert_weights/rbt3/vocab.txt'

tokenizer = Tokenizer(dict_path)
# subclass bert4keras's DataGenerator
class data_generator(DataGenerator):
    '''
    Data generator
    '''

    def __iter__(self,random=False):
        batch_token_ids,batch_segment_ids,batch_labels = [],[],[]
        for is_end,(text,label) in self.sample(random):
            token_ids,segment_ids = tokenizer.encode(text,maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids,batch_segment_ids],batch_labels # [model inputs], labels
                batch_token_ids,batch_segment_ids,batch_labels = [],[],[] # reset for the next batch

if __name__ == '__main__':
    # load the datasets
    train_data = load_data('train.csv')
    test_data = load_data('test.csv')


    # wrap the datasets in generators
    train_generator = data_generator(train_data,batch_size)
    test_generator = data_generator(test_data,batch_size)

    model = build_bert_model(config_path,checkpoint_path,class_nums)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(5e-6),
        metrics=['accuracy']
    )

    earlystop = keras.callbacks.EarlyStopping(
        monitor='val_acc', # depending on the Keras version this metric may be logged as 'val_accuracy'
        patience=2,
        verbose=2,
        mode='max'
    )
    best_model_filepath = 'best_model.weights'

    if os.path.exists(best_model_filepath):
        print('---------------load the model---------------')
        model.load_weights(best_model_filepath)

    checkpoint = keras.callbacks.ModelCheckpoint(
        best_model_filepath,
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max'
    )
    # train with the generators
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=10,
        validation_data=test_generator.forfit(),
        validation_steps=len(test_generator),
        shuffle=True,
        callbacks=[earlystop,checkpoint] # earlystop was defined above but never passed in the original code
    )

    # reload the best checkpoint written by ModelCheckpoint
    # (calling model.save_weights() here would overwrite it with the final-epoch weights)
    model.load_weights(best_model_filepath)

    test_pred = []
    for x,y in test_generator:
        p = model.predict(x).argmax(axis=1)
        test_pred.extend(p)

    test_true = test_data[:,1].tolist()
    print(set(test_true))
    print(set(test_pred))

    target_names = [line.strip() for line in open('./github_bert_inten_recognition/label.txt','r',encoding='utf-8')]
    print(classification_report(test_true,test_pred,target_names=target_names))

4. Predict

Use the trained model to predict labels. The changes here are purely for convenience; as long as it runs it is fine, and it may not be the best approach.

#! -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

import json
import pandas as pd
import numpy as np

from bert4keras.backend import keras
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding,DataGenerator
from sklearn.metrics import classification_report
from bert4keras.optimizers import Adam

from BERT_textCNN import build_bert_model
from data_process_CMID import load_data

# hyperparameters and config paths
class_nums = 13 # adjust the number of classes for your task
maxlen = 128 # chosen from the text-length distribution of the data
batch_size = 32

config_path = './bert_weights/rbt3/bert_config_rbt3.json'
checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
dict_path = './bert_weights/rbt3/vocab.txt'

tokenizer = Tokenizer(dict_path)
# subclass bert4keras's DataGenerator
class data_generator(DataGenerator):
    '''
    Data generator
    '''

    def __iter__(self,random=False):
        batch_token_ids,batch_segment_ids,batch_labels = [],[],[]
        for is_end,(text,label) in self.sample(random):
            token_ids,segment_ids = tokenizer.encode(text,maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids,batch_segment_ids],batch_labels # [model inputs], labels
                batch_token_ids,batch_segment_ids,batch_labels = [],[],[] # reset for the next batch

if __name__ == '__main__':
    # load the datasets (train/test are not needed for prediction; they are only kept over from the training script)
    train_data = load_data('train.csv')
    test_data = load_data('test.csv')
    # data to predict
    pred_data = load_data('待预测标签的文本')


    # wrap the datasets in generators
    train_generator = data_generator(train_data,batch_size)
    test_generator = data_generator(test_data,batch_size)
    # prediction generator
    pred_generator = data_generator(pred_data, batch_size)

    model = build_bert_model(config_path,checkpoint_path,class_nums)
    best_model_filepath = 'best_model.weights'

    if os.path.exists(best_model_filepath):
        print('---------------load the model---------------')
        model.load_weights(best_model_filepath)
    
    test_pred = []

    for x,_ in pred_generator:
        p = model.predict(x).argmax(axis=1)
        test_pred.extend(p)
    print(test_pred)

    df = pd.read_csv('待预测标签的文本')
    df['label'] = test_pred # replace the placeholder zeros with the predicted labels
    df.to_csv('预测结果.csv',index=False,encoding='utf-8_sig')
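
The output CSV stores numeric label ids. If you also want the readable label names, a small follow-up sketch, assuming the same label.txt that was used during training, can be appended inside the same __main__ block right after the code above:

    # map the predicted ids back to the label names in label.txt
    label_list = [line.strip() for line in open('./github_bert_inten_recognition/label.txt','r',encoding='utf-8')]
    id2label = {idx:label for idx,label in enumerate(label_list)}
    df['label_name'] = [id2label[i] for i in test_pred]
    df.to_csv('预测结果.csv',index=False,encoding='utf-8_sig')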



