【I wrote this as a set of notes; if anything here is wrong, please do tell me, thanks】
Text classification with the bert4keras library and textCNN
bert4keras: official GitHub link, which lists the required keras and tensorflow versions
Versions other than the officially pinned ones sometimes work as well; if something errors out, a quick search usually sorts it, since installing the libraries is a minor issue
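If you are not sure whether your installed versions are compatible, a quick sanity check like the sketch below can save some debugging; the version combinations mentioned in its comments are only ones I assume to be common, so always defer to the official bert4keras README.

# A minimal environment check (the version combinations in these comments are an
# assumption, not an official guarantee -- check the bert4keras README):
#   - keras 2.3.1 together with tensorflow 1.14/1.15, or
#   - a tensorflow 2.x release with the environment variable TF_KERAS=1 set before
#     importing bert4keras, so that it uses tf.keras instead of standalone keras
import tensorflow as tf
import keras
import bert4keras
print('tensorflow:', tf.__version__)
print('keras:', keras.__version__)
print('bert4keras:', bert4keras.__version__)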
BERT pretrained model download: GitHub link
Bilibili: KBQA project walkthrough, episode 9 - intent recognition and text classification with bert+textcnn
Below is the code as I modified it for my own use, with label prediction using the trained model added
The pred_data function is the new addition
import json
import pandas as pd
def gen_training_data(raw_data_path):
    # read the label list, one label per line
    label_list = [line.strip() for line in open('./github_bert_inten_recognition/label.txt','r',encoding='utf-8')]
    print(label_list)
    label2id = {label: idx for idx, label in enumerate(label_list)}
    data = []
    with open('./github_bert_inten_recognition/CMID_datasets.json','r',encoding='utf-8') as f:
        origin_data = f.read()
        origin_data = eval(origin_data)  # a list of dicts: [{},{},...]
    label_set = set()
    for item in origin_data:  # each item is a dict
        text = item['originalText']
        label_class = item['label_4class'][0].strip("'")
        if label_class == '其他':
            data.append([text, label_class, label2id[label_class]])
            continue
        label_class = item['label_36class'][0].strip("'")
        label_set.add(label_class)
        if label_class not in label_list:  # keep the sample only if its label is one we need; otherwise skip it
            # label_class = '其他'
            continue
        data.append([text, label_class, label2id[label_class]])
    print('Labels appearing in the dataset:', label_set)
    data = pd.DataFrame(data, columns=['text','label_class','label'])
    print('Sample count per class')
    print(data['label_class'].value_counts())  # count the occurrences of each label
    data['text_len'] = data['text'].map(lambda x: len(x))  # length of each raw text, needed to pick maxlen for training
    print(data['text_len'].describe())
    import matplotlib.pyplot as plt
    plt.hist(data['text_len'], bins=30, rwidth=0.9, density=True)
    plt.show()
    del data['text_len']
    # DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)
    data = data.sample(frac=1.0)  # sample() draws random rows and returns a new object of the same type; frac=1.0 shuffles all rows
    train_num = int(0.9 * len(data))
    train, test = data[:train_num], data[train_num:]
    train.to_csv('train.csv', index=False)
    test.to_csv('test.csv', index=False)
def load_data(filename):
    '''
    Load the data.
    One record per row: (text, label id)
    '''
    df = pd.read_csv(filename, header=0)
    return df[['text','label']].values
# New addition: prepare a file of unlabeled texts for prediction
def pred_data(file):
    '''
    Prepare a CSV so that the trained model can predict labels for new data
    '''
    df = pd.read_csv(file)
    # add a label column filled with zeros and save it back to the file;
    # the predicted labels will replace this column later
    df['label'] = [0] * len(df)
    df.to_csv(file, index=False, encoding='utf-8_sig')
if __name__ == '__main__':
    data_path = './github_bert_inten_recognition/CMID_datasets.json'
    gen_training_data(data_path)
    pred_data('待预测标签的文本')  # path to the CSV of texts that still need labels
You can adapt the network architecture to the task at hand, e.g. swap the CNN for an RNN or add/remove layers (see the RNN sketch right after the model code below); just keep an eye on the tensor dimensions between layers, and print the shapes if in doubt. As for the hyperparameters, tuning them is a matter of patience.
#! -*- coding: utf-8 -*-
# declare the file encoding
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
from bert4keras.backend import keras,set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
set_gelu('tanh')
def textcnn(inputs, kernel_initializer):
    # three parallel Conv1D branches with kernel sizes 3, 4 and 5
    cnn1 = keras.layers.Conv1D(
            256,
            3,
            strides=1,
            padding='same',
            activation='relu',
            kernel_initializer=kernel_initializer
        )(inputs)  # shape=[batch_size, maxlen-2, 256]
    cnn1 = keras.layers.GlobalMaxPooling1D()(cnn1)  # shape=[batch_size, 256]
    cnn2 = keras.layers.Conv1D(
            256,
            4,
            strides=1,
            padding='same',
            activation='relu',
            kernel_initializer=kernel_initializer
        )(inputs)
    cnn2 = keras.layers.GlobalMaxPooling1D()(cnn2)
    cnn3 = keras.layers.Conv1D(
            256,
            5,
            strides=1,
            padding='same',
            activation='relu',  # added for consistency with cnn1/cnn2; the original omitted the activation here
            kernel_initializer=kernel_initializer
        )(inputs)
    cnn3 = keras.layers.GlobalMaxPooling1D()(cnn3)
    cnn = keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    output = keras.layers.Dropout(0.2)(cnn)
    return output
# config_path: BERT config file, checkpoint_path: pretrained checkpoint, class_nums: number of classes
def build_bert_model(config_path, checkpoint_path, class_nums):
    # load the pretrained BERT model
    bert = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        model='bert',
        return_keras_model=False
    )
    # BERT's input is [CLS] token1 token2 token3 ... [SEP]
    # Every output position is a 768-dim semantic vector; we want the one at [CLS].
    # The Lambda layer slices out the first position of every sequence, where [CLS] sits.
    # Without the textCNN branch, this CLS vector alone could feed a dense layer for classification.
    cls_features = keras.layers.Lambda(
        lambda x: x[:, 0],
        name='cls_token'
    )(bert.model.output)  # shape=[batch_size, 768]
    # Take every token except [CLS] and [SEP] (positions 1 to -2), i.e. the per-token embeddings.
    # This is effectively an embedding matrix of the input, which is what we pass to the textCNN.
    all_token_embedding = keras.layers.Lambda(
        lambda x: x[:, 1:-1],
        name='all_token'
    )(bert.model.output)  # shape=[batch_size, maxlen-2, 768]
    cnn_features = textcnn(all_token_embedding, bert.initializer)  # shape=[batch_size, cnn_output_dim]
    # concatenate the CNN features with the CLS features, then classify with dense layers
    concat_features = keras.layers.concatenate([cls_features, cnn_features], axis=-1)  # concatenate along the feature dimension
    dense = keras.layers.Dense(
        units=512,
        activation='relu',
        kernel_initializer=bert.initializer
    )(concat_features)
    output = keras.layers.Dense(
        units=class_nums,
        activation='softmax',
        kernel_initializer=bert.initializer
    )(dense)
    model = keras.models.Model(bert.model.input, output)
    print(model.summary())
    return model
if __name__ == '__main__':
    config_path = './bert_weights/rbt3/bert_config_rbt3.json'
    checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
    class_nums = 13  # change to match the number of classes in your task
    build_bert_model(config_path, checkpoint_path, class_nums)
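To illustrate the "swap the CNN for an RNN" idea mentioned above, here is a minimal sketch of a bidirectional LSTM feature extractor that could replace textcnn. The function name textrnn and the unit counts are my own assumptions; the only real requirement is that it returns a [batch_size, feature_dim] tensor so the concatenation with cls_features in build_bert_model still works.

# A hypothetical drop-in alternative to textcnn: a BiLSTM over the token embeddings.
def textrnn(inputs, kernel_initializer):
    # bidirectional LSTM over the token embeddings, shape=[batch_size, maxlen-2, 768]
    rnn = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, kernel_initializer=kernel_initializer)
    )(inputs)  # shape=[batch_size, maxlen-2, 256]
    # pool over the time dimension to get a fixed-size vector, mirroring GlobalMaxPooling1D in textcnn
    rnn = keras.layers.GlobalMaxPooling1D()(rnn)  # shape=[batch_size, 256]
    output = keras.layers.Dropout(0.2)(rnn)
    return output

# In build_bert_model you would then replace
#     cnn_features = textcnn(all_token_embedding, bert.initializer)
# with
#     cnn_features = textrnn(all_token_embedding, bert.initializer)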
Training code from the Bilibili tutorial
#! -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import json
import pandas as pd
import numpy as np
from bert4keras.backend import keras
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding,DataGenerator
from sklearn.metrics import classification_report
from bert4keras.optimizers import Adam
from BERT_textCNN import build_bert_model
from data_process_CMID import load_data
# hyperparameters and config files
class_nums = 13  # change to match the number of classes in your task
maxlen = 128  # chosen by looking at the text-length distribution of the data
batch_size = 32
config_path = './bert_weights/rbt3/bert_config_rbt3.json'
checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
dict_path = './bert_weights/rbt3/vocab.txt'
tokenizer = Tokenizer(dict_path)
# subclass bert4keras's DataGenerator
class data_generator(DataGenerator):
    '''
    Data generator
    '''
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels  # [model inputs], labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []  # reset for the next batch
if __name__ == '__main__':
    # load the datasets
    train_data = load_data('train.csv')
    test_data = load_data('test.csv')
    # wrap them in generators
    train_generator = data_generator(train_data, batch_size)
    test_generator = data_generator(test_data, batch_size)

    model = build_bert_model(config_path, checkpoint_path, class_nums)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(5e-6),
        metrics=['accuracy']
    )

    earlystop = keras.callbacks.EarlyStopping(
        monitor='val_acc',
        patience=2,
        verbose=2,
        mode='max'
    )
    best_model_filepath = 'best_model.weights'
    if os.path.exists(best_model_filepath):
        print('---------------load the model---------------')
        model.load_weights(best_model_filepath)
    checkpoint = keras.callbacks.ModelCheckpoint(
        best_model_filepath,
        monitor='val_acc',
        verbose=1,
        save_best_only=True,
        mode='max'
    )

    # train by feeding the generators
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=10,
        validation_data=test_generator.forfit(),
        validation_steps=len(test_generator),
        shuffle=True,
        callbacks=[checkpoint]
    )
    model.save_weights(best_model_filepath)

    # evaluate on the test set with the best weights
    model.load_weights('best_model.weights')
    test_pred = []
    test_true = []
    for x, y in test_generator:
        p = model.predict(x).argmax(axis=1)
        test_pred.extend(p)
    test_true = test_data[:, 1].tolist()
    print(set(test_true))
    print(set(test_pred))

    target_names = [line.strip() for line in open('./github_bert_inten_recognition/label.txt','r',encoding='utf-8')]
    print(classification_report(test_true, test_pred, target_names=target_names))
Using the trained model to predict labels. The changes here were made purely for convenience; they run fine, but may not be the best way to do it.
#! -*- coding: utf-8 -*-
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import json
import pandas as pd
import numpy as np
from bert4keras.backend import keras
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import sequence_padding,DataGenerator
from sklearn.metrics import classification_report
from bert4keras.optimizers import Adam
from BERT_textCNN import build_bert_model
from data_process_CMID import load_data
# hyperparameters and config files
class_nums = 13  # change to match the number of classes in your task
maxlen = 128  # chosen by looking at the text-length distribution of the data
batch_size = 32
config_path = './bert_weights/rbt3/bert_config_rbt3.json'
checkpoint_path = './bert_weights/rbt3/bert_model.ckpt'
dict_path = './bert_weights/rbt3/vocab.txt'
tokenizer = Tokenizer(dict_path)
# subclass bert4keras's DataGenerator
class data_generator(DataGenerator):
    '''
    Data generator
    '''
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels  # [model inputs], labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []  # reset for the next batch
if __name__ == '__main__':
    # load the datasets
    train_data = load_data('train.csv')
    test_data = load_data('test.csv')
    # data whose labels we want to predict
    pred_data = load_data('待预测标签的文本')
    # wrap them in generators
    train_generator = data_generator(train_data, batch_size)
    test_generator = data_generator(test_data, batch_size)
    # generator for the prediction data
    pred_generator = data_generator(pred_data, batch_size)

    model = build_bert_model(config_path, checkpoint_path, class_nums)

    best_model_filepath = 'best_model.weights'
    if os.path.exists(best_model_filepath):
        print('---------------load the model---------------')
        model.load_weights(best_model_filepath)

    test_pred = []
    for x, _ in pred_generator:
        p = model.predict(x).argmax(axis=1)
        test_pred.extend(p)
    print(test_pred)

    df = pd.read_csv('待预测标签的文本')
    df['label'] = test_pred  # replace the all-zero label column with the predicted labels
    df.to_csv('预测结果.csv', index=False, encoding='utf-8_sig')
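The prediction script only writes numeric label ids into 预测结果.csv. If you also want the readable label names, a small sketch like the one below (reusing the same label.txt from the data-processing step, where the label id is simply the line index) maps them back:

# Map predicted label ids back to label names; ids are line indices of label.txt,
# matching how label2id was built in the data-processing script
import pandas as pd

id2label = {idx: line.strip() for idx, line in
            enumerate(open('./github_bert_inten_recognition/label.txt', 'r', encoding='utf-8'))}
df = pd.read_csv('预测结果.csv')
df['label_class'] = df['label'].map(id2label)
df.to_csv('预测结果.csv', index=False, encoding='utf-8_sig')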