# BERT training script (personal use)
#%% Imports
from transformers import BertTokenizer,BertModel,BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.utils import shuffle
from plt_score import plt_roc_pr
#%% Load data
import os,sys
# print('current path:',os.getcwd())
os.chdir(sys.path[0])  # make relative paths resolve from the script's directory
# print('current path:',os.getcwd())
# df = pd.read_excel('../../data/label_data_language_en.xlsx')
df = pd.read_excel('../../data/label_data_language_en_aug.xlsx')  # read the augmented dataset
print(df['gf_review_fix'].value_counts())  # label distribution
df['content'] = df['content'].str.lower().fillna('test')  # lowercase everything; fill missing text with a placeholder
print(df.isnull().sum())  # check for remaining missing values
df = shuffle(df, random_state=1115)  # shuffle twice
df = shuffle(df, random_state=930)
#%% Text preprocessing
start_token = time.time()
x = list(df['content'])
y = list(df['gf_review_fix'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1634, stratify=y)
x_vali, x_test, y_vali, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=1111, stratify=y_test)
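# Optional sanity check (an added sketch, not in the original): confirm the
# stratified splits preserve the class balance across train/vali/test.
for name_, labels_ in [('train', y_train), ('vali', y_vali), ('test', y_test)]:
    print(name_, len(labels_), pd.Series(labels_).value_counts(normalize=True).round(3).to_dict())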
tokenizer = BertTokenizer.from_pretrained('../../model-pytorch/bert-base-uncased')
# print(tokenizer)
train_encoding = tokenizer(x_train, add_special_tokens=True,
                           padding=True, truncation=True, max_length=300, return_tensors="pt")
vali_encoding = tokenizer(x_vali, add_special_tokens=True,
                          padding=True, truncation=True, max_length=300, return_tensors="pt")
test_encoding = tokenizer(x_test, add_special_tokens=True,
                          padding=True, truncation=True, max_length=300, return_tensors="pt")
# print(type(train_encoding))
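# The BatchEncoding returned by the tokenizer is dict-like, holding
# 'input_ids', 'token_type_ids', and 'attention_mask', each a LongTensor of
# shape (num_examples, padded_length). A quick look (added for illustration):
print({k: tuple(v.shape) for k, v in train_encoding.items()})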
#%% Dataset class
# print(x)
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    # __getitem__ lets the DataLoader batch the dataset into an iterator,
    # mapping each idx to the corresponding sample
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    def __len__(self):
        return len(self.labels)
train_dataset = NewsDataset(train_encoding, y_train)
vali_dataset = NewsDataset(vali_encoding, y_vali)
test_dataset = NewsDataset(test_encoding, y_test)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# keep shuffle=False for the validation/test loaders: validation() collects
# predictions in loader order, and the metrics compare them against y_vali /
# y_test in their original order, so shuffling here would scramble the scores
vali_loader = DataLoader(vali_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
end_token = time.time()
print('Tokenization and DataLoader setup time: {} seconds'.format(end_token - start_token))
# Peek at what a batch looks like:
# batch = next(iter(train_loader))
# print(batch)
# print(batch['input_ids'].shape)
#%% Build the model
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        model_class, tokenizer_class, pretrained_weights = (BertModel, BertTokenizer,
                                                            "../../model-pytorch/bert-base-uncased/")
        # self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        # BERT, the inputs, and the dense head all have to sit on the same device
        # To resume training from the **th run:
        # pretrained_weights = '../model_save/uncase_bert_stratify_19'
        self.bert = torch.nn.DataParallel(model_class.from_pretrained(pretrained_weights).to(device))
        # final prediction head
        self.predictor = nn.Sequential(
            # BERT's default hidden size is 768; 2 output units for binary classification.
            # No Softmax here: nn.CrossEntropyLoss expects raw logits and applies
            # log-softmax internally, so a Softmax layer in front of it would squash
            # the gradients (probabilities are recovered in validation() instead).
            nn.Linear(768, 2).to(device),
        )
    def forward(self, input_ids, attention_mask):
        torch.cuda.empty_cache()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:, 0, :]  # hidden state of the [CLS] token
        logits = self.predictor(bert_cls_hidden_state)
        return logits
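# Aside (added note, not from the original): bert_output[0] is
# last_hidden_state; an alternative pooling is bert_output[1] (pooler_output),
# the [CLS] state passed through BERT's pretrained tanh pooling layer.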
#%% Instantiate the model, define the loss function, optimizer, and epochs
# initialization
torch.cuda.empty_cache()
bert_classifier_model = BertClassificationModel()
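# Optional smoke test (an added sketch, in the same spirit as the commented
# batch peek above): run one batch through the fresh model and check the shape.
# sample = next(iter(train_loader))
# with torch.no_grad():
#     out = bert_classifier_model(sample['input_ids'], sample['attention_mask'])
# print(out.shape)  # expected: torch.Size([32, 2]) for batch_size = 32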
# Hyperparameters -----------------
# number of epochs and learning rate
epochs = 10
lr = 1e-4  # note: 2e-5 to 5e-5 is more typical for full BERT fine-tuning
# optimizer, scheduler, loss function
optimizer = AdamW(bert_classifier_model.parameters(), lr=lr)  # switched Adam -> AdamW; dropped the frozen-layer filter (filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # default value in run_glue.py
                                            num_training_steps=total_steps)
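# For reference, a minimal sketch of the rule this scheduler applies
# (a hypothetical helper for illustration only; do not step the real scheduler
# outside the training loop, as that would advance its state). With
# num_warmup_steps=0 the LR falls linearly from lr to 0 over total_steps.
def linear_lr_at_step(step, base_lr=lr, warmup=0, total=total_steps):
    if step < warmup:
        return base_lr * step / max(1, warmup)  # linear warmup from 0 to base_lr
    return base_lr * max(0.0, (total - step) / max(1, total - warmup))  # linear decay to 0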
criterion = nn.CrossEntropyLoss().to(device)
# Bookkeeping -----------------
# per-batch training loss
batch_loss = pd.DataFrame(columns=['epoch', 'batch', 'loss'])
# validation-set metrics per epoch
epoch_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# test-set metrics per epoch
epoch_test_scores = pd.DataFrame(columns=['epoch', 'prescore', 'accscore', 'recascore', 'f1_score'])
# best score seen so far, for model checkpointing
max_f1 = 0
max_epoch = 0
#%% Training function
def train():
    bert_classifier_model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    global batch_loss
    for batch in train_loader:
        # forward pass
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(bert_classifier_model.parameters(), 1.0)  # gradient clipping, guards against exploding gradients
        # parameter update
        optimizer.step()
        scheduler.step()
        # running mean of the batch losses, further scaled by batch_size
        # (CrossEntropyLoss already averages over the batch, so this is just a
        # logging convention, not a strict per-sample loss)
        loss_each = total_train_loss / (iter_num + 1) / batch_size
        df1 = pd.DataFrame([[epoch, iter_num, loss_each]], columns=['epoch', 'batch', 'loss'])
        batch_loss = pd.concat([batch_loss, df1], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss_each, iter_num / total_iter * 100))
            batch_loss.to_csv('./result/batch_loss.csv')
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))
#%% Evaluation and testing
def validation(test_dataloader):
    bert_classifier_model.eval()
    pre_list = []
    prob_list = []
    with torch.no_grad():
        for batch in test_dataloader:
            # forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = bert_classifier_model(input_ids, attention_mask=attention_mask)
            # the model returns raw logits, so apply softmax here to get the
            # positive-class probability for the ROC/PR curves
            prob = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()
            prob_list.extend(prob.tolist())
            _, predicted = torch.max(outputs, 1)
            predicted = predicted.cpu().numpy()
            torch.cuda.empty_cache()
            pre_list.extend(predicted.tolist())
    return pre_list, prob_list
def metric_show(pre_list, targets, prob_list, epoch, figlist):
    prescore = precision_score(targets, pre_list)
    accscore = accuracy_score(targets, pre_list)
    recascore = recall_score(targets, pre_list)
    f1score = f1_score(targets, pre_list)
    print("precision: {}".format(prescore))
    print("accuracy: {}".format(accscore))
    print("recall: {}".format(recascore))
    print("f1: {}".format(f1score))
    save_path = './result/bert'
    plt_roc_pr(targets, prob_list, name=str(epoch), save_path=save_path, figlist=figlist)
    return prescore, accscore, recascore, f1score
#%% Run training
start = time.time()
for epoch in range(epochs):
    print("------------Epoch: %d ----------------" % epoch)
    start_epoch = time.time()
    train()
    # use the validation set to pick the best-performing epoch
    pre_list, prob_list = validation(vali_loader)
    prescore, accscore, recascore, f1score = metric_show(pre_list, y_vali, prob_list, epoch, figlist=[0, 1])
    epoch_scores.loc[epoch] = [epoch, prescore, accscore, recascore, f1score]
    epoch_scores.to_csv('./result/epoch_scores.csv')
    # track the best validation F1 and its epoch, and checkpoint that model
    if epoch > 5 and max_f1 < f1score:
        max_f1 = f1score
        max_epoch = epoch
        print("Best F1: {} at epoch {}".format(max_f1, max_epoch))
        save_directory = '../model_save/uncase_bert_stratify_best'
        bert_classifier_model.bert.module.save_pretrained(save_directory)
    # record test-set metrics
    pre_list, prob_list = validation(test_loader)
    t_prescore, t_accscore, t_recascore, t_f1score = metric_show(pre_list, y_test, prob_list, epoch='t' + str(epoch), figlist=[2, 3])
    epoch_test_scores.loc[epoch] = [epoch, t_prescore, t_accscore, t_recascore, t_f1score]
    epoch_test_scores.to_csv('./result/epoch_test_scores.csv')
    end_epoch = time.time()
    print('Epoch running time: {} seconds'.format(end_epoch - start_epoch))
end = time.time()
print('Total running time: {} seconds'.format(end - start))
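# Note on the checkpoint above (an added observation, not from the original):
# save_pretrained on bert_classifier_model.bert.module stores only the
# BertModel backbone; the nn.Linear classification head is not persisted.
# A sketch of saving/restoring the full classifier instead (hypothetical path):
# torch.save(bert_classifier_model.state_dict(), '../model_save/uncase_bert_stratify_best/classifier.pt')
# bert_classifier_model.load_state_dict(torch.load('../model_save/uncase_bert_stratify_best/classifier.pt'))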