This walkthrough is based on the paper "How to Fine-Tune BERT for Text Classification?" from Prof. Xipeng Qiu's group at Fudan University.
Paper: https://arxiv.org/pdf/1905.05583.pdf
Article: https://mp.weixin.qq.com/s/9MrgIz2bchiCjUGpz6MbGQ
The paper explores different BERT fine-tuning methods for text classification and proposes a general fine-tuning recipe. It investigates three directions:
(1) fine-tuning strategies for BERT itself, including long-text handling, learning rates, and which layers to use (a layer-wise learning-rate sketch follows the links below);
(2) further pre-training of BERT within the target task, within the domain, and across domains;
(3) multi-task learning. The fine-tuned BERT achieves state-of-the-art results on seven English datasets and the Sogou Chinese news dataset.
Authors' implementation: https://github.com/xuyige/BERT4doc-Classification
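One of the strategies in direction (1) is a layer-wise (discriminative) learning rate, where lower BERT layers receive smaller learning rates than higher ones. The sketch below shows one way to build such parameter groups; it is only an illustration, and the base learning rate of 2e-5 and decay factor of 0.95 are assumed values rather than settings taken from this project's code.
from torch.optim import AdamW
from transformers import BertModel
def layerwise_lr_groups(bert, base_lr=2e-5, decay=0.95):
    # each encoder layer gets base_lr * decay^(distance from the top layer)
    groups = []
    num_layers = bert.config.num_hidden_layers  # 12 for bert-base
    groups.append({"params": bert.embeddings.parameters(), "lr": base_lr * decay ** num_layers})
    for i, layer in enumerate(bert.encoder.layer):
        groups.append({"params": layer.parameters(), "lr": base_lr * decay ** (num_layers - 1 - i)})
    groups.append({"params": bert.pooler.parameters(), "lr": base_lr})
    return groups
# usage sketch:
# bert = BertModel.from_pretrained("bert-base-uncased")
# optimizer = AdamW(layerwise_lr_groups(bert))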
Dataset source: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset?select=train.csv
Project page: https://www.kaggle.com/shivanandmn/multilabel-classification-dataset
The dataset has six labels (computer science, physics, mathematics, statistics, biology, finance), and the task is to classify research papers from their titles and abstracts. A value of 1 in a label column means the paper belongs to that category; a single paper can have several labels set to 1, so this is a multi-label problem.
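A quick look at the raw CSV makes the label layout concrete. A minimal sketch, assuming the Kaggle file has been downloaded to data/train.csv (the path used later in get_data):
import pandas as pd
df = pd.read_csv("data/train.csv")
print(df.columns.tolist())   # ID, TITLE, ABSTRACT, then the six label columns
# number of labels per paper; values above 1 confirm the multi-label nature
print(df.iloc[:, 3:].sum(axis=1).value_counts())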
2.1 Import
# for installing torch, see https://blog.csdn.net/Checkmate9949/article/details/119494673?spm=1001.2014.3001.5501
import torch
from transformers import BertTokenizerFast as BertTokenizer
from utils.plot_results import plot_results
from resources.train_val_model import train_model
from resources.get_data import get_data
from resources.build_model import BertClassifier
from resources.test_model import test_model
from resources.build_dataloader import build_dataloader
2.2 Get data
##################################
# get data
##################################
# get_data() is defined in 2.2.1 below
train_df, val_df, test_df = get_data()
# fixed parameters
# label columns: the 4th through the second-to-last column (the last column is TEXT)
label_columns = train_df.columns.tolist()[3:-1]
num_labels = len(label_columns)
max_token_len = 30
# BERT_MODEL_NAME = "bert-base-uncased"
# bert-base-uncased is for English; use bert-base-chinese for Chinese text
BERT_MODEL_NAME = "model/bert-base-uncased"
# tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
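To see what the tokenizer will later produce inside the Dataset, a short sanity check on a single string (the sentence is only an illustration):
sample = "We propose a new method for classifying research papers."
enc = tokenizer.encode_plus(
    sample,
    add_special_tokens=True,
    max_length=max_token_len,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors="pt",
)
print(enc["input_ids"].shape)       # torch.Size([1, 30])
print(enc["attention_mask"].shape)  # torch.Size([1, 30])
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0][:10]))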
2.2.1 get_data
import pandas as pd
from sklearn.model_selection import train_test_split
def get_data():
    df = pd.read_csv("data/train.csv")
    # concatenate title and abstract into a single TEXT column
    df["TEXT"] = df["TITLE"] + df["ABSTRACT"]
    label_columns = df.columns.tolist()[3:-1]
    print(df[label_columns].sum().sort_values())
    # split off the training set: with test_size=0.8 the second returned split
    # holds 80% of the rows and becomes train_df
    test_df, train_df = train_test_split(df, test_size=0.8, random_state=42)
    # split the remaining 20% evenly into validation and test sets
    test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)
    # return the three splits
    return train_df, val_df, test_df
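A quick check that the resulting proportions are roughly 80% / 10% / 10%:
total = len(train_df) + len(val_df) + len(test_df)
print(len(train_df) / total, len(val_df) / total, len(test_df) / total)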
##################################
# build data loaders
##################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# training set: random sampling
# validation and test sets: sequential sampling
train_dataloader = build_dataloader(
train_df, label_columns, tokenizer, max_token_len, trainset=True
)
val_dataloader = build_dataloader(val_df, label_columns, tokenizer, max_token_len)
test_dataloader = build_dataloader(test_df, label_columns, tokenizer, max_token_len)
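One batch from the training loader shows the tensors the model will receive (batch_size=10 is set inside build_dataloader below):
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape)       # torch.Size([10, 30])
print(batch["attention_mask"].shape)  # torch.Size([10, 30])
print(batch["labels"].shape)          # torch.Size([10, 6]), one column per label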
build_dataloader
import os
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class text_dataset(Dataset):
    def __init__(self, df, label_columns, tokenizer, max_token_len):
        self.data = df
        self.label_columns = label_columns
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
    # number of samples in the dataset
    def __len__(self):
        return len(self.data)
    # fetch one sample by index
    def __getitem__(self, index):
        data_row = self.data.iloc[index]
        text = data_row["TEXT"]
        labels = data_row[self.label_columns]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return dict(
            text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.FloatTensor(labels),
        )
def build_dataloader(df, label_columns, tokenizer, max_token_len, trainset=False):
    dataset = text_dataset(df, label_columns, tokenizer, max_token_len)
    # training set: draw samples in random order
    if trainset:
        sampler = RandomSampler(dataset)
    # validation/test sets: draw samples sequentially
    else:
        sampler = SequentialSampler(dataset)
    return DataLoader(dataset, batch_size=10, sampler=sampler)
##################################
# build model
##################################
bert_classifier = BertClassifier(
num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME, freeze_bert=False
)
import torch
import torch.nn as nn
from transformers import BertModel
class BertClassifier(nn.Module):
    def __init__(self, num_labels: int, BERT_MODEL_NAME, freeze_bert=False):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME)
        # hidden size of BERT, hidden size of our classifier, and number of labels to classify
        D_in, H, D_out = self.bert.config.hidden_size, 50, num_labels
        # Instantiate a small two-layer feed-forward classifier head
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.3),
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H, D_out),
        )
        # multi-label loss; BCEWithLogitsLoss applies the sigmoid internally
        self.loss_func = nn.BCEWithLogitsLoss()
        if freeze_bert:
            print("freezing bert parameters")
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Extract the last hidden state of the `[CLS]` token for classification
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        if labels is not None:
            # pass the raw logits: BCEWithLogitsLoss applies the sigmoid itself
            loss = self.loss_func(
                logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
            )
            return loss
        else:
            return logits
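A quick forward pass with dummy inputs confirms the output shapes; the token ids below are random, so this is purely a shape check, not a meaningful prediction:
dummy_ids = torch.randint(0, bert_classifier.bert.config.vocab_size, (2, max_token_len))
dummy_mask = torch.ones_like(dummy_ids)
with torch.no_grad():
    logits = bert_classifier(dummy_ids, attention_mask=dummy_mask)
print(logits.shape)  # (2, num_labels): one logit per label
dummy_labels = torch.randint(0, 2, (2, num_labels)).float()
with torch.no_grad():
    loss = bert_classifier(dummy_ids, attention_mask=dummy_mask, labels=dummy_labels)
print(loss.item())  # scalar BCE-with-logits loss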
##################################
# train and validate model
##################################
trained_model, training_stats, train_loss_set = train_model(
bert_classifier,
train_dataloader,
val_dataloader=val_dataloader,
epochs=5,
evaluation=True,
)
plot_results(training_stats, train_loss_set)
import time
import random
import numpy as np
import torch
from utils.helper_functions import format_time
from transformers import AdamW, get_linear_schedule_with_warmup
def train_model(
model, train_dataloader, val_dataloader=None, epochs=5, evaluation=False
):
"""Train and validate the BertClassifier model."""
training_stats = []
train_loss_set = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer, scheduler = build_optimizer_scheduler(
model=model, epochs=epochs, train_dataloader=train_dataloader
)
print("Start training...\n")
for epoch_i in range(epochs):
# =======================================
# Training
# =======================================
print(
f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}"
)
print("-" * 70)
t0 = time.time()
t0_epoch, t0_batch = time.time(), time.time()
total_loss, batch_loss, batch_counts = 0, 0, 0
model.train()
for step, batch in enumerate(train_dataloader):
batch_counts += 1
b_input_ids = batch["input_ids"].to(device)
b_attention_mask = batch["attention_mask"].to(device)
b_labels = batch["labels"].to(device)
model.zero_grad()
loss = model(
input_ids=b_input_ids,
attention_mask=b_attention_mask,
labels=b_labels,
)
batch_loss += loss.item()
total_loss += loss.item()
train_loss_set.append(loss.item())
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
scheduler.step()
# Print the loss values and time elapsed for every 20 batches
if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
time_elapsed = time.time() - t0_batch
print(
f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}"
)
# Reset batch tracking variables
batch_loss, batch_counts = 0, 0
t0_batch = time.time()
# Calculate the average loss over the entire training data
avg_train_loss = total_loss / len(train_dataloader)
training_time = format_time(time.time() - t0)
print("-" * 70)
# =======================================
# Evaluation
# =======================================
        if evaluation:
avg_val_loss, avg_val_accuracy, validation_time = evaluate(
model, val_dataloader
)
time_elapsed = time.time() - t0_epoch
print(
f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_val_loss:^10.6f} | {avg_val_accuracy:^9.2f} | {time_elapsed:^9.2f}"
)
print("-" * 70)
            # save the model whenever the validation loss improves
            if (
                len(training_stats) == 0
                or training_stats[-1]["Valid. Loss"] > avg_val_loss
            ):
model_dir = "model/model.pt"
torch.save(model.state_dict(), model_dir)
training_stats.append(
{
"epoch": epoch_i + 1,
"Training Loss": avg_train_loss,
"Valid. Loss": avg_val_loss,
"Valid. Accur.": avg_val_accuracy,
"Training Time": training_time,
"Validation Time": validation_time,
}
)
print("\n")
print("Training complete!")
return model, training_stats, train_loss_set
def evaluate(model, val_dataloader):
"""After the completion of each training epoch, measure the model's performance
on our validation set.
"""
    # Put the model into evaluation mode; dropout layers are disabled during evaluation.
t0 = time.time()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
# Tracking variables
avg_val_accuracy = []
avg_val_loss = []
# For each batch in our validation set...
for batch in val_dataloader:
b_input_ids = batch["input_ids"].to(device)
b_attention_mask = batch["attention_mask"].to(device)
b_labels = batch["labels"].to(device)
# Compute logits
with torch.no_grad():
logits = model(b_input_ids, b_attention_mask)
        # Compute the loss on the raw logits (BCEWithLogitsLoss applies the sigmoid internally)
        loss = model.loss_func(
            logits.view(-1, model.num_labels), b_labels.view(-1, model.num_labels)
        )
        avg_val_loss.append(loss.item())
        # Threshold the sigmoid probabilities at 0.5 to get 0/1 predictions
        predictions = torch.sigmoid(logits)
        preds = torch.round(predictions)
# Calculate the accuracy rate
accuracy = (preds == b_labels).cpu().numpy().mean() * 100
avg_val_accuracy.append(accuracy)
# Compute the average accuracy and loss over the validation set.
avg_val_loss = np.mean(avg_val_loss)
avg_val_accuracy = np.mean(avg_val_accuracy)
validation_time = format_time(time.time() - t0)
return avg_val_loss, avg_val_accuracy, validation_time
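Element-wise accuracy (the share of correct 0/1 decisions across all labels) can look optimistic on sparse multi-label data, so per-label precision/recall/F1 are often reported as well. A small sketch using scikit-learn, which is an extra dependency not used by the training code above:
from sklearn.metrics import f1_score, classification_report
def multilabel_report(model, dataloader, label_columns, device, threshold=0.5):
    model.eval()
    all_preds, all_labels = [], []
    for batch in dataloader:
        with torch.no_grad():
            logits = model(batch["input_ids"].to(device), batch["attention_mask"].to(device))
        probs = torch.sigmoid(logits).cpu().numpy()
        all_preds.append((probs >= threshold).astype(int))
        all_labels.append(batch["labels"].numpy().astype(int))
    y_pred = np.vstack(all_preds)
    y_true = np.vstack(all_labels)
    print("micro F1:", f1_score(y_true, y_pred, average="micro"))
    print(classification_report(y_true, y_pred, target_names=label_columns, zero_division=0))
# usage sketch: multilabel_report(model, val_dataloader, label_columns, device)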
def build_optimizer_scheduler(model, epochs, train_dataloader):
# setting custom optimization parameters for huggingface model and implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.weight"]  # HF BERT parameter names use LayerNorm.weight/bias rather than gamma/beta
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
optimizer = AdamW(
optimizer_grouped_parameters,
lr=5e-5, # Default learning rate
eps=1e-8, # Default epsilon value
)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0, # Default value
num_training_steps=total_steps,
)
return optimizer, scheduler
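The scheduler above uses no warmup steps. The paper (following the original BERT fine-tuning recipe) uses a small warmup proportion, typically around 10% of the total steps; inside build_optimizer_scheduler the scheduler could instead be created as follows (the 0.1 proportion is an illustrative choice):
    warmup_proportion = 0.1
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * warmup_proportion),
        num_training_steps=total_steps,
    )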
##################################
# test model
##################################
test_model(
test_dataloader=test_dataloader,
BERT_MODEL_NAME=BERT_MODEL_NAME,
num_labels=num_labels,
label_columns=label_columns,
)
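The implementation of test_model is not shown in this post. A minimal sketch of what it might look like, assuming the best weights were saved to model/model.pt by train_model and that BertClassifier and evaluate are importable from the modules above:
def test_model(test_dataloader, BERT_MODEL_NAME, num_labels, label_columns):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # rebuild the architecture and load the checkpoint saved during training
    model = BertClassifier(num_labels=num_labels, BERT_MODEL_NAME=BERT_MODEL_NAME)
    model.load_state_dict(torch.load("model/model.pt", map_location=device))
    model.to(device)
    # reuse the validation routine on the held-out test set;
    # label_columns could additionally feed a per-label report such as multilabel_report above
    test_loss, test_accuracy, test_time = evaluate(model, test_dataloader)
    print(f"Test loss: {test_loss:.6f} | Test accuracy: {test_accuracy:.2f}% | Time: {test_time}")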