我们可以利用下面的代码来训练一个 sequence 分类器
import torch
# AdamW is imported from torch.optim: the copy that used to live in `transformers`
# was deprecated in favour of this one. NOTE(review): the two implementations may
# differ in default hyper-parameters (e.g. weight decay) -- confirm if exact
# reproduction of earlier results matters.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Pad and truncate the two sentences into one batch of PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
# Supplying "labels" in the batch is what lets `.loss` be read from the model output below.
batch["labels"] = torch.tensor([1, 1])

# A single optimisation step on this one batch.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()
显然,不可能用两个句子就能得到好的结果,因此我们需要更大的数据集
Hugging Face 也保存了很多数据集,可以通过 load_dataset
来下载
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc") # GLUE benchmark 中的 MRPC数据集
raw_datasets
> DatasetDict({
train: Dataset({
features: ['sentence1', 'sentence2', 'label', 'idx'],
num_rows: 3668
})
validation: Dataset({
features: ['sentence1', 'sentence2', 'label', 'idx'],
num_rows: 408
})
test: Dataset({
features: ['sentence1', 'sentence2', 'label', 'idx'],
num_rows: 1725
})
})
我们可以通过索引来访问每一对句子
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]
>{'idx': 0,
'label': 1,
'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'}
如果想知道数据集的每个部分的含义,可以通过 features
属性来查看
raw_train_dataset.features
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
'idx': Value(dtype='int32', id=None)}
tokenizer 可以直接处理成对的数据,就是BERT
希望的那样
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs
>{
'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
token_type_ids
是用来区分第一个句子和第二个句子的,这未必在每个 tokenizer 中都有。只有当模型知道如何处理它们时,它们才会返回,因为它在预训练期间已经看到了它们。
BERT
在预训练时用到了 token_type_ids。
我们可以让 tokenizer 处理成对句子的列表(a list of pairs of sentences):分别传入第一个句子的列表和第二个句子的列表即可
tokenized_dataset = tokenizer(
raw_datasets["train"]["sentence1"],
raw_datasets["train"]["sentence2"],
padding=True,
truncation=True,
)
这样返回的是一个字典。我们可以通过 Dataset.map() 将分词结果添加到原来的数据集当中去:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) held in ``example``.

    Uses the module-level ``tokenizer`` with truncation enabled. No padding is
    requested here; padding is left to the collate step.

    Args:
        example: Mapping with ``"sentence1"`` and ``"sentence2"`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the sentence pair(s).
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Apply the tokenizer to every split; batched=True is passed through to map().
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
负责将samples放到一个batch的函数是 collate function
(是DataLoader的一个参数,默认将samples转成pytorch的tensor,并把他们连接起来)
我们故意推迟填充,只在每批处理中应用它,并避免有大量填充的过长输入。
为了在实践中做到这一点,我们必须定义一个collate函数,该函数将对我们想要批处理的数据集中的项应用正确的填充量
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Take the first 8 training examples as a sample batch.
samples = tokenized_datasets["train"][:8]
# Drop the raw-text and index columns before collating: they are strings / ids
# that the padding collator cannot turn into tensors.
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
batch = data_collator(samples)
transformers提供了Trainer class来帮助在自己的数据上fine-tune预训练模型,当做完了数据处理,只剩一些定义 Trainer 的步骤
我们总结之前的操作
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
# Load GLUE/MRPC and the tokenizer matching the checkpoint.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair(s) in ``example`` with truncation.

    No padding is requested here; it is left to ``data_collator`` below.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Tokenize every split, then build the collator that pads each batch.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
在定义trainer之前,需要定义TrainingArguments类,这包括所有的参数。我们只用提供模型要保存的位置,其他参数保持默认也能训练的不错
from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer")
定义一个模型
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
定义一个Trainer
from transformers import Trainer
# Build the Trainer from the model, the training arguments, the tokenized
# train/validation splits, the padding collator and the tokenizer.
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
)
训练
trainer.train()
如果不提供 compute_metrics() 函数
来在 evaluate 期间计算一个 metric,训练时只会返回 loss,这并不直观。我们可以使用 Trainer.predict() 来使我们的模型进行预测:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)
>(408, 2) (408,)
为了将我们的预测结果与真正的标签进行比较,我们需要在第二个轴上取最大值的索引:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)
建立 compute_metrics() 函数时,我们可以使用 Evaluate 库
import evaluate
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)
> {'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}
最后将所有东西整合起来,我们得到 compute_metrics() 函数
def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metric for one evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` on the arg-max predictions and the
        reference labels.
    """
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
此时我们可以定义新的Trainer
# Same output directory as the earlier TrainingArguments, now with
# evaluation_strategy="epoch".
# NOTE(review): newer transformers releases appear to have renamed this argument
# to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# Load a fresh model from the checkpoint, configured with num_labels=2.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Same Trainer arguments as the earlier one, plus compute_metrics.
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
数据处理的简短总结
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
# Dataset and tokenizer setup.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Return the truncated tokenization of the sentence pair(s) in ``example``.

    Reads ``example["sentence1"]`` and ``example["sentence2"]`` and passes them
    to the module-level ``tokenizer``; padding is not applied at this stage.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Tokenize all splits, then create the collator used to pad each batch.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
我们需要对 tokenized_datasets 做一些处理,具体来说:
- 删除模型不需要的列(如 sentence1 和 sentence2 列)。
- 将 label 列重命名为 labels(因为模型期望的参数名是 labels)。
- 设置数据集的格式,使其返回 PyTorch 张量而不是列表。
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names
加载 DataLoader
from torch.utils.data import DataLoader
# Training loader: shuffled, batches of 8, collated with data_collator.
train_dataloader = DataLoader(
tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
# Validation loader: same batch size and collator, no shuffle argument.
eval_dataloader = DataLoader(
tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)
加载模型
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
模型加载到 gpu 上
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
加载 optimizer
# Use PyTorch's AdamW: the copy that used to be exported by `transformers` was
# deprecated in favour of torch.optim.AdamW.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
加载 scheduler
from transformers import get_scheduler
num_epochs = 3
# Total optimisation steps = number of epochs x number of batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule over all training steps, with zero warm-up steps.
lr_scheduler = get_scheduler(
"linear",
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=num_training_steps,
)
训练循环
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))  # progress bar

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Update the weights, advance the LR schedule, then clear gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
evaluate循环
import evaluate
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    # Move every tensor of the batch to the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Accumulate this batch; the final score is computed once after the loop.
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()
使用 add_batch()
方法进行预测循环时,实际上该指标可以为我们累积所有 batch
的结果。一旦我们累积了所有 batch
,我们就可以使用 metric.compute()
得到最终结果。

使用 Accelerate 加速训练循环:一个完整的训练 - Hugging Face Course