This article fine-tunes the distilgpt2 model on the glue/sst2 dataset to train a next-word prediction model.
It is adapted from the video tutorial:
蓝斯诺特, 《七个实战任务,玩转自然语言处理,基于HuggingFace和PyTorch》 (Seven hands-on tasks for NLP with HuggingFace and PyTorch)
https://www.bilibili.com/video/BV1dU4y1C7so?p=2
Source code: https://github.com/lansinuote/Huggingface_Task/blob/main/1.预测最后一个词.ipynb
checkpoint = 'distilgpt2'
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
tokenizer
'''
PreTrainedTokenizerFast(name_or_path='distilgpt2',
vocab_size=50257, model_max_len=1024, is_fast=True,
padding_side='right', truncation_side='right',
special_tokens={'bos_token': '<|endoftext|>',
'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'})
'''
Test the tokenizer
# Encode two sample sentences taken from the SST-2 training set
text_arr = [
    'hide new secretions from the parental units',
    'contains no wit , only labored gags'
]
tokenizer.batch_encode_plus(text_arr)
'''
{'input_ids': [[24717, 649, 3200, 507, 422, 262, 21694, 4991], [3642, 1299, 645, 20868, 837, 691, 2248, 1850, 308, 3775]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
'''
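To check that the ids round-trip back to text, they can be decoded again. This quick check is not in the original notebook; it only reuses tokenizer and text_arr from above.
# Round-trip check: decode the encoded ids back to text (not in the original notebook)
encoded = tokenizer.batch_encode_plus(text_arr)
for ids in encoded['input_ids']:
    print(tokenizer.decode(ids))
# prints the two original sentences unchanged (GPT-2's byte-level BPE is lossless)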
# Load the dataset
from datasets import load_dataset, load_from_disk
dataset = load_dataset(path='glue', name='sst2')
Inspect the data
dataset
'''
DatasetDict({
train: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 67349
})
validation: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 872
})
test: Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 1821
})
})
'''
dataset['train']
'''
Dataset({
features: ['sentence', 'label', 'idx'],
num_rows: 67349
})
'''
dataset['train'][0]
'''
{'sentence': 'hide new secretions from the parental units ',
'label': 0,
'idx': 0}
'''
Process the data
# 1. Tokenize, and drop the columns that are no longer needed
def f(data):
    return tokenizer.batch_encode_plus(data['sentence'])

dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=4, remove_columns=['sentence', 'idx', 'label'])

# 2. Keep only sentences longer than 8 tokens; the 8th token will later serve as the label
def f(data):
    return [len(i) > 8 for i in data['input_ids']]

dataset = dataset.filter(f, batched=True, batch_size=1000, num_proc=4)

# 3. Truncate every sentence to 8 tokens and arrange the fields into the format the model expects
def f(data):
    data['input_ids'] = [i[:8] for i in data['input_ids']]
    data['attention_mask'] = [[1] * 8] * len(data['attention_mask'])
    # The input/label shift is handled inside the model, so labels can simply copy input_ids here
    data['labels'] = data['input_ids'].copy()
    return data

dataset = dataset.map(f, batched=True, batch_size=1000, num_proc=4)
Inspect the processed data
dataset
'''
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 36125
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 834
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1683
    })
})
'''
dataset['train'][0]
'''
{'input_ids': [24717, 649, 3200, 507, 422, 262, 21694, 4991],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],
'labels': [24717, 649, 3200, 507, 422, 262, 21694, 4991]}
'''
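The three map/filter passes above can also be folded into a single batched map, since a batched map may return fewer rows than it receives once the original columns are removed. The sketch below is an illustrative alternative, not part of the original notebook; preprocess and dataset_alt are made-up names and the 8-token window is the same as above.
# Illustrative single-pass alternative to the three passes above (hypothetical, not in the notebook)
from datasets import load_dataset

def preprocess(batch):
    enc = tokenizer.batch_encode_plus(batch['sentence'])
    kept = [ids[:8] for ids in enc['input_ids'] if len(ids) > 8]  # keep sentences longer than 8 tokens, truncate to 8
    return {
        'input_ids': kept,
        'attention_mask': [[1] * 8 for _ in kept],
        'labels': [ids.copy() for ids in kept],
    }

dataset_alt = load_dataset(path='glue', name='sst2').map(
    preprocess,
    batched=True,
    batch_size=1000,
    remove_columns=['sentence', 'idx', 'label'],
)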
# Define the data loader
import torch
from transformers.data.data_collator import default_data_collator
loader = torch.utils.data.DataLoader(
dataset=dataset['train'],
batch_size = 8,
collate_fn = default_data_collator,
shuffle=True,
drop_last=True,
)
for i, data in enumerate(loader):
    break
len(loader), data
'''
(4515,
{'input_ids': tensor([[ 9099, 22590, 2644, 2809, 45070, 465, 1388, 11171],
[ 1169, 38043, 710, 5369, 815, 299, 470, 307],
[ 3798, 7916, 262, 14186, 1291, 37840, 286, 749],
[ 64, 13526, 837, 1257, 837, 43211, 28680, 3807],
[ 1169, 7110, 318, 1111, 542, 36207, 290, 537],
[19188, 307, 2562, 329, 9188, 284, 21163, 340],
[ 505, 286, 262, 614, 705, 82, 749, 19827],
[ 270, 705, 82, 655, 7650, 1108, 329, 262]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1]]),
'labels': tensor([[ 9099, 22590, 2644, 2809, 45070, 465, 1388, 11171],
[ 1169, 38043, 710, 5369, 815, 299, 470, 307],
[ 3798, 7916, 262, 14186, 1291, 37840, 286, 749],
[ 64, 13526, 837, 1257, 837, 43211, 28680, 3807],
[ 1169, 7110, 318, 1111, 542, 36207, 290, 537],
[19188, 307, 2562, 329, 9188, 284, 21163, 340],
[ 505, 286, 262, 614, 705, 82, 749, 19827],
[ 270, 705, 82, 655, 7650, 1108, 329, 262]])})
'''
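For this fixed-length data, default_data_collator essentially just stacks each field of the batch into a tensor. The function below is a rough, illustrative equivalent (simple_collate is a made-up name; the real collator handles many more cases):
# Rough sketch of what default_data_collator does here (illustrative only)
import torch

def simple_collate(features):
    batch = {}
    for key in features[0].keys():  # 'input_ids', 'attention_mask', 'labels'
        batch[key] = torch.tensor([f[key] for f in features], dtype=torch.long)
    return batch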
# Define the model
from transformers import AutoModelForCausalLM, GPT2Model
# The auto model is essentially equivalent to the manual implementation below; the official auto model
# includes a lot of extra robustness handling. It is implemented by hand here for simplicity.
# model = AutoModelForCausalLM.from_pretrained(checkpoint)
import torch

class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.pretrained = GPT2Model.from_pretrained(checkpoint)
        self.fc = torch.nn.Linear(768, tokenizer.vocab_size, bias=False)
        # Initialize the output layer with the pretrained lm_head weights
        parameters = AutoModelForCausalLM.from_pretrained(checkpoint)
        self.fc.load_state_dict(parameters.lm_head.state_dict())
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.pretrained(input_ids=input_ids, attention_mask=attention_mask)
        logits = logits.last_hidden_state
        logits = self.fc(logits)
        loss = None
        if labels is not None:
            # Shift logits and labels against each other and compute the loss on the overlap:
            # the logits at position t are the prediction for the token at position t+1
            shift_logits = logits[:, :-1].reshape(-1, tokenizer.vocab_size)
            shift_labels = labels[:, 1:].reshape(-1)
            loss = self.criterion(shift_logits, shift_labels)
        return {'loss': loss, 'logits': logits}
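To make the shift concrete: for an 8-token sequence, the logits at positions 0..6 are scored against tokens 1..7, so 7 predictions contribute to the loss per example. A tiny toy illustration of that slicing (made-up shapes, not part of the notebook):
# Toy illustration of the logits/labels shift in the loss (made-up shapes)
import torch

B, T, V = 2, 8, 10                         # batch size, sequence length, toy vocab size
toy_logits = torch.randn(B, T, V)          # what the model would output
toy_labels = torch.randint(0, V, (B, T))   # the copied input ids

shift_logits = toy_logits[:, :-1].reshape(-1, V)  # predictions for positions 0..6 -> [B*7, V]
shift_labels = toy_labels[:, 1:].reshape(-1)      # targets are tokens 1..7        -> [B*7]
print(shift_logits.shape, shift_labels.shape)     # torch.Size([14, 10]) torch.Size([14])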
Inspect the model
model = Model()
# Count the model parameters (in units of 10,000)
print('-- ', sum(i.numel() for i in model.parameters()) / 10000)
# 12050.9952
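As a quick sanity check (not in the original notebook), the batch taken from the loader above can be passed through the untrained wrapper to confirm the output shapes:
# Sanity check on one batch; assumes `data` from the loader cell above is still in scope
with torch.no_grad():
    out = model(**data)
print(out['logits'].shape)  # expected: torch.Size([8, 8, 50257])
print(out['loss'])          # a scalar tensor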
# Define the test function
# Compute prediction accuracy on the test split
def test():
    model.eval()
    # data loader
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=8,
        collate_fn=default_data_collator,
        shuffle=True,
        drop_last=True,
    )
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # Only the accuracy of the last token is measured, so save it first
        label = data['input_ids'][:, -1].clone()
        # Erase the last token from the input so the model cannot cheat
        data['input_ids'][:, -1] = 0
        # labels are not needed for evaluation
        data['labels'][:, :] = 0
        with torch.no_grad():
            out = model(**data)
        # Because of the one-position shift, the prediction for the last token sits at the second-to-last position
        out = out['logits'].argmax(dim=2)[:, -2]
        correct += (label == out).sum().item()
        total += 8
        if i % 10 == 0:
            print('-- ', i, label, out)
        if i == 50:
            break
    print('-- ', correct / total)
    # Decode a few examples from the last batch: prompt, true last word, predicted last word
    for i in range(8):
        print(tokenizer.decode(data['input_ids'][i, :-1]))
        print(tokenizer.decode(label[i]), tokenizer.decode(out[i]))
        print()
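As an illustrative extra (not part of the original notebook), the same idea can be applied to an arbitrary sentence: feed the first 7 tokens as the prompt and read the predicted 8th token off the last position of the logits. predict_last_word and the example sentence below are made up for demonstration.
# Hypothetical helper: predict the 8th token of an arbitrary sentence (not in the notebook)
def predict_last_word(sentence):
    ids = tokenizer.encode(sentence)[:7]            # keep the first 7 tokens as the prompt
    input_ids = torch.tensor([ids])
    attention_mask = torch.ones_like(input_ids)
    model.eval()
    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask)
    next_id = out['logits'][0, -1].argmax().item()  # the last position's logits predict the next token
    return tokenizer.decode(ids), tokenizer.decode([next_id])

print(predict_last_word('the movie is one of the best films of'))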
from transformers import AdamW
from transformers.optimization import get_scheduler
'''
The optimizer is AdamW rather than plain Adam. The AdamW used here is the one shipped with huggingface transformers;
unlike PyTorch's Adam, it applies weight decay in a decoupled way, which tends to work better for fine-tuning transformers.
The scheduler gradually lowers the learning rate as training progresses, helping the loss settle at a lower point.
clip_grad_norm_ caps the gradient norm at 1, which prevents exploding gradients and keeps the parameter updates stable.
'''
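To see what the linear schedule does in isolation, the small sketch below (with a dummy parameter and a made-up step count) prints the learning rate step by step:
# Standalone illustration of the linear learning-rate decay (dummy parameter, made-up step count)
dummy = torch.nn.Linear(2, 2)
opt = AdamW(dummy.parameters(), lr=2e-5)
sch = get_scheduler(name='linear', num_warmup_steps=0, num_training_steps=10, optimizer=opt)
for step in range(10):
    opt.step()
    sch.step()
    print(step, sch.get_last_lr())  # the lr falls linearly from 2e-5 toward 0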
def train():
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(name='linear', num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)
    model.train()
    for i, data in enumerate(loader):
        out = model(**data)
        loss = out['loss']
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        if i % 50 == 0:
            # Token-level accuracy on this batch (8 sequences x 7 shifted positions)
            labels = data['labels'][:, 1:]
            out = out['logits'].argmax(dim=2)[:, :-1]
            correct = (labels == out).sum().item()
            accuracy = correct / (8 * 7)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print('-- ', i, loss.item(), accuracy, lr)
    # Save the fine-tuned model once training finishes
    torch.save(model, 'last_word.model')
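Once train() has been run and the model saved, the whole module can be reloaded with torch.load and evaluated again. A minimal sketch (loading a pickled module requires the Model class to still be defined in the session):
# Reload the saved model and re-run the evaluation (sketch)
model = torch.load('last_word.model')
test()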
伊织 2022-12-03 (Sat)