Choosing a platform: besides the usual server platforms such as Linux, I'd like to recommend Google's Colab. When a server is not convenient to use, you can run deep learning and neural network code directly in Colab.
Install PyTorch. For installing PyTorch on a Linux server, see the linked guide on installing PyTorch with Anaconda on a Linux server from macOS. On Colab, PyTorch is normally preinstalled; if it is missing, !pip install torch is enough (note the package is named torch, not pytorch), and no other environment configuration is needed.
Install transformers; on Colab this is again a one-line pip install, as shown in the setup cell below.
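A rough Colab setup cell covering the installs and the Google Drive mount (the mount is only needed because the data and model paths later in this post live under /content/gdrive; adjust it to your own layout):

!pip install transformers          # torch is normally preinstalled on Colab; otherwise: !pip install torch
from google.colab import drive
drive.mount('/content/gdrive')     # makes '/content/gdrive/My Drive/...' available to the scripts below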
Download the dataset. Two data files are provided for running the code: train.json and eval.json. The data come from Paraphrase Data and were preprocessed so that only the sentence and headline of each pair are kept (the preprocessing is described in a separate article). The sentences are the inputs and their corresponding headlines are the labels; the training file stores 20,000 pairs and the evaluation file stores 10,000 pairs.
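For reference, this is the file layout that dataset.py below assumes when it reads the .json files: a JSON object whose keys are arbitrary ids and whose values hold one sentence/headline pair. The ids and texts in this sketch are made up:

import json

example = {
    "0": {"sentence": "The company said on Monday that it would close two of its factories in Ohio by the end of the year.",
          "headline": "Company to close two Ohio factories"},
    "1": {"sentence": "...", "headline": "..."},
}
with open("train.json", "w", encoding="utf-8") as f:
    json.dump(example, f)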
import numpy as np
import os
import re
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BartForConditionalGeneration, BartTokenizer, BartModel, AutoModel
from dataset import Sent_Comp_Dataset, collate_batch  # defined in dataset.py below
from def_train_test import train, test                # defined in def_train_test.py below
Full code: dataset.py
Below is the first part of dataset.py; the explanation is in the annotations after the code.
class Sent_Comp_Dataset(Dataset):  # custom Dataset class
    def __init__(self, path="", prefix="train"):
        self.data_path = path  # path to the dataset file
        self.sentence = []
        self.headline = []
        with open(self.data_path, encoding="utf-8", mode='r') as source:
            context = json.load(source)  # load the .json data
            for i in context:
                element_sent = context[i]['sentence']
                self.sentence.append(element_sent)  # collect the sentence of each pair
                element_head = context[i]['headline']
                self.headline.append(element_head)  # collect the headline of each pair
        print('Files already downloaded and verified')

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        # data_name = self.data_path.split()
        return {'sentence': self.sentence[idx], 'headline': self.headline[idx]}
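A quick sanity check of the class (a minimal sketch; the file path here is just an example, combining the training-data directory used later in train.py with the train.json file mentioned above):

scd = Sent_Comp_Dataset('/content/gdrive/My Drive/sentence_compression/preprocessed_data/train.json')
print(len(scd))   # number of sentence/headline pairs, e.g. 20000 for the training file
print(scd[0])     # {'sentence': '...', 'headline': '...'}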
Next is the second part of dataset.py; the explanation is in the annotations after the code.
def collate_batch(batch):  # used as collate_fn when building the DataLoader
    sentence_list = []
    headline_list = []
    for unit in batch:
        sentence_list.append(unit['sentence'])
        headline_list.append(unit['headline'])
    return sentence_list, headline_list
For this part, pay close attention to the return type of each function; for reference, see the linked explanation of a bug in this part.
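To make those types concrete, here is a small sketch (reusing the scd dataset object from the example above) of what a batch looks like after collate_batch:

loader = DataLoader(scd, batch_size=2, shuffle=False, collate_fn=collate_batch)
sentences, headlines = next(iter(loader))
# sentences and headlines are plain Python lists of strings, one entry per example in the batch;
# tokenizer.batch_encode_plus() in train()/test() expects exactly this kind of list of raw strings.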
Full code: def_train_test.py; the explanation is in the annotations after the code.
def train(train_data_path: str, epoch: int, train_batch_size: int, model_save_path: str, train_max_length: int):
    # model status
    model.train(mode=True)  # switch to training mode so that dropout and batch normalization behave as intended and help prevent overfitting
    # optimizer and loss, created once (not per batch) so that SGD momentum is kept across steps
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()  # ignore_index defaults to -100, matching lm_labels below
    datalist = []
    for n in os.listdir(train_data_path):
        if os.path.splitext(n)[1] == '.json':
            datalist.append(n)  # collect all .json training files so they can be read one by one
    os.chdir(train_data_path)
    for i in range(len(datalist)):
        scd_train = Sent_Comp_Dataset(datalist[i])  # one dataset object per training file
        train_dataloader = DataLoader(scd_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate_batch, drop_last=False)  # collate_fn is the collate_batch we wrote in dataset.py
        for ep in range(epoch):  # number of epochs is user-defined; do not reuse the name `epoch` for the loop variable, or later files would be skipped
            running_loss = 0.0
            for batch_idx, (sentences, headlines) in enumerate(train_dataloader):
                # get token ids
                sent_ids = tokenizer.batch_encode_plus(sentences, max_length=train_max_length, return_tensors="pt", truncation=True, padding="max_length")
                head_ids = tokenizer.batch_encode_plus(headlines, max_length=train_max_length, return_tensors="pt", truncation=True, padding="max_length")
                # further processing: shift the target for teacher forcing and mask padding in the labels
                pad_token_id = tokenizer.pad_token_id
                y = head_ids['input_ids']
                y_ids = y[:, :-1].contiguous()
                lm_labels = y[:, 1:].clone()
                lm_labels[y[:, 1:] == pad_token_id] = -100
                # this part is adapted from [BART 实战 —— Summarization 任务](https://imagasaikou.cn/blog/4#_3)
                # forward + backward + optimize
                optimizer.zero_grad()  # clear gradients before each batch
                output = model(
                    input_ids=sent_ids['input_ids'].to(device),
                    attention_mask=sent_ids['attention_mask'].to(device),
                    decoder_input_ids=y_ids.to(device),
                )
                # loss function
                # loss = model(**sent_ids, labels=y_ids).loss  # BART's built-in loss; usable, but gave worse results
                logits = output.logits  # shape (batch, target_len - 1, vocab_size)
                loss = criterion(logits.transpose(1, 2), lm_labels.to(device))  # CrossEntropyLoss expects (batch, vocab_size, seq_len)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # print every 20 mini-batches
                if batch_idx % 20 == 19:
                    print('[%d,%5d] loss: %.3f' % (ep + 1, batch_idx + 1, running_loss / 20))
                    running_loss = 0.0
            print('Finish epoch', '%d' % (ep + 1))
    # model saving
    torch.save(model.state_dict(), model_save_path)  # saves only the model parameters
    print('Finished Training')
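To see what the y_ids / lm_labels shift in train() does, here is a toy example with made-up token ids (only pad_token_id = 1 matches facebook/bart-base; the other ids are arbitrary):

pad_token_id = 1
y = torch.tensor([[0, 21, 34, 2, 1]])        # <s> tok tok </s> <pad>
y_ids = y[:, :-1].contiguous()               # decoder input: [[0, 21, 34, 2]]
lm_labels = y[:, 1:].clone()                 # labels:        [[21, 34, 2, 1]]
lm_labels[y[:, 1:] == pad_token_id] = -100   # ->             [[21, 34, 2, -100]]
# Each decoder position is trained to predict the next target token, and positions labelled -100
# are ignored by CrossEntropyLoss, so padding does not contribute to the loss.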
The following code is also in def_train_test.py; the explanation is in the annotations after the code.
def test(test_data_path: str, test_batch_size: int, model_save_path: str, test_max_length: int):
    model.load_state_dict(torch.load(model_save_path))  # loads only the model parameters
    model.eval()  # eval mode: PyTorch fixes BatchNorm and Dropout to use the values learned during training instead of per-batch statistics; otherwise a small test batch_size can badly distort the outputs
    scd_test = Sent_Comp_Dataset(test_data_path)
    test_dataloader = DataLoader(scd_test, batch_size=test_batch_size, shuffle=True, collate_fn=collate_batch, drop_last=False)
    for batch_idx, (sentences, headlines) in enumerate(test_dataloader):
        sent_ids = tokenizer.batch_encode_plus(sentences, max_length=test_max_length, return_tensors="pt", truncation=True, padding="max_length")
        summaries = model.generate(  # uses BART's [model.generate function](https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/text_generation#generation)
            input_ids=sent_ids["input_ids"].to(device),
            attention_mask=sent_ids["attention_mask"].to(device),
            num_beams=4,
            length_penalty=2.0,
            max_length=142,  # +2 from original because we start at step=1 and stop before max_length
            min_length=56,   # +1 from original because we start at step=1
            no_repeat_ngram_size=3,
            early_stopping=True,
            do_sample=False,
        )  # change these arguments if you want
        dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
        print(dec)
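As a quick standalone check of model.generate() outside the DataLoader loop, something like the following should work, assuming model, tokenizer and device have been set up as in train.py / test.py below (the input sentence here is made up):

sent = "The company announced on Monday that it will open three new offices in Europe next year."
ids = tokenizer(sent, return_tensors="pt")
out = model.generate(
    input_ids=ids["input_ids"].to(device),
    attention_mask=ids["attention_mask"].to(device),
    num_beams=4,
    max_length=40,
    early_stopping=True,
)
print(tokenizer.decode(out[0], skip_special_tokens=True))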
Full code: train.py; the explanation is in the annotations after the code.
# model: uses [facebook/bart-base](https://huggingface.co/facebook/bart-base)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
# device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
# inputs: all five parameters below can be changed
# path to the training data
train_data_path = '/content/gdrive/My Drive/sentence_compression/preprocessed_data'
# number of epochs
epoch = 1
# training batch size
train_batch_size = 8
# path for saving the model parameters
model_save_path = '/content/gdrive/My Drive/Model/short_train.pth'
# maximum length of each sentence or headline
train_max_length = 80
train(train_data_path,
      epoch, train_batch_size,
      model_save_path,
      train_max_length)
Full code: test.py; the explanation is in the annotations after the code.
# model and device: test() loads the saved weights into an existing model instance, so create it the same way as in train.py
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)
# inputs
test_data_path = '/content/gdrive/My Drive/sentence_compression/short_data/eval_test.json'
test_batch_size = 8
model_save_path = '/content/gdrive/My Drive/Model/short_test.pth'
test_max_length = 80
test(test_data_path,
     test_batch_size,
     model_save_path,
     test_max_length
     )
This part is still to be completed.
Finally, I also recommend another hands-on BART walkthrough, a detailed write-up by an expert that goes deeper into the underlying principles.