Tutorial: Paraphrase Generation with the BART Model

Training and testing a BART model

  • Preparation
  • Loading the dataloader
  • Defining the train function
  • Defining the test function
  • train.py
  • test.py
  • Results and evaluation

This article walks through the full process of putting BART into practice, using a paraphrase generation task as the example.
The code for each part is available via the GitHub links. The sections below follow the order of the code.

Preparation

  1. Choose a platform. Besides the usual server setups such as Linux, Google Colab is worth recommending: when a server is not readily available, deep learning and neural network code can be run on Colab.

  2. Install PyTorch. For installing PyTorch on a Linux server, see the linked guide on installing PyTorch with Anaconda on macOS / a Linux server. On Colab, `!pip install torch` is all that is needed; no other environment setup steps are required.

  3. Install transformers, e.g. with `!pip install transformers`.

  4. Download the dataset. Two files are provided for running the code: train.json and eval.json. The data come from Paraphrase Data and were preprocessed to keep only the sentence and headline of each pair (the preprocessing is described in another article). The sentences are the inputs and the corresponding headlines are the labels; each training file stores 20,000 training pairs and the test set stores 10,000 pairs. A sketch of the assumed file layout follows.
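
Since the Dataset class below reads the file with json.load and indexes each entry as context[i]['sentence'] and context[i]['headline'], the training file is assumed to have roughly the following layout. The keys follow the code; the example text and file name are made up for illustration:

import json

# Illustrative only: structure inferred from how Sent_Comp_Dataset reads the file.
example = {
    "0": {"sentence": "The company said on Monday it will cut 500 jobs next year.",
          "headline": "Company to cut 500 jobs"},
    "1": {"sentence": "Officials confirmed the bridge will reopen to traffic next week.",
          "headline": "Bridge to reopen next week"},
}
with open("train_example.json", "w", encoding="utf-8") as f:
    json.dump(example, f, ensure_ascii=False, indent=2)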

import numpy as np 
import os
import re
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch import tensor
from torch.utils.data import Dataset, DataLoader

import transformers
from transformers import BartForConditionalGeneration, BartTokenizer, BartModel, AutoModel

import dataset # from dataset.py
from dataset import Sent_Comp_Dataset
from dataset import collate_batch

import def_train_test # from def_train_test.py
from def_train_test import train, test

Loading the dataloader

Full code: dataset.py

Below is the first part of dataset.py; the code is explained in the inline comments.

class Sent_Comp_Dataset(Dataset):  # dataset class for the sentence/headline pairs
    def __init__(self, path="", prefix="train"):
        self.data_path = path  # path to the dataset file
        self.sentence = []
        self.headline = []
        with open(self.data_path, encoding="utf-8", mode='r') as source:
            context = json.load(source)  # read the .json data
            for i in context:
                element_sent = context[i]['sentence']
                self.sentence.append(element_sent)  # collect the sentence of each pair

                element_head = context[i]['headline']
                self.headline.append(element_head)  # collect the headline of each pair
        print('Files already downloaded and verified')

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        #data_name = self.data_path.split()
        return {'sentence': self.sentence[idx], 'headline': self.headline[idx]}

Next is the second part of dataset.py; the code is explained in the inline comments.

def collate_batch(batch):  # used as the DataLoader's collate_fn: keep the raw strings, batched as lists
    sentence_list = []
    headline_list = []
    for unit in batch:
        sentence_list.append(unit['sentence'])
        headline_list.append(unit['headline'])
    return sentence_list, headline_list

For this part, pay close attention to the return types of each function; for reference, see the linked explanation of a bug in this part. A small usage sketch follows.
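
As a quick sanity check of the two pieces above, here is a minimal sketch that wires them into a DataLoader; the file name train.json and the batch size are placeholders:

from torch.utils.data import DataLoader
from dataset import Sent_Comp_Dataset, collate_batch

scd = Sent_Comp_Dataset("train.json")  # placeholder path to one training file
loader = DataLoader(scd, batch_size=4, shuffle=True, collate_fn=collate_batch)

sentences, headlines = next(iter(loader))
print(type(sentences), len(sentences))  # <class 'list'> 4 -- collate_batch returns plain lists of strings
print(sentences[0])                     # one raw sentence; tokenization happens later, inside train()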

Defining the train function

Full code: def_train_test.py; the code is explained in the inline comments.

def train(train_data_path: str, epoch: int, train_batch_size: int, model_save_path: str, train_max_length: int):

  # model status
  model.train(mode=True) # put the model in training mode so that dropout and batch normalization behave as during training and help prevent overfitting

  # optimizer: created once here, outside the batch loop, so that SGD momentum carries over between batches
  optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
 
  datalist = []
  for n in os.listdir(train_data_path):
    if os.path.splitext(n)[1] == '.json':
        datalist.append(n) # collect every .json training file so they can be read one by one

  os.chdir(train_data_path) 
  for i in range(len(datalist)):

    scd_train = Sent_Comp_Dataset(datalist[i]) # build a dataset for each training file
    train_dataloader = DataLoader(scd_train, batch_size=train_batch_size, shuffle=True, collate_fn=collate_batch, drop_last=False) # collate_fn uses the collate_batch we wrote in dataset.py
  
    for epoch_idx in range(epoch): # the number of epochs is user-defined
      running_loss = 0.0

      for batch_idx,(sentences,headlines) in enumerate(train_dataloader): # get token ids for each batch
        sent_ids = tokenizer.batch_encode_plus(sentences, max_length=train_max_length, return_tensors="pt", padding="max_length", truncation=True)
        head_ids = tokenizer.batch_encode_plus(headlines, max_length=train_max_length, return_tensors="pt", padding="max_length", truncation=True)
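        # sent_ids and head_ids are BatchEncoding dicts holding 'input_ids' and
        # 'attention_mask' tensors of shape (train_batch_size, train_max_length)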


        # build decoder inputs and labels (teacher forcing)
        pad_token_id = tokenizer.pad_token_id
        y = head_ids['input_ids']
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone()
        lm_labels[y[:, 1:] == pad_token_id] = -100 # padding positions are ignored by the loss
        # this block follows [BART 实战 —— Summarization 任务](https://imagasaikou.cn/blog/4#_3)
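        # Illustration with schematic BART token ids (<s>=0, </s>=2, <pad>=1):
        # y         = [[0, 713, 16, 2, 1, 1]]
        # y_ids     = [[0, 713, 16, 2, 1]]       # decoder input: last position dropped
        # lm_labels = [[713, 16, 2, -100, -100]] # targets shifted left, padding masked to -100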

        # forward + backward + optimize
        optimizer.zero_grad() # clear the gradients before each batch
        
        output = model( 
          input_ids = sent_ids['input_ids'].to(device),
          attention_mask = sent_ids['attention_mask'].to(device),
          decoder_input_ids = y_ids.to(device),
         # lm_labels = lm_labels.to(device),
            )
        
        # loss function
        # loss = model(**sent_ids, labels = y_ids).loss # BART's built-in loss; usable, but it did not work as well here
        logits = output.logits # (train_batch_size, train_max_length - 1, vocab_size)
        criterion = nn.CrossEntropyLoss() # ignore_index defaults to -100, matching lm_labels above
        loss = criterion(logits.reshape(-1, logits.size(-1)), lm_labels.reshape(-1).to(device))
        loss.backward()

         
        optimizer.step()  

        running_loss += loss.item()
        # print every 20 mini-batches
        if batch_idx % 20 == 19:
          print('[%d,%5d] loss: %.3f' % (epoch_idx + 1, batch_idx + 1, running_loss / 20))
          running_loss = 0.0

      print('Finished epoch %d' % (epoch_idx + 1))

  # model saving
  torch.save(model.state_dict(), model_save_path) # saves only the model parameters
  print('Finished Training')
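
Because train() only saves the state dict, a quick way to confirm the checkpoint is usable (the same call the test function makes later) is to reload it into a fresh model. This is only an illustrative check; model_save_path is the path passed to train():

import torch
from transformers import BartForConditionalGeneration

model_check = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
state_dict = torch.load(model_save_path, map_location='cpu')  # path used when calling train()
model_check.load_state_dict(state_dict)                       # raises an error if the keys do not match
print('checkpoint restored,', sum(p.numel() for p in model_check.parameters()), 'parameters')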

Defining the test function

The code is also in def_train_test.py; the code is explained in the inline comments.

def test(test_data_path: str, test_batch_size: int, model_save_path: str, test_max_length: int):
   
  model.load_state_dict(torch.load(model_save_path)) # loads only the model parameters
  model.eval() # eval mode: PyTorch fixes BatchNorm and Dropout so they use the values learned during training instead of batch statistics; otherwise, with a small test batch size, the BatchNorm layers can badly distort the outputs

  scd_test = Sent_Comp_Dataset(test_data_path)
  test_dataloader = DataLoader(scd_test, batch_size=test_batch_size, shuffle=True, collate_fn=collate_batch, drop_last=False)

  for batch_idx,(sentences,headlines) in enumerate(test_dataloader):

    sent_ids = tokenizer.batch_encode_plus(sentences, max_length=test_max_length, return_tensors="pt", padding="max_length", truncation=True)

    summaries = model.generate( # calls BART's [model.generate](https://huggingface.co/docs/transformers/v4.21.3/en/main_classes/text_generation#generation)
      input_ids=sent_ids["input_ids"].to(device),
      attention_mask=sent_ids["attention_mask"].to(device),
      num_beams=4,
      length_penalty=2.0,
      max_length=142,  # +2 from original because we start at step=1 and stop before max_length
      min_length=56,  # +1 from original because we start at step=1
      no_repeat_ngram_size=3,
      early_stopping=True,
      do_sample=False,
    )  # change these arguments if you want

    dec = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries] # decode this batch's generated ids back to text
    print(dec)
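
For a quick check on a single sentence outside the DataLoader, the same generate call can be used directly. This sketch assumes model, tokenizer and device are already set up as in test.py, and the input sentence is only a placeholder:

example = ["The company announced on Monday that it will cut 500 jobs."]  # placeholder input
enc = tokenizer.batch_encode_plus(example, max_length=80, return_tensors="pt",
                                  padding="max_length", truncation=True)
out = model.generate(input_ids=enc["input_ids"].to(device),
                     attention_mask=enc["attention_mask"].to(device),
                     num_beams=4, max_length=60, early_stopping=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))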

train.py

Full code: train.py; the code is explained in the inline comments.

# model: this tutorial uses [facebook/bart-base](https://huggingface.co/facebook/bart-base)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

# device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") # change "cuda:1" to "cuda:0" (or "cuda") on a single-GPU machine such as Colab
model.to(device)

# inputs # all five parameters below can be changed
# path to the training data
train_data_path ='/content/gdrive/My Drive/sentence_compression/preprocessed_data'
# number of epochs
epoch = 1
# training batch size
train_batch_size = 8
# path where the model parameters will be saved
model_save_path = '/content/gdrive/My Drive/Model/short_train.pth'
# maximum length of each sentence or headline
train_max_length = 80

train(train_data_path, 
      epoch, train_batch_size, 
      model_save_path, 
      train_max_length)

test.py

Full code: test.py; the code is explained in the inline comments.

# model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # test() loads the saved weights into this model and runs it on this device

# inputs
test_data_path ='/content/gdrive/My Drive/sentence_compression/short_data/eval_test.json'
test_batch_size = 8
model_save_path = '/content/gdrive/My Drive/Model/short_test.pth'
test_max_length = 80

test(test_data_path, 
     test_batch_size, 
     model_save_path, 
     test_max_length
     )

Results and evaluation

This part is still to be added.

Finally, another hands-on BART walkthrough is recommended as well, a more detailed write-up for a deeper, more principled understanding.
