Reading Through an Expert's Pretraining Code from a GitHub Competition

The expert's pretraining code is available at the repository linked below:
[link: the expert's pretraining code repository]
First, here is the full pretrain.py; a step-by-step analysis follows.

# coding:utf-8
import os
import pickle

import torch
import random
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Tuple, Dict
from collections import defaultdict
from torch.utils.data import Dataset

from transformers import (
    BertTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForWholeWordMask,
    PreTrainedTokenizer, BertConfig
)
from transformers.utils import logging

from modeling.modeling_nezha.modeling import NeZhaForMaskedLM
from modeling.modeling_nezha.configuration import NeZhaConfig
from simple_trainer import Trainer
from pretrain_args import TrainingArguments

warnings.filterwarnings('ignore')
logger = logging.get_logger(__name__)


def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed


def read_data(config, train_file_path, test_file_path, tokenizer: BertTokenizer) -> dict:
    train_df = pd.read_csv(train_file_path, header=None, sep='\t')
    test_df = pd.read_csv(test_file_path, header=None, sep='\t')
    pretrain_df = pd.concat([train_df, test_df], axis=0)
    inputs = defaultdict(list)
    for i, row in tqdm(pretrain_df.iterrows(), desc=f'preprocessing pretrain data ... ...', total=len(pretrain_df)):
        sentence_a, sentence_b = row[0], row[1]
        inputs_dict = tokenizer.encode_plus(sentence_a, sentence_b, add_special_tokens=True,
                                            return_token_type_ids=True, return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])

    data_cache_path = config['data_cache_path']

    if not os.path.exists(os.path.dirname(data_cache_path)):
        os.makedirs(os.path.dirname(data_cache_path))
    with open(data_cache_path, 'wb') as f:
        pickle.dump(inputs, f)

    return inputs


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, train_file_path: str, test_file_path: str, block_size: int):
        assert os.path.isfile(train_file_path), f"Input file path {train_file_path} not found"
        logger.info(f"Creating features from dataset file at {train_file_path}")

        assert os.path.isfile(test_file_path), f"Input file path {test_file_path} not found"
        logger.info(f"Creating features from dataset file at {test_file_path}")

        with open(train_file_path, encoding="utf-8") as f:
            train_lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        with open(test_file_path, encoding="utf-8") as f:
            test_lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        lines = train_lines + test_lines

        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        return self.examples[i]


def main():
    """
    download pretrain model from https://github.com/lonePatient/NeZha_Chinese_PyTorch,
    we only use pretrain model name : nezha-cn-base, nezha-base-wwm
    """
    config = {
        'pretrain_type': 'dynamic_mask',  # dynamic_mask, whole_word_mask
        'data_cache_path': '',
        'train_data_path': '/home/xiaoguzai/数据/data/train.txt',
        'test_data_path': '/home/xiaoguzai/数据/data/test.txt',
    }

    mlm_probability = 0.15
    num_train_epochs = 1
    seq_length = 90
    batch_size = 32
    learning_rate = 6e-5
    save_steps = 5000
    seed = 2021

    # put down your file paths here
    if config['pretrain_type'] == 'whole_word_mask':
        model_name = 'nezha-base-wwm'
    else:
        model_name = 'nezha-cn-base'

    config['data_cache_path'] = '../user_data/pretrain/'+config['pretrain_type']+'/data.pkl'

    model_path = '/home/xiaoguzai/数据/nezha-chinese-base/pytorch_model.bin'
    config_path = '/home/xiaoguzai/数据/nezha-chinese-base/config.json'

    vocab_file = '/home/xiaoguzai/数据/nezha-chinese-base/vocab.txt'
    tokenizer = BertTokenizer.from_pretrained(vocab_file)

    model_config = NeZhaConfig.from_pretrained(config_path)

    assert os.path.isfile(model_path), f"Input file path {model_path} not found, " \
                                       f"please download relative pretrain model in huggingface or" \
                                       f"https://github.com/lonePatient/NeZha_Chinese_PyTorch " \
                                       f"model name:nezha-cn-base or nezha-base-wwm"

    if config['pretrain_type'] == 'dynamic_mask':
        # use dynamic masking: the mask is re-sampled each time a batch is collated
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=mlm_probability)
        print('after data_collator')
        model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path,
                                                 config=model_config)
        model_save_path = 'mlm_model'

    if config['pretrain_type'] == 'whole_word_mask':
        data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer,
                                                     mlm=True,
                                                     mlm_probability=mlm_probability)
        model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path,
                                                 config=model_config)
        model_save_path = 'whole_word_mask_model'

    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    train_file_path=config['train_data_path'],
                                    test_file_path=config['test_data_path'],
                                    block_size=seq_length)

    training_args = TrainingArguments(
        output_dir='record',
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        save_steps=save_steps,
        logging_steps=500,
        save_total_limit=5,
        prediction_loss_only=True,
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    trainer.train()
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)


if __name__ == '__main__':
    main()

Entering main(), the script first sets up its configuration parameters and then builds the tokenizer:

tokenizer = BertTokenizer.from_pretrained(vocab_file)

To see how the tokenizer is set up, step into from_pretrained; transformers' __init__.py shows where BertTokenizer comes from:

from .models.bert import (
    BertTokenizer
)

BertTokenizer is a standard WordPiece tokenizer class, so its internals are skipped here; a quick look at what it produces for a sentence pair follows.
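
As a quick sanity check of what read_data() stores per row, here is a minimal sketch of encode_plus on a sentence pair. It assumes the tokenizer loaded in main() above; the two toy sentences are made up, since the competition data consists of anonymized token IDs.

enc = tokenizer.encode_plus("12 29 35", "35 48 12",
                            add_special_tokens=True,
                            return_token_type_ids=True,
                            return_attention_mask=True)
print(enc["input_ids"])       # [CLS] ids of sentence A [SEP] ids of sentence B [SEP]
print(enc["token_type_ids"])  # 0 for [CLS] + sentence A + first [SEP], 1 for sentence B + final [SEP]
print(enc["attention_mask"])  # all 1s, since no padding was requested
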
Next, the script loads the model configuration from config_path:

model_config = NeZhaConfig.from_pretrained(config_path)

which gives the following configuration object:

model_config = 
NeZhaConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "directionality": "bidi",
  "embedding_size": 128,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "max_relative_position": 64,
  "model_type": "nezha",
  "num_attention_heads": 12,
  "num_hidden_groups": 1,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.6.1",
  "type_vocab_size": 2,
  "use_relative_position": true,
  "vocab_size": 21128
}

The next two branches select between two masking strategies: dynamic masking and whole-word masking.
With whole_word_mask, when one sub-token of a word is chosen for masking, the remaining sub-tokens of that word are masked as well.
With dynamic masking, the mask is re-sampled every time a batch is collated, rather than being fixed once during preprocessing. The corresponding code is:

if config['pretrain_type'] == 'dynamic_mask':
	...
if config['pretrain_type'] == 'whole_word_mask':
	...

Here we follow the 'dynamic_mask' branch:

    if config['pretrain_type'] == 'dynamic_mask':
        # use dynamic masking: the mask is re-sampled each time a batch is collated
        print('dynamic_mask')
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=mlm_probability)
        print('after data_collator')
        model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path,
                                                 config=model_config)
        model_save_path = 'mlm_model'

First, look at DataCollatorForLanguageModeling:

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True,
                                                        mlm_probability=mlm_probability)

It is imported at the top of the script via

from transformers import (
	DataCollatorForLanguageModeling
)

and exported from transformers' data collator module:

from .data.data_collator import (
    DataCollator,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    DataCollatorForSeq2Seq,
    DataCollatorForSOP,
    DataCollatorForTokenClassification,
    DataCollatorForWholeWordMask,
    DataCollatorWithPadding,
    default_data_collator,
)
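
To see what "dynamic" means in practice, here is a small sketch: calling the collator twice on the same example usually yields different masked positions, because masking is sampled at collation time rather than baked in during preprocessing. It assumes the tokenizer loaded in main(); the toy sentence is made up.

import torch
from transformers import DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
example = {"input_ids": torch.tensor(tokenizer.encode("12 29 35 48 51 66 73 80 97"))}

batch_a = collator([example])   # dict with input_ids and labels (-100 at unmasked positions)
batch_b = collator([example])
print(batch_a["input_ids"])     # some positions replaced by [MASK] / random token / kept
print(batch_b["input_ids"])     # typically a different set of masked positions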

With the data collator ready, the script loads the masked-LM model:

model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path,
                                                 config=model_config)

from_pretrained lives in transformers' modeling_utils.py; the tail end of the function looks like this:

def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
    ...
    # make sure token embedding weights are still tied if needed
    model.tie_weights()

    # Set model in evaluation mode to deactivate DropOut modules by default
    model.eval()

    if output_loading_info:
        loading_info = {
            "missing_keys": missing_keys,
            "unexpected_keys": unexpected_keys,
            "error_msgs": error_msgs,
        }
        return model, loading_info

    return model

It instantiates the model class and loads the pretrained weights into it.
Printing the resulting model shows the overall structure:

NeZhaForMaskedLM(
  (bert): NeZhaModel(
    (embeddings): NeZhaEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NeZhaEncoder(
      (layer): ModuleList(
        (0): NeZhaLayer(
          (attention): NeZhaAttention(
            (self): NeZhaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (1): NeZhaLayer(
          (attention): NeZhaAttention(
            (self): NeZhaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        ......
        (11): NeZhaLayer(
          (attention): NeZhaAttention(
            (self): NeZhaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (cls): BertOnlyMLMHead(
    (predictions): BertLMPredictionHead(
      (transform): BertPredictionHeadTransform(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      )
      (decoder): Linear(in_features=768, out_features=21128, bias=True)
    )
  )
)

Now look at the NeZhaForMaskedLM class itself. Its constructor builds

self.bert = NeZhaModel(config)
self.cls = BertOnlyMLMHead(config)
self.init_weights()

and its forward pass essentially does

sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
outputs = (prediction_scores,) + outputs[2:]

followed by the loss computation:

masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs

First, let's look at

self.bert = NeZhaModel(config)

and the arguments passed into it at call time:

input_ids = 
tensor([[101,169,...107,102],
        [101,169,...100,102],
        ......
        [101,169,...100,102],
        [101,169,...169,102]],device='cuda:0')
attention_mask = 
tensor([[1,1,...1,1],
        [1,1,...1,1],
        ......
        [1,1,...1,1],
        [1,1,...1,1]],device='cuda:0')
token_type_ids = None
head_mask = None
inputs_embeds = None

The two key sub-modules being called are:

self.bert = NeZhaModel(config)
self.cls = BertOnlyMLMHead(config)

Next we step into the NeZhaModel code. Before reading it, recall how its attention differs from BERT's.
The original BERT attention is

Attention(Q,K,V) = softmax(\frac{QK^{T}}{\sqrt{d_{k}}})V

while NeZha uses

Attention(Q,K,V) = softmax(\frac{Q(K+\alpha_{ij}^{K})^{T}}{\sqrt{d_{z}}})(V+\alpha_{ij}^{V})

The main changes in NeZha are:
1. the learned absolute position embedding is dropped from the embedding layer;
2. relative position encodings (the \alpha_{ij} terms) are added inside attention;
3. whole word masking is used during pretraining.

The call into self.bert looks like:

outputs = self.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    head_mask=head_mask,
    inputs_embeds=inputs_embeds,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)

with the actual input values being:

NeZhaForMaskedLM forward
input_ids = 
tensor([[ 101,  169,  107,  ...,  100,  100,  102],
        [ 101,  169,  107,  ...,  103,  107,  102],
        [ 101,  169,  107,  ...,  107,  131,  102],
        ...,
        [ 101,  169,  107,  ...,  103,  100,  102],
        [ 101,  169,  103,  ..., 8472,  107,  102],
        [ 101,  169,  107,  ...,  100,  100,  102]], device='cuda:0')
attention_mask = 
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')
token_type_ids = 
None
head_mask = 
None
inputs_embeds = 
None

Inside NeZhaModel.forward, the main local values are:

input_ids = 
tensor([[101,169,...107,102],
        [101,169,...100,102],
        ....................,
        [101,169,...100,102],
        [101,169,...169,102]])
inputs_embeds = None
input_shape = torch.Size([32,90])

Next, look at how extended_attention_mask is built:

extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask,input_shape,self.device)

Here self.device is 'cuda:0'.
Where is get_extended_attention_mask defined? Follow the class hierarchy:

class NeZhaPreTrainedModel(PreTrainedModel):

and

class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMixin):

so get_extended_attention_mask is ultimately found on PreTrainedModel (it is defined in ModuleUtilsMixin, which PreTrainedModel inherits):

def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device) -> Tensor:
    """
    Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

    Arguments:
        attention_mask (:obj:`torch.Tensor`):
            Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
        input_shape (:obj:`Tuple[int]`):
            The shape of the input to the model.
        device: (:obj:`torch.device`):
            The device of the input to the model.

    Returns:
        :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
    """
    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        # Provided a padding mask of dimensions [batch_size, seq_length]
        # - if the model is a decoder, apply a causal mask in addition to the padding mask
        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder:
            batch_size, seq_length = input_shape
            seq_ids = torch.arange(seq_length, device=device)
            causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
            # in case past_key_values are used we need to add a prefix ones mask to the causal mask
            # causal and attention masks must have same type with pytorch version < 1.3
            causal_mask = causal_mask.to(attention_mask.dtype)
            if causal_mask.shape[1] < attention_mask.shape[1]:
                prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                causal_mask = torch.cat(
                    [
                        torch.ones(
                            (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
                        ),
                        causal_mask,
                    ],
                    axis=-1,
                )

            extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
        else:
            extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
        )
    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask

Since attention_mask here is 2-D and the model is an encoder rather than a decoder, the branch taken is

extended_attention_mask = attention_mask[:,None,None,:]

followed by

extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)

Because attention_mask is all ones here, extended_attention_mask starts out as an all-ones tensor; after

extended_attention_mask = (1.0-extended_attention_mask)*-10000.0

it becomes an all-zero tensor:

tensor([[[[-0., -0., ..., -0., -0.]]],
        [[[-0., -0., ..., -0., -0.]]],
        ...
        [[[-0., -0., ..., -0., -0.]]],
        [[[-0., -0., ..., -0., -0.]]]])
shape = (32, 1, 1, 90)
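
A tiny numeric sketch (not from the script, just an illustration) of how a padding mask that actually contains zeros would turn into the additive bias:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])       # (batch=1, seq_len=5), 0 marks padding
extended = attention_mask[:, None, None, :].float()    # (1, 1, 1, 5), broadcastable over heads and query positions
extended = (1.0 - extended) * -10000.0
print(extended)  # tensor([[[[-0., -0., -0., -10000., -10000.]]]])
# Added to the raw attention scores before softmax, -10000 drives padded positions to ~0 probability.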

Next, self.get_head_mask:

head_mask = self.get_head_mask(head_mask,self.config.num_hidden_layers)

which returns

head_mask = [None,None,...None,None]

a list of 12 Nones, one per hidden layer.
Then the embedding layer is applied:

embedding_output = self.embeddings(
    input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
)

self.embeddings is defined as

self.embeddings = NeZhaEmbeddings(config)

How NeZha's embedding layer differs from BERT's

The first structural difference between NeZha and BERT is the embedding layer: NeZhaEmbeddings has no position embedding, only word_embeddings + token_type_embeddings, whereas BERT sums word, token type, and position embeddings.

embeddings = inputs_embeds + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
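
A minimal sketch of this embedding sum, using the sizes from the config above (the real NeZhaEmbeddings also handles inputs_embeds and default token_type_ids; this is just the core computation):

import torch
import torch.nn as nn

vocab_size, type_vocab_size, hidden_size = 21128, 2, 768
word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)
dropout = nn.Dropout(0.1)

input_ids = torch.tensor([[101, 169, 107, 102]])
token_type_ids = torch.zeros_like(input_ids)

embeddings = word_embeddings(input_ids) + token_type_embeddings(token_type_ids)
embeddings = dropout(layer_norm(embeddings))
print(embeddings.shape)  # torch.Size([1, 4, 768]); position information is injected later, inside attention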

Next we enter self.encoder. First, a recap of the tensors going in:
embedding_output = 
tensor([[[ 2.8207e-01, ...,  0.0000e+00],
         [ 8.8981e-01, ...,  1.5297e+00],
         ...
         [-1.4189e-01, ..., -1.8974e-01]],
        ...
        [[ 2.8207e-01, ...,  1.2606e-01],
         ...
         [-1.4189e-01, ..., -1.8974e-01]]], device='cuda:0')
embedding_output.shape = (32, 90, 768)
extended_attention_mask = 
tensor([[[[-0., -0., -0.,  ..., -0., -0., -0.]]],
        ...
        [[[-0., -0., -0.,  ..., -0., -0., -0.]]]], device='cuda:0')
extended_attention_mask.shape = (32, 1, 1, 90)
head_mask = [None, None, ..., None, None]
encoder_hidden_states = None
encoder_attention_mask = None

The encoder is constructed and called as follows (shapes noted in comments):

self.encoder = NeZhaEncoder(config)
encoder_outputs = self.encoder(
    embedding_output,                               # (32, 90, 768)
    attention_mask=extended_attention_mask,         # (32, 1, 1, 90)
    head_mask=head_mask,                            # [None] * 12
    encoder_hidden_states=encoder_hidden_states,    # None
    encoder_attention_mask=encoder_extended_attention_mask,  # None
)

Step into NeZhaEncoder.forward:

def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
):
    all_hidden_states = ()
    all_attentions = ()
    for i, layer_module in enumerate(self.layer):
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        layer_outputs = layer_module(
            hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
        )
        # each layer_module here is a NeZhaLayer(config)
        hidden_states = layer_outputs[0]
        if self.output_attentions:
            all_attentions = all_attentions + (layer_outputs[1],)
    # Add last layer
    if self.output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)
    outputs = (hidden_states,)
    if self.output_hidden_states:
        outputs = outputs + (all_hidden_states,)
    if self.output_attentions:
        outputs = outputs + (all_attentions,)
    return outputs  # last-layer hidden state, (all hidden states), (all attentions)

The first statement in the loop body is

if self.output_hidden_states:
    all_hidden_states = all_hidden_states + (hidden_states,)

which does not execute here because output_hidden_states is False. The loop then calls

layer_outputs = layer_module(
    hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask
)

where layer_module is a NeZhaLayer. The following if is also not taken:

if self.output_attentions:
    all_attentions = all_attentions + (layer_outputs[1],)

Since output_attentions is False as well, nothing is collected here, so let's step into NeZhaLayer and see how a single layer runs.

Once each layer has run, control returns to NeZhaEncoder.forward; after the loop

for i, layer_module in enumerate(self.layer):

exits, the remaining code is

if self.output_hidden_states:
    all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
    outputs = outputs + (all_hidden_states,)
if self.output_attentions:
    outputs = outputs + (all_attentions,)

None of these three if branches executes, so the function simply ends with

return outputs

In other words, after the 12 Transformer layers the encoder just returns a tuple containing the final hidden states; the skipped if branches can be revisited if they are ever triggered. Now for the key part: the internals of NeZhaLayer.
Its forward starts with

self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)

where self.attention is defined as

self.attention = NeZhaAttention(config)

NeZhaAttention itself does two things:

self_outputs = self.self(
    hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
return outputs

Note that (attention_output,) + self_outputs[1:] is just tuple concatenation that carries along the attention probabilities when they are requested; the residual connection itself happens inside self.output (BertSelfOutput adds the input back before LayerNorm).
Next, look at how the two sub-modules are initialized:

self.self = NeZhaSelfAttention(config)
self.output = BertSelfOutput(config)

Now we need to go into NeZhaSelfAttention and see how it is implemented.
Recall its formula:

Attention(Q,K,V) = softmax(\frac{Q(K+\alpha_{ij}^{K})^{T}}{\sqrt{d_{z}}})(V+\alpha_{ij}^{V})
In the code this is done in two steps.
1. Compute Q(K+\alpha_{ij}^{K})^{T}, which is expanded as QK^{T} + Q\alpha_{ij}^{K}:

attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1))

and the two score tensors are then added and scaled:

key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1))
#key_position_scores = Q*alpha_{ij}
key_position_scores_r = key_position_scores.view(from_seq_length, batch_size,
                                                 num_attention_heads, from_seq_length)
key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3)
attention_scores = attention_scores + key_position_scores_r_t
attention_scores = attention_scores / math.sqrt(self.attention_head_size)

2. Let attention\_probs = softmax(\frac{Q(K+\alpha_{ij}^{K})^{T}}{\sqrt{d_{z}}}); the output
attention\_probs \cdot V + attention\_probs \cdot \alpha_{ij}^{V} is then computed as:

context_layer = torch.matmul(attention_probs, value_layer)
value_position_scores = torch.matmul(attentions_probs_r, relations_values)

and finally the two results are summed:

context_layer = context_layer + value_position_scores_r_t

Note that when an attention_mask is supplied, it is added to the attention scores before the softmax:

if attention_mask is not None:
    attention_scores = attention_scores + attention_mask
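
Putting the two steps together, here is a self-contained toy sketch of relative-position attention for a single head. The relation tensors are random stand-ins for \alpha_{ij}^{K} and \alpha_{ij}^{V} (the real model builds them from a sinusoidal relative-position table), and the multi-head reshaping from the snippets above is omitted:

import math
import torch

seq_len, head_dim = 4, 8
q = torch.randn(seq_len, head_dim)               # query_layer (one head)
k = torch.randn(seq_len, head_dim)               # key_layer
v = torch.randn(seq_len, head_dim)               # value_layer
rel_k = torch.randn(seq_len, seq_len, head_dim)  # alpha_{ij}^K (random stand-in)
rel_v = torch.randn(seq_len, seq_len, head_dim)  # alpha_{ij}^V (random stand-in)

# Q K^T + Q alpha^K, then scale and softmax
content_scores = q @ k.t()                               # (seq_len, seq_len)
position_scores = torch.einsum('id,ijd->ij', q, rel_k)   # (seq_len, seq_len)
attention_probs = ((content_scores + position_scores) / math.sqrt(head_dim)).softmax(dim=-1)

# probs V + probs alpha^V
context_layer = attention_probs @ v + torch.einsum('ij,ijd->id', attention_probs, rel_v)
print(context_layer.shape)  # torch.Size([4, 8])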



When the encoder has finished, NeZhaModel.forward wraps up with:

sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
return outputs

Here encoder_outputs[1:] is an empty tuple, so it adds nothing to the result.
Now return to the NeZhaForMaskedLM class (a subclass of NeZhaPreTrainedModel) and follow its forward pass.
It starts with

outputs = self.bert(...)

which produces outputs[0] of shape (32, 90, 768).
Next it calls

prediction_scores = self.cls(sequence_output)

producing prediction_scores of shape (32, 90, 21128).
Let's look at what self.cls(...) actually does. It is defined as

self.cls = BertOnlyMLMHead(config)

BertOnlyMLMHead first applies a dense + LayerNorm transform and then a decoder layer that projects back onto the vocabulary:

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=21128, bias=True)
  )
)
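
Functionally the head is dense → GELU → LayerNorm → vocabulary projection. A rough stand-in sketch (the real BertLMPredictionHead ties the decoder weights to the word embeddings and keeps a separate bias, which is skipped here):

import torch
import torch.nn as nn

mlm_head = nn.Sequential(
    nn.Linear(768, 768),    # transform.dense
    nn.GELU(),              # hidden_act = "gelu"
    nn.LayerNorm(768, eps=1e-12),
    nn.Linear(768, 21128),  # decoder: project back to the vocabulary
)
print(mlm_head(torch.randn(32, 90, 768)).shape)  # torch.Size([32, 90, 21128])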

This turns the (32, 90, 768) output into (32, 90, 21128).
The first two dimensions are then flattened and the cross-entropy loss is computed, roughly:

masked_lm_loss = CrossEntropyLoss()(prediction_scores.view(-1, 21128), labels.view(-1))  # shapes: (2880, 21128) vs (2880,)

Flattening (32, 90) gives 2880 token positions.
The resulting loss is back-propagated to update the network weights.
(It is a per-token loss rather than a single sequence-level prediction error.)
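
Strictly speaking, only the masked positions contribute: the data collator sets labels to -100 everywhere else, and CrossEntropyLoss ignores index -100 by default. A toy illustration with made-up logits and labels:

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 21128
prediction_scores = torch.randn(2, 3, vocab_size)   # (batch, seq_len, vocab)
labels = torch.tensor([[-100, 5123, -100],
                       [-100, -100, 802]])          # only two positions were masked
loss_fct = CrossEntropyLoss()                       # ignore_index defaults to -100
loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))
print(loss)                                         # averaged over the 2 non-ignored positions only
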
Next, on to the data collators.

data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer,
                                             mlm=True,
                                             mlm_probability=mlm_probability)

DataCollatorForWholeWordMask is defined as

class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):

and DataCollatorForLanguageModeling is a dataclass with the following fields:

class DataCollatorForLanguageModeling:
    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None

Being a dataclass, it also runs a __post_init__ hook after the generated __init__:

def __post_init__(self):
    if self.mlm and self.tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. "
            "You should pass `mlm=False` to train on causal language modeling instead."
        )

Next, back to loading the model:

model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=model_path,
                                         config=model_config)

During training, its forward pass runs:

outputs = self.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    head_mask=head_mask,
    inputs_embeds=inputs_embeds,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)

sequence_output = outputs[0]
#sequence_output = torch.Size([32,90,768])
r"""
BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=21128, bias=True)
  )
)
"""
prediction_scores = self.cls(sequence_output)
#prediction_scores = torch.Size([32,90,21128])
outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
#outputs[2:] = ()
#outputs[0].shape = torch.Size([32,90,21128])
# Although this may seem awkward, BertForMaskedLM supports two scenarios:
# 1. If a tensor that contains the indices of masked labels is provided,
#    the cross-entropy is the MLM cross-entropy that measures the likelihood
#    of predictions for masked words.
# 2. If `lm_labels` is provided we are in a causal scenario where we
#    try to predict the next token for each input in the decoder.
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()  # -100 index = padding token
    #prediction_scores = torch.Size([2880,21128])
    #label = torch.Size([2880])
    # i.e. CrossEntropyLoss over prediction_scores (2880, 21128) and labels (2880,)
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
return outputs  # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)

The important part is what happens after the call to self.bert:

sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
outputs = (prediction_scores,) + outputs[2:]
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
return outputs

So the forward pass ends by prepending the masked-LM loss to the tuple of outputs.

The loss is a per-token cross-entropy over the vocabulary, which is what ties together the tokens of sentence A and sentence B during pretraining.
The relevant shapes are

prediction_scores = ([32,90,21128])
prediction_scores.view(-1,self.config.vocab_size) = ([2880,21128])
labels.view(-1) = ([2880])

There are 32 sequences, each padded/truncated to length 90, i.e. 32 * 90 = 2880 token positions; the cross-entropy over these predictions produces the loss used for the update.
The structure of self.cls was summarized above (BertOnlyMLMHead).
Next, LineByLineTextDataset, the Dataset subclass that prepares the data:

class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, train_file_path: str, test_file_path: str, block_size: int):
        assert os.path.isfile(train_file_path), f"Input file path {train_file_path} not found"
        logger.info(f"Creating features from dataset file at {train_file_path}")
        assert os.path.isfile(test_file_path), f"Input file path {test_file_path} not found"
        logger.info(f"Creating features from dataset file at {test_file_path}")
        with open(train_file_path, encoding="utf-8") as f:
            train_lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        with open(test_file_path, encoding="utf-8") as f:
            test_lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        "..."
        lines = train_lines + test_lines
        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        return self.examples[i]

The dataset only needs to return "input_ids", so each line is tokenized and wrapped into a dict keyed by "input_ids".
Including test_lines in pretraining does improve the score, but it would not reflect a realistic production setup. A sketch of how the dataset and collator fit together follows.
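
As a sketch of how these pieces interact at train time (Trainer does this internally; shown here with a plain DataLoader, assuming dataset and data_collator from the script):

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=data_collator)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # (4, padded_seq_len)
print(batch["labels"].shape)     # same shape; -100 everywhere except the masked positions
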
Next, the training arguments:

training_args = TrainingArguments(
    output_dir='record',
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    save_steps=save_steps,
    logging_steps=500,
    save_total_limit=5,
    prediction_loss_only=True,
    seed=seed
)

TrainingArguments bundles all training hyperparameters into a single object; printing it gives:

training_args = 
output_dir:record
overwrite_output_dir:False
do_train:False
do_eval:False
do_predict:False
evaluation_strategy:IntervalStrategy.NO
prediction_loss_only:True
per_device_train_batch_size:32
per_device_eval_batch_size:8
per_gpu_train_batch_size:None
per_gpu_eval_batch_size:None
gradient_accumulation_steps:1
eval_accumulation_steps:None
learning_rate:6e-05
swa_learning_rate:2e-05
swa_steps:500
weight_decay:0.0
adam_beta1:0.9
adam_beta2:0.999
adam_epsilon:1e-08
max_grad_norm:1.0
num_train_epochs:1
max_steps:-1
lr_scheduler_type:SchedulerType.LINEAR
warmup_ratio:0.0
warmup_steps:0
logging_dir:runs/Jul06_14-21-19_xiaoguzai-MS-7B93
logging_strategy:IntervalStrategy.STEPS
logging_first_step:False
logging_steps:500
save_strategy:IntervalStrategy.STEPS
save_steps:5000
save_total_limit:5
no_cuda:False
seed:2021
fp16:False
fp16_opt_level:O1
fp16_backend:auto
fp16_full_eval:False
local_rank:-1
tpu_num_cores:None
tpu_metrics_debug:False
debug:[]
dataloader_drop_last:False
eval_steps:500
dataloader_num_workers:0
past_index:-1
run_name:record
disable_tqdm:False
remove_unused_columns:True
label_names:None
load_best_model_at_end:False
metric_for_best_model:None
greater_is_better:None
ignore_data_skip:False
sharded_ddp:[]
deepspeed:None
label_smoothing_factor:0.0
adafactor:False
group_by_length:False
length_column_name:length
report_to:['tensorboard']
ddp_find_unused_parameters:None
dataloader_pin_memory:True
skip_memory_metrics:True
use_legacy_prediction_loop:False
push_to_hub:False
resume_from_checkpoint:None
log_on_each_node:True
mp_parameters:
_n_gpu:1
__cached__setup_devices:cuda:0

Checking the TrainingArguments class definition confirms that these fields are declared there with exactly these defaults, so the full dump is not repeated here.


There are also a few property accesses traced while Trainer is initialized:

Trainer __init__
TrainingArguments place_model_on_device
TrainingArguments device
Trainer add_callback
TrainingArguments process_index

and more during trainer.train():

begin Trainer2
TrainingArguments world_size
TrainingArguments train_batch_size
TrainingArguments n_gpu
TrainingArguments n_gpu
TrainingArguments train_batch_size
TrainingArguments n_gpu
TrainingArguments process_index
TrainingArguments device
TrainingArguments to_json_string
TrainingArguments to_dict
TrainingArguments to_sanitized_dict
TrainingArguments to_dict
TrainingArguments train_batch_size
TrainingArguments n_gpu
TrainingArguments eval_batch_size
TrainingArguments n_gpu

Most of these are GPU/device bookkeeping, so I won't dig deeper here; it is enough to remember which arguments were configured:

output_dir='record',
num_train_epochs=num_train_epochs,
learning_rate=learning_rate,
per_device_train_batch_size=batch_size,
save_steps=save_steps,
logging_steps=500,
save_total_limit=5,
prediction_loss_only=True,
seed=seed
