PyTorch BERT 源码解读

Bert 源码解读

从 CV 转 NLP 有小半年了,一直在用 BERT 系列模型做一些自然语言理解任务,包括命名实体识别、关系抽取、文本结构化等信息抽取任务。之前开发任务紧,BERT 拿来就用了,很多代码只知其然而不知其所以然。今日好好读了一下 PyTorch 版 BERT 的源码,并逐行做了注释记录,遂开辟“NLP 菜鸟系列”,随缘更新一些文章,供基础玩家阅读与学习。

耐心读下来,整个流程可以轻松读懂,源码中一些不常用的函数已经去掉~

BertModel

import torch
import math
import torch.nn as nn
from transformers import BertPreTrainedModel

class BertModel(BertPreTrainedModel):
    """Bare BERT model: embeddings -> encoder stack -> pooler.

    Built on ``BertPreTrainedModel``; according to the original author's
    notes that base class carries the pretrained-weight locations, the
    weight-initialisation routine and the config class.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config  # BERT configuration object

        self.embeddings = BertEmbeddings(config)  # initial embedding of the input ids
        self.encoder = BertEncoder(config)  # stack of config.num_hidden_layers BertLayer modules
        self.pooler = BertPooler(config)  # pooled first-token output, usable e.g. for NSP

        self.init_weights()  # weight initialisation

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
        """Run the full BERT forward pass.

        :param input_ids: vocabulary indices of the tokens, [batch_size, seq_length]
        :param token_type_ids: segment ids (sentence A / sentence B) used by the
            Next Sentence Prediction task; defaults to all zeros
        :param attention_mask: 1 marks real tokens, 0 marks padding; defaults to
            all ones
        :param position_ids: position indices; generated by the embedding layer
            when omitted
        :param head_mask: optional mask for pruning attention heads. A 1-D
            tensor [num_heads] is applied to every layer, a 2-D tensor
            [num_layers, num_heads] is applied per layer. Any non-tensor value
            (e.g. a list with one entry per layer) is passed to the encoder
            unchanged.
        :return: tuple ``(sequence_output, pooled_output, *extras)`` where
            ``sequence_output`` is the last encoder layer's output,
            ``pooled_output`` is the pooler output, and ``extras`` are whatever
            additional items the encoder returns (all hidden states and/or
            attention weights, depending on the config).
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)  # treat every position as a real token
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)  # single-segment input

        param_dtype = next(self.parameters()).dtype  # fp16 compatibility

        # [batch_size, seq_length] -> [batch_size, 1, 1, seq_length] so that the
        # mask broadcasts over heads and query positions inside self-attention.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=param_dtype)
        # Turn the 1/0 mask into an additive one: 0 for real tokens, -10000 for
        # padding, so padded positions get ~0 weight after the softmax.
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        num_layers = self.config.num_hidden_layers
        if head_mask is None:
            head_mask = [None] * num_layers  # one "no mask" entry per encoder layer
        elif isinstance(head_mask, torch.Tensor):
            # Bring the mask to [num_layers, batch(1), num_heads, 1, 1] so that
            # head_mask[i] broadcasts against a layer's attention probabilities
            # instead of collapsing to a per-layer scalar.
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(num_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.to(dtype=param_dtype)

        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]  # last-layer hidden states
        pooled_output = self.pooler(sequence_output)
        # encoder_outputs[1:] holds the optional hidden states / attention weights
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        return outputs

BertEmbeddings

class BertEmbeddings(nn.Module):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout."""

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        # Word embedding table; index 0 is the padding index.
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        # Learned position embedding table with config.max_position_embeddings rows.
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Segment embedding table (the author notes type_vocab_size=2: current
        # sentence / next sentence, as used by the NSP task).
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        """Embed a batch of token ids.

        :param input_ids: vocabulary indices, [batch_size, seq_length]
        :param token_type_ids: segment ids, [batch_size, seq_length]; all zeros
            when omitted
        :param position_ids: position indices, [batch_size, seq_length];
            0..seq_length-1 for every row when omitted
        :return: embeddings of shape [batch_size, seq_length, hidden_size]
        """
        seq_length = input_ids.size(1)
        if position_ids is None:
            # Default positions: 0 .. seq_length-1, repeated for every row.
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            # Default: every token belongs to segment 0.
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

BertEncoder—Bert的核心构件

class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` BertLayer modules applied in sequence."""

    def __init__(self, config):
        super(BertEncoder, self).__init__()
        self.output_attentions = config.output_attentions  # also return each layer's attention weights?
        self.output_hidden_states = config.output_hidden_states  # also return every layer's hidden states?
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Pass ``hidden_states`` through every layer.

        :param hidden_states: input to the first layer
        :param attention_mask: additive attention mask forwarded to every layer
        :param head_mask: optional indexable with one entry per layer
            (``head_mask[i]`` is given to layer ``i``); ``None`` means no head
            masking in any layer
        :return: tuple ``(last_hidden_state, [all_hidden_states], [all_attentions])``;
            the bracketed items are present only when the matching config flag
            is set
        """
        all_hidden_states = ()  # input of every layer plus the final output
        all_attentions = ()  # attention weights of every layer
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # The signature advertises head_mask=None, so do not index into it
            # unless a mask was actually supplied.
            layer_head_mask = None if head_mask is None else head_mask[i]
            layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask)
            hidden_states = layer_outputs[0]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if self.output_hidden_states:
            # Append the output of the last layer as well.
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)

Bert Layer—bert encoder中的每一层,核心的self-attention包含于此

class BertLayer(nn.Module):
    """One encoder layer: attention block, intermediate projection, output block."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)  # self-attention sub-block
        self.intermediate = BertIntermediate(config)  # projection to the larger intermediate size
        self.output = BertOutput(config)  # projection back to the hidden size

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Apply the layer.

        :param hidden_states: layer input (for the first layer this is the
            embedding output)
        :param attention_mask: additive attention mask, forwarded to the
            attention block
        :param head_mask: optional head mask for this layer
        :return: tuple whose first item is the layer output, followed by any
            extra items returned by the attention block (the attention weights
            when they are enabled)
        """
        attn_result, *attn_extras = self.attention(hidden_states, attention_mask, head_mask)
        expanded = self.intermediate(attn_result)
        # The output block receives both the expanded features and the
        # attention result.
        return (self.output(expanded, attn_result), *attn_extras)

BertAttention—self-attention 机制的实现

class BertAttention(nn.Module):
    """Self-attention followed by its output projection block."""

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)  # multi-head self-attention
        self.output = BertSelfOutput(config)  # post-attention projection block

    def forward(self, input_tensor, attention_mask, head_mask=None):
        """Attend over ``input_tensor`` and project the result.

        :param input_tensor: hidden states entering the attention block
        :param attention_mask: additive attention mask
        :param head_mask: optional head mask
        :return: tuple whose first item is the projected attention output,
            followed by any extra items produced by the self-attention module
            (the attention weights when they are enabled)
        """
        context, *extras = self.self(input_tensor, attention_mask, head_mask)
        # The projection block is handed the original input alongside the
        # attention context.
        projected = self.output(context, input_tensor)
        return (projected, *extras)

class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        # Fail early with a clear message: a non-divisible hidden size would
        # otherwise be truncated silently and only blow up later in forward().
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )
        self.output_attentions = config.output_attentions  # also return the attention weights?

        self.num_attention_heads = config.num_attention_heads  # e.g. 12
        self.attention_head_size = config.hidden_size // config.num_attention_heads  # e.g. 768 // 12 = 64
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # equals hidden_size

        # Query / key / value projections, each hidden_size -> all_head_size.
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads.

        [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Compute self-attention over ``hidden_states``.

        :param hidden_states: [batch_size, seq_len, hidden_size]
        :param attention_mask: additive mask that broadcasts against
            [batch_size, num_heads, seq_len, seq_len] (BertModel builds it as
            [batch_size, 1, 1, seq_len])
        :param head_mask: optional multiplicative mask applied to the attention
            probabilities; no head pruning when ``None``
        :return: ``(context_layer, attention_probs)`` when ``output_attentions``
            is set, otherwise ``(context_layer,)``; ``context_layer`` is
            [batch_size, seq_len, all_head_size]
        """
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        # [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Raw scores: [batch, num_heads, seq_len, seq_len]
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        # Scale by sqrt(head_size) before the softmax.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # The additive mask pushes masked key positions towards zero probability.
        attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values: [batch, num_heads, seq_len, head_size]
        context_layer = torch.matmul(attention_probs, value_layer)

        # Back to [batch, seq_len, num_heads, head_size]; view() below needs a
        # contiguous tensor, hence contiguous().
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
        return outputs
class BertSelfOutput(nn.Module):
    """Post-attention block: dense projection, dropout, residual add and LayerNorm."""

    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # hidden_size -> hidden_size
        # Same LayerNorm construction as BertEmbeddings uses.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project the attention context and merge it with the block input.

        :param hidden_states: self-attention context
        :param input_tensor: the tensor that entered the attention block; added
            back as the residual connection
        :return: ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual connection + normalisation; without it input_tensor would be
        # ignored entirely.
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

BertIntermediate—对self-attention的输出结果做更高维度的映射

class BertIntermediate(nn.Module):
    """Feed-forward expansion applied to the attention output.

    Projects from ``config.hidden_size`` up to ``config.intermediate_size``
    (768 -> 3072 in the author's configuration) and applies GELU.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        """Return ``GELU(dense(hidden_states))``."""
        projected = self.dense(hidden_states)
        return self.intermediate_act_fn(projected)

BertOutput 层—bert encoder 中每层layer的输出

class BertOutput(nn.Module):
    """Final block of an encoder layer; its result is the layer's output.

    Projects the intermediate features back to the hidden size, applies
    dropout, adds the residual and normalises.
    """

    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # Same LayerNorm construction as BertEmbeddings uses.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project back to the hidden size and merge with the residual.

        :param hidden_states: intermediate features, last dim ``intermediate_size``
        :param input_tensor: attention-block output, added back as the residual
            connection
        :return: ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual connection + normalisation; without it input_tensor would be
        # ignored entirely.
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

BertPooler—pooler层的输入是encoder最后一层的输出

class BertPooler(nn.Module):
    """Pool a sequence by taking its first token and applying dense + tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return the pooled vector for each sequence.

        :param hidden_states: [batch_size, seq_length, hidden_size]
        :return: ``tanh(dense(hidden_states[:, 0]))``; position 0 is the [CLS]
            token per the original author's note, and the result can feed
            downstream tasks such as NSP
        """
        cls_vector = hidden_states[:, 0]
        return self.activation(self.dense(cls_vector))

这样整个 BERT 的流程便理顺了。主要是由 BertLayer 构建 encoder 的部分比较绕,因为 BertLayer 中又嵌套了 self-attention 机制(BertAttention 内部再调用 BertSelfAttention 和 BertSelfOutput)。

你可能感兴趣的:(个人笔记,深度学习,NLP,深度学习,自然语言处理)