Notes on the pytorch_bert_pretrained code

Configuration file

BERT base

{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
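
As a quick sanity check on this configuration, the per-head size can be derived directly from the JSON. The sketch below only uses the standard library; the file name bert_config.json is an assumption, the real checkpoint ships its own config file.

import json

# Hypothetical file name holding the JSON above.
with open("bert_config.json") as f:
    config = json.load(f)

# hidden_size must be divisible by num_attention_heads (BertSelfAttention checks this again below).
head_size = config["hidden_size"] // config["num_attention_heads"]
print(head_size)  # 768 / 12 = 64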

Input processing

The input is the sum of three embeddings: word_embeddings, position_embeddings, and token_type_embeddings.

class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        # word embedding, shape (V, H)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        # position embedding, shape (P, H)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # token type (segment) embedding, shape (2, H)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long,
                                    device=input_ids.device)  # position indices (0, 1, 2, ..., seq_length-1)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # expand position_ids to the shape of input_ids, (b, seq_length)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)  # (B, seq_length, H)
        position_embeddings = self.position_embeddings(position_ids)  # (B, seq_length, H)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)  # (B, seq_length, H)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
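
To make the shapes concrete, here is a minimal standalone sketch of the same three-embedding sum using plain nn.Embedding layers and toy input sizes (not tied to the class above):

import torch
import torch.nn as nn

vocab_size, max_pos, type_vocab, hidden = 30522, 512, 2, 768
word_emb = nn.Embedding(vocab_size, hidden, padding_idx=0)
pos_emb = nn.Embedding(max_pos, hidden)
type_emb = nn.Embedding(type_vocab, hidden)

input_ids = torch.randint(0, vocab_size, (2, 8))                  # (b=2, seq_length=8)
position_ids = torch.arange(8).unsqueeze(0).expand_as(input_ids)  # (2, 8)
token_type_ids = torch.zeros_like(input_ids)                      # single-segment input

embeddings = word_emb(input_ids) + pos_emb(position_ids) + type_emb(token_type_ids)
print(embeddings.shape)  # torch.Size([2, 8, 768])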

Self-attention module. In BERT the hidden size equals the number of heads times the head size. The self-attention layer's input and output have the same shape, (b, seq_len, h).

class BertSelfAttention(nn.Module):
    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:  # hidden size must be a multiple of the number of attention heads
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.output_attentions = output_attentions
        self.keep_multihead_output = keep_multihead_output
        self.multihead_output = None

        self.num_attention_heads = config.num_attention_heads  # number of attention heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # size of each attention head
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # total size across all heads

        # query, key and value projections have the same shape; 3*h*h parameters in total
        self.query = nn.Linear(config.hidden_size, self.all_head_size)  # in BERT the hidden size equals all_head_size
        self.key = nn.Linear(config.hidden_size, self.all_head_size) 
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads, self.attention_head_size)  # (b, seq_len, head_num, head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)  # (b, head_num, seq_len, head_size)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        mixed_query_layer = self.query(hidden_states)  # (b, seq_len, h)
        mixed_key_layer = self.key(hidden_states)  # (b, seq_len, h)
        mixed_value_layer = self.value(hidden_states)  # (b, seq_len, h)

        query_layer = self.transpose_for_scores(mixed_query_layer)  # (b, head_num, seq_len, head_size)
        key_layer = self.transpose_for_scores(mixed_key_layer)  # (b, head_num, seq_len, head_size)
        value_layer = self.transpose_for_scores(mixed_value_layer)  # (b, head_num, seq_len, head_size)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        # (b, head_num, seq_len, head_size)x(b, head_num, head_size, seq_len)=(b, head_num, seq_len, seq_len)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1,
                                                                         -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel forward() function).
        # The mask is added: positions that are not masked get 0, masked positions get -10000,
        # so after the softmax the masked positions end up with probability ~0.
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        # (b, head_num, seq_len, seq_len)
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        # shape (b, num_head, seq_len, seq_len)
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        # head_mask is either 0 or 1, broadcast against the (b, num_head, seq_len, seq_len) attention probabilities
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)  # (b, head_num, seq_len, head_size)
        if self.keep_multihead_output:
            self.multihead_output = context_layer
            self.multihead_output.retain_grad()

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # (b, seq_len, head_num, head_size)
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)  # (b, seq_len, h)
        if self.output_attentions:
            return attention_probs, context_layer
        return context_layer
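
The additive attention_mask used above is precomputed in BertModel.forward() before it reaches this layer. A minimal sketch of that precomputation, following the comment above (0 for positions to keep, -10000 for padding):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])          # (b, seq_len); 1 = real token, 0 = padding
extended_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # (b, 1, 1, seq_len), broadcast over heads and query positions
extended_mask = extended_mask.float()
extended_mask = (1.0 - extended_mask) * -10000.0          # 0 where kept, -10000 where masked
# extended_mask is then added to attention_scores before the softmax.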

Output sub-layer applied right after self-attention; it adds a shortcut (residual) connection and a LayerNorm.

class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

Layer Norm


class BertLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        """Construct a layernorm module in the TF style (epsilon inside the square root).
        """
        super(BertLayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    # input shape (b, seq_len, h)
    def forward(self, x):
        u = x.mean(-1, keepdim=True)  # (b, seq_len, 1)
        s = (x - u).pow(2).mean(-1, keepdim=True)  # (b, seq_len, 1)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias
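
Numerically this is the same normalization as torch.nn.LayerNorm over the last dimension (biased variance, epsilon inside the square root). A quick check, assuming the BertLayerNorm class above is in scope:

import torch
import torch.nn as nn

x = torch.randn(2, 5, 768)
custom = BertLayerNorm(768, eps=1e-12)
builtin = nn.LayerNorm(768, eps=1e-12)
# Both start with weight = 1 and bias = 0, so the outputs should agree up to floating-point error.
print(torch.allclose(custom(x), builtin(x), atol=1e-5))  # True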

Intermediate (feed-forward) linear layer

class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
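
ACT2FN maps the "gelu" string from the config to the repo's GELU implementation. As a rough standalone sketch (assuming the exact erf-based form; the repo may also provide other activations such as relu and swish):

import math
import torch

def gelu(x):
    # Exact GELU via the error function.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

print(gelu(torch.tensor([-1.0, 0.0, 1.0])))  # tensor([-0.1587, 0.0000, 0.8413])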

Output layer of the feed-forward block, again with a shortcut (residual) connection and LayerNorm.

class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

A single encoder layer

class BertLayer(nn.Module):
    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
        super(BertLayer, self).__init__()
        self.output_attentions = output_attentions
        self.attention = BertAttention(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)  # attention sub-layer
        self.intermediate = BertIntermediate(config)  # intermediate (feed-forward) sub-layer
        self.output = BertOutput(config)  # output sub-layer

    def forward(self, hidden_states, attention_mask, head_mask=None):
        attention_output = self.attention(hidden_states, attention_mask, head_mask)
        if self.output_attentions:
            attentions, attention_output = attention_output
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        if self.output_attentions:
            return attentions, layer_output
        return layer_output

The encoder stack: num_hidden_layers copies of BertLayer (12 for BERT base)

class BertEncoder(nn.Module):
    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
        super(BertEncoder, self).__init__()
        self.output_attentions = output_attentions
        layer = BertLayer(config, output_attentions=output_attentions,
                          keep_multihead_output=keep_multihead_output)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None):
        """

        :param hidden_states: (b, seq_len, h)
        :param attention_mask: [batch_size, num_heads, from_seq_length, to_seq_length]
        :param output_all_encoded_layers:
        :param head_mask:
        :return:
        """
        all_encoder_layers = []
        all_attentions = []
        for i, layer_module in enumerate(self.layer):
            hidden_states = layer_module(hidden_states, attention_mask, head_mask[i])
            if self.output_attentions:
                attentions, hidden_states = hidden_states
                all_attentions.append(attentions)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        if not output_all_encoded_layers:
            all_encoder_layers.append(hidden_states)
        if self.output_attentions:
            return all_attentions, all_encoder_layers
        return all_encoder_layers
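
Note that this forward indexes head_mask[i] per layer, so a caller that does not want head masking still has to pass a list with one entry per layer (this preparation normally happens in BertModel.forward()). A minimal sketch of that assumption:

# One entry per layer; None means "do not mask any heads in this layer".
num_hidden_layers = 12
head_mask = [None] * num_hidden_layers
# encoder(hidden_states, extended_attention_mask, head_mask=head_mask)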

For classification tasks, the representation of the first token [CLS] is used for the final prediction. This [CLS] representation continues to be trained during fine-tuning.

class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    # hidden_states.shape = [batch_size, seq_len, hidden_size]
    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
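
A minimal usage sketch of the pooling step, with toy tensors and a SimpleNamespace standing in for the real BertConfig (an assumption for illustration only):

import torch
from types import SimpleNamespace

pooler = BertPooler(SimpleNamespace(hidden_size=768))  # minimal stand-in for the config object
sequence_output = torch.randn(4, 128, 768)             # (batch_size, seq_len, hidden_size)
pooled = pooler(sequence_output)                       # uses hidden_states[:, 0], i.e. the [CLS] position
print(pooled.shape)                                    # torch.Size([4, 768])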

Pre-training class. The loss is the sum of two parts: the (mean) masked language modeling loss and the (mean) next sentence prediction loss. For the MLM part, only the masked positions carry meaningful labels; every other position is set to -1 and is ignored when the loss is computed.

class BertForPreTraining(BertPreTrainedModel):
    """BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
            is only computed for the labels set in [0, ..., vocab_size]
        `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
            with indices selected in [0, 1].
            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

    Outputs:
        if `masked_lm_labels` and `next_sentence_label` are not `None`:
            Outputs the total_loss which is the sum of the masked language modeling loss and the next
            sentence classification loss.
        if `masked_lm_labels` or `next_sentence_label` is `None`:
            Outputs a tuple comprising
            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
            - the next sentence classification logits of shape [batch_size, 2].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForPreTraining(config)
    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
        super(BertForPreTraining, self).__init__(config)
        self.output_attentions = output_attentions
        self.bert = BertModel(config, output_attentions=output_attentions,
                              keep_multihead_output=keep_multihead_output)
        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                next_sentence_label=None, head_mask=None):
        outputs = self.bert(input_ids, token_type_ids, attention_mask,
                            output_all_encoded_layers=False, head_mask=head_mask)
        if self.output_attentions:
            all_attentions, sequence_output, pooled_output = outputs
        else:
            sequence_output, pooled_output = outputs

        # sequence_output.shape = [b, seq_len, hidden_size]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        if masked_lm_labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            # shape = [ *, vocab_size]
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss
            return total_loss
        elif self.output_attentions:
            return all_attentions, prediction_scores, seq_relationship_score
        return prediction_scores, seq_relationship_score
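
The -1 labels mentioned above are dropped by CrossEntropyLoss(ignore_index=-1); a small self-contained illustration of that behavior, using a toy vocabulary:

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
prediction_scores = torch.randn(2, 4, vocab_size)  # (b, seq_len, vocab_size)
masked_lm_labels = torch.tensor([[-1, 3, -1, -1],
                                 [-1, -1, 7, -1]])  # only the masked positions carry real labels

loss_fct = CrossEntropyLoss(ignore_index=-1)
# Flattened exactly as in BertForPreTraining; positions labeled -1 contribute nothing to the mean.
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size), masked_lm_labels.view(-1))
print(masked_lm_loss)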
