BERT-base configuration
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"type_vocab_size": 2,
"vocab_size": 30522
}
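As a quick sanity check of the config above, the per-head size, the feed-forward expansion factor, and the embedding parameter count all fall out of these numbers. A standalone sketch in plain Python; the dict simply mirrors the JSON.

```python
# Derived sizes implied by the BERT-base config (plain Python, no dependencies).
config = {
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "max_position_embeddings": 512,
    "vocab_size": 30522,
    "type_vocab_size": 2,
}

head_size = config["hidden_size"] // config["num_attention_heads"]   # 768 / 12 = 64
ffn_ratio = config["intermediate_size"] // config["hidden_size"]     # 3072 / 768 = 4
emb_params = (config["vocab_size"] + config["max_position_embeddings"]
              + config["type_vocab_size"]) * config["hidden_size"]   # all three embedding tables
print(head_size, ffn_ratio, emb_params)
```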
Three embeddings: word_embedding, position_embedding, and token_type_embedding.
class BertEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
super(BertEmbeddings, self).__init__()
        # word embeddings, shape (vocab_size, hidden_size)
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        # position embeddings, shape (max_position_embeddings, hidden_size)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # token type (segment) embeddings, shape (type_vocab_size, hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long,
                                    device=input_ids.device)  # positions 0, 1, 2, ..., seq_length-1
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # expand position_ids to the shape of input_ids, (b, seq_length)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids) # (B, seq_length, H)
position_embeddings = self.position_embeddings(position_ids) # (B, seq_length, H)
token_type_embeddings = self.token_type_embeddings(token_type_ids) # (B, seq_length, H)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
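A standalone sketch of the same three-embedding sum with small made-up sizes, just to make the shapes concrete; this is not the class above, only the mechanism it implements.

```python
import torch
import torch.nn as nn

# Word + position + token-type embeddings, summed elementwise (toy sizes).
V, P, T, H = 100, 16, 2, 8          # vocab, max positions, token types, hidden
word_emb = nn.Embedding(V, H, padding_idx=0)
pos_emb = nn.Embedding(P, H)
type_emb = nn.Embedding(T, H)

input_ids = torch.tensor([[5, 7, 9, 0]])                      # (b=1, seq_len=4)
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)   # (1, 4): 0, 1, 2, 3
token_type_ids = torch.zeros_like(input_ids)                  # all "sentence A"

out = word_emb(input_ids) + pos_emb(position_ids) + type_emb(token_type_ids)
print(out.shape)  # torch.Size([1, 4, 8]) -> (b, seq_len, H)
```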
Self-attention module. In BERT the hidden size equals the number of heads times the head size (768 = 12 × 64 for BERT-base). The input and output of the self-attention layer have the same shape, (b, seq_len, h).
class BertSelfAttention(nn.Module):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:  # the hidden size must be a multiple of the number of attention heads
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.output_attentions = output_attentions
self.keep_multihead_output = keep_multihead_output
self.multihead_output = None
        self.num_attention_heads = config.num_attention_heads  # number of attention heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # size of each attention head
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # total size across all heads
        # query, key and value projections have the same size; 3 * h * h weight parameters in total
        self.query = nn.Linear(config.hidden_size, self.all_head_size)  # in BERT, all_head_size equals the hidden size
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads, self.attention_head_size)  # (b, seq_len, num_heads, head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)  # (b, num_heads, seq_len, head_size)
def forward(self, hidden_states, attention_mask, head_mask=None):
mixed_query_layer = self.query(hidden_states) # (b, seq_len, h)
mixed_key_layer = self.key(hidden_states) # (b, seq_len, h)
mixed_value_layer = self.value(hidden_states) # (b, seq_len, h)
query_layer = self.transpose_for_scores(mixed_query_layer) # (b, head_num, seq_len, head_size)
key_layer = self.transpose_for_scores(mixed_key_layer) # (b, head_num, seq_len, head_size)
value_layer = self.transpose_for_scores(mixed_value_layer) # (b, head_num, seq_len, head_size)
# Take the dot product between "query" and "key" to get the raw attention scores.
# (b, head_num, seq_len, head_size)x(b, head_num, head_size, seq_len)=(b, head_num, seq_len, seq_len)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in BertModel's forward() function)
        # The mask is additive: non-masked positions get 0 and masked positions get -10000, so after the softmax the masked positions end up with weight ~0.
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
# (b, head_num, seq_len, seq_len)
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
        # shape (b, num_heads, seq_len, seq_len)
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
        # head_mask is broadcastable to (b, num_heads, seq_len, seq_len), with entries that are either 0 or 1
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer) # (b, head_num, seq_len, head_size)
if self.keep_multihead_output:
self.multihead_output = context_layer
self.multihead_output.retain_grad()
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # (b, seq_len, head_num, head_size)
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape) # (b, seq_len, h)
if self.output_attentions:
return attention_probs, context_layer
return context_layer
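The additive attention mask deserves a concrete look. Below is a minimal sketch of how such a mask is typically built from a 0/1 padding mask (in this codebase it is precomputed once in BertModel's forward and broadcast); the tensors here are made up.

```python
import torch

# Padding positions get a large negative value, so their softmax weight is ~0.
attention_mask = torch.tensor([[1, 1, 1, 0]])                # 1 = real token, 0 = padding
extended_mask = attention_mask[:, None, None, :].float()     # (b, 1, 1, seq_len)
extended_mask = (1.0 - extended_mask) * -10000.0             # 0 for real tokens, -10000 for padding

scores = torch.randn(1, 12, 4, 4)                            # (b, num_heads, seq_len, seq_len)
probs = torch.softmax(scores + extended_mask, dim=-1)
print(probs[0, 0, 0])                                        # last entry ~0 (the padded position)
```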
Output sub-layer applied after self-attention; it adds a shortcut (residual) connection and LayerNorm.
class BertSelfOutput(nn.Module):
def __init__(self, config):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
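A minimal "Add & Norm" sketch of the same pattern (projection, residual add, LayerNorm), using nn.LayerNorm as a stand-in for BertLayerNorm and omitting dropout; sizes are made up.

```python
import torch
import torch.nn as nn

H = 8
dense = nn.Linear(H, H)
layer_norm = nn.LayerNorm(H, eps=1e-12)   # stand-in for BertLayerNorm

input_tensor = torch.randn(2, 4, H)       # (b, seq_len, H) going into the sub-layer
hidden_states = torch.randn(2, 4, H)      # sub-layer output of the same shape
out = layer_norm(dense(hidden_states) + input_tensor)   # project, add residual, normalize
print(out.shape)                          # torch.Size([2, 4, 8]) -- shape is preserved
```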
Layer Norm
class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
"""Construct a layernorm module in the TF style (epsilon inside the square root).
"""
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
    # input shape is (b, seq_len, h)
def forward(self, x):
u = x.mean(-1, keepdim=True) # (b, seq_len, 1)
s = (x - u).pow(2).mean(-1, keepdim=True) # (b, seq_len, 1)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
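Since the epsilon sits inside the square root and normalization runs over the last dimension, this should match torch.nn.LayerNorm with the same eps; a quick numeric check (a sketch, not part of the original code).

```python
import torch
import torch.nn as nn

x = torch.randn(2, 4, 8)                          # (b, seq_len, h)
ln = nn.LayerNorm(8, eps=1e-12)                   # weight=1, bias=0 by default

u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
manual = (x - u) / torch.sqrt(s + 1e-12)          # same formula as BertLayerNorm.forward
print(torch.allclose(ln(x), manual, atol=1e-6))   # True
```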
Intermediate (feed-forward) linear layer
class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
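ACT2FN maps the config string "gelu" to a GELU implementation; the erf-based form below is a sketch of what that activation computes (torch.nn.functional.gelu uses the same exact formula by default).

```python
import math
import torch

def gelu(x):
    # erf-based GELU: x * Phi(x), where Phi is the standard normal CDF
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

x = torch.randn(2, 4, 3072)   # (b, seq_len, intermediate_size)
print(torch.allclose(gelu(x), torch.nn.functional.gelu(x), atol=1e-6))  # True
```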
Output layer of the feed-forward block, again with a shortcut (residual) connection and LayerNorm.
class BertOutput(nn.Module):
def __init__(self, config):
super(BertOutput, self).__init__()
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
A single encoder layer
class BertLayer(nn.Module):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertLayer, self).__init__()
self.output_attentions = output_attentions
        self.attention = BertAttention(config, output_attentions=output_attentions,
                                       keep_multihead_output=keep_multihead_output)  # attention sub-layer
        self.intermediate = BertIntermediate(config)  # intermediate (feed-forward) layer
        self.output = BertOutput(config)  # output sub-layer
def forward(self, hidden_states, attention_mask, head_mask=None):
attention_output = self.attention(hidden_states, attention_mask, head_mask)
if self.output_attentions:
attentions, attention_output = attention_output
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
if self.output_attentions:
return attentions, layer_output
return layer_output
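To see the whole layer at a glance, here is a minimal post-LN encoder layer with the same structure (self-attention, Add & Norm, feed-forward, Add & Norm), using nn.MultiheadAttention in place of BertSelfAttention for brevity; it assumes a PyTorch version with batch_first support.

```python
import torch
import torch.nn as nn

class TinyEncoderLayer(nn.Module):
    def __init__(self, hidden=768, heads=12, intermediate=3072):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.norm1 = nn.LayerNorm(hidden, eps=1e-12)
        self.ffn = nn.Sequential(nn.Linear(hidden, intermediate), nn.GELU(),
                                 nn.Linear(intermediate, hidden))
        self.norm2 = nn.LayerNorm(hidden, eps=1e-12)

    def forward(self, x, key_padding_mask=None):
        attn_out, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.norm1(x + attn_out)          # Add & Norm after attention
        x = self.norm2(x + self.ffn(x))       # Add & Norm after the feed-forward
        return x

layer = TinyEncoderLayer()
x = torch.randn(2, 16, 768)                   # (b, seq_len, h)
print(layer(x).shape)                         # torch.Size([2, 16, 768])
```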
The stack of 12 encoder layers (num_hidden_layers)
class BertEncoder(nn.Module):
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertEncoder, self).__init__()
self.output_attentions = output_attentions
layer = BertLayer(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, head_mask=None):
"""
:param hidden_states: (b, seq_len, h)
        :param attention_mask: broadcastable to [batch_size, num_heads, from_seq_length, to_seq_length]
:param output_all_encoded_layers:
:param head_mask:
:return:
"""
all_encoder_layers = []
all_attentions = []
for i, layer_module in enumerate(self.layer):
hidden_states = layer_module(hidden_states, attention_mask, head_mask[i])
if self.output_attentions:
attentions, hidden_states = hidden_states
all_attentions.append(attentions)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if self.output_attentions:
return all_attentions, all_encoder_layers
return all_encoder_layers
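A small check that copy.deepcopy in the ModuleList gives each of the num_hidden_layers layers its own parameters rather than sharing weights (illustrated with a plain nn.Linear, not the real BertLayer).

```python
import copy
import torch.nn as nn

layer = nn.Linear(4, 4)
layers = nn.ModuleList([copy.deepcopy(layer) for _ in range(3)])
print(layers[0].weight is layers[1].weight)                         # False -> independent tensors
print(layers[0].weight.data_ptr() == layers[1].weight.data_ptr())   # False -> separate storage
```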
For classification tasks, the representation of the first token, [CLS], is used. This representation is further trained during fine-tuning.
class BertPooler(nn.Module):
def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
# hidden_states.shape = [batch_size, seq_len, hidden_size]
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
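The "pooling" is just slicing out the first ([CLS]) position and passing it through Linear + Tanh; a standalone shape check with made-up sizes.

```python
import torch
import torch.nn as nn

H = 8
dense, act = nn.Linear(H, H), nn.Tanh()
hidden_states = torch.randn(2, 5, H)        # (b, seq_len, H) from the last encoder layer
pooled = act(dense(hidden_states[:, 0]))    # (b, H): one vector per sequence
print(pooled.shape)                         # torch.Size([2, 8])
```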
Pre-training model. The loss has two parts: the (averaged) masked language model loss and the (averaged) next sentence prediction loss. For the masked LM labels, only the masked positions carry real label values; every other position is set to -1 and is ignored when the loss is computed.
class BertForPreTraining(BertPreTrainedModel):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.
Params:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, output_attentions=False, keep_multihead_output=False):
super(BertForPreTraining, self).__init__(config)
self.output_attentions = output_attentions
self.bert = BertModel(config, output_attentions=output_attentions,
keep_multihead_output=keep_multihead_output)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_bert_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
next_sentence_label=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask,
output_all_encoded_layers=False, head_mask=head_mask)
if self.output_attentions:
all_attentions, sequence_output, pooled_output = outputs
else:
sequence_output, pooled_output = outputs
# sequence_output.shape = [b, seq_len, hidden_size]
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
if masked_lm_labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
# shape = [ *, vocab_size]
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss
return total_loss
elif self.output_attentions:
return all_attentions, prediction_scores, seq_relationship_score
return prediction_scores, seq_relationship_score
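A sketch of how ignore_index=-1 makes the masked-LM loss average only over the masked positions (toy vocabulary and random logits; not the author's code).

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 11
prediction_scores = torch.randn(2, 4, vocab_size)         # (b, seq_len, vocab_size) logits
masked_lm_labels = torch.tensor([[-1, 7, -1, -1],
                                 [-1, -1, 3, -1]])         # only two positions are masked

loss_fct = CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size),
                          masked_lm_labels.view(-1))       # averaged over the 2 masked positions only
print(masked_lm_loss)
```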