从 CV 转 NLP 有小半年了,一直在用 BERT 系列模型做一些自然语言理解任务,包括命名实体识别、关系抽取、文本结构化等信息抽取任务。之前开发任务紧,BERT 拿来就用了,很多代码只知其然而不知其所以然。今日好好读了一下 BERT PyTorch 版本的源码,并逐行做了注释记录,遂开辟 NLP 菜鸟系列,随缘更新一些文章,供基础玩家阅读与学习。
耐心读下来,整个流程可以轻松读懂,源码中一些不常用的函数已经去掉~
import torch
import math
import torch.nn as nn
from transformers import BertPreTrainedModel
class BertModel(BertPreTrainedModel):
    """Bare BERT model: embeddings -> stacked encoder layers -> pooler.

    Derives from ``BertPreTrainedModel`` (imported from ``transformers``), which
    is relied on here for ``init_weights()`` and for ``parameters()``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Turns token / position / segment ids into the initial hidden states.
        self.embeddings = BertEmbeddings(config)
        # Stack of ``config.num_hidden_layers`` BertLayer modules.
        self.encoder = BertEncoder(config)
        # Projects the first token's hidden state (used e.g. for NSP-style heads).
        self.pooler = BertPooler(config)
        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
        """Run the full model.

        :param input_ids: vocabulary indices of the tokens, [batch_size, seq_length]
        :param token_type_ids: segment ids (same shape as ``input_ids``);
            defaults to all zeros
        :param attention_mask: 1 for real tokens, 0 for padding (same shape as
            ``input_ids``); defaults to all ones
        :param position_ids: position indices; built by the embedding layer when None
        :param head_mask: optional mask over attention heads. A 1-D tensor
            ``[num_heads]`` or a 2-D tensor ``[num_layers, num_heads]`` is
            expanded to one broadcastable mask per layer; any other value is
            handed to the encoder unchanged and must be indexable by layer.
        :return: tuple ``(sequence_output, pooled_output, *extras)`` where
            ``sequence_output`` is the last layer's hidden states,
            ``pooled_output`` is the pooler output, and ``extras`` are whatever
            additional items the encoder returned (hidden states / attentions).
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Match the parameter dtype so the masks also work under fp16.
        param_dtype = next(self.parameters()).dtype

        # [batch, seq] -> [batch, 1, 1, seq] so it broadcasts over heads and
        # query positions when added to the raw attention scores.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=param_dtype)
        # Additive mask: 0 keeps a position, -10000 pushes its softmax weight to ~0.
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        if head_mask is None:
            head_mask = [None] * self.config.num_hidden_layers
        elif torch.is_tensor(head_mask):
            if head_mask.dim() == 1:
                # [heads] -> [layers, 1, heads, 1, 1]
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                # [layers, heads] -> [layers, 1, heads, 1, 1]
                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
            head_mask = head_mask.to(dtype=param_dtype)

        embedding_output = self.embeddings(
            input_ids, position_ids=position_ids, token_type_ids=token_type_ids
        )
        encoder_outputs = self.encoder(
            embedding_output, extended_attention_mask, head_mask=head_mask
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)
        # Append any optional encoder outputs after the two main results.
        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]
        return outputs
class BertEmbeddings(nn.Module):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout."""

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        # Index 0 is the padding index for the word embedding table.
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        """Embed a batch of token ids.

        :param input_ids: vocabulary indices, [batch_size, seq_length]
        :param token_type_ids: segment ids with the same shape as ``input_ids``;
            all zeros when None
        :param position_ids: position indices with the same shape as
            ``input_ids``; ``0 .. seq_length - 1`` for every row when None
        :return: embeddings of shape [batch_size, seq_length, hidden_size]
        """
        seq_length = input_ids.size(1)
        if position_ids is None:
            # One arange row, broadcast (without copying) to every batch row.
            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
#回到embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` BertLayer modules applied in sequence."""

    def __init__(self, config):
        super(BertEncoder, self).__init__()
        # Whether to also return every layer's attention probabilities.
        self.output_attentions = config.output_attentions
        # Whether to also return the hidden states entering/leaving every layer.
        self.output_hidden_states = config.output_hidden_states
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Run all layers.

        :param hidden_states: input to the first layer
        :param attention_mask: mask forwarded unchanged to every layer
        :param head_mask: per-layer head masks, indexable by layer number;
            ``None`` means no head masking in any layer
        :return: tuple ``(last_hidden_state, [all_hidden_states], [all_attentions])``
            where the bracketed items are present only when the corresponding
            ``output_hidden_states`` / ``output_attentions`` flag is set
        """
        if head_mask is None:
            # The declared default must be usable: one "no mask" entry per layer.
            head_mask = [None] * len(self.layer)

        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                # Record the input of layer i (for i == 0 this is the encoder input).
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
            hidden_states = layer_outputs[0]
            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if self.output_hidden_states:
            # Finally record the output of the last layer.
            all_hidden_states = all_hidden_states + (hidden_states,)

        outputs = (hidden_states,)
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states,)
        if self.output_attentions:
            outputs = outputs + (all_attentions,)
        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
#回到 BertModel 中
class BertLayer(nn.Module):
    """One encoder layer: attention block, intermediate projection, output block."""

    def __init__(self, config):
        super().__init__()
        # Attention sub-layer (see BertAttention).
        self.attention = BertAttention(config)
        # Feed-forward expansion (see BertIntermediate).
        self.intermediate = BertIntermediate(config)
        # Feed-forward output block (see BertOutput).
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Apply the layer.

        :param hidden_states: layer input
        :param attention_mask: mask passed through to the attention block
        :param head_mask: optional head mask passed through to the attention block
        :return: tuple whose first item is the layer output, followed by any
            extra items the attention block returned
        """
        attn_result = self.attention(hidden_states, attention_mask, head_mask)
        attended = attn_result[0]
        expanded = self.intermediate(attended)
        layer_output = self.output(expanded, attended)
        # Keep whatever the attention block returned after its main output.
        return (layer_output, *attn_result[1:])
# 回到BertEncoder中
class BertAttention(nn.Module):
    """Attention sub-layer: self-attention followed by its output block."""

    def __init__(self, config):
        super().__init__()
        # Multi-head self-attention (see BertSelfAttention).
        self.self = BertSelfAttention(config)
        # Output block applied to the attention result (see BertSelfOutput).
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask, head_mask=None):
        """Apply self-attention and the output block.

        :param input_tensor: sub-layer input; also handed to the output block
        :param attention_mask: mask passed to the self-attention module
        :param head_mask: optional head mask passed to the self-attention module
        :return: tuple whose first item is the sub-layer output, followed by
            any extra items returned by the self-attention module
        """
        attn_result = self.self(input_tensor, attention_mask, head_mask)
        projected = self.output(attn_result[0], input_tensor)
        return (projected, *attn_result[1:])
# 回到BertLayer中
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            # Without this check the per-head size would be silently truncated
            # and all_head_size would no longer equal hidden_size.
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )
        # Whether forward() also returns the attention probabilities.
        self.output_attentions = config.output_attentions
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        # Equal to hidden_size thanks to the divisibility check above.
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads.

        [batch, seq_len, all_head_size] -> [batch, num_heads, seq_len, head_size]
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask, head_mask=None):
        """Compute self-attention.

        :param hidden_states: [batch_size, seq_len, hidden_size]
        :param attention_mask: additive mask that must broadcast against the
            scores of shape [batch_size, num_heads, seq_len, seq_len]
            (e.g. [batch_size, 1, 1, seq_len])
        :param head_mask: optional multiplicative mask applied to the attention
            probabilities; no head masking when None
        :return: ``(context_layer, attention_probs)`` if ``output_attentions``
            is set, otherwise ``(context_layer,)``; ``context_layer`` has shape
            [batch_size, seq_len, all_head_size]
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Raw scores: [batch, heads, seq_len, seq_len], scaled by sqrt(head_size).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Large negative mask entries drive the softmax weight of those keys to ~0.
        attention_scores = attention_scores + attention_mask

        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of values: [batch, heads, seq_len, head_size].
        context_layer = torch.matmul(attention_probs, value_layer)
        # Back to [batch, seq_len, heads, head_size]; view() below needs contiguous memory.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
        return outputs
# 回到BertAttention中
class BertSelfOutput(nn.Module):
    """Output block of the attention sub-layer: dense -> dropout -> residual + LayerNorm."""

    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Restored: normalises the sum of the projected output and the residual.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project the attention result and merge it with the sub-layer input.

        :param hidden_states: attention output, last dimension ``hidden_size``
        :param input_tensor: input of the attention sub-layer (residual branch),
            same shape as ``hidden_states``
        :return: ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual connection followed by layer normalisation (Add & Norm).
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
# 回到BertAttention中
class BertIntermediate(nn.Module):
    """Feed-forward expansion: hidden_size -> intermediate_size followed by GELU."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        """Project to the intermediate width and apply the activation.

        :param hidden_states: tensor whose last dimension is ``hidden_size``
        :return: tensor whose last dimension is ``intermediate_size``
        """
        projected = self.dense(hidden_states)
        return self.intermediate_act_fn(projected)
# 回到BertLayer中
class BertOutput(nn.Module):
    """Output block of the feed-forward sub-layer: dense -> dropout -> residual + LayerNorm.

    Its result is the output of one encoder layer.
    """

    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # Restored: normalises the sum of the projected output and the residual.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project back to ``hidden_size`` and merge with the residual branch.

        :param hidden_states: intermediate activations, last dimension
            ``intermediate_size``
        :param input_tensor: residual input, last dimension ``hidden_size``
        :return: ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        # Residual connection followed by layer normalisation (Add & Norm).
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
# 回到BertLayer中
class BertPooler(nn.Module):
    """Pool a sequence by transforming the hidden state of its first token."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(h[:, 0]))``.

        :param hidden_states: [batch_size, seq_length, hidden_size]
        :return: [batch_size, hidden_size] vector built from the first token
            of every sequence (the [CLS] position in BERT inputs)
        """
        cls_vector = hidden_states[:, 0]
        return self.activation(self.dense(cls_vector))
这样整个 BERT 的流程便梳理清楚了。其中 BertLayer 构建 encoder 的部分比较绕,因为 BertLayer 中又嵌套了 self-attention 机制。