bert4keras is Su Jianlin (苏剑林)'s Keras wrapper around BERT, which makes BERT faster and friendlier to use.
bert4keras repository: https://github.com/bojone/bert4keras
This article walks through the source code under bert4keras/bert4keras/ in that repository.
Personally, I think understanding the source code is very important, mainly for two reasons:
Before digging into the bert4keras source, you should have the following background knowledge:
The overall architecture of bert4keras consists of six code files. The main file is models.py; all the other files either serve the main file or support modeling for downstream tasks.
models.py: the main file. Its core is the Transformer class; everything else subclasses Transformer to implement a specific algorithm, covering BERT, ALBERT, NEZHA, ELECTRA, GPT2_ML, and T5, along with their optimizations.
layers.py: implements the various functional layers, analogous to Keras layers, including Embedding, MultiHeadAttention (multi-head attention), LayerNormalization, PositionEmbedding (position encoding), RelativePositionEmbedding (relative position encoding), FeedForward, and so on.
This article starts with the main file, models.py.
The Transformer class lays out the overall skeleton of pretrained models like BERT, implemented mainly in two functions: build and call.
It covers three major parts (inside the build function): Input (the inputs), self.call (the computation flow), and Model (the model construction).
(The original post illustrates this implementation framework with a figure; a minimal code sketch of the same skeleton follows.)
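To make that concrete, here is a minimal sketch of the build/call skeleton in the same spirit (my own simplified paraphrase for orientation, not the actual models.py code; the method names follow this article's summary):

# A simplified sketch of the Transformer build/call skeleton described above.
# NOT the real bert4keras implementation; attribute names are assumptions.
from tensorflow import keras

class TransformerSketch(object):
    def build(self):
        inputs = self.get_inputs()   # 1. Input: placeholder tensors
        outputs = self.call(inputs)  # 2. self.call: the computation flow
        self.model = keras.models.Model(inputs, outputs)  # 3. Model: wrap it up

    def call(self, inputs):
        x = self.apply_embeddings(inputs)  # token/segment/position embeddings
        for index in range(self.num_hidden_layers):
            x = self.apply_main_layers(x, index)  # one Transformer block per pass
        return self.apply_final_layers(x)  # pool / NSP / MLM heads as configured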
The clever use of the apply function: calling a layer through apply in the Transformer class automatically reuses layers of the same name. Every layer that passes through is stored in the self.layers dict, and a layer whose name has been seen before is reused rather than rebuilt. This is convenient in itself, and it also makes parameter sharing easy to implement later for ALBERT.
def apply(self, inputs, layer=None, arguments=None, **kwargs):
    """Calling a layer through apply automatically reuses layers of the same name.
    inputs: output of the previous layer;
    layer: the layer class to call;
    arguments: parameters passed to layer.call;
    kwargs: parameters passed to the layer's constructor.
    """
    if layer is Dropout and self.dropout_rate == 0:
        return inputs
    arguments = arguments or {}
    name = kwargs.get('name')  # e.g. name='Embedding-Token'
    if name not in self.layers:
        layer = layer(**kwargs)
        name = layer.name
        self.layers[name] = layer  # stored for later reuse
    return self.layers[name](inputs, **arguments)  # see the BERT class below for detailed usage
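To see the name-based reuse in isolation, here is a small self-contained demo of the same pattern in plain tf.keras (my own example, not bert4keras code). The second apply call with the same name returns the layer built by the first call, so both calls share one set of weights, which is exactly the mechanism ALBERT exploits for parameter sharing:

# Standalone demo of apply-style layer reuse keyed by name (illustration only).
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input

class Reuser(object):
    def __init__(self):
        self.layers = {}

    def apply(self, inputs, layer=None, **kwargs):
        name = kwargs.get('name')
        if name not in self.layers:
            self.layers[name] = layer(**kwargs)  # build once, keyed by name
        return self.layers[name](inputs)         # every later call reuses it

r = Reuser()
x_in = Input(shape=(8,))
h = r.apply(x_in, layer=Dense, units=8, name='Shared-Dense')
h = r.apply(h, layer=Dense, units=8, name='Shared-Dense')  # same weights reused
model = keras.models.Model(x_in, h)
print(len(model.layers))  # 2: one Input layer plus one Dense called twice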
Following the logical order of the Transformer class, a summary:
apply_embeddings: turn tokens into embeddings
x embedding --> s embedding --> add x & s --> position embedding --> LN --> dropout
Output shape=(btz, seq_len, hidden_size)
apply_main_layers: the main body of the computation
Looped num_hidden_layers times:
BERT's core block: Att --> Dropout --> Add --> LN --> FFN --> Dropout --> Add --> LN (a numpy shape sketch of the Att step follows below)
Output shape=(btz, seq_len, hidden_size) (the FFN expands to intermediate_size internally, then projects back to hidden_size)
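As a worked example of the shapes inside the Att step, here is a plain numpy sketch of scaled dot-product attention (illustration only, with BERT-base-like sizes as assumptions; the real computation lives in layers.py's MultiHeadAttention):

# Scaled dot-product attention shape walk-through (numpy, illustration only).
import numpy as np

btz, heads, seq_len, head_size = 2, 12, 128, 64  # assumed BERT-base-like sizes
q = np.random.randn(btz, heads, seq_len, head_size)
k = np.random.randn(btz, heads, seq_len, head_size)
v = np.random.randn(btz, heads, seq_len, head_size)

scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(head_size)  # (btz, heads, seq_len, seq_len)
weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)             # softmax over the key axis
out = weights @ v                                          # (btz, heads, seq_len, head_size)
print(out.shape)  # heads are then merged back to (btz, seq_len, heads * head_size)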
apply_final_layers: adapt the output to the downstream task
with_pool: extract the [CLS] vector and use it as the sentence vector, shape=(btz, hidden_size), i.e. (btz, 768) for BERT-base
with_nsp: predict whether the second segment is the next sentence, shape=(btz, 2)
with_mlm: the masked language model (MLM) head, used during pretraining, or whenever you need to predict a masked character or word in a sentence; shape=(btz, seq_len, vocab_size)
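For context, this is roughly how those flags are passed when building a model (a hedged sketch: the paths are placeholders, and depending on the bert4keras version the entry point is build_transformer_model or, in older releases, build_bert_model):

# Usage sketch; config_path/checkpoint_path are placeholder paths.
from bert4keras.models import build_transformer_model

model = build_transformer_model(
    config_path='bert_config.json',     # placeholder
    checkpoint_path='bert_model.ckpt',  # placeholder
    with_mlm=True,                      # adds the 'MLM-Proba' output
)
# with_pool=True would add the 'Pooler-Dense' output, shape=(btz, hidden_size);
# with_nsp=True the 'NSP-Proba' output, shape=(btz, 2).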
The code, with the matrix/tensor operations explained, is annotated in the comments below:
class BERT(Transformer):
    """Build the BERT model.
    """
    def __init__(
        self,
        max_position,     # maximum sequence length
        with_pool=False,  # whether to include the Pooler part
        with_nsp=False,   # whether to include the NSP part
        with_mlm=False,   # whether to include the MLM part
        **kwargs          # remaining arguments
    ):
        super(BERT, self).__init__(**kwargs)
        self.max_position = max_position
        self.with_pool = with_pool
        self.with_nsp = with_nsp
        self.with_mlm = with_mlm
    def get_inputs(self):
        """BERT's inputs are token_ids and segment_ids.
        """
        x_in = Input(shape=(self.sequence_length,), name='Input-Token')    # shape=(btz, seq_len)
        s_in = Input(shape=(self.sequence_length,), name='Input-Segment')  # shape=(btz, seq_len)
        return [x_in, s_in]
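    # An illustrative input pair (standard BERT convention, not bert4keras-specific):
    #   tokens:      [CLS] sent-A ... [SEP] sent-B ... [SEP] [PAD] ...
    #   token_ids:   ids of the tokens above, padded to seq_len   -> 'Input-Token'
    #   segment_ids: 0 for [CLS]/sent-A/first [SEP], 1 for sent-B -> 'Input-Segment'
    # Both therefore have shape (btz, seq_len), as noted in the comments above.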
    def apply_embeddings(self, inputs):
        """BERT's embedding is the sum of token, position, and segment embeddings.
        """
        x, s = inputs  # both x and s have shape=(btz, seq_len)
        z = self.layer_norm_conds[0]  # condition for conditional LayerNorm (None for plain BERT)
        # token embedding
        x = self.apply(
            inputs=x,
            layer=Embedding,
            input_dim=self.vocab_size,
            output_dim=self.embedding_size,
            embeddings_initializer=self.initializer,
            mask_zero=True,
            name='Embedding-Token'
        )  # Four parts: 1) inputs=x is the previous output; 2) layer=Embedding is the layer class;
        # 3) everything between layer and name are the layer's constructor arguments;
        # 4) name is the layer's name, under which it is stored in self.layers.
        # Equivalent to Embedding(input_dim=self.vocab_size, output_dim=self.embedding_size,
        # embeddings_initializer=self.initializer, mask_zero=True)(x); the same holds below.
        # segment embedding
        s = self.apply(
            inputs=s,
            layer=Embedding,
            input_dim=2,
            output_dim=self.embedding_size,
            embeddings_initializer=self.initializer,
            name='Embedding-Segment'
        )  # Embedding(input_dim=2, output_dim=self.embedding_size, embeddings_initializer=self.initializer)(s)
        # x and s are added, not concatenated
        x = self.apply(inputs=[x, s], layer=Add, name='Embedding-Token-Segment')  # Add([x, s])
        # after adding x and s, add the position embedding
        x = self.apply(
            inputs=x,
            layer=PositionEmbedding,
            input_dim=self.max_position,
            output_dim=self.embedding_size,
            merge_mode='add',
            embeddings_initializer=self.initializer,
            name='Embedding-Position'
        )  # PositionEmbedding(input_dim=self.max_position, output_dim=self.embedding_size, merge_mode='add', embeddings_initializer=self.initializer)(x)
        # LayerNorm
        x = self.apply(
            inputs=self.simplify([x, z]),
            layer=LayerNormalization,
            conditional=(z is not None),
            hidden_units=self.layer_norm_conds[1],
            hidden_activation=self.layer_norm_conds[2],
            hidden_initializer=self.initializer,
            name='Embedding-Norm'
        )  # LayerNormalization(conditional=(z is not None), hidden_units=self.layer_norm_conds[1], hidden_activation=self.layer_norm_conds[2], hidden_initializer=self.initializer)(self.simplify([x, z]))
        # dropout
        x = self.apply(
            inputs=x,
            layer=Dropout,
            rate=self.dropout_rate,
            name='Embedding-Dropout'
        )
        # map the output to (btz, seq_len, hidden_size) if needed
        if self.embedding_size != self.hidden_size:
            x = self.apply(
                inputs=x,
                layer=Dense,
                units=self.hidden_size,
                kernel_initializer=self.initializer,
                name='Embedding-Mapping'
            )
        return x
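    # Shape trace through apply_embeddings (assuming embedding_size == hidden_size == 768):
    #   token ids   (btz, seq_len) --Embedding-Token-->   (btz, seq_len, 768)
    #   segment ids (btz, seq_len) --Embedding-Segment--> (btz, seq_len, 768)
    #   Add, Embedding-Position, Embedding-Norm, Embedding-Dropout all preserve
    #   (btz, seq_len, 768), matching the summary earlier in this article.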
    def apply_main_layers(self, inputs, index):
        """BERT's main body is a Self-Attention based block.
        Order: Att --> Add --> LN --> FFN --> Add --> LN
        """
        x = inputs  # shape=(btz, seq_len, hidden_size)
        z = self.layer_norm_conds[0]  # condition for conditional LayerNorm
        attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index  # each encoder block gets its own name, i.e. parameters are NOT shared; blocks map one-to-one onto the original BERT
        feed_forward_name = 'Transformer-%d-FeedForward' % index  # likewise, each block's FeedForward has its own name
        attention_mask = self.compute_attention_mask()  # attention mask (None for plain bidirectional BERT)
        # Self Attention
        xi, x, arguments = x, [x, x, x], {'a_mask': None}  # note x becomes [x, x, x]: fed into multi-head attention as Q, K, V
        if attention_mask is not None:
            arguments['a_mask'] = True
            x.append(attention_mask)
        # multi-head attention
        x = self.apply(
            inputs=x,
            layer=MultiHeadAttention,
            arguments=arguments,
            heads=self.num_attention_heads,
            head_size=self.attention_head_size,
            key_size=self.attention_key_size,
            kernel_initializer=self.initializer,
            name=attention_name
        )  # equivalent to MultiHeadAttention(heads=self.num_attention_heads, head_size=self.attention_head_size, key_size=self.attention_key_size, kernel_initializer=self.initializer)(x, **arguments)
        # every layer below follows the same pattern
        x = self.apply(
            inputs=x,
            layer=Dropout,
            rate=self.dropout_rate,
            name='%s-Dropout' % attention_name
        )
        # residual connection: add the attention output to the original input xi
        x = self.apply(
            inputs=[xi, x], layer=Add, name='%s-Add' % attention_name
        )
        x = self.apply(
            inputs=self.simplify([x, z]),
            layer=LayerNormalization,
            conditional=(z is not None),
            hidden_units=self.layer_norm_conds[1],
            hidden_activation=self.layer_norm_conds[2],
            hidden_initializer=self.initializer,
            name='%s-Norm' % attention_name
        )
        # Feed Forward
        xi = x
        x = self.apply(
            inputs=x,
            layer=FeedForward,
            units=self.intermediate_size,
            activation=self.hidden_act,
            kernel_initializer=self.initializer,
            name=feed_forward_name
        )
        x = self.apply(
            inputs=x,
            layer=Dropout,
            rate=self.dropout_rate,
            name='%s-Dropout' % feed_forward_name
        )
        x = self.apply(
            inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name
        )
        x = self.apply(
            inputs=self.simplify([x, z]),
            layer=LayerNormalization,
            conditional=(z is not None),
            hidden_units=self.layer_norm_conds[1],
            hidden_activation=self.layer_norm_conds[2],
            hidden_initializer=self.initializer,
            name='%s-Norm' % feed_forward_name
        )
        return x  # output shape=(btz, seq_len, hidden_size); the FFN expands to intermediate_size internally, then projects back so the residual Add works
    def apply_final_layers(self, inputs):
        """Determine the outputs from the remaining flags.
        """
        x = inputs  # shape=(btz, seq_len, hidden_size)
        z = self.layer_norm_conds[0]
        outputs = [x]  # outputs is a list; downstream you may use bert.model.output, or bert.model.outputs (which is a list)
        if self.with_pool or self.with_nsp:
            # Pooler part (extract the [CLS] vector)
            x = outputs[0]
            x = self.apply(
                inputs=x,
                layer=Lambda,
                function=lambda x: x[:, 0],
                name='Pooler'
            )  # shape=(btz, hidden_size); this must be a Lambda layer, otherwise Keras cannot track it during training and raises an error
            # dense layer with activation
            pool_activation = 'tanh' if self.with_pool is True else self.with_pool
            x = self.apply(
                inputs=x,
                layer=Dense,
                units=self.hidden_size,
                activation=pool_activation,
                kernel_initializer=self.initializer,
                name='Pooler-Dense'
            )  # shape=(btz, hidden_size)
            if self.with_nsp:
                # Next Sentence Prediction part
                # NSP needs output shape=(btz, 2)
                x = self.apply(
                    inputs=x,
                    layer=Dense,
                    units=2,
                    activation='softmax',
                    kernel_initializer=self.initializer,
                    name='NSP-Proba'
                )
            outputs.append(x)
        if self.with_mlm:
            # Masked Language Model part
            x = outputs[0]
            x = self.apply(
                inputs=x,
                layer=Dense,
                units=self.embedding_size,
                activation=self.hidden_act,
                kernel_initializer=self.initializer,
                name='MLM-Dense'
            )
            x = self.apply(
                inputs=self.simplify([x, z]),
                layer=LayerNormalization,
                conditional=(z is not None),
                hidden_units=self.layer_norm_conds[1],
                hidden_activation=self.layer_norm_conds[2],
                hidden_initializer=self.initializer,
                name='MLM-Norm'
            )
            mlm_activation = 'softmax' if self.with_mlm is True else self.with_mlm
            x = self.apply(
                inputs=x,
                layer=EmbeddingDense,
                embedding_name='Embedding-Token',  # reuses the token-embedding weights as the output projection
                activation=mlm_activation,
                name='MLM-Proba'
            )  # shape=(btz, seq_len, vocab_size)
            outputs.append(x)
        if len(outputs) == 1:
            outputs = outputs[0]   # no extra heads: just the encoder output
        elif len(outputs) == 2:
            outputs = outputs[1]   # a single head: return it alone
        else:
            outputs = outputs[1:]  # multiple heads: return them as a list
        return outputs  # determined by the pool/nsp/mlm flags; with none set, shape=(btz, seq_len, hidden_size)
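A note on the 'MLM-Proba' step: EmbeddingDense (with embedding_name='Embedding-Token') reuses the token-embedding matrix as the output projection, so the MLM head adds no second vocabulary-sized weight matrix. A minimal numpy sketch of that weight-tying idea (toy sizes, illustration only, not the bert4keras implementation):

# Weight tying behind the MLM output projection (numpy, toy sizes).
import numpy as np

vocab_size, embedding_size, seq_len = 100, 16, 8  # toy sizes for illustration
embedding_matrix = np.random.randn(vocab_size, embedding_size)  # 'Embedding-Token' weights

h = np.random.randn(seq_len, embedding_size)  # 'MLM-Norm' output for one sample
logits = h @ embedding_matrix.T               # (seq_len, vocab_size): tied projection
probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)    # the 'softmax' of 'MLM-Proba'
print(probs.shape)                            # (seq_len, vocab_size)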
For more annotated source code, see: ZJJDJJ/bert4keras_note
Hope you enjoy it~
I'm 卓师叔 (Zhuo Shishu). Thanks, everyone!