I've organized and partially annotated a PyTorch implementation of the Transformer architecture. The Transformer has been hugely successful in both natural language processing and computer vision (ViT, Swin Transformer), and it has gone a long way toward unifying how these two most common kinds of signals, language and images, are processed. Due to time constraints some of the commentary may be incomplete or inaccurate; feedback and corrections are very welcome.
Special notes:
1. The add & norm in this code is implemented differently from the original paper and the tensor2tensor library: through extensive experiments the author found that placing LayerNorm before the add works better. The code therefore uses the norm + add form, but in the comments I still use the paper's "add & norm" terminology, so please keep that in mind (a schematic comparison of the two variants follows these notes).
2. Compared with the original paper the author made a number of changes, adding Dropout in several places. Dropout is a technique for mitigating overfitting and is mostly applied after fully connected (Linear) layers.
3. The author uses Xavier weight initialization.
4. If you want to see the actual outputs you need to download the original repository; this post only provides a walkthrough of the overall Transformer architecture.
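As a quick reference, the two residual-branch variants compare as follows (schematic pseudocode, not the exact code below):
Post-LN, original paper: out = LayerNorm(x + Dropout(Sublayer(x)))
Pre-LN, this implementation: out = x + Dropout(Sublayer(LayerNorm(x)))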
Paper:https://arxiv.org/pdf/1706.03762.pdf
Suggested theory: https://jalammar.github.io/illustrated-transformer/ (amazing blog!)
Code:https://github.com/gordicaleksa/pytorch-original-transformer
import math
import copy
import torch
import torch.nn as nn
from utils.constants import *
class Transformer(nn.Module):
def __init__(self, model_dimension, src_vocab_size, trg_vocab_size, number_of_heads, number_of_layers, dropout_probability, log_attention_weights=False):
super(Transformer,self).__init__()
        #------------------------------------------------#
        # First embed the input/output token ids into dense vectors.
        # A token id is simply the token's index in the source/target vocabulary.
        #------------------------------------------------#
self.src_embedding = Embedding(src_vocab_size, model_dimension)
self.trg_embedding = Embedding(trg_vocab_size, model_dimension)
        #-------------------------------------------------#
        # Add positional information to the source/target token embeddings.
        # Without it the model would lose word-order information,
        # which is essential for representing human language.
        #-------------------------------------------------#
self.src_pos_embedding = PositionalEncoding(model_dimension, dropout_probability)
self.trg_pos_embedding = PositionalEncoding(model_dimension, dropout_probability)
        #------------------------------------------------#
        # Define the multi-head attention module and the feed-forward network,
        # use them to build encoder_layer and decoder_layer,
        # and then stack those into the Encoder and Decoder.
        #------------------------------------------------#
mha = MultiHeadedAttention(model_dimension, number_of_heads, dropout_probability, log_attention_weights)
pwn = PositionwiseFeedForwardNet(model_dimension, dropout_probability)
encoder_layer = EncoderLayer(model_dimension, dropout_probability, mha, pwn)
decoder_layer = DecoderLayer(model_dimension, dropout_probability, mha, pwn)
self.encoder = Encoder(encoder_layer, number_of_layers)
self.decoder = Decoder(decoder_layer, number_of_layers)
#-----------------------------------------------------------------------------------------------------#
# Converts final target token representations into log probabilities vectors of the target vocab size
#-----------------------------------------------------------------------------------------------------#
self.decoder_generator = DecoderGenerator(model_dimension, trg_vocab_size)
self.init_params()
    #------------------------------------------------------------------#
    # Encoder-side forward pass:
    # 1. Input Embedding: look up the embedding vectors for the src token ids
    # 2. Positional Encoding: add the positional embeddings
    # 3. Forward pass through the Encoder stack
    #------------------------------------------------------------------#
def encode(self, src_token_ids_batch, src_mask):
src_embeddings_batch = self.src_embedding(src_token_ids_batch)
src_embeddings_batch = self.src_pos_embedding(src_embeddings_batch)
src_representations_batch = self.encoder(src_embeddings_batch, src_mask)
return src_representations_batch
    #---------------------------------------------------------------------------------------------------------------#
    # Note: any method defined on the class (forward or otherwise) can be called directly through self.
    # Decoder-side forward pass:
    # 1. Output Embedding: look up the embedding vectors for the trg token ids
    # 2. Positional Encoding: add the positional embeddings
    #    Shape = (B, T, D), where B - batch size, T - longest target token-sequence length and D - model dimension
    # 3. Forward pass through the Decoder stack
    #---------------------------------------------------------------------------------------------------------------#
def decode(self, trg_token_ids_batch, src_representations_batch, trg_mask, src_mask):
trg_embeddings_batch = self.trg_embedding(trg_token_ids_batch)
trg_embeddings_batch = self.trg_pos_embedding(trg_embeddings_batch)
trg_representations_batch = self.decoder(trg_embeddings_batch, src_representations_batch, trg_mask, src_mask)
        #-------------------------------------------------------------------------------------#
        # After this step the shape is (B, T, V), where V - target vocab size.
        # The decoder generator is just a Linear projection followed by LogSoftmax.
        #-------------------------------------------------------------------------------------#
trg_log_probs = self.decoder_generator(trg_representations_batch)
        #--------------------------------------------------------------------------#
        # Reshape the token representations into (B*T, V), which is the form that
        # nn.KLDivLoss expects. Log probabilities are used because PyTorch's
        # nn.KLDivLoss requires log probabilities as input.
        # Passing -1 to reshape keeps the last dimension (V) fixed and lets PyTorch
        # infer the size of the flattened B*T dimension automatically.
        #--------------------------------------------------------------------------#
trg_log_probs = trg_log_probs.reshape(-1, trg_log_probs.shape[-1])
return trg_log_probs
    #--------------------------------------#
    # Parameter initialization (not mentioned in the paper):
    # Xavier uniform initialization is used here.
    #--------------------------------------#
def init_params(self, default_initialization=False):
if not default_initialization:
for name, p in self.named_parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
    #----------------------------------------#
    # The overall Transformer forward pass:
    # Encoder + Decoder, with the Encoder applied
    # first and its outputs fed into the Decoder.
    #----------------------------------------#
def forward(self, src_token_ids_batch, trg_token_ids_batch, src_mask, trg_mask):
src_representations_batch = self.encode(src_token_ids_batch, src_mask)
trg_log_probs = self.decode(trg_token_ids_batch, src_representations_batch, trg_mask, src_mask)
return trg_log_probs
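#---------------------------------------------------------------------------------------------#
# Illustration (my addition, not part of the original repo): a minimal sketch of why decode()
# flattens its output to (B*T, V). PyTorch's nn.KLDivLoss takes log probabilities as input and
# a probability distribution as target, one row per token. The shapes below are made up for the
# example; the helper is only defined here and never called by the model.
#---------------------------------------------------------------------------------------------#
def _kl_div_loss_shape_sketch(batch_size=2, trg_seq_len=3, trg_vocab_size=7):
    # Fake decoder output: log probabilities already flattened to (B*T, V), like trg_log_probs.
    log_probs = torch.log_softmax(torch.randn(batch_size * trg_seq_len, trg_vocab_size), dim=-1)
    # Fake target distributions (e.g. one-hot or label-smoothed), also of shape (B*T, V).
    target_distributions = torch.softmax(torch.randn(batch_size * trg_seq_len, trg_vocab_size), dim=-1)
    loss = nn.KLDivLoss(reduction='batchmean')(log_probs, target_distributions)
    return loss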
#----------------------------------------------#
# EncoderLayer definition
# The Encoder stacks 6 of these layers (as specified in the paper)
#----------------------------------------------#
class EncoderLayer(nn.Module):
def __init__(self, model_dimension, dropout_probability, multi_headed_attention, pointwise_net):
super(EncoderLayer,self).__init__()
num_of_sublayers_encoder = 2
        #-------------------------------------------------#
        # Deep-copy the sublayer wrapper so each sublayer has its own weights,
        # then store the multi-head attention module, the FFN (pointwise net)
        # and the model dimension.
        #-------------------------------------------------#
self.sublayers = get_clones(SublayerLogic(model_dimension, dropout_probability), num_of_sublayers_encoder)
self.multi_headed_attention = multi_headed_attention
self.pointwise_net = pointwise_net
self.model_dimension = model_dimension
def forward(self, src_representations_batch, src_mask):
#-----------------------------------------------------------------------------------------------#
# Define anonymous (lambda) function which only takes src_representations_batch (srb) as input,
# this way we have a uniform interface for the sublayer logic.
#-----------------------------------------------------------------------------------------------#
encoder_self_attention = lambda srb: self.multi_headed_attention(query=srb, key=srb, value=srb, mask=src_mask)
        #---------------------------------------------------------------------#
        # First run self-attention followed by the first sublayer (add & norm),
        # then the FFN (pointwise_net) followed by the second sublayer (add & norm).
        # That completes a single EncoderLayer.
        #---------------------------------------------------------------------#
src_representations_batch = self.sublayers[0](src_representations_batch, encoder_self_attention)
src_representations_batch = self.sublayers[1](src_representations_batch, self.pointwise_net)
return src_representations_batch
#----------------------------------#
# Encoder definition
#----------------------------------#
class Encoder(nn.Module):
def __init__(self, encoder_layer, number_of_layers):
super(Encoder,self).__init__()
assert isinstance(encoder_layer, EncoderLayer), f'Expected EncoderLayer got {type(encoder_layer)}.'
        #-------------------------------------#
        # Clone the encoder layer number_of_layers times
        # and define a final LayerNorm.
        #-------------------------------------#
self.encoder_layers = get_clones(encoder_layer, number_of_layers)
self.norm = nn.LayerNorm(encoder_layer.model_dimension)
def forward(self, src_embeddings_batch, src_mask):
        #--------------------------------------------------------------------------------------------------------------#
        # Just rename the variable to reflect what it will become: the first encoder layer (index 0) receives the
        # position-aware embedding vectors as input, while later layers receive increasingly richer token representations.
        #--------------------------------------------------------------------------------------------------------------#
src_representations_batch = src_embeddings_batch
for encoder_layer in self.encoder_layers:
# src_mask's role is to mask/ignore padded token representations in the multi-headed self-attention module
src_representations_batch = encoder_layer(src_representations_batch, src_mask)
        #-------------------------------------------------------------------------------------------------------------#
        # Not mentioned explicitly in the paper (a consequence of using LayerNorm before instead of after the sublayer,
        # check out the SublayerLogic module).
        # This final norm is an addition by the repo author and was found to work better.
        #-------------------------------------------------------------------------------------------------------------#
return self.norm(src_representations_batch)
#----------------------------------------#
# DecoderLayer definition
# The Decoder in the original paper stacks 6 of these layers
#----------------------------------------#
class DecoderLayer(nn.Module):
def __init__(self, model_dimension, dropout_probability, multi_headed_attention, pointwise_net):
super(DecoderLayer,self).__init__()
num_of_sublayers_decoder = 3
        #------------------------------------------#
        # Define the sublayer wrappers used for add & norm,
        # the pointwise_net (FFN), the model dimension,
        # and two separate copies of multi-head attention:
        # one for masked self-attention and one for source (cross) attention.
        #------------------------------------------#
self.sublayers = get_clones(SublayerLogic(model_dimension, dropout_probability), num_of_sublayers_decoder)
self.pointwise_net = pointwise_net
self.model_dimension = model_dimension
self.trg_multi_headed_attention = copy.deepcopy(multi_headed_attention)
self.src_multi_headed_attention = copy.deepcopy(multi_headed_attention)
def forward(self, trg_representations_batch, src_representations_batch, trg_mask, src_mask):
        #--------------------------------------------------------------------------------------------------------------#
        # Define anonymous (lambda) functions which only take trg_representations_batch
        # as input - this way we have a uniform interface for the sublayer logic.
        # The inputs which are not passed into the lambdas are "cached" here, which is why this works.
        # decoder_trg_self_attention is the masked (causal) self-attention over the target tokens.
        # decoder_src_attention is cross-attention over the encoder outputs; it only uses the source padding mask.
        # Order inside a DecoderLayer: masked MHA + add & norm -> cross MHA + add & norm -> FFN + add & norm.
        #--------------------------------------------------------------------------------------------------------------#
srb = src_representations_batch
decoder_trg_self_attention = lambda trb: self.trg_multi_headed_attention(query=trb, key=trb, value=trb, mask=trg_mask)
decoder_src_attention = lambda trb: self.src_multi_headed_attention(query=trb, key=srb, value=srb, mask=src_mask)
trg_representations_batch = self.sublayers[0](trg_representations_batch, decoder_trg_self_attention)
trg_representations_batch = self.sublayers[1](trg_representations_batch, decoder_src_attention)
trg_representations_batch = self.sublayers[2](trg_representations_batch, self.pointwise_net)
return trg_representations_batch
#----------------------------------#
# Decoder definition
#----------------------------------#
class Decoder(nn.Module):
def __init__(self, decoder_layer, number_of_layers):
super(Decoder,self).__init__()
assert isinstance(decoder_layer, DecoderLayer), f'Expected DecoderLayer got {type(decoder_layer)}.'
        #--------------------------------------#
        # Same overall structure as the Encoder: clone the decoder
        # layer number_of_layers times and define a final LayerNorm.
        #--------------------------------------#
self.decoder_layers = get_clones(decoder_layer, number_of_layers)
self.norm = nn.LayerNorm(decoder_layer.model_dimension)
def forward(self, trg_embeddings_batch, src_representations_batch, trg_mask, src_mask):
        #-------------------------------------------------------------------------------------#
        # Just rename the variable to reflect the semantics of what it will become
        # (embeddings go in, progressively richer target representations come out).
        #-------------------------------------------------------------------------------------#
trg_representations_batch = trg_embeddings_batch
for decoder_layer in self.decoder_layers:
            #-------------------------------------------------------------------------------------------------#
            # The target mask masks pad tokens as well as future tokens (the current target token can't look
            # ahead), so each position can only attend to the tokens that precede it.
            #-------------------------------------------------------------------------------------------------#
trg_representations_batch = decoder_layer(trg_representations_batch, src_representations_batch, trg_mask, src_mask)
        #-------------------------------------------------------------------------------------------------------------#
        # Not mentioned explicitly in the paper (a consequence of using LayerNorm before instead of after the sublayer,
        # check out the SublayerLogic module).
        # This final norm is an addition by the repo author and was found to work better.
        #-------------------------------------------------------------------------------------------------------------#
return self.norm(trg_representations_batch)
#---------------------------------------------------#
# SublayerLogic: the residual + normalization wrapper
# around MHA and FFN. The original paper applies
# add & norm after each sublayer; based on his experiments
# the repo author moves LayerNorm in front of the sublayer
# (norm & add), which was found to work better.
#---------------------------------------------------#
class SublayerLogic(nn.Module):
def __init__(self, model_dimension, dropout_probability):
super(SublayerLogic,self).__init__()
self.norm = nn.LayerNorm(model_dimension)
self.dropout = nn.Dropout(p=dropout_probability)
def forward(self, representations_batch, sublayer_module):
        #---------------------------------------------------#
        # Order of operations: LayerNorm the incoming token
        # representations, run them through the sublayer (MHA or FFN),
        # apply Dropout (as in the paper, before the residual add),
        # and finally add back the original input via the residual connection.
        #---------------------------------------------------#
return representations_batch + self.dropout(sublayer_module(self.norm(representations_batch)))
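#--------------------------------------------------------------------------------------------#
# For contrast (my addition, a sketch only): what the paper's original post-LN "add & norm"
# wrapper would look like. The repository uses the pre-LN SublayerLogic above instead; this
# variant is defined here purely for comparison and is not used anywhere in the model.
#--------------------------------------------------------------------------------------------#
class _PostNormSublayerSketch(nn.Module):
    def __init__(self, model_dimension, dropout_probability):
        super(_PostNormSublayerSketch, self).__init__()
        self.norm = nn.LayerNorm(model_dimension)
        self.dropout = nn.Dropout(p=dropout_probability)
    def forward(self, representations_batch, sublayer_module):
        # Paper ordering: sublayer -> Dropout -> residual add -> LayerNorm
        return self.norm(representations_batch + self.dropout(sublayer_module(representations_batch)))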
#----------------------------------#
# DecoderGenerator definition:
# a Linear projection followed by LogSoftmax
# Linear -> LogSoftmax
#----------------------------------#
class DecoderGenerator(nn.Module):
def __init__(self, model_dimension, vocab_size):
super(DecoderGenerator,self).__init__()
self.linear = nn.Linear(model_dimension, vocab_size)
#--------------------------------------------------------------------------------------------------------------#
# -1 stands for apply the log-softmax along the last dimension i.e. over the vocab dimension as the output from
# the linear layer has shape (B, T, V), B - batch size, T - max target token-sequence, V - target vocab size
# again using log softmax as PyTorch's nn.KLDivLoss expects log probabilities (just a technical detail)
#--------------------------------------------------------------------------------------------------------------#
self.log_softmax = nn.LogSoftmax(dim=-1)
def forward(self, trg_representations_batch):
# Project from D (model dimension) into V (target vocab size) and apply the log softmax along V dimension
return self.log_softmax(self.linear(trg_representations_batch))
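#-----------------------------------------------------------------------------------------#
# Illustration (my addition, not in the original repo): the generator maps (B, T, D) to
# (B, T, V) log probabilities, so exp of the output sums to 1 over the vocab dimension.
# The shapes below are made up for the example; the helper is defined here but never called.
#-----------------------------------------------------------------------------------------#
def _decoder_generator_sketch(batch_size=2, trg_seq_len=3, model_dimension=8, trg_vocab_size=11):
    generator = DecoderGenerator(model_dimension, trg_vocab_size)
    trg_representations = torch.randn(batch_size, trg_seq_len, model_dimension)
    trg_log_probs = generator(trg_representations)    # shape (B, T, V)
    probs_sum = trg_log_probs.exp().sum(dim=-1)       # approximately 1.0 everywhere
    return trg_log_probs.shape, probs_sum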
#----------------------------------#
# Feed Forward Network (FFN) definition
#----------------------------------#
class PositionwiseFeedForwardNet(nn.Module):
"""
It's position-wise because this feed forward net will be independently applied to every token's representation.
Representations batch is of the shape (batch size, max token sequence length, model dimension).
This net will basically be applied independently to every token's representation (you can think of it as if
there was a nested for-loop going over the batch size and max token sequence length dimensions
and applied this net to token representations. PyTorch does this auto-magically behind the scenes.
"""
def __init__(self, model_dimension, dropout_probability, width_mult=4):
super(PositionwiseFeedForwardNet,self).__init__()
self.linear1 = nn.Linear(model_dimension, width_mult * model_dimension)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(p=dropout_probability)
self.linear2 = nn.Linear(width_mult * model_dimension, model_dimension)
    #-------------------------------------------#
    # A position-wise Linear layer can be viewed as a conv1x1:
    # it only changes the number of feature channels, and it is
    # followed by Dropout to mitigate overfitting.
    #-------------------------------------------#
def forward(self, representations_batch):
return self.linear2(self.dropout(self.relu(self.linear1(representations_batch))))
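#------------------------------------------------------------------------------------------------#
# Illustration (my addition): the comment above views a position-wise Linear as a 1x1 convolution.
# A quick sketch of that equivalence - with shared weights, nn.Linear over (B, L, C) and
# nn.Conv1d(kernel_size=1) over (B, C, L) produce the same result (up to float tolerance).
# Defined here only as a demonstration; it is never called by the model.
#------------------------------------------------------------------------------------------------#
def _linear_vs_conv1x1_sketch(batch_size=2, seq_len=5, model_dimension=8):
    linear = nn.Linear(model_dimension, model_dimension)
    conv1x1 = nn.Conv1d(model_dimension, model_dimension, kernel_size=1)
    # Copy the Linear weights into the conv so both compute the same function.
    with torch.no_grad():
        conv1x1.weight.copy_(linear.weight.unsqueeze(-1))    # (D_out, D_in) -> (D_out, D_in, 1)
        conv1x1.bias.copy_(linear.bias)
    x = torch.randn(batch_size, seq_len, model_dimension)
    out_linear = linear(x)                                   # (B, L, D)
    out_conv = conv1x1(x.transpose(1, 2)).transpose(1, 2)    # (B, D, L) -> conv -> back to (B, L, D)
    return torch.allclose(out_linear, out_conv, atol=1e-5)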
#---------------------------------------------#
# MultiHeadedAttention module of the Transformer
#---------------------------------------------#
class MultiHeadedAttention(nn.Module):
"""
This module already exists in PyTorch. The reason I implemented it here from scratch is that
PyTorch implementation is super complicated as they made it as generic/robust as possible whereas
on the other hand I only want to support a limited use-case.
    Also this is arguably the most important architectural component in the Transformer model.
Additional note:
This is conceptually super easy stuff. It's just that matrix implementation makes things a bit less intuitive.
If you take your time and go through the code and figure out all of the dimensions + write stuff down on paper
you'll understand everything.
Optimization notes:
qkv_nets could be replaced by Parameter(torch.empty(3 * model_dimension, model_dimension)) and one more matrix
for bias, which would make the implementation a bit more optimized. For the sake of easier understanding though,
I'm doing it like this - using 3 "feed forward nets" (without activation/identity hence the quotation marks).
Conceptually both implementations are the same.
PyTorch's query/key/value are of different shape namely (max token sequence length, batch size, model dimension)
whereas I'm using (batch size, max token sequence length, model dimension) because it's easier to understand
and consistent with computer vision apps (batch dimension is always first followed by the number of channels (C)
    and image's spatial dimensions height (H) and width (W) -> (B, C, H, W)).
This has an important optimization implication, they can reshape their matrix into (B*NH, S/T, HD)
(where B - batch size, S/T - max src/trg sequence length, NH - number of heads, HD - head dimension)
in a single step and I can only get to (B, NH, S/T, HD) in single step
(I could call contiguous() followed by view but that's expensive as it would incur additional matrix copy)
"""
def __init__(self, model_dimension, number_of_heads, dropout_probability, log_attention_weights):
super(MultiHeadedAttention,self).__init__()
        #--------------------------------------------------------#
        # head_dimension = model_dimension / number_of_heads
        #--------------------------------------------------------#
assert model_dimension % number_of_heads == 0, f'Model dimension must be divisible by the number of heads.'
self.head_dimension = int(model_dimension / number_of_heads)
self.number_of_heads = number_of_heads
self.qkv_nets = get_clones(nn.Linear(model_dimension, model_dimension), 3)
self.out_projection_net = nn.Linear(model_dimension, model_dimension)
self.attention_dropout = nn.Dropout(p=dropout_probability) # no pun intended, not explicitly mentioned in paper
self.softmax = nn.Softmax(dim=-1) # -1 stands for apply the softmax along the last dimension
self.log_attention_weights = log_attention_weights # log attention weights
self.attention_weights = None # for visualization purposes, I cache the weights here
    #------------------------------------------------------------------#
    # Scaled dot-product attention; the mask hides padded tokens and,
    # in the decoder's self-attention, future tokens as well.
    #------------------------------------------------------------------#
def attention(self, query, key, value, mask):
# Step 1: Scaled dot-product attention, Page 4, Chapter 3.2.1 "Scaled Dot-Product Attention"
# Notation: B - batch size, S/T max src/trg token-sequence length, NH - number of heads, HD - head dimension
# query/key/value shape = (B, NH, S/T, HD), scores shape = (B, NH, S, S), (B, NH, T, T) or (B, NH, T, S)
# scores have different shapes as MHA is used in 3 contexts, self attention for src/trg and source attending MHA
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dimension)
        #---------------------------------------------------------------------------------------------------------#
        # Step 2: Optionally mask tokens whose representations we want to ignore by setting a big negative number
        # at the locations corresponding to those tokens (forcing softmax to output 0 probability there).
        # mask shape = (B, 1, 1, S) or (B, 1, T, T), broadcast (copied) as needed to match the scores shape.
        # In the decoder's self-attention we must not use information from the current token's future tokens,
        # so the mask hides them.
        #---------------------------------------------------------------------------------------------------------#
        #-----------------------------------#
        # Apply the mask if one was provided
        #-----------------------------------#
if mask is not None:
scores.masked_fill_(mask == torch.tensor(False), float("-inf"))
        #----------------------------------------------------------------------------------------------------------#
        # Step 3: Calculate the attention weights - how much we should attend to surrounding token representations,
        # i.e. softmax over the scores tells us how important every other token is for the current token.
        #----------------------------------------------------------------------------------------------------------#
attention_weights = self.softmax(scores)
        # Step 4: Not defined in the original paper - apply dropout to the attention weights as well
attention_weights = self.attention_dropout(attention_weights)
        #--------------------------------------------------------------------------------------------------------------#
        # Step 5: Based on the attention weights, compute the new (intermediate) token representations.
        # attention_weights shape = (B, NH, S, S)/(B, NH, T, T) or (B, NH, T, S), value shape = (B, NH, S/T, HD)
        # Final shape is (B, NH, S, HD) for source MHAs or (B, NH, T, HD) for target MHAs (MHA is used in 3 contexts)
        #--------------------------------------------------------------------------------------------------------------#
intermediate_token_representations = torch.matmul(attention_weights, value)
return intermediate_token_representations, attention_weights # attention weights for visualization purposes
def forward(self, query, key, value, mask):
batch_size = query.shape[0]
# Step 1: Input linear projection
# Notation: B - batch size, NH - number of heads, S/T - max src/trg token-sequence length, HD - head dimension
# Shape goes from (B, S/T, NH*HD) over (B, S/T, NH, HD) to (B, NH, S/T, HD) (NH*HD=D where D is model dimension)
query, key, value = [net(x).view(batch_size, -1, self.number_of_heads, self.head_dimension).transpose(1, 2)
for net, x in zip(self.qkv_nets, (query, key, value))]
# Step 2: Apply attention - compare query with key and use that to combine values (see the function for details)
intermediate_token_representations, attention_weights = self.attention(query, key, value, mask)
# Potentially, for visualization purposes, log the attention weights, turn off during training though!
if self.log_attention_weights:
self.attention_weights = attention_weights
# Step 3: Reshape from (B, NH, S/T, HD) over (B, S/T, NH, HD) (via transpose) into (B, S/T, NHxHD) which is
# the same shape as in the beginning of this forward function i.e. input to MHA (multi-head attention) module
reshaped = intermediate_token_representations.transpose(1, 2).reshape(batch_size, -1, self.number_of_heads * self.head_dimension)
# Step 4: Output linear projection
token_representations = self.out_projection_net(reshaped)
return token_representations
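#--------------------------------------------------------------------------------------------#
# Illustration (my addition): how Steps 2/3 of attention() behave on a tiny made-up example -
# positions where the boolean mask is False get a score of -inf, so softmax assigns them
# exactly zero attention weight. Defined only as a demonstration, never called by the model.
#--------------------------------------------------------------------------------------------#
def _masking_sketch():
    scores = torch.tensor([[1.0, 2.0, 3.0]])
    mask = torch.tensor([[True, True, False]])    # hide the last position (e.g. a pad token)
    scores.masked_fill_(mask == torch.tensor(False), float("-inf"))
    attention_weights = torch.softmax(scores, dim=-1)
    return attention_weights                      # last entry is exactly 0.0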
#---------------------------------------------------#
# Input/Output Embedding
# Source and target token ids first go through the Input/Output Embedding
#---------------------------------------------------#
class Embedding(nn.Module):
def __init__(self, vocab_size, model_dimension):
super(Embedding,self).__init__()
self.embeddings_table = nn.Embedding(vocab_size, model_dimension)
self.model_dimension = model_dimension
def forward(self, token_ids_batch):
        #--------------------------------------------------------------------------#
        # token_ids_batch is 2-dimensional; a token is a unit produced by tokenization,
        # and the shape is (B, S/T), where B = batch size and S/T = max src/trg
        # token-sequence length.
        # After the embedding lookup the tensor shape is (B, S/T, D), with D = model_dimension,
        # so every token id maps to its own unique embedding vector.
        #--------------------------------------------------------------------------#
assert token_ids_batch.ndim == 2, f'Expected: (batch size, max token sequence length), got {token_ids_batch.shape}'
embeddings = self.embeddings_table(token_ids_batch)
        #-----------------------------------------------#
        # As described in the paper (Page 5, Chapter 3.4),
        # the embeddings are multiplied by sqrt(model_dimension).
        #-----------------------------------------------#
return embeddings * math.sqrt(self.model_dimension)
#--------------------------------------------------#
# After the Input/Output Embedding we add positional
# encodings. PositionalEncoding lets the model make
# use of the order of the tokens in the sequence.
#--------------------------------------------------#
class PositionalEncoding(nn.Module):
def __init__(self, model_dimension, dropout_probability, expected_max_sequence_length=5000):
super(PositionalEncoding,self).__init__()
self.dropout = nn.Dropout(p=dropout_probability)
        #-----------------------------------------------------------#
        # Compute the position indices and the sinusoid frequencies
        # used to fill the positional-encoding table.
        #-----------------------------------------------------------#
position_id = torch.arange(0, expected_max_sequence_length).unsqueeze(1)
frequencies = torch.pow(10000., -torch.arange(0, model_dimension, 2, dtype=torch.float) / model_dimension)
positional_encodings_table = torch.zeros(expected_max_sequence_length, model_dimension)
        #------------------------------------------------------------#
        # sin is used for the even embedding dimensions (columns) and
        # cos for the odd ones; see the paper for the reasoning.
        # This sinusoidal scheme is not the only possible way to
        # encode positions.
        #------------------------------------------------------------#
positional_encodings_table[:, 0::2] = torch.sin(position_id * frequencies)
positional_encodings_table[:, 1::2] = torch.cos(position_id * frequencies)
        #---------------------------------------------------------------------------#
        # Register positional_encodings_table as a buffer: it is not a trainable
        # parameter (it is never updated during training), but registering it makes
        # it part of the state_dict; without the registration it would not be saved
        # in the state_dict.
        #---------------------------------------------------------------------------#
self.register_buffer('positional_encodings_table', positional_encodings_table)
def forward(self, embeddings_batch):
        #-------------------------------------------------------------------#
        # embeddings_batch shape = (B, S/T, D),
        # where S/T - max src/trg token-sequence length, D - model dimension
        #-------------------------------------------------------------------#
assert embeddings_batch.ndim == 3 and embeddings_batch.shape[-1] == self.positional_encodings_table.shape[1], \
f'Expected (batch size, max token sequence length, model dimension) got {embeddings_batch.shape}'
# We get (S/T, D) shape which will get broad-casted to (B, S/T, D) when we try and add it to embeddings
positional_encodings = self.positional_encodings_table[:embeddings_batch.shape[1]]
        #---------------------------------------------------------------------------#
        # Add the positional encodings to the embeddings and apply Dropout to the sum
        # to reduce the risk of overfitting.
        # Page 7, Chapter 5.4 "Regularization"
        #---------------------------------------------------------------------------#
return self.dropout(embeddings_batch + positional_encodings)
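#---------------------------------------------------------------------------------------------#
# Illustration (my addition): building the sinusoidal table by hand for a tiny made-up case
# (4 positions, model dimension 6) to show that sin fills the even columns and cos the odd
# ones. Defined only as a demonstration; the model uses the PositionalEncoding class above.
#---------------------------------------------------------------------------------------------#
def _positional_encoding_table_sketch(max_len=4, model_dimension=6):
    position_id = torch.arange(0, max_len).unsqueeze(1)
    frequencies = torch.pow(10000., -torch.arange(0, model_dimension, 2, dtype=torch.float) / model_dimension)
    table = torch.zeros(max_len, model_dimension)
    table[:, 0::2] = torch.sin(position_id * frequencies)    # even columns
    table[:, 1::2] = torch.cos(position_id * frequencies)    # odd columns
    return table                                             # shape (max_len, model_dimension)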
#------------------------------------#
# get_clones: create deep copies of a module
# so that each copy has its own independent weights
#------------------------------------#
def get_clones(module, num_of_deep_copies):
return nn.ModuleList([copy.deepcopy(module) for _ in range(num_of_deep_copies)])
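#--------------------------------------------------------------------------------------------#
# Illustration (my addition): deep copies mean independent weights - changing one clone does
# not affect the other. Defined only as a demonstration; never called by the model.
#--------------------------------------------------------------------------------------------#
def _get_clones_sketch():
    clones = get_clones(nn.Linear(4, 4), 2)
    with torch.no_grad():
        clones[0].weight.zero_()    # zero out the first clone only
    # The second clone keeps its original (non-zero) initialization.
    return torch.equal(clones[0].weight, clones[1].weight)    # almost surely False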
#-----------------------------------------------------#
# Inspect the parameter names and shapes stored in the model
#-----------------------------------------------------#
def analyze_state_dict_shapes_and_names(model):
print(model.state_dict().keys())
for name, param in model.named_parameters():
print(name, param.shape)
if not param.requires_grad:
raise Exception('Expected all of the params to be trainable - no param freezing used.')
#-------------------------#
# Count the model's trainable parameters
#-------------------------#
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
#-------------------------------------------#
# Quick test of the assembled Transformer
#-------------------------------------------#
if __name__ == "__main__":
use_big_transformer = False
    #----------------------------#
    # Some random data for testing
    #----------------------------#
src_vocab_size = 11
trg_vocab_size = 11
src_token_ids_batch = torch.randint(1, 10, size=(3, 2))
trg_token_ids_batch = torch.randint(1, 10, size=(3, 2))
transformer = Transformer(
model_dimension = BIG_MODEL_DIMENSION if use_big_transformer else BASELINE_MODEL_DIMENSION,
src_vocab_size = src_vocab_size,
trg_vocab_size = trg_vocab_size,
number_of_heads = BIG_MODEL_NUMBER_OF_HEADS if use_big_transformer else BASELINE_MODEL_NUMBER_OF_HEADS,
number_of_layers = BIG_MODEL_NUMBER_OF_LAYERS if use_big_transformer else BASELINE_MODEL_NUMBER_OF_LAYERS,
dropout_probability = BIG_MODEL_DROPOUT_PROB if use_big_transformer else BASELINE_MODEL_DROPOUT_PROB)
analyze_state_dict_shapes_and_names(transformer)
print(f'Size of the {"big" if use_big_transformer else "baseline"} transformer = {count_parameters(transformer)}')
out = transformer(src_token_ids_batch, trg_token_ids_batch, src_mask=None, trg_mask=None)
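    #------------------------------------------------------------------------------------------#
    # My addition (a sketch, not from the original repo): build simple masks and run the model
    # with them. The source mask hides pad tokens (here there are none, since all ids are in
    # [1, 10)); the target mask additionally hides future positions with a lower-triangular
    # matrix. Mask shapes follow the comments in MultiHeadedAttention: (B, 1, 1, S) and (B, 1, T, T).
    #------------------------------------------------------------------------------------------#
    PAD_TOKEN_ID = 0  # assumption made for this sketch; the repo defines its own pad id in utils
    batch_size, src_seq_len = src_token_ids_batch.shape
    trg_seq_len = trg_token_ids_batch.shape[1]
    src_mask = (src_token_ids_batch != PAD_TOKEN_ID).view(batch_size, 1, 1, src_seq_len)
    causal_mask = torch.tril(torch.ones(1, 1, trg_seq_len, trg_seq_len)).bool()
    trg_padding_mask = (trg_token_ids_batch != PAD_TOKEN_ID).view(batch_size, 1, 1, trg_seq_len)
    trg_mask = trg_padding_mask & causal_mask                    # broadcasts to (B, 1, T, T)
    out_with_masks = transformer(src_token_ids_batch, trg_token_ids_batch, src_mask, trg_mask)
    print(f'Output log-probs shape with masks: {out_with_masks.shape}')  # expected (B*T, trg_vocab_size)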