Transformer架构完整代码
import math
import torch
import collections
import numpy as np
import torch.nn as nn
from copy import deepcopy
import torch.nn.functional as F
from torch.autograd import Variable
Hypothesis = collections.namedtuple('Hypothesis', ['value', 'score'])
def clone_module_to_modulelist(module, module_num):
"""
克隆n个Module类放入ModuleList中,并返回ModuleList,这个ModuleList中的每个Module都是一模一样的
nn.ModuleList,它是一个储存不同 module,并自动将每个 module 的 parameters 添加到网络之中的容器。
你可以把任意 nn.Module 的子类 (比如 nn.Conv2d, nn.Linear 之类的) 加到这个 list 里面,
加入到 nn.ModuleList 里面的 module 是会自动注册到整个网络上的,
同时 module 的 parameters 也会自动添加到整个网络中。
:param module: 被克隆的module
:param module_num: 被克隆的module数
:return: 装有module_num个相同module的ModuleList
"""
return nn.ModuleList([deepcopy(module) for _ in range(module_num)])
class LayerNorm(nn.Module):
"""
构建一个LayerNorm Module
LayerNorm的作用:对x归一化,使x的均值为0,方差为1
LayerNorm计算公式:x-mean(x)/\sqrt{var(x)+\epsilon} = x-mean(x)/std(x)+\epsilon
"""
def __init__(self, x_size, eps=1e-6):
"""
:param x_size: 特征的维度
:param eps: eps是一个平滑的过程,取值通常在(10^-4~10^-8 之间)
其含义是,对于每个参数,随着其更新的总距离增多,其学习速率也随之变慢。
防止出现除以0的情况。
nn.Parameter将一个不可训练的类型Tensor转换成可以训练的类型parameter,
并将这个parameter绑定到这个module里面。
使用这个函数的目的也是想让某些变量在学习的过程中不断的修改其值以达到最优化。
"""
super(LayerNorm, self).__init__()
self.ones_tensor = nn.Parameter(torch.ones(x_size))
self.zeros_tensor = nn.Parameter(torch.zeros(x_size))
self.eps = eps
def forward(self, x):
mean = x.mean(-1, keepdim=True)
std = x.std(-1, keepdim=True)
return self.ones_tensor * (x - mean) / (std + self.eps) + self.zeros_tensor
class FeatEmbedding(nn.Module):
"""
视频特征向量生成器
"""
def __init__(self, d_feat, d_model, dropout):
"""
FeatEmbedding的初始化
:param d_feat: per frame dimension(每帧的维度),作为Linear层输入的维度
:param d_model: 作为Linear层输出的维度
:param dropout: Dropout层的比率
nn.Sequential:这是一个有顺序的容器,将特定神经网络模块按照在传入构造器的顺序依次被添加到计算图中
在这里构造的容器是:LayerNorm --> Dropout --> Linear
"""
super(FeatEmbedding, self).__init__()
self.video_embeddings = nn.Sequential(
LayerNorm(d_feat),
nn.Dropout(p=dropout),
nn.Linear(d_feat, d_model)
)
def forward(self, x):
return self.video_embeddings(x)
class WordEmbedding(nn.Module):
"""
把向量构造成d_model维度的词向量,以便后续送入编码器
"""
def __init__(self, vocab_size, d_model):
"""
:param vocab_size: 字典长度
:param d_model: 词向量维度
"""
super(WordEmbedding, self).__init__()
self.d_model = d_model
self.embedding = nn.Embedding(vocab_size, d_model)
self.embed = self.embedding
def forward(self, x):
return self.embed(x) * math.sqrt(self.d_model)
class PositionalEncoding(nn.Module):
"""
正弦位置编码,即通过三角函数构建位置编码
Implementation based on "Attention Is All You Need"
:cite:`DBLP:journals/corr/VaswaniSPUJGKP17`
"""
def __init__(self, dim: int, dropout: float, max_len=5000):
"""
:param dim: 位置向量的向量维度,一般与词向量维度相同,即d_model
:param dropout: Dropout层的比率
:param max_len: 句子的最大长度
"""
if dim % 2 != 0:
raise ValueError(f"不能使用 sin/cos 位置编码,得到了奇数的维度{dim:d},应该使用偶数维度")
"""
构建位置编码pe
pe公式为:
PE(pos,2i/2i+1) = sin/cos(pos/10000^{2i/d_{model}})
"""
pe = torch.zeros(max_len, dim)
position = torch.arange(0, max_len).unsqueeze(1)
div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * torch.tensor(
-(math.log(10000.0) / dim))))
pe[:, 0::2] = torch.sin(position.float() * div_term)
pe[:, 1::2] = torch.cos(position.float() * div_term)
pe = pe.unsqueeze(1)
super(PositionalEncoding, self).__init__()
self.register_buffer('pe', pe)
self.drop_out = nn.Dropout(p=dropout)
self.dim = dim
def forward(self, emb, step=None):
"""
词向量和位置编码拼接并输出
:param emb: 词向量序列(FloatTensor),``(seq_len, batch_size, self.dim)``
:param step: 如果 stepwise("seq_len=1"),则用此位置的编码
:return: 词向量和位置编码的拼接
"""
emb = emb * math.sqrt(self.dim)
if step is None:
emb = emb + self.pe[:emb.size(0)]
else:
emb = emb + self.pe[step]
emb = self.drop_out(emb)
return emb
def self_attention(query, key, value, dropout=None, mask=None):
"""
自注意力计算
:param query: Q
:param key: K
:param value: V
:param dropout: drop比率
:param mask: 是否mask
:return: 经自注意力机制计算后的值
"""
d_k = query.size(-1)
scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
if mask is not None:
"""
scores.masked_fill默认是按照传入的mask中为1的元素所在的索引,
在scores中相同的的索引处替换为value,替换值为-1e9,即-(10^9)
"""
scores = scores.masked_fill(mask == 0, -1e9)
self_attn_softmax = F.softmax(scores, dim=-1)
if dropout is not None:
self_attn_softmax = dropout(self_attn_softmax)
return torch.matmul(self_attn_softmax, value), self_attn_softmax
class MultiHeadAttention(nn.Module):
"""
多头注意力计算
"""
def __init__(self, head, d_model, dropout=0.1):
"""
:param head: 头数
:param d_model: 词向量的维度,必须是head的整数倍
:param dropout: drop比率
"""
super(MultiHeadAttention, self).__init__()
assert (d_model % head == 0)
self.d_k = d_model // head
self.head = head
self.d_model = d_model
"""
由于多头注意力机制是针对多组Q、K、V,因此有了下面这四行代码,具体作用是,
针对未来每一次输入的Q、K、V,都给予参数进行构建
其中linear_out是针对多头汇总时给予的参数
"""
self.linear_query = nn.Linear(d_model, d_model)
self.linear_key = nn.Linear(d_model, d_model)
self.linear_value = nn.Linear(d_model, d_model)
self.linear_out = nn.Linear(d_model, d_model)
self.dropout = nn.Dropout(p=dropout)
self.attn_softmax = None
def forward(self, query, key, value, mask=None):
if mask is not None:
"""
多头注意力机制的线性变换层是4维,是把query[batch, frame_num, d_model]变成[batch, -1, head, d_k]
再1,2维交换变成[batch, head, -1, d_k], 所以mask要在第二维(head维)添加一维,与后面的self_attention计算维度一样
具体点将,就是:
因为mask的作用是未来传入self_attention这个函数的时候,作为masked_fill需要mask哪些信息的依据
针对多head的数据,Q、K、V的形状维度中,只有head是通过view计算出来的,是多余的,为了保证mask和
view变换之后的Q、K、V的形状一直,mask就得在head这个维度添加一个维度出来,进而做到对正确信息的mask
"""
mask = mask.unsqueeze(1)
n_batch = query.size(0)
"""
下列三行代码都在做类似的事情,对Q、K、V三个矩阵做处理
其中view函数是对Linear层的输出做一个形状的重构,其中-1是自适应(自主计算)
从这种重构中,可以看出,虽然增加了头数,但是数据的总维度是没有变化的,也就是说多头是对数据内部进行了一次拆分
transopose(1,2)是对前形状的两个维度(索引从0开始)做一个交换,例如(2,3,4,5)会变成(2,4,3,5)
因此通过transpose可以让view的第二维度参数变成n_head
假设Linear成的输出维度是:[10, 32, 512],其中10是batch_size的大小
注:这里解释了为什么d_model // head == d_k,如若不是,则view函数做形状重构的时候会出现异常
"""
query = self.linear_query(query).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
key = self.linear_key(key).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
value = self.linear_value(value).view(n_batch, -1, self.head, self.d_k).transpose(1, 2)
x, self.attn_softmax = self_attention(query, key, value, dropout=self.dropout, mask=mask)
"""
下面的代码是汇总各个头的信息,拼接后形成一个新的x
其中self.head * self.d_k,可以看出x的形状是按照head数拼接成了一个大矩阵,然后输入到linear_out层添加参数
contiguous()是重新开辟一块内存后存储x,然后才可以使用.view方法,否则直接使用.view方法会报错
"""
x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.head * self.d_k)
return self.linear_out(x)
class FeedForward(nn.Module):
"""
两层具有残差网络的前馈神经网络,FNN网络
"""
def __init__(self, d_model: int, d_ff: int, dropout=0.1):
"""
:param d_model: FFN第一层输入的维度
:param d_ff: FNN第二层隐藏层输入的维度
:param dropout: drop比率
"""
super(FeedForward, self).__init__()
self.w_1 = nn.Linear(d_model, d_ff)
self.w_2 = nn.Linear(d_ff, d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
self.dropout_1 = nn.Dropout(dropout)
self.relu = nn.ReLU()
self.dropout_2 = nn.Dropout(dropout)
def forward(self, x):
"""
:param x: 输入数据,形状为(batch_size, input_len, model_dim)
:return: 输出数据(FloatTensor),形状为(batch_size, input_len, model_dim)
"""
inter = self.dropout_1(self.relu(self.w_1(self.layer_norm(x))))
output = self.dropout_2(self.w_2(inter))
return output
class SublayerConnection(nn.Module):
"""
子层的连接: layer_norm(x + sublayer(x))
上述可以理解为一个残差网络加上一个LayerNorm归一化
"""
def __init__(self, size, dropout=0.1):
"""
:param size: d_model
:param dropout: drop比率
"""
super(SublayerConnection, self).__init__()
self.layer_norm = LayerNorm(size)
self.dropout = nn.Dropout(p=dropout)
def forward(self, x, sublayer):
return self.dropout(self.layer_norm(x + sublayer(x)))
class EncoderLayer(nn.Module):
"""
一层编码Encoder层
MultiHeadAttention -> Add & Norm -> Feed Forward -> Add & Norm
"""
def __init__(self, size, attn, feed_forward, dropout=0.1):
"""
:param size: d_model
:param attn: 已经初始化的Multi-Head Attention层
:param feed_forward: 已经初始化的Feed Forward层
:param dropout: drop比率
"""
super(EncoderLayer, self).__init__()
self.attn = attn
self.feed_forward = feed_forward
"""
下面一行的作用是因为一个Encoder层具有两个残差结构的网络
因此构建一个ModuleList存储两个SublayerConnection,以便未来对数据进行残差处理
"""
self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(size, dropout), 2)
def forward(self, x, mask):
"""
:param x: Encoder层的输入
:param mask: mask标志
:return: 经过一个Encoder层处理后的输出
"""
"""
编码层第一层子层
self.attn 应该是一个已经初始化的Multi-Head Attention层
把Encoder的输入数据x和经过一个Multi-Head Attention处理后的x_attn送入第一个残差网络进行处理得到first_x
"""
first_x = self.sublayer_connection_list[0](x, lambda x_attn: self.attn(x, x, x, mask))
"""
编码层第二层子层
把经过第一层子层处理后的数据first_x与前馈神经网络送入第二个残差网络进行处理得到Encoder层的输出
"""
return self.sublayer_connection_list[1](first_x, self.feed_forward)
class DecoderLayer(nn.Module):
"""
一层解码Decoder层
Mask MultiHeadAttention -> Add & Norm -> Multi-Head Attention -> Add & Norm
-> Feed Forward -> Add & Norm
"""
def __init__(self, d_model, attn, feed_forward, sublayer_num, dropout=0.1):
"""
:param d_model: d_model
:param attn: 已经初始化的Multi-Head Attention层
:param feed_forward: 已经初始化的Feed Forward层
:param sublayer_num: 解码器内部子层数,如果未来r2l_memory传入有值,则为4层,否则为普通的3层
:param dropout: drop比率
"""
super(DecoderLayer, self).__init__()
self.attn = attn
self.feed_forward = feed_forward
self.sublayer_connection_list = clone_module_to_modulelist(SublayerConnection(d_model, dropout), sublayer_num)
def forward(self, x, l2r_memory, src_mask, trg_mask, r2l_memory=None, r2l_trg_mask=None):
"""
:param x: Decoder的输入(captioning)
:param l2r_memory: Encoder的输出,作为Multi-Head Attention的K,V值,为从左到右的Encoder的输出
:param src_mask: 编码器输入的填充掩码
:param trg_mask: 解码器输入的填充掩码和序列掩码,即对后面单词的掩码
:param r2l_memory: 从右到左解码器的输出
:param r2l_trg_mask: 从右到左解码器的输出的填充掩码和序列掩码
:return: Encoder的输出
"""
"""
解码器第一层子层
把Decoder的输入数据x和经过一个Masked Multi-Head Attention处理后的first_x_attn送入第一个残差网络进行处理得到first_x
"""
first_x = self.sublayer_connection_list[0](x, lambda first_x_attn: self.attn(x, x, x, trg_mask))
"""
解码器第二层子层
把第一层子层得到的first_x和
经过一个Multi-Head Attention处理后的second_x_attn(由first_x和Encoder的输出进行自注意力计算)
送入第二个残差网络进行处理
"""
second_x = self.sublayer_connection_list[1](first_x,
lambda second_x_attn: self.attn(first_x, l2r_memory, l2r_memory,
src_mask))
"""
解码器第三层子层
把经过第二层子层处理后的数据second_x与前馈神经网络送入第三个残差网络进行处理得到Decoder层的输出
如果有r2l_memory数据,则还需要经过一层多头注意力计算,也就是说会有四个残差网络
r2l_memory是让Decoder层多了一层双向编码中从右到左的编码层
而只要三个残差网络的Decoder层只有从左到右的编码
"""
if not r2l_memory:
third_x = self.sublayer_connection_list[-2](second_x,
lambda third_x_attn: self.attn(second_x, r2l_memory, r2l_memory,
r2l_trg_mask))
return self.sublayer_connection_list[-1](third_x, self.feed_forward)
else:
return self.sublayer_connection_list[-1](second_x, self.feed_forward)
class Encoder(nn.Module):
"""
构建n层编码层
"""
def __init__(self, n, encoder_layer):
"""
:param n: Encoder层的层数
:param encoder_layer: 初始化的Encoder层
"""
super(Encoder, self).__init__()
self.encoder_layer_list = clone_module_to_modulelist(encoder_layer, n)
def forward(self, x, src_mask):
"""
:param x: 输入数据
:param src_mask: mask标志
:return: 经过n层Encoder处理后的数据
"""
for encoder_layer in self.encoder_layer_list:
x = encoder_layer(x, src_mask)
return x
class R2LDecoder(nn.Module):
"""
n个含有R2L自注意计算的解码层,该解码层只有3个残差网络
"""
def __init__(self, n_layers, decoder_layer):
"""
:param n_layers: Decoder层的层数
:param decoder_layer: 初始化的Decoder层
"""
super(R2LDecoder, self).__init__()
self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)
def forward(self, x, memory, src_mask, trg_mask):
for decoder_layer in self.decoder_layer_list:
x = decoder_layer(x, memory, src_mask, trg_mask)
return x
class L2RDecoder(nn.Module):
"""
n个含有L2R自注意计算的解码层,该解码层有4个残差网络
"""
def __init__(self, n_layers, decoder_layer):
"""
:param n_layers: Decoder层的层数
:param decoder_layer: 初始化的Decoder层
"""
super(L2RDecoder, self).__init__()
self.decoder_layer_list = clone_module_to_modulelist(decoder_layer, n_layers)
def forward(self, x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
for decoder_layer in self.decoder_layer_list:
x = decoder_layer(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)
return x
def sequence_mask(size):
"""
序列掩码,解码器输入数据时掩盖后续词的位置
:param size: 生成词个数
:return: 右上角为False,主对角线及左下角为True的bool矩阵
"""
attn_shape = (1, size, size)
"""
np.triu:返回函数的上三角矩阵A,k=1得到主对角线向上平移一个距离的对角线,
即保留右上对角线及其以上的数据,其余置为0,即a_11=0
"""
mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
return (torch.from_numpy(mask) == 0).cuda()
def src_trg_mask(src, r2l_trg, trg, pad_idx):
"""
:param src: 编码器的输入
:param r2l_trg: r2l方向解码器的输入
:param trg: l2r方向解码器的输入
:param pad_idx: pad的索引
:return: trg为None,返回编码器输入的掩码;trg存在,返回编码器和解码器输入的掩码
"""
if isinstance(src, tuple) and len(src) == 4:
src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)
src_rel_mask = (src[3][:, :, 0] != pad_idx).unsqueeze(1)
enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask, src_rel_mask)
dec_src_mask = src_image_mask & src_motion_mask
src_mask = (enc_src_mask, dec_src_mask)
elif isinstance(src, tuple) and len(src) == 3:
src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
src_object_mask = (src[2][:, :, 0] != pad_idx).unsqueeze(1)
enc_src_mask = (src_image_mask, src_motion_mask, src_object_mask)
dec_src_mask = src_image_mask & src_motion_mask
src_mask = (enc_src_mask, dec_src_mask)
elif isinstance(src, tuple) and len(src) == 2:
src_image_mask = (src[0][:, :, 0] != pad_idx).unsqueeze(1)
src_motion_mask = (src[1][:, :, 0] != pad_idx).unsqueeze(1)
enc_src_mask = (src_image_mask, src_motion_mask)
dec_src_mask = src_image_mask & src_motion_mask
src_mask = (enc_src_mask, dec_src_mask)
else:
src_mask = src_image_mask = (src[:, :, 0] != pad_idx).unsqueeze(1)
if trg and r2l_trg:
"""
trg_mask是填充掩码和序列掩码,&前是填充掩码,&后是通过subsequent_mask函数得到的序列掩码
其中type_as,是为了让序列掩码和填充掩码的维度一致
"""
trg_mask = (trg != pad_idx).unsqueeze(1) & sequence_mask(trg.size(1)).type_as(src_image_mask.data)
r2l_pad_mask = (r2l_trg != pad_idx).unsqueeze(1).type_as(src_image_mask.data)
r2l_trg_mask = r2l_pad_mask & sequence_mask(r2l_trg.size(1)).type_as(src_image_mask.data)
return src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask
else:
return src_mask
class WordProbGenerator(nn.Module):
"""
文本生成器,即把Decoder层的输出通过最后一层softmax层变化为词概率
"""
def __init__(self, d_model, vocab_size):
"""
:param d_model: 词向量维度
:param vocab_size: 词典大小
"""
super(WordProbGenerator, self).__init__()
self.linear = nn.Linear(d_model, vocab_size)
def forward(self, x):
return F.log_softmax(self.linear(x), dim=-1)
class ABDTransformer(nn.Module):
"""
拼凑出Transformer
"""
def __init__(self, vocab, d_feat, d_model, d_ff, n_heads, n_layers, dropout, feature_mode, device='cuda'):
"""
:param vocab: 字典长度
:param d_feat: per frame dimension(每帧的维度)
:param d_model: 词向量的长度
:param d_ff: FNN(FeedForward)第二层隐藏层输入的维度
:param n_heads: 多头注意力时的头数
:param n_layers: 编码器和解码器的层数
:param dropout: drop的比率
:param feature_mode: 提取视频特征的模式
:param device: 是否使用gpu
"""
super(ABDTransformer, self).__init__()
self.vocab = vocab
self.device = device
self.feature_mode = feature_mode
attn = MultiHeadAttention(n_heads, d_model, dropout)
feed_forward = FeedForward(d_model, d_ff)
"""
提取视频特征向量
通过feature_mode判断d_feat提取出的维度,也就是提取了多少种信息
共有四种特征向量信息,四种特征向量依次为:
image_mask:二维特征向量
motion_mask:三维特征向量
object_mask:目标检测,分为两部分,第一部分是目标检测框的坐标,第二部分是被检测目标的特征向量
rel_mask:是目标之间的关系特征向量
"""
if feature_mode == 'one':
self.unknown_src_embed = FeatEmbedding(d_feat, d_model, dropout)
elif feature_mode == 'two':
self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
elif feature_mode == 'three':
self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
elif feature_mode == 'four':
self.image_src_embed = FeatEmbedding(d_feat[0], d_model, dropout)
self.motion_src_embed = FeatEmbedding(d_feat[1], d_model, dropout)
self.object_src_embed = FeatEmbedding(d_feat[2], d_model, dropout)
self.rel_src_embed = FeatEmbedding(d_feat[3], d_model, dropout)
else:
raise "feature_mode没有该模式,只有['one','two','three','four']四种模式"
self.trg_embed = WordEmbedding(vocab.n_vocabs, d_model)
self.pos_embed = PositionalEncoding(d_model, dropout)
self.encoder = Encoder(n_layers, EncoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward), dropout))
"""
单向解码层
使用deepcopy的原因:因为每个层的参数是不同的,因此通过deepcopy拷贝一份到新的内存里,避免共享参数
"""
self.r2l_decoder = R2LDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
sublayer_num=3, dropout=dropout))
self.l2r_decoder = L2RDecoder(n_layers, DecoderLayer(d_model, deepcopy(attn), deepcopy(feed_forward),
sublayer_num=4, dropout=dropout))
self.word_prob_generator = WordProbGenerator(d_model, vocab.n_vocabs)
def _encoder_feature_concat(self, src, feature_type, src_mask):
"""
为接下来的encoder函数做准备,主要目的是对视频的特征向量做处理
:param src: 特征向量
:param feature_type: 根据视频的类别不同,使用不同的特征向量生成函数, ['image', 'motion', 'object', 'rel']
:param src_mask: 特征向量掩码的标志
:return: 经过处理后的视频特征向量
"""
if feature_type == 'rel':
x = self.rel_src_embed(src)
return self.encoder(x, src_mask)
x = self.__getattribute__(f'{feature_type}_src_embed')(src)
x = self.pos_embed(x)
return self.encoder(x, src_mask)
def encode(self, src, src_mask):
"""
对数据进行编码,此处主要目的是对不同类型的视频特征向量进行编码
:param src: 视频的特征向量
:param src_mask: 视频特征向量的掩码标志
:return: 成功被编码器编码的视频特征向量
"""
x_list = []
feature_type_list = ['image', 'motion', 'object', 'rel']
feature_mode_dict = {'two': 2, 'three': 3, 'four': 4}
if self.feature_mode == 'one':
return self._encoder_feature_concat[src, 'unknown', src_mask]
for i, feature_type in enumerate(feature_type_list):
if i == feature_mode_dict[self.feature_mode]:
break
x_list.append(self._encoder_feature_concat[src[i], feature_type, src_mask[i]])
return sum(x_list)
def r2l_decode(self, trg, memory, src_mask, trg_mask):
"""
对于单向编码,把视频向量转为文本向量,并且添加位置向量
:param trg: 解码器的输入
:param memory: 编码器的输出,也就是传给解码器的K、V
:param src_mask: 编码器输出的掩码标志
:param trg_mask: 解码器的掩码和单词掩码序列(看不见后面的词)
:return:
"""
x = self.trg_embed(trg)
x = self.pos_embed(x)
return self.r2l_decoder(x, memory, src_mask, trg_mask)
def l2r_decode(self, trg, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask):
"""
对于双向编码,把视频向量转为文本向量,并且添加位置向量
:param trg: 解码器的输入
:param memory: 编码器的输出,也就是传给解码器的K、V
:param src_mask: 编码器输出的掩码标志
:param trg_mask: 解码器的掩码和单词掩码序列(看不见后面的词)
:param r2l_memory: 从右到左解码器的输出
:param r2l_trg_mask: 从右到左解码器的输出的填充掩码和序列掩码
:return:
"""
x = self.trg_embed(trg)
x = self.pos_embed(x)
return self.l2r_decoder(x, memory, src_mask, trg_mask, r2l_memory, r2l_trg_mask)
def forward(self, src, r2l_trg, trg, mask):
"""
:param src: 编码器的输入
:param r2l_trg: 从右到左解码器的输入
:param trg: 从左到右解码器的输入
:param mask: mask标志
:return: 从右到左解码器和从左到右解码器的输出词概率分布
"""
if len(mask) == 4:
src_mask, r2l_pad_mask, r2l_trg_mask, trg_mask = mask
else:
raise "mask返回的是不带有解码器输入掩码的掩码元组,确认src_trg_mask()函数的参数"
if self.feature_mode == 'one':
encoding_output = self.encode(src, src_mask)
r2l_output = self.r2l_decode(r2l_trg, encoding_output, src_mask, r2l_trg_mask)
l2r_output = self.l2r_decode(trg, encoding_output, src_mask, trg_mask, r2l_output, r2l_pad_mask)
elif self.feature_mode == 'two' or 'three' or 'four':
enc_src_mask, dec_src_mask = src_mask
encoding_output = self.encode(src, enc_src_mask)
r2l_output = self.r2l_decode(r2l_trg, encoding_output, dec_src_mask, r2l_trg_mask)
l2r_output = self.l2r_decode(trg, encoding_output, dec_src_mask, trg_mask, r2l_output, r2l_pad_mask)
else:
raise "没有这种feature_mode,只有['one','two','three','four']"
r2l_pred = self.word_prob_generator(r2l_output)
l2r_pred = self.word_prob_generator(l2r_output)
return r2l_pred, l2r_pred
def greedy_decode(self, batch_size, src_mask, memory, max_len):
"""
针对r2l的解码单词生成,贪婪解码,每次按照最大概率的词作为候选词
:param batch_size: 每次送入的数据的数量
:param src_mask: 编码器输入数据的掩码标志
:param memory: 编码器的输出
:param max_len: 最大的迭代次数,即生成单词数
:return: 返回的r2l_hidden,即未来送入l2r中的r2l_memory;output是r2l层的预测输出
"""
eos_idx = self.vocab.word2idx['']
with torch.no_grad():
output = torch.ones(batch_size, 1).fill_(eos_idx).long().cuda()
for i in range(max_len + 2 - 1):
trg_mask = sequence_mask(output.size(1))
dec_out = self.r2l_decode(output, memory, src_mask, trg_mask)
r2l_hidden = dec_out
pred = self.word_prob_generator(dec_out)
next_word = pred[:, -1].max(dim=-1)[1].unsqueeze(1)
output = torch.cat([output, next_word], dim=-1)
return r2l_hidden, output
def r2l_beam_search_decode(self, batch_size, src, src_mask, model_encodings, beam_size, max_len):
"""
Beam Search算法可以参考:https://www.cnblogs.com/nickchen121/p/15499576.html
在每生成一个单词的时间步上,不是只保留当前分数最高的1个输出,而是保留num_beams个。
当num_beams=1时集束搜索就退化成了贪心搜索,也就是上述的greedy_decode。
:param batch_size: 一次送入数据的大小
:param src: 编码器的输入
:param src_mask: 编码器输入的掩码
:param model_encodings:
:param beam_size:
:param max_len: 最大迭代数,即生成单词数
:return:
"""
end_symbol = self.vocab.word2idx['']
start_symbol = self.vocab.word2idx['']
r2l_output = None
r2l_outputs = None
hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long,
device=self.device)) for _ in range(batch_size)]
completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
for _ in range(batch_size)]
for _ in range(max_len + 1):
if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
break
"""
2.1 Setup the batch. Since we use beam search, each batch has a variable number (called cur_beam_size)
between 0 and beam_size of hypotheses live at any moment. We decode all hypotheses for all batches at
the same time, so we must copy the src_encodings, src_mask, etc the appropriate number fo times for
the number of hypotheses for each example. We keep track of the number of live hypotheses for each example.
We run all hypotheses for all examples together through the decoder and log-softmax,
and then use `torch.split` to get the appropriate number of hypotheses for each example in the end.
"""
cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l = [], [], [], []
for i in range(batch_size):
if hypotheses[i] is None:
cur_beam_sizes += [0]
continue
cur_beam_size, decoded_len = hypotheses[i].shape
cur_beam_sizes += [cur_beam_size]
last_tokens += [hypotheses[i]]
model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
model_encodings_cur = torch.cat(model_encodings_l, dim=0)
src_mask_cur = torch.cat(src_mask_l, dim=0)
y_tm1 = torch.cat(last_tokens, dim=0)
if self.feature_mode == 'one':
out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device))
elif self.feature_mode == 'two' or 'three' or 'four':
out = self.r2l_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device))
else:
raise "out为None"
r2l_output = out
log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
_, decoded_len, vocab_sz = log_prob.shape
log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)
"""
2.2 Now we process each example in the batch.
Note that the example may have already finished processing before
other examples (no more hypotheses to try), in which case we continue
"""
new_hypotheses, new_hyp_scores = [], []
for i in range(batch_size):
if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
new_hypotheses += [None]
new_hyp_scores += [None]
continue
"""
2.2.1 We compute the cumulative scores for each live hypotheses for the example
hyp_scores is the old scores for the previous stage, and `log_prob` are the new probs for
this stage. Since they are log probs, we sum them instead of multiplying them.
The .view(-1) forces all the hypotheses into one dimension. The shape of this dimension is
cur_beam_sz * vocab_sz (ex: 5 * 50002).
So after getting the topk from it,
we can recover the generating sentence and the next word using: ix // vocab_sz, ix % vocab_sz.
"""
cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
.expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)
"""
2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating) sentence
and the next word using: ix // vocab_sz, ix % vocab_sz.
"""
live_hyp_num_i = beam_size - len(completed_hypotheses[i])
top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
"""
shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz.
hyp_word_ids vals are 0 <= val < vocab_len
"""
prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs
"""
2.2.3 For each of the topk words, we append the new word to the current (generating) sentence
We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i
"""
new_hypotheses_i, new_hyp_scores_i = [], []
for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
top_cand_hyp_scores):
prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()
new_hyp_sent = torch.cat(
(hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
if hyp_word_id == end_symbol:
completed_hypotheses[i].append(Hypothesis(
value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
score=cand_new_hyp_score))
else:
new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
new_hyp_scores_i.append(cand_new_hyp_score)
"""
2.2.4 We may find that the hypotheses_i for some example in the batch
is empty - we have fully processed that example. We use None as a sentinel in this case.
Above, the loops gracefully handle None examples.
"""
if len(new_hypotheses_i) > 0:
hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
else:
hypotheses_i, hyp_scores_i = None, None
new_hypotheses += [hypotheses_i]
new_hyp_scores += [hyp_scores_i]
hypotheses, hyp_scores = new_hypotheses, new_hyp_scores
"""
2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
Sometimes, we may get to max_len of a sentence and still not generate the end token.
In this case, the partial sentence we have generated will not be added to the completed_hypotheses
automatically, and we have to manually add it in. We add in as many as necessary so that there are
`beam_size` completed hypotheses for each example.
Finally, we sort each completed hypothesis by score.
"""
for i in range(batch_size):
hyps_to_add = beam_size - len(completed_hypotheses[i])
if hyps_to_add > 0:
scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
for score, id_ in zip(scores, ix):
completed_hypotheses[i].append(Hypothesis(
value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
score=score))
completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
return r2l_output, completed_hypotheses
def beam_search_decode(self, src, beam_size, max_len):
"""
An Implementation of Beam Search for the Transformer Model.
Beam search is performed in a batched manner. Each example in a batch generates `beam_size` hypotheses.
We return a list (len: batch_size) of list (len: beam_size) of Hypothesis,
which contain our output decoded sentence sand their scores.
:param src: shape (sent_len, batch_size). Each val is 0 < val < len(vocab_dec). The input tokens to the decoder.
:param max_len: the maximum length to decode
:param beam_size: the beam size to use
:return completed_hypotheses: A List of length batch_size,
each containing a List of beam_size Hypothesis objects.Hypothesis is a named Tuple,
its first entry is "value" and is a List of strings which contains the translated word
(one string is one word token).
The second entry is "score" and it is the log-prob score for this translated sentence.
Note: Below I note "4 bt", "5 beam_size" as the shapes of objects. 4, 5 are default values.
Actual values may differ.
"""
start_symbol = self.vocab.word2idx['']
end_symbol = self.vocab.word2idx['']
src_mask = src_trg_mask(src, r2l_trg=None, trg=None, pad_idx=self.vocab.word2idx[''])
if self.feature_mode == 'one':
batch_size = src.shape[0]
dec_src_mask = None
model_encodings = self.encode(src, src_mask)
r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
model_encodings=model_encodings,
beam_size=1, max_len=max_len)
elif self.feature_mode == 'two' or 'three' or 'four':
batch_size = src[0].shape[0]
enc_src_mask = src_mask[0]
dec_src_mask = src_mask[1]
model_encodings = self.encode(src, enc_src_mask)
r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, dec_src_mask,
model_encodings=model_encodings,
beam_size=1, max_len=max_len)
else:
raise "batch_size为None"
"""
1.2 Setup r2l target output
r2l_memory, r2l_completed_hypotheses = self.r2l_beam_search_decode(batch_size, src, src_mask,
model_encodings=model_encodings,
beam_size=1, max_len=max_len)
r2l_memory, r2l_completed_hypotheses = self.greedy_decode(batch_size, src_mask, model_encodings, max_len)
beam_r2l_memory = [copy.deepcopy(r2l_memory) for _ in range(beam_size)]
1.3 Setup Tgt Hypothesis Tracking
"""
hypotheses = [copy.deepcopy(torch.full((1, 1), start_symbol, dtype=torch.long,
device=self.device)) for _ in range(batch_size)]
completed_hypotheses = [copy.deepcopy([]) for _ in range(batch_size)]
hyp_scores = [copy.deepcopy(torch.full((1,), 0, dtype=torch.float, device=self.device))
for _ in range(batch_size)]
for _ in range(max_len + 1):
if all([len(completed_hypotheses[i]) == beam_size for i in range(batch_size)]):
break
"""
2.1 Setup the batch. Since we use beam search, each batch has a variable number (called cur_beam_size)
between 0 and beam_size of hypotheses live at any moment. We decode all hypotheses for all batches at
the same time, so we must copy the src_encodings, src_mask, etc the appropriate number fo times for
the number of hypotheses for each example. We keep track of the number of live hypotheses for each example.
We run all hypotheses for all examples together through the decoder and log-softmax,
and then use `torch.split` to get the appropriate number of hypotheses for each example in the end.
"""
cur_beam_sizes, last_tokens, model_encodings_l, src_mask_l, r2l_memory_l = [], [], [], [], []
for i in range(batch_size):
if hypotheses[i] is None:
cur_beam_sizes += [0]
continue
cur_beam_size, decoded_len = hypotheses[i].shape
cur_beam_sizes += [cur_beam_size]
last_tokens += [hypotheses[i]]
model_encodings_l += [model_encodings[i:i + 1]] * cur_beam_size
if self.feature_mode == 'one':
src_mask_l += [src_mask[i:i + 1]] * cur_beam_size
elif dec_src_mask and (self.feature_mode == 'two' or 'three' or 'four'):
src_mask_l += [dec_src_mask[i:i + 1]] * cur_beam_size
r2l_memory_l += [r2l_memory[i: i + 1]] * cur_beam_size
model_encodings_cur = torch.cat(model_encodings_l, dim=0)
src_mask_cur = torch.cat(src_mask_l, dim=0)
y_tm1 = torch.cat(last_tokens, dim=0)
r2l_memory_cur = torch.cat(r2l_memory_l, dim=0)
if self.feature_mode == 'one':
out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
Variable(sequence_mask(y_tm1.size(-1)).type_as(src.data)).to(self.device),
r2l_memory_cur, r2l_trg_mask=None)
elif self.feature_mode == 'two' or 'three' or 'four':
out = self.l2r_decode(Variable(y_tm1).to(self.device), model_encodings_cur, src_mask_cur,
Variable(sequence_mask(y_tm1.size(-1)).type_as(src[0].data)).to(self.device),
r2l_memory_cur, r2l_trg_mask=None)
else:
raise "out为None"
log_prob = self.word_prob_generator(out[:, -1, :]).unsqueeze(1)
_, decoded_len, vocab_sz = log_prob.shape
log_prob = torch.split(log_prob, cur_beam_sizes, dim=0)
"""
2.2 Now we process each example in the batch.
Note that the example may have already finished processing before.
other examples (no more hypotheses to try), in which case we continue
"""
new_hypotheses, new_hyp_scores = [], []
for i in range(batch_size):
if hypotheses[i] is None or len(completed_hypotheses[i]) >= beam_size:
new_hypotheses += [None]
new_hyp_scores += [None]
continue
cur_beam_sz_i, dec_sent_len, vocab_sz = log_prob[i].shape
"shape (vocab_sz,)"
cumulative_hyp_scores_i = (hyp_scores[i].unsqueeze(-1).unsqueeze(-1)
.expand((cur_beam_sz_i, 1, vocab_sz)) + log_prob[i]).view(-1)
"""
2.2.2 We get the topk values in cumulative_hyp_scores_i and compute the current (generating) sentence
and the next word using: ix // vocab_sz, ix % vocab_sz.
"""
live_hyp_num_i = beam_size - len(completed_hypotheses[i])
top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(cumulative_hyp_scores_i, k=live_hyp_num_i)
"""
shape (cur_beam_sz,). prev_hyp_ids vals are 0 <= val < cur_beam_sz.
hyp_word_ids vals are 0 <= val < vocab_len
"""
prev_hyp_ids = top_cand_hyp_pos // self.vocab.n_vocabs
hyp_word_ids = top_cand_hyp_pos % self.vocab.n_vocabs
"""
2.2.3 For each of the topk words, we append the new word to the current (generating) sentence
We add this to new_hypotheses_i and add its corresponding total score to new_hyp_scores_i
"""
new_hypotheses_i, new_hyp_scores_i = [], []
for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
top_cand_hyp_scores):
prev_hyp_id, hyp_word_id, cand_new_hyp_score = \
prev_hyp_id.item(), hyp_word_id.item(), cand_new_hyp_score.item()
new_hyp_sent = torch.cat(
(hypotheses[i][prev_hyp_id], torch.tensor([hyp_word_id], device=self.device)))
if hyp_word_id == end_symbol:
completed_hypotheses[i].append(Hypothesis(
value=[self.vocab.idx2word[a.item()] for a in new_hyp_sent[1:-1]],
score=cand_new_hyp_score))
else:
new_hypotheses_i.append(new_hyp_sent.unsqueeze(-1))
new_hyp_scores_i.append(cand_new_hyp_score)
if len(new_hypotheses_i) > 0:
hypotheses_i = torch.cat(new_hypotheses_i, dim=-1).transpose(0, -1).to(self.device)
hyp_scores_i = torch.tensor(new_hyp_scores_i, dtype=torch.float, device=self.device)
else:
hypotheses_i, hyp_scores_i = None, None
new_hypotheses += [hypotheses_i]
new_hyp_scores += [hyp_scores_i]
hypotheses, hyp_scores = new_hypotheses, new_hyp_scores
"""
2.3 Finally, we do some postprocessing to get our final generated candidate sentences.
Sometimes, we may get to max_len of a sentence and still not generate the end token.
In this case, the partial sentence we have generated will not be added to the completed_hypotheses
automatically, and we have to manually add it in. We add in as many as necessary so that there are
`beam_size` completed hypotheses for each example.
Finally, we sort each completed hypothesis by score.
"""
for i in range(batch_size):
hyps_to_add = beam_size - len(completed_hypotheses[i])
if hyps_to_add > 0:
scores, ix = torch.topk(hyp_scores[i], k=hyps_to_add)
for score, id_ in zip(scores, ix):
completed_hypotheses[i].append(Hypothesis(
value=[self.vocab.idx2word[a.item()] for a in hypotheses[i][id_][1:]],
score=score))
completed_hypotheses[i].sort(key=lambda hyp: hyp.score, reverse=True)
return r2l_completed_hypotheses, completed_hypotheses
参考链接:
- 博客配套视频链接: https://space.bilibili.com/383551518?spm_id_from=333.1007.0.0 b 站直接看
- 配套 github 链接:https://github.com/nickchen121/Pre-training-language-model
- 配套博客链接:https://www.cnblogs.com/nickchen121/p/15105048.html
- 注释版本代码地址:https://www.cnblogs.com/nickchen121/p/16518613.html