【参考:Transformer简明教程, 从理论到代码实现到项目实战, NLP进阶必知必会._哔哩哔哩_bilibili】
举了一个实例,计算过程浅显易懂
【参考:NLP - Transformer_伊织_的博客-CSDN博客】
【参考:lansinuote/Transformer_Example | github】
下面略有修改
import torch
import random
import numpy as np
import math
# Seed torch's global random number generator.
torch.manual_seed(100)

# Fixed sentence length: shorter sentences are padded, longer ones truncated.
max_seq_len = 50
# Number of samples per batch.
batch_size = 8
# Width of a token embedding (d_model).
embed_dim = 32
# Vocabulary size: 3 special tokens + 10 digits + 26 lowercase letters.
vocab_size = 39
# Number of attention heads.
heads = 4
# Width of the slice each head receives (32 // 4 == 8).
head_dim = embed_dim // heads
# data.py
# Vocabulary: the three special tokens take ids 0-2, followed by the ten
# digits and the 26 lowercase letters (39 entries in total).
zidian_x = '<SOS>,<EOS>,<PAD>,0,1,2,3,4,5,6,7,8,9,q,w,e,r,t,y,u,i,o,p,a,s,d,f,g,h,j,k,l,z,x,c,v,b,n,m'
zidian_x = {word: i for i, word in enumerate(zidian_x.split(','))}  # word -> id
zidian_xr = list(zidian_x)  # id -> word (dicts preserve insertion order)
zidian_y = {k.upper(): v for k, v in zidian_x.items()}  # WORD -> id
zidian_yr = list(zidian_y)  # id -> WORD
# Backward-compatible alias: other sections of this file look the pad id up
# under the empty-string key. It is added after the reverse tables are built
# so those tables keep exactly one entry per id.
zidian_x[''] = zidian_x['<PAD>']
zidian_y[''] = zidian_y['<PAD>']


def get_data(max_len=None):
    """Generate one random (x, y) training pair.

    y is derived from x as follows (example: x = a b c 1 2 3):
      1. reverse the sequence            -> 3 2 1 c b a
      2. map digits to 9 - digit and
         letters to upper case           -> 6 7 8 C B A
      3. duplicate the first element     -> 6 6 7 8 C B A
    Step 3 exists only to make the task a little harder.

    Args:
        max_len: padded length of x (y is one token longer). Defaults to the
            module-level ``max_seq_len``. Must be at least 32 because the
            sampled body has at least 30 tokens plus <SOS> and <EOS>.

    Returns:
        A tuple ``(x, y)`` of 1-D ``torch.LongTensor`` with ``max_len`` and
        ``max_len + 1`` token ids respectively.
    """
    if max_len is None:
        max_len = max_seq_len

    # Candidate tokens, in the same order as in the vocabulary.
    words = list('0123456789qwertyuiopasdfghjklzxcvbnm')

    # Unnormalised sampling weights: 1..10 for the digits, 1..26 for letters.
    p = np.array([*range(1, 11), *range(1, 27)])
    p = p / p.sum()

    # Sample n tokens, leaving two positions free for <SOS> and <EOS>.
    n = random.randint(30, max_len - 2)
    x = np.random.choice(words, size=n, replace=True, p=p).tolist()

    def transform(token):
        """Upper-case letters; replace a digit d by 9 - d."""
        token = token.upper()
        if not token.isdigit():
            return token
        return str(9 - int(token))

    y = [transform(token) for token in x]
    # Appending a copy of the last element and then reversing puts the
    # duplicate at the front of y.
    y = y + [y[-1]]
    y = y[::-1]

    # Add the start / end markers.
    x = ['<SOS>'] + x + ['<EOS>']
    y = ['<SOS>'] + y + ['<EOS>']

    # Pad generously, then cut back to the fixed lengths.
    x = (x + ['<PAD>'] * max_len)[:max_len]
    y = (y + ['<PAD>'] * (max_len + 1))[:max_len + 1]

    # Encode tokens as ids and convert to tensors.
    x = torch.LongTensor([zidian_x[token] for token in x])
    y = torch.LongTensor([zidian_y[token] for token in y])
    return x, y
# Dataset definition
class Dataset(torch.utils.data.Dataset):
    """Synthetic dataset whose samples are generated on the fly."""

    def __init__(self):
        super().__init__()

    def __len__(self):
        # Nominal size: 100k samples.
        return 100000

    def __getitem__(self, i):
        # The index is ignored; every access draws a fresh random pair.
        return get_data()
# Data loader: shuffled batches of `batch_size` samples; a trailing
# incomplete batch is dropped.
loader = torch.utils.data.DataLoader(
    Dataset(),
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=None,
)
# util.py
# Attention function
def attention(Q, K, V, mask):
    """Scaled dot-product attention over several heads, merged per token.

    Args:
        Q: queries, ``[batch_size, heads, q_len, head_dim]``.
        K: keys, ``[batch_size, heads, k_len, head_dim]``.
        V: values, ``[batch_size, heads, k_len, head_dim]``.
        mask: boolean tensor broadcastable to
            ``[batch_size, heads, q_len, k_len]``; True marks positions that
            must receive zero attention.

    Returns:
        Tensor of shape ``[batch_size, q_len, heads * head_dim]``.
    """
    # Read every size from the input instead of module-level constants so the
    # function also works when the sequence length differs from the default.
    batch_size, heads, q_len, head_dim = Q.shape

    # [b, h, q_len, d] x [b, h, d, k_len] -> [b, h, q_len, k_len]
    # Scale by sqrt(head_dim) to keep the logits in a reasonable range.
    score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(head_dim)

    # Masked positions become -inf, which softmax turns into weight 0.
    score = score.masked_fill(mask, float('-inf'))
    score = torch.softmax(score, dim=-1)

    # [b, h, q_len, k_len] x [b, h, k_len, d] -> [b, h, q_len, d]
    score = torch.matmul(score, V)

    # Concatenate the heads: [b, h, q_len, d] -> [b, q_len, h * d]
    score = score.permute(0, 2, 1, 3).reshape(batch_size, q_len, heads * head_dim)
    return score
# Multi-head attention layer
class MultiHead(torch.nn.Module):
    """Pre-norm multi-head attention with a residual connection.

    The inputs are layer-normalised, projected, split into ``heads`` heads,
    passed through ``attention`` and projected back; the un-normalised query
    is then added to the result.
    """

    def __init__(self):
        super().__init__()
        self.fc_Q = torch.nn.Linear(embed_dim, embed_dim)
        self.fc_K = torch.nn.Linear(embed_dim, embed_dim)
        self.fc_V = torch.nn.Linear(embed_dim, embed_dim)
        self.out_fc = torch.nn.Linear(embed_dim, embed_dim)
        # LayerNorm normalises over the embedding dimension of each token
        # (BatchNorm would normalise across samples instead).
        self.norm = torch.nn.LayerNorm(normalized_shape=embed_dim,
                                       elementwise_affine=True)
        self.dropout = torch.nn.Dropout(p=0.1)

    def forward(self, Q, K, V, mask):
        """Apply attention.

        Args:
            Q: ``[batch_size, q_len, embed_dim]``.
            K, V: ``[batch_size, k_len, embed_dim]``.
            mask: boolean mask forwarded to ``attention``.

        Returns:
            Tensor of shape ``[batch_size, q_len, embed_dim]``.
        """
        batch_size, q_len, _ = Q.shape
        k_len = K.shape[1]

        # Keep the raw query for the residual connection.
        residual = Q

        # Normalise, then project; shapes are unchanged.
        Q = self.fc_Q(self.norm(Q))
        K = self.fc_K(self.norm(K))
        V = self.fc_V(self.norm(V))

        # Split into heads: [b, len, embed_dim] -> [b, heads, len, head_dim].
        # K and V must be split from their own projections (not from Q),
        # otherwise the keys/values passed in are silently discarded.
        Q = Q.reshape(batch_size, q_len, heads, head_dim).permute(0, 2, 1, 3)
        K = K.reshape(batch_size, k_len, heads, head_dim).permute(0, 2, 1, 3)
        V = V.reshape(batch_size, k_len, heads, head_dim).permute(0, 2, 1, 3)

        # [b, heads, q_len, head_dim] -> [b, q_len, embed_dim]
        score = attention(Q, K, V, mask)

        # Output projection with dropout, then the residual connection.
        score = self.dropout(self.out_fc(score))
        return residual + score
# Positional-encoding layer
class PositionEmbedding(torch.nn.Module):
    """Token embedding plus a fixed sinusoidal positional encoding."""

    def __init__(self):
        super().__init__()

        def get_pe(pos, i, embed_dim):
            """Encoding value for token position ``pos`` and dimension ``i``."""
            # NOTE(review): the exponent uses the raw dimension index i, so
            # every dimension gets its own frequency; the original paper
            # shares one frequency per sin/cos pair. Kept as written.
            fenmu = 1e4 ** (i / embed_dim)
            pe = pos / fenmu
            if i % 2 == 0:
                return math.sin(pe)
            return math.cos(pe)

        # Fill the [max_seq_len, embed_dim] table once.
        pe = torch.empty(max_seq_len, embed_dim)
        for i in range(max_seq_len):
            for j in range(embed_dim):
                pe[i, j] = get_pe(i, j, embed_dim)
        pe = pe.unsqueeze(0)  # [1, max_seq_len, embed_dim]

        # A buffer is saved with the module but is not a trainable parameter.
        self.register_buffer('pe', pe)

        # Token embedding, initialised from N(0, 0.1).
        self.embed = torch.nn.Embedding(vocab_size, embed_dim)
        self.embed.weight.data.normal_(0, 0.1)

    def forward(self, x):
        """Embed token ids ``x`` of shape ``[batch_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, seq_len, embed_dim]``.
        """
        embed = self.embed(x)
        # Slice the table to the input length so sequences shorter than
        # max_seq_len are accepted as well.
        return embed + self.pe[:, :x.shape[1]]
# Position-wise feed-forward output layer
class FullyConnectedOutput(torch.nn.Module):
    """Pre-norm two-layer feed-forward block with a residual connection."""

    def __init__(self):
        super().__init__()
        # embed_dim -> 64 -> embed_dim, so the output width equals the input.
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(in_features=embed_dim, out_features=64),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=64, out_features=embed_dim),
            torch.nn.Dropout(p=0.1),
        )
        self.norm = torch.nn.LayerNorm(normalized_shape=embed_dim,
                                       elementwise_affine=True)

    def forward(self, x):
        """Map ``[batch_size, seq_len, embed_dim]`` to the same shape."""
        # Normalise, run the feed-forward stack, then add the raw input back.
        # No copy of x is needed: neither step modifies x in place.
        out = self.fc(self.norm(x))
        return x + out
# mask.py
# Attention must not be paid to padding, so padding keys are masked out.
def mask_pad(data):
    """Build the padding mask for a batch of un-embedded token ids.

    Args:
        data: ``[batch_size, seq_len]`` tensor of token ids.

    Returns:
        Boolean tensor ``[batch_size, 1, seq_len, seq_len]``. Every column
        that corresponds to a pad token is True (no token may attend to
        padding); rows are not masked, i.e. a pad token may still attend to
        real tokens.
    """
    seq_len = data.shape[1]

    # NOTE(review): the pad id is looked up under the empty-string key, as in
    # the rest of this file; this looks like a '<PAD>' label whose text was
    # lost. Confirm against the vocabulary definition.
    mask = data == zidian_x['']

    # [b, seq_len] -> [b, 1, 1, seq_len] -> [b, 1, seq_len, seq_len]
    mask = mask.reshape(-1, 1, 1, seq_len)
    return mask.expand(-1, 1, seq_len, seq_len)
def mask_tril(data):
    """Build the decoder self-attention mask (causal mask combined with padding).

    Args:
        data: ``[batch_size, seq_len]`` tensor of un-embedded token ids.

    Returns:
        Boolean tensor ``[batch_size, 1, seq_len, seq_len]`` in which True
        marks a position that must not be attended to.
    """
    seq_len = data.shape[1]

    # Strict upper triangle: each token sees itself and earlier tokens only.
    # For seq_len == 5:
    #   [[0, 1, 1, 1, 1],
    #    [0, 0, 1, 1, 1],
    #    [0, 0, 0, 1, 1],
    #    [0, 0, 0, 0, 1],
    #    [0, 0, 0, 0, 0]]
    future = torch.triu(torch.ones(1, seq_len, seq_len, device=data.device),
                        diagonal=1).bool()

    # NOTE(review): the pad id is looked up under the empty-string key, as in
    # the rest of this file; this looks like a '<PAD>' label whose text was
    # lost. Confirm against the vocabulary definition.
    is_pad = (data == zidian_y['']).unsqueeze(1)  # [b, 1, seq_len]

    # Union of both masks: [b, 1, seq_len] | [1, seq_len, seq_len]
    # broadcasts to [b, seq_len, seq_len].
    mask = is_pad | future

    # Add the head dimension for broadcasting inside attention.
    return mask.unsqueeze(dim=1)
# model.py
# Encoder layer
class EncoderLayer(torch.nn.Module):
    """Self-attention followed by the feed-forward block."""

    def __init__(self):
        super().__init__()
        self.mh = MultiHead()
        self.fc = FullyConnectedOutput()

    def forward(self, x, mask):
        """Map ``[batch_size, seq_len, embed_dim]`` to the same shape."""
        # Self-attention: x serves as query, key and value.
        score = self.mh(x, x, x, mask)
        # Feed-forward output; the shape is unchanged.
        return self.fc(score)
class Encoder(torch.nn.Module):
    """Stack of three encoder layers that share the same mask."""

    def __init__(self):
        super().__init__()
        self.layer_1 = EncoderLayer()
        self.layer_2 = EncoderLayer()
        self.layer_3 = EncoderLayer()

    def forward(self, x, mask):
        """Map ``[batch_size, seq_len, embed_dim]`` to the same shape."""
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            x = layer(x, mask)
        return x
# Decoder layer
class DecoderLayer(torch.nn.Module):
    """Masked self-attention, encoder-decoder attention, then feed-forward."""

    def __init__(self):
        super().__init__()
        self.mh1 = MultiHead()
        self.mh2 = MultiHead()
        self.fc = FullyConnectedOutput()

    def forward(self, x, y, mask_pad_x, mask_tril_y):
        """Decode one layer.

        Args:
            x: encoder output, ``[batch_size, seq_len, embed_dim]``.
            y: decoder input, ``[batch_size, seq_len, embed_dim]``.
            mask_pad_x: padding mask built from the source tokens.
            mask_tril_y: causal + padding mask built from the target tokens.

        Returns:
            Tensor of shape ``[batch_size, seq_len, embed_dim]``.
        """
        # Masked self-attention over the target sequence.
        y = self.mh1(y, y, y, mask_tril_y)
        # Cross-attention: queries from the decoder, keys/values from x.
        y = self.mh2(y, x, x, mask_pad_x)
        # Feed-forward output; the shape is unchanged.
        return self.fc(y)
class Decoder(torch.nn.Module):
    """Stack of three decoder layers; each one attends to the same encoder output."""

    def __init__(self):
        super().__init__()
        self.layer_1 = DecoderLayer()
        self.layer_2 = DecoderLayer()
        self.layer_3 = DecoderLayer()

    def forward(self, x, y, mask_pad_x, mask_tril_y):
        """Map ``y`` of shape ``[batch_size, seq_len, embed_dim]`` to the same shape."""
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            y = layer(x, y, mask_pad_x, mask_tril_y)
        return y
# Main model
class Transformer(torch.nn.Module):
    """Encoder-decoder Transformer for the toy sequence-translation task."""

    def __init__(self):
        super().__init__()
        self.embed_x = PositionEmbedding()
        self.embed_y = PositionEmbedding()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.fc_out = torch.nn.Linear(embed_dim, vocab_size)

    def forward(self, x, y):
        """Compute next-token logits.

        Args:
            x: source token ids, ``[batch_size, seq_len]``.
            y: target token ids, ``[batch_size, seq_len]``.

        Returns:
            Logits of shape ``[batch_size, seq_len, vocab_size]``.
        """
        # The masks are built from the raw token ids, before embedding.
        mask_pad_x = mask_pad(x)    # [b, 1, seq_len, seq_len]
        mask_tril_y = mask_tril(y)  # [b, 1, seq_len, seq_len]

        # Embed and add positional information: [b, len] -> [b, len, embed_dim]
        x, y = self.embed_x(x), self.embed_y(y)

        # Encode the source, then decode the target against it.
        x = self.encoder(x, mask_pad_x)
        y = self.decoder(x, y, mask_pad_x, mask_tril_y)

        # Project to the vocabulary: [b, len, embed_dim] -> [b, len, vocab_size]
        return self.fc_out(y)
# main.py
# Prediction function
def predict(x):
    """Greedily decode the target sequence for source ids ``x``.

    Args:
        x: source token ids, ``[batch, seq_len]`` (the script calls it with
            ``batch == 1``).

    Returns:
        ``[batch, seq_len]`` LongTensor holding the start token followed by
        the predicted ids.
    """
    model.eval()
    batch, seq_len = x.shape

    mask_pad_x = mask_pad(x)  # [batch, 1, seq_len, seq_len]

    # The output starts as [start, pad, pad, ...].
    # Prefer an explicit '<SOS>' entry when the vocabulary defines one;
    # otherwise fall back to the empty-string key this file uses for its
    # special tokens.
    pad_id = zidian_y['']
    sos_id = zidian_y.get('<SOS>', pad_id)
    target = torch.full((batch, seq_len), pad_id, dtype=torch.long, device=x.device)
    target[:, 0] = sos_id

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Encode the source once.
        x = model.embed_x(x)
        x = model.encoder(x, mask_pad_x)

        # Generate positions 1 .. seq_len - 1 one at a time.
        for i in range(seq_len - 1):
            mask_tril_y = mask_tril(target)
            y = model.embed_y(target)
            y = model.decoder(x, y, mask_pad_x, mask_tril_y)

            # [batch, seq_len, vocab_size]; only position i is needed.
            out = model.fc_out(y)[:, i, :]

            # The prediction at position i is the token for position i + 1.
            target[:, i + 1] = out.argmax(dim=1)
    return target
model = Transformer()
loss_func = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=2e-3)
sched = torch.optim.lr_scheduler.StepLR(optim, step_size=3, gamma=0.5)

for epoch in range(1):
    for i, (x, y) in enumerate(loader):
        # x: [batch_size, max_seq_len], y: [batch_size, max_seq_len + 1]
        # Teacher forcing: feed y without its last token and predict y
        # shifted left by one.
        pred = model(x, y[:, :-1])  # [batch_size, max_seq_len, vocab_size]

        # Flatten to [batch_size * max_seq_len, vocab_size] and
        # [batch_size * max_seq_len].
        pred = pred.reshape(-1, pred.shape[-1])
        y = y[:, 1:].reshape(-1)

        # Padding positions are excluded from the loss.
        # NOTE(review): the pad id is looked up under the empty-string key,
        # as in the rest of this file; confirm against the vocabulary.
        select = y != zidian_y['']
        pred = pred[select]
        y = y[select]

        loss = loss_func(pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()

        if i % 200 == 0:
            # Token-level accuracy on the current batch.
            pred = pred.argmax(1)
            correct = (pred == y).sum().item()
            accuracy = correct / len(pred)
            lr = optim.param_groups[0]['lr']
            print(epoch, i, lr, loss.item(), accuracy)

    # The learning rate is stepped once per epoch.
    sched.step()

# NOTE(review): this pickles the whole module. On PyTorch releases where
# torch.load defaults to weights_only=True, the load below needs
# weights_only=False; confirm against the installed version.
torch.save(model, 'model_transformer.pth')
model = torch.load('model_transformer.pth')

# Test: take one batch from the loader.
for i, (x, y) in enumerate(loader):
    break

# Print source, reference target and prediction for every sample.
for i in range(x.shape[0]):
    print(i)
    print(''.join([zidian_xr[t] for t in x[i].tolist()]))
    print(''.join([zidian_yr[t] for t in y[i].tolist()]))
    print(''.join([zidian_yr[t] for t in predict(x[i].unsqueeze(0))[0].tolist()]))
0%| | 0/12500 [00:00<?, ?it/s]0 0 0.002 3.8923168182373047 0.0423728813559322
2%|▏ | 199/12500 [00:06<06:34, 31.20it/s]0 200 0.002 3.329134464263916 0.07371794871794872
3%|▎ | 399/12500 [00:12<06:00, 33.59it/s]0 400 0.002 3.270310640335083 0.10557184750733138
5%|▍ | 599/12500 [00:18<05:58, 33.19it/s]0 600 0.002 3.2918624877929688 0.11875
6%|▋ | 799/12500 [00:24<06:03, 32.15it/s]0 800 0.002 3.03589129447937 0.11858974358974358
8%|▊ | 999/12500 [00:30<05:52, 32.65it/s]0 1000 0.002 2.2280383110046387 0.35625
10%|▉ | 1199/12500 [00:36<05:46, 32.63it/s]0 1200 0.002 1.505544662475586 0.535031847133758
11%|█ | 1399/12500 [00:43<05:34, 33.23it/s]0 1400 0.002 1.0581401586532593 0.641566265060241
13%|█▎ | 1599/12500 [00:49<05:34, 32.63it/s]0 1600 0.002 1.2366271018981934 0.6627218934911243
14%|█▍ | 1799/12500 [00:55<05:26, 32.76it/s]0 1800 0.002 0.7639922499656677 0.7507163323782235
16%|█▌ | 1999/12500 [01:01<05:19, 32.84it/s]0 2000 0.002 0.4993612468242645 0.8432601880877743
18%|█▊ | 2199/12500 [01:07<05:30, 31.13it/s]0 2200 0.002 0.400225967168808 0.8506493506493507
19%|█▉ | 2399/12500 [01:14<05:07, 32.83it/s]0 2400 0.002 0.5886312127113342 0.8271604938271605
21%|██ | 2599/12500 [01:20<05:03, 32.60it/s]0 2600 0.002 0.2586905360221863 0.9216300940438872
...
98%|█████████▊| 12199/12500 [06:18<00:09, 32.72it/s]0 12200 0.002 0.03803557902574539 0.9870967741935484
99%|█████████▉| 12399/12500 [06:24<00:03, 32.08it/s]0 12400 0.002 0.07276972383260727 0.975609756097561
100%|██████████| 12500/12500 [06:27<00:00, 32.22it/s]
0
<SOS>7hjv5ffbzarhkbbdblnybxpycldzcbvbva6vckfsmskatxvn<EOS>
<SOS>NNVXTAKSMSFKCV3AVBVBCZDLCYPXBYNLBDBBKHRAZBFF4VJH2<EOS>
<SOS>NNVXTAKSMSFKCV3AVBVBCZDLCYPXBYNLBDBBKHRAZBFF4VJH2
1
<SOS>dmbrmnmckw8vufadarpx5taapvzwdbnkpkcx<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>XXCKPKNBDWZVPAAT4XPRADAFUV1WKCMNMRBMD<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>XXCKPKNBDWZVPAAT4XPRADAFUV1WKCMNMRBMD<EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS>
2
<SOS>zmvidh2k7bc4chmpmhnlbant8umgg8s4m4jb7hyuzbdk<EOS><PAD><PAD><PAD><PAD>
<SOS>KKDBZUYH2BJ5M5S1GGMU1TNABLNHMPMHC5CB2K7HDIVMZ<EOS><PAD><PAD><PAD><PAD>
<SOS>KKDBZUYH2BJ5M5S1GGMU1TNABLNHMPMHC5CB2K7HDIVMZ<EOS><EOS><EOS><EOS>
3
<SOS>gf6atxijpvnjcxgm94in6echmm47ggi9ij34ghvcvuj5f<EOS><PAD><PAD><PAD>
<SOS>FF4JUVCVHG56JI0IGG25MMHCE3NI50MGXCJNVPJIXTA3FG<EOS><PAD><PAD><PAD>
<SOS>FF4JUCVHG566JI0IGG25MMHCE3NI50MGXCJNVPJIXTA3FG<EOS><EOS><EOS>
4
<SOS>9xxcbb9ndnanfnizpshlgs39v8ggmjaazjxccl5bl6<EOS><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>33LB4LCCXJZAAJMGG1V06SGLHSPZINFNANDN0BBCXX0<EOS><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>33LB4LCCXJZAAJMGG1V06SGLHSPZINFNANDN0BBCXX0<EOS><EOS><EOS><EOS><EOS><EOS>
5
<SOS>majzbnajnlzzg8c7v8bzp6kusf9v7csmufuy4umhbhvsbvv<EOS><PAD>
<SOS>VVVBSVHBHMU5YUFUMSC2V0FSUK3PZB1V2C1GZZLNJANBZJAM<EOS><PAD>
<SOS>VVVBSVHBHMU5YUFUMSC2V0FSUK3PZB1V2C1GZZLNJANBZJAM<EOS>
6
<SOS>ab8mzkpanvuzvagmvzbc8gdkpmjnoznnn<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>NNNNZONJMPKDG1CBZVMGAVZUVNAPKZM1BA<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>NNNNZONJMPKDG1CBZVMGAVZUVNAPKZM1BA<EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS>
7
<SOS>z9ho7cjc36mvhhihmboxcggncamevscv<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>VVCSVEMACNGGCXOBMHIHHVM36CJC2OH0Z<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
<SOS>VVCSVEMACNGGCXOBMHIHVHM36CJC2OH0Z<EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS><EOS>E<EOS><EOS><EOS><EOS><EOS><EOS>
下面三个都很不错
【参考:Transformer详解 - mathor】 必看
【参考:Transformer的PyTorch实现_哔哩哔哩_bilibili】
【参考:Transformer的PyTorch实现 - mathor】
代码不好理解
【参考:手把手教你用Pytorch代码实现Transformer模型_哔哩哔哩_bilibili】
代码看这个:【参考:【手撕Transformer】Transformer输入输出细节以及代码实现(pytorch)_顾道长生’的博客-CSDN博客】
文字版看这个:【参考:手把手教你用Pytorch代码实现Transformer模型(超详细的代码解读)_白马金羁侠少年的博客-CSDN博客】
【参考:Transformer代码从零解读(Pytorch版本)_哔哩哔哩_bilibili】
【参考:nlp-tutorial/Transformer.py at master · graykode/nlp-tutorial】
decoder部分没有看懂
## from https://github.com/graykode/nlp-tutorial/tree/master/5-1.Transformer
"""
复现代码心得体会
1. 从整体到局部
2. 搞清楚数据流动形状,非常关键
"""
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import math
def make_batch(sentences):
    """Turn the three whitespace-separated sentences into id tensors.

    Args:
        sentences: sequence of three strings: encoder input, decoder input
            and decoder target.

    Returns:
        Three ``torch.LongTensor`` of shape ``[1, n_tokens]``: the first is
        encoded with ``src_vocab``, the other two with ``tgt_vocab``.
    """
    input_batch = [[src_vocab[n] for n in sentences[0].split()]]
    output_batch = [[tgt_vocab[n] for n in sentences[1].split()]]
    target_batch = [[tgt_vocab[n] for n in sentences[2].split()]]
    return (torch.LongTensor(input_batch),
            torch.LongTensor(output_batch),
            torch.LongTensor(target_batch))
## 10
def get_attn_subsequent_mask(seq):
    """Build the causal ("no peeking ahead") mask.

    Args:
        seq: tensor of shape ``[batch_size, tgt_len]``; only its shape and
            device are used.

    Returns:
        ``torch.uint8`` tensor ``[batch_size, tgt_len, tgt_len]`` with 1
        strictly above the diagonal (future positions) and 0 elsewhere.
    """
    batch_size, tgt_len = seq.size(0), seq.size(1)
    # Build the mask directly in torch on the input's device; this avoids
    # the numpy round trip, which always produced a CPU tensor.
    ones = torch.ones(batch_size, tgt_len, tgt_len, dtype=torch.uint8, device=seq.device)
    return torch.triu(ones, diagonal=1)
## 7. ScaledDotProductAttention
class ScaledDotProductAttention(nn.Module):
    """softmax(Q K^T / sqrt(d_k)) V with masked positions suppressed."""

    def __init__(self):
        super().__init__()

    def forward(self, Q, K, V, attn_mask):
        """Compute attention.

        Args:
            Q: ``[batch_size, n_heads, len_q, d_k]``.
            K: ``[batch_size, n_heads, len_k, d_k]``.
            V: ``[batch_size, n_heads, len_k, d_v]``.
            attn_mask: ``[batch_size, n_heads, len_q, len_k]``; non-zero /
                True marks positions to ignore.

        Returns:
            ``(context, attn)`` with shapes
            ``[batch_size, n_heads, len_q, d_v]`` and
            ``[batch_size, n_heads, len_q, len_k]``.
        """
        # The scale is taken from Q's last dimension rather than a
        # module-level constant, so the layer does not depend on globals.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(Q.size(-1))
        # Masked positions get a very large negative score, so they receive
        # (practically) zero weight after the softmax.
        scores = scores.masked_fill(attn_mask.bool(), -1e9)
        attn = torch.softmax(scores, dim=-1)  # normalise over the keys
        context = torch.matmul(attn, V)
        return context, attn
## 6. MultiHeadAttention
class MultiHeadAttention(nn.Module):
    """Multi-head attention with a residual connection and post-LayerNorm."""

    def __init__(self):
        super().__init__()
        # Linear maps that produce the per-head queries, keys and values.
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        # Parameter-free, so one instance can be reused on every call.
        self.attention = ScaledDotProductAttention()

    def forward(self, Q, K, V, attn_mask):
        """Apply attention.

        In the encoder Q, K and V are all the layer input; in the decoder's
        cross-attention K and V come from the encoder and Q from the decoder.

        Args:
            Q: ``[batch_size, len_q, d_model]``.
            K, V: ``[batch_size, len_k, d_model]``.
            attn_mask: ``[batch_size, len_q, len_k]``.

        Returns:
            ``(output, attn)`` with shapes ``[batch_size, len_q, d_model]``
            and ``[batch_size, n_heads, len_q, len_k]``.
        """
        residual, batch_size = Q, Q.size(0)

        # Project first, then split into heads:
        # (B, S, D) -proj-> (B, S, H*W) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        # Queries and keys must share the per-head width d_k.
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)

        # Share the mask across heads:
        # [B, len_q, len_k] -> [B, 1, len_q, len_k] -> [B, H, len_q, len_k]
        # expand creates a view, so no per-head copy is made.
        attn_mask = attn_mask.unsqueeze(1).expand(-1, n_heads, -1, -1)

        # context: [B, H, len_q, d_v], attn: [B, H, len_q, len_k]
        context, attn = self.attention(q_s, k_s, v_s, attn_mask)

        # Merge the heads: [B, H, len_q, d_v] -> [B, len_q, H * d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)
        output = self.linear(context)  # [B, len_q, d_model]
        return self.layer_norm(output + residual), attn
## 8. PoswiseFeedForwardNet
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block built from two kernel-size-1 convolutions."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        """Map ``[batch_size, len_q, d_model]`` to the same shape."""
        residual = inputs
        # Conv1d expects channels first, hence the transposes around the convs.
        hidden = torch.relu(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(hidden).transpose(1, 2)
        return self.layer_norm(output + residual)
## 4. get_attn_pad_mask
## Attention scores have shape len_q x len_k: how much each query token
## attends to each key token. This helper returns a mask of that shape that
## marks the PAD keys, so their scores can be pushed to a very large negative
## value before the softmax.
## Only PAD tokens in seq_k are marked; PAD tokens in seq_q are not, because
## that is not needed. In cross-attention the queries come from the decoder
## and the keys from the encoder, so only the encoder-side PAD positions
## matter there.
def get_attn_pad_mask(seq_q, seq_k):
    """Return a boolean mask that is True wherever the key token is PAD (id 0).

    Args:
        seq_q: ``[batch_size, len_q]`` token ids; only its shape is used.
        seq_k: ``[batch_size, len_k]`` token ids.

    Returns:
        Boolean tensor of shape ``[batch_size, len_q, len_k]``.
    """
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # Token id 0 is PAD: [batch_size, len_k] -> [batch_size, 1, len_k]
    pad_attn_mask = seq_k.eq(0).unsqueeze(1)
    # Repeat the row for every query position (as a view, without copying).
    return pad_attn_mask.expand(batch_size, len_q, len_k)
## 3. PositionalEncoding
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding followed by dropout.

    Args:
        d_model: embedding width (odd values are supported).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        # Token positions 0 .. max_len - 1 as a column: [max_len, 1]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # The sin/cos arguments share the factor 1 / 10000^(2i / d_model);
        # it is computed as exp(-(2i) * ln(10000) / d_model), where 2i runs
        # over 0, 2, 4, ...  Shape: [ceil(d_model / 2)]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
        # For odd d_model there is one odd column fewer than even columns,
        # so the frequency vector is trimmed to match.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])  # odd dimensions
        # [max_len, d_model] -> [max_len, 1, d_model], broadcasting over the
        # batch dimension of a sequence-first input.
        pe = pe.unsqueeze(1)
        # A buffer is stored with the module but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding to ``x`` of shape ``[seq_len, batch_size, d_model]``."""
        # Only the first seq_len positions of the table are used.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
## 5. EncoderLayer: multi-head self-attention followed by the feed-forward network
class EncoderLayer(nn.Module):
    """One encoder layer."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """Encode one layer.

        Args:
            enc_inputs: ``[batch_size, seq_len, d_model]``.
            enc_self_attn_mask: mask forwarded to the attention module.

        Returns:
            ``(enc_outputs, attn)``; ``enc_outputs`` keeps the input shape.
        """
        # Self-attention: the same tensor is used as Q, K and V (see 6.).
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs,
                                               enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)  # [batch_size, len_q, d_model]
        return enc_outputs, attn
## 2. Encoder: token embedding, positional encoding, then the stacked attention/feed-forward layers
class Encoder(nn.Module):
    """Transformer encoder."""

    def __init__(self):
        super().__init__()
        # Lookup table of size src_vocab_size x d_model.
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        # Fixed sin/cos encoding; a learned nn.Embedding over positions
        # would be an alternative.
        self.pos_emb = PositionalEncoding(d_model)
        # Only the first layer sees embeddings directly, so embedding and
        # positional encoding live outside the stacked layers.
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):
        """Encode source ids of shape ``[batch_size, src_len]``.

        Returns:
            ``(enc_outputs, enc_self_attns)``: the encoded sequence
            ``[batch_size, src_len, d_model]`` and one attention tensor per
            layer.
        """
        enc_outputs = self.src_emb(enc_inputs)  # [batch_size, src_len, d_model]
        # PositionalEncoding expects sequence-first input (see 3.), hence
        # the transposes around it.
        enc_outputs = self.pos_emb(enc_outputs.transpose(0, 1)).transpose(0, 1)
        # Mark PAD positions so attention can ignore them (see 4.).
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)
        enc_self_attns = []
        for layer in self.layers:
            # See EncoderLayer (5.).
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns
## 10.
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self):
        super().__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """Decode one layer.

        Returns:
            ``(dec_outputs, dec_self_attn, dec_enc_attn)``.
        """
        # Masked self-attention over the decoder input.
        dec_outputs, dec_self_attn = self.dec_self_attn(
            dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        dec_outputs, dec_enc_attn = self.dec_enc_attn(
            dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.pos_ffn(dec_outputs)
        return dec_outputs, dec_self_attn, dec_enc_attn
## 9. Decoder
class Decoder(nn.Module):
    """Transformer decoder."""

    def __init__(self):
        super().__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        """Decode target ids against the encoder output.

        Args:
            dec_inputs: ``[batch_size, tgt_len]`` target token ids.
            enc_inputs: ``[batch_size, src_len]`` source token ids (used only
                to locate source PAD positions).
            enc_outputs: encoder output, ``[batch_size, src_len, d_model]``.

        Returns:
            ``(dec_outputs, dec_self_attns, dec_enc_attns)``.
        """
        dec_outputs = self.tgt_emb(dec_inputs)  # [batch_size, tgt_len, d_model]
        dec_outputs = self.pos_emb(dec_outputs.transpose(0, 1)).transpose(0, 1)

        # Self-attention mask, part 1: PAD positions in the target.
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs)
        # Part 2: future positions (upper triangle set to 1).
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs)
        # Sum both masks; anything > 0 is masked (filled with a very large
        # negative value later).
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        # Cross-attention mask: the keys come from the encoder, so only the
        # PAD positions of the source matter; PAD queries are not masked.
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(
                dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)
        return dec_outputs, dec_self_attns, dec_enc_attns
## 1. Overall architecture: encoder, decoder and output projection
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a linear projection onto the target vocabulary."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Maps each decoder output vector (d_model) to tgt_vocab_size logits;
        # the softmax is applied later (inside the loss).
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False)

    def forward(self, enc_inputs, dec_inputs):
        """Run the full model.

        Args:
            enc_inputs: ``[batch_size, src_len]`` source token ids.
            dec_inputs: ``[batch_size, tgt_len]`` target token ids.

        Returns:
            ``(logits, enc_self_attns, dec_self_attns, dec_enc_attns)``, where
            ``logits`` is flattened to ``[batch_size * tgt_len, tgt_vocab_size]``
            and the other three are per-layer lists of attention tensors.
        """
        # enc_self_attns holds the softmax(QK^T) weights of every encoder layer.
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        # dec_self_attns: decoder tokens attending to decoder tokens;
        # dec_enc_attns: decoder tokens attending to encoder tokens.
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        # Project to the vocabulary: [batch_size, tgt_len, tgt_vocab_size]
        dec_logits = self.projection(dec_outputs)
        return dec_logits.view(-1, dec_logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns
if __name__ == '__main__':
    # Encoder input, decoder input and decoder target.
    sentences = ['ich mochte ein bier P', 'S i want a beer', 'i want a beer E']

    # Vocabularies. The padding token must have id 0, because
    # get_attn_pad_mask treats id 0 as PAD.
    src_vocab = {'P': 0, 'ich': 1, 'mochte': 2, 'ein': 3, 'bier': 4}
    src_vocab_size = len(src_vocab)
    tgt_vocab = {'P': 0, 'i': 1, 'want': 2, 'a': 3, 'beer': 4, 'S': 5, 'E': 6}
    tgt_vocab_size = len(tgt_vocab)
    src_len = 5  # length of source
    tgt_len = 5  # length of target

    # Model hyper-parameters.
    d_model = 512  # Embedding size
    d_ff = 2048  # Feed-forward dimension
    d_k = d_v = 64  # Dimension of K (= Q) and V per head
    n_layers = 6  # Number of encoder / decoder layers
    n_heads = 8  # Number of heads in multi-head attention

    model = Transformer()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    enc_inputs, dec_inputs, target_batch = make_batch(sentences)

    for epoch in range(200):
        optimizer.zero_grad()
        outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
        # outputs: [batch_size * tgt_len, tgt_vocab_size]; the targets are
        # flattened to match.
        loss = criterion(outputs, target_batch.contiguous().view(-1))
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))
        loss.backward()
        optimizer.step()
【参考:Pytorch Transformers from Scratch - YouTube】
配套代码:【参考:Machine-Learning-Collection/transformer_from_scratch.py at master · aladdinpersson/Machine-Learning-Collection】