Thanks to the 伯禹 learning platform. These notes record how to implement networks efficiently with PyTorch and build fluency with the PyTorch basics; they do not expand on the theoretical details.
Sequence to Sequence model: the Encoder-Decoder structure
The structure in brief: the encoder encodes the source sequence into hidden states, and the decoder takes the encoder's final state as its initial state when generating the target sequence.
Encoder: PyTorch implementation
import torch
import torch.nn as nn

class Seq2SeqEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super(Seq2SeqEncoder, self).__init__()
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        X = self.embedding(X)  # X shape: (batch_size, seq_len, embed_size)
        X = X.transpose(0, 1)  # RNN needs the first axis to be time
        # state = self.begin_state(X.shape[1], device=X.device)
        out, state = self.rnn(X)
        # out shape: (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell of the last
        # time step; each has shape (num_layers, batch_size, num_hiddens).
        return out, state
# Test: results are shown below
# Vocabulary of 10 tokens, embedding size 8, two LSTM layers, hidden size 16
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
print(output.shape, len(state), state[0].shape, state[1].shape)
# output
torch.Size([7, 4, 16]) 2 torch.Size([2, 4, 16]) torch.Size([2, 4, 16])
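As a side note on the commented-out begin_state call above: when no initial state is passed, nn.LSTM defaults to zeros, so feeding the explicit zero state produces the same output. Below is a minimal sketch reusing the encoder and X defined above (out2/state2 are names made up for this check, not from the original notes):
# Sketch: feed an explicit zero initial state to the encoder's LSTM.
# nn.LSTM expects (h_0, c_0) as a tuple, while begin_state returns a list.
emb = encoder.embedding(X).transpose(0, 1)   # (seq_len, batch_size, embed_size)
h0_c0 = encoder.begin_state(batch_size=X.shape[0], device=X.device)
out2, state2 = encoder.rnn(emb, tuple(h0_c0))
print(torch.allclose(out2, output))          # True: same result as the default zero state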
Decoder: PyTorch implementation
# The structure is similar to the encoder, with an extra dense layer to predict the output
class Seq2SeqDecoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super(Seq2SeqDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs):
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).transpose(0, 1)
        out, state = self.rnn(X, state)
        # Make batch the first dimension again to simplify loss computation.
        out = self.dense(out).transpose(0, 1)
        return out, state
# Test: the only difference is that the decoder is initialized with the encoder's state
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(X))
out, state = decoder(X, state)
print(out.shape, len(state), state[0].shape, state[1].shape)
# output
torch.Size([4, 7, 10]) 2 torch.Size([2, 4, 16]) torch.Size([2, 4, 16])
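To make the overall Encoder-Decoder structure explicit, the two modules can be chained into a single model. The wrapper below is only an illustrative sketch (not part of the original notes): it routes the encoder's final state into the decoder through init_state.
class EncoderDecoder(nn.Module):
    # Illustrative sketch: chain encoder and decoder into one forward pass.
    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X):
        enc_outputs = self.encoder(enc_X)
        dec_state = self.decoder.init_state(enc_outputs)
        return self.decoder(dec_X, dec_state)

model = EncoderDecoder(encoder, decoder)
out, state = model(X, X)
print(out.shape)  # expected: torch.Size([4, 7, 10])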
Masked loss function: PyTorch implementation
The input sequences are padded to a common length, so the loss on the padded positions must be masked out when computing the loss.
def SequenceMask(X, X_len, value=0):
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
    X[~mask] = value
    return X
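A quick sanity check of SequenceMask on a toy tensor (the input below is hypothetical): positions beyond each row's valid length are overwritten with the fill value.
X_demo = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(SequenceMask(X_demo, torch.tensor([1, 2])))
# tensor([[1, 0, 0],
#         [4, 5, 0]])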
# Note: a custom loss can be built by subclassing nn.CrossEntropyLoss directly
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # the sample weights shape should be (batch_size, seq_len)
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction = 'none'
        output = super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1, 2), label)
        return (output * weights).mean(dim=1)
# Test
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))
# output: with uniform (all-ones) logits over 10 classes, the per-position cross entropy is ln(10) ≈ 2.3026, so a valid length of 4 over 4 positions gives 2.3026, a valid length of 3 gives 3/4 × 2.3026 ≈ 1.7269, and the last sample has a valid length of 0, so its loss is 0, as expected
tensor([2.3026, 1.7269, 0.0000])
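Putting the three pieces together, a training step could look roughly like the sketch below. This is only a rough outline under assumed inputs (random toy batches, an Adam optimizer, and no <bos>/teacher-forcing shift of the target), not the course's training loop.
# Hypothetical toy batch; in practice src/tgt and the valid lengths come from the data loader.
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), lr=0.005)
src = torch.randint(0, 10, (4, 7))      # source token ids, (batch_size, seq_len)
tgt = torch.randint(0, 10, (4, 7))      # target token ids
tgt_vlen = torch.tensor([7, 5, 4, 2])   # valid (unpadded) lengths of the targets

enc_outputs = encoder(src)
dec_state = decoder.init_state(enc_outputs)
pred, _ = decoder(tgt, dec_state)       # (batch_size, seq_len, vocab_size)

l = loss(pred, tgt, tgt_vlen).sum()     # sum of the per-sequence masked losses
optimizer.zero_grad()
l.backward()
optimizer.step()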