# Reference: https://github.com/bentrevett/pytorch-seq2seq/blob/master/2%20-%20Learning%20Phrase%20Representations%20using%20RNN%20Encoder-Decoder%20for%20Statistical%20Machine%20Translation.ipynb
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import TranslationDataset,Multi30k
from torchtext.data import Field,BucketIterator
import spacy
import random
import math
import os
import time
# Fix every source of randomness used by this script so runs are repeatable.
SEED = 1
random.seed(SEED)
torch.manual_seed(SEED)
# The attribute is spelled "deterministic"; a misspelled name is silently
# accepted as a new attribute and leaves cuDNN non-deterministic.
torch.backends.cudnn.deterministic = True
# Load the spaCy pipelines used for tokenization: 'de' first, then 'en'.
spacy_de, spacy_en = (spacy.load(lang_code) for lang_code in ('de', 'en'))
def tokenize_de(text):
    """Split a German string into a list of token strings.

    :param text: raw sentence to tokenize
    :return: list with the ``.text`` of every token produced by ``spacy_de.tokenizer``
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]
def tokenize_en(text):
    """Split an English string into a list of token strings.

    :param text: raw sentence to tokenize
    :return: list with the ``.text`` of every token produced by ``spacy_en.tokenizer``
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]
# Source (German) and target (English) fields. Distinct, non-empty start and
# end markers are required: with empty strings both markers collapse into the
# same '' token, so the decoder cannot tell "start" from "end".
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

# Multi30k German -> English splits, parsed with the fields above.
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
print(vars(train_data.examples[0]))

# Vocabularies are built from the training split only, with min_freq=2.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
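'''
Part 1 of this NLP primer used a multi-layer LSTM; this part uses a single-layer GRU (gated recurrent unit).
nn.GRU: a GRU has no cell state, only a hidden state.
Inputs to nn.GRU.forward:
input and h_0
input shape [src sent length, batch size, emb dim]
h_0 shape [num_layers * num_directions, batch size, hid dim]
If h_0 is not provided it defaults to an all-zero tensor (this is the case for the encoder).
Outputs of the GRU:
output, hidden
output: the hidden state at every time step, shape [src sent length, batch size, num_directions * hid dim]
With a multi-layer GRU, output only contains the last layer's hidden states for all time steps (the same as for a multi-layer LSTM, whose output also only covers the last layer).
hidden: the hidden state at the last time step, shape [num_layers * num_directions, batch size, hid dim]
'''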
class Encoder(nn.Module):
    """Single-layer GRU encoder that turns a source sentence into a context vector.

    :param input_dim: size of the source vocabulary (number of embedding rows)
    :param emb_dim: dimensionality of the token embeddings
    :param hid_dim: dimensionality of the GRU hidden state
    :param dropout: dropout probability applied to the embeddings
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.GRU(emb_dim, hid_dim)

    def forward(self, src):
        """Encode a batch of source sentences.

        :param src: token indices into the source vocabulary,
            shape [src sent length, batch size]
        :return: final GRU hidden state,
            shape [n layers * n directions, batch size, hid dim]
        """
        # [src sent length, batch size, emb dim]
        embedded = self.dropout(self.embedding(src))
        # The per-step outputs are not needed; only the final hidden state
        # is handed to the decoder as the context vector.
        _, hidden = self.rnn(embedded)
        return hidden
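'''
Main changes in the decoder compared with part 1 of this NLP primer:
1. In part 1, the decoder LSTM's input at each time step depended only on the word predicted at the previous time step, or on the previous ground-truth label (teacher forcing).
In this part the improvement is:
the GRU's prediction at the current time step depends on the following values:
(1) the word y(t-1) predicted by the GRU at the previous time step, or the previous ground-truth label (also written y(t-1));
(2) the context vector produced by the encoder (note: it consists of the hidden state only).
The GRU input feature dimension is therefore emb dim + hid dim.
2. The word predicted at each time step (i.e. the input to the nn.Linear layer):
previously it depended only on the decoder's hidden state at the current time step (and not on the cell state at that step);
now it depends on the following three values:
(1) the hidden state output by the GRU at the current time step;
(2) the context vector produced by the encoder;
(3) the embedding of the word currently fed to the GRU.
The linear prediction layer's input feature dimension is therefore hid dim * 2 + emb dim.
'''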
class Decoder(nn.Module):
    """Single-layer GRU decoder that sees the encoder context at every step.

    The GRU input is the (dropped-out) token embedding concatenated with the
    context vector, and the prediction layer additionally receives the new
    hidden state.

    :param input_dim: size of the target vocabulary (embedding rows and
        number of output logits)
    :param emb_dim: dimensionality of the token embeddings
    :param hid_dim: dimensionality of the GRU hidden state and of the context
    :param dropout: dropout probability applied to the embeddings
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.GRU(hid_dim + emb_dim, hid_dim)
        self.output = nn.Linear(hid_dim * 2 + emb_dim, input_dim)

    def forward(self, trg, hidden, context):
        """Run one decoding step.

        :param trg: current input token indices, shape [batch size]
        :param hidden: previous decoder hidden state, shape
            [1, batch size, hid dim]; at the first step this is the context
        :param context: encoder context vector, shape [1, batch size, hid dim]
        :return: tuple ``(pred, hidden)`` where ``pred`` has shape
            [batch size, input dim] and ``hidden`` is the new hidden state
            to feed into the next step
        """
        # Add a length-1 time axis: [1, batch size]
        step_tokens = trg.unsqueeze(0)
        # [1, batch size, emb dim]
        embedded = self.dropout(self.embedding(step_tokens))
        # [1, batch size, emb dim + hid dim]
        rnn_input = torch.cat((embedded, context), dim=2)
        # [1, batch size, hid dim]; the per-step output is not used
        _, hidden = self.rnn(rnn_input, hidden)
        # [batch size, emb dim + hid dim * 2]
        features = torch.cat((rnn_input, hidden), dim=2).squeeze(0)
        pred = self.output(features)
        return pred, hidden
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    :param encoder: module mapping ``src`` to a context/hidden tensor
    :param decoder: module called as ``decoder(input, hidden, context)`` and
        returning ``(pred, hidden)``; must expose ``input_dim`` (target
        vocabulary size)
    :param device: device on which the output buffer is allocated
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teaching_force_rate=0.5):
        """Predict a score distribution for every target position.

        :param src: source token indices, shape [src sent length, batch size]
        :param trg: target token indices, shape [trg sent length, batch size]
        :param teaching_force_rate: probability of feeding the ground-truth
            token (instead of the model's own prediction) into the next step
        :return: tensor of shape [trg sent length, batch size, trg vocab size];
            row 0 is left as zeros because decoding starts at step 1
        """
        max_len = trg.shape[0]
        batch_size = src.shape[1]
        trg_voc_len = self.decoder.input_dim

        # context = [1, batch size, hid dim] for a single-layer,
        # unidirectional encoder; it also seeds the decoder hidden state.
        context = self.encoder(src)
        hidden = context

        outputs = torch.zeros(max_len, batch_size, trg_voc_len, device=self.device)

        # First decoder input is the first target row (the start-of-sentence
        # tokens). trg is 2-D, so it is indexed with a single row index.
        decoder_input = trg[0]
        for t in range(1, max_len):
            pred, hidden = self.decoder(decoder_input, hidden, context)
            # pred = [batch size, trg vocab size]
            outputs[t] = pred
            # Decide once per step (for the whole batch) whether to teacher-force.
            use_teacher_forcing = random.random() < teaching_force_rate
            top1 = pred.argmax(1)
            decoder_input = trg[t] if use_teacher_forcing else top1
        return outputs
# NOTE(review): the model/optimizer setup that originally preceded this line is
# missing from the file, yet `model`, `optimizer` and `pad_idx` are used below.
# The hyperparameter values here are assumed defaults - confirm before training.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

model = Seq2Seq(
    Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT),
    Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT),
    device,
).to(device)
optimizer = optim.Adam(model.parameters())

# Padding positions must not contribute to the loss.
pad_idx = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    :param model: callable as ``model(src, trg)`` returning scores of shape
        [trg sent len, batch size, output dim]
    :param iterator: iterable of batches exposing ``.src`` and ``.trg``;
        must support ``len()``
    :param optimizer: optimizer updating the model parameters
    :param criterion: loss taking ``(scores, targets)``
    :param clip: maximum gradient norm passed to ``clip_grad_norm_``
    :return: summed batch losses divided by ``len(iterator)``
    """
    model.train()
    total_loss = 0
    for batch in iterator:
        source, target = batch.src, batch.trg
        optimizer.zero_grad()
        scores = model(source, target)
        # Position 0 is skipped (it is never predicted); the remaining
        # positions are flattened so the criterion sees 2-D scores:
        # scores  -> [(trg sent len - 1) * batch size, output dim]
        # targets -> [(trg sent len - 1) * batch size]
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = target[1:].view(-1)
        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(iterator)
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    :param model: callable as ``model(src, trg, 0)`` returning scores of
        shape [trg sent len, batch size, output dim]
    :param iterator: iterable of batches exposing ``.src`` and ``.trg``;
        must support ``len()``
    :param criterion: loss taking ``(scores, targets)``
    :return: summed batch losses divided by ``len(iterator)``
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg
            # Third argument 0 turns teacher forcing off.
            scores = model(source, target, 0)
            # Drop position 0 and flatten:
            # scores  -> [(trg sent len - 1) * batch size, output dim]
            # targets -> [(trg sent len - 1) * batch size]
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = target[1:].view(-1)
            total_loss += criterion(flat_scores, flat_targets).item()
    return total_loss / len(iterator)
def epoch_time(start_time, end_time):
    """Convert a start/end timestamp pair into whole minutes and seconds.

    :param start_time: start timestamp in seconds
    :param end_time: end timestamp in seconds
    :return: tuple ``(minutes, seconds)``, both truncated to ints
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm used during training
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'tut2_model.pt')
best_valid_loss = float('inf')

# Create the checkpoint directory if needed; exist_ok avoids the separate
# (race-prone) existence check.
os.makedirs(SAVE_DIR, exist_ok=True)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
# The test/validation loss should be lower than in the first example.