https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
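## This excerpt starts partway through the notebook above; its earlier cells provide the imports
## and globals used below. A minimal sketch of those definitions for reference -- the token ids
## match the notebook's Lang class, but MAX_LENGTH here is a placeholder value, and
## input_lang, output_lang, pairs, variables_from_pair and time_since come from the notebook's
## data-preparation cells and are not redefined here:
import random
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable

USE_CUDA = torch.cuda.is_available()
SOS_token = 0      ## start-of-sentence index
EOS_token = 1      ## end-of-sentence index
MAX_LENGTH = 10    ## placeholder; the notebook sets this when filtering sentence pairs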
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        ## input_size: source vocabulary size (the dimension of the one-hot input)
        ## hidden_size: dimension of the RNN hidden state
        ## n_layers: number of stacked RNN (GRU) layers
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)  ## one-hot -> dense embedding; the raw one-hot dimension is too large to use directly
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, word_inputs, hidden):
        # Note: we run this all at once (over the whole input sequence)
        seq_len = len(word_inputs)
        embedded = self.embedding(word_inputs).view(seq_len, 1, -1)  ## seq_len * 1 * hidden_size
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))  ## initial GRU hidden state
        if USE_CUDA: hidden = hidden.cuda()
        return hidden
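## A quick shape check for the encoder, on a hypothetical vocabulary and sentence
## (not notebook code; sizes are made up for illustration):
_demo_encoder = EncoderRNN(10, 8, n_layers=1)           ## vocab of 10 words, hidden_size 8
_demo_words = Variable(torch.LongTensor([3, 5, 7, 1]))  ## a 4-token "sentence" of word indices
if USE_CUDA:
    _demo_encoder.cuda()
    _demo_words = _demo_words.cuda()
_demo_hidden = _demo_encoder.init_hidden()              ## n_layers * 1 * hidden_size, here 1 * 1 * 8
_demo_outputs, _demo_hidden = _demo_encoder(_demo_words, _demo_hidden)
print(_demo_outputs.size(), _demo_hidden.size())        ## (4, 1, 8) and (1, 1, 8)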
class Attn(nn.Module):  ## attention mechanism
    def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
        super(Attn, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs):  ## 1 * hidden_size, seq_len * 1 * hidden_size
        seq_len = len(encoder_outputs)

        # Create variable to store attention energies
        attn_energies = Variable(torch.zeros(seq_len))  # shape: seq_len
        if USE_CUDA: attn_energies = attn_energies.cuda()

        # Calculate energies for each encoder output
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)  # shape: seq_len -> 1 * 1 * seq_len

    def score(self, hidden, encoder_output):  ## 1 * hidden_size, 1 * hidden_size
        hidden = hidden.squeeze(0)  # drop the leading batch dimension: 1 * hidden_size -> hidden_size
        encoder_output = encoder_output.squeeze(0)

        if self.method == 'dot':
            energy = hidden.dot(encoder_output)
            return energy
        elif self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.dot(energy)
            return energy
        elif self.method == 'concat':
            energy = self.attn(torch.cat((hidden, encoder_output), 1))
            energy = self.other.dot(energy)
            return energy
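## The three scoring methods follow Luong et al.'s dot / general / concat attention.
## A small standalone sketch of the 'dot' case, with made-up tensors (hidden_size 8, seq_len 5),
## showing that the softmax over per-position dot products yields weights summing to 1:
_h = torch.randn(1, 8)                  ## decoder hidden state, 1 * hidden_size
_enc = torch.randn(5, 1, 8)             ## encoder outputs, seq_len * 1 * hidden_size
_energies = torch.stack([_h.squeeze(0).dot(_enc[i].squeeze(0)) for i in range(5)])
_weights = F.softmax(_energies, dim=0)  ## seq_len attention weights
print(_weights.sum())                   ## ~1.0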
class AttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
        ## output_size: target vocabulary size (the dimension of the one-hot output)
        super(AttnDecoderRNN, self).__init__()

        # Keep parameters for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
        ## the GRU input concatenates the previously predicted word with the previous attention context (weighted encoder outputs)
        self.out = nn.Linear(hidden_size * 2, output_size)
        ## the fully-connected layer maps the result to a distribution over the target vocabulary

        # Choose attention model
        if attn_model != 'none':
            self.attn = Attn(attn_model, hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        # Note: we run this one step at a time

        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(word_input).view(1, 1, -1)  # S=1 x 1 x hidden_size

        # Combine embedded input word and last context, run through RNN
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)  ## concatenate along the feature dimension
        rnn_output, hidden = self.gru(rnn_input, last_hidden)

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
        attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)  ## 1 * 1 * seq_len
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # attention result, details in next line
        # 1 * 1 * seq_len bmm 1 * seq_len * hidden_size => 1 * 1 * hidden_size

        # Final output layer (next word prediction) using the RNN hidden state and context vector
        rnn_output = rnn_output.squeeze(0)  # S=1 x B x N -> B x N
        context = context.squeeze(1)        # size: 1 * hidden_size
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
        # 1 * (2*hidden_size) -> 1 * output_size

        # Return final output, hidden state, and attention weights (for visualization)
        return output, context, hidden, attn_weights
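## One decoding step on hypothetical tensors (hidden_size 8, target vocab of 10, seq_len 4);
## sizes and variable names are made up for illustration, not taken from the notebook:
_dec = AttnDecoderRNN('dot', hidden_size=8, output_size=10, n_layers=1)
_enc_outs = Variable(torch.randn(4, 1, 8))           ## pretend encoder outputs: seq_len * 1 * hidden_size
_dec_in = Variable(torch.LongTensor([[SOS_token]]))  ## first input is the start-of-sentence token
_dec_ctx = Variable(torch.zeros(1, 8))               ## initial context vector
_dec_hid = Variable(torch.zeros(1, 1, 8))            ## initial decoder hidden state
if USE_CUDA:
    _dec.cuda()
    _enc_outs, _dec_in = _enc_outs.cuda(), _dec_in.cuda()
    _dec_ctx, _dec_hid = _dec_ctx.cuda(), _dec_hid.cuda()
_out, _ctx, _hid, _attn = _dec(_dec_in, _dec_ctx, _dec_hid, _enc_outs)
print(_out.size(), _attn.size())                     ## (1, 10) and (1, 1, 4)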
teacher_forcing_ratio = 0.5
clip = 5.0

def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0  # Added onto for each word

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    # encoder_outputs size: seq_len * 1 * hidden_size
    # encoder_hidden size: n_layers * 1 * hidden_size

    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden  # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        # Teacher forcing: use the ground-truth target as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Next target is next input
    else:
        # Without teacher forcing: use the network's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])

            # Get most likely word index (highest value) from output
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]

            decoder_input = Variable(torch.LongTensor([[ni]]))  # Chosen word is next input
            if USE_CUDA: decoder_input = decoder_input.cuda()

            # Stop at end of sentence (not necessary when using known targets)
            if ni == EOS_token: break

    # Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data / target_length
attn_model = 'dot'
hidden_size = 500
n_layers = 2
dropout_p = 0.05

# Initialize models
encoder = EncoderRNN(input_lang.n_words, hidden_size, n_layers)
decoder = AttnDecoderRNN(attn_model, hidden_size, output_lang.n_words, n_layers, dropout_p=dropout_p)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
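## NLLLoss expects log-probabilities, which is why the decoder ends in F.log_softmax.
## A tiny standalone check with made-up numbers (a 5-word vocab, correct word at index 2):
_logp = F.log_softmax(torch.randn(1, 5), dim=1)  ## pretend decoder output
_target = torch.LongTensor([2])                  ## index of the correct word
print(nn.NLLLoss()(_logp, _target))              ## equals -_logp[0, 2]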
n_epochs = 50000
plot_every = 200
print_every = 1000

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0  # Reset every print_every
plot_loss_total = 0   # Reset every plot_every

# Begin!
for epoch in range(1, n_epochs + 1):
    # Get training data for this cycle
    training_pair = variables_from_pair(random.choice(pairs))
    input_variable = training_pair[0]
    target_variable = training_pair[1]

    # Run the train function
    loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss

    if epoch == 0: continue

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)

    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0
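## The averaged losses collected in plot_losses can then be visualized. A minimal sketch with
## matplotlib (the notebook has its own plotting cell for this; this is just the basic idea):
import matplotlib.pyplot as plt

plt.figure()
plt.plot(plot_losses)  ## one point per plot_every epochs
plt.xlabel('checkpoint (every %d epochs)' % plot_every)
plt.ylabel('average NLL loss per target word')
plt.show()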