PyTorch: date format conversion with attention, with visualization of the attention weights
Environment: Python 3.6, PyTorch 1.0
import json
from matplotlib import ticker
from numpy import *
from collections import Counter
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
device(type='cuda')
Data preprocessing:
First count the characters at the character level, then build a dictionary mapping each character to an index, and finally build the reverse dictionary mapping indices back to characters.
def build_vocab(texts, n=None):
    counter = Counter(''.join(texts))  # character-level counts; Counter is a simple counter, e.g. for counting character occurrences
    char2index = {w: i for i, (w, c) in enumerate(counter.most_common(n), start=4)}  # (w, c) is (character, count); most_common() returns elements from most to least frequent
    char2index['~'] = 0  # pad: sequences shorter than the max length are padded with 0; the four special symbols here are why enumeration starts at 4
    char2index['^'] = 1  # sos: start of sentence
    char2index['$'] = 2  # eos: end of sentence
    char2index['#'] = 3  # unk: a character not found in the vocabulary
    index2char = {i: w for w, i in char2index.items()}
    return char2index, index2char
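As a quick sanity check of build_vocab, here is a minimal sketch on two toy strings (illustrative only; the regular-character indices depend on character frequencies in the input):

# Illustrative only: actual indices depend on character frequencies.
c2i, i2c = build_vocab(['ten pm', 'ten am'])
print(c2i['~'], c2i['^'], c2i['$'], c2i['#'])  # 0 1 2 3  (the special tokens)
print(i2c[c2i['t']])                           # 't'  (round trip through both dicts)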
Data download link: https://pan.baidu.com/s/132uS7mMzn7ISqEVg8i27eA
Extraction code: 36fu
pairs = json.load(open('./data/Time Dataset.json','rt',encoding='utf-8'))
print(pairs[:2])  # take a look at the data format
[['six hours and fifty five am', '06:55'], ['48 min before 10 a.m', '09:12']]
Split the source texts and target texts and build a vocabulary for each:
data = array(pairs)
src_texts = data[:, 0]  # all values in the first column
trg_texts = data[:, 1]  # all values in the second column
src_c2ix,src_ix2c = build_vocab(src_texts)
trg_c2ix,trg_ix2c = build_vocab(trg_texts)
Training will update the model in mini-batches, so next we define a function that generates a random batch. It converts the texts into their index representation and returns batch_size samples together with their lengths, sorted in descending order of length; padding is done up to the longest sequence in the batch. This layout is mainly to fit pack_padded_sequence: the pad tokens do not need to be fed through the RNN computation, and with packing the RNN only iterates over each sequence's true length.
def indexes_from_text(text, char2index):
    return [1] + [char2index[c] for c in text] + [2]  # wrap with sos (1) and eos (2)

def pad_seq(seq, max_length):
    seq += [0 for _ in range(max_length - len(seq))]
    return seq

# map(function, iterable, ...) calls function on every element of the iterable
# and returns the results; here it yields the length of each text.
max_src_len = max(list(map(len, src_texts))) + 2
max_trg_len = max(list(map(len, trg_texts))) + 2
max_src_len,max_trg_len
(43, 7)
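As a small illustration (the concrete indices depend on the vocabularies built above, so treat the printed values as placeholders), converting one target string and padding it looks like this:

# Illustrative sketch: the printed numbers depend on the built vocabularies.
idx = indexes_from_text('06:55', trg_c2ix)   # e.g. [1, ..., 2] -- sos ... eos
print(len(idx))                              # len('06:55') + 2 == 7
print(pad_seq(list(idx), 10))                # padded with trailing zeros to length 10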
def random_batch(batch_size, pairs, src_c2ix, trg_c2ix):
    input_seqs, target_seqs = [], []
    for i in random.choice(len(pairs), batch_size):  # sample batch_size random pair indices
        input_seqs.append(indexes_from_text(pairs[i][0], src_c2ix))
        target_seqs.append(indexes_from_text(pairs[i][1], trg_c2ix))
    # sort the pairs by the length of input_seqs, in descending order
    seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True)
    input_seqs, target_seqs = zip(*seq_pairs)  # the inverse of zip: unzip back into two sequences
    input_lengths = [len(s) for s in input_seqs]
    input_padded = [pad_seq(s, max(input_lengths)) for s in input_seqs]
    target_lengths = [len(s) for s in target_seqs]
    target_padded = [pad_seq(s, max(target_lengths)) for s in target_seqs]
    # torch.transpose(input, dim0, dim1) swaps dim0 and dim1, giving a matrix of shape seq_len * batch_size
    input_var = torch.LongTensor(input_padded).transpose(0, 1)
    target_var = torch.LongTensor(target_padded).transpose(0, 1)
    input_var = input_var.to(device)
    target_var = target_var.to(device)
    return input_var, input_lengths, target_var, target_lengths
"""
sort 与 sorted 区别:
sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
list 的 sort 方法返回的是对已经存在的列表进行操作,无返回值,而内建函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作
sorted(iterable[, cmp[, key[, reverse]]])
key -- 主要是用来进行比较的元素,只有一个参数,具体的函数的参数就是取自于可迭代对象中,指定可迭代对象中的一个元素来进行排序。
reverse -- 排序规则,reverse = True 降序 , reverse = False 升序(默认)。
"""测试batch_size = 3时是否能够正确输出
random_batch(3,data,src_c2ix,trg_c2ix)
(tensor([[ 1, 1, 1],
[ 6, 23, 6],
[ 5, 9, 18],
[ 8, 23, 23],
[ 4, 37, 9],
[ 7, 4, 26],
[33, 13, 23],
[22, 9, 2],
[30, 11, 0],
[ 7, 9, 0],
[22, 2, 0],
[34, 0, 0],
[ 4, 0, 0],
[ 6, 0, 0],
[31, 0, 0],
[ 5, 0, 0],
[ 8, 0, 0],
[ 6, 0, 0],
[20, 0, 0],
[ 4, 0, 0],
[13, 0, 0],
[ 9, 0, 0],
[11, 0, 0],
[ 9, 0, 0],
[ 2, 0, 0]], device='cuda:0'), [25, 11, 8], tensor([[ 1, 1, 1],
[ 6, 5, 7],
[ 5, 8, 8],
[ 4, 4, 4],
[ 7, 8, 5],
[ 5, 12, 8],
[ 2, 2, 2]], device='cuda:0'), [7, 7, 7])
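To double-check that the batch is built correctly, one can decode a column of input_var back into text with src_ix2c. A minimal sketch (the exact string depends on which pairs happen to be sampled):

# Decode the first sample of a random batch back into characters (illustrative).
inp, inp_lens, trg, trg_lens = random_batch(3, data, src_c2ix, trg_c2ix)
first = inp[:, 0].tolist()                  # column 0 is the longest sequence, so no padding
print(''.join(src_ix2c[i] for i in first))  # e.g. '^six hours and fifty five am$'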
The model:
The model is split into an encoder and a decoder. The encoder part is simple: an embedding layer followed by a two-layer GRU.
The batch formatting above was done mainly so that pack_padded_sequence and pad_packed_sequence can handle the GRU's batched inputs and outputs; see https://blog.csdn.net/lssc4205/article/details/79474735
and https://blog.csdn.net/u012436149/article/details/79749409
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        # input_dim = vocab_size + 1
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seqs, input_lengths, hidden=None):
        # src = [sent_len, batch_size]
        embedded = self.dropout(self.embedding(input_seqs))
        # embedded = [sent_len, batch_size, emb_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.rnn(packed, hidden)
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hidden
Without packing, the forward pass would simply be outputs, hidden = self.rnn(embedded, hidden). Either way the shapes are outputs = [sent_len, batch_size, hid_dim*n_directions] and hidden = [n_layers*n_directions, batch_size, hid_dim], and outputs always comes from the last GRU layer.
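Since packing is easy to get wrong, here is a minimal, self-contained sketch on toy tensors (not the model's data) showing that pack_padded_sequence expects sequences sorted by descending length and that pad_packed_sequence restores the padded layout:

# Toy demonstration of pack/pad round-tripping (illustrative, not part of the model).
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

emb = torch.randn(5, 2, 4)            # seq_len=5, batch=2, feature=4
lengths = [5, 3]                      # must be sorted in descending order
packed = pack_padded_sequence(emb, lengths)
rnn = torch.nn.GRU(4, 6)
out_packed, h = rnn(packed)
out, out_lengths = pad_packed_sequence(out_packed)
print(out.shape, out_lengths)         # torch.Size([5, 2, 6]) tensor([5, 3])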
First we define the attention layer. Here attention is applied to the encoder outputs, though it could also be applied directly to the embedding layer's outputs.
The paper Neural Machine Translation by Jointly Learning to Align and Translate defines how attention is computed.
The decoder output y_t depends on the decoder's previous output y_{t-1}, on s_t, and on c_t, where s_t is the current GRU hidden state (which already accounts for the previous outputs) and c_t is the attention context vector computed from the encoder outputs. The output distribution is g(y_{t-1}, s_t, c_t), where g is a fully connected layer with a nonlinear activation whose input is the concatenation of y_{t-1}, s_t, and c_t.
The context vector is a weighted sum of all encoder outputs, where α_ij is the weight that the i-th output character assigns to the j-th encoder output h_j.
Each α_ij is obtained by a softmax over all e_ij, and each e_ij is a score computed from the decoder's previous hidden state s_{i-1} and the corresponding encoder output h_j by a learned transformation.
In addition, the paper Effective Approaches to Attention-based Neural Machine Translation proposes several different ways of computing this score; the third one (the concat score) is used here.
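Written out in the notation of the Bahdanau et al. paper (a reconstruction, since the inline formulas did not survive formatting), the quantities above are:

c_i = \sum_j \alpha_{ij} h_j, \qquad
\alpha_{ij} = \frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}, \qquad
e_{ij} = v^\top \tanh\!\big(W\,[\,s_{i-1};\, h_j\,]\big)

The last expression is exactly what the score method below implements: a linear layer over the concatenation, a tanh, and a dot product with the learned vector v.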
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(self.hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))
        self.v.data.normal_(mean=0, std=1. / np.sqrt(self.v.size(0)))

    def forward(self, hidden, encoder_outputs):
        # encoder_outputs: (seq_len, batch_size, hidden_size)
        # hidden: (num_layers*num_directions, batch_size, hidden_size)
        max_len = encoder_outputs.size(0)
        h = hidden[-1].repeat(max_len, 1, 1)  # tile the last layer's hidden state max_len times along the sequence dimension
        # (seq_len, batch_size, hidden_size)
        attn_energies = self.score(h, encoder_outputs)  # compute the attention scores
        return F.softmax(attn_energies, dim=1)  # normalize with softmax over the sequence dimension

    def score(self, hidden, encoder_outputs):
        # (seq_len, batch_size, 2*hidden_size) -> (seq_len, batch_size, hidden_size)
        energy = F.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.permute(1, 2, 0)  # (batch_size, hidden_size, seq_len)
        # repeat: https://blog.csdn.net/xuxiatian/article/details/81167784
        v = self.v.repeat(encoder_outputs.size(1), 1).unsqueeze(1)  # (batch_size, 1, hidden_size)
        # batched matrix multiply; on torch.bmm vs torch.matmul see https://blog.csdn.net/laox1ao/article/details/79159303
        energy = torch.bmm(v, energy)  # (batch_size, 1, seq_len)
        return energy.squeeze(1)  # (batch_size, seq_len); squeeze: https://blog.csdn.net/u013444215/article/details/81941366
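A quick shape check of the attention module on random tensors (purely illustrative; the dimensions just follow the comments above):

# Shape sanity check with random tensors (illustrative only).
attn = Attention(hidden_dim=100).to(device)
enc_out = torch.randn(43, 4, 100).to(device)     # (seq_len, batch, hidden)
dec_hidden = torch.randn(2, 4, 100).to(device)   # (n_layers, batch, hidden)
w = attn(dec_hidden, enc_out)
print(w.shape, w.sum(dim=1))                     # torch.Size([4, 43]); each row sums to 1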
Next comes the decoder with the attention layer. The GRU output is passed through a fully connected layer and then log_softmax to produce the output-character log-probabilities, mainly to suit the NLLLoss loss function; with CrossEntropyLoss the softmax would not be needed. NLLLoss takes a vector of log-probabilities and a target label; it does not compute the log-probabilities itself, so it fits a network whose last layer is log_softmax. CrossEntropyLoss is similar to NLLLoss, the only difference being that it applies the softmax for us. In short: CrossEntropyLoss() = log_softmax() + NLLLoss().
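That equivalence is easy to verify on a random example (a minimal sketch, independent of the model):

# Verify CrossEntropyLoss == NLLLoss applied to log_softmax outputs (illustrative).
logits = torch.randn(3, 5)                  # 3 samples, 5 classes
targets = torch.tensor([1, 0, 4])
ce = nn.CrossEntropyLoss()(logits, targets)
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)
print(torch.allclose(ce, nll))              # True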
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers=2, dropout=0.2):
        super().__init__()
        self.output_dim = output_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.attention = Attention(hidden_dim)
        self.rnn = nn.GRU(embedding_dim + hidden_dim, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.out = nn.Linear(embedding_dim + hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input = [bsz]
        # hidden = [n_layers*n_directions, batch_size, hid_dim]
        # encoder_outputs = [sent_len, batch_size, hid_dim*n_directions]
        input = input.unsqueeze(0)
        # input = [1, bsz]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, bsz, emb_dim]
        attn_weight = self.attention(hidden, encoder_outputs)
        # attn_weight = (batch_size, seq_len)
        context = attn_weight.unsqueeze(1).bmm(encoder_outputs.transpose(0, 1)).transpose(0, 1)
        # (batch_size, 1, hidden_dim*n_directions) -> (1, batch_size, hidden_dim*n_directions)
        emb_con = torch.cat((embedded, context), dim=2)
        # emb_con = [1, bsz, emb_dim + hid_dim]
        _, hidden = self.rnn(emb_con, hidden)
        # hidden = [n_layers*n_directions, batch_size, hid_dim]
        output = torch.cat((embedded.squeeze(0), hidden[-1], context.squeeze(0)), dim=1)
        output = F.log_softmax(self.out(output), 1)
        # output = [batch_size, vocab_size]
        return output, hidden, attn_weight
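A single decoding step can likewise be shape-checked in isolation, again on random tensors (illustrative only; trg_c2ix comes from the preprocessing above):

# Single decoder step on random inputs (illustrative only).
dec = Decoder(len(trg_c2ix) + 1, embedding_dim=100, hidden_dim=100).to(device)
enc_out = torch.randn(43, 4, 100).to(device)     # fake encoder outputs (seq_len, batch, hidden)
dec_hidden = torch.randn(2, 4, 100).to(device)   # fake decoder hidden (n_layers, batch, hidden)
tok = torch.LongTensor([1, 1, 1, 1]).to(device)  # <sos> token for a batch of 4
out, new_hidden, attn_w = dec(tok, dec_hidden, enc_out)
print(out.shape, new_hidden.shape, attn_w.shape) # [4, vocab], [2, 4, 100], [4, 43]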
We define a Seq2Seq class that ties the encoder and decoder together. In a loop, the model generates the output sequence for each batch from left to right. During training, teacher forcing is used: at each step the next decoder input is randomly chosen to be either the ground-truth character or the model's own prediction. At test time this is not needed.
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, teacher_forcing_ratio=0.5):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, src_seqs, src_lengths, trg_seqs):
        # src_seqs = [sent_len, batch_size]
        # trg_seqs = [sent_len, batch_size]
        batch_size = src_seqs.shape[1]
        max_len = trg_seqs.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor that stores the decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        # hidden is used as the decoder's initial hidden state;
        # encoder_outputs are used to compute the context vector
        encoder_outputs, hidden = self.encoder(src_seqs, src_lengths)
        # the decoder's first input is the <sos> token
        output = trg_seqs[0, :]
        for t in range(1, max_len):  # skip the sos position
            output, hidden, _ = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < self.teacher_forcing_ratio
            output = (trg_seqs[t] if teacher_force else output.max(1)[1])
        return outputs

    def predict(self, src_seqs, src_lengths, max_trg_len=20, start_ix=1):
        max_src_len = src_seqs.shape[0]
        batch_size = src_seqs.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(max_trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src_seqs, src_lengths)
        # initialize one <sos> index per sample in the batch
        output = torch.LongTensor([start_ix] * batch_size).to(self.device)
        attn_weights = torch.zeros((max_trg_len, batch_size, max_src_len))
        for t in range(1, max_trg_len):
            output, hidden, attn_weight = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            output = output.max(1)[1]
            attn_weights[t] = attn_weight
        return outputs, attn_weights
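Before training, a quick check that the pieces fit together: run one small random batch through a freshly built model and look at the output shape (illustrative; the model is rebuilt below for the actual training run):

# One forward pass on a small random batch to confirm shapes (illustrative).
enc = Encoder(len(src_c2ix) + 1, 100, 100)
dec = Decoder(len(trg_c2ix) + 1, 100, 100)
m = Seq2Seq(enc, dec, device).to(device)
src, src_lens, trg, trg_lens = random_batch(4, pairs, src_c2ix, trg_c2ix)
out = m(src, src_lens, trg)
print(out.shape)   # [trg_len, 4, len(trg_c2ix) + 1]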
Training:
Train directly for 1000 randomly sampled batches.
import torch.optim as optim

embedding_dim = 100
hidden_dim = 100
batch_size = 256
clip = 5

encoder = Encoder(len(src_c2ix) + 1, embedding_dim, hidden_dim)
decoder = Decoder(len(trg_c2ix) + 1, embedding_dim, hidden_dim)
model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.NLLLoss(ignore_index=0).to(device)

model.train()
for batch_id in range(1, 1001):
    src_seqs, src_lengths, trg_seqs, _ = random_batch(batch_size, pairs, src_c2ix, trg_c2ix)
    optimizer.zero_grad()
    output = model(src_seqs, src_lengths, trg_seqs)
    loss = criterion(output.view(-1, output.shape[2]), trg_seqs.view(-1))
    loss.backward()
    # Gradient clipping: backpropagation can produce vanishing/exploding gradients,
    # so clip the gradient norm to a threshold before the update.
    # http://www.cnblogs.com/lindaxin/p/7998196.html
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()  # https://blog.csdn.net/gdymind/article/details/82708920
    if batch_id % 100 == 0:
        print('current loss:{:.4f}'.format(loss))

torch.save(model, 'model.pth')
current loss:0.8211
current loss:0.3182
current loss:0.2070
current loss:0.1032
current loss:0.0706
current loss:0.0345
current loss:0.0343
current loss:0.0215
current loss:0.0108
current loss:0.0169
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Seq2Seq. It won't be checked for correctness upon loading.
"type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Encoder. It won't be checked for correctness upon loading.
"type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Decoder. It won't be checked for correctness upon loading.
"type " + obj.__name__ + ". It won't be checked "
c:\users\administrator\appdata\local\programs\python\python36\lib\site-packages\torch\serialization.py:256: UserWarning: Couldn't retrieve source code for container of type Attention. It won't be checked for correctness upon loading.
"type " + obj.__name__ + ". It won't be checked "
Testing:
The main goal is to visualize the attention weights.
def show_attention(input_words, output_words, attentions):
    plt.rcParams['savefig.dpi'] = 300  # saved-figure resolution
    plt.rcParams['figure.dpi'] = 300   # display resolution
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions, cmap='bone')  # visualize the attention matrix
    fig.colorbar(cax)
    # set up the axes
    ax.set_xticklabels([''] + input_words)
    ax.set_yticklabels([''] + output_words)
    # show a label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator())
    ax.yaxis.set_major_locator(ticker.MultipleLocator())
    plt.show()
    plt.close()
def evaluate(model, text, src_c2ix, trg_ix2c):
    model.eval()
    with torch.no_grad():
        seq = torch.LongTensor(indexes_from_text(text, src_c2ix)).view(-1, 1).to(device)
        outputs, attn_weights = model.predict(seq, [seq.size(0)], max_trg_len)
        outputs = outputs.squeeze(1).cpu().numpy()
        attn_weights = attn_weights.squeeze(1).cpu().numpy()
        output_words = [trg_ix2c[np.argmax(word_prob)] for word_prob in outputs]
        show_attention(list('^' + text + '$'), output_words, attn_weights)
text = 'thirsty 1 before 3 clock affternoon'
evaluate(model,text,src_c2ix,trg_ix2c)
text = 'forty seven min before 10 p.m'
evaluate(model,text,src_c2ix,trg_ix2c)