NMT (Neural Machine Translation) models based on neural networks keep getting better. I still remember that back in my college days Google Translate was far from satisfactory; in recent years, since it switched to NMT, it can basically handle anything but specialized needs. The mainstream NMT architecture today is Seq2Seq + Attention, and this post implements a small English-to-Chinese translation system in PyTorch.
Download link for the dataset and full code: Baidu Netdisk, extraction code: sauz
The training data is a Chinese-English parallel corpus of roughly 14K sentence pairs, plus dev and test sets. The data is already cleaned and needs no special handling; the dataset is fairly small.
Preprocessing before building the model. First, read the parallel corpus and tokenize it, using nltk for English and pkuseg for Chinese:
import os
import nltk
import pkuseg
import torch

def load_file(path, tgt_add_bos=True):
    en = []
    cn = []
    seg = pkuseg.pkuseg()
    with open(path, 'r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            # At test time the target gets no BOS/EOS markers, for BLEU computation
            if tgt_add_bos:
                cn.append(["BOS"] + seg.cut(line[1]) + ["EOS"])
            else:
                cn.append(seg.cut(line[1]))
    return en, cn
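As a quick sanity check, loading the training file yields two aligned lists of token lists (the path here is an assumption about where the dataset is unpacked):

# Hypothetical usage sketch; the path depends on your dataset layout.
train_en, train_cn = load_file("data/train.txt")
# Both are lists of token lists; each sentence is wrapped in "BOS"/"EOS" markers.
print(len(train_en), len(train_cn))
print(train_en[0], train_cn[0])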
The remaining preprocessing steps:
Build the vocabulary: create the word2id and id2word mappings, and don't forget to include PAD and UNK.
Convert each sentence into the numeric id form the model can process, using the vocabulary.
Pad the sentences and group them into mini-batches.
The whole preprocessing pipeline is wrapped in DataProcessor; a sketch of the vocabulary helper comes first, followed by the class itself.
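build_tokenizer and the Tokenizer wrapper are not shown in this post (the full versions are in the downloadable code). A minimal sketch consistent with how DataProcessor calls them might look like this; the vocabulary cutoff via args.max_vocab_size and the PAD/UNK indices are assumptions:

from collections import Counter

UNK_IDX = 0
PAD_IDX = 1

def build_tokenizer(sents, args):
    # Count word frequencies and keep the most frequent ones (the cutoff is an assumption).
    word_count = Counter()
    for sent in sents:
        for word in sent:
            word_count[word] += 1
    most_common = word_count.most_common(getattr(args, "max_vocab_size", 50000))
    # Reserve fixed indices for UNK and PAD.
    word2idx = {"UNK": UNK_IDX, "PAD": PAD_IDX}
    for word, _ in most_common:
        word2idx[word] = len(word2idx)
    id2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, id2word, len(word2idx)

class Tokenizer(object):
    # Thin wrapper bundling the two mappings and the vocabulary size.
    def __init__(self, word2idx, id2word, vocab_size):
        self.word2idx = word2idx
        self.id2word = id2word
        self.vocab_size = vocab_size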
class DataProcessor(object):
    def __init__(self, args):
        cached_en_tokenizer = os.path.join(args.data_dir, "cached_{}".format("en_tokenizer"))
        cached_cn_tokenizer = os.path.join(args.data_dir, "cached_{}".format("cn_tokenizer"))

        if not os.path.exists(cached_en_tokenizer) or not os.path.exists(cached_cn_tokenizer):
            en_sents, cn_sents = load_file(os.path.join(args.data_dir, "train.txt"))
            en_word2idx, en_id2word, en_vocab_size = build_tokenizer(en_sents, args)
            cn_word2idx, cn_id2word, cn_vocab_size = build_tokenizer(cn_sents, args)
            torch.save([en_word2idx, en_id2word, en_vocab_size], cached_en_tokenizer)
            torch.save([cn_word2idx, cn_id2word, cn_vocab_size], cached_cn_tokenizer)
        else:
            en_word2idx, en_id2word, en_vocab_size = torch.load(cached_en_tokenizer)
            cn_word2idx, cn_id2word, cn_vocab_size = torch.load(cached_cn_tokenizer)

        self.en_tokenizer = Tokenizer(en_word2idx, en_id2word, en_vocab_size)
        self.cn_tokenizer = Tokenizer(cn_word2idx, cn_id2word, cn_vocab_size)

    def get_train_examples(self, args):
        return self._create_examples(os.path.join(args.data_dir, "train.txt"), "train", args)

    def get_dev_examples(self, args):
        return self._create_examples(os.path.join(args.data_dir, "dev.txt"), "dev", args)

    def _create_examples(self, path, set_type, args):
        en_sents, cn_sents = load_file(path)
        # Convert tokens to ids with the two vocabularies
        out_en_sents, out_cn_sents = tokenize2num(en_sents, cn_sents,
                                                  self.en_tokenizer.word2idx, self.cn_tokenizer.word2idx)
        # Split the indices into mini-batches, then pad each batch
        minibatches = getminibatches(len(out_en_sents), args.batch_size)
        all_examples = []
        for minibatch in minibatches:
            mb_en_sentences = [out_en_sents[i] for i in minibatch]
            mb_cn_sentences = [out_cn_sents[i] for i in minibatch]
            mb_x, mb_x_len = prepare_data(mb_en_sentences)
            mb_y, mb_y_len = prepare_data(mb_cn_sentences)
            all_examples.append((mb_x, mb_x_len, mb_y, mb_y_len))
        return all_examples
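getminibatches, prepare_data and tokenize2num are also left to the downloadable code. Assuming the call signatures used above, the first two could be sketched roughly as below; the shuffling and the PAD index of 1 are assumptions, and the real implementation presumably also sorts each batch by descending source length, which pack_padded_sequence in the Encoder requires:

import numpy as np

def getminibatches(n, batch_size, shuffle=True):
    # Return a list of index arrays, one per mini-batch.
    starts = np.arange(0, n, batch_size)
    if shuffle:
        np.random.shuffle(starts)
    return [np.arange(start, min(start + batch_size, n)) for start in starts]

def prepare_data(seqs, pad_idx=1):
    # Pad every id sequence in the batch to the length of the longest one.
    lengths = [len(seq) for seq in seqs]
    max_len = max(lengths)
    padded = np.full((len(seqs), max_len), pad_idx, dtype="int64")
    for i, seq in enumerate(seqs):
        padded[i, :len(seq)] = seq
    return padded, np.array(lengths, dtype="int64")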
Seq2Seq + Attention has proven to work particularly well for NMT. The whole model consists of an Encoder, an Attention module and a Decoder, wrapped together in an outer Seq2Seq module. The code is implemented in PyTorch.
The Encoder's main processing steps:
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Map the encoder output to the decoder's input size; * 2 because the GRU is bidirectional
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        embedded = self.dropout(self.embed(x))
        # Recent PyTorch versions check the ordering inside a batch: by default the
        # sequences must be sorted by length in descending order
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
        # hid: [2, batch, enc_hidden_size]
        packed_out, hid = self.rnn(packed_embedded)
        # out: [batch, seq, 2 * enc_hidden_size]
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True, total_length=max(lengths))
        # Concatenate the two directions of the final hidden state: [batch, 2 * enc_hidden_size]
        hid = torch.cat([hid[-2], hid[-1]], dim=1)
        # Project to the decoder's initial hidden state: [1, batch, dec_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        return out, hid
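A quick shape check (all sizes here are made up purely for illustration) confirms the two return values described in the comments:

# Illustrative only: vocabulary and hidden sizes are arbitrary.
enc = Encoder(vocab_size=1000, embed_size=64, enc_hidden_size=100, dec_hidden_size=100)
x = torch.randint(0, 1000, (4, 7))   # batch of 4 sentences, 7 tokens each
lengths = [7, 6, 5, 3]               # must be sorted in descending order
out, hid = enc(x, lengths)
print(out.shape)  # torch.Size([4, 7, 200])
print(hid.shape)  # torch.Size([1, 4, 100])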
The Attention layer's main processing steps:
class Attention(nn.Module):
    """Bilinear attention over the encoder outputs."""
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.liner_in = nn.Linear(2 * enc_hidden_size, dec_hidden_size)
        self.liner_out = nn.Linear(2 * enc_hidden_size + dec_hidden_size, dec_hidden_size)

    def forward(self, output, context, mask):
        # context: the encoder GRU hidden states, [batch, enc_seq, 2 * enc_hidden]
        # output:  the decoder GRU hidden states, [batch, dec_seq, dec_hidden]
        # mask:    [batch, dec_seq, enc_seq], created in the decoder
        batch_size = context.shape[0]
        enc_seq = context.shape[1]
        dec_seq = output.shape[1]

        # The score uses a bilinear form h * W * s
        context_in = self.liner_in(context.reshape(batch_size * enc_seq, -1).contiguous())
        context_in = context_in.view(batch_size, enc_seq, -1).contiguous()
        atten = torch.bmm(output, context_in.transpose(1, 2))
        # atten: [batch, dec_seq, enc_seq]
        # Fill masked positions with a large negative value so softmax sends them to ~0
        atten.data.masked_fill_(mask, -1e6)
        atten = F.softmax(atten, dim=2)

        # Weighted sum of the encoder states with the attention scores
        # context: [batch, dec_seq, 2 * enc_hidden]
        context = torch.bmm(atten, context)
        # Concatenate the attention context with the decoder output to fuse the information
        output = torch.cat((context, output), dim=2)
        # Final output: [batch, dec_seq, dec_hidden_size]
        output = torch.tanh(self.liner_out(output.view(batch_size * dec_seq, -1))).view(batch_size, dec_seq, -1)
        return output, atten
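In equation form, the scoring and mixing done above are (with $s_i$ an encoder state, $h_t$ a decoder state, and $W_{in}$, $W_{out}$ the two linear layers):

$$
e_{t,i} = h_t^\top (W_{in} s_i), \qquad
\alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_j \exp(e_{t,j})}, \qquad
c_t = \sum_i \alpha_{t,i}\, s_i, \qquad
\tilde{h}_t = \tanh\!\big(W_{out}\,[c_t; h_t]\big)
$$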
Note: in this post the attention and fusion steps are applied after the decoder GRU has produced all of its hidden states. This differs from the original paper https://arxiv.org/pdf/1409.0473.pdf, where the decoder attends step by step while decoding; that version of the implementation can be found in toy_translation.
The Decoder layer's main processing steps:
class Decoder(nn.Module):
    """GRU decoder with attention over the encoder outputs."""
    def __init__(self, vocab_size, embedded_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedded_size)
        self.atten = Attention(enc_hidden_size, dec_hidden_size)
        # The decoder GRU is not bidirectional
        self.rnn = nn.GRU(embedded_size, dec_hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # Length of the longest sentence on each side
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        # Batch size
        batch_size = len(x_len)
        # Positions beyond a sequence's own length become 0
        x_mask = (torch.arange(max_x_len.item())[None, :] < x_len[:, None]).float()  # [batch, max_x_len]
        y_mask = (torch.arange(max_y_len.item())[None, :] < y_len[:, None]).float()  # [batch, max_y_len]
        # y_mask[:, :, None] has size [batch, max_y_len, 1]
        # x_mask[:, None, :] has size [batch, 1, max_x_len]
        # Positions that must be masked are set to True
        mask = (1 - y_mask[:, :, None] * x_mask[:, None, :]) != 0
        # mask: [batch_size, max_y_len, max_x_len]
        return mask

    def forward(self, ctx, ctx_lengths, y, y_lengths, hid):
        '''
        :param ctx: encoder outputs, [batch, enc_seq, 2 * enc_hidden]
        :param ctx_lengths: lengths of the encoder input sentences
        :param y: decoder input token ids, [batch, dec_seq]
        :param y_lengths: lengths of the decoder input sentences
        :param hid: last hidden state produced by the encoder, [1, batch, dec_hidden]
        :return:
        '''
        y_embed = self.dropout(self.embed(y))
        # The target sentences are not guaranteed to be sorted by descending length here
        y_packed = nn.utils.rnn.pack_padded_sequence(y_embed, y_lengths, batch_first=True, enforce_sorted=False)
        # Use the encoder's final hidden state as the decoder's initial hidden state
        pack_output, hid = self.rnn(y_packed, hid)
        output_seq, _ = nn.utils.rnn.pad_packed_sequence(pack_output, batch_first=True, total_length=max(y_lengths))
        # Build the mask before applying attention
        mask = self.create_mask(ctx_lengths, y_lengths)
        # Attention over the encoder outputs
        output, atten = self.atten(output_seq, ctx, mask)
        # Project to vocab_size and take the log-softmax over the vocabulary
        output = F.log_softmax(self.out(output), dim=-1)
        return output, atten, hid
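The outer Seq2Seq module that ties Encoder and Decoder together is not shown in this post. A minimal sketch consistent with the constructors above (and with the beam_search method below, which lives on this class in the downloadable code) could look like:

class Seq2Seq(nn.Module):
    # Minimal wrapper sketch: the real class in the repo also carries beam_search.
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        # Encode the source, then decode the target with attention.
        encoder_out, hid = self.encoder(x, x_lengths)
        output, atten, hid = self.decoder(encoder_out, x_lengths, y, y_lengths, hid)
        return output, atten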
With the components assembled into the complete model, translations at inference time are generated with beam search:
def beam_search(self, x, x_lengths, y, EOS_id, topk=5, max_length=100):
    encoder_out, hid = self.encoder(x, x_lengths)
    BOS_id = y[0][0].item()
    hypotheses = [[BOS_id]]
    hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=y.device)
    completed_hypotheses = []
    t = 0
    while len(completed_hypotheses) < topk and t < max_length:
        t += 1
        hyp_num = len(hypotheses)
        # Expand the encoder outputs to match the number of live hypotheses
        exp_src_encodings = encoder_out.expand(hyp_num, encoder_out.shape[1], encoder_out.shape[2])
        exp_x_lengths = x_lengths.expand(hyp_num)
        exp_hid = hid.expand(hid.shape[0], hyp_num, hid.shape[2])
        output_t, atten_t, exp_hid = self.decoder(
            exp_src_encodings, exp_x_lengths,
            torch.tensor(hypotheses).long().to(y.device), torch.ones(hyp_num).long().to(y.device) * t,
            exp_hid
        )
        live_hyp_num = topk - len(completed_hypotheses)
        # Flatten to (hyp_num * vocab) so topk can be taken over all continuations at once
        contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand(hyp_num, output_t.shape[-1]) + output_t[:, -1, :].squeeze(1)).view(-1)
        top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)
        # For each of the k best continuations, recover which hypothesis it extends
        # and which word it appends (floor division keeps the indices integral)
        prev_hyp_ids = torch.div(top_cand_hyp_pos, output_t.shape[-1], rounding_mode='floor')
        hyp_word_ids = top_cand_hyp_pos % (output_t.shape[-1])

        new_hypotheses = []
        live_hyp_ids = []
        new_hyp_scores = []
        for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
            prev_hyp_id = prev_hyp_id.item()
            hyp_word_id = hyp_word_id.item()
            cand_new_hyp_score = cand_new_hyp_score.item()
            # Append the chosen word to the hypothesis it extends
            new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word_id]
            if hyp_word_id == EOS_id:
                # This hypothesis is finished
                completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                       score=cand_new_hyp_score))
            else:
                new_hypotheses.append(new_hyp_sent)
                live_hyp_ids.append(prev_hyp_id)
                new_hyp_scores.append(cand_new_hyp_score)
        if len(completed_hypotheses) == topk:
            break
        hypotheses = new_hypotheses
        hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=y.device)
    # If nothing reached EOS within max_length, fall back to the first live hypothesis
    if len(completed_hypotheses) == 0:
        completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                               score=hyp_scores[0].item()))
    completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
    return completed_hypotheses
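Hypothesis is not defined in the snippet above; in the downloadable code it is presumably a simple named tuple along the lines of the sketch below. The translate_sentence helper and the id-to-token conversion are illustrative, not the repo's exact API:

from collections import namedtuple

# Assumed container for a finished beam: the token ids (without BOS/EOS) and the log-prob score.
Hypothesis = namedtuple("Hypothesis", ["value", "score"])

# Hypothetical decoding call for one source sentence already converted to ids.
# model is the Seq2Seq instance, y_bos a [1, 1] tensor holding the BOS id
# (beam_search reads BOS from y[0][0]), cn_tokenizer the target-side Tokenizer.
def translate_sentence(model, x, x_lengths, y_bos, EOS_id, cn_tokenizer):
    model.eval()
    with torch.no_grad():
        hyps = model.beam_search(x, x_lengths, y_bos, EOS_id, topk=5, max_length=100)
    best = hyps[0]  # hypotheses come back sorted by score
    return "".join(cn_tokenizer.id2word[i] for i in best.value)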
The model is trained with Adam at an initial learning rate of 5e-4 with linear decay, for 10 epochs in total. A checkpoint is saved after every epoch and the one with the lowest eval loss is kept. The training loss curve is shown below:
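A sketch of that training setup is below; the LambdaLR-based linear schedule is an assumption about how the decay was implemented, and model, num_epochs and train_examples are placeholders:

import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

# Assumed setup matching the description: Adam at 5e-4 with a linear decay to 0.
optimizer = optim.Adam(model.parameters(), lr=5e-4)
num_training_steps = num_epochs * len(train_examples)  # one step per mini-batch
scheduler = LambdaLR(optimizer, lr_lambda=lambda step: max(0.0, 1.0 - step / num_training_steps))

# Inside the training loop, after loss.backward():
# optimizer.step(); scheduler.step(); optimizer.zero_grad()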
The best model is then scored on the test set with BLEU, computed with the corpus_bleu function from the nltk toolkit. Since each sentence has only one reference translation and the training data is very small, the final score is Corpus BLEU: 10.620235626863046.
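The BLEU computation with nltk looks roughly like this; corpus_bleu is the real nltk API (it expects a list of reference lists and a list of hypotheses), while test_cn and model_outputs are illustrative variable names:

from nltk.translate.bleu_score import corpus_bleu

# references: one reference translation per test sentence (tokenized, no BOS/EOS);
# hypotheses: the model's best beam for each sentence.
references = [[ref_tokens] for ref_tokens in test_cn]
hypotheses = [decoded_tokens for decoded_tokens in model_outputs]
bleu = corpus_bleu(references, hypotheses)
print("Corpus BLEU:", bleu)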
Let's try translating a few sentences to see how it does:
Seq2Seq + Attention remains a widely used approach to NMT, and given how little data this project uses, the results are acceptable. Next steps could include collecting more parallel corpora, trying back translation, and adding a coverage mechanism to further improve the system.