As globalization deepens, the demand for translation keeps growing. Traditional human translation delivers high quality but is slow and costly, and machine translation offers a way to address this. English-to-Chinese machine translation is an important branch of the machine translation field, aiming to automatically translate English text into Chinese. This post builds on the Seq2seq English-to-Chinese translation task from Chapter 9 of 《PyTorch自然语言处理入门与实战》 and adds a prediction (inference) module for the model.
A detailed walkthrough of the training and validation modules can be found in the companion post "PyTorch实战:基于Seq2seq模型处理机器翻译任务(模型训练及验证)".
Loading en2id and zh2id
At prediction time, we need to load the dictionary objects en2id and zh2id that were saved during the training and validation stage. The code is as follows:
import pickle

with open("en2id.pkl", 'rb') as f:
    en2id = pickle.load(f)
with open("zh2id.pkl", 'rb') as f:
    zh2id = pickle.load(f)
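For context, en2id and zh2id map tokens to integer ids, with id 0 used as the start token and id 1 as the end token in the prediction code further below. As a rough illustration, here is a minimal sketch of how such a dictionary might have been saved at the end of training; the toy vocabulary is hypothetical, the real dictionaries come from the training post:

import pickle

# Hypothetical toy vocabulary; the real en2id/zh2id are built from the training corpus.
en_vocab = ["<sos>", "<eos>", "i", "am", "dave", "gallo"]
en2id = {word: idx for idx, word in enumerate(en_vocab)}  # id 0 = start token, id 1 = end token

with open("en2id.pkl", 'wb') as f:
    pickle.dump(en2id, f)  # the prediction script reloads this file with pickle.load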
Before running prediction on an input text, the text first needs to be tokenized. Reference code is given below:
def extract_words(sentence):
    """
    Extract the words from an English sentence and strip the punctuation attached to them.

    Args:
        sentence (str): the English sentence to tokenize.

    Returns:
        List[str]: the extracted, normalized word list.
    """
    en_words = []
    for w in sentence.split(' '):  # split the English sentence on spaces
        w = w.replace('.', '').replace(',', '')  # strip punctuation attached to the word
        w = w.lower()  # lowercase every word
        if w:
            en_words.append(w)
    return en_words


# Quick test of the function
sentence = 'I am Dave Gallo.'
print(extract_words(sentence))
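For the test sentence above, the function prints ['i', 'am', 'dave', 'gallo']: the trailing period is removed and every word is lowercased so that it can be looked up in en2id.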
The following code defines the Encoder, Decoder, and Seq2Seq classes used at prediction time and loads the trained weights:
import torch
import torch.nn as nn


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)  # word embedding layer
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = (src len, batch size)
        embedded = self.dropout(self.embedding(src))
        # embedded = (src len, batch size, emb dim)
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = (src len, batch size, hid dim * n directions)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # the RNN outputs always come from the top hidden layer
        return hidden, cell
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # Input shapes:
        # input = (batch size)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # The LSTM is unidirectional ==> n directions == 1, so:
        # hidden = (n layers, batch size, hid dim)
        # cell = (n layers, batch size, hid dim)
        input = input.unsqueeze(0)  # (batch size) --> (1, batch size)
        embedded = self.dropout(self.embedding(input))  # (1, batch size, emb dim)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # In general the LSTM outputs are shaped:
        # output = (seq len, batch size, hid dim * n directions)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # In the decoder seq len == 1 and n directions == 1, so in practice:
        # output = (1, batch size, hid dim)
        # hidden = (n layers, batch size, hid dim)
        # cell = (n layers, batch size, hid dim)
        prediction = self.fc_out(output.squeeze(0))
        # prediction = (batch size, output dim)
        return prediction, hidden, cell
class Seq2Seq(nn.Module):
    def __init__(self, input_word_count, output_word_count, encode_dim, decode_dim, hidden_dim, n_layers,
                 encode_dropout, decode_dropout, device):
        """
        :param input_word_count: size of the English vocabulary (34737)
        :param output_word_count: size of the Chinese vocabulary (4015)
        :param encode_dim: embedding dimension of the encoder
        :param decode_dim: embedding dimension of the decoder
        :param hidden_dim: hidden dimension of the LSTM
        :param n_layers: number of LSTM layers
        :param encode_dropout: dropout probability of the encoder
        :param decode_dropout: dropout probability of the decoder
        :param device: cuda / cpu
        """
        super().__init__()
        self.encoder = Encoder(input_word_count, encode_dim, hidden_dim, n_layers, encode_dropout)
        self.decoder = Decoder(output_word_count, decode_dim, hidden_dim, n_layers, decode_dropout)
        self.device = device

    def forward(self, src):
        # src = (src len, batch size)
        # The encoder's final hidden and cell states initialize the decoder
        hidden, cell = self.encoder(src)
        # The first input to the decoder is the start-of-sequence token
        input = src[0, :]  # row 0 of src, i.e. the start-token id of each sequence in the batch
        pred = [0]  # the first predicted token is the start-of-sequence token
        top1 = 0
        while top1 != 1 and len(pred) < 100:
            # Decoder inputs: the current token, plus the hidden and cell states
            # Decoder outputs: the prediction tensor and the new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            top1 = output.argmax(dim=1)  # (batch size, )
            pred.append(top1.item())
            input = top1
        return pred
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')  # use the GPU if available

# Instantiate the Seq2Seq model
source_word_count = 34737  # size of the English vocabulary
target_word_count = 4015  # size of the Chinese vocabulary
encode_dim = 256  # embedding dimension of the encoder
decode_dim = 256  # embedding dimension of the decoder
hidden_dim = 512  # hidden dimension of the LSTM
n_layers = 2  # number of LSTM layers
encode_dropout = 0.5  # dropout probability of the encoder
decode_dropout = 0.5  # dropout probability of the decoder
model = Seq2Seq(source_word_count, target_word_count, encode_dim, decode_dim, hidden_dim, n_layers, encode_dropout,
                decode_dropout, device).to(device)

# Load the trained weights (map_location keeps this working when the checkpoint
# was saved on a different device than the one in use now)
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()
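As a quick smoke test before moving on to the full script, the loaded model can be called on a dummy source sequence. This snippet is our own illustration, not part of the book's code; it assumes, as the prediction code does, that id 0 marks the start token and id 1 the end token:

# Hypothetical smoke test: three arbitrary in-vocabulary ids wrapped in the
# start (0) and end (1) token ids, shaped (src len, batch size = 1).
dummy_src = torch.LongTensor([0, 5, 17, 42, 1]).unsqueeze(-1).to(device)
with torch.no_grad():  # no gradients are needed at prediction time
    dummy_pred = model(dummy_src)
print(dummy_pred)  # a list of target ids, starting with 0 and ideally ending with 1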
Note: the prediction code is our own adaptation of the training and validation code, so it is not guaranteed to be fully correct; use it as a reference and adjust it as needed. The complete prediction script is as follows:
import torch
import torch.nn as nn
import pickle


class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)  # word embedding layer
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = (src len, batch size)
        embedded = self.dropout(self.embedding(src))
        # embedded = (src len, batch size, emb dim)
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = (src len, batch size, hid dim * n directions)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # the RNN outputs always come from the top hidden layer
        return hidden, cell
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # Input shapes:
        # input = (batch size)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # The LSTM is unidirectional ==> n directions == 1, so:
        # hidden = (n layers, batch size, hid dim)
        # cell = (n layers, batch size, hid dim)
        input = input.unsqueeze(0)  # (batch size) --> (1, batch size)
        embedded = self.dropout(self.embedding(input))  # (1, batch size, emb dim)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # In general the LSTM outputs are shaped:
        # output = (seq len, batch size, hid dim * n directions)
        # hidden = (n layers * n directions, batch size, hid dim)
        # cell = (n layers * n directions, batch size, hid dim)
        # In the decoder seq len == 1 and n directions == 1, so in practice:
        # output = (1, batch size, hid dim)
        # hidden = (n layers, batch size, hid dim)
        # cell = (n layers, batch size, hid dim)
        prediction = self.fc_out(output.squeeze(0))
        # prediction = (batch size, output dim)
        return prediction, hidden, cell
class Seq2Seq(nn.Module):
    def __init__(self, input_word_count, output_word_count, encode_dim, decode_dim, hidden_dim, n_layers,
                 encode_dropout, decode_dropout, device):
        """
        :param input_word_count: size of the English vocabulary (34737)
        :param output_word_count: size of the Chinese vocabulary (4015)
        :param encode_dim: embedding dimension of the encoder
        :param decode_dim: embedding dimension of the decoder
        :param hidden_dim: hidden dimension of the LSTM
        :param n_layers: number of LSTM layers
        :param encode_dropout: dropout probability of the encoder
        :param decode_dropout: dropout probability of the decoder
        :param device: cuda / cpu
        """
        super().__init__()
        self.encoder = Encoder(input_word_count, encode_dim, hidden_dim, n_layers, encode_dropout)
        self.decoder = Decoder(output_word_count, decode_dim, hidden_dim, n_layers, decode_dropout)
        self.device = device

    def forward(self, src):
        # src = (src len, batch size)
        # The encoder's final hidden and cell states initialize the decoder
        hidden, cell = self.encoder(src)
        # The first input to the decoder is the start-of-sequence token
        input = src[0, :]  # row 0 of src, i.e. the start-token id of each sequence in the batch
        pred = [0]  # the first predicted token is the start-of-sequence token
        top1 = 0
        while top1 != 1 and len(pred) < 100:
            # Decoder inputs: the current token, plus the hidden and cell states
            # Decoder outputs: the prediction tensor and the new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            top1 = output.argmax(dim=1)  # (batch size, )
            pred.append(top1.item())
            input = top1
        return pred
if __name__ == '__main__':
    sentence = 'I am Dave Gallo.'
    en_words = []
    for w in sentence.split(' '):  # split the English sentence on spaces
        # After splitting on spaces, some words still carry the punctuation marks "." and ","
        w = w.replace('.', '').replace(',', '')  # strip punctuation attached to the word
        w = w.lower()  # lowercase every word
        if w:
            en_words.append(w)
    print(en_words)

    with open("en2id.pkl", 'rb') as f:
        en2id = pickle.load(f)
    with open("zh2id.pkl", 'rb') as f:
        zh2id = pickle.load(f)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')  # use the GPU if available

    # Instantiate the Seq2Seq model
    source_word_count = 34737  # size of the English vocabulary
    target_word_count = 4015  # size of the Chinese vocabulary
    encode_dim = 256  # embedding dimension of the encoder
    decode_dim = 256  # embedding dimension of the decoder
    hidden_dim = 512  # hidden dimension of the LSTM
    n_layers = 2  # number of LSTM layers
    encode_dropout = 0.5  # dropout probability of the encoder
    decode_dropout = 0.5  # dropout probability of the decoder
    model = Seq2Seq(source_word_count, target_word_count, encode_dim, decode_dim, hidden_dim, n_layers, encode_dropout,
                    decode_dropout, device).to(device)
    model.load_state_dict(torch.load("best_model.pth", map_location=device))
    model.eval()
    src = [0]  # 0 --> id of the start-of-sequence token
    for i in range(len(en_words)):
        src.append(en2id[en_words[i]])
    src = src + [1]  # 1 --> id of the end-of-sequence token
    text_input = torch.LongTensor(src)
    text_input = text_input.unsqueeze(-1).to(device)  # (src len,) --> (src len, batch size = 1)
    text_output = model(text_input)
    print(text_output)

    # Invert zh2id to map the predicted ids back to Chinese tokens
    id2zh = dict()
    for k, v in zh2id.items():
        id2zh[v] = k
    text_output = [id2zh[index] for index in text_output]
    text_output = " ".join(text_output)
    print(text_output)
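For repeated use, the steps above can be wrapped into a small helper. The sketch below is our own refactoring suggestion, not code from the book: translate_sentence is a hypothetical name, it reuses the extract_words tokenizer from the earlier snippet together with the en2id, id2zh, model, and device objects created in the script above, and it skips out-of-vocabulary words, drops the start/end ids, and joins the Chinese tokens without spaces, which usually reads more naturally than the space-joined output printed above:

def translate_sentence(sentence, model, en2id, id2zh, device):
    """Translate one English sentence to Chinese with the trained Seq2Seq model."""
    words = extract_words(sentence)
    # Map tokens to ids, silently skipping out-of-vocabulary words to avoid a KeyError,
    # and wrap the sequence with the start (0) and end (1) token ids.
    src = [0] + [en2id[w] for w in words if w in en2id] + [1]
    text_input = torch.LongTensor(src).unsqueeze(-1).to(device)  # (src len, batch size = 1)
    with torch.no_grad():  # no gradients are needed at prediction time
        output_ids = model(text_input)
    # Drop the start/end ids and join the Chinese tokens without spaces.
    return "".join(id2zh[i] for i in output_ids if i not in (0, 1))


print(translate_sentence('I am Dave Gallo.', model, en2id, id2zh, device))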