下面是一个使用 torch.nn.Transformer 完成序列到序列(Sequence-to-Sequence)机器翻译任务的示例代码,包括数据加载、模型搭建和训练过程。
import torch
import torch.nn as nn
from torch.nn import Transformer
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
# 数据加载
def load_data():
    """Return a parallel corpus as ``(src_sentences, tgt_sentences)``.

    This is a stand-in for real data loading. It returns a tiny hard-coded
    toy corpus so the rest of the example can run end to end; replace the
    body with code that reads your own dataset.

    Returns:
        A tuple of two equally long lists. Each element is one sentence,
        given as a list of word tokens, and ``src_sentences[i]`` pairs with
        ``tgt_sentences[i]``.
    """
    # Source-language sentences (already split into tokens).
    src_sentences = [
        ["hello", "world"],
        ["good", "morning"],
        ["thank", "you"],
        ["how", "are", "you"],
    ]
    # Target-language sentences, aligned index-by-index with the sources.
    tgt_sentences = [
        ["hallo", "welt"],
        ["guten", "morgen"],
        ["danke"],
        ["wie", "geht", "es", "dir"],
    ]
    return src_sentences, tgt_sentences
def preprocess_data(src_sentences, tgt_sentences):
    """Preprocess the parallel corpus before the vocabularies are built.

    This is the hook for tokenisation, normalisation, filtering and similar
    steps. To keep the example short it performs no work and hands both
    inputs back unchanged.

    Args:
        src_sentences: Source-language sentences.
        tgt_sentences: Target-language sentences.

    Returns:
        The tuple ``(src_sentences, tgt_sentences)``, unmodified.
    """
    return src_sentences, tgt_sentences
def create_vocab(sentences):
    """Build word<->index lookup tables from an iterable of sentences.

    Index 0 is reserved for the padding token ``"<pad>"`` and index 1 for
    the unknown-word token ``"<unk>"``. Reserving 0 matters because batches
    are padded with 0; without it the first real word would share its index
    with padding. Real words receive consecutive indices in order of first
    appearance.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable
            tokens (iterating a plain string yields characters).

    Returns:
        A tuple ``(word2idx, idx2word)`` of mutually inverse dictionaries.
    """
    word2idx = {"<pad>": 0, "<unk>": 1}
    idx2word = {0: "<pad>", 1: "<unk>"}
    for sentence in sentences:
        for word in sentence:
            if word not in word2idx:
                index = len(word2idx)
                word2idx[word] = index
                idx2word[index] = word
    return word2idx, idx2word
def sentence_to_tensor(sentence, word2idx):
    """Convert a tokenised sentence into a 1-D tensor of vocabulary indices.

    Args:
        sentence: Iterable of tokens.
        word2idx: Mapping from token to integer index.

    Returns:
        A ``torch.long`` tensor with one index per token.

    Raises:
        KeyError: If a token is missing from ``word2idx`` and the mapping
            has no ``"<unk>"`` entry to fall back on.
    """
    unk_index = word2idx.get("<unk>")
    indices = []
    for word in sentence:
        index = word2idx.get(word, unk_index)
        if index is None:
            # No fallback available: keep the original failure mode.
            raise KeyError(word)
        indices.append(index)
    # Force an integer dtype: torch.tensor([]) would otherwise be float32,
    # which nn.Embedding does not accept as indices.
    return torch.tensor(indices, dtype=torch.long)
def collate_fn(batch, pad_index=0):
    """Pad the variable-length sequences of a batch to a common length.

    Two batch layouts are accepted:

    * a list of ``(src, tgt)`` pairs, which is what a ``DataLoader`` passes
      when the dataset is a list of such pairs. Sources and targets are
      padded independently and a pair ``(src_batch, tgt_batch)`` is
      returned;
    * a flat list of sequences, for which a single padded tensor is
      returned.

    Args:
        batch: List of samples in one of the layouts above. Sequences may
            be 1-D tensors or lists/tuples of integer indices.
        pad_index: Value appended to short sequences. Defaults to 0.

    Returns:
        A ``torch.long`` tensor of shape ``(batch, max_len)``, or a pair of
        such tensors for paired input.
    """

    def _pad(sequences):
        # Work on plain lists so tensors and lists are handled uniformly.
        rows = [
            seq.tolist() if isinstance(seq, torch.Tensor) else list(seq)
            for seq in sequences
        ]
        max_length = max((len(row) for row in rows), default=0)
        padded = [row + [pad_index] * (max_length - len(row)) for row in rows]
        return torch.tensor(padded, dtype=torch.long)

    sequence_types = (torch.Tensor, list, tuple)
    first = batch[0] if len(batch) > 0 else None
    is_paired = (
        isinstance(first, tuple)
        and len(first) == 2
        and all(isinstance(part, sequence_types) for part in first)
    )
    if is_paired:
        src_sequences, tgt_sequences = zip(*batch)
        return _pad(src_sequences), _pad(tgt_sequences)
    return _pad(batch)
# 模型定义
class TranslationModel(nn.Module):
    """Sequence-to-sequence translation model built on ``nn.Transformer``.

    Source and target tokens are embedded with separate tables (the two
    vocabularies are independent), passed through an encoder-decoder
    Transformer, and projected onto the target vocabulary.

    No positional encoding is applied, so the model cannot tell token
    positions apart; add one for any real use.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            size of the output projection.
        embedding_size: Embedding width, used as the Transformer
            ``d_model``. Must be divisible by ``num_heads``.
        hidden_size: Width of the feed-forward sub-layers
            (``dim_feedforward``).
        num_layers: Number of encoder layers and of decoder layers.
        num_heads: Number of attention heads.
        dropout: Dropout probability inside the Transformer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embedding_size, hidden_size, num_layers, num_heads, dropout):
        super().__init__()
        # `embedding` keeps its original name and embeds source tokens.
        self.embedding = nn.Embedding(src_vocab_size, embedding_size)
        # Target tokens need their own table: reusing the source table
        # fails as soon as a target index exceeds src_vocab_size.
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embedding_size)
        self.transformer = Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_size,
            dropout=dropout,
            # Inputs arrive as (batch, seq); the default is (seq, batch).
            batch_first=True,
        )
        self.fc = nn.Linear(embedding_size, tgt_vocab_size)

    def forward(self, src_sequence, tgt_sequence, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """Compute target-vocabulary logits for every decoder position.

        Args:
            src_sequence: Integer tensor of shape ``(batch, src_len)``.
            tgt_sequence: Integer tensor of shape ``(batch, tgt_len)``
                holding the decoder input tokens.
            src_key_padding_mask: Optional bool tensor ``(batch, src_len)``;
                ``True`` marks padded source positions to ignore.
            tgt_key_padding_mask: Optional bool tensor ``(batch, tgt_len)``;
                ``True`` marks padded target positions to ignore.

        Returns:
            Float tensor of shape ``(batch, tgt_len, tgt_vocab_size)``.
        """
        embedded_src = self.embedding(src_sequence)
        embedded_tgt = self.tgt_embedding(tgt_sequence)
        # Causal mask: position i may attend only to positions <= i, so the
        # decoder cannot look at the tokens it is being trained to predict.
        tgt_len = tgt_sequence.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt_sequence.device),
            diagonal=1,
        )
        output = self.transformer(
            embedded_src,
            embedded_tgt,
            tgt_mask=causal_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=src_key_padding_mask,
        )
        return self.fc(output)
# Hyperparameters
# Model architecture
embedding_size = 256    # embedding width, passed to the model as d_model
hidden_size = 512       # feed-forward width, passed as dim_feedforward
num_heads = 8
num_layers = 4          # used for both the encoder and the decoder stack
dropout = 0.2
src_vocab_size = 1_000
tgt_vocab_size = 2_000
# Optimisation
num_epochs = 10
batch_size = 32
learning_rate = 1e-3
# Load and preprocess the data
src_sentences, tgt_sentences = load_data()
src_sentences, tgt_sentences = preprocess_data(src_sentences, tgt_sentences)
# Wrap every target sentence in start/end markers. With teacher forcing the
# decoder is fed tokens [0, n-1) and predicts tokens [1, n), so without a
# start marker the first real word would never be predicted.
tgt_sentences = [["<sos>"] + list(sentence) + ["<eos>"] for sentence in tgt_sentences]
src_word2idx, src_idx2word = create_vocab(src_sentences)
tgt_word2idx, tgt_idx2word = create_vocab(tgt_sentences)
# The embedding tables must cover every index the vocabularies can produce;
# grow the configured sizes when the data needs more than was configured.
src_vocab_size = max(src_vocab_size, len(src_word2idx))
tgt_vocab_size = max(tgt_vocab_size, len(tgt_word2idx))
# Convert the sentences to index tensors
src_tensor = [sentence_to_tensor(sentence, src_word2idx) for sentence in src_sentences]
tgt_tensor = [sentence_to_tensor(sentence, tgt_word2idx) for sentence in tgt_sentences]
# Build the data loader; each sample is a (source, target) pair of tensors
dataset = list(zip(src_tensor, tgt_tensor))
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
# Create the model instance
model = TranslationModel(src_vocab_size, tgt_vocab_size, embedding_size, hidden_size, num_layers, num_heads, dropout)
# Define the loss function and the optimizer.
# Batches are padded with index 0, so targets equal to 0 are excluded from
# the loss; otherwise the model is also trained to predict padding.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = Adam(model.parameters(), lr=learning_rate)
# Train the model
# Each batch is expected to be a (src, tgt) pair of padded integer tensors
# of shape (batch, seq_len).
model.train()  # make sure dropout is active
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    for src_batch, tgt_batch in dataloader:
        if tgt_batch.size(1) < 2:
            # A one-token target leaves nothing to predict after shifting.
            continue
        # Teacher forcing: the decoder reads the target without its last
        # token and is trained to predict the target without its first.
        decoder_inputs = tgt_batch[:, :-1]
        labels = tgt_batch[:, 1:]
        optimizer.zero_grad()
        output = model(src_batch, decoder_inputs)
        # reshape (not view): the shifted slices are not contiguous.
        loss = criterion(output.reshape(-1, output.size(-1)), labels.reshape(-1))
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1)  # guard against exploding gradients
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Avoid dividing by zero when no batch was processed.
    average_loss = total_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}")
# Once training has finished, the model can be used for inference and translation
上述代码是一个基本的序列到序列机器翻译任务的示例,其中使用 torch.nn.Transformer 作为模型架构。首先,我们加载数据并进行预处理,然后为源语言和目标语言建立词汇表。接下来,我们创建一个自定义的 TranslationModel 类,该类使用 Transformer 模型进行翻译。在训练过程中,我们使用交叉熵损失函数和 Adam 优化器进行模型训练。代码中使用的 collate_fn 函数对句子进行填充,确保每个批次的句子长度一致。在每个训练周期中,我们计算损失并进行反向传播和参数更新。最后,打印每个训练周期的平均损失。
请注意,在实际应用中,还需要根据任务需求进行更多的定制和调整,例如加入位置编码、使用更复杂的编码器或解码器模型等。此示例可以作为使用 torch.nn.Transformer 进行序列到序列机器翻译任务的起点。