基于循环神经网络的模型可以使用更长的上下文,因此更适合序列标注问题。
此处以 NLTK 提供的宾州树库(Penn Treebank)样例数据为例,使用 LSTM 网络进行词性标注。
首先加载词性标注语料库
def load_treebank(train_size=3000):
    """Load the NLTK Penn Treebank sample as index-encoded POS-tagging data.

    Args:
        train_size: Number of leading sentences used as training data; all
            remaining sentences become the test data. Defaults to 3000.

    Returns:
        A tuple ``(train_data, test_data, vocab, tag_vocab)``.
        ``train_data`` and ``test_data`` are lists of
        ``(token_ids, tag_ids)`` pairs, one pair per sentence; ``vocab`` maps
        tokens to ids and ``tag_vocab`` maps POS tags to ids.
    """
    from nltk.corpus import treebank

    # Each tagged sentence is a sequence of (token, tag) pairs. The inner zip
    # splits one sentence into a token tuple and a tag tuple; the outer zip
    # gathers all token tuples into `sents` and all tag tuples into `postags`.
    sents, postags = zip(*(zip(*sent) for sent in treebank.tagged_sents()))

    # "" is reserved as the token used to pad sequences to a common length.
    # NOTE(review): the empty string looks like a markup-stripped "<pad>"
    # literal -- confirm against the Vocab implementation before changing it.
    vocab = Vocab.build(sents, reserved_tokens=[""])
    # The string POS tags are mapped to integer ids through their own vocabulary.
    tag_vocab = Vocab.build(postags)

    # Convert every sentence once, then split, instead of repeating the
    # conversion expression for the training and the test slice.
    examples = [
        (vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags))
        for sentence, tags in zip(sents, postags)
    ]
    train_data = examples[:train_size]
    test_data = examples[train_size:]
    return train_data, test_data, vocab, tag_vocab
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import defaultdict
from vocab import Vocab
from utils import load_treebank
#tqdm是一个Python模块,能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm
# Half-width of the uniform distribution used by init_weights: every model
# parameter is drawn from [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE].
WEIGHT_INIT_RANGE = 0.1
class LstmDataset(Dataset):
    """Thin ``Dataset`` wrapper around an in-memory sequence of examples."""

    def __init__(self, data):
        # Keep a reference to the examples; nothing is copied or transformed.
        self.data = data

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the example stored at position ``i``."""
        return self.data[i]
def collate_fn(examples):
    """Batch ``(token_ids, tag_ids)`` examples into padded tensors.

    Args:
        examples: Sequence of examples where ``ex[0]`` is the list of token
            ids and ``ex[1]`` the list of tag ids of one sentence.

    Returns:
        A tuple ``(inputs, lengths, targets, mask)``: the padded token ids,
        the unpadded length of every sequence, the padded tag ids, and a
        boolean mask that is True wherever ``inputs`` differs from the
        padding id.
    """
    # Use the GPU when one is available instead of failing on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Look the padding id up once; it is the id of the reserved "" token.
    pad_id = vocab[""]

    lengths = torch.tensor([len(ex[0]) for ex in examples]).to(device)
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # Every token has its own label, so the targets are sequences as well.
    targets = [torch.tensor(ex[1]) for ex in examples]

    # Pad inputs and targets to the length of the longest sequence in the batch.
    # The targets reuse the token padding id; padded positions are excluded by
    # the mask returned below.
    inputs = pad_sequence(inputs, batch_first=True, padding_value=pad_id).to(device)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_id).to(device)

    # The mask records which positions hold non-padding tokens.
    return inputs, lengths, targets, inputs != pad_id
def init_weights(model):
    """Re-initialise every parameter of ``model`` in place.

    Each parameter tensor is filled with samples from the uniform
    distribution over [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE].
    """
    low, high = -WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE
    fill_uniform = torch.nn.init.uniform_
    for weight in model.parameters():
        fill_uniform(weight, a=low, b=high)
#需要从nn.Module派生一个LSTM子类
class LSTM(nn.Module):
    """LSTM sequence tagger: embedding -> LSTM -> per-token linear classifier."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)
        # Overwrite the default initialisation of all parameters.
        init_weights(self)

    def forward(self, inputs, lengths):
        """Return per-token log-probabilities over the classes.

        Args:
            inputs: Batch-first tensor of padded token ids.
            lengths: Tensor with the unpadded length of every sequence.

        Returns:
            Log-softmax scores with one class distribution per position.
        """
        # Pack the variable-length sequences; the lengths must live on the CPU.
        packed = pack_padded_sequence(
            self.embeddings(inputs),
            lengths.to('cpu'),
            batch_first=True,
            enforce_sorted=False,
        )
        # The final hidden/cell states are not needed for sequence labelling
        # (text classification would use the last hidden state, hn, instead).
        packed_states, _ = self.lstm(packed)
        # Unpack back into padded, batch-first sequences: tagging needs the
        # hidden state of every position.
        states, _ = pad_packed_sequence(packed_states, batch_first=True)
        return F.log_softmax(self.output(states), dim=-1)
# Train and evaluate the LSTM part-of-speech tagger.

# Hyper-parameters
embedding_dim = 128
hidden_dim = 256
batch_size = 32
num_epoch = 5

# Load the data
train_data, test_data, vocab, pos_vocab = load_treebank()
train_dataset = LstmDataset(train_data)
test_dataset = LstmDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

num_class = len(pos_vocab)

# Build the model; use the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)

# Training
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        # The mask restricts the loss to real (non-padding) tokens.
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Testing
# Switch to evaluation mode so train-only layers behave deterministically.
model.eval()
acc = 0
total = 0
for batch in tqdm(test_data_loader, desc="Testing"):
    inputs, lengths, targets, mask = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, lengths)
        # The mask restricts both the correct-prediction count and the token
        # count to real (non-padding) tokens.
        acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
        total += mask.sum().item()

# Report token-level accuracy on the test set
print(f"Acc: {acc / total:.2f}")
输出结果
基于 Transformer 实现词性标注相当于将基于 Transformer 实现的情感分类与基于 LSTM 实现的词性标注相融合。其中,collate_fn 函数与 LSTM 词性标注中的相同。Transformer 层的实现与 Transformer 情感分类基本相同,只有在 forward 函数中需要取序列中每个输入对应的隐含层并计算概率,而不是第 1 个输入的隐含层(代表整个序列)。
import math
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from collections import defaultdict
from vocab import Vocab
from utils import load_treebank, length_to_mask
#tqdm是一个Python模块,能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm
class TransformerDataset(Dataset):
    """Thin ``Dataset`` wrapper around an in-memory sequence of examples."""

    def __init__(self, data):
        # Keep a reference to the examples; nothing is copied or transformed.
        self.data = data

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the example stored at position ``i``."""
        return self.data[i]
def collate_fn(examples):
    """Batch ``(token_ids, tag_ids)`` examples into padded tensors.

    Returns ``(inputs, lengths, targets, mask)``: the padded token ids, the
    unpadded length of every sequence, the padded tag ids, and a boolean mask
    that is True wherever ``inputs`` differs from the padding id.
    """
    # Id of the reserved "" token, used as the padding value throughout.
    pad_id = vocab[""]

    token_seqs, tag_seqs, seq_lens = [], [], []
    for example in examples:
        seq_lens.append(len(example[0]))
        token_seqs.append(torch.tensor(example[0]))
        tag_seqs.append(torch.tensor(example[1]))

    lengths = torch.tensor(seq_lens)
    # Pad every sequence in the batch to the length of the longest one.
    inputs = pad_sequence(token_seqs, batch_first=True, padding_value=pad_id)
    targets = pad_sequence(tag_seqs, batch_first=True, padding_value=pad_id)
    mask = inputs != pad_id
    return inputs, lengths, targets, mask
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    For position ``pos`` and pair index ``i`` the encoding is
    ``sin(pos / 10000^(2i / d_model))`` in column ``2i`` and
    ``cos(pos / 10000^(2i / d_model))`` in column ``2i + 1``.

    Args:
        d_model: Size of the feature dimension being encoded.
        dropout: Accepted for interface compatibility; no dropout is applied
            by this module.
        max_len: Largest sequence length for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One wavelength per sin/cos column pair: 10000^(2i / d_model).
        div_term = torch.pow(torch.tensor(10000.0), torch.arange(0, d_model, 2).float() / d_model)
        # The position is divided by the wavelength term; multiplying instead
        # would make the angle grow with the column index rather than shrink.
        angles = position / div_term

        pe = torch.zeros(max_len, d_model)
        # Even columns take the sine, odd columns the cosine of the same angle.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Reshape to (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer follows the module across devices but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions."""
        return x + self.pe[:x.size(0), :]
class Transformer(nn.Module):
    """Transformer-encoder sequence tagger.

    Pipeline: token embedding -> positional encoding -> Transformer encoder
    -> per-token linear classifier with log-softmax.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of the token embeddings.
        hidden_dim: Model dimension of the encoder; must equal
            ``embedding_dim`` because the embeddings are fed to the encoder
            without any projection.
        num_class: Number of output classes.
        dim_feedforward: Width of the encoder feed-forward sub-layer.
        num_head: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate passed to the encoder layers.
        max_len: Maximum sequence length passed to the positional encoding.
        activation: Activation name passed to the encoder layers.

    Raises:
        ValueError: If ``embedding_dim`` and ``hidden_dim`` differ.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class,
                 dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=512, activation: str = "relu"):
        super().__init__()
        # Fail early with a clear message instead of a shape error in forward().
        if embedding_dim != hidden_dim:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal hidden_dim ({hidden_dim})"
            )
        # Embedding layer
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)
        # Encoder: stacked Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # Output layer
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        """Return per-token log-probabilities over the classes.

        Args:
            inputs: Batch-first tensor of padded token ids.
            lengths: Tensor with the unpadded length of every sequence.

        Returns:
            Batch-first log-softmax scores, one class distribution per position.
        """
        # The encoder is built sequence-first, so swap the first two axes.
        inputs = torch.transpose(inputs, 0, 1)
        hidden_states = self.embeddings(inputs)
        hidden_states = self.position_embedding(hidden_states)
        # Build the mask on the same device as the inputs rather than on a
        # hard-coded CUDA device, so the model also runs on the CPU.
        # Presumably length_to_mask marks valid positions; it is inverted here
        # because src_key_padding_mask flags the positions to ignore.
        attention_mask = torch.logical_not(length_to_mask(lengths.to(inputs.device)))
        # The final transpose restores the batch-first layout.
        hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask).transpose(0, 1)
        # Score the hidden state of every position, not only the first one.
        logits = self.output(hidden_states)
        return F.log_softmax(logits, dim=-1)
# Train and evaluate the Transformer part-of-speech tagger.

# Hyper-parameters
embedding_dim = 128
hidden_dim = 128
batch_size = 32
num_epoch = 5

# Load the data
train_data, test_data, vocab, pos_vocab = load_treebank()
train_dataset = TransformerDataset(train_data)
test_dataset = TransformerDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

num_class = len(pos_vocab)

# Build the model; use the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)

# Training
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        # The mask restricts the loss to real (non-padding) tokens.
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Testing
# Switch to evaluation mode: the encoder layers contain dropout, which would
# otherwise stay active and randomly perturb the predictions being scored.
model.eval()
acc = 0
total = 0
for batch in tqdm(test_data_loader, desc="Testing"):
    inputs, lengths, targets, mask = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, lengths)
        # Count correct predictions and tokens over non-padding positions only.
        acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
        total += mask.sum().item()

# Report token-level accuracy on the test set
print(f"Acc: {acc / total:.2f}")