pytorch 词性标注实战

基于循环神经网络的词性标注

基于循环神经网络的模型可以使用更长的上下文,因此更适合序列标注问题。

此处以 NLTK 提供的宾州树库(Penn Treebank)样例数据为例,使用 LSTM 网络进行词性标注。

首先加载词性标注语料库

def load_treebank():
    """Load the NLTK Penn Treebank sample and encode it for POS tagging.

    Returns:
        A tuple ``(train_data, test_data, vocab, tag_vocab)``. Each data list
        holds ``(token_ids, tag_ids)`` pairs; the first 3000 sentences form
        the training split and the remaining ones the test split.
    """
    from nltk.corpus import treebank

    # Unzip every tagged sentence into (tokens, tags), then unzip the corpus
    # into two parallel tuples: all token sequences and all tag sequences.
    sents, postags = zip(*(zip(*sent) for sent in treebank.tagged_sents()))

    # "" is reserved as the token used to pad sequences to a common length.
    vocab = Vocab.build(sents, reserved_tokens=[""])
    # Tags are strings as well, so they get their own vocabulary.
    tag_vocab = Vocab.build(postags)

    def encode(sentences, tag_seqs):
        # Convert parallel token / tag sequences into index pairs.
        encoded = []
        for sentence, tags in zip(sentences, tag_seqs):
            token_ids = vocab.convert_tokens_to_ids(sentence)
            tag_ids = tag_vocab.convert_tokens_to_ids(tags)
            encoded.append((token_ids, tag_ids))
        return encoded

    split = 3000
    train_data = encode(sents[:split], postags[:split])
    test_data = encode(sents[split:], postags[split:])

    return train_data, test_data, vocab, tag_vocab
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from collections import defaultdict
from vocab import Vocab
from utils import load_treebank

#tqdm是一个Python模块,能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm

# Half-width of the uniform interval used by init_weights:
# parameters are drawn from [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE].
WEIGHT_INIT_RANGE = 0.1

class LstmDataset(Dataset):
    """Dataset wrapper around an in-memory, indexable collection of examples."""
    def __init__(self, data):
        # Keep a reference to the caller's collection; nothing is copied.
        self.data = data

    def __len__(self):
        # Number of examples in the wrapped collection.
        return len(self.data)

    def __getitem__(self, i):
        # Return the i-th example unchanged; batching is left to collate_fn.
        return self.data[i]

def collate_fn(examples):
    """Collate ``(token_ids, tag_ids)`` examples into padded batch tensors.

    The tensors are created on the CPU. A collate function should not assume
    a CUDA device: it may run in DataLoader worker processes and on machines
    without a GPU. The training / test loops move each batch to the target
    device themselves.

    Args:
        examples: Sequence of ``(token_ids, tag_ids)`` pairs, where both
            members are equally long lists of integer indices.

    Returns:
        A tuple ``(inputs, lengths, targets, mask)``:
        ``inputs`` and ``targets`` are batch-first tensors padded to the
        longest sequence in the batch, ``lengths`` holds the unpadded length
        of every sequence, and ``mask`` is True at real-token positions.
    """
    pad_id = vocab[""]
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # Every token has its own label, so targets are sequences as well.
    targets = [torch.tensor(ex[1]) for ex in examples]
    # Pad inputs and targets alike; padded target positions are never scored
    # because callers index with the mask returned below.
    inputs = pad_sequence(inputs, batch_first=True, padding_value=pad_id)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_id)
    # The mask records which positions hold real tokens rather than padding.
    return inputs, lengths, targets, inputs != pad_id


def init_weights(model):
    """Re-initialise every parameter of ``model`` in place.

    Each parameter tensor is filled with samples from the uniform
    distribution on [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE].
    """
    low, high = -WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE
    for tensor in model.parameters():
        torch.nn.init.uniform_(tensor, a=low, b=high)

# The tagger is implemented as an nn.Module subclass.
class LSTM(nn.Module):
    """LSTM sequence tagger: embedding -> LSTM -> per-token linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_class: Number of output labels.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)
        # Replace the default initialisation of all parameters.
        init_weights(self)

    def forward(self, inputs, lengths):
        """Return per-token log-probabilities for a padded, batch-first batch.

        Args:
            inputs: Batch-first tensor of token indices.
            lengths: Unpadded length of every sequence in ``inputs``.

        Returns:
            Log-probabilities over the labels for every position, padded to
            the longest sequence in the batch.
        """
        embedded = self.embeddings(inputs)
        # Pack the variable-length sequences so the LSTM skips the padding;
        # the lengths must live on the CPU for packing.
        packed = pack_padded_sequence(embedded, lengths.to('cpu'), batch_first=True, enforce_sorted=False)
        packed_states, _ = self.lstm(packed)
        # Unpack back into a padded batch. Unlike text classification, which
        # only needs the final hidden state, tagging uses the state of every
        # time step.
        states, _ = pad_packed_sequence(packed_states, batch_first=True)
        return F.log_softmax(self.output(states), dim=-1)

# Hyper-parameters
embedding_dim = 128
hidden_dim = 256
batch_size = 32
num_epoch = 5

# Load the data
train_data, test_data, vocab, pos_vocab = load_treebank()
train_dataset = LstmDataset(train_data)
test_dataset = LstmDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

num_class = len(pos_vocab)

# Build the model; use the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)

# Training
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        # The mask restricts the loss to real tokens, leaving padding out
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Evaluation: switch to inference mode and disable gradient tracking
model.eval()
acc = 0
total = 0
with torch.no_grad():
    for batch in tqdm(test_data_loader, desc="Testing"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        output = model(inputs, lengths)
        # Count correct predictions and total tokens over real tokens only
        acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
        total += mask.sum().item()

# Report token-level accuracy on the test set
print(f"Acc: {acc / total:.2f}")

输出结果

基于Transformer的词性标注

基于 Transformer 实现词性标注相当于将基于 Transformer 实现的情感分类与基于 LSTM 实现的词性标注相融合。其中,collate_fn 函数与 LSTM 词性标注中的相同。Transformer 层的实现与 Transformer 情感分类基本相同,只有在 forward 函数中需要取序列中每个输入对应的隐含层并计算概率,而不是第1个输入的隐含层(代表整个序列)。

import math
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from collections import defaultdict
from vocab import Vocab
from utils import load_treebank, length_to_mask

#tqdm是一个Python模块,能以进度条的方式显示迭代的进度
from tqdm.auto import tqdm

class TransformerDataset(Dataset):
    """Dataset wrapper around an in-memory, indexable collection of examples."""
    def __init__(self, data):
        # Keep a reference to the caller's collection; nothing is copied.
        self.data = data
    def __len__(self):
        # Number of examples in the wrapped collection.
        return len(self.data)
    def __getitem__(self, i):
        # Return the i-th example unchanged; batching is left to collate_fn.
        return self.data[i]

def collate_fn(examples):
    """Collate ``(token_ids, tag_ids)`` examples into padded batch tensors.

    Args:
        examples: Sequence of ``(token_ids, tag_ids)`` pairs of integer lists.

    Returns:
        A tuple ``(inputs, lengths, targets, mask)``: batch-first padded token
        and tag tensors, the unpadded length of every sequence, and a boolean
        mask that is True wherever ``inputs`` differs from the padding index.
    """
    pad_id = vocab[""]
    seq_lens = []
    token_seqs = []
    tag_seqs = []
    for example in examples:
        seq_lens.append(len(example[0]))
        token_seqs.append(torch.tensor(example[0]))
        tag_seqs.append(torch.tensor(example[1]))
    lengths = torch.tensor(seq_lens)
    # Pad every sequence in the batch to the length of the longest one.
    inputs = pad_sequence(token_seqs, batch_first=True, padding_value=pad_id)
    targets = pad_sequence(tag_seqs, batch_first=True, padding_value=pad_id)
    mask = inputs != pad_id
    return inputs, lengths, targets, mask

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The table follows the standard formulation
    ``PE[pos, 2i] = sin(pos / 10000^(2i / d_model))`` and
    ``PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))``: the position is
    divided by the wavelength term, and each sin/cos pair shares one
    frequency.

    Args:
        d_model: Size of the last dimension of the tensors to be encoded.
        dropout: Accepted for interface compatibility; not applied here.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=512):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One inverse wavelength per sin/cos pair: 1 / 10000^(2i / d_model).
        inv_freq = 1.0 / torch.pow(torch.tensor(10000.0), torch.arange(0, d_model, 2).float() / d_model)
        angles = position * inv_freq
        # Even columns take the sine, odd columns the cosine of the same angle.
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Reshape to (max_len, 1, d_model) so it broadcasts over the batch axis.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer follows the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        ``x`` is indexed by position along dimension 0.
        """
        x = x + self.pe[:x.size(0), :]
        return x

class Transformer(nn.Module):
    """Transformer-encoder sequence tagger.

    Pipeline: token embedding -> positional encoding -> Transformer encoder
    -> per-token linear classifier -> log-softmax.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Model width handed to the encoder layers and the output
            layer.
        num_class: Number of output labels.
        dim_feedforward: Width of the feed-forward sub-layer.
        num_head: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate used inside the encoder layers.
        max_len: Number of positions precomputed by the positional encoding.
        activation: Activation name for the feed-forward sub-layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class,
                 dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=512, activation: str = "relu"):
        super(Transformer, self).__init__()
        # Embedding layer
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)
        # Encoder. NOTE(review): the encoder width is hidden_dim while the
        # embeddings have embedding_dim features, so the two must be equal.
        encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # Output layer
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        """Return per-token log-probabilities for a padded, batch-first batch.

        Args:
            inputs: Batch-first tensor of token indices.
            lengths: Unpadded length of every sequence in ``inputs``.

        Returns:
            Batch-first tensor of log-probabilities over the labels.
        """
        # The encoder is sequence-first, so swap the batch and time axes.
        inputs = torch.transpose(inputs, 0, 1)
        hidden_states = self.embeddings(inputs)
        hidden_states = self.position_embedding(hidden_states)
        # Build the padding mask on the same device as the inputs instead of
        # assuming CUDA; src_key_padding_mask must be True at padded positions,
        # hence the inversion of the validity mask.
        attention_mask = length_to_mask(lengths.to(inputs.device)) == False
        # The final transpose restores the batch-first layout.
        hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask).transpose(0, 1)
        # Score every position, not only the first one.
        logits = self.output(hidden_states)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs

# Hyper-parameters
embedding_dim = 128
hidden_dim = 128
batch_size = 32
num_epoch = 5

# Load the data
train_data, test_data, vocab, pos_vocab = load_treebank()
train_dataset = TransformerDataset(train_data)
test_dataset = TransformerDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

num_class = len(pos_vocab)

# Build the model; use the GPU when one is available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)

# Training
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        # The mask restricts the loss to real tokens, leaving padding out
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Evaluation: eval() turns dropout off so predictions are deterministic
model.eval()
acc = 0
total = 0
with torch.no_grad():
    for batch in tqdm(test_data_loader, desc="Testing"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        output = model(inputs, lengths)
        # Count correct predictions and total tokens over real tokens only
        acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
        total += mask.sum().item()

# Report token-level accuracy on the test set
print(f"Acc: {acc / total:.2f}")

你可能感兴趣的:(pytorch,深度学习,人工智能)