1. Loading the data
We use the sentence polarity (sentiment) dataset provided by NLTK.
import math

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from collections import defaultdict
# tqdm displays the progress of an iteration as a progress bar
from tqdm.auto import tqdm
def load_sentence_polarity():
    from nltk.corpus import sentence_polarity

    vocab = Vocab.build(sentence_polarity.sents())
    # Take 4000 sentences each from the positive (label 0) and negative (label 1) categories
    # as the training set; each sample is a tuple of (list of token indices, label)
    train_data = [(vocab.convert_tokens_to_ids(sentence), 0)
                  for sentence in sentence_polarity.sents(categories='pos')[:4000]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1)
           for sentence in sentence_polarity.sents(categories='neg')[:4000]]
    # The remaining sentences form the test set
    test_data = [(vocab.convert_tokens_to_ids(sentence), 0)
                 for sentence in sentence_polarity.sents(categories='pos')[4000:]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1)
           for sentence in sentence_polarity.sents(categories='neg')[4000:]]
    return train_data, test_data, vocab
class BowDataset(Dataset):
    # A thin Dataset wrapper around the list of (token-id list, label) samples
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]
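The sentence_polarity corpus must be available locally before load_sentence_polarity() is called. A quick sanity check (the nltk.download call is only needed the first time):

import nltk
nltk.download('sentence_polarity')   # fetch the corpus on first use
from nltk.corpus import sentence_polarity
print(len(sentence_polarity.sents(categories='pos')))   # 5331 positive snippets
print(len(sentence_polarity.sents(categories='neg')))   # 5331 negative snippets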
2. Building the vocabulary: mapping between tokens and their indices
# Defined in Section 4.6.1
from collections import defaultdict, Counter
class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()
        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx['<unk>']

    @classmethod  # note: the classmethod decorator lets build() be called on the class itself, e.g. Vocab.build(...)
    def build(cls, text, min_freq=1, reserved_tokens=None):
        token_freqs = defaultdict(int)  # maps each token to its number of occurrences
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        # size of the vocabulary, i.e. the number of distinct tokens
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # look up the index of a token; unknown tokens map to the index of <unk>, which is 0
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        # look up the indices of a sequence of tokens; self[token] calls __getitem__ above
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        # look up the tokens corresponding to a sequence of indices
        return [self.idx_to_token[index] for index in indices]
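A small usage sketch of the Vocab class (the toy sentences below are made up for illustration): build a vocabulary from tokenized sentences, then convert tokens to indices and back. Any token not in the vocabulary maps to index 0, i.e. <unk>.

demo_sents = [["i", "love", "this", "movie"], ["boring", "movie"]]
demo_vocab = Vocab.build(demo_sents)
demo_ids = demo_vocab.convert_tokens_to_ids(["love", "movie", "never-seen"])
print(demo_ids)                                        # [2, 4, 0] -- the unseen token maps to <unk>
print(demo_vocab.convert_ids_to_tokens(demo_ids[:2]))  # ['love', 'movie']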
3. Data loading
# collate_fn1 assembles a batch of samples into padded tensors
def collate_fn1(examples):
    # length of each sequence in the batch
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    # pad the sequences in the batch so that they all have the same length
    inputs = pad_sequence(inputs, batch_first=True)
    return inputs, lengths, targets
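What collate_fn1 produces for a toy batch of (token-id list, label) pairs (illustrative values only): the shorter sequence is padded with index 0, and the original lengths are kept so the models can ignore the padding. The padding index 0 coincides with <unk>, which is harmless because the lengths/masks are what actually exclude padded positions.

demo_batch = [([1, 2, 3], 0), ([4, 5], 1)]
demo_inputs, demo_lengths, demo_targets = collate_fn1(demo_batch)
print(demo_inputs)    # tensor([[1, 2, 3], [4, 5, 0]])
print(demo_lengths)   # tensor([3, 2])
print(demo_targets)   # tensor([0, 1])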
# Hyperparameters
embedding_dim = 128
hidden_dim = 128
num_class = 2
batch_size = 32
num_epoch = 5

# Load the data
train_data, test_data, vocab = load_sentence_polarity()
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn1, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn1, shuffle=False)
4. Models
4.1 LSTM
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(LSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        embeddings = self.embeddings(inputs)
        # pack the variable-length sequences; pack_padded_sequence expects lengths on the CPU
        x_pack = pack_padded_sequence(embeddings, lengths.cpu(), batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.lstm(x_pack)
        # hn: [1, batch_size, hidden_dim]; outputs: [batch_size, num_class]
        outputs = self.output(hn[-1])
        log_probs = F.log_softmax(outputs, dim=-1)
        return log_probs
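A minimal shape check for the LSTM classifier on a toy batch (the vocabulary size and dimensions below are arbitrary); the output is one log-probability vector of size num_class per sequence.

demo_lstm = LSTM(vocab_size=100, embedding_dim=16, hidden_dim=16, num_class=2)
demo_inputs = torch.tensor([[1, 2, 3], [4, 5, 0]])   # [batch_size, seq_len], padded
demo_lengths = torch.tensor([3, 2])
print(demo_lstm(demo_inputs, demo_lengths).shape)    # torch.Size([2, 2])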
4.2 Transformer
# Build a mask matrix from the lengths of the sequences in a batch,
# so that the padded (invalid) positions of shorter sequences can be ignored
def length_to_mask(lengths):
    max_len = torch.max(lengths)
    mask = torch.arange(max_len, device=lengths.device).expand(lengths.shape[0], max_len) < lengths.unsqueeze(1)
    return mask
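On a toy input, length_to_mask returns True for valid positions and False for padded ones:

print(length_to_mask(torch.tensor([3, 1])))
# tensor([[ True,  True,  True],
#         [ True, False, False]])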
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions: sine
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions: cosine
        pe = pe.unsqueeze(0).transpose(0, 1)          # [max_len, 1, d_model]
        self.register_buffer('pe', pe)                # not a parameter, but saved with the model

    def forward(self, x):
        # x: [seq_len, batch_size, d_model]; add the encodings for the first seq_len positions
        x = x + self.pe[:x.size(0), :]
        return x
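The buffer pe stores the sinusoidal encodings pe[pos, 2i] = sin(pos / 10000^(2i/d_model)) and pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model)), shaped [max_len, 1, d_model] so it broadcasts over the batch dimension. A quick check with toy sizes:

demo_pe = PositionalEncoding(d_model=8, dropout=0.0, max_len=16)
print(demo_pe.pe.shape)    # torch.Size([16, 1, 8])
print(demo_pe.pe[0, 0])    # position 0: sin(0)=0 on even dims, cos(0)=1 on odd dims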
class Transformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class,
                 dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"):
        super(Transformer, self).__init__()
        # embedding layer
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)
        # encoder: a stack of Transformer encoder layers (hidden_dim serves as d_model)
        encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # output layer
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        # transpose so that the first dimension is sequence length and the second is batch size
        inputs = torch.transpose(inputs, 0, 1)
        hidden_states = self.embeddings(inputs)
        hidden_states = self.position_embedding(hidden_states)
        # in the key padding mask, True marks the padded positions that attention should ignore
        attention_mask = ~length_to_mask(lengths)
        # hidden_states: [seq_len, batch_size, hidden_dim]
        hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask)
        # use the representation of the first token as the sentence representation
        hidden_states = hidden_states[0, :, :]
        output = self.output(hidden_states)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs
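A minimal forward-pass sketch for the Transformer classifier with toy sizes (note that embedding_dim and hidden_dim must match, since hidden_dim is used as the d_model of the encoder layers, and num_head must divide it):

demo_model = Transformer(vocab_size=100, embedding_dim=16, hidden_dim=16, num_class=2, num_head=2)
demo_inputs = torch.tensor([[1, 2, 3], [4, 5, 0]])   # [batch_size, seq_len]
demo_lengths = torch.tensor([3, 2])
print(demo_model(demo_inputs, demo_lengths).shape)   # torch.Size([2, 2])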
5. Model training and evaluation
# Build the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)  # move the model to the GPU if CUDA is available

# Training loop
nll_loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # use the Adam optimizer
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
# Evaluation on the test set
model.eval()  # disable dropout during evaluation
acc = 0
for batch in tqdm(test_data_loader, desc="Testing"):
    inputs, lengths, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = model(inputs, lengths)
        acc += (output.argmax(dim=1) == targets).sum().item()
# report the accuracy on the test set (the test loader uses batch size 1)
print(f"Acc: {acc / len(test_data_loader):.2f}")