This post is based on:
Faster Sentiment Analysis–with torchtext
Some details may be changed slightly, and the code comments are entirely based on my own understanding. The post is meant as a record of my personal takeaways rather than a full translation of the tutorial, so it may not suit every beginner, but it can still serve as a reference.
Continuing from the previous post: Using torchtext – updated IMDB
Previously we trained the IMDB task with a vanilla RNN and with an LSTM. The results were good, but that is using a sledgehammer to crack a nut. For a simple sentiment-analysis task like IMDB there is really no need for a tool as expensive to train as an RNN: simply learning embeddings and averaging them over the words of each sentence already works very well.
To keep the number of parameters small, this experiment uses a bag-of-words representation (augmented with bigrams).
import torch
import torchtext
import torch.nn as nn
from torchtext import data
from torchtext import datasets
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import math
use_cuda=torch.cuda.is_available()
device=torch.device("cuda" if use_cuda else "cpu")
SEED=1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
Using zip() and zip(*), the bigram-generating function can be written in two ways (for details on zip() and zip(*), see the blog post python3 zip() 函数使用指南):
# Version 1: explicitly wrap the shifted lists in a tuple before unpacking
def generate_bigrams(x):
    n_grams = set(zip(*tuple([x[i:] for i in range(2)])))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x

# Version 2: unpack the list of shifted lists directly
def generate_bigrams(x):
    n_grams = set(zip(*[x[i:] for i in range(2)]))
    for n_gram in n_grams:
        x.append(' '.join(n_gram))
    return x
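A quick usage example (an illustrative call of my own; since the bigrams come out of a set, the order in which they get appended is not guaranteed):
generate_bigrams(['This', 'film', 'is', 'terrible'])
# -> ['This', 'film', 'is', 'terrible', 'This film', 'film is', 'is terrible']
#    (the original tokens, followed by the three bigrams in some order)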
The most basic job of a Field is to tokenize the dataset, but it also wraps many other preprocessing operations on the corpus. For example, include_lengths from the previous post counts the number of tokens in each sentence after tokenization, which is mainly needed for the pack/pad operations of a downstream RNN.
Here we no longer use an RNN, so we drop the include_lengths argument. Instead we use preprocessing, which lets the Field apply an extra processing step to the tokens after tokenization and before numericalization.
TEXT=data.Field(tokenize="spacy",tokenizer_language="en_core_web_sm",
preprocessing=generate_bigrams)
LABEL=data.LabelField(dtype=torch.float)
train_data, test_data =datasets.IMDB.splits(TEXT,LABEL)
train_data, valid_data=train_data.split(split_ratio=0.8)
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data,max_size=MAX_VOCAB_SIZE,vectors="glove.6B.100d",
unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
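Optionally, a couple of quick sanity checks on the vocab (my own addition, using the Vocab attributes of this torchtext version; the exact tokens you see may differ):
len(TEXT.vocab)      # 25002: the 25,000 most frequent tokens plus <unk> and <pad>
TEXT.vocab.itos[:5]  # e.g. ['<unk>', '<pad>', 'the', ',', '.']
LABEL.vocab.stoi     # e.g. {'neg': 0, 'pos': 1}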
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator=\
data.BucketIterator.splits((train_data,valid_data,test_data),
batch_size=BATCH_SIZE,device=device)
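Not part of the original post, but it can be worth peeking at one batch to confirm the shapes the model will receive (the field names text/label come from the IMDB dataset):
batch = next(iter(train_iterator))
batch.text.shape   # (seq_len, 64): sequence first, since the Field defaults to batch_first=False
batch.label.shape  # (64,)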
An extremely simple 2D pooling computes the average of all the token embeddings in a sentence:
class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(FastText, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text: torch.Tensor):
        # text: (seq, batch)
        embedded = self.embed(text)
        # embedded: (seq, batch, embedding_dim)
        embedded = embedded.permute(1, 0, 2)
        # embedded: (batch, seq, embedding_dim)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        # pooled: (batch, embedding_dim)
        return self.fc(pooled)  # (batch, output_dim)
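As a side note, the same average over the sequence dimension can also be written with .mean(), which some may find easier to read; this is just an alternative sketch on a random tensor, not what the tutorial uses:
x = torch.randn(64, 30, 100)                     # (batch, seq, embedding_dim)
a = F.avg_pool2d(x, (x.shape[1], 1)).squeeze(1)  # (batch, embedding_dim)
b = x.mean(dim=1)                                # same pooling, written with mean
torch.allclose(a, b)                             # True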
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model=FastText(INPUT_DIM,EMBEDDING_DIM,OUTPUT_DIM,PAD_IDX)
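One thing worth noting: the vocab above was built with glove.6B.100d vectors, but they are never copied into the model here. If you want the embedding layer to start from the pretrained vectors, as the original tutorial does, something like this should work (do it before zeroing the <unk>/<pad> rows below):
pretrained_embeddings = TEXT.vocab.vectors            # (vocab_size, 100)
model.embed.weight.data.copy_(pretrained_embeddings)  # initialize the embedding layer from GloVe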
model.embed.weight.data[PAD_IDX].shape  # check the dimensionality
torch.Size([100])
The vector of a single token is one-dimensional, so the all-zeros tensor only needs a single dimension as well:
UNK_IDX=TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[PAD_IDX]=torch.zeros(EMBEDDING_DIM)
model.embed.weight.data[UNK_IDX]=torch.zeros(EMBEDDING_DIM)
criterion=nn.BCEWithLogitsLoss()
# move the model and loss to the GPU (if available)
model=model.to(device)
criterion=criterion.to(device)
# No learning rate is passed in here, so Adam falls back to its default (1e-3) and adapts the step size per parameter internally
optimizer=optim.Adam(model.parameters())
Accuracy computation:
def binary_accuracy(preds: torch.Tensor, y: torch.Tensor):
    # preds: (batch)
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.sum((preds == y).float())
    return correct / len(preds)
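A quick check with toy logits and labels (made-up numbers, just to confirm the rounding logic):
logits = torch.tensor([2.0, -1.0, 0.5, -3.0])  # sigmoid -> ~[0.88, 0.27, 0.62, 0.05]
labels = torch.tensor([1.0, 0.0, 0.0, 0.0])
binary_accuracy(logits, labels)                # tensor(0.7500): 3 of 4 predictions match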
def train(model: nn.Module, iterator: data.BucketIterator,
          optimizer: optim.Adam, criterion: nn.BCEWithLogitsLoss):
    model.train()
    epoch_loss = 0.
    epoch_acc = 0.
    for batch in iterator:
        preds = model(batch.text).squeeze(1)  # (batch)
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_acc += acc.item()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model: nn.Module, iterator: data.BucketIterator,
             criterion: nn.BCEWithLogitsLoss):
    model.eval()
    epoch_loss = 0.
    epoch_acc = 0.
    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze(1)  # (batch)
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)
            epoch_acc += acc.item()
            epoch_loss += loss.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'FastText_IMDB.pth')
    print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')
Epoch: 01 | Epoch Time: 0m 9s
Train Loss: 0.685 | Train Acc: 60.20%
Val. Loss: 0.613 | Val. Acc: 68.95%
Epoch: 02 | Epoch Time: 0m 9s
Train Loss: 0.628 | Train Acc: 75.32%
Val. Loss: 0.457 | Val. Acc: 79.25%
Epoch: 03 | Epoch Time: 0m 8s
Train Loss: 0.532 | Train Acc: 82.34%
Val. Loss: 0.386 | Val. Acc: 83.56%
Epoch: 04 | Epoch Time: 0m 8s
Train Loss: 0.445 | Train Acc: 86.35%
Val. Loss: 0.364 | Val. Acc: 85.96%
Epoch: 05 | Epoch Time: 0m 8s
Train Loss: 0.384 | Train Acc: 88.46%
Val. Loss: 0.370 | Val. Acc: 87.03%
Finally, check the model on the test set:
model.load_state_dict(torch.load("FastText_IMDB.pth"))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
Test Loss: 0.384 | Test Acc: 85.20%
As you can see, training is much faster and the accuracy is still high. This is because sentiment analysis does not depend heavily on word order, so an RNN has no real advantage here. It is a fairly simple perception task: the sentiment of a sentence can largely be judged from a few key words, and averaging the embeddings is a good fit.
import spacy
nlp = spacy.load('en_core_web_sm')
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add a batch dimension: (seq, 1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()
Because the model just averages the embeddings, it can hardly tell the following two sentences apart (in slang, "a bomb" is a flop, while "the bomb" means it's great):
predict_sentiment(model,'This movie is a bomb!')
0.9994305968284607
predict_sentiment(model,'This movie is the bomb!')
0.9022582173347473
Some simple punctuation:
predict_sentiment(model,'!')
0.9999986886978149
predict_sentiment(model,'?')
0.0
And now for a cheeky one, just for fun:
predict_sentiment(model,'The leg of the actress can be palyed a year!')
0.9121155142784119
Next post: Using torchtext – convolution IMDB