import torch
import torchtext # pip install torchdata
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.nn import TransformerEncoder, TransformerEncoderLayer
# Text-processing pipeline:
# 1. Tokenize the text
# 2. Build a vocabulary, e.g. he -> 30, her -> 31
# 3. Embed the token ids, e.g. 30 -> (0.2, 0.4, 0.2, 0.9, 2.1); alternatives include one-hot, tf-idf and hashing
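# Illustrative aside for step 3: nn.Embedding maps an integer token id to a
# dense, learnable vector, replacing sparse schemes such as one-hot or tf-idf.
toy_embedding = nn.Embedding(num_embeddings=100, embedding_dim=5)
toy_embedding(torch.tensor([30]))  # a 1x5 tensor of (initially random) learnable weights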
train_iter, test_iter = torchtext.datasets.IMDB()
from torchtext.data.utils import get_tokenizer # tokenization utility
from torchtext.vocab import build_vocab_from_iterator # vocabulary-building utility
tokenizer = get_tokenizer('basic_english') # initialize the tokenizer
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

# <pad> at index 0 matches pad_sequence's default padding value of 0;
# <unk> is the fallback for out-of-vocabulary tokens.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab(['this', 'is', 'a', 'book', 'about', 'pytorch'])
# [14, 10, 6, 276, 50, 1]
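# Optional sanity checks on the vocabulary (standard torchtext Vocab API):
len(vocab)             # total number of tokens, specials included
vocab.get_itos()[:10]  # index-to-token list; indices 0 and 1 are <pad> and <unk>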
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x == 'pos')
text_pipeline('this is a book about pytorch')
# [14, 10, 6, 276, 50, 1]
label_pipeline('pos')
# 1
label_pipeline('neg')
# 0
from torch.utils.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    label_list = torch.tensor(label_list)
    # pad every review in the batch to the length of the longest one
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)
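# A minimal sketch of what pad_sequence does: shorter sequences are
# right-padded with 0 (the <pad> index) up to the longest one in the batch.
torch.nn.utils.rnn.pad_sequence([torch.tensor([3, 5, 7]), torch.tensor([2, 4])],
                                batch_first=True)
# tensor([[3, 5, 7],
#         [2, 4, 0]])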
from torchtext.data.functional import to_map_style_dataset
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
train_dataloader = DataLoader(train_dataset, batch_size=16,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=16,
                             shuffle=True, collate_fn=collate_batch)
for i, (l, b) in enumerate(train_dataloader):
    print(l.size(), b.size())
    if i > 9:
        break
# torch.Size([16]) torch.Size([16, 450])
# torch.Size([16]) torch.Size([16, 660])
# torch.Size([16]) torch.Size([16, 993])
# torch.Size([16]) torch.Size([16, 533])
# torch.Size([16]) torch.Size([16, 1096])
# torch.Size([16]) torch.Size([16, 580])
# torch.Size([16]) torch.Size([16, 1073])
# torch.Size([16]) torch.Size([16, 446])
# torch.Size([16]) torch.Size([16, 1072])
# torch.Size([16]) torch.Size([16, 1103])
# torch.Size([16]) torch.Size([16, 1012])
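# Batch widths above vary with the longest review in each batch (roughly 450
# to 1100 tokens), which drives transformer memory use. A hypothetical variant
# of collate_batch that caps sequence length could look like this (max_len is
# an assumed parameter; this variant is not used in the rest of the script):
def collate_batch_truncated(batch, max_len=512):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        ids = text_pipeline(_text)[:max_len]  # keep at most max_len tokens
        text_list.append(torch.tensor(ids, dtype=torch.int64))
    label_list = torch.tensor(label_list)
    text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)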
# Build the model
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, depth=12):
        super(Net, self).__init__()
        self.em = nn.Embedding(vocab_size, embedding_dim)
        encoder_layers = TransformerEncoderLayer(embedding_dim, num_heads, dropout=0.5,
                                                 activation='relu', batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, depth)
        self.avg = nn.AdaptiveAvgPool1d(10)          # pool the sequence dimension down to a fixed length of 10
        self.fc1 = nn.Linear(embedding_dim*10, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, inputs):
        x = self.em(inputs)                              # (batch, seq_len, embedding_dim)
        x = self.transformer_encoder(x).transpose(1, 2)  # (batch, embedding_dim, seq_len)
        x = self.avg(x)                                  # (batch, embedding_dim, 10)
        x = x.reshape(-1, x.size(1)*x.size(2))           # flatten to (batch, embedding_dim*10)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
vocab_size = len(vocab)
emsize = 96 # embedding_dim must be divisible by num_heads
num_heads = 8
model = Net(vocab_size, emsize, num_heads, depth=6).to(device)
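# Quick shape check: a dummy batch of 4 sequences of 50 random token ids
# should yield one pair of class logits per sequence.
dummy = torch.randint(0, vocab_size, (4, 50), device=device)
model(dummy).shape  # torch.Size([4, 2])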
loss_fn = nn.CrossEntropyLoss()
from torch.optim import lr_scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
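# With StepLR(step_size=10, gamma=0.1) and the scheduler stepped once per
# epoch (see fit below), the learning rate decays roughly as:
#   epochs  0-9  -> 0.01
#   epochs 10-19 -> 0.001
#   epochs 20-29 -> 0.0001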
def train(dataloader):
    total_acc, total_count, total_loss = 0, 0, 0
    model.train()
    for label, text in dataloader:
        predicted_label = model(text)
        loss = loss_fn(predicted_label, label)
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
def test(dataloader):
    model.eval()
    total_acc, total_count, total_loss = 0, 0, 0
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            loss = loss_fn(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()*label.size(0)
    return total_loss/total_count, total_acc/total_count
def fit(epochs, train_dl, test_dl):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
    for epoch in range(epochs):
        epoch_loss, epoch_acc = train(train_dl)
        epoch_test_loss, epoch_test_acc = test(test_dl)
        train_loss.append(epoch_loss)
        train_acc.append(epoch_acc)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_acc)
        exp_lr_scheduler.step()
        template = ("epoch:{:2d}, train_loss: {:.5f}, train_acc: {:.1f}% ,"
                    "test_loss: {:.5f}, test_acc: {:.1f}%")
        print(template.format(
            epoch, epoch_loss, epoch_acc*100, epoch_test_loss, epoch_test_acc*100))
    print("Done!")
    return train_loss, test_loss, train_acc, test_acc
EPOCHS = 30
train_loss, test_loss, train_acc, test_acc = fit(EPOCHS,
                                                 train_dataloader,
                                                 test_dataloader)
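# A minimal inference sketch (assumes the training above has finished): push a
# new review through the same text_pipeline; with label_pipeline above, logit
# index 1 means positive and index 0 means negative.
def predict_sentiment(text):
    model.eval()
    with torch.no_grad():
        ids = torch.tensor(text_pipeline(text), dtype=torch.int64).unsqueeze(0).to(device)
        return 'pos' if model(ids).argmax(1).item() == 1 else 'neg'

predict_sentiment("This film was a delight from start to finish.")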