Code link
For how to use torchtext, you can refer to the blog post 《torchtext 的基本使用》.
Deterministic operations are often slower than nondeterministic operations, so single-run performance for your model may decrease. However, determinism may save time in development by facilitating experimentation, debugging, and regression testing.
The quote above is from the official documentation: https://pytorch.org/docs/stable/notes/randomness.html#reproducibility
Model reproducibility means that, given the same network architecture and hyperparameters, feeding the model the same input data produces the same output.
Reproducibility can be ensured from two aspects; for the specific steps see the official documentation linked above;
import torch
import random
import numpy as np
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# torch.use_deterministic_algorithms(True)

Token
BertTokenizer is imported here so that the torchtext Field can be built conveniently later;
BertTokenizer inherits from PreTrainedTokenizer; the tokenizer is responsible for preparing the inputs for the model (its main methods are demonstrated in the example after the code block below):
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# Get the special tokens. In BERT, the first token of the input sequence is '[CLS]' and the sequence ends with '[SEP]'
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
# Maximum input length, i.e. the maximum number of input tokens
max_input_length = tokenizer.max_model_input_sizes['bert-base-chinese']
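A small example (the sentence is purely illustrative) of the tokenizer methods used later, tokenize and convert_tokens_to_ids, together with the inverse convert_ids_to_tokens:

```python
sentence = '这家店的服务很好'                            # illustrative example sentence
tokens = tokenizer.tokenize(sentence)               # split the text into WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)       # map each token to its id in the vocabulary
print(tokens)
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))         # map the ids back to tokens
```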
bert-base-chinese
Download it from the official huggingface repository: git clone https://huggingface.co/bert-base-chinese;
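If the repository has been cloned locally, from_pretrained can also be pointed at the local directory (the path below is an assumption):

```python
# Load the tokenizer from a locally cloned copy of bert-base-chinese (path is illustrative)
tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese')
```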
Use torchtext to build the Field, Dataset and Iterator;
The tokenization (word segmentation) function; the actual number of tokens kept should be max_input_length - 2, because the '[CLS]' token is added at the start of the input and the '[SEP]' token at the end;
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[: max_input_length - 2]
    return tokens
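For example (the sentence is illustrative), tokenize_and_cut returns at most max_input_length - 2 tokens, which the Field then converts to ids via preprocessing=tokenizer.convert_tokens_to_ids:

```python
tokens = tokenize_and_cut('电影很精彩,值得一看')        # illustrative sentence
print(len(tokens) <= max_input_length - 2)            # True: room is left for [CLS] and [SEP]
print(tokenizer.convert_tokens_to_ids(tokens))        # the ids that the Field will feed to the model
```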
Because the keyword argument preprocessing = tokenizer.convert_tokens_to_ids is passed, init_token and the other special tokens must be set to the ids of those special tokens in the vocabulary;
from torchtext.legacy import data
TEXT = data.Field(batch_first=True,
                  use_vocab=False,
                  tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token=init_token_idx,
                  eos_token=eos_token_idx,
                  pad_token=pad_token_idx,
                  unk_token=unk_token_idx)
LABEL = data.LabelField()
Map each Field to the corresponding column in the csv files and build the Datasets;
fields = [('label', LABEL), ('comment_processed', TEXT)]
train_Dataset, val_Dataset, test_Dataset = data.TabularDataset.splits(
    path='/workspace/vscode/works/研一上学期任务/data',
    format='csv',
    train='train_data.csv',
    validation='valid_data.csv',
    test='test_data.csv',
    skip_header=True,
    fields=fields)
LABEL.build_vocab(train_Dataset)
print(LABEL.vocab.stoi)
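To check that the Fields are mapped to the csv columns correctly, one can inspect a single Example (the attribute names follow the fields list above):

```python
example = train_Dataset.examples[0]
print(example.label)                   # the raw label string
print(example.comment_processed[:10])  # first ten token ids of the comment
```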
Generate the iterators
batch_size = 64
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_Dataset, val_Dataset, test_Dataset),
    batch_size=batch_size,
    sort=False,
    device=device)
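A quick sanity check on one batch from the iterator (the exact shapes depend on batch_size and the sequence lengths in the batch):

```python
batch = next(iter(train_iterator))
print(batch.comment_processed.shape)   # (batch size, sentence len), because batch_first=True
print(batch.label.shape)               # (batch size,)
```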
Code notes, with reference to the official documentation: https://pytorch.org/docs/stable/generated/torch.nn.GRU.html#torch.nn.GRU
The embedding dimension is obtained from bert.config.to_dict()['hidden_size'];
Dropout is used to mitigate overfitting;
In embeded = self.bert(text)[0], the [0] takes the first element of BERT's output, i.e. the hidden state of every input token (see the paper 《**BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding**》); this sequence of hidden states is used as the RNN's input;
The GRU's keyword argument batch_first must have the same value as the batch_first argument of the torchtext Field; otherwise the matrix operations inside the model will be wrong. If True, the first dimension of the input is the batch size; if False, the first dimension is the sequence length (sentence len);
The GRU's keyword argument dropout defaults to 0; if non-zero, a Dropout layer is applied to the output of every GRU layer except the last;
The hidden state has shape (D * num_layers, batch size, hidden_size), where D = 2 if bidirectional=True else 1;
torch.cat() concatenates several tensors into one; the tensors must have the same shape except along the concatenation dimension. The keyword argument dim defaults to 0, i.e. concatenation along the first dimension; dim=1 concatenates along the second dimension;
For a multi-layer RNN we only take the hidden states of the last layer, hence the code below:
if self.rnn.bidirectional:
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
else:
    hidden = self.dropout(hidden[-1,:,:])
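A quick shape check (a standalone sketch; the sizes are illustrative assumptions) showing why the forward and backward states of the last layer are concatenated along dim=1:

```python
import torch
import torch.nn as nn

# Illustrative sizes (assumptions): 2-layer bidirectional GRU, hidden size 4
rnn = nn.GRU(input_size=8, hidden_size=4, num_layers=2, bidirectional=True, batch_first=True)
x = torch.randn(3, 5, 8)                 # (batch size, sentence len, input size)
_, hidden = rnn(x)
print(hidden.shape)                      # torch.Size([4, 3, 4]) = (D * num_layers, batch, hidden)
# hidden[-2] / hidden[-1] are the forward / backward states of the last layer
last = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
print(last.shape)                        # torch.Size([3, 8]) = (batch, 2 * hidden)
```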
import torch.nn as nn
from transformers import BertTokenizer, BertModel
bert = BertModel.from_pretrained('bert-base-chinese')
class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, nums_output, n_layers, bidirectional, dropout):
        super(BERTGRUSentiment, self).__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=0 if n_layers < 2 else dropout)
        self.dropout = nn.Dropout(dropout)
        self.output = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, nums_output)

    def forward(self, text):
        # text shape: (batch size, sentence len)
        # with torch.no_grad():
        embeded = self.bert(text)[0]
        # embeded shape: (batch size, sentence len, embedding dim)
        _, hidden = self.rnn(embeded)
        # hidden shape: (num_layers * num_directions, batch size, hidden_dim)
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        # output shape: (batch size, nums_output)
        output = self.output(hidden)
        return output
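The training loop further down assumes that the model, optimizer and loss function have already been created; a minimal sketch follows (the hyperparameter values and the choice of Adam / CrossEntropyLoss are assumptions, not taken from the original code):

```python
import torch.optim as optim

# Hyperparameter values below are assumptions for illustration
HIDDEN_DIM = 256
NUMS_OUTPUT = len(LABEL.vocab)     # number of sentiment classes
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert, HIDDEN_DIM, NUMS_OUTPUT, N_LAYERS, BIDIRECTIONAL, DROPOUT).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)
```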
import time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def accuracy(pred, y):
    correct = (pred.argmax(dim=1) == y).float()
    return correct.sum() / len(correct)
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        preds = model(batch.comment_processed).squeeze(1)
        loss = criterion(preds, batch.label)
        acc = accuracy(preds, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.comment_processed).squeeze(1)
            loss = criterion(preds, batch.label)
            acc = accuracy(preds, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
NUM_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "bert-GRU-Reviews-Sentiment.pt")
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
model.load_state_dict(torch.load('bert-GRU-Reviews-Sentiment.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
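As a usage example, a sketch of predicting the sentiment of a raw sentence with the trained model (the helper name predict_sentiment and the example sentence are assumptions; it simply repeats the preprocessing that the Field performs):

```python
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)[: max_input_length - 2]
    ids = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(ids).unsqueeze(0).to(device)   # (1, sentence len)
    with torch.no_grad():
        logits = model(tensor)
    return LABEL.vocab.itos[logits.argmax(dim=1).item()]     # map class index back to its label

print(predict_sentiment(model, tokenizer, '这家餐厅的菜非常好吃'))  # illustrative sentence
```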