Video link: https://www.bilibili.com/video/BV1vz4y1R7Mm?p=4
Download https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz in advance
and extract it.
Set tokenizer_language below to the extraction path + "en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0".
Also download http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz in advance (e.g. with the Thunder download manager),
move the file into the ./data/imdb directory under the working directory,
and extract it with "Extract to current folder".
Finally, download https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/embeddings/glove/glove.6B.zip in advance
and move it into the .vector_cache directory under the working directory.
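If you prefer to script these downloads instead of fetching them by hand, a minimal sketch along these lines should work (fetch_archive is a hypothetical helper, not part of the original notes; whether torchtext then picks the files up automatically depends on its version, so this only replaces the manual download step):

import os
import tarfile
import urllib.request

def fetch_archive(url, target_dir):
    """Download url into target_dir and return the local file path."""
    os.makedirs(target_dir, exist_ok=True)
    path = os.path.join(target_dir, os.path.basename(url))
    if not os.path.exists(path):
        urllib.request.urlretrieve(url, path)
    return path

# IMDB reviews go into ./data/imdb, matching the layout described above
imdb_tar = fetch_archive("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", "./data/imdb")
with tarfile.open(imdb_tar, "r:gz") as tar:
    tar.extractall("./data/imdb")

# GloVe vectors go into .vector_cache; torchtext can read the zip from there when building the vocabulary
fetch_archive("https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/embeddings/glove/glove.6B.zip", ".vector_cache")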
import torch
from torchtext import data
SEED = 1  # fix the random seed so results are reproducible
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # force cuDNN to pick deterministic algorithms, also for reproducibility
text = data.Field(tokenize='spacy', tokenizer_language=r'F:\tmp\en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0')
label = data.LabelField(dtype=torch.float)
from torchtext import datasets
train_data_all, test_data = datasets.IMDB.splits(text, label)  # this step takes a while
print(vars(train_data_all.examples[0]))
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':', 'INSPECTOR', ':', 'I', "'m", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'STUDENT', ':', 'Welcome', 'to', 'Bromwell', 'High', '.', 'I', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'Bromwell', 'High', 'is', 'far', 'fetched', '.', 'What', 'a', 'pity', 'that', 'it', 'is', "n't", '!'], 'label': 'pos'}
import random
train_data, valid_data = train_data_all.split(random_state=random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000
Build the vocabulary,
mapping each word to an integer index.
text.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d')  # GloVe provides high-quality pre-trained word vectors
label.build_vocab(train_data)  # train_data already contains both the text and the label fields
print(f'Unique tokens in text vocabulary: {len(text.vocab)}')
print(f'Unique tokens in label vocabulary: {len(label.vocab)}')
Unique tokens in text vocabulary: 25002
Unique tokens in label vocabulary: 2
text.vocab.itos[:10]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
label.vocab.itos
['neg', 'pos']
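itos maps an index back to a token; the reverse mapping stoi goes from token to index. A quick sketch of looking words up (assuming the legacy torchtext behaviour, where stoi is a defaultdict that falls back to the <unk> index for out-of-vocabulary words):

print(text.vocab.stoi['the'])         # 2, matching its position in itos above
print(text.vocab.stoi['qwertyuiop'])  # out-of-vocabulary words map to the <unk> index, 0
print(label.vocab.stoi['pos'], label.vocab.stoi['neg'])  # 1 0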
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, valid_iter, test_iter = data.BucketIterator.splits(  # BucketIterator sorts examples by length first, so each batch needs less padding
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
batch = next(iter(valid_iter))
batch.text, batch.label
(tensor([[ 388, 6, 0, ..., 3281, 11, 66],
[ 371, 1127, 3306, ..., 3, 63, 23],
[1784, 666, 214, ..., 407, 28, 9],
...,
[ 13, 466, 23, ..., 1, 1, 1],
[ 68, 88, 4, ..., 1, 1, 1],
[ 4, 1, 1, ..., 1, 1, 1]]),
tensor([0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1.,
1., 1., 1., 0., 0., 0., 1., 0., 0., 0.]))
Each word is projected through an Embedding layer into a word embedding vector; averaging all word vectors in a sentence then gives a vector representation of the whole sentence.
That vector is fed to a Linear layer, which does the classification.
import torch
import torch.nn as nn
class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.linear = nn.Linear(embedding_size, output_size)

    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embed_dim]
        embedded = embedded.transpose(1, 0)  # swap dims 0 and 1 -> [batch_size, seq_len, embed_dim]
        # or
        # embedded = embedded.permute(1, 0, 2)  # reorder dimensions: for each output position, give the source dimension
        # batch_size must be the first dimension before applying avg_pool
        pooled = nn.functional.avg_pool2d(embedded, (embedded.shape[1], 1))  # [batch_size, 1, embed_dim]
        pooled = pooled.squeeze()
        return self.linear(pooled).squeeze()
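The avg_pool2d call in forward averages over the whole seq_len dimension, which is just the mean over dim 1; a small sanity check with made-up sizes (a sketch, not from the original notebook):

x = torch.randn(4, 7, 100)  # [batch_size, seq_len, embed_dim]
pooled = nn.functional.avg_pool2d(x, (x.shape[1], 1)).squeeze(1)  # [batch_size, embed_dim]
print(torch.allclose(pooled, x.mean(dim=1)))  # True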
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]
model = WordAVGModel(vocab_size, embedding_dim, output_size, padding_idx)
print(model)
def count_parameters(model):
    """Return the total number of trainable parameters in the model."""
    # numel returns the number of elements in a tensor
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))
WordAVGModel(
(embed): Embedding(25002, 100, padding_idx=1)
(linear): Linear(in_features=100, out_features=1, bias=True)
)
2500301
# Initialize the embedding weights with the pre-trained GloVe vectors
pretrained_embedding = text.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)  # methods with a trailing underscore operate in place
model.embed.weight.data[padding_idx] = torch.zeros(embedding_dim)
unk_idx = text.vocab.stoi[text.unk_token]
model.embed.weight.data[unk_idx] = torch.zeros(embedding_dim)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()  # binary cross entropy with logits (the raw pre-sigmoid outputs are called logits)
model = model.to(device)
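To illustrate the comment on crit: BCEWithLogitsLoss applies the sigmoid internally, so on ordinary values it matches BCELoss computed on the sigmoid of the logits, while being more numerically stable. A tiny sketch with made-up numbers:

logits = torch.tensor([1.5, -0.3, 0.8])
targets = torch.tensor([1., 0., 1.])
print(nn.BCEWithLogitsLoss()(logits, targets).item())
print(nn.BCELoss()(torch.sigmoid(logits), targets).item())  # same value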
def binary_accuracy(preds, y):
    """Compute the fraction of correct binary predictions."""
    round_preds = torch.round(torch.sigmoid(preds))  # round the sigmoid output to 0 or 1
    correct = (round_preds == y).float()
    accuracy = correct.sum() / len(correct)
    return accuracy
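A quick check of binary_accuracy on hand-made logits (the values are illustrative only):

preds = torch.tensor([2.0, -1.0, 0.3, -0.2])  # sigmoid > 0.5 for the 1st and 3rd entries
labels = torch.tensor([1., 0., 0., 0.])
print(binary_accuracy(preds, labels))  # tensor(0.7500): three of the four predictions are correct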
def train(model, iterator, optimizer, crit):
    epoch_loss = 0.
    epoch_accu = 0.
    total_len = 0
    model.train()
    for batch in iterator:
        preds = model(batch.text)
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        optimizer.zero_grad()
        loss.backward()  # compute the gradient of the loss w.r.t. every parameter
        optimizer.step()
        epoch_loss += loss.item() * len(batch.label)
        epoch_accu += acc.item() * len(batch.label)
        total_len += len(batch.label)
    return epoch_loss / total_len, epoch_accu / total_len
def evaluation(model, iterator, crit):
    epoch_loss = 0.
    epoch_accu = 0.
    total_len = 0
    model.eval()
    with torch.no_grad():  # no gradients are needed during evaluation
        for batch in iterator:
            preds = model(batch.text)
            loss = crit(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)
            epoch_loss += loss.item() * len(batch.label)
            epoch_accu += acc.item() * len(batch.label)
            total_len += len(batch.label)
    return epoch_loss / total_len, epoch_accu / total_len
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "wordavg_model.pytorch_model.bin")
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")
model.load_state_dict(torch.load("wordavg_model.pytorch_model.bin"))
import spacy
nlp = spacy.load(r'F:\tmp\en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0')
def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [text.vocab.stoi[t] for t in tokenized]
    # LongTensor is an int64 tensor, as required by nn.Embedding
    tensor = torch.LongTensor(indexed).to(device)  # shape: [seq_len]
    tensor = tensor.unsqueeze(1)  # [seq_len, 1]
    pred = torch.sigmoid(model(tensor))  # prediction
    return pred.item()
print("GOOD:", predict_sentiment("This film is good."))
print("BAD: ", predict_sentiment("This film is realy bad."))
GOOD: 0.379981130361557
BAD: 0.4407946467399597
import torch
import torch.nn as nn
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx, hidden_size, dropout):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional=True, num_layers=2)
        self.linear = nn.Linear(hidden_size * 2, output_size)  # the two directions are concatenated
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embed_dim]
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden: 4 * batch_size * hidden_size
        # where 4 = num_directions (2) * num_layers (2)
        # Why use hidden rather than output?
        # the last time step of output is the top-layer hidden state
        # output shape: seq_len * batch_size * (hidden_size * 2)
        # hidden shape: (num_layers * 2) * batch_size * hidden_size
        # print(output.shape)
        # print(hidden.shape)
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)  # concatenate the two directions of the top layer
        hidden = self.dropout(hidden.squeeze())
        return self.linear(hidden).squeeze()
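To back up the comments in forward about hidden vs output: for a bidirectional LSTM, hidden[-2] is the top layer's forward direction (its state at the last time step) and hidden[-1] is the top layer's backward direction (its state at time step 0), while output holds both directions at every time step. A small sanity check with a standalone nn.LSTM and made-up sizes:

lstm = nn.LSTM(input_size=100, hidden_size=100, num_layers=2, bidirectional=True)
x = torch.randn(7, 4, 100)  # [seq_len, batch_size, input_size]
output, (hidden, cell) = lstm(x)
print(output.shape)  # [7, 4, 200]  -> seq_len * batch_size * (hidden_size * 2)
print(hidden.shape)  # [4, 4, 100]  -> (num_layers * 2) * batch_size * hidden_size
print(torch.allclose(output[-1, :, :100], hidden[-2]))  # True: forward direction, last time step
print(torch.allclose(output[0, :, 100:], hidden[-1]))   # True: backward direction, first time step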
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]
model = RNNModel(vocab_size=vocab_size,
embedding_size=embedding_dim,
output_size=output_size,
padding_idx=padding_idx,
hidden_size=100,
dropout=0.5)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss() # binary cross entropy with logits (sigmoid 之前叫 logits)
model = model.to(device)
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "lstm_model.pytorch_model.bin")
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx, num_filters, filter_sizes, dropout):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
            for filter_size in filter_sizes
        ])
        # single-filter version:
        # self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
        self.linear = nn.Linear(num_filters * len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        text = text.permute(1, 0)  # seq_len * batch_size -> batch_size * seq_len
        embedded = self.embed(text)  # batch_size * seq_len * embedding_size
        embedded = embedded.unsqueeze(1)  # batch_size * 1 * seq_len * embedding_size
        # conved = nn.functional.relu(self.conv(embedded))  # batch_size * num_filters * (seq_len - filter_size + 1) * 1
        # conved = conved.squeeze()  # batch_size * num_filters * (seq_len - filter_size + 1)
        # pooled = nn.functional.max_pool1d(conved, conved.shape[2]).squeeze(2)
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        pooled = torch.cat(pooled, dim=1)  # batch_size * (len(filter_sizes) * num_filters)
        pooled = self.dropout(pooled)
        return self.linear(pooled).squeeze()
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]
model = CNNModel(vocab_size=vocab_size,
embedding_size=embedding_dim,
output_size=output_size,
padding_idx=padding_idx,
num_filters=50,
filter_sizes=[3, 4, 5],
dropout=0.5)
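A quick shape check of the CNN forward pass with a dummy batch (a sketch; the sequence length just needs to be at least the largest filter size):

dummy = torch.randint(0, vocab_size, (20, 4))  # [seq_len=20, batch_size=4] of random token indices
print(model(dummy).shape)  # torch.Size([4]): one logit per example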
# Initialize the embedding weights with the pre-trained GloVe vectors
pretrained_embedding = text.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)  # methods with a trailing underscore operate in place
unk_idx = text.vocab.stoi[text.unk_token]
model.embed.weight.data[padding_idx] = torch.zeros(embedding_dim)
model.embed.weight.data[unk_idx] = torch.zeros(embedding_dim)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()
model = model.to(device)
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "cnn_model.pytorch_model.bin")
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")
model.load_state_dict(torch.load("cnn_model.pytorch_model.bin"))  # evaluate the best checkpoint, as with the WordAVG model
test_loss, test_accu = evaluation(model, test_iter, crit)
print(f"Test loss {test_loss}\t acc {test_accu}")