PyTorch Learning (4): Natural Language Classification Models

Video link: https://www.bilibili.com/video/BV1vz4y1R7Mm?p=4

Download in advance: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.3.0.tar.gz

Then extract it.

In the code below, tokenizer_language is set to the extraction path plus "en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0".

Also download in advance (a download manager such as Xunlei helps): http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

Then move the file into ./data/imdb under the working directory.

Extract it there with "Extract to current folder".

Finally, download https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/embeddings/glove/glove.6B.zip in advance

and move it into .vector_cache under the working directory.
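
As a quick sanity check (a sketch, not part of the original notebook; the paths below simply follow the conventions used in this post, so adjust them to your setup), you can confirm the three downloads are where the code expects them:

import os

spacy_model_dir = r'F:\tmp\en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0'
imdb_archive = './data/imdb/aclImdb_v1.tar.gz'
glove_archive = './.vector_cache/glove.6B.zip'

for path in (spacy_model_dir, imdb_archive, glove_archive):
    # report whether each prerequisite file/folder is present
    print(path, '->', 'found' if os.path.exists(path) else 'MISSING')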

import torch
from torchtext import data

SEED = 1  # fix the random seed for reproducibility

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # make cuDNN use deterministic algorithms so results are reproducible

text = data.Field(tokenize='spacy', tokenizer_language=r'F:\tmp\en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0')
label = data.LabelField(dtype=torch.float)
from torchtext import datasets

train_data_all, test_data = datasets.IMDB.splits(text, label)  # this step takes a while
print(vars(train_data_all.examples[0]))
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High', '.', 'A', 'classic', 'line', ':', 'INSPECTOR', ':', 'I', "'m", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'STUDENT', ':', 'Welcome', 'to', 'Bromwell', 'High', '.', 'I', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'Bromwell', 'High', 'is', 'far', 'fetched', '.', 'What', 'a', 'pity', 'that', 'it', 'is', "n't", '!'], 'label': 'pos'}
import random
train_data, valid_data = train_data_all.split(random_state=random.seed(SEED))
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')
Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000

Create the vocabulary

Map each word to an index

text.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d')  # GloVe provides high-quality pretrained word vectors
label.build_vocab(train_data)  # train_data already contains both the text and the label fields
print(f'Unique tokens in text vocabulary: {len(text.vocab)}')
print(f'Unique tokens in label vocabulary: {len(label.vocab)}')
Unique tokens in text vocabulary: 25002
Unique tokens in label vocabulary: 2
text.vocab.itos[:10]
['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
label.vocab.itos
['neg', 'pos']
BATCH_SIZE = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iter, valid_iter, test_iter = data.BucketIterator.splits(  # BucketIterator groups sentences of similar length before padding, so each batch needs less padding
        (train_data, valid_data, test_data), 
        batch_size=BATCH_SIZE,
        device=device)
batch = next(iter(valid_iter))
batch.text, batch.label
(tensor([[ 388,    6,    0,  ..., 3281,   11,   66],
         [ 371, 1127, 3306,  ...,    3,   63,   23],
         [1784,  666,  214,  ...,  407,   28,    9],
         ...,
         [  13,  466,   23,  ...,    1,    1,    1],
         [  68,   88,    4,  ...,    1,    1,    1],
         [   4,    1,    1,  ...,    1,    1,    1]]),
 tensor([0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
         1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
         1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1.,
         1., 1., 1., 0., 0., 0., 1., 0., 0., 0.]))

Word Averaging Model

Each word is projected through an Embedding layer into a word embedding vector; averaging all the word vectors in a sentence yields a vector representation of the whole sentence.

That sentence vector is then fed into a Linear layer for classification.
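
As a quick illustration (a minimal sketch, not part of the original notebook), averaging over the sequence dimension with avg_pool2d, as the model below does, gives the same result as a plain mean over that dimension:

import torch
import torch.nn.functional as F

dummy = torch.randn(16, 30, 100)  # [batch_size, seq_len, embed_dim]
# pool over the seq_len dimension with a (seq_len, 1) window
pooled = F.avg_pool2d(dummy, (dummy.shape[1], 1)).squeeze(1)  # [batch_size, embed_dim]
print(torch.allclose(pooled, dummy.mean(dim=1), atol=1e-6))  # True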

import torch
import torch.nn as nn

class WordAVGModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx):
        super(WordAVGModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.linear = nn.Linear(embedding_size, output_size)
        
    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embed_dim]
        embedded = embedded.transpose(1, 0)  # swap dims 0 and 1 -> [batch_size, seq_len, embed_dim]
        # or 
        # embedded = embedded.permute(1, 0, 2)  # permute reorders dimensions; each argument names the source dimension
        # batch_size must come first before applying avg_pool
        pooled = nn.functional.avg_pool2d(embedded, (embedded.shape[1], 1))  # [batch_size, 1, embed_dim]
        pooled = pooled.squeeze(1)  # [batch_size, embed_dim]
        return self.linear(pooled).squeeze(1)  # [batch_size]
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]

model = WordAVGModel(vocab_size, embedding_dim, output_size, padding_idx)
print(model)

def count_parameters(model):
    """Return the total number of trainable parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# numel returns the number of elements in a tensor

print(count_parameters(model))
WordAVGModel(
  (embed): Embedding(25002, 100, padding_idx=1)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)
2500301
# Initialize the embedding weights with the pretrained GloVe vectors

pretrained_embedding = text.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)  # functions ending with _ modify the tensor in place
model.embed.weight.data[padding_idx] = torch.zeros(embedding_dim)
unk_idx = text.vocab.stoi[text.unk_token]
model.embed.weight.data[unk_idx] = torch.zeros(embedding_dim)
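
A quick check (a sketch; 'movie' is just a hypothetical example word assumed to be in the vocabulary) that the initialization worked: the <pad> and <unk> rows should now be all zeros, while an ordinary word row should match its GloVe vector.

print(model.embed.weight.data[padding_idx].abs().sum().item())  # 0.0
print(model.embed.weight.data[unk_idx].abs().sum().item())      # 0.0
word_idx = text.vocab.stoi['movie']  # hypothetical example word
print(torch.equal(model.embed.weight.data[word_idx], text.vocab.vectors[word_idx]))  # True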

Train the model

optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()  # binary cross entropy with logits (the raw scores before the sigmoid are called logits)

model = model.to(device)
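
To see what "with logits" means, here is a small sketch (not part of the original notebook): BCEWithLogitsLoss applies the sigmoid internally, so it agrees with sigmoid followed by BCELoss.

logits = torch.tensor([2.0, -1.0, 0.5])   # raw model outputs
targets = torch.tensor([1.0, 0.0, 1.0])
loss_a = nn.BCEWithLogitsLoss()(logits, targets)
loss_b = nn.BCELoss()(torch.sigmoid(logits), targets)
print(loss_a.item(), loss_b.item())  # the two values agree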
def binary_accuracy(preds, y):
    """计算准确率"""
    round_preds = torch.round(torch.sigmoid(preds))  # 四舍五入
    correct = (round_preds == y).float()
    accuracy = correct.sum() / len(correct)
    return accuracy
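
A toy example (sketch) of how binary_accuracy behaves: logits above 0 become 1 after sigmoid and rounding, logits below 0 become 0.

toy_preds = torch.tensor([1.5, -0.3, 0.2, -2.0])  # raw logits
toy_labels = torch.tensor([1., 1., 0., 0.])
print(binary_accuracy(toy_preds, toy_labels).item())  # 0.5 -- only the 1st and 4th predictions are correct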
def train(model, iterator, optimizer, crit):
    epoch_loss = 0.
    epoch_accu = 0.
    total_len = 0
    model.train()
    for batch in iterator:
        preds = model(batch.text)
        loss = crit(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        
        optimizer.zero_grad()
        loss.backward()  # backward() computes the gradient of the loss w.r.t. every parameter
        optimizer.step()
        
        epoch_loss += loss.item() * len(batch.label)
        epoch_accu += acc.item() * len(batch.label)
        total_len += len(batch.label)
        
    return epoch_loss / total_len, epoch_accu / total_len
def evaluation(model, iterator, crit):
    epoch_loss = 0.
    epoch_accu = 0.
    total_len = 0
    model.eval()
    
    with torch.no_grad():  # no gradients are needed during evaluation
        for batch in iterator:
            preds = model(batch.text)
            loss = crit(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)
            
            epoch_loss += loss.item() * len(batch.label)
            epoch_accu += acc.item() * len(batch.label)
            total_len += len(batch.label)
        
    return epoch_loss / total_len, epoch_accu / total_len
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "wordavg_model.pytorch_model.bin")
    
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")
model.load_state_dict(torch.load("wordavg_model.pytorch_model.bin"))
import spacy
nlp = spacy.load(r'F:\tmp\en_core_web_sm-2.3.0\en_core_web_sm\en_core_web_sm-2.3.0')
def predict_sentiment(sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [text.vocab.stoi[t] for t in tokenized]
    # LongTensor holds integer (int64) indices
    tensor = torch.LongTensor(indexed).to(device)  #  shape: [seq_len]
    tensor = tensor.unsqueeze(1)  # [seq_len, 1]
    pred = torch.sigmoid(model(tensor))  # prediction
    return pred.item()
print("GOOD:", predict_sentiment("This film is good."))
print("BAD: ", predict_sentiment("This film is realy bad."))
GOOD: 0.379981130361557
BAD:  0.4407946467399597

RNN Model

import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx, hidden_size, dropout):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional=True, num_layers=2)
        self.linear = nn.Linear(hidden_size * 2, output_size)  # the two directions' final hidden states are concatenated
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.embed(text)  # [seq_len, batch_size, embed_dim]
        embedded = self.dropout(embedded)
        output, (hidden, cell) = self.lstm(embedded)  
        # hidden: [num_layers * 2, batch_size, hidden_size]
        # where the leading dimension = num_directions (2) * num_layers (2) = 4
        
        # Why use hidden rather than output?
        # output holds the top layer's hidden state at every time step,
        # with shape [seq_len, batch_size, hidden_size * 2],
        # while hidden holds the final hidden state of every layer and direction,
        # with shape [num_layers * 2, batch_size, hidden_size].
        
#         print(output.shape)
#         print(hidden.shape)
        
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        hidden = self.dropout(hidden.squeeze())
        return self.linear(hidden).squeeze()
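
A minimal shape check (a sketch, not part of the original notebook) for the comments above, using a tiny 2-layer bidirectional LSTM on random data:

tiny_lstm = nn.LSTM(input_size=100, hidden_size=100, bidirectional=True, num_layers=2)
dummy = torch.randn(30, 16, 100)  # [seq_len, batch_size, embed_dim]
out, (h, c) = tiny_lstm(dummy)
print(out.shape)  # torch.Size([30, 16, 200]) -> [seq_len, batch_size, hidden_size * 2]
print(h.shape)    # torch.Size([4, 16, 100])  -> [num_layers * 2, batch_size, hidden_size]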
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]

model = RNNModel(vocab_size=vocab_size, 
                 embedding_size=embedding_dim, 
                 output_size=output_size, 
                 padding_idx=padding_idx,
                 hidden_size=100, 
                 dropout=0.5)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()  # binary cross entropy with logits (sigmoid 之前叫 logits)

model = model.to(device)
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "lstm_model.pytorch_model.bin")
    
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")

CNN Model

class CNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, padding_idx, num_filters, filter_sizes, dropout):
        super(CNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
            for filter_size in filter_sizes
        ])
        # self.conv = nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(filter_size, embedding_size))
        self.linear = nn.Linear(num_filters * len(filter_sizes), output_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.permute(1, 0)  # seq_len * batch_size -> batch_size * seq_len
        embedded = self.embed(text)  # batch_size * seq_len * embedding_size
        embedded = embedded.unsqueeze(1)  # batch_size * 1 * seq_len * embedding_size
        
        # conved = nn.functional.relu(self.conv(embedded))  # batch_size * num_filters * seq_len-filter_size+1 * 1
        # conved = conved.squeeze()  # batch_size * num_filters * seq_len-filter_size+1
        # pooled = nn.functional.max_pool1d(conved, conved.shape[2]).squeeze(2)
        
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        pooled = torch.cat(pooled, dim=1)  # [batch_size, num_filters * len(filter_sizes)]
        pooled = self.dropout(pooled)
    
        return self.linear(pooled).squeeze()
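
A shape walkthrough for a single convolution (a sketch with made-up sizes), matching the comments in forward():

conv = nn.Conv2d(in_channels=1, out_channels=50, kernel_size=(3, 100))
dummy = torch.randn(16, 1, 40, 100)  # [batch_size, 1, seq_len, embedding_size]
conved = nn.functional.relu(conv(dummy)).squeeze(3)  # [16, 50, 38] = [batch_size, num_filters, seq_len - filter_size + 1]
pooled = nn.functional.max_pool1d(conved, conved.shape[2]).squeeze(2)  # [16, 50]
print(conved.shape, pooled.shape)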
vocab_size = len(text.vocab)
embedding_dim = 100
output_size = 1
padding_idx = text.vocab.stoi[text.pad_token]

model = CNNModel(vocab_size=vocab_size,
                 embedding_size=embedding_dim,
                 output_size=output_size,
                 padding_idx=padding_idx,
                 num_filters=50,
                 filter_sizes=[3, 4, 5],
                 dropout=0.5)
# Initialize the embedding weights with the pretrained GloVe vectors

pretrained_embedding = text.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)  # functions ending with _ modify the tensor in place
unk_idx = text.vocab.stoi[text.unk_token]
model.embed.weight.data[padding_idx] = torch.zeros(embedding_dim)
model.embed.weight.data[unk_idx] = torch.zeros(embedding_dim)
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()

model = model.to(device)
n_epochs = 10
best_valid_accu = 0.
for epoch in range(n_epochs):
    train_loss, train_accu = train(model, train_iter, optimizer, crit)
    eval_loss, eval_accu = evaluation(model, valid_iter, crit)
    
    if eval_accu > best_valid_accu:
        best_valid_accu = eval_accu
        torch.save(model.state_dict(), "cnn_model.pytorch_model.bin")
    
    print(f"Epoch {epoch}\tTrain loss {train_loss}\t acc {train_accu}")
    print(f"Epoch {epoch}\tValid loss {eval_loss}\t acc {eval_accu}")
test_loss, test_accu = evaluation(model, test_iter, crit)
print(f"Test loss {test_loss}\t acc {test_accu}")
