语言模型在Pytorch的简单实现

导包和设置参数

import torchtext
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

# Fix every random seed to the same value so that experiment results can be
# reproduced. The value is kept in one constant so the generators below
# cannot accidentally be seeded with different numbers.
SEED = 53113
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if USE_CUDA:
    torch.cuda.manual_seed(SEED)

# Hyper-parameters shared by the rest of the script.
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 50000

分割数据集

  • 我们会继续使用上次的text8作为我们的训练,验证和测试数据。
  • TorchText的一个重要概念是Field,它决定了你的数据会如何被处理。我们使用TEXT这个field来处理文本数据。我们的TEXT field有lower=True这个参数,所以所有的单词都会被lowercase。
  • torchtext提供了LanguageModelingDataset这个class来帮助我们处理语言模型数据集。
  • build_vocab可以根据我们提供的训练数据集来创建最高频单词的单词表,max_size帮助我们限定单词总量。
  • BPTTIterator可以连续地得到连贯的句子,BPTT的全称是back propagation through time。
# Field describing how raw text is processed; lower=True lower-cases it.
TEXT = torchtext.data.Field(lower=True)
# Load the train / validation / test splits of text8 as language-modeling
# datasets, all processed through the TEXT field.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(path=".", 
    train="/content/gdrive/My Drive/Colab Notebooks/text8.train.txt", 
    validation="/content/gdrive/My Drive/Colab Notebooks/text8.dev.txt", 
    test="/content/gdrive/My Drive/Colab Notebooks/text8.test.txt", 
    text_field=TEXT)
# Build the vocabulary from the training split only, limited by MAX_VOCAB_SIZE.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print("vocabulary size: {}".format(len(TEXT.vocab)))

# The real vocabulary size (including any special tokens torchtext adds).
VOCAB_SIZE = len(TEXT.vocab)
# BPTT iterators over the three splits, yielding chunks of bptt_len=32 tokens.
# NOTE(review): `device=-1` is the integer device form of older torchtext
# releases (presumably CPU); newer releases expect a `torch.device` or a
# string - confirm against the installed torchtext version.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=-1, bptt_len=32, repeat=False, shuffle=True)
  • vocabulary size: 50002
  • 为什么我们的单词表有50002个单词而不是50000呢?因为TorchText给我们增加了两个特殊的token:<unk>表示未知的单词,<pad>表示padding。
  • 模型的输入是一串文字,模型的输出也是一串文字,他们之间相差一个位置,因为语言模型的目标是根据之前的单词预测下一个单词。

数据演示

其中该部分只是做演示使用。

it = iter(train_iter)
batch = next(it)
# Demo only: decode column 0 of the input and target tensors back into words.
# NOTE(review): index 0 appears to select the first sequence inside this
# batch (not the n-th batch) - confirm against the iterator's layout.
decoded_columns = [
    " ".join([TEXT.vocab.itos[i] for i in column.data])
    for column in (batch.text[:, 0], batch.target[:, 0])
]
print(decoded_columns[0])
print()
print(decoded_columns[1])

结果如下所示:
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans of the french revolution whilst the term

originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans of the french revolution whilst the term is

其中,目标是根据前面的单词预测后面的单词,如anarchism的目标是预测下一个单词originated,anarchism和originated的目标是预测下一个单词as。

总结

  1. 为什么我们的单词表有50002个单词而不是50000呢?因为TorchText给我们增加了两个特殊的token:<unk>表示未知的单词,<pad>表示padding。
  2. 模型的输入是一串文字,模型的输出也是一串文字,他们之间相差一个位置,因为语言模型的目标是根据之前的单词预测下一个单词。

定义模型

  • 继承nn.Module
  • 初始化函数
  • forward函数
  • 其余可以根据模型需要定义相关的函数
import torch
import torch.nn as nn


class RNNModel(nn.Module):
    """A simple recurrent language model: embedding -> RNN -> linear decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        """Build the layers of the model.

        The model consists of:
            - a word-embedding layer
            - a recurrent layer (RNN, LSTM or GRU)
            - a linear layer mapping hidden states to the output vocabulary
            - a dropout layer used for regularization

        Args:
            rnn_type: One of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
            ntoken: Vocabulary size (embedding rows and decoder outputs).
            ninp: Embedding size, i.e. the input size of the recurrent layer.
            nhid: Hidden size of the recurrent layer.
            nlayers: Number of stacked recurrent layers.
            dropout: Dropout probability, used by the dropout layer and
                passed to the recurrent layer.

        Raises:
            ValueError: If `rnn_type` is not one of the supported options.
        """
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                # The KeyError itself carries no useful information for the caller.
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") from None
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid      # hidden size
        self.nlayers = nlayers

    def init_weights(self):
        """Initialize embedding and decoder weights uniformly in [-0.1, 0.1]
        and zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Run one forward pass.

        Steps: word embedding -> recurrent layer -> linear layer that maps
        every hidden state to scores over the output vocabulary.

        Args:
            input: Token indices, seq_length * batch_size.
            hidden: Initial hidden state, e.g. from `init_hidden`.

        Returns:
            A pair `(scores, hidden)`: scores has shape
            seq_length * batch_size * vocab_size, and hidden is the updated
            hidden state returned by the recurrent layer.
        """
        # emb: seq_length * batch_size * embed_size
        emb = self.drop(self.encoder(input))
        # output: seq_length * batch_size * hidden_size
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        seq_len, batch_size, hidden_size = output.size()
        # Flatten time and batch so the linear layer sees a 2-D input:
        # (seq_length * batch_size) * hidden_size -> (...) * vocab_size
        out_vocab = self.decoder(output.view(seq_len * batch_size, hidden_size))
        # Restore the (seq_length, batch_size) leading dimensions. The
        # original code read the vocabulary size from an undefined name
        # (`decoded`), which raised NameError on every call.
        return out_vocab.view(seq_len, batch_size, out_vocab.size(1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return an all-zero initial hidden state for batch size `bsz`.

        The tensors are created from an existing parameter so they share its
        dtype and device. An LSTM gets a tuple of two tensors; every other
        rnn type gets a single tensor. Each tensor has shape
        nlayers * bsz * nhid.
        """
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            # LSTM needs two state tensors.
            return (weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad)

初始化模型

# Two-layer LSTM language model whose hidden size equals its embedding size.
model = RNNModel(
    rnn_type="LSTM",
    ntoken=VOCAB_SIZE,
    ninp=EMBEDDING_SIZE,
    nhid=EMBEDDING_SIZE,
    nlayers=2,
    dropout=0.5,
)
if USE_CUDA:
    model = model.cuda()

模型评估

def evaluate(model, data):
    """Return the average per-token loss of `model` over `data`.

    The model is switched to eval mode for the duration of the pass and
    switched back to train mode before returning. Gradients are disabled.

    Args:
        model: Model exposing `init_hidden(batch_size, requires_grad=...)`
            and returning `(output, hidden)` when called.
        data: Iterable of batches exposing `.text` and `.target`.

    Returns:
        The loss summed over all tokens divided by the number of tokens.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for batch in data:
            # Separate names so the `data` argument is not overwritten.
            text, target = batch.text, batch.target
            if USE_CUDA:
                text, target = text.cuda(), target.cuda()
            # Carry the hidden state across batches without its history.
            hidden = repackage_hidden(hidden)
            output, hidden = model(text, hidden)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # loss_fn averages over the batch's tokens; weight it by the
            # token count so the final result is a true per-token mean.
            num_tokens = text.numel()
            total_count += num_tokens
            total_loss += loss.item() * num_tokens

    loss = total_loss / total_count
    model.train()
    return loss

我们需要定义下面的一个function,帮助我们把一个hidden state和计算图之前的历史分离。否则反向传播会一直回溯到序列的开头,计算图越来越大,不仅耗费显存和时间,在句子过长时也容易出现梯度消失或者梯度爆炸的情况。

# Remove this part
def repackage_hidden(h):
    """Detach hidden state(s) from the history that produced them.

    A tensor is returned detached. Any other value is treated as an iterable
    of hidden states and rebuilt as a tuple, with each element processed
    recursively.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

定义loss function和optimizer

# Cross-entropy loss, Adam optimizer, and an exponential learning-rate
# schedule that multiplies the rate by 0.5 each time it is stepped.
learning_rate = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

训练模型

基本步骤如下所示:

  1. 模型一般需要训练若干个epoch
  2. 每个epoch我们都把所有的数据分成若干个batch
  3. 把每个batch的输入和输出都包装成cuda tensor
  4. forward pass,通过输入的句子预测每个单词的下一个单词
  5. 用模型的预测和正确的下一个单词计算cross entropy loss
  6. 清空模型当前gradient
  7. backward pass
  8. gradient clipping,防止梯度爆炸
  9. 更新模型参数
  10. 每隔一定的iteration输出模型在当前iteration的loss,以及在验证集上做模型的评估
import copy
GRAD_CLIP = 1.
NUM_EPOCHS = 2

# Validation losses seen so far; used to decide whether a checkpoint is the
# best one and whether the learning rate should be decayed.
val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(train_iter):
        data, target = batch.text, batch.target
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        # Keep the hidden-state values but drop their graph history so that
        # backpropagation stops at the start of this batch.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        # Gradient clipping guards against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        if i % 1000 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())

        if i % 10000 == 0:
            val_loss = evaluate(model, val_iter)

            if len(val_losses) == 0 or val_loss < min(val_losses):
                print("best model, val loss: ", val_loss)
                torch.save(model.state_dict(), "lm-best.th")
            else:
                # No improvement: decay the learning rate of the optimizer in
                # use. The optimizer must NOT be re-created here - doing so
                # would reset the rate to `learning_rate`, discard Adam's
                # running statistics and leave the scheduler bound to an
                # optimizer that is no longer used.
                scheduler.step()
            val_losses.append(val_loss)
# Rebuild the same architecture and restore the best checkpoint saved
# during training.
best_model = RNNModel(
    rnn_type="LSTM",
    ntoken=VOCAB_SIZE,
    ninp=EMBEDDING_SIZE,
    nhid=EMBEDDING_SIZE,
    nlayers=2,
    dropout=0.5,
)
if USE_CUDA:
    best_model = best_model.cuda()
best_model.load_state_dict(torch.load("lm-best.th"))

加载模型

# Create a model with the training architecture and load the weights stored
# in "lm-best.th".
best_model = RNNModel(
    rnn_type="LSTM",
    ntoken=VOCAB_SIZE,
    ninp=EMBEDDING_SIZE,
    nhid=EMBEDDING_SIZE,
    nlayers=2,
    dropout=0.5,
)
if USE_CUDA:
    best_model = best_model.cuda()
best_model.load_state_dict(torch.load("lm-best.th"))

验证数据

使用最好的模型在valid数据上计算perplexity

# Perplexity is the exponential of the average per-token loss.
val_loss = evaluate(best_model, val_iter)
val_perplexity = np.exp(val_loss)
print("perplexity: ", val_perplexity)

使用最好的模型在测试数据上计算perplexity

# Perplexity is the exponential of the average per-token loss.
test_loss = evaluate(best_model, test_iter)
test_perplexity = np.exp(test_loss)
print("perplexity: ", test_perplexity)

使用训练好的模型生成一些句子

# Sampling must run in eval mode: otherwise the dropout layers stay active
# and randomly zero activations while generating.
best_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Start from one randomly chosen token (sequence length 1, batch size 1).
input = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
words = []
# No gradients are needed for generation; disabling them also keeps the
# hidden state from accumulating graph history over the 100 steps.
with torch.no_grad():
    hidden = best_model.init_hidden(1, requires_grad=False)
    for i in range(100):
        output, hidden = best_model(input, hidden)
        # Exponentiated scores act as unnormalized sampling weights.
        word_weights = output.squeeze().exp().cpu()
        # Sampling (rather than arg-max) makes each run produce different text.
        word_idx = torch.multinomial(word_weights, 1)[0]
        # Feed the sampled token back in as the next input.
        input.fill_(word_idx)
        word = TEXT.vocab.itos[word_idx]
        words.append(word)
print(" ".join(words))

参考链接为

  • http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
  • https://github.com/pytorch/text
  • https://torchtext.readthedocs.io/en/latest/index.html

你可能感兴趣的:(Pytorch)