Long Short-Term Memory (LSTM) is a variant of the RNN, designed to address the long-term dependency problem of plain RNNs.
Basic structure of the LSTM:
Compared with a plain RNN, the LSTM adds three gates to the state-update process. From left to right (in the standard diagram) they are the forget gate (also called the memory gate), the input gate, and the output gate.
Image source: LSTM的工作原理究竟是什么?深入了解LSTM (电子发烧友网)
1. A pointwise multiplication controls how much information is allowed through: 0 passes nothing, 1 passes everything.
2. Given the input x_t and the previous hidden state h_(t-1), the forget gate outputs a value in [0, 1] that is applied to the previous cell state C_(t-1): 0 means discard everything, 1 means keep everything.
3. Given x_t and h_(t-1), the input gate decides which new information to keep, and a candidate ("temporary") cell state is produced through a tanh activation.
4. The forget-gate term and the input-gate term are added together to give the updated cell state C_t.
5. C_t and the output gate together produce the new hidden state h_t, i.e. the memory passed to the next time step.
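For reference, the standard LSTM equations corresponding to the steps above (one common formulation, with σ the sigmoid and ⊙ elementwise multiplication):

\begin{aligned}
f_t &= \sigma(W_f \cdot [h_{t-1}, x_t] + b_f) &&\text{(forget gate)}\\
i_t &= \sigma(W_i \cdot [h_{t-1}, x_t] + b_i) &&\text{(input gate)}\\
\tilde{C}_t &= \tanh(W_C \cdot [h_{t-1}, x_t] + b_C) &&\text{(candidate cell state)}\\
C_t &= f_t \odot C_{t-1} + i_t \odot \tilde{C}_t &&\text{(cell state update)}\\
o_t &= \sigma(W_o \cdot [h_{t-1}, x_t] + b_o) &&\text{(output gate)}\\
h_t &= o_t \odot \tanh(C_t) &&\text{(new hidden state)}
\end{aligned}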
nn.LSTM.__init__(input_size, hidden_size, num_layers)
nn.LSTM.forward():
out, (ht, ct) = lstm(x, (ht-1, ct-1))   # hidden and cell states are passed and returned as a tuple
x: [seq_len (words in one sentence), batch (number of sentences), input feature dimension]
h / c: [num_layers, batch, hidden (memory) dimension]
out: [seq_len, batch, hidden dimension]
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=100, hidden_size=20, num_layers=4)
print(lstm)
# LSTM(100, 20, num_layers=4)

x = torch.randn(10, 3, 100)
out, (h, c) = lstm(x)
print(out.shape, h.shape, c.shape)
# torch.Size([10, 3, 20]) torch.Size([4, 3, 20]) torch.Size([4, 3, 20])
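If you want to pass the initial state explicitly (the lstm(x, (ht-1, ct-1)) form above) instead of relying on the default all-zero state, a minimal sketch:

h0 = torch.zeros(4, 3, 20)   # [num_layers, batch, hidden_size]
c0 = torch.zeros(4, 3, 20)
out, (h, c) = lstm(x, (h0, c0))   # same result as the default zero initialization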
Single-layer usage (nn.LSTMCell):
cell = nn.LSTMCell(input_size=100, hidden_size=20)

x = torch.randn(10, 3, 100)
h = torch.zeros(3, 20)
c = torch.zeros(3, 20)

for xt in x:               # iterate over the 10 time steps
    h, c = cell(xt, (h, c))

print(h.shape, c.shape)
# torch.Size([3, 20]) torch.Size([3, 20])
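To build a multi-layer network from LSTMCells by hand, feed the first cell's hidden state into a second cell at each time step. A minimal two-layer sketch (the 100 → 30 → 20 sizes are illustrative choices, not from the example above):

cell1 = nn.LSTMCell(input_size=100, hidden_size=30)
cell2 = nn.LSTMCell(input_size=30, hidden_size=20)
h1, c1 = torch.zeros(3, 30), torch.zeros(3, 30)
h2, c2 = torch.zeros(3, 20), torch.zeros(3, 20)
for xt in x:
    h1, c1 = cell1(xt, (h1, c1))   # layer 1 consumes the raw input
    h2, c2 = cell2(h1, (h2, c2))   # layer 2 consumes layer 1's hidden state
print(h2.shape, c2.shape)
# torch.Size([3, 20]) torch.Size([3, 20])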
The IMDB sentiment-classification example below was run in the Google Colab environment; downloading the dataset may require a VPN/proxy.
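Note: the code uses the legacy torchtext API (data.Field, LabelField, BucketIterator), so an older torchtext release is assumed. With torchtext 0.9–0.11 the equivalent imports would come from torchtext.legacy, and in later releases this API was removed:

# with torchtext 0.9-0.11 (assumption; the code below targets an older release):
# from torchtext.legacy import data, datasets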
import torch
from torch import nn, optim
from torchtext import data, datasets

print('GPU:', torch.cuda.is_available())

torch.manual_seed(123)

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print('len of train data:', len(train_data))
print('len of test data:', len(test_data))
print(train_data.examples[15].text)
print(train_data.examples[15].label)

# word2vec, glove
TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)

batchsz = 30
device = torch.device('cuda')
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=batchsz,
    device=device
)
class RNN(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        vocab_size: vocabulary size (10000 words + special tokens)
        embedding_dim: word-vector dimension (100)
        hidden_dim: LSTM hidden dimension (256)
        """
        super(RNN, self).__init__()

        # [0-10001] => [100]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # [100] => [256]
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # [256*2] => [1]
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """
        x: [seq_len, b] vs [b, 3, 28, 28]
        """
        # [seq, b, 1] => [seq, b, 100]
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c: [num_layers*2, b, hid_dim]
        output, (hidden, cell) = self.rnn(embedding)

        # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)

        return out
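Note the design choice in forward(): only the top layer's final forward hidden state (hidden[-2]) and final backward hidden state (hidden[-1]) are concatenated and fed to the linear classifier; the lower layer's states and the per-step outputs are not used.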
rnn = RNN(len(TEXT.vocab), 100, 256)

pretrained_embedding = TEXT.vocab.vectors
print('pretrained_embedding:', pretrained_embedding.shape)
rnn.embedding.weight.data.copy_(pretrained_embedding)
print('embedding layer inited.')
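Optionally (not done in the original code), the embedding rows of the <unk> and <pad> tokens can be zeroed after the copy, since GloVe has no meaningful vectors for them; a sketch under that assumption:

unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
rnn.embedding.weight.data[unk_idx] = torch.zeros(100)   # 100 = embedding_dim
rnn.embedding.weight.data[pad_idx] = torch.zeros(100)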
optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(device)
rnn.to(device)
import numpy as np

def binary_acc(preds, y):
    """
    get accuracy
    """
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc
def train(rnn, iterator, optimizer, criteon):

    avg_acc = []
    rnn.train()

    for i, batch in enumerate(iterator):

        # [seq, b] => [b, 1] => [b]
        pred = rnn(batch.text).squeeze(1)

        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(i, acc)

    avg_acc = np.array(avg_acc).mean()
    print('avg acc:', avg_acc)
def eval(rnn, iterator, criteon):

    avg_acc = []
    rnn.eval()

    with torch.no_grad():
        for batch in iterator:

            # [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)

            loss = criteon(pred, batch.label)
            acc = binary_acc(pred, batch.label).item()
            avg_acc.append(acc)

    avg_acc = np.array(avg_acc).mean()
    print('>>test:', avg_acc)
for epoch in range(10):

    eval(rnn, test_iterator, criteon)
    train(rnn, train_iterator, optimizer, criteon)
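After training, an individual review can be scored as well. A minimal sketch, assuming the same TEXT vocabulary and a spaCy tokenizer; predict_sentiment is a hypothetical helper, not part of the original code, and whether the returned probability corresponds to 'pos' or 'neg' depends on LABEL.vocab.stoi:

import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(rnn, sentence):
    rnn.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    indices = [TEXT.vocab.stoi[t] for t in tokens]          # map tokens to vocab indices
    x = torch.LongTensor(indices).unsqueeze(1).to(device)   # shape [seq_len, 1]
    with torch.no_grad():
        prob = torch.sigmoid(rnn(x))                        # probability of the class mapped to 1
    return prob.item()

print(predict_sentiment(rnn, 'This film is terrific!'))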