目录
6.3 LSTM的记忆能力实验
6.3.1 模型构建
6.3.1.1 LSTM层
6.3.1.2 模型汇总
6.3.2 模型训练
6.3.2.1 训练指定长度的数字预测模型
6.3.2.2 多组训练
6.3.2.3 损失曲线展示
【思考题1】LSTM与SRN实验结果对比,谈谈看法。
6.3.3 模型评价
6.3.3.1 在测试集上进行模型评价
6.3.3.2 模型在不同长度的数据集上的准确率变化图
【思考题3】分析LSTM中单元状态和门数值的变化图,并用自己的话解释该图。
总结
使用LSTM模型重新进行数字求和实验,验证LSTM模型的长程依赖能力。
使用第6.1.2.4节中定义Model_RNN4SeqClass模型,并构建 LSTM 算子.
只需要实例化 LSTM ,并传入Model_RNN4SeqClass模型,就可以用 LSTM 进行数字求和实验。
# coding=gbk
import torch.nn.functional as F
import torch.nn as nn
import torch
# 声明LSTM和相关参数
class LSTM(nn.Module):
def __init__(self, input_size, hidden_size, Wi_attr=None, Wf_attr=None, Wo_attr=None, Wc_attr=None,
Ui_attr=None, Uf_attr=None, Uo_attr=None, Uc_attr=None, bi_attr=None, bf_attr=None,
bo_attr=None, bc_attr=None):
super(LSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
# 初始化模型参数
if Wi_attr==None:
self.W_i=torch.zeros(size=[input_size, hidden_size],dtype=torch.float32)
self.W_i=torch.nn.Parameter(self.W_i)
else:
self.W_i=torch.nn.Parameter(Wi_attr)
if Wf_attr==None:
self.W_f = torch.zeros(size=[input_size, hidden_size], dtype=torch.float32)
self.W_f = torch.nn.Parameter(self.W_f)
else:
self.W_f = torch.nn.Parameter(Wf_attr)
if Wo_attr==None:
self.W_o = torch.zeros(size=[input_size, hidden_size], dtype=torch.float32)
self.W_o = torch.nn.Parameter(self.W_o)
else:
self.W_o = torch.nn.Parameter(Wo_attr)
if Wc_attr==None:
self.W_c = torch.zeros(size=[input_size, hidden_size], dtype=torch.float32)
self.W_c = torch.nn.Parameter(self.W_c)
else:
self.W_c = torch.nn.Parameter(Wc_attr)
if Ui_attr==None:
self.U_i = torch.zeros(size=[hidden_size, hidden_size], dtype=torch.float32)
self.U_i = torch.nn.Parameter(self.U_i)
else:
self.U_i = torch.nn.Parameter(Ui_attr)
if Uf_attr==None:
self.U_f = torch.zeros(size=[hidden_size, hidden_size], dtype=torch.float32)
self.U_f = torch.nn.Parameter(self.U_f)
else:
self.U_f = torch.nn.Parameter(Uf_attr)
if Uo_attr==None:
self.U_o = torch.zeros(size=[hidden_size, hidden_size], dtype=torch.float32)
self.U_o = torch.nn.Parameter(self.U_o)
else:
self.U_o = torch.nn.Parameter(Uo_attr)
if Uc_attr==None:
self.U_c = torch.zeros(size=[hidden_size, hidden_size], dtype=torch.float32)
self.U_c = torch.nn.Parameter(self.U_c)
else:
self.U_c = torch.nn.Parameter(Uc_attr)
if bi_attr==None:
self.b_i = torch.zeros(size=[1, hidden_size], dtype=torch.float32)
self.b_i = torch.nn.Parameter(self.b_i)
else:
self.b_i = torch.nn.Parameter(bi_attr)
if bf_attr==None:
self.b_f = torch.zeros(size=[1, hidden_size], dtype=torch.float32)
self.b_f = torch.nn.Parameter(self.b_f)
else:
self.b_f = torch.nn.Parameter(bf_attr)
if bo_attr==None:
self.b_o = torch.zeros(size=[1, hidden_size], dtype=torch.float32)
self.b_o = torch.nn.Parameter(self.b_o)
else:
self.b_o = torch.nn.Parameter(bo_attr)
if bc_attr==None:
self.b_c = torch.zeros(size=[1, hidden_size], dtype=torch.float32)
self.b_c = torch.nn.Parameter(self.b_c)
else:
self.b_c = torch.nn.Parameter(bc_attr)
# 初始化状态向量和隐状态向量
def init_state(self, batch_size):
hidden_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
cell_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
return hidden_state, cell_state
# 定义前向计算
def forward(self, inputs, states=None):
# inputs: 输入数据,其shape为batch_size x seq_len x input_size
batch_size, seq_len, input_size = inputs.shape
# 初始化起始的单元状态和隐状态向量,其shape为batch_size x hidden_size
if states is None:
states = self.init_state(batch_size)
hidden_state, cell_state = states
# 执行LSTM计算,包括:输入门、遗忘门和输出门、候选内部状态、内部状态和隐状态向量
for step in range(seq_len):
# 获取当前时刻的输入数据step_input: 其shape为batch_size x input_size
step_input = inputs[:, step, :]
# 计算输入门, 遗忘门和输出门, 其shape为:batch_size x hidden_size
I_gate = F.sigmoid(torch.matmul(step_input, self.W_i) + torch.matmul(hidden_state, self.U_i) + self.b_i)
F_gate = F.sigmoid(torch.matmul(step_input, self.W_f) + torch.matmul(hidden_state, self.U_f) + self.b_f)
O_gate = F.sigmoid(torch.matmul(step_input, self.W_o) + torch.matmul(hidden_state, self.U_o) + self.b_o)
# 计算候选状态向量, 其shape为:batch_size x hidden_size
C_tilde = F.tanh(torch.matmul(step_input, self.W_c) + torch.matmul(hidden_state, self.U_c) + self.b_c)
# 计算单元状态向量, 其shape为:batch_size x hidden_size
cell_state = F_gate * cell_state + I_gate * C_tilde
# 计算隐状态向量,其shape为:batch_size x hidden_size
hidden_state = O_gate * F.tanh(cell_state)
return hidden_state
Wi_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wf_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wo_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wc_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Ui_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uf_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uo_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uc_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
bi_attr = torch.tensor([[0.1, 0.1]])
bf_attr = torch.tensor([[0.1, 0.1]])
bo_attr = torch.tensor([[0.1, 0.1]])
bc_attr = torch.tensor([[0.1, 0.1]])
lstm = LSTM(2, 2, Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr)
inputs = torch.tensor([[[1, 0]]], dtype=torch.float32)
hidden_state = lstm(inputs)
print(hidden_state)
得到以下结果:
tensor([[0.0594, 0.0952]], grad_fn=)
这里我们可以将自己实现的SRN和Paddle框架内置的SRN返回的结果进行打印展示,实现代码如下
# 这里创建一个随机数组作为测试数据,数据shape为batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn(size=[batch_size, seq_len, input_size])
# 设置模型的hidden_size
hidden_size = 32
paddle_lstm = nn.LSTM(input_size, hidden_size)
self_lstm = LSTM(input_size, hidden_size)
self_hidden_state = self_lstm(inputs)
paddle_outputs, (paddle_hidden_state, paddle_cell_state) = paddle_lstm(inputs)
print("self_lstm hidden_state: ", self_hidden_state.shape)
print("paddle_lstm outpus:", paddle_outputs.shape)
print("paddle_lstm hidden_state:", paddle_hidden_state.shape)
print("paddle_lstm cell_state:", paddle_cell_state.shape)
得到以下结果:
self_lstm hidden_state: torch.Size([8, 32])
paddle_lstm outpus: torch.Size([8, 20, 32])
paddle_lstm hidden_state: torch.Size([1, 20, 32])
paddle_lstm cell_state: torch.Size([1, 20, 32])
可以看到,自己实现的LSTM由于没有考虑多层因素,因此没有层次这个维度,因此其输出shape为[8, 32]。同时由于在以上代码使用Paddle内置API实例化LSTM时,默认定义的是1层的单向SRN,因此其shape为[1, 8, 32],同时隐状态向量为[8,20, 32].
接下来,我们可以将自己实现的LSTM与Paddle内置的LSTM在输出值的精度上进行对比,这里首先根据Paddle内置的LSTM实例化模型(为了进行对比,在实例化时只保留一个偏置,将偏置bih
import torch
torch.manual_seed(0)
# 这里创建一个随机数组作为测试数据,数据shape为batch_size x seq_len x input_size
batch_size, seq_len, input_size, hidden_size = 2, 5, 10, 10
inputs = torch.randn(size=[batch_size, seq_len, input_size])
# 设置模型的hidden_size
bih_attr = torch.nn.Parameter(torch.zeros([4*hidden_size, ]))
paddle_lstm = nn.LSTM(input_size, hidden_size)
paddle_lstm.bias_ih_l0=bih_attr
paddle_lstm.bias_ih_l1=bih_attr
paddle_lstm.bias_ih_l2=bih_attr
paddle_lstm.bias_ih_l3=bih_attr
paddle_lstm.bias_ih_l4=bih_attr
# 获取paddle_lstm中的参数,并设置相应的paramAttr,用于初始化lstm
print(paddle_lstm.weight_ih_l0.T.shape)
chunked_W = torch.split(paddle_lstm.weight_ih_l0.T, split_size_or_sections=10, dim=-1)
chunked_U = torch.split(paddle_lstm.weight_hh_l0.T, split_size_or_sections=10, dim=-1)
chunked_b = torch.split(paddle_lstm.bias_hh_l0.T, split_size_or_sections=10, dim=-1)
print(chunked_b[0].shape,chunked_b[1].shape,chunked_b[2].shape)
Wi_attr = torch.tensor(chunked_W[0])
Wf_attr = torch.tensor(chunked_W[1])
Wc_attr = torch.tensor(chunked_W[2])
Wo_attr = torch.tensor(chunked_W[3])
Ui_attr = torch.tensor(chunked_U[0])
Uf_attr = torch.tensor(chunked_U[1])
Uc_attr = torch.tensor(chunked_U[2])
Uo_attr = torch.tensor(chunked_U[3])
bi_attr = torch.tensor(chunked_b[0])
bf_attr = torch.tensor(chunked_b[1])
bc_attr = torch.tensor(chunked_b[2])
bo_attr = torch.tensor(chunked_b[3])
self_lstm = LSTM(input_size, hidden_size, Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr)
# 进行前向计算,获取隐状态向量,并打印展示
self_hidden_state = self_lstm(inputs)
paddle_outputs, (paddle_hidden_state, _) = paddle_lstm(inputs)
print("paddle SRN:\n", paddle_hidden_state.detach().numpy().squeeze(0))
print("self SRN:\n", self_hidden_state.detach().numpy())
设置为0),然后提取该模型对应的参数,进行参数分割后,使用相应参数去初始化自己实现的LSTM,从而保证两者在参数初始化时是一致的。
在进行实验时,首先定义输入数据inputs
,然后将该数据分别传入Paddle内置的LSTM与自己实现的LSTM模型中,最后通过对比两者的隐状态输出向量。代码实现如下:
torch.Size([10, 40])
torch.Size([10]) torch.Size([10]) torch.Size([10])
paddle SRN:
[[ 0.06057303 0.0352371 -0.04730584 0.16420795 0.13122755 -0.15738934
0.1771467 -0.00439037 -0.02465727 -0.3045934 ]
[ 0.14093119 0.11173882 0.27511147 0.04056947 -0.00766448 -0.16597556
0.32193324 0.01466936 -0.28634343 -0.19916353]
[ 0.14699097 0.03865489 -0.11907008 0.24300049 0.31992295 -0.07868578
0.19904399 0.03308991 0.09627407 -0.1424047 ]
[ 0.06207867 0.2342088 0.00657276 0.1791542 0.32928583 -0.04207081
-0.06663163 -0.00604617 -0.10334547 0.10602648]
[ 0.05457556 0.05111036 -0.10710873 0.00312713 -0.09948594 -0.11760624
0.11195059 0.13914587 -0.09120954 -0.1052993 ]]
self SRN:
[[ 0.0940564 -0.14659543 -0.14954016 0.20936163 -0.12826967 0.14749622
0.00946941 0.1993472 -0.06859784 -0.2767597 ]
[ 0.17217153 0.16705877 -0.05719084 0.14882174 0.10330292 -0.20432511
0.13150844 0.03508793 -0.07331903 -0.06966008]]
进程已结束,退出代码为 0
可以看到,两者的输出基本是一致的。另外,还可以进行对比两者在运算速度方面的差异。代码实现如下:
import time
# 这里创建一个随机数组作为测试数据,数据shape为batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn(size=[batch_size, seq_len, input_size])
# 设置模型的hidden_size
hidden_size = 32
self_lstm = LSTM(input_size, hidden_size)
paddle_lstm = nn.LSTM(input_size, hidden_size)
# 计算自己实现的SRN运算速度
model_time = 0
for i in range(100):
strat_time = time.time()
hidden_state = self_lstm(inputs)
# 预热10次运算,不计入最终速度统计
if i < 10:
continue
end_time = time.time()
model_time += (end_time - strat_time)
avg_model_time = model_time / 90
print('self_lstm speed:', avg_model_time, 's')
# 计算Paddle内置的SRN运算速度
model_time = 0
for i in range(100):
strat_time = time.time()
outputs, (hidden_state, cell_state) = paddle_lstm(inputs)
# 预热10次运算,不计入最终速度统计
if i < 10:
continue
end_time = time.time()
model_time += (end_time - strat_time)
avg_model_time = model_time / 90
print('paddle_lstm speed:', avg_model_time, 's')
得到以下结果:
self_lstm speed: 0.004594082302517361 s
paddle_lstm speed: 0.0009862528906928168 s
可以看到,由于Torch框架的LSTM底层采用了C++实现并进行优化,Paddle框架内置的LSTM运行效率远远高于自己实现的LSTM。
在本节实验中,我们将使用6.1.2.4的Model_RNN4SeqClass作为预测模型,不同在于在实例化时将传入实例化的LSTM层。
动手联系6.2 在我们手动实现的LSTM算子中,是逐步计算每个时刻的隐状态。请思考如何实现更加高效的LSTM算子。
本节将基于RunnerV3类进行训练,首先定义模型训练的超参数,并保证和简单循环网络的超参数一致. 然后定义一个train函数,其可以通过指定长度的数据集,并进行训练. 在train函数中,首先加载长度为length的数据,然后实例化各项组件并创建对应的Runner,然后训练该Runner。同时在本节将使用4.5.4节定义的准确度(Accuracy)作为评估指标,代码实现如下:
import os
import random
import torch
import numpy as np
from nndl import RunnerV3
from nndl import Accuracy, RunnerV3
# 训练轮次
num_epochs = 500
# 学习率
lr = 0.001
# 输入数字的类别数
num_digits = 10
# 将数字映射为向量的维度
input_size = 32
# 隐状态向量的维度
hidden_size = 32
# 预测数字的类别数
num_classes = 19
# 批大小
batch_size = 8
# 模型保存目录
save_dir = "./checkpoints"
# 可以设置不同的length进行不同长度数据的预测实验
def train(length):
print(f"\n====> Training LSTM with data of length {length}.")
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)
# 加载长度为length的数据
data_path = f"./datasets/{length}"
train_examples, dev_examples, test_examples = load_data(data_path)
train_set, dev_set, test_set = DigitSumDataset(train_examples), DigitSumDataset(dev_examples), DigitSumDataset(test_examples)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)
dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
# 实例化模型
base_model = LSTM(input_size, hidden_size)
model = Model_RNN4SeqClass(base_model, num_digits, input_size, hidden_size, num_classes)
# 指定优化器
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
# 定义评价指标
metric = Accuracy()
# 定义损失函数
loss_fn = torch.nn.CrossEntropyLoss()
# 基于以上组件,实例化Runner
runner = RunnerV3(model, optimizer, loss_fn, metric)
# 进行模型训练
model_save_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=100, log_steps=100, save_path=model_save_path)
return runner
接下来,分别进行数据长度为10, 15, 20, 25, 30, 35的数字预测模型训练实验,训练后的runner
保存至runners
字典中。
lstm_runners = {}
lengths = [10, 15, 20, 25, 30, 35]
for length in lengths:
runner = train(length)
lstm_runners[length] = runner
[Train] epoch: 486/500, step: 18500/19000, loss: 0.03764
[Evaluate] dev score: 0.89000, dev loss: 0.38093
[Train] epoch: 489/500, step: 18600/19000, loss: 0.04673
[Evaluate] dev score: 0.88000, dev loss: 0.38495
[Train] epoch: 492/500, step: 18700/19000, loss: 0.06683
[Evaluate] dev score: 0.88000, dev loss: 0.38080
[Train] epoch: 494/500, step: 18800/19000, loss: 0.05129
[Evaluate] dev score: 0.89000, dev loss: 0.38528
[Train] epoch: 497/500, step: 18900/19000, loss: 0.03119
[Evaluate] dev score: 0.89000, dev loss: 0.38632
[Evaluate] dev score: 0.89000, dev loss: 0.38519
# 画出训练过程中的损失图
for length in lengths:
runner = lstm_runners[length]
fig_name = f"./images/6.11_{length}.pdf"
plot_training_loss(runner, fig_name, sample_step=100)
这是之前做过的SRN的结果,直接拿过来看一下
可以明显对比出来LSTM在序列增长的时候收敛性更好,因为LSTM本身就是为了解决SRN长程依赖而改造的。
使用测试数据对在训练过程中保存的最好模型进行评价,观察模型在测试集上的准确率. 同时获取模型在训练过程中在验证集上最好的准确率,实现代码如下:
lstm_dev_scores = []
lstm_test_scores = []
for length in lengths:
print(f"Evaluate LSTM with data length {length}.")
runner = lstm_runners[length]
# 加载训练过程中效果最好的模型
model_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
runner.load_model(model_path)
# 加载长度为length的数据
data_path = f"./datasets/{length}"
train_examples, dev_examples, test_examples = load_data(data_path)
test_set = DigitSumDataset(test_examples)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
# 使用测试集评价模型,获取测试集上的预测准确率
score, _ = runner.evaluate(test_loader)
lstm_test_scores.append(score)
lstm_dev_scores.append(max(runner.dev_scores))
for length, dev_score, test_score in zip(lengths, lstm_dev_scores, lstm_test_scores):
print(f"[LSTM] length:{length}, dev_score: {dev_score}, test_score: {test_score: .5f}")
Evaluate LSTM with data length 10.
Evaluate LSTM with data length 15.
Evaluate LSTM with data length 20.
Evaluate LSTM with data length 25.
Evaluate LSTM with data length 30.
Evaluate LSTM with data length 35.
[LSTM] length:10, dev_score: 0.88, test_score: 0.90000
[LSTM] length:15, dev_score: 0.88, test_score: 0.84000
[LSTM] length:20, dev_score: 0.87, test_score: 0.81000
[LSTM] length:25, dev_score: 0.8, test_score: 0.67000
[LSTM] length:30, dev_score: 0.66, test_score: 0.58000
[LSTM] length:35, dev_score: 0.46, test_score: 0.32000
import matplotlib.pyplot as plt
plt.plot(lengths, lstm_dev_scores, '-o', color='#e8609b', label="LSTM Dev Accuracy")
plt.plot(lengths, lstm_test_scores,'-o', color='#000000', label="LSTM Test Accuracy")
#绘制坐标轴和图例
plt.ylabel("accuracy", fontsize='large')
plt.xlabel("sequence length", fontsize='large')
plt.legend(loc='lower left', fontsize='x-large')
fig_name = "./images/6.12.pdf"
plt.savefig(fig_name)
plt.show()
当LSTM处理序列数据[6, 7, 0, 0, 1, 0, 0, 0, 0, 0]的过程中单元状态和门数值的变化图,其中横坐标为输入数字,纵坐标为相应门或单元状态向量的维度,颜色的深浅代表数值的大小。可以看到,当输入门遇到不同位置的数字0时,保持了相对一致的数值大小,表明对于0元素保持相同的门控过滤机制,避免输入信息的变化给当前模型带来困扰;当遗忘门遇到数字1后,遗忘门数值在一些维度上变小,表明对某些信息进行了遗忘;随着序列的输入,输出门和单元状态在某些维度上数值变小,在某些维度上数值变大,表明输出门在根据信息的重要性选择信息进行输出,同时单元状态也在保持着对文本预测重要的一些信息.
四个门是控制输入输出和遗忘,计算出状态后判断是遗忘还是更新,最后输出门判断输出。
RNN比起前面的神经网络要难很多,这次实验也用了比较长的时间,主要用来总结和归纳上,学的脑子有点痒痒,看来是要长脑子了哈哈