使用LSTM模型重新进行数字求和实验,验证LSTM模型的长程依赖能力。
使用第6.1.2.4节中定义的Model_RNN4SeqClass模型,并构建 LSTM 算子。
只需要实例化 LSTM ,并传入Model_RNN4SeqClass模型,就可以用 LSTM 进行数字求和实验。
自定义LSTM算子
import torch.nn.functional as F
import torch
# 声明LSTM和相关参数
from torch import nn
import torch
import torch.nn
import torch.nn.functional as F
# Fix the global torch RNG seed so the random tensors created below are reproducible.
torch.manual_seed(0)
# Simple Recurrent Network (SRN) operator
class SRN(nn.Module):
    """Single-layer simple recurrent network that returns the last hidden state.

    Every step computes ``h = tanh(x @ W + h @ U + b)``, where ``x`` is the
    step input and ``h`` the previous hidden state.

    Args:
        input_size: Size of the last dimension of ``inputs``.
        hidden_size: Size of the hidden state.
        W_attr: Optional initial value of ``W`` (``input_size x hidden_size``).
        U_attr: Optional initial value of ``U`` (``hidden_size x hidden_size``).
        b_attr: Optional initial value of ``b`` (``1 x hidden_size``).

    Any ``*_attr`` left as ``None`` is initialised with zeros.
    """

    def __init__(self, input_size, hidden_size, W_attr=None, U_attr=None, b_attr=None):
        super(SRN, self).__init__()
        # Dimension of the input (embedding) vectors
        self.input_size = input_size
        # Dimension of the hidden state
        self.hidden_size = hidden_size
        self.W = self._make_param(W_attr, [input_size, hidden_size])
        self.U = self._make_param(U_attr, [hidden_size, hidden_size])
        self.b = self._make_param(b_attr, [1, hidden_size])

    @staticmethod
    def _make_param(attr, shape):
        """Build a float32 parameter from ``attr``, or zeros of ``shape`` when it is None."""
        # `is None` instead of `== None`: `==` on an array-like compares
        # element-wise and cannot be used as a single truth value.
        if attr is None:
            value = torch.zeros(size=shape, dtype=torch.float32)
        else:
            # detach + clone gives an independent copy even if `attr` is a tensor
            value = torch.as_tensor(attr, dtype=torch.float32).detach().clone()
        return torch.nn.Parameter(value)

    def init_state(self, batch_size):
        """Return an all-zero hidden state of shape ``batch_size x hidden_size``."""
        # Created on the parameters' device/dtype so it can be multiplied with them
        return torch.zeros(size=[batch_size, self.hidden_size],
                           dtype=self.W.dtype, device=self.W.device)

    def forward(self, inputs, hidden_state=None):
        """Run the recurrence over ``inputs`` and return the final hidden state.

        Args:
            inputs: Tensor of shape ``batch_size x seq_len x input_size``.
            hidden_state: Optional initial hidden state
                (``batch_size x hidden_size``); zeros when omitted.

        Returns:
            Hidden state after the last step, ``batch_size x hidden_size``.
        """
        batch_size, seq_len, input_size = inputs.shape
        if hidden_state is None:
            hidden_state = self.init_state(batch_size)
        for step in range(seq_len):
            # Input of the current step: batch_size x input_size
            step_input = inputs[:, step, :]
            hidden_state = torch.tanh(
                torch.matmul(step_input, self.W)
                + torch.matmul(hidden_state, self.U)
                + self.b
            )
        return hidden_state
class LSTM(nn.Module):
    """Single-layer LSTM operator that returns the hidden state of the last step.

    With ``x`` the step input and ``h`` / ``c`` the previous hidden and cell
    state, every step computes::

        i = sigmoid(x @ W_i + h @ U_i + b_i)
        f = sigmoid(x @ W_f + h @ U_f + b_f)
        o = sigmoid(x @ W_o + h @ U_o + b_o)
        c_tilde = tanh(x @ W_c + h @ U_c + b_c)
        c = f * c + i * c_tilde
        h = o * tanh(c)

    Args:
        input_size: Size of the last dimension of ``inputs``.
        hidden_size: Size of the hidden and cell state.
        Wi_attr, Wf_attr, Wo_attr, Wc_attr: Optional initial values of the
            input weights (``input_size x hidden_size``).
        Ui_attr, Uf_attr, Uo_attr, Uc_attr: Optional initial values of the
            recurrent weights (``hidden_size x hidden_size``).
        bi_attr, bf_attr, bo_attr, bc_attr: Optional initial values of the
            biases (``1 x hidden_size`` when defaulted).

    Any ``*_attr`` left as ``None`` is initialised with zeros.
    """

    def __init__(self, input_size, hidden_size, Wi_attr=None, Wf_attr=None, Wo_attr=None, Wc_attr=None,
                 Ui_attr=None, Uf_attr=None, Uo_attr=None, Uc_attr=None, bi_attr=None, bf_attr=None,
                 bo_attr=None, bc_attr=None):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        w_shape = [input_size, hidden_size]
        u_shape = [hidden_size, hidden_size]
        b_shape = [1, hidden_size]
        # Every parameter falls back to zeros when its *_attr is None
        self.W_i = self._make_param(Wi_attr, w_shape)
        self.W_f = self._make_param(Wf_attr, w_shape)
        self.W_o = self._make_param(Wo_attr, w_shape)
        self.W_c = self._make_param(Wc_attr, w_shape)
        self.U_i = self._make_param(Ui_attr, u_shape)
        self.U_f = self._make_param(Uf_attr, u_shape)
        self.U_o = self._make_param(Uo_attr, u_shape)
        self.U_c = self._make_param(Uc_attr, u_shape)
        self.b_i = self._make_param(bi_attr, b_shape)
        self.b_f = self._make_param(bf_attr, b_shape)
        self.b_o = self._make_param(bo_attr, b_shape)
        self.b_c = self._make_param(bc_attr, b_shape)

    @staticmethod
    def _make_param(attr, shape):
        """Build a float32 parameter from ``attr``, or zeros of ``shape`` when it is None."""
        if attr is None:
            value = torch.zeros(size=shape, dtype=torch.float32)
        else:
            # detach + clone gives an independent copy even if `attr` is a
            # tensor that belongs to another module
            value = torch.as_tensor(attr, dtype=torch.float32).detach().clone()
        return torch.nn.Parameter(value)

    def init_state(self, batch_size):
        """Return all-zero ``(hidden_state, cell_state)``, each ``batch_size x hidden_size``."""
        # Created on the parameters' device/dtype so they can be combined with them
        shape = [batch_size, self.hidden_size]
        hidden_state = torch.zeros(size=shape, dtype=self.W_i.dtype, device=self.W_i.device)
        cell_state = torch.zeros(size=shape, dtype=self.W_i.dtype, device=self.W_i.device)
        return hidden_state, cell_state

    def forward(self, inputs, states=None):
        """Run the LSTM over ``inputs`` and return the final hidden state.

        Args:
            inputs: Tensor of shape ``batch_size x seq_len x input_size``.
            states: Optional ``(hidden_state, cell_state)`` pair used as the
                initial state; zeros when omitted.

        Returns:
            Hidden state after the last step, ``batch_size x hidden_size``.
        """
        batch_size, seq_len, input_size = inputs.shape
        if states is None:
            states = self.init_state(batch_size)
        hidden_state, cell_state = states
        for step in range(seq_len):
            # Input of the current step: batch_size x input_size
            step_input = inputs[:, step, :]
            # Input, forget and output gates: batch_size x hidden_size
            I_gate = torch.sigmoid(torch.matmul(step_input, self.W_i) + torch.matmul(hidden_state, self.U_i) + self.b_i)
            F_gate = torch.sigmoid(torch.matmul(step_input, self.W_f) + torch.matmul(hidden_state, self.U_f) + self.b_f)
            O_gate = torch.sigmoid(torch.matmul(step_input, self.W_o) + torch.matmul(hidden_state, self.U_o) + self.b_o)
            # Candidate cell state
            C_tilde = torch.tanh(torch.matmul(step_input, self.W_c) + torch.matmul(hidden_state, self.U_c) + self.b_c)
            # New cell state, then new hidden state
            cell_state = F_gate * cell_state + I_gate * C_tilde
            hidden_state = O_gate * torch.tanh(cell_state)
        return hidden_state
## Build an LSTM from hand-picked parameters and run it on a single step
# The four gates use identical values, but each gets its own list object.
Wi_attr, Wf_attr, Wo_attr, Wc_attr = ([[0.1, 0.2], [0.1, 0.2]] for _ in range(4))
Ui_attr, Uf_attr, Uo_attr, Uc_attr = ([[0.0, 0.1], [0.1, 0.0]] for _ in range(4))
bi_attr, bf_attr, bo_attr, bc_attr = ([[0.1, 0.1]] for _ in range(4))
lstm = LSTM(
    2, 2,
    Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
    Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
    bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr,
)
# One sequence of length 1 with a 2-dimensional input
inputs = torch.tensor([[[1, 0]]], dtype=torch.float32)
hidden_state = lstm(inputs)
print(hidden_state)
# Random test data laid out as batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn(size=[batch_size, seq_len, input_size])
# Hidden size shared by both models
hidden_size = 32
# batch_first=True makes nn.LSTM read `inputs` as batch x seq x feature, the
# layout the custom LSTM uses; nn.LSTM's default layout is seq x batch x feature,
# which would silently swap the batch and sequence dimensions here.
torch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self_lstm = LSTM(input_size, hidden_size)
self_hidden_state = self_lstm(inputs)
torch_outputs, (torch_hidden_state, torch_cell_state) = torch_lstm(inputs)
print("self_lstm hidden_state: ", self_hidden_state.shape)
print("torch_lstm outpus:", torch_outputs.shape)
print("torch_lstm hidden_state:", torch_hidden_state.shape)
print("torch_lstm cell_state:", torch_cell_state.shape)
可以看到,自己实现的LSTM由于没有考虑多层因素,因此没有层次这个维度,其输出shape为[8, 32]。同时,在以上代码中使用PyTorch内置API实例化LSTM时,默认定义的是1层的单向LSTM,因此(在batch_first=True、即输入按batch_size x seq_len x input_size排列时)其隐状态向量和单元状态向量的shape为[1, 8, 32],而输出序列outputs的shape为[8, 20, 32]。
将自定义LSTM与pytorch内置的LSTM进行对比
import torch
torch.seed()
# Random test data laid out as batch_size x seq_len x input_size
batch_size, seq_len, input_size, hidden_size = 2, 5, 10, 10
inputs = torch.randn([batch_size, seq_len, input_size])
# batch_first=True so nn.LSTM reads `inputs` as batch x seq x feature, exactly
# like the custom LSTM; otherwise the two models process different sequences.
torch_lstm = nn.LSTM(input_size, hidden_size, bias=True, batch_first=True)
# Copy the built-in layer's parameters so the custom LSTM starts from the same values
print(torch_lstm.weight_ih_l0.T.shape)
# nn.LSTM stacks the four gates along dim 0 in the order
# (input, forget, cell, output); after transposing they sit side by side along
# the last dim, hidden_size columns per gate.
chunked_W = torch.split(torch_lstm.weight_ih_l0.detach().T, split_size_or_sections=hidden_size, dim=-1)
chunked_U = torch.split(torch_lstm.weight_hh_l0.detach().T, split_size_or_sections=hidden_size, dim=-1)
# nn.LSTM adds both bias_ih and bias_hh to every gate, while the custom LSTM has
# a single bias per gate, so the equivalent bias is their sum.
combined_bias = (torch_lstm.bias_ih_l0 + torch_lstm.bias_hh_l0).detach()
chunked_b = torch.split(combined_bias, split_size_or_sections=hidden_size, dim=-1)
Wi_attr = chunked_W[0]
Wf_attr = chunked_W[1]
Wc_attr = chunked_W[2]
Wo_attr = chunked_W[3]
Ui_attr = chunked_U[0]
Uf_attr = chunked_U[1]
Uc_attr = chunked_U[2]
Uo_attr = chunked_U[3]
bi_attr = chunked_b[0]
bf_attr = chunked_b[1]
bc_attr = chunked_b[2]
bo_attr = chunked_b[3]
self_lstm = LSTM(input_size, hidden_size, Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
                 Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
                 bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr)
# Run both models forward and print their final hidden states side by side
self_hidden_state = self_lstm(inputs)
torch_outputs, (torch_hidden_state, _) = torch_lstm(inputs)
# The built-in hidden state has a leading num_layers dimension of size 1; drop it
print("torch LSTM:\n", torch_hidden_state.detach().numpy().squeeze(0))
print("self LSTM:\n", self_hidden_state.detach().numpy())
可以看到,两者的输出基本是一致的。另外,还可以对比两者在运算速度方面的差异。代码实现如下:
import time
# Random test data laid out as batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn([batch_size, seq_len, input_size])
# Hidden size shared by both models
hidden_size = 32
self_lstm = LSTM(input_size, hidden_size)
# batch_first=True so both models see the same batch size and sequence length
torch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)


def _average_forward_time(model, data, total_runs=100, warmup_runs=10):
    """Return the mean seconds per ``model(data)`` call, ignoring the warm-up calls."""
    elapsed = 0.0
    for run in range(total_runs):
        # perf_counter is the clock intended for measuring short durations
        start_time = time.perf_counter()
        model(data)
        end_time = time.perf_counter()
        # The first `warmup_runs` calls are warm-up and are not counted
        if run >= warmup_runs:
            elapsed += end_time - start_time
    return elapsed / (total_runs - warmup_runs)


# Speed of the hand-written LSTM
avg_model_time = _average_forward_time(self_lstm, inputs)
print('self_lstm speed:', avg_model_time, 's')
# Speed of the built-in torch LSTM
avg_model_time = _average_forward_time(torch_lstm, inputs)
print('torch_lstm speed:', avg_model_time, 's')
可以看到,由于PyTorch底层采用了C++实现并进行了优化,PyTorch框架内置的LSTM运行效率远远高于自己实现的LSTM。
import os
import random
import torch
import numpy as np
# Number of training epochs
num_epochs = 500
# Learning rate
lr = 0.001
# Number of distinct input digits
num_digits = 10
# Dimension of the vector each digit is mapped to
input_size = 32
# Dimension of the hidden state
hidden_size = 32
# Number of classes to predict
num_classes = 19
# Batch size
batch_size = 8
# Directory where model checkpoints are saved
save_dir = "./checkpoints"
# Call with different `length` values to run the experiment on different sequence lengths
def train(length):
    """Train an LSTM digit-sum classifier on the dataset of the given sequence length.

    Args:
        length: Sequence length; selects the dataset directory
            ``./datasets/{length}`` and the checkpoint file name.

    Returns:
        The ``RunnerV3`` instance after ``runner.train`` has been called.
    """
    print(f"\n====> Training LSTM with data of length {length}.")
    # Seed every RNG in use. Without the torch seed, any randomly initialised
    # model weights would differ from run to run.
    np.random.seed(0)
    random.seed(0)
    torch.manual_seed(0)
    # Load the dataset whose sequences have the requested length
    data_path = f"./datasets/{length}"
    train_examples, dev_examples, test_examples = load_data(data_path)
    train_set, dev_set, test_set = DigitSumDataset(train_examples), DigitSumDataset(dev_examples), DigitSumDataset(test_examples)
    train_loader = DataLoader(train_set, batch_size=batch_size)
    dev_loader = DataLoader(dev_set, batch_size=batch_size)
    test_loader = DataLoader(test_set, batch_size=batch_size)
    # Build the model: the LSTM operator wrapped in the sequence classifier
    base_model = LSTM(input_size, hidden_size)
    model = Model_RNN4SeqClass(base_model, num_digits, input_size, hidden_size, num_classes)
    # Optimizer
    optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
    # Evaluation metric
    metric = Accuracy()
    # Loss function
    loss_fn = torch.nn.CrossEntropyLoss()
    # Assemble the runner from the components above
    runner = RunnerV3(model, optimizer, loss_fn, metric)
    # Make sure the checkpoint directory exists before the runner is handed a path inside it
    os.makedirs(save_dir, exist_ok=True)
    model_save_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
    runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=100, log_steps=100, save_path=model_save_path)
    return runner
# Train one model per sequence length and keep every runner, keyed by length
lengths = [10, 15, 20, 25, 30, 35]
lstm_runners = {length: train(length) for length in lengths}
# Plot the training loss curve of every run
for length, runner in lstm_runners.items():
    fig_name = f"./images/6.11_{length}.pdf"
    plot_training_loss(runner, fig_name, sample_step=100)
LSTM模型在不同长度数据集上进行训练后的损失变化,同SRN模型一样,随着序列长度的增加,训练集上的损失逐渐不稳定,验证集上的损失整体趋向于变大,这说明当序列长度增加时,保持长期依赖的能力同样在逐渐变弱。但是同上节实验运行结果(下图)相比,LSTM模型在序列长度增加时,收敛情况比SRN模型更好。
LSTM在序列长度较短以及较长时,准确率优于SRN,且比SRN收敛更稳定。
LSTM作为一种特殊的RNN,加入了门控机制,缓解了梯度爆炸和消失的问题,还可以解决SRN的长程依赖问题。
lstm_dev_scores, lstm_test_scores = [], []
for length in lengths:
    print(f"Evaluate LSTM with data length {length}.")
    runner = lstm_runners[length]
    # Load the checkpoint stored under this length's file name
    model_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
    runner.load_model(model_path)
    # Rebuild the test loader for this sequence length
    data_path = f"./datasets/{length}"
    train_examples, dev_examples, test_examples = load_data(data_path)
    test_loader = DataLoader(DigitSumDataset(test_examples), batch_size=batch_size)
    # Score on the test set, and the highest value found in runner.dev_scores
    score, _ = runner.evaluate(test_loader)
    lstm_test_scores.append(score)
    lstm_dev_scores.append(max(runner.dev_scores))
for row in zip(lengths, lstm_dev_scores, lstm_test_scores):
    length, dev_score, test_score = row
    print(f"[LSTM] length:{length}, dev_score: {dev_score}, test_score: {test_score: .5f}")
Evaluate LSTM with data length 15.
Evaluate LSTM with data length 20.
Evaluate LSTM with data length 25.
Evaluate LSTM with data length 30.
Evaluate LSTM with data length 35.
[LSTM] length:10, dev_score: 0.86, test_score: 0.82000
[LSTM] length:15, dev_score: 0.86, test_score: 0.92000
[LSTM] length:20, dev_score: 0.71, test_score: 0.70000
[LSTM] length:25, dev_score: 0.92, test_score: 0.90000
[LSTM] length:30, dev_score: 0.64, test_score: 0.60000
[LSTM] length:35, dev_score: 0.16, test_score: 0.09000
import matplotlib.pyplot as plt
# One curve per score list, plotted against the sequence lengths
for scores, colour, curve_label in (
    (lstm_dev_scores, '#e8609b', "LSTM Dev Accuracy"),
    (lstm_test_scores, '#000000', "LSTM Test Accuracy"),
):
    plt.plot(lengths, scores, '-o', color=colour, label=curve_label)
# Axis labels and legend
plt.ylabel("accuracy", fontsize='large')
plt.xlabel("sequence length", fontsize='large')
plt.legend(loc='lower left', fontsize='x-large')
# Save the figure, then display it
fig_name = "./images/6.12.pdf"
plt.savefig(fig_name)
plt.show()
随着数据集长度的增加,LSTM模型和SRN模型在验证集和测试集上的准确率整体均趋向于降低,但LSTM模型保持长期依赖的能力要优于SRN模型。
# LSTM operator that also records its gate values and cell states
class LSTM(nn.Module):
    """Single-layer LSTM that returns the last hidden state and records its gates.

    With ``x`` the step input and ``h`` / ``c`` the previous hidden and cell
    state, every step computes::

        i = sigmoid(x @ W_i + h @ U_i + b_i)
        f = sigmoid(x @ W_f + h @ U_f + b_f)
        o = sigmoid(x @ W_o + h @ U_o + b_o)
        c_tilde = tanh(x @ W_c + h @ U_c + b_c)
        c = f * c + i * c_tilde
        h = o * tanh(c)

    After each ``forward`` call the attributes ``Is``, ``Fs``, ``Os`` and ``Cs``
    hold one NumPy array per step with the input gate, forget gate, output
    gate and cell state of that step.

    Args:
        input_size: Size of the last dimension of ``inputs``.
        hidden_size: Size of the hidden and cell state.
        Wi_attr, Wf_attr, Wo_attr, Wc_attr: Optional initial values of the
            input weights (``input_size x hidden_size``).
        Ui_attr, Uf_attr, Uo_attr, Uc_attr: Optional initial values of the
            recurrent weights (``hidden_size x hidden_size``).
        bi_attr, bf_attr, bo_attr, bc_attr: Optional initial values of the
            biases (``1 x hidden_size`` when defaulted).

    Any ``*_attr`` left as ``None`` is initialised with zeros.
    """

    def __init__(self, input_size, hidden_size, Wi_attr=None, Wf_attr=None, Wo_attr=None, Wc_attr=None,
                 Ui_attr=None, Uf_attr=None, Uo_attr=None, Uc_attr=None, bi_attr=None, bf_attr=None,
                 bo_attr=None, bc_attr=None):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        w_shape = [input_size, hidden_size]
        u_shape = [hidden_size, hidden_size]
        b_shape = [1, hidden_size]
        self.W_i = self._make_param(Wi_attr, w_shape)
        self.W_f = self._make_param(Wf_attr, w_shape)
        self.W_o = self._make_param(Wo_attr, w_shape)
        self.W_c = self._make_param(Wc_attr, w_shape)
        self.U_i = self._make_param(Ui_attr, u_shape)
        self.U_f = self._make_param(Uf_attr, u_shape)
        self.U_o = self._make_param(Uo_attr, u_shape)
        self.U_c = self._make_param(Uc_attr, u_shape)
        self.b_i = self._make_param(bi_attr, b_shape)
        self.b_f = self._make_param(bf_attr, b_shape)
        self.b_o = self._make_param(bo_attr, b_shape)
        self.b_c = self._make_param(bc_attr, b_shape)

    @staticmethod
    def _make_param(attr, shape):
        """Build a float32 parameter from ``attr``, or zeros of ``shape`` when it is None."""
        # `is None` instead of `== None`: `==` on an array-like compares
        # element-wise and cannot be used as a single truth value.
        if attr is None:
            value = torch.zeros(size=shape, dtype=torch.float32)
        else:
            # detach + clone gives an independent copy even if `attr` is a tensor
            value = torch.as_tensor(attr, dtype=torch.float32).detach().clone()
        return torch.nn.Parameter(value)

    def init_state(self, batch_size):
        """Return all-zero ``(hidden_state, cell_state)``, each ``batch_size x hidden_size``."""
        shape = [batch_size, self.hidden_size]
        hidden_state = torch.zeros(size=shape, dtype=self.W_i.dtype, device=self.W_i.device)
        cell_state = torch.zeros(size=shape, dtype=self.W_i.dtype, device=self.W_i.device)
        return hidden_state, cell_state

    def forward(self, inputs, states=None):
        """Run the LSTM over ``inputs``, record the gates, and return the final hidden state.

        Args:
            inputs: Tensor of shape ``batch_size x seq_len x input_size``.
            states: Optional ``(hidden_state, cell_state)`` pair used as the
                initial state; zeros when omitted.

        Returns:
            Hidden state after the last step, ``batch_size x hidden_size``.
        """
        batch_size, seq_len, input_size = inputs.shape
        # Start from the caller's states when given; they must not be
        # replaced by zeros afterwards.
        if states is None:
            states = self.init_state(batch_size)
        hidden_state, cell_state = states
        # Per-step records, reset on every call
        self.Is, self.Fs, self.Os, self.Cs = [], [], [], []
        for step in range(seq_len):
            input_step = inputs[:, step, :]
            # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh
            I_gate = torch.sigmoid(torch.matmul(input_step, self.W_i) + torch.matmul(hidden_state, self.U_i) + self.b_i)
            F_gate = torch.sigmoid(torch.matmul(input_step, self.W_f) + torch.matmul(hidden_state, self.U_f) + self.b_f)
            O_gate = torch.sigmoid(torch.matmul(input_step, self.W_o) + torch.matmul(hidden_state, self.U_o) + self.b_o)
            C_tilde = torch.tanh(torch.matmul(input_step, self.W_c) + torch.matmul(hidden_state, self.U_c) + self.b_c)
            cell_state = F_gate * cell_state + I_gate * C_tilde
            hidden_state = O_gate * torch.tanh(cell_state)
            # Store copies of the gate values and cell state; .cpu() keeps
            # the conversion valid when the module lives on another device
            self.Is.append(I_gate.detach().cpu().numpy().copy())
            self.Fs.append(F_gate.detach().cpu().numpy().copy())
            self.Os.append(O_gate.detach().cpu().numpy().copy())
            self.Cs.append(cell_state.detach().cpu().numpy().copy())
        return hidden_state
接下来,需要使用新的LSTM模型,重新实例化一个runner,本节使用序列长度为10的模型进行此项实验,因此需要加载序列长度为10的模型
# Build a fresh model around the LSTM operator
base_model = LSTM(input_size, hidden_size)
model = Model_RNN4SeqClass(base_model, num_digits, input_size, hidden_size, num_classes)
# Optimizer, metric and loss function for the new runner
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
metric = Accuracy()
loss_fn = torch.nn.CrossEntropyLoss()
# Assemble a new runner from these components
runner = RunnerV3(model, optimizer, loss_fn, metric)
# Load the checkpoint stored for sequence length 10
length = 10
model_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
runner.load_model(model_path)
接下来,给定一条数字序列,并使用数字预测模型进行数字预测,这样便会将相应的门状态和单元状态向量保存至模型中. 然后分别从模型中取出这些向量,并将这些向量进行绘制展示。代码实现如下:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_tensor(inputs, tensor, save_path, vmin=0, vmax=1):
    """Draw a sequence of per-step arrays as a heatmap and save it.

    Args:
        inputs: Labels placed on the x-axis ticks.
        tensor: Sequence of arrays; they are stacked along a new first axis,
            axis 1 (which must have size 1) is dropped, and the result is
            transposed before plotting.
        save_path: File the figure is written to.
        vmin: Lower bound of the colour map.
        vmax: Upper bound of the colour map.
    """
    stacked = np.stack(tensor, axis=0)
    heat_values = np.squeeze(stacked, 1).T
    plt.figure(figsize=(16, 6))
    axes = sns.heatmap(heat_values, vmin=vmin, vmax=vmax)
    axes.set_xticklabels(inputs)
    axes.figure.savefig(save_path)
# Model input: one digit sequence
inputs = [6, 7, 0, 0, 1, 0, 0, 0, 0, 0]
# Convert a copy of the list to a tensor and add a leading dimension of size 1
X = torch.as_tensor(list(inputs)).unsqueeze(0)
# Predict, then take the index of the largest logit as the label
logits = runner.predict(X)
predict_label = torch.argmax(logits, dim=-1)
print(f"predict result: {predict_label.numpy()[0]}")
# Fetch the per-step records the LSTM stored during the prediction
recorder = runner.model.rnn_model
Is, Fs, Os, Cs = recorder.Is, recorder.Fs, recorder.Os, recorder.Cs
# Input gate
plot_tensor(inputs, Is, save_path="./images/6.13_I.pdf")
# Forget gate
plot_tensor(inputs, Fs, save_path="./images/6.13_F.pdf")
# Output gate
plot_tensor(inputs, Os, save_path="./images/6.13_O.pdf")
# Cell state, drawn with a wider colour range
plot_tensor(inputs, Cs, save_path="./images/6.13_C.pdf", vmin=-5, vmax=5)
老师说得比较清楚:横坐标为输入数字,纵坐标为相应门或单元状态向量的维度,颜色的深浅代表数值的大小。可以看到,当输入门遇到不同位置的数字0时,保持了相对一致的数值大小,表明对于0元素保持相同的门控过滤机制,避免输入信息的变化给当前模型带来困扰;当遗忘门遇到数字1后,遗忘门数值在一些维度上变小,表明对某些信息进行了遗忘;随着序列的输入,输出门和单元状态在某些维度上数值变小,在某些维度上数值变大,表明输出门在根据信息的重要性选择信息进行输出,同时单元状态也在保持着对文本预测重要的一些信息。
最近又看了一些关于LSTM的博客和文章,把之前不明白的地方捋顺了,也逐渐明白老师上课讲的内容了。但还是有不少欠缺的地方,经常要请教同学,还是要多努力。