Long short-term memory (LSTM) modifies the way a recurrent neural network computes its hidden state, and it introduces a memory cell with the same shape as the hidden state (some of the literature treats the memory cell as a special kind of hidden state). The flow of information from the memory cell to the hidden state is controlled by the output gate. At time step t, the LSTM's output layer computation is the same as that of the recurrent neural network described earlier.
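Concretely, writing $\sigma$ for the sigmoid function and $\odot$ for elementwise multiplication, the computations carried out at each time step by the implementation below are

$$
\begin{aligned}
\boldsymbol{I}_t &= \sigma(\boldsymbol{X}_t \boldsymbol{W}_{xi} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hi} + \boldsymbol{b}_i),\\
\boldsymbol{F}_t &= \sigma(\boldsymbol{X}_t \boldsymbol{W}_{xf} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hf} + \boldsymbol{b}_f),\\
\boldsymbol{O}_t &= \sigma(\boldsymbol{X}_t \boldsymbol{W}_{xo} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{ho} + \boldsymbol{b}_o),\\
\tilde{\boldsymbol{C}}_t &= \tanh(\boldsymbol{X}_t \boldsymbol{W}_{xc} + \boldsymbol{H}_{t-1} \boldsymbol{W}_{hc} + \boldsymbol{b}_c),\\
\boldsymbol{C}_t &= \boldsymbol{F}_t \odot \boldsymbol{C}_{t-1} + \boldsymbol{I}_t \odot \tilde{\boldsymbol{C}}_t,\\
\boldsymbol{H}_t &= \boldsymbol{O}_t \odot \tanh(\boldsymbol{C}_t),\\
\boldsymbol{Y}_t &= \boldsymbol{H}_t \boldsymbol{W}_{hy} + \boldsymbol{b}_y,
\end{aligned}
$$

where $\boldsymbol{I}_t$, $\boldsymbol{F}_t$, and $\boldsymbol{O}_t$ are the input, forget, and output gates, $\tilde{\boldsymbol{C}}_t$ is the candidate memory cell, and the subscripts on the weight matrices match the variable names used in the code.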
The hyperparameter num_hiddens defines the number of hidden units.
import gluonbook as gb
from mxnet import nd

# try_gpu returns a GPU context if one is available, else the CPU context.
ctx = gb.try_gpu()
# vocab_size is assumed to have been defined when the dataset was loaded earlier.
input_dim = vocab_size
num_hiddens = 256
output_dim = vocab_size

def get_params():
    # Input gate parameters.
    W_xi = nd.random_normal(scale=0.01, shape=(input_dim, num_hiddens),
                            ctx=ctx)
    W_hi = nd.random_normal(scale=0.01, shape=(num_hiddens, num_hiddens),
                            ctx=ctx)
    b_i = nd.zeros(num_hiddens, ctx=ctx)
    # Forget gate parameters.
    W_xf = nd.random_normal(scale=0.01, shape=(input_dim, num_hiddens),
                            ctx=ctx)
    W_hf = nd.random_normal(scale=0.01, shape=(num_hiddens, num_hiddens),
                            ctx=ctx)
    b_f = nd.zeros(num_hiddens, ctx=ctx)
    # Output gate parameters.
    W_xo = nd.random_normal(scale=0.01, shape=(input_dim, num_hiddens),
                            ctx=ctx)
    W_ho = nd.random_normal(scale=0.01, shape=(num_hiddens, num_hiddens),
                            ctx=ctx)
    b_o = nd.zeros(num_hiddens, ctx=ctx)
    # Candidate memory cell parameters.
    W_xc = nd.random_normal(scale=0.01, shape=(input_dim, num_hiddens),
                            ctx=ctx)
    W_hc = nd.random_normal(scale=0.01, shape=(num_hiddens, num_hiddens),
                            ctx=ctx)
    b_c = nd.zeros(num_hiddens, ctx=ctx)
    # Output layer parameters.
    W_hy = nd.random_normal(scale=0.01, shape=(num_hiddens, output_dim),
                            ctx=ctx)
    b_y = nd.zeros(output_dim, ctx=ctx)
    # Attach gradient buffers so autograd can record gradients for each parameter.
    params = [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc,
              b_c, W_hy, b_y]
    for param in params:
        param.attach_grad()
    return params
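The lstm_rnn function below expects an initial hidden state state_h and an initial memory cell state_c, each of shape (batch_size, num_hiddens). A minimal sketch of the usual zero initialization, with a hypothetical batch_size:

batch_size = 32  # hypothetical value, for illustration only
state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
state_c = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)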
def lstm_rnn(inputs, state_h, state_c, *params):
    # inputs and outputs are both lists of matrices with shape
    # (batch_size, vocab_size), one matrix per time step.
    [W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c,
     W_hy, b_y] = params
    H = state_h
    C = state_c
    outputs = []
    for X in inputs:
        # Input, forget, and output gates.
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        # Candidate memory cell.
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        # Update the memory cell, then let the output gate control how much
        # of it flows into the hidden state.
        C = F * C + I * C_tilda
        H = O * C.tanh()
        # Output layer.
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H, C)
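As a quick shape check, one can push a few one-hot batches through the model; num_steps here is a hypothetical value, and state_h and state_c are the zero-initialized states sketched above.

params = get_params()
num_steps = 5  # hypothetical number of time steps
# Each input is a (batch_size, vocab_size) one-hot matrix.
inputs = [nd.one_hot(nd.array([0] * batch_size, ctx=ctx), vocab_size)
          for _ in range(num_steps)]
outputs, state_h, state_c = lstm_rnn(inputs, state_h, state_c, *params)
print(len(outputs), outputs[0].shape)  # num_steps, (batch_size, vocab_size)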