RNN
1. Complete the rnn_step_forward() function in rnn_layers.py
def rnn_step_forward(x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
    activation function.
    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N.
    Inputs:
    - x: Input data for this timestep, of shape (N, D).
    - prev_h: Hidden state from previous timestep, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases of shape (H,)
    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - cache: Tuple of values needed for the backward pass.
    """
    next_h, cache = None, None
    ##############################################################################
    # TODO: Implement a single forward step for the vanilla RNN. Store the next #
    # hidden state and any values you need for the backward pass in the next_h  #
    # and cache variables respectively.                                         #
    ##############################################################################
    a = np.dot(x, Wx) + np.dot(prev_h, Wh) + b  # pre-activation, shape (N, H)
    next_h = np.tanh(a)
    cache = (x, prev_h, Wx, Wh, a)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return next_h, cache
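This computes next_h = tanh(x.dot(Wx) + prev_h.dot(Wh) + b). A minimal smoke test (a sketch; numpy is assumed imported as np, and all sizes are illustrative):

import numpy as np

N, D, H = 3, 10, 4
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)

next_h, _ = rnn_step_forward(x, prev_h, Wx, Wh, b)
assert next_h.shape == (N, H)         # one hidden state per sequence
assert np.all(np.abs(next_h) <= 1.0)  # tanh output lies in [-1, 1]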
2. Complete rnn_step_backward() in rnn_layers.py
def rnn_step_backward(dnext_h, cache):
    """
    Backward pass for a single timestep of a vanilla RNN.
    Inputs:
    - dnext_h: Gradient of loss with respect to next hidden state, of shape (N, H)
    - cache: Cache object from the forward pass
    Returns a tuple of:
    - dx: Gradients of input data, of shape (N, D)
    - dprev_h: Gradients of previous hidden state, of shape (N, H)
    - dWx: Gradients of input-to-hidden weights, of shape (D, H)
    - dWh: Gradients of hidden-to-hidden weights, of shape (H, H)
    - db: Gradients of bias vector, of shape (H,)
    """
    dx, dprev_h, dWx, dWh, db = None, None, None, None, None
    ##############################################################################
    # TODO: Implement the backward pass for a single step of a vanilla RNN.     #
    #                                                                            #
    # HINT: For the tanh function, you can compute the local derivative in terms#
    # of the output value from tanh.                                            #
    ##############################################################################
    x, prev_h, Wx, Wh, a = cache
    # Derivative of the tanh activation: 1 - tanh(a)^2
    delta = (1 - np.square(np.tanh(a))) * dnext_h  # gradient w.r.t. the pre-activation
    # Gradient w.r.t. the input
    dx = np.dot(delta, Wx.T)
    # Gradient w.r.t. the previous hidden state
    dprev_h = np.dot(delta, Wh.T)
    # Gradient w.r.t. the input-to-hidden weights
    dWx = np.dot(x.T, delta)
    # Gradient w.r.t. the hidden-to-hidden weights
    dWh = np.dot(prev_h.T, delta)
    # Gradient w.r.t. the bias
    db = np.sum(delta, axis=0)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return dx, dprev_h, dWx, dWh, db
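A quick centered-difference check of dx (a sketch; both step functions above are assumed in scope, sizes are illustrative). The scalar loss here is sum(next_h * dnext_h), whose gradient w.r.t. next_h is exactly dnext_h:

import numpy as np

N, D, H = 2, 5, 3
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)
dnext_h = np.random.randn(N, H)

_, cache = rnn_step_forward(x, prev_h, Wx, Wh, b)
dx, _, _, _, _ = rnn_step_backward(dnext_h, cache)

eps = 1e-6
dx_num = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    x[idx] += eps
    hp, _ = rnn_step_forward(x, prev_h, Wx, Wh, b)
    x[idx] -= 2 * eps
    hm, _ = rnn_step_forward(x, prev_h, Wx, Wh, b)
    x[idx] += eps
    dx_num[idx] = np.sum((hp - hm) * dnext_h) / (2 * eps)

print(np.max(np.abs(dx - dx_num)))  # should be around 1e-9 or smaller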
3. Complete the forward pass rnn_forward in rnn_layers.py
def rnn_forward(x, h0, Wx, Wh, b):
    """
    Run a vanilla RNN forward on an entire sequence of data. We assume an input
    sequence composed of T vectors, each of dimension D. The RNN uses a hidden
    size of H, and we work over a minibatch containing N sequences. After running
    the RNN forward, we return the hidden states for all timesteps.
    Inputs:
    - x: Input data for the entire timeseries, of shape (N, T, D).
    - h0: Initial hidden state, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases of shape (H,)
    Returns a tuple of:
    - h: Hidden states for the entire timeseries, of shape (N, T, H).
    - cache: Values needed in the backward pass
    """
    h, cache = None, None
    ##############################################################################
    # TODO: Implement forward pass for a vanilla RNN running on a sequence of   #
    # input data. You should use the rnn_step_forward function that you defined #
    # above. You can use a for loop to help compute the forward pass.           #
    ##############################################################################
    N, T, D = x.shape
    N, H = h0.shape
    prev_h = h0
    # Pre-activations at every timestep (needed by the backward pass)
    a_all = np.empty([N, T, H])
    # Hidden states at every timestep (the output of this function)
    h = np.empty([N, T, H])
    # Hidden states lagged by one timestep, i.e. h_{t-1} for every t
    prev_h_all = np.empty([N, T, H])
    for t in range(T):
        next_h, cache_t = rnn_step_forward(x[:, t, :], prev_h, Wx, Wh, b)
        prev_h_all[:, t, :] = prev_h
        prev_h = next_h
        h[:, t, :] = next_h
        a_all[:, t, :] = cache_t[4]
    cache = (x, prev_h_all, Wx, Wh, a_all)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return h, cache
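A usage sketch (illustrative sizes): the returned h stacks the hidden state of every timestep along axis 1, so its first slice must agree with a single call to rnn_step_forward starting from h0:

import numpy as np

N, T, D, H = 2, 5, 4, 3
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)

h, _ = rnn_forward(x, h0, Wx, Wh, b)
assert h.shape == (N, T, H)

h1, _ = rnn_step_forward(x[:, 0, :], h0, Wx, Wh, b)
assert np.allclose(h[:, 0, :], h1)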
4. Complete the backward pass rnn_backward() in rnn_layers.py
def rnn_backward(dh, cache):
    """
    Compute the backward pass for a vanilla RNN over an entire sequence of data.
    Inputs:
    - dh: Upstream gradients of all hidden states, of shape (N, T, H).
    NOTE: 'dh' contains the upstream gradients produced by the
    individual loss functions at each timestep, *not* the gradients
    being passed between timesteps (which you'll have to compute yourself
    by calling rnn_step_backward in a loop).
    Returns a tuple of:
    - dx: Gradient of inputs, of shape (N, T, D)
    - dh0: Gradient of initial hidden state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
    - db: Gradient of biases, of shape (H,)
    """
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    ##############################################################################
    # TODO: Implement the backward pass for a vanilla RNN running an entire     #
    # sequence of data. You should use the rnn_step_backward function that you  #
    # defined above. You can use a for loop to help compute the backward pass.  #
    ##############################################################################
    x = cache[0]
    N, T, D = x.shape
    N, T, H = dh.shape
    # Initialize the accumulated gradients
    dWx = np.zeros((D, H))
    dWh = np.zeros((H, H))
    db = np.zeros(H)
    dx = np.empty([N, T, D])
    # Gradient flowing into the hidden state at the current timestep
    dh_now = np.zeros([N, H])
    for t in reversed(range(T)):
        # Total gradient on h_t: the per-timestep upstream gradient plus the
        # gradient passed back from timestep t + 1
        dh_now = dh_now + dh[:, t, :]
        cache_t = (cache[0][:, t, :], cache[1][:, t, :], cache[2], cache[3], cache[4][:, t, :])
        # Backpropagate through a single step
        dx_t, dprev_h, dWx_t, dWh_t, db_t = rnn_step_backward(dh_now, cache_t)
        dh_now = dprev_h
        dx[:, t, :] = dx_t
        # The weights are shared across timesteps, so their gradients add up
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = dh_now
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return dx, dh0, dWx, dWh, db
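The key subtlety is the accumulation dh_now = dh[:, t, :] + dprev_h: the total gradient on h_t is its per-timestep upstream gradient plus the gradient flowing back from timestep t + 1. With p_t denoting the gradient w.r.t. h_{t-1} produced at step t, the loop computes:

\[
p_{T+1} = 0,\qquad
g_t = dh_t + p_{t+1},\qquad
p_t = \bigl((1 - h_t \odot h_t)\odot g_t\bigr)\,W_h^{\top},\qquad
dh0 = p_1 .
\]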
5. Complete the word embedding part of rnn_layers.py
def word_embedding_forward(x, W):
    """
    Forward pass for word embeddings. We operate on minibatches of size N where
    each sequence has length T. We assume a vocabulary of V words, assigning each
    word to a vector of dimension D.
    Inputs:
    - x: Integer array of shape (N, T) giving indices of words. Each element idx
      of x must be in the range 0 <= idx < V.
    - W: Weight matrix of shape (V, D) giving word vectors for all words.
    Returns a tuple of:
    - out: Array of shape (N, T, D) giving word vectors for all input words.
    - cache: Values needed for the backward pass
    """
    out, cache = None, None
    ##############################################################################
    # TODO: Implement the forward pass for word embeddings.                     #
    #                                                                            #
    # HINT: This can be done in one line using NumPy's array indexing.          #
    ##############################################################################
    out = W[x, :]
    cache = (x, W)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return out, cache
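A quick illustration of the one-line indexing trick (illustrative sizes): indexing W with an integer array of shape (N, T) broadcasts into an (N, T, D) output, one row of W per index:

import numpy as np

W = np.arange(8).reshape(4, 2)   # V = 4 words, D = 2 dimensions
x = np.array([[0, 3], [2, 2]])   # (N = 2, T = 2) word indices
out = W[x, :]
print(out.shape)                 # (2, 2, 2): one D-vector per index
print(out[0, 1])                 # [6 7] == W[3]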
def word_embedding_backward(dout, cache):
    """
    Backward pass for word embeddings. We cannot back-propagate into the words
    since they are integers, so we only return gradient for the word embedding
    matrix.
    HINT: Look up the function np.add.at
    Inputs:
    - dout: Upstream gradients of shape (N, T, D)
    - cache: Values from the forward pass
    Returns:
    - dW: Gradient of word embedding matrix, of shape (V, D).
    """
    dW = None
    ##############################################################################
    # TODO: Implement the backward pass for word embeddings.                    #
    #                                                                            #
    # Note that words can appear more than once in a sequence.                  #
    # HINT: Look up the function np.add.at                                      #
    ##############################################################################
    x, W = cache
    dW = np.zeros_like(W)
    # Unbuffered add: accumulates gradients for repeated word indices
    np.add.at(dW, x, dout)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return dW
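Why np.add.at rather than dW[x] += dout: fancy-indexed += is buffered, so a word that appears several times would receive only one contribution, while np.add.at accumulates all of them. A small illustration:

import numpy as np

dW = np.zeros((4, 2))        # V = 4 words, D = 2 dimensions
x = np.array([[0, 2, 0]])    # word 0 appears twice in the sequence
dout = np.ones((1, 3, 2))

np.add.at(dW, x, dout)
print(dW[0])                 # [2. 2.]: both occurrences accumulated
# dW[x] += dout would have left dW[0] at [1. 1.]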
6. Complete the loss part of classifiers/rnn.py; uncomment either the LSTM or the RNN lines depending on the cell type
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to compute
    loss and gradients on all parameters.
    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V
    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last word
    # and will be input to the RNN; captions_out has everything but the first
    # word and this is what we will expect the RNN to generate. These are offset
    # by one relative to each other because the RNN should produce word (t+1)
    # after receiving word t. The first element of captions_in will be the START
    # token, and the first element of captions_out will be the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    # You'll need this
    mask = (captions_out != self._null)
    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    # Word embedding matrix
    W_embed = self.params['W_embed']
    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the forward and backward passes for the CaptioningRNN.  #
    # In the forward pass you will need to do the following:                  #
    # (1) Use an affine transformation to compute the initial hidden state    #
    #     from the image features. This should produce an array of shape (N, H)#
    # (2) Use a word embedding layer to transform the words in captions_in    #
    #     from indices to vectors, giving an array of shape (N, T, W).        #
    # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to   #
    #     process the sequence of input word vectors and produce hidden state #
    #     vectors for all timesteps, producing an array of shape (N, T, H).   #
    # (4) Use a (temporal) affine transformation to compute scores over the   #
    #     vocabulary at every timestep using the hidden states, giving an     #
    #     array of shape (N, T, V).                                           #
    # (5) Use (temporal) softmax to compute loss using captions_out, ignoring #
    #     the points where the output word is <NULL> using the mask above.    #
    #                                                                          #
    # In the backward pass you will need to compute the gradient of the loss  #
    # with respect to all model parameters. Use the loss and grads variables  #
    # defined above to store loss and gradients; grads[k] should give the     #
    # gradients for self.params[k].                                           #
    #                                                                          #
    # Note also that you are allowed to make use of functions from layers.py  #
    # in your implementation, if needed.                                      #
    ############################################################################
    N, D = features.shape
    # (1) Affine transform from image features to the initial hidden state
    out, cache_affine = temporal_affine_forward(features.reshape(N, 1, D), W_proj, b_proj)
    N, t, H = out.shape  # t == 1 here
    h0 = out.reshape(N, H)
    # (2) Embed the input words
    word_out, cache_word = word_embedding_forward(captions_in, W_embed)
    # (3) Run the sequence model; toggle the comments to switch cell type
    # RNN:
    # hidden, cache_hidden = rnn_forward(word_out, h0, Wx, Wh, b)
    # LSTM:
    hidden, cache_hidden = lstm_forward(word_out, h0, Wx, Wh, b)
    # (4) Scores over the vocabulary at every timestep
    out_vo, cache_vo = temporal_affine_forward(hidden, W_vocab, b_vocab)
    # (5) Temporal softmax loss, ignoring <NULL> positions via the mask
    loss, dx = temporal_softmax_loss(out_vo, captions_out, mask, verbose=False)
    # Backward pass, in reverse order of the forward pass
    dx_affine, dW_vocab, db_vocab = temporal_affine_backward(dx, cache_vo)
    grads['W_vocab'] = dW_vocab
    grads['b_vocab'] = db_vocab
    # RNN:
    # dx_hidden, dh0, dWx, dWh, db = rnn_backward(dx_affine, cache_hidden)
    # LSTM:
    dx_hidden, dh0, dWx, dWh, db = lstm_backward(dx_affine, cache_hidden)
    grads['Wx'] = dWx
    grads['Wh'] = dWh
    grads['b'] = db
    dW_embed = word_embedding_backward(dx_hidden, cache_word)
    grads['W_embed'] = dW_embed
    dx_initial, dW_proj, db_proj = temporal_affine_backward(dh0.reshape(N, t, H), cache_affine)
    grads['W_proj'] = dW_proj
    grads['b_proj'] = db_proj
    ############################################################################
    #                             END OF YOUR CODE                            #
    ############################################################################
    return loss, grads
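For reference, the shape flow through the forward half of loss (V is the vocabulary size, W the embedding dimension, T the length of captions_in):

features (N, D) -> affine (W_proj, b_proj) -> h0 (N, H)
captions_in (N, T) -> word embedding (W_embed) -> (N, T, W)
(N, T, W) -> RNN/LSTM -> hidden (N, T, H)
hidden -> temporal affine (W_vocab, b_vocab) -> scores (N, T, V)
scores, captions_out, mask -> temporal softmax -> scalar loss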
7. Complete the sample part of classifiers/rnn.py (image-caption sampling)
def sample(self, features, max_length=30):
    """
    Run a test-time forward pass for the model, sampling captions for input
    feature vectors.
    At each timestep, we embed the current word, pass it and the previous hidden
    state to the RNN to get the next hidden state, use the hidden state to get
    scores for all vocab words, and choose the word with the highest score as
    the next word. The initial hidden state is computed by applying an affine
    transform to the input image features, and the initial word is the <START>
    token.
    For LSTMs you will also have to keep track of the cell state; in that case
    the initial cell state should be zero.
    Inputs:
    - features: Array of input image features of shape (N, D).
    - max_length: Maximum length T of generated captions.
    Returns:
    - captions: Array of shape (N, max_length) giving sampled captions,
      where each element is an integer in the range [0, V). The first element
      of captions should be the first sampled word, not the <START> token.
    """
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    # Unpack parameters
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    ###########################################################################
    # TODO: Implement test-time sampling for the model. You will need to     #
    # initialize the hidden state of the RNN by applying the learned affine  #
    # transform to the input image features. The first word that you feed to #
    # the RNN should be the <START> token; its value is stored in the        #
    # variable self._start. At each timestep you will need to do:            #
    # (1) Embed the previous word using the learned word embeddings          #
    # (2) Make an RNN step using the previous hidden state and the embedded  #
    #     current word to get the next hidden state.                         #
    # (3) Apply the learned affine transformation to the next hidden state to#
    #     get scores for all words in the vocabulary                         #
    # (4) Select the word with the highest score as the next word, writing it#
    #     (the word index) to the appropriate slot in the captions variable  #
    #                                                                         #
    # For simplicity, you do not need to stop generating after an <END> token#
    # is sampled, but you can if you want to.                                #
    #                                                                         #
    # HINT: You will not be able to use the rnn_forward or lstm_forward      #
    # functions; you'll need to call rnn_step_forward or lstm_step_forward in#
    # a loop.                                                                #
    #                                                                         #
    # NOTE: we are still working over minibatches in this function. Also if  #
    # you are using an LSTM, initialize the first cell state to zeros.       #
    ###########################################################################
    N, D = features.shape
    # Initial hidden state from the image features
    out, _ = temporal_affine_forward(features.reshape(N, 1, D), W_proj, b_proj)
    H = out.shape[2]
    h = out.reshape(N, H)
    # The first input word is the <START> token for every sequence
    word = np.full(N, self._start, dtype=np.int32)
    x_input = W_embed[word, :]
    # Initial cell state for the LSTM is zero
    prev_c = np.zeros_like(h)
    for i in range(max_length):
        # RNN step:
        # next_h, _ = rnn_step_forward(x_input, h, Wx, Wh, b)
        # LSTM step:
        next_h, next_c, _ = lstm_step_forward(x_input, h, prev_c, Wx, Wh, b)
        prev_c = next_c
        h = next_h
        # Scores over the vocabulary for this timestep
        out_vo, _ = temporal_affine_forward(next_h.reshape(N, 1, H), W_vocab, b_vocab)
        # Greedily pick the highest-scoring word and feed it back in as the
        # next input (the original version never updated x_input, so every
        # step saw the <START> embedding)
        word = np.argmax(out_vo, axis=2).reshape(N)
        captions[:, i] = word
        x_input = W_embed[word, :]
    ############################################################################
    #                             END OF YOUR CODE                            #
    ############################################################################
    return captions
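A hypothetical usage sketch for decoding the result (model, features, and an idx_to_word mapping as provided by the assignment's data loader are assumed in scope; all names here are illustrative):

captions = model.sample(features, max_length=15)
for row in captions:
    print(' '.join(idx_to_word[idx] for idx in row))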
LSTM
8. Complete lstm_step_forward in rnn_layers.py
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """
    Forward pass for a single timestep of an LSTM.
    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N.
    Note that a sigmoid() function has already been provided for you in this file.
    Inputs:
    - x: Input data, of shape (N, D)
    - prev_h: Previous hidden state, of shape (N, H)
    - prev_c: previous cell state, of shape (N, H)
    - Wx: Input-to-hidden weights, of shape (D, 4H)
    - Wh: Hidden-to-hidden weights, of shape (H, 4H)
    - b: Biases, of shape (4H,)
    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - next_c: Next cell state, of shape (N, H)
    - cache: Tuple of values needed for backward pass.
    """
    next_h, next_c, cache = None, None, None
    #############################################################################
    # TODO: Implement the forward pass for a single timestep of an LSTM.       #
    # You may want to use the numerically stable sigmoid implementation above. #
    #############################################################################
    N, H = prev_h.shape
    # One affine transform produces all four gate pre-activations
    A = x.dot(Wx) + prev_h.dot(Wh) + b
    ai = A[:, 0:H]
    af = A[:, H:2 * H]
    ao = A[:, 2 * H:3 * H]
    ag = A[:, 3 * H:4 * H]
    i = sigmoid(ai)   # input gate
    f = sigmoid(af)   # forget gate
    o = sigmoid(ao)   # output gate
    g = np.tanh(ag)   # candidate cell values
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = (x, prev_h, prev_c, i, f, o, g, Wx, Wh, next_c, A)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return next_h, next_c, cache
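In matrix form, this step computes (sigma is the logistic sigmoid, \odot the elementwise product):

\[
\begin{aligned}
A &= x W_x + h_{t-1} W_h + b \in \mathbb{R}^{N\times 4H},\\
i &= \sigma(A_{[:,\,0:H]}),\quad f = \sigma(A_{[:,\,H:2H]}),\quad
o = \sigma(A_{[:,\,2H:3H]}),\quad g = \tanh(A_{[:,\,3H:4H]}),\\
c_t &= f \odot c_{t-1} + i \odot g,\\
h_t &= o \odot \tanh(c_t).
\end{aligned}
\]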
9. Complete lstm_step_backward in rnn_layers.py
def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Backward pass for a single timestep of an LSTM.
    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - dnext_c: Gradients of next cell state, of shape (N, H)
    - cache: Values from the forward pass
    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dprev_c: Gradient of previous cell state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    #############################################################################
    # TODO: Implement the backward pass for a single timestep of an LSTM.      #
    #                                                                           #
    # HINT: For sigmoid and tanh you can compute local derivatives in terms of #
    # the output value from the nonlinearity.                                  #
    #############################################################################
    # Unpack the cache (A is not needed here: following the hint, all local
    # derivatives are expressed through the gate outputs themselves)
    x, prev_h, prev_c, i, f, o, g, Wx, Wh, next_c, A = cache
    # Total gradient reaching the cell state c_t: the direct upstream dnext_c
    # plus the path through next_h = o * tanh(next_c)
    dtanh_c = 1 - np.square(np.tanh(next_c))
    dtotal_c = dnext_c + dnext_h * o * dtanh_c
    # The forget gate carries the cell gradient back one timestep
    dprev_c = dtotal_c * f
    # Gradients w.r.t. the gate outputs o, f, i, g
    do = dnext_h * np.tanh(next_c)
    df = dtotal_c * prev_c
    di = dtotal_c * g
    dg = dtotal_c * i
    # Through the nonlinearities to the pre-activations:
    # sigmoid'(a) = s * (1 - s), tanh'(a) = 1 - tanh(a)^2
    dai = di * i * (1 - i)
    daf = df * f * (1 - f)
    dao = do * o * (1 - o)
    dag = dg * (1 - np.square(g))
    dA = np.concatenate((dai, daf, dao, dag), axis=1)  # shape (N, 4H)
    # Gradients w.r.t. inputs and parameters
    dx = dA.dot(Wx.T)
    dprev_h = dA.dot(Wh.T)
    dWx = x.T.dot(dA)
    dWh = prev_h.T.dot(dA)
    db = np.sum(dA, axis=0)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return dx, dprev_h, dprev_c, dWx, dWh, db
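Two identities summarize the cell-state path used above: the total gradient reaching c_t combines the direct upstream dnext_c with the path through next_h = o * tanh(c_t), and the forget gate carries it back one timestep:

\[
dc_t^{\mathrm{total}} = dnext\_c + dnext\_h \odot o \odot \bigl(1-\tanh^2(c_t)\bigr),
\qquad
dprev\_c = f \odot dc_t^{\mathrm{total}} .
\]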
10. Complete lstm_forward in rnn_layers.py
def lstm_forward(x, h0, Wx, Wh, b):
    """
    Forward pass for an LSTM over an entire sequence of data. We assume an input
    sequence composed of T vectors, each of dimension D. The LSTM uses a hidden
    size of H, and we work over a minibatch containing N sequences. After running
    the LSTM forward, we return the hidden states for all timesteps.
    Note that the initial cell state is set to zero inside this function. Also
    note that the cell state is not returned; it is an internal variable of the
    LSTM and is not accessed from outside.
    Inputs:
    - x: Input data of shape (N, T, D)
    - h0: Initial hidden state of shape (N, H)
    - Wx: Weights for input-to-hidden connections, of shape (D, 4H)
    - Wh: Weights for hidden-to-hidden connections, of shape (H, 4H)
    - b: Biases of shape (4H,)
    Returns a tuple of:
    - h: Hidden states for all timesteps of all sequences, of shape (N, T, H)
    - cache: Values needed for the backward pass.
    """
    h, cache = None, None
    #############################################################################
    # TODO: Implement the forward pass for an LSTM over an entire timeseries.  #
    # You should use the lstm_step_forward function that you just defined.     #
    #############################################################################
    N, T, D = x.shape
    N, H = h0.shape
    prev_h = h0
    prev_c = np.zeros_like(h0)
    # Per-timestep values saved for the backward pass
    H_prev = np.empty([N, T, H])   # h_{t-1} at every timestep
    C_prev = np.empty([N, T, H])   # c_{t-1} at every timestep
    I = np.empty([N, T, H])        # input gates
    F = np.empty([N, T, H])        # forget gates
    O = np.empty([N, T, H])        # output gates
    G = np.empty([N, T, H])        # candidate cell values
    NC = np.empty([N, T, H])       # next cell states
    AT = np.empty([N, T, 4 * H])   # gate pre-activations
    h = np.empty([N, T, H])
    for t in range(T):
        H_prev[:, t, :] = prev_h
        C_prev[:, t, :] = prev_c
        # Single-step forward pass
        next_h, next_c, cache_t = lstm_step_forward(x[:, t, :], prev_h, prev_c, Wx, Wh, b)
        prev_h = next_h
        prev_c = next_c
        h[:, t, :] = next_h
        I[:, t, :] = cache_t[3]
        F[:, t, :] = cache_t[4]
        O[:, t, :] = cache_t[5]
        G[:, t, :] = cache_t[6]
        NC[:, t, :] = cache_t[9]
        AT[:, t, :] = cache_t[10]
    cache = (x, H_prev, C_prev, I, F, O, G, Wx, Wh, NC, AT)
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return h, cache
11. Complete lstm_backward in rnn_layers.py
def lstm_backward(dh, cache):
    """
    Backward pass for an LSTM over an entire sequence of data.
    Inputs:
    - dh: Upstream gradients of hidden states, of shape (N, T, H)
    - cache: Values from the forward pass
    Returns a tuple of:
    - dx: Gradient of input data of shape (N, T, D)
    - dh0: Gradient of initial hidden state of shape (N, H)
    - dWx: Gradient of input-to-hidden weight matrix of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weight matrix of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    #############################################################################
    # TODO: Implement the backward pass for an LSTM over an entire timeseries. #
    # You should use the lstm_step_backward function that you just defined.    #
    #############################################################################
    x = cache[0]
    N, T, D = x.shape
    N, T, H = dh.shape
    dWx = np.zeros((D, 4 * H))
    dWh = np.zeros((H, 4 * H))
    db = np.zeros(4 * H)
    dx = np.empty([N, T, D])
    # Gradients flowing into the hidden and cell state at the current timestep
    dh_now = np.zeros([N, H])
    dc_now = np.zeros([N, H])
    for t in reversed(range(T)):
        # Upstream gradient at this timestep plus the gradient passed back
        # from timestep t + 1
        dh_now = dh_now + dh[:, t, :]
        cache_t = (cache[0][:, t, :], cache[1][:, t, :], cache[2][:, t, :], cache[3][:, t, :],
                   cache[4][:, t, :], cache[5][:, t, :], cache[6][:, t, :], cache[7], cache[8],
                   cache[9][:, t, :], cache[10][:, t, :])
        dx_t, dprev_h, dprev_c, dWx_t, dWh_t, db_t = lstm_step_backward(dh_now, dc_now, cache_t)
        dh_now = dprev_h
        dc_now = dprev_c
        dx[:, t, :] = dx_t
        # Weights are shared across timesteps, so their gradients accumulate
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = dh_now
    ##############################################################################
    #                              END OF YOUR CODE                             #
    ##############################################################################
    return dx, dh0, dWx, dWh, db
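An end-to-end check of the whole sequence: compare dh0 from lstm_backward against a centered-difference estimate through lstm_forward (a sketch; the functions above are assumed in scope, sizes are illustrative):

import numpy as np

N, T, D, H = 2, 3, 4, 5
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, 4 * H), np.random.randn(H, 4 * H), np.random.randn(4 * H)
dh = np.random.randn(N, T, H)

_, cache = lstm_forward(x, h0, Wx, Wh, b)
_, dh0, _, _, _ = lstm_backward(dh, cache)

eps = 1e-6
dh0_num = np.zeros_like(h0)
for idx in np.ndindex(*h0.shape):
    h0[idx] += eps
    hp, _ = lstm_forward(x, h0, Wx, Wh, b)
    h0[idx] -= 2 * eps
    hm, _ = lstm_forward(x, h0, Wx, Wh, b)
    h0[idx] += eps
    dh0_num[idx] = np.sum((hp - hm) * dh) / (2 * eps)

print(np.max(np.abs(dh0 - dh0_num)))  # should be around 1e-8 or smaller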
For the loss and sample parts, see sections 6 and 7 above.