Let's look at the recurrent neural network shown in the figure below.
The implementation is done in two steps:
(1) Implement the computations needed for a single time step of the RNN.
(2) Implement a loop over the time steps so that all inputs can be processed.
A recurrent neural network can be viewed as a repetition of a single cell, so we first implement the computation for one time step. The figure below describes the operations of an RNN cell at a single time step.
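The figure itself is not reproduced here; the cell it depicts computes $a^{\langle t \rangle} = \tanh(W_{aa}a^{\langle t-1 \rangle} + W_{ax}x^{\langle t \rangle} + b_a)$ and $\hat{y}^{\langle t \rangle} = \mathrm{softmax}(W_{ya}a^{\langle t \rangle} + b_y)$. The code in this post also assumes NumPy and the activation helpers `softmax` and `sigmoid` (supplied by `rnn_utils` in the original assignment); a minimal sketch of those helpers, under that assumption:

```python
import numpy as np

def softmax(x):
    # Column-wise softmax; subtract the column max for numerical stability
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / np.sum(e_x, axis=0, keepdims=True)

def sigmoid(x):
    # Element-wise logistic function
    return 1 / (1 + np.exp(-x))
```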
def rnn_cell(xt, a_prev, parameters):
    # Retrieve the parameters from the dictionary
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]
    # Next hidden state
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba)
    # Prediction for the current time step
    yt_pred = softmax(np.dot(Wya, a_next) + by)
    # Cache the values needed for the backward pass
    cache = (a_next, a_prev, xt, parameters)
    return a_next, yt_pred, cache
The RNN is a repeated chain of the cell we just built: if the input sequence spans 10 time steps, the RNN cell is applied 10 times.
def rnn_forward(x, a0, parameters):
    caches = []
    # Dimensions: x is (n_x, m, T_x), Wya is (n_y, n_a)
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape
    # Initialize the hidden states and predictions for every time step
    a = np.zeros([n_a, m, T_x])
    y_pred = np.zeros([n_y, m, T_x])
    a_next = a0
    # Loop over the time steps
    for t in range(T_x):
        a_next, yt_pred, cache = rnn_cell(x[:, :, t], a_next, parameters)
        a[:, :, t] = a_next
        y_pred[:, :, t] = yt_pred
        caches.append(cache)
    caches = (caches, x)
    return a, y_pred, caches
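A quick smoke test of the forward pass on random data; the dimensions n_x = 3, n_a = 5, n_y = 2, m = 10, T_x = 4 are arbitrary choices for illustration:

```python
np.random.seed(1)
n_x, n_a, n_y, m, T_x = 3, 5, 2, 10, 4
x = np.random.randn(n_x, m, T_x)
a0 = np.random.randn(n_a, m)
parameters = {"Waa": np.random.randn(n_a, n_a), "Wax": np.random.randn(n_a, n_x),
              "Wya": np.random.randn(n_y, n_a), "ba": np.random.randn(n_a, 1),
              "by": np.random.randn(n_y, 1)}
a, y_pred, caches = rnn_forward(x, a0, parameters)
print(a.shape, y_pred.shape)   # (5, 10, 4) (2, 10, 4)
```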
(Figure: rnn_cell_backprop.png — backward pass through a single RNN cell.)
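Since the image is unavailable, here are the gradients that rnn_cell_backward computes, written out (standard backprop through the tanh update, restated to match the code below; $\odot$ is element-wise multiplication):
$$
\begin{aligned}
d\tanh &= (1 - a_{next}^2) \odot da_{next}\\
dx^{\langle t \rangle} &= W_{ax}^T \, d\tanh, \qquad dW_{ax} = d\tanh \; (x^{\langle t \rangle})^T\\
da_{prev} &= W_{aa}^T \, d\tanh, \qquad dW_{aa} = d\tanh \; a_{prev}^T\\
db_a &= \sum_{batch} d\tanh
\end{aligned}
$$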
def rnn_cell_backward(da_next, cache):
    # Retrieve values from the forward-pass cache
    (a_next, a_prev, xt, parameters) = cache
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]
    # Backprop through tanh: d(tanh(u)) = (1 - tanh(u)^2) du
    dtanh = (1 - a_next ** 2) * da_next
    # Gradients with respect to the input, the previous state, and the parameters
    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, xt.T)
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)
    dba = np.sum(dtanh, axis=1, keepdims=True)
    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    return gradients
def rnn_backward(da, caches):
    # da holds the gradients of the cost with respect to every hidden state a<t>,
    # shape (n_a, m, T_x); the y-path is not included here, matching the original assignment.
    (caches, x) = caches
    (a1, a0, x1, parameters) = caches[0]
    # Dimensions
    n_a, m, T_x = da.shape
    n_x, m = x1.shape
    # Initialize the gradients
    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da0 = np.zeros((n_a, m))
    da_prevt = np.zeros((n_a, m))
    # Walk backwards through the time steps
    for t in reversed(range(T_x)):
        # The gradient flowing into a<t> is the sum of the gradient from the cost
        # at step t and the gradient coming back from step t+1
        gradients = rnn_cell_backward(da[:, :, t] + da_prevt, caches[t])
        dxt, da_prevt, dWaxt, dWaat, dbat = gradients["dxt"], gradients["da_prev"], gradients["dWax"], gradients["dWaa"], gradients["dba"]
        dx[:, :, t] = dxt
        # Parameters are shared across time steps, so their gradients accumulate
        dWax += dWaxt
        dWaa += dWaat
        dba += dbat
    # Gradient with respect to the initial hidden state
    da0 = da_prevt
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    return gradients
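Continuing the smoke test above, we can feed a random stand-in for the upstream gradient of the hidden states into rnn_backward and check the gradient shapes:

```python
da = np.random.randn(n_a, m, T_x)        # stand-in for dJ/da from the cost
gradients = rnn_backward(da, caches)
print(gradients["dx"].shape, gradients["dWaa"].shape, gradients["da0"].shape)
# (3, 10, 4) (5, 5) (5, 10)
```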
Simplified GRU forward-propagation equations:
$$
\begin{aligned}
\tilde{c}^{\langle t \rangle} &= \tanh(W_{cc} c^{\langle t-1 \rangle} + W_{cx} x^{\langle t \rangle} + b_c)\\
\Gamma_u &= \mathrm{sigmoid}(\tilde{c}^{\langle t \rangle})\\
c^{\langle t \rangle} &= \Gamma_u * \tilde{c}^{\langle t \rangle} + (1 - \Gamma_u) * c^{\langle t-1 \rangle}
\end{aligned}
$$
Full GRU forward-propagation equations:
$$
\begin{aligned}
\Gamma_r &= \mathrm{sigmoid}(W_{rc} c^{\langle t-1 \rangle} + W_{rx} x^{\langle t \rangle} + b_r)\\
\tilde{c}^{\langle t \rangle} &= \tanh(W_{cc}(\Gamma_r * c^{\langle t-1 \rangle}) + W_{cx} x^{\langle t \rangle} + b_c)\\
\Gamma_u &= \mathrm{sigmoid}(W_{uc} c^{\langle t-1 \rangle} + W_{ux} x^{\langle t \rangle} + b_u)\\
c^{\langle t \rangle} &= \Gamma_u * \tilde{c}^{\langle t \rangle} + (1 - \Gamma_u) * c^{\langle t-1 \rangle}\\
a^{\langle t \rangle} &= c^{\langle t \rangle}
\end{aligned}
$$
def gru_cell(xt, c_prev, parameters):
    # Retrieve the parameters
    Wcx = parameters["Wcx"]
    Wcc = parameters["Wcc"]
    Wyc = parameters["Wyc"]
    bc = parameters["bc"]
    by = parameters["by"]
    # Candidate memory value
    c_temp = np.tanh(np.dot(Wcc, c_prev) + np.dot(Wcx, xt) + bc)
    # Update gate (simplified: computed directly from the candidate)
    fu = sigmoid(c_temp)
    # Blend the candidate with the previous memory
    c_next = fu * c_temp + (1 - fu) * c_prev
    # Prediction for the current time step
    yt_pred = softmax(np.dot(Wyc, c_next) + by)
    cache = (c_next, c_prev, xt, parameters)
    return c_next, yt_pred, cache
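Note that gru_cell follows the simplified equations: the update gate is computed directly from the candidate. For reference, here is a hedged sketch of a cell that follows the full equations instead, with a reset gate and a learned update gate; the parameter names Wrc, Wrx, br, Wuc, Wux and bu are my assumptions and do not appear in the rest of the post:

```python
def gru_cell_full(xt, c_prev, parameters):
    # Parameter names below (Wrc, Wrx, br, Wuc, Wux, bu) are illustrative assumptions
    Wrc, Wrx, br = parameters["Wrc"], parameters["Wrx"], parameters["br"]
    Wcc, Wcx, bc = parameters["Wcc"], parameters["Wcx"], parameters["bc"]
    Wuc, Wux, bu = parameters["Wuc"], parameters["Wux"], parameters["bu"]
    Wyc, by = parameters["Wyc"], parameters["by"]
    # Reset gate: how much of the previous memory feeds the candidate
    gr = sigmoid(np.dot(Wrc, c_prev) + np.dot(Wrx, xt) + br)
    # Candidate memory value
    c_temp = np.tanh(np.dot(Wcc, gr * c_prev) + np.dot(Wcx, xt) + bc)
    # Update gate: how much of the candidate replaces the previous memory
    gu = sigmoid(np.dot(Wuc, c_prev) + np.dot(Wux, xt) + bu)
    c_next = gu * c_temp + (1 - gu) * c_prev
    yt_pred = softmax(np.dot(Wyc, c_next) + by)
    cache = (c_next, c_prev, xt, parameters)
    return c_next, yt_pred, cache
```

The gru_forward function below continues to use the simplified gru_cell.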
def gru_forward(x, c0, parameters):
    caches = []
    # Dimensions: x is (n_x, m, T_x), Wyc is (n_y, n_c)
    n_x, m, T_x = x.shape
    n_y, n_c = parameters["Wyc"].shape
    # Initialize the memory states and predictions
    c = np.zeros([n_c, m, T_x])
    y_pred = np.zeros([n_y, m, T_x])
    c_next = c0
    # Loop over the time steps
    for t in range(T_x):
        c_next, yt_pred, cache = gru_cell(x[:, :, t], c_next, parameters)
        c[:, :, t] = c_next
        y_pred[:, :, t] = yt_pred
        caches.append(cache)
    caches = (caches, x)
    return c, y_pred, caches
For more details on the LSTM, see https://blog.csdn.net/zhangbaoanhadoop/article/details/81952284.
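For reference, these are the equations implemented by lstm_cell below (the standard LSTM formulation; $[a^{\langle t-1 \rangle}, x^{\langle t \rangle}]$ denotes the vertical concatenation built in the code):
$$
\begin{aligned}
\Gamma_f^{\langle t \rangle} &= \mathrm{sigmoid}(W_f [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_f)\\
\Gamma_u^{\langle t \rangle} &= \mathrm{sigmoid}(W_i [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_i)\\
\tilde{c}^{\langle t \rangle} &= \tanh(W_c [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_c)\\
c^{\langle t \rangle} &= \Gamma_f^{\langle t \rangle} * c^{\langle t-1 \rangle} + \Gamma_u^{\langle t \rangle} * \tilde{c}^{\langle t \rangle}\\
\Gamma_o^{\langle t \rangle} &= \mathrm{sigmoid}(W_o [a^{\langle t-1 \rangle}, x^{\langle t \rangle}] + b_o)\\
a^{\langle t \rangle} &= \Gamma_o^{\langle t \rangle} * \tanh(c^{\langle t \rangle})\\
\hat{y}^{\langle t \rangle} &= \mathrm{softmax}(W_y a^{\langle t \rangle} + b_y)
\end{aligned}
$$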
def lstm_cell(xt, a_prev, c_prev, parameters):
    # Retrieve the parameters: forget gate, update (input) gate, candidate, output gate, output layer
    Wf = parameters["Wf"]
    bf = parameters["bf"]
    Wi = parameters["Wi"]
    bi = parameters["bi"]
    Wc = parameters["Wc"]
    bc = parameters["bc"]
    Wo = parameters["Wo"]
    bo = parameters["bo"]
    Wy = parameters["Wy"]
    by = parameters["by"]
    # Dimensions
    n_x, m = xt.shape
    n_y, n_a = Wy.shape
    # Stack a_prev and xt vertically into a single input for the gates
    concat = np.zeros((n_a + n_x, m))
    concat[:n_a, :] = a_prev
    concat[n_a:, :] = xt
    # Forget gate, update gate, and candidate memory value
    ft = sigmoid(np.dot(Wf, concat) + bf)
    it = sigmoid(np.dot(Wi, concat) + bi)
    cct = np.tanh(np.dot(Wc, concat) + bc)
    # New memory state
    c_next = ft * c_prev + it * cct
    # Output gate and new hidden state
    ot = sigmoid(np.dot(Wo, concat) + bo)
    a_next = ot * np.tanh(c_next)
    # Prediction for the current time step
    yt_pred = softmax(np.dot(Wy, a_next) + by)
    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    return a_next, c_next, yt_pred, cache
We have implemented one time step of the LSTM forward pass; now we run forward propagation over the whole LSTM network, which is analogous to what we did before.
def lstm_forward(x, a0, parameters):
    caches = []
    # Dimensions
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wy"].shape
    # Initialize the hidden states, memory states, and predictions
    # (c must be its own array; writing `c = a` would make them aliases of the same array)
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))
    a_next = a0
    c_next = np.zeros(a_next.shape)
    # Loop over the time steps
    for t in range(T_x):
        a_next, c_next, yt, cache = lstm_cell(x[:, :, t], a_next, c_next, parameters)
        a[:, :, t] = a_next
        y[:, :, t] = yt
        c[:, :, t] = c_next
        caches.append(cache)
    caches = (caches, x)
    return a, y, c, caches
Derivatives of the gates ($\Gamma_u$ is the update/input gate, `it` in the code):
$$
\begin{aligned}
d\Gamma_o^{\langle t \rangle} &= da_{next} * \tanh(c_{next}) * \Gamma_o^{\langle t \rangle} * (1-\Gamma_o^{\langle t \rangle})\\
d\tilde{c}^{\langle t \rangle} &= \left(dc_{next} * \Gamma_u^{\langle t \rangle} + \Gamma_o^{\langle t \rangle}(1-\tanh(c_{next})^2) * \Gamma_u^{\langle t \rangle} * da_{next}\right) * \left(1-(\tilde{c}^{\langle t \rangle})^2\right)\\
d\Gamma_u^{\langle t \rangle} &= \left(dc_{next} * \tilde{c}^{\langle t \rangle} + \Gamma_o^{\langle t \rangle}(1-\tanh(c_{next})^2) * \tilde{c}^{\langle t \rangle} * da_{next}\right) * \Gamma_u^{\langle t \rangle} * (1-\Gamma_u^{\langle t \rangle})\\
d\Gamma_f^{\langle t \rangle} &= \left(dc_{next} * c_{prev} + \Gamma_o^{\langle t \rangle}(1-\tanh(c_{next})^2) * c_{prev} * da_{next}\right) * \Gamma_f^{\langle t \rangle} * (1-\Gamma_f^{\langle t \rangle})
\end{aligned}
$$
Derivatives of the parameters (matrix products; $dW_u$ corresponds to `dWi` in the code):
$$
dW_f = d\Gamma_f^{\langle t \rangle} \begin{pmatrix} a_{prev} \\ x_t \end{pmatrix}^T \qquad
dW_u = d\Gamma_u^{\langle t \rangle} \begin{pmatrix} a_{prev} \\ x_t \end{pmatrix}^T \qquad
dW_c = d\tilde{c}^{\langle t \rangle} \begin{pmatrix} a_{prev} \\ x_t \end{pmatrix}^T \qquad
dW_o = d\Gamma_o^{\langle t \rangle} \begin{pmatrix} a_{prev} \\ x_t \end{pmatrix}^T
$$
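The remaining quantities that lstm_cell_backward also computes are restated here for completeness (my addition, consistent with the code; $W^a$ and $W^x$ denote the first $n_a$ and last $n_x$ columns of each weight matrix, i.e. the slices `[:, :n_a]` and `[:, n_a:]`):
$$
\begin{aligned}
db_f &= \sum_{batch} d\Gamma_f^{\langle t \rangle}, \quad db_u = \sum_{batch} d\Gamma_u^{\langle t \rangle}, \quad db_c = \sum_{batch} d\tilde{c}^{\langle t \rangle}, \quad db_o = \sum_{batch} d\Gamma_o^{\langle t \rangle}\\
da_{prev} &= (W_f^{a})^T d\Gamma_f^{\langle t \rangle} + (W_i^{a})^T d\Gamma_u^{\langle t \rangle} + (W_c^{a})^T d\tilde{c}^{\langle t \rangle} + (W_o^{a})^T d\Gamma_o^{\langle t \rangle}\\
dc_{prev} &= dc_{next} * \Gamma_f^{\langle t \rangle} + \Gamma_o^{\langle t \rangle}(1-\tanh(c_{next})^2) * \Gamma_f^{\langle t \rangle} * da_{next}\\
dx^{\langle t \rangle} &= (W_f^{x})^T d\Gamma_f^{\langle t \rangle} + (W_i^{x})^T d\Gamma_u^{\langle t \rangle} + (W_c^{x})^T d\tilde{c}^{\langle t \rangle} + (W_o^{x})^T d\Gamma_o^{\langle t \rangle}
\end{aligned}
$$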
def lstm_cell_backward(da_next, dc_next, cache):
    # Retrieve values from the forward-pass cache
    (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters) = cache
    # Dimensions of xt and a_next
    n_x, m = xt.shape
    n_a, m = a_next.shape
    # Derivatives of the gates and of the candidate (see the formulas above)
    dot = da_next * np.tanh(c_next) * ot * (1 - ot)
    dcct = (dc_next * it + ot * (1 - np.square(np.tanh(c_next))) * it * da_next) * (1 - np.square(cct))
    dit = (dc_next * cct + ot * (1 - np.square(np.tanh(c_next))) * cct * da_next) * it * (1 - it)
    dft = (dc_next * c_prev + ot * (1 - np.square(np.tanh(c_next))) * c_prev * da_next) * ft * (1 - ft)
    # Derivatives of the weight matrices
    concat = np.concatenate((a_prev, xt), axis=0).T
    dWf = np.dot(dft, concat)
    dWi = np.dot(dit, concat)
    dWc = np.dot(dcct, concat)
    dWo = np.dot(dot, concat)
    # Derivatives of the biases (sum over the batch dimension)
    dbf = np.sum(dft, axis=1, keepdims=True)
    dbi = np.sum(dit, axis=1, keepdims=True)
    dbc = np.sum(dcct, axis=1, keepdims=True)
    dbo = np.sum(dot, axis=1, keepdims=True)
    # Derivatives of the previous hidden state, previous memory state, and input
    da_prev = np.dot(parameters["Wf"][:, :n_a].T, dft) + np.dot(parameters["Wc"][:, :n_a].T, dcct) + np.dot(parameters["Wi"][:, :n_a].T, dit) + np.dot(parameters["Wo"][:, :n_a].T, dot)
    dc_prev = dc_next * ft + ot * (1 - np.square(np.tanh(c_next))) * ft * da_next
    dxt = np.dot(parameters["Wf"][:, n_a:].T, dft) + np.dot(parameters["Wc"][:, n_a:].T, dcct) + np.dot(parameters["Wi"][:, n_a:].T, dit) + np.dot(parameters["Wo"][:, n_a:].T, dot)
    # Collect the gradients in a dictionary
    gradients = {"dxt": dxt, "da_prev": da_prev, "dc_prev": dc_prev, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}
    return gradients
def lstm_backward(da, caches):
    # Retrieve the first cache (t = 0) to get dimensions and parameters
    caches, x = caches
    (a1, c1, a0, c0, f1, i1, cc1, o1, x1, parameters) = caches[0]
    # Dimensions of da and x1
    n_a, m, T_x = da.shape
    n_x, m = x1.shape
    # Initialize the gradients
    dx = np.zeros([n_x, m, T_x])
    da0 = np.zeros([n_a, m])
    da_prevt = np.zeros([n_a, m])
    dc_prevt = np.zeros([n_a, m])
    dWf = np.zeros([n_a, n_a + n_x])
    dWi = np.zeros([n_a, n_a + n_x])
    dWc = np.zeros([n_a, n_a + n_x])
    dWo = np.zeros([n_a, n_a + n_x])
    dbf = np.zeros([n_a, 1])
    dbi = np.zeros([n_a, 1])
    dbc = np.zeros([n_a, 1])
    dbo = np.zeros([n_a, 1])
    # Walk backwards through the time steps
    for t in reversed(range(T_x)):
        # The gradient flowing into a<t> is the cost gradient at step t plus the
        # gradient coming back from step t+1; dc_prevt carries the memory-state gradient
        gradients = lstm_cell_backward(da[:, :, t] + da_prevt, dc_prevt, caches[t])
        # Store the input gradient and accumulate the parameter gradients
        dx[:, :, t] = gradients['dxt']
        dWf = dWf + gradients['dWf']
        dWi = dWi + gradients['dWi']
        dWc = dWc + gradients['dWc']
        dWo = dWo + gradients['dWo']
        dbf = dbf + gradients['dbf']
        dbi = dbi + gradients['dbi']
        dbc = dbc + gradients['dbc']
        dbo = dbo + gradients['dbo']
        # Propagate the state gradients to the previous time step
        da_prevt = gradients['da_prev']
        dc_prevt = gradients['dc_prev']
    # Gradient with respect to the initial hidden state
    da0 = da_prevt
    # Collect all gradients in a dictionary
    gradients = {"dx": dx, "da0": da0, "dWf": dWf, "dbf": dbf, "dWi": dWi, "dbi": dbi,
                 "dWc": dWc, "dbc": dbc, "dWo": dWo, "dbo": dbo}
    return gradients
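A quick end-to-end check of the LSTM forward and backward passes on random data; the dimensions are arbitrary choices for illustration, and da is a random stand-in for the gradient of a cost with respect to the hidden states:

```python
np.random.seed(1)
n_x, n_a, n_y, m, T_x = 3, 5, 2, 10, 7
x = np.random.randn(n_x, m, T_x)
a0 = np.random.randn(n_a, m)
parameters = {"Wf": np.random.randn(n_a, n_a + n_x), "bf": np.random.randn(n_a, 1),
              "Wi": np.random.randn(n_a, n_a + n_x), "bi": np.random.randn(n_a, 1),
              "Wc": np.random.randn(n_a, n_a + n_x), "bc": np.random.randn(n_a, 1),
              "Wo": np.random.randn(n_a, n_a + n_x), "bo": np.random.randn(n_a, 1),
              "Wy": np.random.randn(n_y, n_a), "by": np.random.randn(n_y, 1)}
a, y, c, caches = lstm_forward(x, a0, parameters)
da = np.random.randn(n_a, m, T_x)        # stand-in for dJ/da from the cost
gradients = lstm_backward(da, caches)
print(gradients["dx"].shape, gradients["dWf"].shape, gradients["da0"].shape)
# (3, 10, 7) (5, 8) (5, 10)
```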