Link to the derivation article: 深度学习——循环神经网络GRU公式推导 (Deep Learning: deriving the GRU equations for recurrent neural networks)
#encoding=utf-8
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def softmax(o):
    # Exponentiate, then use the sum as the denominator so the outputs form a
    # probability distribution (shifting by max(o) only improves numerical stability).
    exp_o = np.exp(o - np.max(o))
    return exp_o / np.sum(exp_o)
class GRUCell:
    def __init__(self, W_r, W_z, W_hh, W_o, U_r, U_z, U_hh, B_r, B_z, B_hh, B_o):
        # Reset gate, update gate and candidate-state parameters:
        # W_* act on the input X, U_* act on the previous state S_{t-1}, B_* are biases.
        self.W_r = W_r
        self.W_z = W_z
        self.W_hh = W_hh
        self.U_r = U_r
        self.U_z = U_z
        self.U_hh = U_hh
        self.B_r = B_r
        self.B_z = B_z
        self.B_hh = B_hh
        # Output-layer parameters
        self.W_o = W_o
        self.B_o = B_o
        # Stacks that record the intermediate values of each forward step for backpropagation
        self.rt_stack = []
        self.zt_stack = []
        self.hht_stack = []
        self.X_stack = []
        self.St_1_stack = []
    def forward(self, X, St_1):
        self.X_stack.append(X)
        self.St_1_stack.append(St_1)
        # Reset gate
        net_r = np.dot(self.W_r.T, X) + np.dot(self.U_r.T, St_1) + self.B_r
        rt = sigmoid(net_r)
        self.rt_stack.append(rt)
        # Update gate
        net_z = np.dot(self.W_z.T, X) + np.dot(self.U_z.T, St_1) + self.B_z
        zt = sigmoid(net_z)
        self.zt_stack.append(zt)
        # Candidate state (U_hh is transposed here for consistency with the other gates
        # and with the gradient formulas in calc_gradient)
        net_hh = np.dot(self.W_hh.T, X) + np.dot(self.U_hh.T, rt * St_1) + self.B_hh
        hht = tanh(net_hh)
        self.hht_stack.append(hht)
        # New hidden state and softmax output
        St = (1 - zt) * St_1 + zt * hht
        Ot = np.dot(self.W_o.T, St) + self.B_o
        Y = softmax(Ot)
        return Y, St
    def calc_o_delta(self, Y, Y_r):
        # Softmax + cross-entropy: the error at the output layer is simply Y - Y_r
        return Y - Y_r
    def calc_delta(self, St_delta):
        # rt and St_1 are only peeked here; calc_gradient pops them afterwards
        rt = self.rt_stack[-1]
        zt = self.zt_stack.pop()
        hht = self.hht_stack.pop()
        St_1 = self.St_1_stack[-1]
        # Errors at the pre-activations of the candidate state, update gate and reset gate
        delta_net_hht = St_delta * zt * (1 - np.square(hht))
        delta_net_zt = St_delta * (hht - St_1) * zt * (1 - zt)
        delta_net_rt = np.dot(self.U_hh, delta_net_hht) * St_1 * rt * (1 - rt)
        self.delta_net_hht = delta_net_hht
        self.delta_net_zt = delta_net_zt
        self.delta_net_rt = delta_net_rt
        # Error propagated back to the previous state S_{t-1}
        St_1_delta = (St_delta * (1 - zt) + np.dot(self.U_z, delta_net_zt)
                      + rt * np.dot(self.U_hh, delta_net_hht) + np.dot(self.U_r, delta_net_rt))
        return St_1_delta
    def calc_gradient(self):
        St_1 = self.St_1_stack.pop()
        Xt = self.X_stack.pop()
        rt = self.rt_stack.pop()
        # With net = W.T x + U.T s + B, the gradient of the loss w.r.t. each weight matrix
        # is (input) * (pre-activation error)^T
        W_r_grad = np.dot(Xt, self.delta_net_rt.T)
        U_r_grad = np.dot(St_1, self.delta_net_rt.T)
        B_r_grad = self.delta_net_rt
        W_z_grad = np.dot(Xt, self.delta_net_zt.T)
        U_z_grad = np.dot(St_1, self.delta_net_zt.T)
        B_z_grad = self.delta_net_zt
        W_hh_grad = np.dot(Xt, self.delta_net_hht.T)
        U_hh_grad = np.dot((rt * St_1), self.delta_net_hht.T)
        B_hh_grad = self.delta_net_hht
        return W_r_grad, U_r_grad, B_r_grad, W_z_grad, U_z_grad, B_z_grad, W_hh_grad, U_hh_grad, B_hh_grad
    def backward(self, St_delta):
        self.calc_delta(St_delta)
        (self.W_r_grad, self.U_r_grad, self.B_r_grad,
         self.W_z_grad, self.U_z_grad, self.B_z_grad,
         self.W_hh_grad, self.U_hh_grad, self.B_hh_grad) = self.calc_gradient()
    def update(self, lr):
        # Plain gradient descent on every parameter
        self.W_r -= lr * self.W_r_grad
        self.W_z -= lr * self.W_z_grad
        self.W_hh -= lr * self.W_hh_grad
        self.U_r -= lr * self.U_r_grad
        self.U_z -= lr * self.U_z_grad
        self.U_hh -= lr * self.U_hh_grad
        self.B_r -= lr * self.B_r_grad
        self.B_z -= lr * self.B_z_grad
        self.B_hh -= lr * self.B_hh_grad
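Before unrolling several cells, a single forward/backward/update step can be sanity-checked on its own. The sketch below is only an illustration: it reuses the toy input, label and initial state from the full example at the end of the post (4-dimensional input, 2-dimensional hidden state), with randomly initialized parameters.

# Minimal single-step usage sketch for the GRUCell defined above (illustrative values)
X = np.array([[4.0], [2.0], [2.0], [1.0]])
Y_r = np.array([[1.0], [0.0]])
St_1 = np.array([[0.5], [0.5]])
cell = GRUCell(np.random.rand(4, 2), np.random.rand(4, 2), np.random.rand(4, 2),  # W_r, W_z, W_hh
               np.random.rand(2, 2),                                              # W_o
               np.random.rand(2, 2), np.random.rand(2, 2), np.random.rand(2, 2),  # U_r, U_z, U_hh
               np.random.rand(2, 1), np.random.rand(2, 1), np.random.rand(2, 1),  # B_r, B_z, B_hh
               np.random.rand(2, 1))                                              # B_o
Y, St = cell.forward(X, St_1)                            # Y: (2,1) softmax output, St: (2,1) new state
St_delta = np.dot(cell.W_o, cell.calc_o_delta(Y, Y_r))   # project the output error back onto St
cell.backward(St_delta)                                  # compute and store the parameter gradients
cell.update(0.1)                                         # one gradient-descent step with lr = 0.1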
Next, we build a layered GRU on top of this cell, taking four network units as the example:
Input: $X_1$
Label: $Y_{r1}$
Basic idea: given the GRUCell we have already defined, we only need to run the GRUCell forward four times, feeding the same input $X_1$ at every step.
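Written out, the unrolled forward pass is just the recurrence $(Y_t, S_t) = \mathrm{GRUCell}(X_1, S_{t-1})$ for $t = 1, 2, 3, 4$, starting from an initial state $S_0$; only the final output $Y_4$ is compared against the label $Y_{r1}$, and the resulting error is then propagated backwards through all four cells.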
#encoding=utf-8
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def tanh(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def softmax(o):
    # Exponentiate, then use the sum as the denominator so the outputs form a
    # probability distribution (shifting by max(o) only improves numerical stability).
    exp_o = np.exp(o - np.max(o))
    return exp_o / np.sum(exp_o)
class GRUCell:
    def __init__(self, W_r, W_z, W_hh, W_o, U_r, U_z, U_hh, B_r, B_z, B_hh, B_o, lr):
        # Gate and candidate-state parameters: W_* act on the input X, U_* on the
        # previous state S_{t-1}, B_* are the biases; W_o and B_o form the output layer.
        self.W_r = W_r
        self.W_z = W_z
        self.W_hh = W_hh
        self.U_r = U_r
        self.U_z = U_z
        self.U_hh = U_hh
        self.B_r = B_r
        self.B_z = B_z
        self.B_hh = B_hh
        self.W_o = W_o
        self.B_o = B_o
        # Stacks of intermediate forward values, one entry per time step
        self.rt_stack = []
        self.zt_stack = []
        self.hht_stack = []
        # Stacks of backward errors and of per-step gradients
        self.delta_net_rt_stack = []
        self.delta_net_zt_stack = []
        self.delta_net_hht_stack = []
        self.delta_St_1_stack = []
        self.X_stack = []
        self.St_1_stack = []
        self.grad_W_r = []
        self.grad_W_z = []
        self.grad_W_hh = []
        self.grad_U_r = []
        self.grad_U_z = []
        self.grad_U_hh = []
        self.grad_B_r = []
        self.grad_B_z = []
        self.grad_B_hh = []
        self.lr = lr
    def forward(self, X, St_1):
        self.X_stack.append(X)
        self.St_1_stack.append(St_1)
        # Reset gate
        net_r = np.dot(self.W_r.T, X) + np.dot(self.U_r.T, St_1) + self.B_r
        rt = sigmoid(net_r)
        self.rt_stack.append(rt)
        # Update gate
        net_z = np.dot(self.W_z.T, X) + np.dot(self.U_z.T, St_1) + self.B_z
        zt = sigmoid(net_z)
        self.zt_stack.append(zt)
        # Candidate state (U_hh transposed for consistency with the other gates)
        net_hh = np.dot(self.W_hh.T, X) + np.dot(self.U_hh.T, rt * St_1) + self.B_hh
        hht = tanh(net_hh)
        self.hht_stack.append(hht)
        # New hidden state and softmax output
        St = (1 - zt) * St_1 + zt * hht
        Ot = np.dot(self.W_o.T, St) + self.B_o
        Y = softmax(Ot)
        return Y, St
    def calc_o_delta(self, Y, Y_r):
        # Softmax + cross-entropy: the error at the output layer is Y - Y_r
        return Y - Y_r
    def update_o(self, delta_o, St):
        # Note: no extra activation function is applied at the output layer, so net_o = o
        self.W_o -= self.lr * np.dot(St, delta_o.T)
        self.B_o -= self.lr * delta_o
    def calc_delta(self, St_delta):
        # rt and St_1 are only peeked here; calc_gradient pops them afterwards
        rt = self.rt_stack[-1]
        zt = self.zt_stack.pop()
        hht = self.hht_stack.pop()
        St_1 = self.St_1_stack[-1]
        # Errors at the pre-activations of the candidate state, update gate and reset gate
        delta_net_hht = St_delta * zt * (1 - np.square(hht))
        delta_net_zt = St_delta * (hht - St_1) * zt * (1 - zt)
        delta_net_rt = np.dot(self.U_hh, delta_net_hht) * St_1 * rt * (1 - rt)
        # Error propagated back to the previous state S_{t-1}
        St_1_delta = (St_delta * (1 - zt) + np.dot(self.U_z, delta_net_zt)
                      + rt * np.dot(self.U_hh, delta_net_hht) + np.dot(self.U_r, delta_net_rt))
        self.delta_net_hht_stack.append(delta_net_hht)
        self.delta_net_zt_stack.append(delta_net_zt)
        self.delta_net_rt_stack.append(delta_net_rt)
        self.delta_St_1_stack.append(St_1_delta)
        return St_1_delta
    def calc_gradient(self):
        St_1 = self.St_1_stack.pop()
        Xt = self.X_stack.pop()
        rt = self.rt_stack.pop()
        delta_net_rt = self.delta_net_rt_stack.pop()
        delta_net_zt = self.delta_net_zt_stack.pop()
        delta_net_hht = self.delta_net_hht_stack.pop()
        # With net = W.T x + U.T s + B, the weight gradients are (input) * (pre-activation error)^T
        W_r_grad = np.dot(Xt, delta_net_rt.T)
        U_r_grad = np.dot(St_1, delta_net_rt.T)
        B_r_grad = delta_net_rt
        W_z_grad = np.dot(Xt, delta_net_zt.T)
        U_z_grad = np.dot(St_1, delta_net_zt.T)
        B_z_grad = delta_net_zt
        W_hh_grad = np.dot(Xt, delta_net_hht.T)
        U_hh_grad = np.dot((rt * St_1), delta_net_hht.T)
        B_hh_grad = delta_net_hht
        # Accumulate the per-step gradients; they are applied together in update()
        self.grad_W_r.append(W_r_grad)
        self.grad_U_r.append(U_r_grad)
        self.grad_B_r.append(B_r_grad)
        self.grad_W_z.append(W_z_grad)
        self.grad_U_z.append(U_z_grad)
        self.grad_B_z.append(B_z_grad)
        self.grad_W_hh.append(W_hh_grad)
        self.grad_U_hh.append(U_hh_grad)
        self.grad_B_hh.append(B_hh_grad)
    def backward(self, St_delta):
        St_1_delta = self.calc_delta(St_delta)
        self.calc_gradient()
        # After the gradients have been computed, return the error of the previous
        # time step so it can be fed into the next backward call
        return St_1_delta
    def update(self):
        # Apply every accumulated per-step gradient with plain gradient descent
        for W_r_grad in self.grad_W_r:
            self.W_r -= self.lr * W_r_grad
        for W_z_grad in self.grad_W_z:
            self.W_z -= self.lr * W_z_grad
        for W_hh_grad in self.grad_W_hh:
            self.W_hh -= self.lr * W_hh_grad
        for U_r_grad in self.grad_U_r:
            self.U_r -= self.lr * U_r_grad
        for U_z_grad in self.grad_U_z:
            self.U_z -= self.lr * U_z_grad
        for U_hh_grad in self.grad_U_hh:
            self.U_hh -= self.lr * U_hh_grad
        for B_r_grad in self.grad_B_r:
            self.B_r -= self.lr * B_r_grad
        for B_z_grad in self.grad_B_z:
            self.B_z -= self.lr * B_z_grad
        for B_hh_grad in self.grad_B_hh:
            self.B_hh -= self.lr * B_hh_grad
        # Clear the accumulated gradients so stale gradients are not re-applied
        # on the next training iteration
        for grad_list in (self.grad_W_r, self.grad_W_z, self.grad_W_hh,
                          self.grad_U_r, self.grad_U_z, self.grad_U_hh,
                          self.grad_B_r, self.grad_B_z, self.grad_B_hh):
            grad_list.clear()
def gradient_check():
    # A 4-dimensional toy input, a 2-dimensional one-hot label and a 2-dimensional initial state
    X = np.array([[4.0], [2.0], [2.0], [1.0]])
    Y_r = np.array([[1], [0]])
    St_1 = np.array([[0.5], [0.5]])
    W_r = np.random.rand(4, 2)
    U_r = np.random.rand(2, 2)
    W_z = np.random.rand(4, 2)
    U_z = np.random.rand(2, 2)
    W_hh = np.random.rand(4, 2)
    U_hh = np.random.rand(2, 2)
    B_r = np.random.rand(2, 1)
    B_z = np.random.rand(2, 1)
    B_hh = np.random.rand(2, 1)
    W_o = np.random.rand(2, 2)
    B_o = np.random.rand(2, 1)
    grucell = GRUCell(W_r, W_z, W_hh, W_o, U_r, U_z, U_hh, B_r, B_z, B_hh, B_o, 0.1)
    for i in range(1000):
        St = St_1
        # Run the cell forward four times; Y and St then belong to the last unit
        for j in range(4):
            Y, St = grucell.forward(X, St)
        # Backward pass: first compute the error at the output layer
        delta_o = grucell.calc_o_delta(Y, Y_r)
        # Project the output error back onto the last hidden state (since Ot = W_o.T St)
        St_delta = np.dot(grucell.W_o, delta_o)
        # Update W_o and B_o
        grucell.update_o(delta_o, St)
        # Four backward steps, one per unit
        for j in range(4):
            St_delta = grucell.backward(St_delta)
        # After every unit has been processed, sum up the gradients and update
        grucell.update()
    print("the Y is ")
    print(Y)

gradient_check()
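Despite its name, gradient_check above is really a small training loop rather than a numerical gradient check. For completeness, a minimal numerical check of the output-layer gradient might look like the sketch below; the helper name numerical_check_W_o and the cross-entropy loss L = -sum(Y_r * log(Y)) are illustrative assumptions, not part of the original post.

# Minimal numerical check of dL/dW_o, assuming cross-entropy loss L = -sum(Y_r * log(Y)).
# The analytic gradient St * (Y - Y_r)^T follows from Ot = W_o.T St and the softmax output.
# (Each forward call pushes onto the cell's internal stacks; harmless for a one-off check.)
def numerical_check_W_o(cell, X, St_1, Y_r, eps=1e-5):
    Y, St = cell.forward(X, St_1)
    analytic = np.dot(St, cell.calc_o_delta(Y, Y_r).T)
    numeric = np.zeros_like(cell.W_o)
    for i in range(cell.W_o.shape[0]):
        for j in range(cell.W_o.shape[1]):
            cell.W_o[i, j] += eps
            loss_plus = -np.sum(Y_r * np.log(cell.forward(X, St_1)[0]))
            cell.W_o[i, j] -= 2 * eps
            loss_minus = -np.sum(Y_r * np.log(cell.forward(X, St_1)[0]))
            cell.W_o[i, j] += eps                      # restore the original weight
            numeric[i, j] = (loss_plus - loss_minus) / (2 * eps)
    return analytic, numeric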