Deep Learning: Implementing GRUCell and GRU with NumPy

For the derivation behind this code, see the companion article "Deep Learning: GRU Formula Derivation for Recurrent Neural Networks".

1. Building the GRUCell

#encoding=utf-8
import numpy as np
def sigmoid(x):
    return 1/(1+np.exp(-x))
def tanh(x):
    return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
def softmax(o):
    #shift by the max for numerical stability, then normalize the exponentials
    e = np.exp(o - np.max(o))
    return e / np.sum(e)
class GRUCell:
    def __init__(self,W_r,W_z,W_hh,W_o,U_r,U_z,U_hh,B_r,B_z,B_hh,B_o):
        self.W_r = W_r
        self.W_z = W_z
        self.W_hh = W_hh
        self.U_r = U_r
        self.U_z = U_z
        self.U_hh = U_hh
        self.B_r  = B_r
        self.B_z = B_z
        self.B_hh = B_hh
        self.W_o = W_o
        self.B_o = B_o
        self.rt_stack = []
        self.zt_stack = []
        self.hht_stack = []
        self.X_stack = []
        self.St_1_stack = []
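    # One GRU time step, as implemented in forward() below:
    #   r_t  = sigmoid(W_r.T x_t + U_r.T s_{t-1} + B_r)           reset gate
    #   z_t  = sigmoid(W_z.T x_t + U_z.T s_{t-1} + B_z)           update gate
    #   hh_t = tanh(W_hh.T x_t + U_hh (r_t * s_{t-1}) + B_hh)     candidate state
    #   s_t  = (1 - z_t) * s_{t-1} + z_t * hh_t                   new state
    #   O_t  = W_o.T s_t + B_o,  Y = softmax(O_t)                 output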
    def forward(self,X,St_1):
        self.X_stack.append(X)
        self.St_1_stack.append(St_1)
        net_r = np.dot(self.W_r.T, X) + np.dot(self.U_r.T,St_1) + self.B_r
        rt = sigmoid(net_r)
        self.rt_stack.append(rt)
        net_z = np.dot(self.W_z.T,X) + np.dot(self.U_z.T,St_1) + self.B_z
        zt = sigmoid(net_z)
        self.zt_stack.append(zt)
        net_hh = np.dot(self.W_hh.T,X) + np.dot(self.U_hh,rt*St_1) + self.B_hh
        hht = tanh(net_hh)
        self.hht_stack.append(hht)
        St = (1-zt)*St_1 + zt * hht
        Ot = np.dot(self.W_o.T,St) + self.B_o
        Y = softmax(Ot)
        return Y,St
    def calc_o_delta(self,Y,Y_r):
        return Y - Y_r
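    # calc_delta: given dL/ds_t (St_delta), compute the deltas of the pre-activation
    # nets of the candidate state, the update gate and the reset gate for this step,
    # store them on the cell, and return dL/ds_{t-1} for the previous time step.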
    def calc_delta(self,St_delta):
        rt = self.rt_stack[-1]
        zt = self.zt_stack.pop()
        hht = self.hht_stack.pop()
        St_1 = self.St_1_stack[-1]
        delta_net_hht = St_delta * zt * (1 - np.square(hht))
        delta_net_zt = St_delta * (hht - St_1) * zt * (1 - zt)
        delta_net_rt = delta_net_hht * np.dot(self.U_hh,St_1)* rt * (1-rt)
        self.delta_net_hht = delta_net_hht
        self.delta_net_zt = delta_net_zt
        self.delta_net_rt = delta_net_rt
        St_1_delta = St_delta * (1-zt)+np.dot(self.U_z,delta_net_zt)+np.dot(self.U_hh,(delta_net_hht*rt))+np.dot(self.U_r,delta_net_rt)                      
        return St_1_delta
    def calc_gradient(self):
        St_1 = self.St_1_stack.pop()
        Xt = self.X_stack.pop()
        rt = self.rt_stack.pop()
        W_r_grad = np.dot(Xt,self.delta_net_rt.T)
        U_r_grad = np.dot(St_1,self.delta_net_rt.T)
        B_r_grad = self.delta_net_rt
        W_z_grad = np.dot(Xt,self.delta_net_zt.T)
        U_z_grad = np.dot(St_1,self.delta_net_zt.T)
        B_z_grad = self.delta_net_zt
        W_hh_grad = np.dot(Xt,self.delta_net_hht.T)
        U_hh_grad = np.dot((rt*St_1),self.delta_net_hht.T)
        B_hh_grad = self.delta_net_hht
        return W_r_grad,U_r_grad,B_r_grad,W_z_grad,U_z_grad,B_z_grad,W_hh_grad,U_hh_grad,B_hh_grad
    def backward(self,St_delta):
        self.calc_delta(St_delta)
        self.W_r_grad,self.U_r_grad,self.B_r_grad,self.W_z_grad,self.U_z_grad,self.B_z_grad,self.W_hh_grad,self.U_hh_grad,self.B_hh_grad=self.calc_gradient()
    def update(self,lr):
        self.W_r -= lr * self.W_r_grad
        self.W_z -= lr * self.W_z_grad
        self.W_hh -= lr * self.W_hh_grad
        self.U_r -= lr * self.U_r_grad
        self.U_z -= lr * self.U_z_grad
        self.U_hh -= lr * self.U_hh_grad
        self.B_r -= lr * self.B_r_grad
        self.B_z -= lr * self.B_z_grad
        self.B_hh -= lr * self.B_hh_grad
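To make the cell's interface concrete, here is a minimal single-step usage sketch. The parameter shapes (a 4-dimensional input, a 2-dimensional state and output), the target vector, the learning rate, and the extra chain-rule step through W_o are illustrative assumptions, not part of the original code:

# Minimal usage sketch; assumes numpy (np) and the GRUCell class above are in scope.
np.random.seed(0)
cell = GRUCell(W_r=np.random.rand(4,2), W_z=np.random.rand(4,2), W_hh=np.random.rand(4,2),
               W_o=np.random.rand(2,2), U_r=np.random.rand(2,2), U_z=np.random.rand(2,2),
               U_hh=np.random.rand(2,2), B_r=np.random.rand(2,1), B_z=np.random.rand(2,1),
               B_hh=np.random.rand(2,1), B_o=np.random.rand(2,1))
X    = np.array([[4.0],[2.0],[2.0],[1.0]])   # one input column vector
St_1 = np.array([[0.5],[0.5]])               # initial hidden state
Y_r  = np.array([[1.0],[0.0]])               # assumed one-hot target

Y, St = cell.forward(X, St_1)                # one forward step
o_delta  = cell.calc_o_delta(Y, Y_r)         # error at the output layer
St_delta = np.dot(cell.W_o, o_delta)         # chain through Ot = W_o.T @ St (assumption)
cell.backward(St_delta)                      # per-step deltas and parameter gradients
cell.update(lr=0.1)                          # gradient-descent update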

2. Building the GRU network from the GRU cell

As an introduction, the GRU network built here unrolls the cell over four time steps:
Input: X_1
Label: Y_r1
Basic idea: with the GRUCell already defined, we only need to run the cell forward four times, feeding the same input X_1 at every step (see the sketch below; the full implementation is in 2.1).
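A minimal sketch of this unrolling; the helper name unrolled_forward and its arguments are illustrative placeholders, not part of the original implementation:

# Sketch of the unrolled forward pass: one GRUCell (shared weights) is applied
# four times, and the same input X_1 is fed at every step.
def unrolled_forward(grucell, X_1, St_1, steps=4):
    St = St_1
    for t in range(steps):
        Y, St = grucell.forward(X_1, St)   # state carried forward, input repeated
    return Y, St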

2.1 Code implementation

#encoding=utf-8
import numpy as np
def sigmoid(x):
    return 1/(1+np.exp(-x))
def tanh(x):
    return (np.exp(x)-np.exp(-x))/(np.exp(x)+np.exp(-x))
def softmax(o):
    #shift by the max for numerical stability, then normalize the exponentials
    e = np.exp(o - np.max(o))
    return e / np.sum(e)
class GRUCell:
    def __init__(self,W_r,W_z,W_hh,W_o,U_r,U_z,U_hh,B_r,B_z,B_hh,B_o,lr):
        self.W_r = W_r
        self.W_z = W_z
        self.W_hh = W_hh
        self.U_r = U_r
        self.U_z = U_z
        self.U_hh = U_hh
        self.B_r  = B_r
        self.B_z = B_z
        self.B_hh = B_hh
        self.W_o = W_o
        self.B_o = B_o
        
        self.rt_stack = []
        self.zt_stack = []
        self.hht_stack = []
        self.delta_net_rt_stack = []
        self.delta_net_zt_stack = []
        self.delta_net_hht_stack = []
        self.delta_St_1_stack = []
        
        self.X_stack = []
        self.St_1_stack = []
        self.grad_W_r = []
        self.grad_W_z = []
        self.grad_W_hh = []
        self.grad_U_r = []
        self.grad_U_z = []
        self.grad_U_hh = []
        self.grad_B_r  = []
        self.grad_B_z = []
        self.grad_B_hh = []
        
        self.lr = lr
        
    def forward(self,X,St_1):
        self.X_stack.append(X)
        self.St_1_stack.append(St_1)
        net_r = np.dot(self.W_r.T, X) + np.dot(self.U_r.T,St_1) + self.B_r
        rt = sigmoid(net_r)
        self.rt_stack.append(rt)
        net_z = np.dot(self.W_z.T,X) + np.dot(self.U_z.T,St_1) + self.B_z
        zt = sigmoid(net_z)
        self.zt_stack.append(zt)
        net_hh = np.dot(self.W_hh.T,X) + np.dot(self.U_hh,rt*St_1) + self.B_hh
        hht = tanh(net_hh)
        self.hht_stack.append(hht)
        St = (1-zt)*St_1 + zt * hht
        Ot = np.dot(self.W_o.T,St) + self.B_o
        
        Y = softmax(Ot)
        return Y,St
    def calc_o_delta(self,Y,Y_r):
        return Y - Y_r
    def update_o(self,delta_o,St):
        #Note: no activation function is applied to the output here, so net_o = o
        self.W_o -= self.lr * np.dot(St,delta_o.T)
        self.B_o -= self.lr * delta_o
    def calc_delta(self,St_delta):
        rt = self.rt_stack[-1]
        zt = self.zt_stack.pop()
        hht = self.hht_stack.pop()
        St_1 = self.St_1_stack[-1]
        delta_net_hht = St_delta * zt * (1 - np.square(hht))
        delta_net_zt = St_delta * (hht - St_1) * zt * (1 - zt)
        delta_net_rt = delta_net_hht * np.dot(self.U_hh,St_1)* rt * (1-rt)
        St_1_delta = St_delta * (1-zt)+np.dot(self.U_z,delta_net_zt)+np.dot(self.U_hh,(delta_net_hht*rt))+np.dot(self.U_r,delta_net_rt)                      
        
        self.delta_net_hht_stack.append(delta_net_hht)
        self.delta_net_zt_stack.append(delta_net_zt)
        self.delta_net_rt_stack.append(delta_net_rt)
        self.delta_St_1_stack.append(St_1_delta)
        return St_1_delta
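    # calc_gradient: pop the forward values cached for the current step and append
    # this step's parameter gradients to the accumulation lists; update() later
    # applies every accumulated gradient after the full backward pass.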
    def calc_gradient(self):
        St_1 = self.St_1_stack.pop()
        Xt = self.X_stack.pop()
        rt = self.rt_stack.pop()
        delta_net_rt = self.delta_net_rt_stack.pop()
        delta_net_zt = self.delta_net_zt_stack.pop()
        delta_net_hht = self.delta_net_hht_stack.pop()
        
        W_r_grad = np.dot(Xt,delta_net_rt.T)
        U_r_grad = np.dot(St_1,delta_net_rt.T)
        B_r_grad = delta_net_rt
        W_z_grad = np.dot(Xt,delta_net_zt.T)
        U_z_grad = np.dot(St_1,delta_net_zt.T)
        B_z_grad = delta_net_zt
        W_hh_grad = np.dot(Xt,delta_net_hht.T)
        U_hh_grad = np.dot((rt*St_1),delta_net_hht.T)
        B_hh_grad = delta_net_hht
        
        self.grad_W_r.append(W_r_grad)
        self.grad_U_r.append(U_r_grad)
        self.grad_B_r.append(B_r_grad)

        self.grad_W_z.append(W_z_grad)
        self.grad_U_z.append(U_z_grad)
        self.grad_B_z.append(B_z_grad)
        
        self.grad_W_hh.append(W_hh_grad)
        self.grad_U_hh.append(U_hh_grad)
        self.grad_B_hh.append(B_hh_grad)
        
        
    def backward(self,St_delta):
        St_1_delta = self.calc_delta(St_delta)
        self.calc_gradient()
        #After computing this step's gradients, return the previous step's error so it can be fed into the next backward call
        return St_1_delta
    def update(self):
        #Apply the gradient accumulated at every unrolled time step
        for W_r_grad in self.grad_W_r:
            self.W_r -= self.lr * W_r_grad
        for W_z_grad in self.grad_W_z:
            self.W_z -= self.lr * W_z_grad
        for W_hh_grad in self.grad_W_hh:
            self.W_hh -= self.lr * W_hh_grad
        for U_r_grad in self.grad_U_r:
            self.U_r -= self.lr * U_r_grad
        for U_z_grad in self.grad_U_z:
            self.U_z -= self.lr * U_z_grad
        for U_hh_grad in self.grad_U_hh:
            self.U_hh -= self.lr * U_hh_grad
        for B_r_grad in self.grad_B_r:
            self.B_r -= self.lr * B_r_grad
        for B_z_grad in self.grad_B_z:
            self.B_z -= self.lr * B_z_grad
        for B_hh_grad in self.grad_B_hh:
            self.B_hh -= self.lr * B_hh_grad
        #Clear the accumulated gradients so they are not applied again on the next iteration
        self.grad_W_r, self.grad_W_z, self.grad_W_hh = [], [], []
        self.grad_U_r, self.grad_U_z, self.grad_U_hh = [], [], []
        self.grad_B_r, self.grad_B_z, self.grad_B_hh = [], [], []

def gradient_check():
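    # Despite its name, this routine trains the 4-step unrolled GRU on a single
    # (X, Y_r) pair for 1000 iterations and prints Y after each iteration.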
    X = np.array([[4.0],[2.0],[2.0],[1.0]])
    Y_r = np.array([[1],[0]])
    St_1 = np.array([[0.5],[0.5]])
    
    W_r = np.random.rand(4,2)
    U_r = np.random.rand(2,2)
    W_z = np.random.rand(4,2)
    U_z = np.random.rand(2,2)
    W_hh = np.random.rand(4,2)
    U_hh = np.random.rand(2,2)
    B_r = np.random.rand(2,1)
    B_z = np.random.rand(2,1)
    B_hh = np.random.rand(2,1)
    W_o = np.random.rand(2,2)
    B_o = np.random.rand(2,1)
    grucell = GRUCell(W_r,W_z,W_hh,W_o,U_r,U_z,U_hh,B_r,B_z,B_hh,B_o,0.1)
    for i in range(1000):
        St = St_1
        for j in range(4):
            Y,St = grucell.forward(X,St)
        #After the four forward steps, Y and St are the output and state of the last unit; now start backpropagation
        #First compute the error at the output layer
        o_delta = grucell.calc_o_delta(Y,Y_r)
        #Chain through the output layer Ot = W_o.T @ St to get the delta of St (before W_o is updated)
        St_delta = np.dot(grucell.W_o,o_delta)
        #Update W_o and B_o
        grucell.update_o(o_delta,St)
        #Backpropagate through the four time steps (BPTT)
        for j in range(4):
            St_delta = grucell.backward(St_delta)
        #After the gradients of all steps have been computed, apply the accumulated updates
        grucell.update()
        
        print("the Y is ")
        print(Y)
gradient_check()
