Building a Deep Learning Framework (6): Implementing an LSTM Network

The previous section covered building and applying an MLP network and publishing the framework; this section implements building and applying an LSTM network.
Code download: xhpxiaohaipeng/xhp_flow_frame
1. How the LSTM works
My master's thesis already gives a detailed analysis of the LSTM and a full derivation of its equations; the LSTM code in this article follows that derivation, so the derivation is not repeated here. Interested readers can consult the thesis.
The key points are summarized below:
(1) The LSTM cell, which contains three gates: a forget gate, an input gate, and an output gate.
[Figure 1: structure of an LSTM cell]
The forward pass of a single LSTM cell is as follows:
(i) Forget gate:
$$f_t = \sigma(W_f \cdot [x_t, h_{t-1}] + b_f)$$
(ii) Input gate:
$$i_t = \sigma(W_i \cdot [x_t, h_{t-1}] + b_i), \qquad \tilde{c}_t = \tanh(W_c \cdot [x_t, h_{t-1}] + b_c)$$
(iii) Output gate and state updates:
$$o_t = \sigma(W_o \cdot [x_t, h_{t-1}] + b_o), \qquad s_t = i_t \odot \tilde{c}_t + f_t \odot s_{t-1}, \qquad h_t = o_t \odot \tanh(s_t)$$
Here $[x_t, h_{t-1}]$ is the concatenation of the current input with the previous hidden state and $\odot$ is element-wise multiplication; these are exactly the operations implemented in the code below (the original figures showed the same equations).
(2) The LSTM network, which is built from multiple LSTM cells; each cell receives the current input x together with the cell state C and hidden state h output by the previous cell.
[Figure 5: an LSTM network unrolled over time]
The backpropagation formulas are derived as follows:
(I) For a network made of two LSTM cells:
[Figures 6-7: backpropagation derivation for two LSTM cells]

(II) For a network made of N LSTM cells:
[Figure 8: backpropagation derivation for N LSTM cells]
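The detailed derivation is in the figures above (and in the thesis). As a compact sketch of the idea that the backward code below follows, written here for reference rather than as the exact derivation: a parameter $W$ shared by all cells receives a gradient contribution from every time step, and an earlier step reaches the loss only through the chain of hidden states, so the per-step factors are multiplied along that chain and the contributions of all steps are summed:

$$\frac{\partial L}{\partial W} \;=\; \sum_{t=1}^{N} \frac{\partial L}{\partial h_N} \left(\prod_{k=t+1}^{N} \frac{\partial h_k}{\partial h_{k-1}}\right) \frac{\partial h_t}{\partial W}$$

This is the pattern the LSTM node's backward() implements below: the h_prev gradients of successive cells are multiplied together, and the per-cell weight gradients are then accumulated.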
2. LSTM code implementation
(1) Forward and backward passes of a single LSTM cell.

class LSTMcell():

    def __init__(self, x, wf, wi, wc, wo, bf, bi, bc, bo, input_size, hidden_size, s_prev=None, h_prev=None):
        super(LSTMcell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        """
        判断输入变量的特征大小是否正确
        """
        assert x.shape[2] == input_size ,'input expect size:{},but get size:{}!!'.format(input_size,x.shape[2])
        """
        初始化计算变量
        """
        self.f = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.i = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.c = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.o = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.s = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.h = np.zeros((x.shape[0], x.shape[1], hidden_size))
        self.xc = np.zeros((x.shape[0], x.shape[1], hidden_size + input_size))

        self.wf = wf
        self.wi = wi
        self.wc = wc
        self.wo = wo
        """
        统一将偏置变量设为一维变量
        """
        self.bf = bf.squeeze()
        self.bi = bi.squeeze()
        self.bc = bc.squeeze()
        self.bo = bo.squeeze()


        self.h_prev = h_prev
        self.s_prev = s_prev

        self.gradients = {}
        self.x = x

    def sigmoid(self,x):

        return 1. / (1 + np.exp(-x))

    def forward(self):

        """
        如果输入的第一个LSTM细胞,初始化隐藏状态向量和细胞状态向量
        """
        if self.s_prev is None:
            self.s_prev = np.zeros((self.x.shape[0], self.x.shape[1], self.hidden_size))
        if self.h_prev is None:
            self.h_prev = np.zeros((self.x.shape[0], self.x.shape[1], self.hidden_size))

        """
        LSTM细胞前向计算
        """
        self.xc = np.concatenate((self.x,self.h_prev),axis=2)
        self.f = self.sigmoid(np.matmul(self.xc,self.wf) + self.bf)
        self.i = self.sigmoid(np.dot(self.xc,self.wi) + self.bi)
        self.c = np.tanh(np.dot(self.xc,self.wc) + self.bc)
        self.s = self.c*self.i + self.s_prev*self.f
        self.o = self.sigmoid(np.dot(self.xc,self.wo) + self.bo)
        self.h = np.tanh(self.s)*self.o



    def diff_sigmoid(self, x):

        return (1. - x) * x

    def diff_tanh(self, x):

        return 1. - x ** 2

    def backward(self):

        """
        LSTM细胞反向梯度计算,基于乘法运算求导和链式法则求导
        """
        """
        公共的梯度
        """
        ds = self.diff_tanh(np.tanh(self.s))
        """
        各梯度
        """
        df = self.s_prev * self.diff_sigmoid(self.f) * self.o * ds
        di = self.c * self.diff_sigmoid(self.i) * self.o * ds
        dc = self.i * self.diff_tanh(self.c) * self.o * ds
        do = np.tanh(self.s) * self.diff_sigmoid(self.o)  # dh/do = tanh(s), since h = o * tanh(s)

        dxc = self.o * self.diff_tanh(self.c)*(self.s_prev * self.diff_sigmoid(self.f) * self.wf + \
                                      self.i*self.diff_tanh(self.c)*self.wc + self.c * \
                                      self.diff_sigmoid(self.i)*self.wi ) + np.tanh(self.s) * \
                                      self.diff_sigmoid(self.o)*self.wo

        ds_prev = self.o * ds * self.f
        """
        取一个batch_size梯度的平均值作为最后的梯度值
        """
        self.xc = np.concatenate((self.x, self.h_prev), axis=2)
        self.xc = self.xc.transpose(0, 2, 1)
        self.gradients['wf'] = np.mean(np.multiply(self.xc,df),axis=0,keepdims=False)
        self.gradients['wi'] = np.mean(np.multiply(self.xc,di),axis=0,keepdims=False)
        self.gradients['wc'] = np.mean(np.multiply(self.xc,dc),axis=0,keepdims=False)
        self.gradients['wo'] = np.mean(np.multiply(self.xc,do),axis=0,keepdims=False)

        self.gradients['bf'] = np.mean(df,axis=0,keepdims=False)
        self.gradients['bi'] = np.mean(di,axis=0,keepdims=False)
        self.gradients['bc'] = np.mean(dc,axis=0,keepdims=False)
        self.gradients['bo'] = np.mean(do,axis=0,keepdims=False)

        self.gradients['xc'] = np.mean(dxc,axis=0,keepdims=False)
        self.gradients['x'] = self.gradients['xc'][:self.x.shape[2]]
        self.gradients['h_prev'] = self.gradients['xc'][self.x.shape[2]:]
        self.gradients['s_prev'] = ds_prev
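
A quick sanity check of the cell (a minimal sketch; the small sizes are made up for illustration, and the weight shapes follow the concatenation [x, h_prev] used above):

import numpy as np

np.random.seed(0)
batch, input_size, hidden_size = 4, 3, 5
# One time step: the cell expects an input of shape (batch, 1, input_size).
x = np.random.randn(batch, 1, input_size)
# Each weight maps the concatenated [x, h_prev] (input_size + hidden_size) to hidden_size.
wf, wi, wc, wo = [np.random.randn(input_size + hidden_size, hidden_size) * 0.1 for _ in range(4)]
bf, bi, bc, bo = [np.zeros(hidden_size) for _ in range(4)]

cell = LSTMcell(x, wf, wi, wc, wo, bf, bi, bc, bo, input_size, hidden_size)
cell.forward()
print(cell.h.shape)                # (4, 1, 5): hidden state for this step
cell.backward()
print(cell.gradients['wf'].shape)  # (8, 5): averaged over the batch
print(cell.gradients['x'].shape)   # (3, 5)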

(2) Forward and backward passes of the LSTM network, built from the cells above.

class LSTM(Node):

    def __init__(self, input_x, wf, wi, wc, wo, bf, bi, bc, bo, input_size, hidden_size,
                 h_prev = None,s_prev = None,name='LSTM', is_trainable=False):
        Node.__init__(self, [input_x, wf, wi, wc, wo, bf, bi, bc, bo], \
                      name=name, is_trainable=is_trainable)

        """
        初始化变量值
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.input_x = input_x

        self.wf = wf
        self.wi = wi
        self.wc = wc
        self.wo = wo

        self.bf = bf
        self.bi = bi
        self.bc = bc
        self.bo = bo
        """
        用来传递初始细胞状态和隐藏状态
        """
        self.h_ = h_prev
        self.s_ = s_prev
    def forward(self):

        assert self.input_x.value.ndim == 3, 'expected a 3-dim input, but got a {}-dim input!'.format(self.input_x.value.ndim)

        """
        定义存储LSTM细胞的列表容器,不能在init里定义,
        否则所有输入的LSTM细胞都会被保存在列表里,
        不会随着输入的更新而清空并更新列表
        """
        self.lstm_node_list = []
        """
        初始细胞状态和隐藏状态不能在init里定义,
        否则上一个输入的最后一个LSTM细胞的细胞
        状态和隐藏状态输出会被记住,并用于下一个
        输入的初始细胞状态和隐藏状态输入,这会造成无法训练。
        """
        self.s_prev = self.s_
        self.h_prev = self.h_
        """
        按照输入变量依次填入LSTM细胞
        """
        for i in range(self.input_x.value.shape[1]):
            """
            把前一个LSTM细胞输出的隐藏状态和细胞状态
            传递给下一个LSTM细胞
            """
            if len(self.lstm_node_list) > 0:
                self.s_prev = self.lstm_node_list[i-1].s
                self.h_prev = self.lstm_node_list[i-1].h
            """
            按照输入数据的顺序依次填入LSTM细胞
            """
            self.lstm_node_list.append(LSTMcell(self.input_x.value[:,i,:][:,None,:], self.wf.value, self.wi.value, self.wc.value, self.wo.value, \
                         self.bf.value, self.bi.value, self.bc.value, self.bo.value, self.input_size, self.hidden_size, self.s_prev, self.h_prev))
            """
            LSTM细胞进行前向计算
            """
            self.lstm_node_list[i].forward()
            """
            合并LSTM细胞的输出结果作为LSTM的输出
            """
            if i == 0:
                self.value = self.lstm_node_list[i].h
            else:
                self.value = np.concatenate((self.value, self.lstm_node_list[i].h), axis=1)

    def backward(self):
        """
        初始化各个梯度值为0
        """
        self.gradients[self.wf] = np.zeros_like(self.wf.value)
        self.gradients[self.wi] = np.zeros_like(self.wi.value)
        self.gradients[self.wc] = np.zeros_like(self.wc.value)
        self.gradients[self.wo] = np.zeros_like(self.wo.value)

        self.gradients[self.bf] = np.zeros_like(self.bf.value).squeeze()
        self.gradients[self.bi] = np.zeros_like(self.bi.value).squeeze()
        self.gradients[self.bc] = np.zeros_like(self.bc.value).squeeze()
        self.gradients[self.bo] = np.zeros_like(self.bo.value).squeeze()
        """
        实际上与LSTM网络连接的MLP,相当于只与最后一个LSTM细胞相连,
        因为最后的梯度更新都会流向最后一个LSTM细胞, 相当于梯度更新
        只与最后一个LSTM细胞有关
        """
        self.gradients[self.input_x] = np.zeros_like(self.input_x.value[:,0,:])

        """
        按照倒序进行梯度计算
        将节点反转过来求梯度
        """
        for backward_node_index in range(len(self.lstm_node_list[::-1])):
            self.lstm_node_list[backward_node_index].backward()
            """
            最后一个LSTM细胞的梯度不涉及到基于时间序列的链式法则求解梯度
            """
            if backward_node_index == 0:

                gradients_wf = self.lstm_node_list[backward_node_index].gradients['wf']
                gradients_wi = self.lstm_node_list[backward_node_index].gradients['wi']
                gradients_wc = self.lstm_node_list[backward_node_index].gradients['wc']
                gradients_wo = self.lstm_node_list[backward_node_index].gradients['wo']

                gradients_bf = self.lstm_node_list[backward_node_index].gradients['bf']
                gradients_bi = self.lstm_node_list[backward_node_index].gradients['bi']
                gradients_bc = self.lstm_node_list[backward_node_index].gradients['bc']
                gradients_bo = self.lstm_node_list[backward_node_index].gradients['bo']

                gradients_h = self.lstm_node_list[backward_node_index].gradients['h_prev']
                gradients_x = self.lstm_node_list[backward_node_index].gradients['x']

            else:
                """
                基于时间的梯度计算法则计算梯度(BPTT,其实就是链式法则)
                """
                h_gradient_index = 1
                while h_gradient_index != backward_node_index:
                    """
                    Multiply the h gradients between cells 0, 1, 2, ..., i-1;
                    cells further along the chain pick up more h-gradient factors.
                    """
                    gradients_h *= self.lstm_node_list[h_gradient_index].gradients['h_prev']
                    h_gradient_index += 1
                """ 
                梯度相加原则
                #0,1,2,3,....i 所有节点的梯度相加
                """
                gradients_wf += np.dot(self.lstm_node_list[backward_node_index].gradients['wf'], gradients_h)
                gradients_wi += np.dot(self.lstm_node_list[backward_node_index].gradients['wi'], gradients_h)
                gradients_wc += np.dot(self.lstm_node_list[backward_node_index].gradients['wc'], gradients_h)
                gradients_wo += np.dot(self.lstm_node_list[backward_node_index].gradients['wo'], gradients_h)

                gradients_bf += np.dot(self.lstm_node_list[backward_node_index].gradients['bf'], gradients_h)
                gradients_bi += np.dot(self.lstm_node_list[backward_node_index].gradients['bi'], gradients_h)
                gradients_bc += np.dot(self.lstm_node_list[backward_node_index].gradients['bc'], gradients_h)
                gradients_bo += np.dot(self.lstm_node_list[backward_node_index].gradients['bo'], gradients_h)

                gradients_x += np.dot(self.lstm_node_list[backward_node_index].gradients['x'], gradients_h)

        gradients_bf = gradients_bf.squeeze()
        gradients_bi = gradients_bi.squeeze()
        gradients_bc = gradients_bc.squeeze()
        gradients_bo = gradients_bo.squeeze()

        for n in self.outputs:

            gradients_from_loss_to_self = n.gradients[self]
            gradients_from_loss_to_self = np.mean(gradients_from_loss_to_self, axis=0, keepdims=False)
            """
            对于输入x的梯度计算需保留所有LSTM细胞的梯度计算,为LSTM的输入节点的梯度计算做准备。
            """
            self.gradients[self.input_x] += np.dot(gradients_from_loss_to_self,gradients_x.T)

            """
            #取一个batch的平均值和所有node的平均值
            """
            gradients_from_loss_to_self = np.mean(gradients_from_loss_to_self, axis=0, keepdims=True)
            self.gradients[self.wf] += gradients_from_loss_to_self*gradients_wf
            self.gradients[self.wi] += gradients_from_loss_to_self*gradients_wi
            self.gradients[self.wc] += gradients_from_loss_to_self*gradients_wc
            self.gradients[self.wo] += gradients_from_loss_to_self*gradients_wo
            self.gradients[self.bf] += gradients_from_loss_to_self*gradients_bf
            self.gradients[self.bi] += gradients_from_loss_to_self*gradients_bi
            self.gradients[self.bc] += gradients_from_loss_to_self*gradients_bc
            self.gradients[self.bo] += gradients_from_loss_to_self*gradients_bo

3. Using the LSTM built above to solve a time-series regression problem.
The implementation is as follows:

import numpy as np
import torch
import matplotlib.pyplot as plt
from glob import glob
from xhp_flow.nn.node import Placeholder,Linear,Sigmoid,ReLu,Leakrelu,Elu,Tanh,LSTM
from xhp_flow.optimize.optimize import toplogical_sort,run_steps,optimize,forward,save_model,load_model
from xhp_flow.loss.loss import MSE,EntropyCrossLossWithSoftmax
from data_prepare_for_many import *
torch.manual_seed(1)
MAX_LENGTH = 100
torch.cuda.set_device(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('ok')
length = 10
predict_length = 1
batch_size = 512
file_path_train = np.array(glob('data/train/*'))
# file_path_test = np.array(glob('data/test/*'))
# file_path_valid = np.array(glob('data/valid/*'))
# Training_generator1, Test, Valid, WholeSet= get_dataloader(batch_size,length,predict_length)
Training_generator, WholeSet_train = get_dataloader(batch_size, length, predict_length, file_path_train, 'train')
x1, y = next(iter(Training_generator))
input_x, y = x1.numpy(), y.numpy()


class LSTMtest():
    def __init__(self, input_size=8, hidden_size=256, output_size=1):
        self.x, self.y = Placeholder(name='x', is_trainable=False), Placeholder(name='y', is_trainable=False)
        self.wf, self.bf = Placeholder(name='wf'), Placeholder(name='bf')
        self.wi, self.bi = Placeholder(name='wi'), Placeholder(name='bi')
        self.wc, self.bc = Placeholder(name='wc'), Placeholder(name='bc')
        self.wo, self.bo = Placeholder(name='wo'), Placeholder(name='bo')

        self.w0, self.b0 = Placeholder(name='w0'), Placeholder(name='b0')
        self.w1, self.b1 = Placeholder(name='w1'), Placeholder(name='b1')
        self.w2, self.b2 = Placeholder(name='w2'), Placeholder(name='b2')

        self.linear0 = Linear(self.x, self.w0, self.b0, name='linear0')
        self.lstm = LSTM(self.linear0, self.wf, self.wi, self.wc, self.wo, self.bf, self.bi, self.bc, self.bo,
                         input_size, hidden_size, name='LSTM')
        self.linear1 = Linear(self.lstm, self.w1, self.b1, name='linear1')
        self.output = ReLu(self.linear1, name='ReLu')
        self.y_pre = Linear(self.output, self.w2, self.b2, name='output_pre')
        self.MSE_loss = MSE(self.y_pre, self.y, name='MSE')

        # Initialize the parameter values and inputs fed into the graph
        self.feed_dict = {
            self.x: input_x,
            self.y: y,
            self.w0: np.random.rand(input_x.shape[2], input_size),
            self.b0: np.zeros(input_size),
            self.wf: np.random.rand(input_size + hidden_size, hidden_size),
            self.bf: np.zeros(hidden_size),
            self.wi: np.random.rand(input_size + hidden_size, hidden_size),
            self.bi: np.zeros(hidden_size),
            self.wc: np.random.rand(input_size + hidden_size, hidden_size),
            self.bc: np.zeros(hidden_size),
            self.wo: np.random.rand(input_size + hidden_size, hidden_size),
            self.bo: np.zeros(hidden_size),
            self.w1: np.random.rand(hidden_size, hidden_size),
            self.b1: np.zeros(hidden_size),
            self.w2: np.random.rand(hidden_size, output_size),
            self.b2: np.zeros(output_size),
        }


lstm = LSTMtest(16, 16, 1)
graph_sort_lstm = toplogical_sort(lstm.feed_dict)  # topological sort of the computation graph
print(graph_sort_lstm)
def train(model, train_data, epoch=6, learning_rate=1e-3):
    # start training
    losses = []
    loss_min = np.inf
    for e in range(epoch):
        for X, Y in train_data:
            X, Y = X.numpy(), Y.numpy()
            model.x.value = X
            model.y.value = Y
            run_steps(graph_sort_lstm)
            optimize(graph_sort_lstm, learning_rate=learning_rate)
            loss = model.MSE_loss.value
            losses.append(loss)
        print("epoch:{}/{},loss:{:.6f}".format(e,epoch,np.mean(losses)))
        if np.mean(losses) < loss_min:
            print('loss is {:.6f}, decreasing; saving model'.format(np.mean(losses)))
            save_model("model/lstm.xhp", model)
            loss_min = np.mean(losses)
    print('min loss:',loss_min)
    plt.plot(losses)
    plt.savefig("image/lstm_loss.png")
    plt.show()


train(lstm, Training_generator, 1000)
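
After training, the saved model can be loaded back for prediction. The snippet below is only a sketch: it assumes that load_model mirrors the save_model(path, model) call used above and that calling run_steps without optimize leaves the parameters untouched; both assumptions should be checked against the framework's source.

# Inference sketch (load_model's behaviour is assumed, see the note above).
lstm_eval = LSTMtest(16, 16, 1)
graph_eval = toplogical_sort(lstm_eval.feed_dict)
load_model("model/lstm.xhp", lstm_eval)  # assumption: restores the values saved by save_model above

X_test, Y_test = next(iter(Training_generator))
lstm_eval.x.value, lstm_eval.y.value = X_test.numpy(), Y_test.numpy()
run_steps(graph_eval)                    # runs the graph; parameters stay fixed because optimize() is not called
print('prediction shape:', lstm_eval.y_pre.value.shape)
print('test MSE:', lstm_eval.MSE_loss.value)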

The minimum loss obtained in the end was 5.9795. The training curve is shown below:
[Figure 9: training loss of the xhp_flow LSTM]
4. Solving the same problem with a network built in PyTorch
The code is as follows:

import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from glob import glob
class lstmtest(nn.Module):
    def __init__(self, input_size,hidden_size,output_size,dropout=0):
        super(lstmtest, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = 1

        self.in2lstm = nn.Linear(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=self.num_layers, bidirectional=False, batch_first=False,
                            dropout=dropout)
        self.fc0 = nn.Linear(hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, output_size)

    def forward(self, input):
        out1 = self.in2lstm(input)
        lstm,_ = self.lstm(out1)
        out1 = self.fc0(lstm)
        out1 = self.relu(out1)
        output = self.fc1(out1)
        return output
from data_prepare_for_many import *
torch.manual_seed(1)
MAX_LENGTH = 100
torch.cuda.set_device(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('ok')
length = 10
predict_length = 1
batch_size = 512
file_path_train = np.array(glob('data/train/*'))
#file_path_test = np.array(glob('data/test/*'))
#file_path_valid = np.array(glob('data/valid/*'))
# Training_generator1, Test, Valid, WholeSet= get_dataloader(batch_size,length,predict_length)
Training_generator, WholeSet_train = get_dataloader(batch_size, length, predict_length, file_path_train, 'train')
lstm = lstmtest(7,16,1)
lstm = lstm.double()
encoder_optimizer = torch.optim.Adam(lstm.parameters(), lr=1e-3)
def train(model,train_data,epoch = 1000,learning_rate = 1e-3):
    # start training
    losses = []
    criterion = nn.MSELoss()
    loss_min = np.inf
    for e in range(epoch):
        for X,Y in train_data:
            y = model(X)
            loss = criterion(y, Y)
            encoder_optimizer.zero_grad()  # clear gradients accumulated from the previous step
            loss.backward()
            encoder_optimizer.step()
            losses.append(loss.item())
        if np.mean(losses) < loss_min:
            print('loss is {:.6f}, decreasing; saving model'.format(np.mean(losses)))
            torch.save(model.state_dict(),'model/torch_lstm_test.pth')
            loss_min = np.mean(losses)
        print("epoch:{},loss:{}".format(e, np.mean(losses)))
    print('min loss:',loss_min)
    plt.plot(losses)
    plt.savefig("image/torch_lstm_loss.png")
    plt.show()
    
train(lstm,Training_generator,1000)
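
For completeness, a minimal sketch of loading the saved PyTorch weights back and running inference (standard PyTorch calls, reusing the data loader defined above):

# Reload the weights saved during training and run a forward pass without gradients.
eval_model = lstmtest(7, 16, 1).double()
eval_model.load_state_dict(torch.load('model/torch_lstm_test.pth'))
eval_model.eval()

X_test, Y_test = next(iter(Training_generator))
with torch.no_grad():
    prediction = eval_model(X_test)
print('prediction shape:', prediction.shape)
print('test MSE:', nn.MSELoss()(prediction, Y_test).item())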

The minimum loss was 3.58.
The training curve is shown below:
[Figure 10: training loss of the PyTorch LSTM]

Compared with PyTorch, the hand-built network still falls short, mainly because:
1. The Adam optimizer has not been implemented (a minimal sketch of one Adam update is given after this list).
2. The initial parameters have not been designed optimally.
3. Other causes remain to be analyzed and resolved.
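
As a reference for point 1, one Adam step for a single NumPy parameter array looks roughly like this. It is only a sketch: the function name and hyper-parameter defaults follow the usual convention, are not part of xhp_flow, and would still need to be wired into the framework's optimize module.

import numpy as np

def adam_update(param, grad, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam update step for a NumPy parameter array (sketch, not part of xhp_flow)."""
    m = beta1 * m + (1 - beta1) * grad        # first-moment (mean) estimate
    v = beta2 * v + (1 - beta2) * grad ** 2   # second-moment (uncentered variance) estimate
    m_hat = m / (1 - beta1 ** t)              # bias correction, t is the 1-based step count
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v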
