Multi-layer Neural Networks from Scratch (9): Optimization Functions

Commonly used optimization algorithms include stochastic gradient descent (SGD), SGD with momentum, AdaGrad, RMSProp, and Adam. Adam combines RMSProp with momentum and is a very robust method.
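For reference, this is the per-step update that the Adam module in section 2 implements, written in the notation of "Deep Learning" (Goodfellow et al.) that the code comments below also use: gradient g, step size ε, decay rates ρ_1 and ρ_2, stabilizer δ, and step count t.

$$
\begin{aligned}
s &\leftarrow \rho_1\, s + (1-\rho_1)\, g \\
r &\leftarrow \rho_2\, r + (1-\rho_2)\, g \odot g \\
\hat{s} &= \frac{s}{1-\rho_1^{\,t}}, \qquad \hat{r} = \frac{r}{1-\rho_2^{\,t}} \\
\Delta\theta &= -\,\varepsilon\, \frac{\hat{s}}{\sqrt{\hat{r}} + \delta}, \qquad \theta \leftarrow \theta + \Delta\theta
\end{aligned}
$$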

1. The optimizer base class

module mod_BaseGradientOptimizationMethod
use mod_NNStructure
implicit none
    
!------------------------------------------------
! Abstract class: gradient optimization method  |
!------------------------------------------------
type, abstract, public :: BaseGradientOptimizationMethod

!||||||||||||    
contains   !|
!||||||||||||

    !* Set the network structure
    procedure(abs_set_NN), deferred, public :: set_NN
    
    !* Set the iteration time step, since the learning rate may depend on time
    procedure(abs_set_iterative_step), deferred, public :: set_iterative_step
    
    !* Update the parameters of the neural network
    procedure(abs_update_NN), deferred, public :: update_NN
    
    !* Pre-processing work
    procedure(abs_pre_process), deferred, public :: pre_process
    
    !* Post-processing work
    procedure(abs_post_process), deferred, public :: post_process
   

end type BaseGradientOptimizationMethod
!===================
    

!------------------------------------------
! Abstract class: procedure interfaces    |
!------------------------------------------
abstract interface   

    !* Set the network structure
    subroutine abs_set_NN( this, nn_structrue )
    import :: BaseGradientOptimizationMethod
    import :: NNStructure
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: this
        class(NNStructure), target, intent(in) :: nn_structrue

    end subroutine
    !====
    
    !* Update the parameters of the neural network
    subroutine abs_update_NN( this, bp_algorithm )
    import :: BaseGradientOptimizationMethod
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: this
        character(len=*), optional, intent(in) :: bp_algorithm

    end subroutine
    !====
    
    !* Set the iteration time step
    subroutine abs_set_iterative_step( this, step )
    import :: BaseGradientOptimizationMethod
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: this
        integer, intent(in) :: step

    end subroutine
    !====
    
    !* Pre-processing work
    subroutine abs_pre_process( this )
    import :: BaseGradientOptimizationMethod
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: this

    end subroutine
    !====
    
    !* Post-processing work
    subroutine abs_post_process( this )
    import :: BaseGradientOptimizationMethod
    implicit none
        class(BaseGradientOptimizationMethod), intent(inout) :: this

    end subroutine
    !====

end interface
!===================
    
end module
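Every concrete optimizer extends this abstract type, so the training code can hold a single polymorphic handle of class(BaseGradientOptimizationMethod) and swap, say, the Adam type defined in section 2 for another method without touching the training loop (the series does this wiring in the training class of part eleven). The sketch below only illustrates that idea; the subroutine name demo_pick_optimizer and the variable opt are assumptions, not part of the series code.

subroutine demo_pick_optimizer( nn )
use mod_NNStructure
use mod_BaseGradientOptimizationMethod
use mod_OptimizationAdam
implicit none
    class(NNStructure), target, intent(in) :: nn

    !* a polymorphic handle: any type that extends the base class fits here
    class(BaseGradientOptimizationMethod), allocatable :: opt

    !* choose Adam; picking another optimizer changes only this one line
    allocate( OptimizationAdam :: opt )

    call opt % set_NN( nn )
    call opt % pre_process()

end subroutine demo_pick_optimizer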

2. The Adam method

!---------------------------------------------------------!
!* From Paper:                                          *!
!*   Author: Diederik P. Kingma, Jimmy Lei Ba.           *! 
!*   Title:  ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION. *!
!*   Year:   2015.                                       *!
!---------------------------------------------------------!
module mod_OptimizationAdam
use mod_Precision
use mod_NNStructure
use mod_BaseGradientOptimizationMethod
use mod_NNParameter
use mod_Log
implicit none

!------------------------------------------------
! Concrete class: the Adam optimization method  |
!------------------------------------------------
type, extends(BaseGradientOptimizationMethod), public :: OptimizationAdam
    !* Extends BaseGradientOptimizationMethod and implements its interface
    
    !---------------------------------------------!
    !* Parameters used by the Adam algorithm,    *!
    !* following the notation of the book        *!
    !* "Deep Learning" by Ian Goodfellow et al.  *!
    !---------------------------------------------!
    !* Step size (learning rate)
    real(PRECISION), private :: eps = 0.001
    !* Decay rates for the moment estimates
    real(PRECISION), private :: rho_1 = 0.9
    real(PRECISION), private :: rho_2 = 0.999
    !* Powers of the decay rates (rho_1**t, rho_2**t)
    real(PRECISION), private :: rho_1_t 
    real(PRECISION), private :: rho_2_t
    !* Small constant for numerical stability, to avoid dividing by a tiny number
    real(PRECISION), private :: delta = 1.E-8       
    !* First-order moment estimate s and
    !* second-order moment estimate r of the weights
    type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_s 
    type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_r
    
    !* First-order moment estimate s and
    !* second-order moment estimate r of the thresholds
    type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_r
    type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_s  
    !---------------------------------------------!
    
    
    
    class(NNStructure), pointer, private :: my_NN
    
    !* Whether the NN has been set
    logical, private :: is_set_NN_done = .false.
        
    !* Whether the memory has been allocated
    logical, private :: is_allocate_done = .false.
    
    !* Number of layers, excluding the input layer
    integer, private :: layers_count
    
    ! Array of node counts per layer:
    !     its size is the total number of layers (including the input layer)
    integer, dimension(:), allocatable, private :: layers_node_count
    
!||||||||||||    
contains   !|
!||||||||||||

    !* Set the network structure
    procedure, public :: set_NN => m_set_NN
    
    !* Set before training:
    !* override the default parameters of the Adam algorithm
    procedure, public :: set_Adam_parameter => m_set_Adam_parameter 
    
    !* Must be called once per batch iteration
    procedure, public :: set_iterative_step => m_set_step
    
    !* Must be called after each batch iteration completes:
    !* update the parameters of the neural network
    procedure, public :: update_NN => m_update_NN
    !* Zero the first- and second-order moment estimates of the weights and thresholds
    procedure, public :: set_ME_zero => m_set_ME_zero
    
    !* Pre-processing work
    procedure, public :: pre_process => m_pre_process
    
    !* Post-processing work
    procedure, public :: post_process => m_post_process
    
    
    procedure, private :: allocate_pointer   => m_allocate_pointer
    procedure, private :: allocate_memory    => m_allocate_memory
    procedure, private :: deallocate_pointer => m_deallocate_pointer
    procedure, private :: deallocate_memory  => m_deallocate_memory
    
    final :: OptimizationAdam_clean_space
    
end type OptimizationAdam
!===================
    
    !-------------------------
    private :: m_set_NN
    private :: m_update_NN
    private :: m_set_Adam_parameter
    private :: m_set_step
    
    private :: m_set_ME_zero
    
    private :: m_pre_process
    private :: m_post_process
    
    private :: m_allocate_pointer
    private :: m_allocate_memory
    private :: m_deallocate_pointer
    private :: m_deallocate_memory
    !-------------------------
    
!||||||||||||    
contains   !|
!|||||||||||| 
    
    !* Update the parameters of the neural network
    subroutine m_update_NN( this, bp_algorithm )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        character(len=*), optional, intent(in) :: bp_algorithm

        integer :: layer_index, l_count 
        logical :: use_standard_bp
        
        l_count = this % layers_count
        
        !* Test PRESENT() on its own: Fortran does not guarantee
        !* short-circuit evaluation of .and., so an absent optional
        !* argument must not be referenced in the same expression.
        use_standard_bp = .false.
        if ( PRESENT(bp_algorithm) ) then
            use_standard_bp = ( TRIM(ADJUSTL(bp_algorithm)) == 'standard' )
        end if
        
        !* Assumption: one batch has completed a full backward pass and
        !* produced the averaged gradients avg_dW and avg_dTheta.
        do layer_index=1, l_count
            associate (                                                           &              
                eps        => this % eps,                                         &
                rho_1      => this % rho_1,                                       &
                rho_2      => this % rho_2,                                       &
                rho_1_t    => this % rho_1_t,                                     &
                rho_2_t    => this % rho_2_t,                                     &
                delta      => this % delta,                                       &
                W_S        => this % pt_W_ME_s( layer_index ) % W,                &
                W_R        => this % pt_W_ME_r( layer_index ) % W,                &
                Theta_S    => this % pt_Theta_ME_s( layer_index ) % Theta,        &
                Theta_R    => this % pt_Theta_ME_r( layer_index ) % Theta,        &
                W          => this % my_NN % pt_W(layer_index) % W,               &
                Theta      => this % my_NN % pt_Theta(layer_index) % Theta,       &
                dW         => this % my_NN % pt_Layer( layer_index ) % dW,        &
                dTheta     => this % my_NN % pt_Layer( layer_index ) % dTheta,    &
                avg_dW     => this % my_NN % pt_Layer( layer_index ) % avg_dW,    &               
                avg_dTheta => this % my_NN % pt_Layer( layer_index ) % avg_dTheta &
            )
        
            if (use_standard_bp) then
                !* s <-- ρ_1 * s + (1 - ρ_1) * g
                !* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
                
                W_S = rho_1 * W_S + (1 - rho_1) * dW
                W_R = rho_2 * W_R + (1 - rho_2) * dW * dW 
            
                Theta_S = rho_1 * Theta_S + (1 - rho_1) * dTheta 
                Theta_R = rho_2 * Theta_R + (1 - rho_2) * dTheta * dTheta
            
                !* △θ = -ε * s_hat / (√(r_hat) + δ)
                !* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
                dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
                W = W + dW
            
                dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
                    (SQRT(Theta_R / (1 - rho_2_t)) + delta)
                Theta = Theta + dTheta
            
            else
                !* Default branch: update the weights and thresholds for one batch
            
                !* s <-- ρ_1 * s + (1 - ρ_1) * g
                !* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
                !avg_dW     = avg_dW     + 1.E-4 * W
                !avg_dTheta = avg_dTheta + 1.E-4 * Theta
                
                W_S = rho_1 * W_S + (1 - rho_1) * avg_dW
                W_R = rho_2 * W_R + (1 - rho_2) * avg_dW * avg_dW 
            
                Theta_S = rho_1 * Theta_S + (1 - rho_1) * avg_dTheta 
                Theta_R = rho_2 * Theta_R + (1 - rho_2) * avg_dTheta * avg_dTheta
            
                !* △θ = -ε * s_hat / (√(r_hat) + δ)
                !* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
                dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
                W = W + dW
            
                dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
                    (SQRT(Theta_R / (1 - rho_2_t)) + delta)
                Theta = Theta + dTheta
            
                avg_dW = 0
                avg_dTheta = 0
            end if
    
            end associate
        end do 
        
        return
    end subroutine m_update_NN
    !====
    
    !* Override the default parameters of the Adam algorithm.
    !* To change only one of the later parameters, pass it as a keyword
    !* argument, e.g.  rho_2 = 0.99_PRECISION .
    subroutine m_set_Adam_parameter( this, eps, rho_1, rho_2, delta )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        real(PRECISION), optional, intent(in) :: eps, rho_1, rho_2, delta

        if (PRESENT(eps))  this % eps = eps
        
        if (PRESENT(rho_1))  this % rho_1 = rho_1

        if (PRESENT(rho_2))  this % rho_2 = rho_2
        
        if (PRESENT(delta))  this % delta = delta
        
        return
    end subroutine m_set_Adam_parameter
    !====
    
    !* Set the network structure
    subroutine m_set_NN( this, nn_structrue )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        class(NNStructure), target, intent(in) :: nn_structrue

        this % my_NN => nn_structrue
        
        this % is_set_NN_done = .true.
        
        call this % allocate_pointer()
        call this % allocate_memory()
        
        return
    end subroutine m_set_NN
    !====
    
    !* Set the iteration time step and compute the powers of the decay rates
    subroutine m_set_step( this, step )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        integer, intent(in) :: step 

        this % rho_1_t = (this % rho_1)**step
        this % rho_2_t = (this % rho_2)**step
        
        return
    end subroutine m_set_step
    !====
    
    !* Pre-processing work
    subroutine m_pre_process( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this

        call this % set_ME_zero()
        
        return
    end subroutine m_pre_process
    !====
    
    !* Post-processing work
    subroutine m_post_process( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this

        continue
        
        return
    end subroutine m_post_process
    !====
    
    !* Zero the first- and second-order moment estimates of the weights and thresholds
    subroutine m_set_ME_zero( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this

        integer :: layer_index, l_count
        
        l_count = this % layers_count
        
        do layer_index=1, l_count
            this % pt_W_ME_s( layer_index ) % W = 0
            this % pt_W_ME_r( layer_index ) % W = 0
            this % pt_Theta_ME_s( layer_index ) % Theta = 0
            this % pt_Theta_ME_r( layer_index ) % Theta = 0
        end do 
        
        return
    end subroutine m_set_ME_zero
    !====
    
    
    !* Allocate the space needed by the pointers contained in OptimizationAdam
    subroutine m_allocate_pointer( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        
        integer :: l_count
        
        if ( .not. this % is_set_NN_done ) then          
            call LogErr("mod_OptimizationAdam: SUBROUTINE m_allocate_pointer, &
                &is_set_NN_done is false.")          
            stop
        end if
        
        l_count = this % my_NN % layers_count
        this % layers_count = l_count
    
        allocate( this % pt_W_ME_s(l_count) )
        allocate( this % pt_W_ME_r(l_count) )
        allocate( this % pt_Theta_ME_s(l_count) )
        allocate( this % pt_Theta_ME_r(l_count) )
        
        allocate( this % layers_node_count(0:l_count) )
        
        this % layers_node_count = this % my_NN % layers_node_count
    
        call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_pointer")
        
        return
    end subroutine m_allocate_pointer
    !====
    
    !* Allocate the memory needed by each layer
    subroutine m_allocate_memory( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        
        integer :: M, N, layer_index, l_count
        
        l_count = this % layers_count
        
        do layer_index=1, l_count
        
            M = this % layers_node_count(layer_index - 1)
            N = this % layers_node_count(layer_index)
                      
            !* TODO: check for allocation errors using Fortran 2003 facilities
            !* Note: the matrix size is N×M, not M×N.
            allocate( this % pt_W_ME_s( layer_index ) % W(N,M) )
            allocate( this % pt_W_ME_r( layer_index ) % W(N,M) )
            allocate( this % pt_Theta_ME_s( layer_index ) % Theta(N) )
            allocate( this % pt_Theta_ME_r( layer_index ) % Theta(N) )
            
        end do
    
        this % is_allocate_done = .true.
    
        call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_memory")
    
        return
    end subroutine m_allocate_memory
    !====
    
    !* Deallocate the pointers
    subroutine m_deallocate_pointer( this )
    implicit none
        class(OptimizationAdam), intent(inout) :: this
        
        deallocate( this % layers_node_count )
        deallocate( this % pt_W_ME_s         )
        deallocate( this % pt_W_ME_r         )
        deallocate( this % pt_Theta_ME_s     )
        deallocate( this % pt_Theta_ME_r     )
    
        return
    end subroutine m_deallocate_pointer
    !====
    
    !* Deallocate the memory
    subroutine m_deallocate_memory( this )
    implicit none
        class(OptimizationAdam), intent(inout)  :: this
        
        integer :: layer_index
        
        do layer_index=1, this % layers_count
            
            deallocate( this % pt_W_ME_s( layer_index ) % W )
            deallocate( this % pt_W_ME_r( layer_index ) % W )
            deallocate( this % pt_Theta_ME_s( layer_index ) % Theta )
            deallocate( this % pt_Theta_ME_r( layer_index ) % Theta )
            
        end do
        
        call this % deallocate_pointer()
        
        this % is_allocate_done = .false.
    
        return
    end subroutine m_deallocate_memory 
    !====
    
    !* Finalizer: clean up the memory
    subroutine OptimizationAdam_clean_space( this )
    implicit none
        type(OptimizationAdam), intent(inout) :: this
    
        call this % deallocate_memory()
        
        call LogInfo("OptimizationAdam: SUBROUTINE clean_space.")
        
        return
    end subroutine OptimizationAdam_clean_space
    !====
    
    
end module
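Tying it together: set_NN binds the optimizer to a network and allocates the moment-estimate arrays, pre_process zeroes them, and every batch then calls set_iterative_step (so the bias-correction powers rho_1**t and rho_2**t track the step count) followed by update_NN once the averaged gradients avg_dW and avg_dTheta have been accumulated. The following is only a rough sketch of that calling sequence; demo_adam_training, batch_count and the commented-out forward/backward step are placeholders, not part of the series code (the actual driver is the training class of part eleven).

subroutine demo_adam_training( nn, batch_count )
use mod_Precision
use mod_NNStructure
use mod_OptimizationAdam
implicit none
    class(NNStructure), target, intent(inout) :: nn
    integer, intent(in) :: batch_count

    type(OptimizationAdam) :: adam
    integer :: step

    call adam % set_NN( nn )          !* bind the network, allocate s and r
    call adam % set_Adam_parameter( eps = 0.001_PRECISION )
    call adam % pre_process()         !* zero the moment estimates

    do step = 1, batch_count
        !* ... forward and backward passes over one batch go here,
        !*     accumulating avg_dW and avg_dTheta inside nn ...
        call adam % set_iterative_step( step )   !* updates rho_1**t, rho_2**t
        call adam % update_NN()                  !* Adam step; averages reset to 0
    end do

    call adam % post_process()

end subroutine demo_adam_training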

Appendix

Multi-layer Neural Networks from Scratch (1): Reading the MNIST Dataset in Fortran
Multi-layer Neural Networks from Scratch (2): Randomly Generating "Double-Moon" Classification Data in Fortran
Multi-layer Neural Networks from Scratch (3): Detailed Derivation of the BP Neural Network Formulas
Multi-layer Neural Networks from Scratch (4): The Matrix Form of the Multi-layer BP Neural Network
Multi-layer Neural Networks from Scratch (5): Defining the Data Structures
Multi-layer Neural Networks from Scratch (6): Activation Functions
Multi-layer Neural Networks from Scratch (7): Loss Functions
Multi-layer Neural Networks from Scratch (8): Why Cross-Entropy Is Used as the Loss Function in Classification Problems
Multi-layer Neural Networks from Scratch (9): Optimization Functions
Multi-layer Neural Networks from Scratch (10): Parameter Initialization
Multi-layer Neural Networks from Scratch (11): Implementing the Training Class
Multi-layer Neural Networks from Scratch (12): Implementing the Example Class
Multi-layer Neural Networks from Scratch (13): A Brief Discussion of Parallel Computing
