Commonly used optimization algorithms include stochastic gradient descent (SGD), SGD with momentum, AdaGrad, RMSProp, and Adam. Adam combines RMSProp with momentum and is, in practice, a very robust method.
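For reference, the Adam update implemented below, in the notation of the Deep Learning book (step size $\varepsilon$, decay rates $\rho_1, \rho_2$, stability constant $\delta$, gradient $g$, step $t$), is:

$$s \leftarrow \rho_1 s + (1 - \rho_1)\, g, \qquad r \leftarrow \rho_2 r + (1 - \rho_2)\, g \odot g$$

$$\hat{s} = \frac{s}{1 - \rho_1^{\,t}}, \qquad \hat{r} = \frac{r}{1 - \rho_2^{\,t}}, \qquad \Delta\theta = -\varepsilon\,\frac{\hat{s}}{\sqrt{\hat{r}} + \delta}, \qquad \theta \leftarrow \theta + \Delta\theta$$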
1. The optimizer base class
module mod_BaseGradientOptimizationMethod
use mod_NNStructure
implicit none
!------------------------------------------------
! Abstract class: gradient optimization method  |
!------------------------------------------------
type, abstract, public :: BaseGradientOptimizationMethod
!||||||||||||
contains !|
!||||||||||||
!* Set the network structure
procedure(abs_set_NN), deferred, public :: set_NN
!* Set the iteration step, since the learning rate may depend on it
procedure(abs_set_iterative_step), deferred, public :: set_iterative_step
!* Update the parameters of the neural network
procedure(abs_update_NN), deferred, public :: update_NN
!* Pre-processing work
procedure(abs_pre_process), deferred, public :: pre_process
!* Post-processing work
procedure(abs_post_process), deferred, public :: post_process
end type BaseGradientOptimizationMethod
!===================
!--------------------------------
! Abstract procedure interfaces |
!--------------------------------
abstract interface
!* Set the network structure
subroutine abs_set_NN( this, nn_structure )
import :: BaseGradientOptimizationMethod
import :: NNStructure
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: this
class(NNStructure), target, intent(in) :: nn_structure
end subroutine
!====
!* Update the parameters of the neural network
subroutine abs_update_NN( this, bp_algorithm )
import :: BaseGradientOptimizationMethod
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: this
character(len=*), optional, intent(in) :: bp_algorithm
end subroutine
!====
!* Set the iteration step
subroutine abs_set_iterative_step( this, step )
import :: BaseGradientOptimizationMethod
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: this
integer, intent(in) :: step
end subroutine
!====
!* Pre-processing work
subroutine abs_pre_process( this )
import :: BaseGradientOptimizationMethod
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: this
end subroutine
!====
!* Post-processing work
subroutine abs_post_process( this )
import :: BaseGradientOptimizationMethod
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: this
end subroutine
!====
end interface
!===================
end module mod_BaseGradientOptimizationMethod
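A minimal sketch of how a training driver could push any concrete optimizer through this abstract interface. The routine name `train_with_optimizer` and the batch loop are illustrative assumptions only, not part of the series' actual training class:

!* Illustrative sketch: drives any concrete optimizer through the
!* abstract interface above. Names outside the two modules
!* (train_with_optimizer, batch_count) are hypothetical.
subroutine train_with_optimizer( opt, nn, batch_count )
use mod_NNStructure
use mod_BaseGradientOptimizationMethod
implicit none
class(BaseGradientOptimizationMethod), intent(inout) :: opt
class(NNStructure), target, intent(inout) :: nn
integer, intent(in) :: batch_count
integer :: step

call opt % set_NN( nn )        !* bind the optimizer to the network
call opt % pre_process()       !* e.g. zero internal state such as moment estimates

do step = 1, batch_count
    !* ... back-propagate one batch here so the averaged gradients are available ...
    call opt % set_iterative_step( step )   !* the step may enter the update (e.g. bias correction)
    call opt % update_NN()                  !* apply one parameter update
end do

call opt % post_process()
end subroutine train_with_optimizer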
2. The Adam method
!---------------------------------------------------------!
!* From Paper: *!
!* Author: Diederik P. Kingma, Jimmy Lei Ba. *!
!* Title: ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION. *!
!* Year: 2015. *!
!---------------------------------------------------------!
module mod_OptimizationAdam
use mod_Precision
use mod_NNStructure
use mod_BaseGradientOptimizationMethod
use mod_NNParameter
use mod_Log
implicit none
!----------------------------------------
! Concrete class: Adam optimization method |
!----------------------------------------
type, extends(BaseGradientOptimizationMethod), public :: OptimizationAdam
!* Inherits from BaseGradientOptimizationMethod and implements its interface
!------------------------------------------------!
!* Parameters used by the Adam algorithm,       *!
!* following the notation of "Deep Learning"    *!
!* by Ian Goodfellow et al.                     *!
!------------------------------------------------!
!* Step size (learning rate)
real(PRECISION), private :: eps = 0.001
!* Decay rates for the moment estimates
real(PRECISION), private :: rho_1 = 0.9
real(PRECISION), private :: rho_2 = 0.999
!* Powers of the decay rates, ρ_1^t and ρ_2^t
real(PRECISION), private :: rho_1_t
real(PRECISION), private :: rho_2_t
!* Small constant for numerical stability, to avoid dividing by a tiny number
real(PRECISION), private :: delta = 1.E-8
!* First-moment estimate (s) and
!* second-moment estimate (r) of the weights
type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_s
type (Layer_Weight), dimension(:), pointer, public :: pt_W_ME_r
!* First-moment estimate (s) and
!* second-moment estimate (r) of the thresholds
type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_r
type (Layer_Threshold), dimension(:), pointer, public :: pt_Theta_ME_s
!---------------------------------------------!
class(NNStructure), pointer, private :: my_NN
!* Whether the network has been set
logical, private :: is_set_NN_done = .false.
!* Whether the memory has been allocated
logical, private :: is_allocate_done = .false.
!* Number of layers, excluding the input layer
integer, private :: layers_count
! Number of nodes in each layer;
! the array covers all layers (including the input layer)
integer, dimension(:), allocatable, private :: layers_node_count
!||||||||||||
contains !|
!||||||||||||
!* Set the network structure
procedure, public :: set_NN => m_set_NN
!* Call before training to
!* override the default Adam parameters
procedure, public :: set_Adam_parameter => m_set_Adam_parameter
!* Must be called once per batch iteration
procedure, public :: set_iterative_step => m_set_step
!* Must be called after each batch iteration completes:
!* updates the parameters of the neural network
procedure, public :: update_NN => m_update_NN
!* Zero the first- and second-moment estimates of the weights and thresholds
procedure, public :: set_ME_zero => m_set_ME_zero
!* Pre-processing work
procedure, public :: pre_process => m_pre_process
!* Post-processing work
procedure, public :: post_process => m_post_process
procedure, private :: allocate_pointer => m_allocate_pointer
procedure, private :: allocate_memory => m_allocate_memory
procedure, private :: deallocate_pointer => m_deallocate_pointer
procedure, private :: deallocate_memory => m_deallocate_memory
final :: OptimizationAdam_clean_space
end type OptimizationAdam
!===================
!-------------------------
private :: m_set_NN
private :: m_update_NN
private :: m_set_Adam_parameter
private :: m_set_step
private :: m_set_ME_zero
private :: m_pre_process
private :: m_post_process
private :: m_allocate_pointer
private :: m_allocate_memory
private :: m_deallocate_pointer
private :: m_deallocate_memory
!-------------------------
!||||||||||||
contains !|
!||||||||||||
!* Update the parameters of the neural network
subroutine m_update_NN( this, bp_algorithm )
implicit none
class(OptimizationAdam), intent(inout) :: this
character(len=*), optional, intent(in) :: bp_algorithm
integer :: layer_index, l_count
logical :: use_standard_bp
l_count = this % layers_count
!* Fortran's .and. does not guarantee short-circuit evaluation, so the
!* optional argument is referenced only after PRESENT() has been checked.
use_standard_bp = .false.
if ( PRESENT(bp_algorithm) ) &
use_standard_bp = ( TRIM(ADJUSTL(bp_algorithm)) == 'standard' )
!* Assumption: one batch has completed a full backward pass and the
!* averaged gradients avg_dW and avg_dTheta are available.
do layer_index=1, l_count
associate ( &
eps => this % eps, &
rho_1 => this % rho_1, &
rho_2 => this % rho_2, &
rho_1_t => this % rho_1_t, &
rho_2_t => this % rho_2_t, &
delta => this % delta, &
W_S => this % pt_W_ME_s( layer_index ) % W, &
W_R => this % pt_W_ME_r( layer_index ) % W, &
Theta_S => this % pt_Theta_ME_s( layer_index ) % Theta, &
Theta_R => this % pt_Theta_ME_r( layer_index ) % Theta, &
W => this % my_NN % pt_W(layer_index) % W, &
Theta => this % my_NN % pt_Theta(layer_index) % Theta, &
dW => this % my_NN % pt_Layer( layer_index ) % dW, &
dTheta => this % my_NN % pt_Layer( layer_index ) % dTheta, &
avg_dW => this % my_NN % pt_Layer( layer_index ) % avg_dW, &
avg_dTheta => this % my_NN % pt_Layer( layer_index ) % avg_dTheta &
)
if ( use_standard_bp ) then
!* s <-- ρ_1 * s + (1 - ρ_1) * g
!* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
W_S = rho_1 * W_S + (1 - rho_1) * dW
W_R = rho_2 * W_R + (1 - rho_2) * dW * dW
Theta_S = rho_1 * Theta_S + (1 - rho_1) * dTheta
Theta_R = rho_2 * Theta_R + (1 - rho_2) * dTheta * dTheta
!* △θ = -ε * s_hat / (√(r_hat) + δ)
!* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
W = W + dW
dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
(SQRT(Theta_R / (1 - rho_2_t)) + delta)
Theta = Theta + dTheta
else
!* Default branch: update the weights and thresholds once per batch
!* s <-- ρ_1 * s + (1 - ρ_1) * g
!* r <-- ρ_2 * r + (1 - ρ_2) * g ⊙ g
!avg_dW = avg_dW + 1.E-4 * W
!avg_dTheta = avg_dTheta + 1.E-4 * Theta
W_S = rho_1 * W_S + (1 - rho_1) * avg_dW
W_R = rho_2 * W_R + (1 - rho_2) * avg_dW * avg_dW
Theta_S = rho_1 * Theta_S + (1 - rho_1) * avg_dTheta
Theta_R = rho_2 * Theta_R + (1 - rho_2) * avg_dTheta * avg_dTheta
!* △θ = -ε * s_hat / (√(r_hat) + δ)
!* s_hat = s / (1 - ρ^t_1), r_hat = r / (1 - ρ^t_2)
dW = -eps * (W_S / (1 - rho_1_t)) / (SQRT(W_R / (1 - rho_2_t)) + delta)
W = W + dW
dTheta = -eps * (Theta_S / (1 - rho_1_t)) / &
(SQRT(Theta_R / (1 - rho_2_t)) + delta)
Theta = Theta + dTheta
avg_dW = 0
avg_dTheta = 0
end if
end associate
end do
return
end subroutine m_update_NN
!====
!* Override the default Adam parameters.
!* To set only some of the later arguments, call with keyword arguments.
subroutine m_set_Adam_parameter( this, eps, rho_1, rho_2, delta )
implicit none
class(OptimizationAdam), intent(inout) :: this
real(PRECISION), optional, intent(in) :: eps, rho_1, rho_2, delta
if (PRESENT(eps)) this % eps = eps
if (PRESENT(rho_1)) this % rho_1 = rho_1
if (PRESENT(rho_2)) this % rho_2 = rho_2
if (PRESENT(delta)) this % delta = delta
return
end subroutine m_set_Adam_parameter
!====
!* Set the network structure
subroutine m_set_NN( this, nn_structure )
implicit none
class(OptimizationAdam), intent(inout) :: this
class(NNStructure), target, intent(in) :: nn_structure
this % my_NN => nn_structure
this % is_set_NN_done = .true.
call this % allocate_pointer()
call this % allocate_memory()
return
end subroutine m_set_NN
!====
!* Set the iteration step and compute the powers of the decay rates
subroutine m_set_step( this, step )
implicit none
class(OptimizationAdam), intent(inout) :: this
integer, intent(in) :: step
this % rho_1_t = (this % rho_1)**step
this % rho_2_t = (this % rho_2)**step
return
end subroutine m_set_step
!====
!* Pre-processing work
subroutine m_pre_process( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
call this % set_ME_zero()
return
end subroutine m_pre_process
!====
!* Post-processing work
subroutine m_post_process( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
continue
return
end subroutine m_post_process
!====
!* Zero the first- and second-moment estimates of the weights and thresholds
subroutine m_set_ME_zero( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
integer :: layer_index, l_count
l_count = this % layers_count
do layer_index=1, l_count
this % pt_W_ME_s( layer_index ) % W = 0
this % pt_W_ME_r( layer_index ) % W = 0
this % pt_Theta_ME_s( layer_index ) % Theta = 0
this % pt_Theta_ME_r( layer_index ) % Theta = 0
end do
return
end subroutine m_set_ME_zero
!====
!* Allocate the pointer arrays held by OptimizationAdam
subroutine m_allocate_pointer( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
integer :: l_count
if ( .not. this % is_set_NN_done ) then
call LogErr("mod_OptimizationAdam: SUBROUTINE m_allocate_pointer, &
&is_set_NN_done is false.")
stop
end if
l_count = this % my_NN % layers_count
this % layers_count = l_count
allocate( this % pt_W_ME_s(l_count) )
allocate( this % pt_W_ME_r(l_count) )
allocate( this % pt_Theta_ME_s(l_count) )
allocate( this % pt_Theta_ME_r(l_count) )
allocate( this % layers_node_count(0:l_count) )
this % layers_node_count = this % my_NN % layers_node_count
call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_pointer")
return
end subroutine m_allocate_pointer
!====
!* Allocate the memory needed for each layer
subroutine m_allocate_memory( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
integer :: M, N, layer_index, l_count
l_count = this % layers_count
do layer_index=1, l_count
M = this % layers_node_count(layer_index - 1)
N = this % layers_node_count(layer_index)
!* TODO: use Fortran 2003 facilities to check the allocations for errors
!* Note: the matrix size is N×M, not M×N.
allocate( this % pt_W_ME_s( layer_index ) % W(N,M) )
allocate( this % pt_W_ME_r( layer_index ) % W(N,M) )
allocate( this % pt_Theta_ME_s( layer_index ) % Theta(N) )
allocate( this % pt_Theta_ME_r( layer_index ) % Theta(N) )
end do
this % is_allocate_done = .true.
call LogDebug("OptimizationAdam: SUBROUTINE m_allocate_memory")
return
end subroutine m_allocate_memory
!====
!* Release the pointer arrays
subroutine m_deallocate_pointer( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
deallocate( this % layers_node_count )
deallocate( this % pt_W_ME_s )
deallocate( this % pt_W_ME_r )
deallocate( this % pt_Theta_ME_s )
deallocate( this % pt_Theta_ME_r )
return
end subroutine m_deallocate_pointer
!====
!* Release the allocated memory
subroutine m_deallocate_memory( this )
implicit none
class(OptimizationAdam), intent(inout) :: this
integer :: layer_index
do layer_index=1, this % layers_count
deallocate( this % pt_W_ME_s( layer_index ) % W )
deallocate( this % pt_W_ME_r( layer_index ) % W )
deallocate( this % pt_Theta_ME_s( layer_index ) % Theta )
deallocate( this % pt_Theta_ME_r( layer_index ) % Theta )
end do
call this % deallocate_pointer()
this % is_allocate_done = .false.
return
end subroutine m_deallocate_memory
!====
!* Finalizer: clean up the allocated memory
subroutine OptimizationAdam_clean_space( this )
implicit none
type(OptimizationAdam), intent(inout) :: this
!* Free memory only if it was actually allocated
if ( this % is_allocate_done ) call this % deallocate_memory()
call LogInfo("OptimizationAdam: SUBROUTINE clean_space.")
return
end subroutine OptimizationAdam_clean_space
!====
end module mod_OptimizationAdam
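A minimal usage sketch of the Adam class, assuming `my_nn` is an NNStructure that has already been built, that a back-propagation pass fills `avg_dW`/`avg_dTheta` each batch, and that mod_Precision exports the kind parameter PRECISION; the program name and the batch count are illustrative:

program demo_adam_usage
use mod_Precision
use mod_NNStructure
use mod_OptimizationAdam
implicit none
type(NNStructure), target :: my_nn
type(OptimizationAdam)    :: adam
integer :: step

!* ... build my_nn (layer sizes, initial weights, ...) elsewhere ...

call adam % set_NN( my_nn )            !* binds the optimizer and allocates the s, r arrays
call adam % set_Adam_parameter( eps   = 1.0E-3_PRECISION, &
                                rho_1 = 0.9_PRECISION,    &
                                rho_2 = 0.999_PRECISION,  &
                                delta = 1.0E-8_PRECISION )
call adam % pre_process()              !* zero the moment estimates

do step = 1, 100
    !* ... back-propagate one batch so avg_dW / avg_dTheta are filled ...
    call adam % set_iterative_step( step )  !* t in the bias corrections 1 - ρ^t
    call adam % update_NN()                 !* default branch: batch-averaged update
end do

call adam % post_process()
end program demo_adam_usage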
Appendix
Multi-layer neural networks from scratch (1): Reading the MNIST dataset in Fortran
Multi-layer neural networks from scratch (2): Randomly generating "double-moon" classification data in Fortran
Multi-layer neural networks from scratch (3): Detailed derivation of the BP neural network formulas
Multi-layer neural networks from scratch (4): Matrix form of the multi-layer BP neural network
Multi-layer neural networks from scratch (5): Defining the data structures
Multi-layer neural networks from scratch (6): Activation functions
Multi-layer neural networks from scratch (7): Loss functions
Multi-layer neural networks from scratch (8): Why cross-entropy is used as the loss function for classification
Multi-layer neural networks from scratch (9): Optimization functions
Multi-layer neural networks from scratch (10): Parameter initialization
Multi-layer neural networks from scratch (11): Implementing the training class
Multi-layer neural networks from scratch (12): Implementing the test-case class
Multi-layer neural networks from scratch (13): A brief discussion of parallel computing