https://zhuanlan.zhihu.com/p/346205754
https://blog.csdn.net/google19890102/article/details/69942970
https://zhuanlan.zhihu.com/p/32626442
https://zhuanlan.zhihu.com/p/32230623
import torch
import numpy as np
import warnings
warnings.filterwarnings('ignore') #ignore warnings
x = torch.linspace(-np.pi, np.pi, 2000)
y = torch.sin(x)
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)
model = torch.nn.Sequential(
torch.nn.Linear(3, 1),
torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(1, 1001):
y_pred = model(xx)
loss = loss_fn(y_pred, y)
if t % 100 == 0:
print('No.{: 5d}, loss: {:.6f}'.format(t, loss.item()))
optimizer.zero_grad() # 梯度清零
loss.backward() # 反向传播计算梯度
optimizer.step() # 梯度下降法更新参数
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
for p in group['params']:
if p.grad is None:
continue
d_p = p.grad #获取参数梯度
if weight_decay != 0:
d_p = d_p.add(p, alpha=weight_decay)
if momentum != 0:
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
buf = param_state['momentum_buffer'] = torch.clone(d_p).detach()
else:
buf = param_state['momentum_buffer']
buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
if nesterov:
d_p = d_p.add(buf, alpha=momentum)
else:
d_p = buf
p.add_(d_p, alpha=-group['lr']) #更新参数
return loss
模型参数
θ \theta θ ,
目标函数
J ( θ ) J(\theta) J(θ)
每一个时刻t(假设是一个batch)的梯度
g t = ▽ θ J ( θ ) g_{t}=\bigtriangledown _{\theta}J(\theta) gt=▽θJ(θ)
学习率为
η \eta η
根据历史梯度的一阶动量
m t = ϕ ( g 1 , g 2 , . . . g t ) m_{t}=\phi (g_{1},g_{2},...g_{t}) mt=ϕ(g1,g2,...gt)
根据历史梯度的二阶动量
v t = ψ ( g 1 , g 2 , . . . g t ) v_{t}=\psi (g_{1},g_{2},...g_{t}) vt=ψ(g1,g2,...gt)
更新模型参数
θ t + 1 = θ t − 1 v t + ϵ m t \theta_{t+1}=\theta_{t}-\frac{1}{\sqrt{v_{t}+\epsilon }}m_{t} θt+1=θt−vt+ϵ1mt ,
平滑项,防止分母为0
m t = η ∗ g t m_{t}=\eta*g_{t} mt=η∗gt
v t = I 2 v_{t}=I^2 vt=I2
SGD 的缺点在于收敛速度慢,可能在鞍点处震荡。并且,如何合理的选择学习率是 SGD 的一大难点。
m t = γ ∗ m t − 1 + η ∗ g t m_{t}=\gamma*m_{t-1}+\eta*g_{t} mt=γ∗mt−1+η∗gt
也就是按照一定比例的前一次的变化方向和大小,一般比例是0.9
带一阶动量的SGD
m t = γ ∗ m t − 1 + η ∗ g t m_{t}=\gamma*m_{t-1}+\eta*g_{t} mt=γ∗mt−1+η∗gt
v t = I 2 v_{t}=I^2 vt=I2
也就是在求解梯度的时候把参数改成参数经过动量变换后的参数的梯度
g t = ▽ θ J ( θ − γ ∗ m t − 1 ) g_{t}=\bigtriangledown _{\theta}J(\theta-\gamma*m_{t-1}) gt=▽θJ(θ−γ∗mt−1)
m t = γ ∗ m t − 1 + η ∗ g t m_{t}=\gamma*m_{t-1}+\eta*g_{t} mt=γ∗mt−1+η∗gt
v t = I 2 v_{t}=I^2 vt=I2
也就是根据参数改变的频率修改相应参数变化的快慢
v t = ∑ g t 2 v_{t}=\sum g_{t}^{2} vt=∑gt2
v t = γ ∗ v t − 1 + ( 1 − γ ) ∗ g t 2 v_{t}=\gamma*v_{t-1}+(1-\gamma)*g_{t}^{2} vt=γ∗vt−1+(1−γ)∗gt2
m t = η ∗ ( β 1 ∗ m t − 1 + ( 1 − β 1 ) g t ) m_{t}=\eta*(\beta _{1}*m_{t-1}+(1-\beta _{1})g_{t}) mt=η∗(β1∗mt−1+(1−β1)gt)
v t = β 2 ∗ v t − 1 + ( 1 − β 2 ) ∗ g t 2 v_{t}=\beta _{2}*v_{t-1}+(1-\beta_{2} )*g_{t}^{2} vt=β2∗vt−1+(1−β2)∗gt2
其中
m 0 = 0 , v 0 = 0. m_0=0,v_{0}=0. m0=0,v0=0.
m 1 = 0.1 ∗ g 0 , v 1 = 0.1 ∗ g 0 2 . m_1=0.1*g_{0},v_{1}=0.1*g^2_{0}. m1=0.1∗g0,v1=0.1∗g02.
所以初始阶段有偏移,偏向0 。所以做一个偏执矫正
m ^ t = m t 1 − β 1 t \hat m_t=\frac{m_t}{1-\beta_1^t} m^t=1−β1tmt
v ^ t = v t 1 − β 1 t \hat v_t=\frac{v_t}{1-\beta_1^t} v^t=1−β1tvt
from torch.optim import lr_scheduler
from matplotlib import pyplot as plt
from torch.optim import SGD
from torch import nn
class DummyModel(nn.Module):
def __init__(self, class_num=10):
super(DummyModel, self).__init__()
self.base = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=3, padding=1),
nn.ReLU(),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.ReLU(),
)
self.gap = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Linear(128, class_num)
def forward(self, x):
x = self.base(x)
x = self.gap(x)
x = x.view(x.shape[0], -1)
x = self.fc(x)
return x
model = DummyModel().cuda()
def create_optimizer():
return SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)
def plot_lr(scheduler, title='', labels=['base'], nrof_epoch=100):
lr_li = [[] for _ in range(len(labels))]
epoch_li = list(range(nrof_epoch))
for epoch in epoch_li:
scheduler.step() # 调用step()方法,计算和更新optimizer管理的参数基于当前epoch的学习率
lr = scheduler.get_last_lr() # 获取当前epoch的学习率
for i in range(len(labels)):
lr_li[i].append(lr[i])
for lr, label in zip(lr_li, labels):
plt.plot(epoch_li, lr, label=label)
plt.grid()
plt.xlabel('epoch')
plt.ylabel('lr')
plt.title(title)
plt.legend()
plt.show()
optimizer = create_optimizer()
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, 50, 1e-5)
plot_lr(scheduler, title='CosineAnnealingLR')
def step(self, epoch=None):
# Raise a warning if old pattern is detected
# https://github.com/pytorch/pytorch/issues/20124
if self._step_count == 1:
if not hasattr(self.optimizer.step, "_with_counter"):
warnings.warn("Seems like `optimizer.step()` has been overridden after learning rate scheduler "
"initialization. Please, make sure to call `optimizer.step()` before "
"`lr_scheduler.step()`. See more details at "
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
# Just check if there were two first lr_scheduler.step() calls before optimizer.step()
elif self.optimizer._step_count < 1:
warnings.warn("Detected call of `lr_scheduler.step()` before `optimizer.step()`. "
"In PyTorch 1.1.0 and later, you should call them in the opposite order: "
"`optimizer.step()` before `lr_scheduler.step()`. Failure to do this "
"will result in PyTorch skipping the first value of the learning rate schedule. "
"See more details at "
"https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning)
self._step_count += 1
class _enable_get_lr_call:
def __init__(self, o):
self.o = o
def __enter__(self):
self.o._get_lr_called_within_step = True
return self
def __exit__(self, type, value, traceback):
self.o._get_lr_called_within_step = False
with _enable_get_lr_call(self):
if epoch is None:
self.last_epoch += 1
values = self.get_lr() #获取新的学习率,每个学习率函数分别实现不同的代码
else:
warnings.warn(EPOCH_DEPRECATION_WARNING, UserWarning)
self.last_epoch = epoch
if hasattr(self, "_get_closed_form_lr"):
values = self._get_closed_form_lr()
else:
values = self.get_lr()
for i, data in enumerate(zip(self.optimizer.param_groups, values)):
param_group, lr = data
param_group['lr'] = lr #更新optimizer学习率参数
self.print_lr(self.verbose, i, lr, epoch)
self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn("To get the last learning rate computed by the scheduler, "
"please use `get_last_lr()`.", UserWarning)
if self.last_epoch == 0:
return self.base_lrs
elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
return [group['lr'] + (base_lr - self.eta_min) *
(1 - math.cos(math.pi / self.T_max)) / 2
for base_lr, group in
zip(self.base_lrs, self.optimizer.param_groups)]
return [(1 + math.cos(math.pi * self.last_epoch / self.T_max)) /
(1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) *
(group['lr'] - self.eta_min) + self.eta_min
for group in self.optimizer.param_groups]