Translated from https://www.kaggle.com/rhtsingh/guide-to-huggingface-schedulers-differential-lrs
import numpy as np
import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import plotly.offline as pyo
pio.templates.default='plotly_white'
import torch
import torch.nn as nn
import transformers
from transformers import AutoModel
from transformers import AdamW
from transformers import (
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup
)
epochs = 10
class Net(nn.Module):
    def __init__(self, model_name):
        super(Net, self).__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(768, 1)  # single-output regression head on top of the 768-d pooled output

    def forward(self, input_ids):
        outputs = self.roberta(input_ids)
        sequence_output = outputs[1]  # pooled representation of the first token
        return self.classifier(sequence_output)
1. The intuition behind "differential learning rates" is that the embeddings start out in the first layer with no contextual information. As they move deeper into the network, each layer adds more general, context-dependent information. As we approach the last layers, however, the representations start to collect information specific to the transformer's pre-training tasks (e.g. masked language modeling (MLM) and next sentence prediction (NSP)).
2. We can therefore fine-tune the individual layers with different learning rates: a lower rate for the early layers, a somewhat higher rate for the middle layers, and a higher rate still for the top layers.
3. We can also train the task-specific layer with a completely different, much higher learning rate than the transformer itself, because it has not been pre-trained and needs to learn faster (see the sketch below).
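To make the relationship between these rates concrete, here is a tiny sketch of the grouping used later in get_optimizer_params: a ULMFiT-style factor of 2.6 around a base learning rate of 5e-5 for the transformer layers, and a much larger rate for the untrained head (an illustration only, not additional tuning advice).
base_lr = 5e-5
group_lrs = {
    'roberta layers 1-4':  base_lr / 2.6,  # earliest layers: smallest lr
    'roberta layers 5-8':  base_lr,        # middle layers: base lr
    'roberta layers 9-12': base_lr * 2.6,  # top layers: largest transformer lr
    'regression head':     1e-3,           # task-specific layer: much higher lr
}
for name, lr in group_lrs.items():
    print(f'{name}: {lr:.2e}')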
We will compare three strategies for setting the learning rates:
1. "Uniform learning rate for the full model" (s) — a single learning rate (5e-5 in this example) for the whole model, which is what is usually done.
2. "Differential learning rates for the transformer and the task-specific layer" (i) — two different learning rates: one uniform rate for the full transformer model (RoBERTa) and another for the task-specific (regression) layer.
3. "Differential learning rates across the transformer layers and the task-specific layer" (a) — different learning rates for different transformer layers. I group layers 1-4, 5-8 and 9-12 and set their learning rates as discussed above. On top of that, the regressor gets its own, much higher learning rate.
def get_optimizer_params(model, type='s'):
    # Build optimizer parameter groups with differential learning rates and weight decay.
    param_optimizer = list(model.named_parameters())
    learning_rate = 5e-5
    no_decay = ['bias', 'gamma', 'beta']  # parameters excluded from weight decay
    if type == 's':
        # Strategy (s): a single uniform learning rate for the full model.
        optimizer_parameters = filter(lambda x: x.requires_grad, model.parameters())
    elif type == 'i':
        # Strategy (i): one lr for the whole transformer, a higher lr for the task head.
        optimizer_parameters = [
            {'params': [p for n, p in model.roberta.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
             'lr': 1e-3, 'weight_decay': 0.01},
        ]
    elif type == 'a':
        # Strategy (a): group the 12 encoder layers and scale the lr per group.
        group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.']
        group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.']
        group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.']
        group_all = group1 + group2 + group3
        optimizer_parameters = [
            # embeddings and any parameters outside the encoder layers
            {'params': [p for n, p in model.roberta.named_parameters()
                        if not any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
             'weight_decay': 0.01},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
             'weight_decay': 0.01, 'lr': learning_rate / 2.6},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
             'weight_decay': 0.01, 'lr': learning_rate},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
             'weight_decay': 0.01, 'lr': learning_rate * 2.6},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if any(nd in n for nd in no_decay) and not any(nd in n for nd in group_all)],
             'weight_decay': 0.0},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
             'weight_decay': 0.0, 'lr': learning_rate / 2.6},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
             'weight_decay': 0.0, 'lr': learning_rate},
            {'params': [p for n, p in model.roberta.named_parameters()
                        if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
             'weight_decay': 0.0, 'lr': learning_rate * 2.6},
            # task-specific (regression) head: much higher learning rate
            {'params': [p for n, p in model.named_parameters() if "roberta" not in n],
             'lr': 1e-3},
        ]
    return optimizer_parameters
This defines a default layout in which 'title' is the only parameter that changes; we will reuse it to lay out every figure.
def get_default_layout(title):
    font_style = 'Courier New'
    layout = {}
    layout['height'] = 400
    layout['width'] = 1200
    layout['template'] = 'plotly_white'
    layout['dragmode'] = 'zoom'
    layout['hovermode'] = 'x'
    layout['hoverlabel'] = {
        'font_size': 14,
        'font_family': font_style
    }
    layout['font'] = {
        'size': 14,
        'family': font_style,
        'color': 'rgb(128, 128, 128)'
    }
    layout['xaxis'] = {
        'title': 'Epochs',
        'showgrid': True,
        'type': 'linear',
        'categoryarray': None,
        'gridwidth': 1,
        'ticks': 'outside',
        'showline': True,
        'showticklabels': True,
        'tickangle': 0,
        'tickmode': 'array'
    }
    layout['yaxis'] = {
        'title': 'Learning Rate',
        'exponentformat': 'none',
        'showgrid': True,
        'type': 'linear',
        'categoryarray': None,
        'gridwidth': 1,
        'ticks': 'outside',
        'showline': True,
        'showticklabels': True,
        'tickangle': 0,
        'tickmode': 'array'
    }
    layout['title'] = {
        'text': title,
        'x': 0.5,
        'y': 0.95,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {
            'family': font_style,
            'size': 14,
            'color': 'black'
        }
    }
    layout['showlegend'] = True
    layout['legend'] = {
        'x': 0.1,
        'y': 1.1,
        'orientation': 'h',
        'itemclick': 'toggleothers',
        'font': {
            'family': font_style,
            'size': 14,
            'color': 'black'
        }
    }
    return go.Layout(layout)
Create a schedule with a constant learning rate, using the learning rate set in the optimizer.
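Under the hood this is essentially a LambdaLR whose multiplier is always 1, so the optimizer's learning rate never changes. A minimal sketch of an equivalent schedule (my approximation, not the library's source):
from torch.optim.lr_scheduler import LambdaLR

def constant_schedule(optimizer):
    # the multiplier is 1.0 at every step, so the lr stays at the value set in the optimizer
    return LambdaLR(optimizer, lambda step: 1.0)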
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Constant Schedule')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Constant Schedule')
go.Figure(data=[trace1, trace2], layout=layout)
Create a schedule with a constant learning rate, preceded by a warmup period during which the learning rate increases linearly from 0 to the initial lr set in the optimizer.
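Roughly speaking, the step-wise multiplier ramps linearly from 0 to 1 over num_warmup_steps and then stays at 1. A small sketch of that multiplier (an approximation of what the scheduler computes, not the library's code):
def constant_with_warmup_multiplier(step, num_warmup_steps):
    # linear ramp 0 -> 1 during warmup, then constant
    if step < num_warmup_steps:
        return step / max(1.0, num_warmup_steps)
    return 1.0

# with num_warmup_steps=3 the first steps give roughly 0.0, 0.33, 0.67, 1.0, 1.0, ...
print([round(constant_with_warmup_multiplier(s, 3), 2) for s in range(6)])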
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)
Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer and 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
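After the warmup, the multiplier follows half a cosine wave from 1 down to 0 over the remaining steps. A rough sketch of the multiplier, assuming the default num_cycles=0.5 (my approximation of the library's behaviour, not its source):
import math

def cosine_with_warmup_multiplier(step, num_warmup_steps, num_training_steps, num_cycles=0.5):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)  # linear warmup
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)))  # cosine decay towards 0

print([round(cosine_with_warmup_multiplier(s, 3, 10), 3) for s in range(10)])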
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [[] for i in range(4)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 1-4',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 5-8',
marker=dict(color='#a678de'),
)
trace3 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates3,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 9-12',
marker=dict(color='#6ad49b'),
)
trace4 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates4,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer and 0, with several hard restarts, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
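With hard restarts, the post-warmup progress is cut into num_cycles cosine segments, each of which decays from the full learning rate to 0 and then jumps back up. A rough sketch of the multiplier (my approximation of the library's behaviour, not its source):
import math

def cosine_hard_restarts_multiplier(step, num_warmup_steps, num_training_steps, num_cycles=1):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)  # linear warmup
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    if progress >= 1.0:
        return 0.0
    # within each cycle the multiplier falls from 1 to 0 along a cosine curve, then restarts at 1
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((num_cycles * progress) % 1.0))))

print([round(cosine_hard_restarts_multiplier(s, 3, 10, num_cycles=5), 3) for s in range(10)])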
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, num_cycles=5)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, num_cycles=5)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, num_cycles=5)
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [[] for i in range(4)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 1-4',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 5-8',
marker=dict(color='#a678de'),
)
trace3 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates3,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 9-12',
marker=dict(color='#6ad49b'),
)
trace4 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates4,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
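Here the multiplier ramps up linearly during warmup and then decays linearly to 0 at num_training_steps. A small sketch of that multiplier (an approximation, not the library's code):
def linear_with_warmup_multiplier(step, num_warmup_steps, num_training_steps):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)  # linear warmup
    # linear decay from 1 at the end of warmup to 0 at num_training_steps
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))

print([round(linear_with_warmup_multiplier(s, 3, 10), 3) for s in range(11)])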
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10)
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [[] for i in range(4)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 1-4',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 5-8',
marker=dict(color='#a678de'),
)
trace3 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates3,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 9-12',
marker=dict(color='#6ad49b'),
)
trace4 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates4,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)
Create a schedule with a learning rate that decreases from the initial lr set in the optimizer to the lr defined by lr_end, following a polynomial decay, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
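After the warmup, the learning rate is interpolated between the initial lr and lr_end using a (1 - progress)**power term. A rough sketch in terms of the lr itself rather than a multiplier (assuming an lr_end of 1e-7; an approximation of the library's behaviour, not its source):
def polynomial_decay_lr(step, num_warmup_steps, num_training_steps, lr_init=5e-5, lr_end=1e-7, power=2.0):
    if step < num_warmup_steps:
        return lr_init * step / max(1, num_warmup_steps)  # linear warmup
    if step > num_training_steps:
        return lr_end  # clamp once training is over
    pct_remaining = 1 - (step - num_warmup_steps) / (num_training_steps - num_warmup_steps)
    return (lr_init - lr_end) * pct_remaining ** power + lr_end  # polynomial decay towards lr_end

print([round(polynomial_decay_lr(s, 3, 10), 8) for s in range(11)])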
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, power=2)
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])
trace = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='LR',
marker=dict(color='#3498d5'),
)
layout=get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, power=2)
learning_rates1, learning_rates2 = [[] for i in range(2)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
'betas': (0.9, 0.999),
'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=10, power=2)
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [[] for i in range(4)]
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])
trace1 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates1,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 1-4',
marker=dict(color='#3498d5'),
)
trace2 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates2,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 5-8',
marker=dict(color='#a678de'),
)
trace3 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates3,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Roberta Layers 9-12',
marker=dict(color='#6ad49b'),
)
trace4 = go.Scatter(
x=np.arange(0, epochs, 1),
y=learning_rates4,
texttemplate="%{y:.6f}",
mode='markers+lines',
name='Regressor',
marker=dict(color='#f29191'),
)
layout=get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)