# ————————————————— 利用model.apply(weights_init)实现初始化
def weights_init(m):
    """Initialise a single module in place; meant for ``model.apply(weights_init)``.

    The kind of module is chosen by substring match on its class name:

    * ``Conv``: weight drawn from a normal distribution with mean 0 and
      std ``sqrt(2 / n)``, where ``n = prod(kernel_size) * out_channels``;
      bias set to 0.
    * ``BatchNorm``: weight set to 1, bias set to 0.
    * ``Linear``: weight drawn from a normal distribution with mean 0 and
      std 0.01; bias set to 1.

    A module that matches by name but lacks the expected attribute is left
    untouched instead of raising: a layer built with ``bias=False``, a norm
    layer built with ``affine=False``, or a container that merely has
    ``Conv`` in its class name.

    Args:
        m: the module passed in by ``Module.apply``.
    """
    classname = m.__class__.__name__
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)

    if 'Conv' in classname and hasattr(m, 'kernel_size'):
        # Multiply over every kernel dimension so 1-D and 3-D convolutions
        # work as well as 2-D ones.
        n = m.out_channels
        for k in m.kernel_size:
            n *= k
        if weight is not None:
            weight.data.normal_(0, math.sqrt(2. / n))
        if bias is not None:
            bias.data.zero_()
    elif 'BatchNorm' in classname:
        if weight is not None:
            weight.data.fill_(1)
        if bias is not None:
            bias.data.zero_()
    elif 'Linear' in classname:
        if weight is not None:
            weight.data.normal_(0, 0.01)
        if bias is not None:
            # Fill in place so the bias keeps its dtype and device.
            bias.data.fill_(1)
# ————————————————— 直接放在__init__构造函数中实现初始化
# Meant to run inside a model's __init__, once all sub-modules have been created.
for module in self.modules():
    if isinstance(module, nn.Conv2d):
        # Normal init with std = sqrt(2 / (kernel_h * kernel_w * out_channels)).
        kernel_h, kernel_w = module.kernel_size
        fan = kernel_h * kernel_w * module.out_channels
        module.weight.data.normal_(0, math.sqrt(2. / fan))
        if module.bias is not None:
            module.bias.data.zero_()
    elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm1d)):
        # Both BatchNorm variants get the same treatment: weight 1, bias 0.
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight.data)
        if module.bias is not None:
            module.bias.data.zero_()
# —————————————————
# Inside a custom layer's __init__: allocate the parameters, then initialise them.
# torch.empty allocates uninitialised storage in the current default dtype.
self.weight = Parameter(torch.empty(out_features, in_features))
self.bias = Parameter(torch.empty(out_features))
nn.init.xavier_uniform_(self.weight)
# The initialiser is spelled `zeros_`; `nn.init.zero_` does not exist.
nn.init.zeros_(self.bias)
# Generic constant fill: the first argument must be a tensor, the second the fill value.
nn.init.constant_(m, initm)
# Alternatives:
# nn.init.kaiming_uniform_()
# self.weight.data.normal_(std=0.001)
def separate_bn_prelu_params(model, ignored_params=None):
    """Split a model's parameters into a base group and a BatchNorm/PReLU group.

    Typical use is to give the BatchNorm/PReLU group its own optimizer
    settings (for example ``weight_decay=0``).

    Args:
        model: module whose ``modules()`` and ``parameters()`` are traversed.
        ignored_params: optional iterable of parameter ``id()`` values that
            must also be kept out of the base group. It is copied, never
            mutated, so repeated calls do not accumulate ids.

    Returns:
        A tuple ``(base_params, bn_prelu_params, ignored_params)``:
        ``base_params`` lists the parameters whose id is not in
        ``ignored_params``; ``bn_prelu_params`` lists the parameters of every
        ``BatchNorm2d``, ``BatchNorm1d`` and ``PReLU`` module;
        ``ignored_params`` is the list of ids passed in, extended with the
        ids of ``bn_prelu_params``.
    """
    ignored_params = [] if ignored_params is None else list(ignored_params)
    bn_prelu_params = []
    for m in model.modules():
        if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d, nn.PReLU)):
            params = list(m.parameters())
            ignored_params.extend(id(p) for p in params)
            bn_prelu_params.extend(params)
    # A set makes the membership test O(1) per parameter.
    ignored_ids = set(ignored_params)
    base_params = [p for p in model.parameters() if id(p) not in ignored_ids]
    return base_params, bn_prelu_params, ignored_params
# One SGD optimizer with three parameter groups that differ only in weight decay:
# the base group uses WEIGHT_DECAY, the FC head ten times that, and the
# BatchNorm/PReLU group none at all.
OPTIMIZER = optim.SGD(
    [
        dict(params=base_params, weight_decay=WEIGHT_DECAY),
        dict(params=fc_head_param, weight_decay=WEIGHT_DECAY * 10),
        dict(params=bn_prelu_params, weight_decay=0.0),
    ],
    lr=LR,
    momentum=MOMENTUM,
)  # optionally also pass nesterov=True
(参MobileFaceNet_Pytorch-master中参数分组weight_decay)
Note 1: PReLU(x) = max(0, x) + a * min(0, x), where a is a learnable parameter. When called without arguments, nn.PReLU() uses a single parameter a across all input channels. If called with nn.PReLU(nChannels), a separate a is used for each input channel.
Note 2: For good performance, weight decay should not be used when learning a.
Note 3: The default number of a to learn is 1, and the default initial value of a is 0.25.
from torch.optim import lr_scheduler
# define optimizers
# Collect the ids of every parameter that gets its own group below, so that
# base_params ends up holding only the remaining parameters of `net`.
ignored_params = list(map(id, net.linear1.parameters()))  # id() gives each object's identity
# ArcMargin.weight is a single parameter, so record its own id. Iterating over
# it would walk the tensor's rows and record ids of temporary row tensors.
ignored_params.append(id(ArcMargin.weight))
prelu_params_id = []  # not used in this snippet
prelu_params = []
for m in net.modules():
    if isinstance(m, nn.PReLU):
        ignored_params += list(map(id, m.parameters()))
        prelu_params += m.parameters()
# Build a real list: a bare filter() object can only be consumed once.
base_params = [p for p in net.parameters() if id(p) not in ignored_params]
optimizer_ft = optim.SGD([
    {'params': base_params, 'weight_decay': 4e-5},
    {'params': net.linear1.parameters(), 'weight_decay': 4e-4},
    {'params': ArcMargin.weight, 'weight_decay': 4e-4},
    {'params': prelu_params, 'weight_decay': 0.0}  # no weight decay on the PReLU slopes
], lr=0.1, momentum=0.9, nesterov=True)
# Multiply the learning rate by gamma when the epoch count reaches each milestone.
exp_lr_scheduler = lr_scheduler.MultiStepLR(optimizer_ft, milestones=[36, 52, 58], gamma=0.1)
'''
>>> for epoch in range(100):
>>>     train(...)
>>>     validate(...)
>>>     scheduler.step()  # since PyTorch 1.1.0, call this after the epoch's optimizer.step() calls
'''
(参考face_evoLVe_Pytorch-master中)
自定义schedule
def schedule_lr(optimizer):
    """Divide the learning rate of every parameter group by 10, then print the optimizer.

    Args:
        optimizer: object exposing ``param_groups``, a sequence of dicts that
            each carry an ``'lr'`` entry. The entries are updated in place.
    """
    for group in optimizer.param_groups:
        group['lr'] /= 10.

    # Show the optimizer so the new learning rates are visible in the log.
    print(optimizer)
方法一:利用model.modules()和obj.__class__ (更普适)
# Module.children()与Module.modules()都是返回网络模型里的组成元素,但children()返回最外层元素,modules()返回所有级别的元素
# 下面的关键词 'model'(见 if 'model' in ...),其实源于模型定义文件(如 model_resnet.py)文件名中的 'model':该文件中自定义的所有Model子类,其类名字符串都会带有前缀 'model_resnet',所以可通过这种方式一次性筛选出自定义的模块
def separate_irse_bn_paras(model):
    """Split parameters into BatchNorm and non-BatchNorm groups by class path.

    Every module from ``model.modules()`` is classified by the text of
    ``str(module.__class__)``:

    * contains ``'model'`` or ``'container'``: the module is skipped. Its
      children are still visited on their own; parameters it owns directly
      are not collected.
    * contains ``'batchnorm'``: its parameters go to the BatchNorm group.
    * anything else: its parameters go to the non-BatchNorm group.

    Returns:
        ``(paras_only_bn, paras_no_bn)`` as two lists of parameters.
    """
    bn_group, other_group = [], []
    for module in model.modules():
        # e.g. str(nn.BatchNorm2d) is "<class 'torch.nn.modules.batchnorm.BatchNorm2d'>"
        class_path = str(module.__class__)
        if 'model' in class_path or 'container' in class_path:
            # User-defined wrappers and Sequential-style containers are skipped.
            continue
        target = bn_group if 'batchnorm' in class_path else other_group
        target.extend(module.parameters())
    return bn_group, other_group
方法二:调用modules.parameters和named_parameters()
但是本质上,parameters()是根据named_parameters()获取,named_parameters()是根据modules()获取。使用此方法的前提是,须按下文1,2中的方式定义模型,或者利用Sequential+OrderedDict定义模型。
def separate_resnet_bn_paras(model):
    """Split parameters into BatchNorm and non-BatchNorm groups by parameter name.

    A parameter belongs to the BatchNorm group when its name from
    ``model.named_parameters()`` contains the substring ``'bn'``. Every other
    parameter from ``model.parameters()`` goes to the second group.

    Returns:
        ``(paras_only_bn, paras_no_bn)`` as two lists of parameters.
    """
    bn_group = [param for name, param in model.named_parameters() if 'bn' in name]
    bn_ids = {id(param) for param in bn_group}
    other_group = [param for param in model.parameters() if id(param) not in bn_ids]
    return bn_group, other_group
两种方法的区别
参数分组的区别,其实对应了模型构造时的区别。举例:
构造ResNet的basic block,在__init__()函数中定义了
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = BatchNorm2d(planes)
self.relu = ReLU(inplace = True)
…
在forward()中定义
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
…
对ResNet取model.named_parameters()返回的pname形如:
‘layer1.0.conv1.weight’
‘layer1.0.bn1.weight’
‘layer1.0.bn1.bias’
# layer对应conv2_x, …, conv5_x; '0’对应各layer中的block索引,比如conv2_x有3个block,对应索引为layer1.0, …, layer1.2; 'conv1’就是__init__()中定义的self.conv1
若构造model时采用了Sequential(),则model.named_parameters()返回的pname形如:'body.3.res_layer.1.weight',此处的1.weight实际对应了BN的weight,无法通过pname.find('bn')找到该模块。
# Unnamed Sequential: children are addressed by position (0, 1, 2, ...), so the
# first BatchNorm2d's parameters are named "...res_layer.1.weight" / "...1.bias".
self.res_layer = Sequential(
    Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
    BatchNorm2d(depth),
    # ReLU's only argument is the `inplace` flag, not a channel count; the
    # previous ReLU(depth) therefore just meant inplace=True for non-zero depth.
    # NOTE(review): if a per-channel activation was intended, use PReLU(depth).
    ReLU(inplace=True),
    Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
    BatchNorm2d(depth),
)
针对4中的情况,两种解决办法:利用OrderedDict修饰Sequential,或利用方法一
# Passing an OrderedDict gives each child an explicit name; plain ASCII quotes
# are required here (typographic quotes are a syntax error).
downsample = Sequential(OrderedDict([
    ('conv_ds', conv1x1(self.inplanes, planes * block.expansion, stride)),
    ('bn_ds', BatchNorm2d(planes * block.expansion)),
]))
# 如此,相应模块的pname将会带有'conv_ds'、'bn_ds'字样