# https://www.cnblogs.com/adong7639/p/9145911.html
# The linked post is very well written.
'''
This script explains batch normalization in CNNs.
'''
import torch
import torch.nn as nn
import copy
class Net(nn.Module):
    def __init__(self, dim, pretrained=False):
        super(Net, self).__init__()
        self.bn = nn.BatchNorm2d(dim)
        if pretrained:
            self.pretrained()

    def forward(self, input):
        return self.bn(input)

    def pretrained(self):
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)
def train():
    dim = 3
    model = Net(dim)
    print(sum(p.numel() for p in model.parameters() if p.requires_grad))
    for p in model.parameters():
        print(p, p.requires_grad)
    '''
    For a batch normalization layer over a CNN feature map with 3 channels, the BN layer
    has 6 learnable parameters: gamma and beta (one of each per channel). During training
    it is gamma and beta that get updated.
    6
    Parameter containing:
    tensor([0.2322, 0.9405, 0.9887], requires_grad=True) True
    Parameter containing:
    tensor([0., 0., 0.], requires_grad=True) True
    '''
    # model.eval()
    feature_map = torch.randn((2, 3, 2, 2))
    output1 = model(feature_map)
    state_dict = model.state_dict()
    for k, v in state_dict.items():
        print(k, v)
    '''
    bn.weight tensor([0.2860, 0.5986, 0.0594])
    bn.bias tensor([0., 0., 0.])
    bn.running_mean tensor([-0.2098, 0.1876, -0.3045])
    bn.running_var tensor([0.8099, 1.5140, 0.5880])
    bn.num_batches_tracked tensor(1)
    Printing the state dict shows that the batch normalization layer holds 5 entries.
    bn.weight corresponds to gamma in the paper and bn.bias to beta.
    bn.running_mean is the per-channel mean computed from the data of the current batch,
    and bn.running_var is the per-channel variance computed from the data of the current batch.
    '''
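    # A small sketch (not part of the original post): bn.weight / bn.bias are learnable
    # parameters, while bn.running_mean / bn.running_var / bn.num_batches_tracked are
    # buffers, i.e. they show up in state_dict() but not in model.parameters().
    print([k for k, _ in model.named_parameters()])  # ['bn.weight', 'bn.bias']
    print([k for k, _ in model.named_buffers()])  # ['bn.running_mean', 'bn.running_var', 'bn.num_batches_tracked']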
    print('bn.running_mean', state_dict['bn.running_mean'])
    print('bn.running_var', state_dict['bn.running_var'])
    #
    print(torch.mean(feature_map.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1))
    print(torch.var(feature_map.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1))
    '''
    bn.running_mean tensor([-0.2098, 0.1876, -0.3045])
    bn.running_var tensor([0.8099, 1.5140, 0.5880])
    tensor([-0.2098, 0.1876, -0.3045])
    tensor([0.8099, 1.5140, 0.5880])
    Of course this is what happens when the BN layer's momentum is set to 1, i.e. the
    current running statistics (running_mean and running_var) are determined entirely
    by the statistics of the current batch:
        statistic_t = (1 - momentum) * statistic_(t-1) + momentum * batch_statistic_t
    momentum determines the values of bn.running_mean and bn.running_var at the current step:
    (1) with momentum=1, the values are determined entirely by the statistics computed
        on the current batch;
    (2) in general the previous running statistics are not necessarily 0 0 0 / 1 1 1
        (here, since no parameter updates or iterative training happen, they are the
        initial values bn.running_mean tensor([0., 0., 0.]) and
        bn.running_var tensor([1., 1., 1.])); if momentum is set to 0, the model keeps
        bn.running_mean tensor([0., 0., 0.])
        bn.running_var tensor([1., 1., 1.]) unchanged forever;
    (3) with the default momentum=0.1:
        bn.running_mean tensor([0.0233, 0.0166, 0.0469])
        bn.running_var tensor([0.9961, 1.0899, 0.9974])
        tensor([0.2329, 0.1663, 0.4691])  <- the batch statistics computed directly on the tensor
        tensor([0.9615, 1.8986, 0.9738])
        bn.running_mean is exactly 0.1 times the batch mean (the initial running mean is 0),
        and bn.running_var is 0.9 * 1 + 0.1 * the batch variance.
    To recap how BN is computed: for a CNN input (i.e. the BN input is 4-dimensional),
    normalization is performed over the batch, H and W dimensions; this is also called
    spatial batch normalization.
    '''
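    # A minimal check (assuming the BN layer uses the default momentum=0.1 and has only
    # seen this single batch, so the running statistics start from 0 and 1): reproduce
    # bn.running_mean / bn.running_var by hand with the moving-average formula above.
    batch_mean = torch.mean(feature_map.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1)
    batch_var = torch.var(feature_map.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1)
    momentum = 0.1
    print((1 - momentum) * torch.zeros(dim) + momentum * batch_mean)  # expected bn.running_mean
    print((1 - momentum) * torch.ones(dim) + momentum * batch_var)  # expected bn.running_var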
if __name__ == '__main__':
    '''
    In a BN layer, bn.weight is generally randomly initialized, while bn.bias is
    initialized to all zeros.
    Suppose the values of the input feature map and the corresponding batch normalization
    parameters are known; compute the BN output.
    momentum=0.1 by default: 0.9 * (statistic at step t-1) + 0.1 * (statistic at step t)
    '''
    dim = 3
    momentum = 0.1
    model = Net(dim, True)
    input = torch.randn((2, 3, 2, 2))
    output1 = model(input)
    def bn_simple_train(input, model):
        '''
        :param input: CNN feature map of shape [batch_size, C, H, W]
        :return: the BN output computed by hand
        '''
        mean = torch.mean(input.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1)  # shape [dim]
        var = torch.var(input.permute(1, 0, 2, 3).contiguous().view(dim, -1), 1)  # shape [dim]
        init_mean = torch.zeros((dim))
        init_var = torch.ones((dim))
        run_mean = (1 - momentum) * init_mean + momentum * mean  # new running mean via moving average; computed during training to prepare for test data
        run_var = (1 - momentum) * init_var + momentum * var  # new running variance via moving average; computed during training to prepare for test data
        run_std = torch.sqrt(run_var + 1e-5)
        run_mean_exp = run_mean.view(1, input.shape[1], 1, 1).expand(input.shape)
        run_std_exp = run_std.view(1, input.shape[1], 1, 1).expand(input.shape)
        '''
        This tensor replication problem also took me a while to figure out:
        tensor1 = torch.tensor([1, 2, 3])
        we need a 2*3*2*2 tensor2 such that
        tensor2[:, 0, :, :] = 1
        tensor2[:, 1, :, :] = 2
        tensor2[:, 2, :, :] = 3
        Besides a for loop, built-in functions can also do this:
        first unsqueeze to (1, 3, 1, 1), then expand(2, 3, 2, 2).
        expand can only replicate along existing dimensions and cannot add new ones,
        so you have to unsqueeze to 4 dimensions first; during expand, the dimensions
        whose sizes already match are kept and the remaining (size-1) dimensions are replicated.
        '''
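        # A tiny illustration of the pattern described above (a sketch, not from the
        # original code):
        # t = torch.tensor([1., 2., 3.])
        # t4d = t.view(1, 3, 1, 1).expand(2, 3, 2, 2)  # t4d[:, i, :, :] == t[i]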
        # run_mean_exp = torch.zeros((2, 3, 2, 2))
        # for i in range(3):
        #     run_mean_exp[:, i, :, :] = run_mean[i]
        # run_std_exp = torch.zeros((2, 3, 2, 2))
        # for i in range(3):
        #     run_std_exp[:, i, :, :] = run_std[i]
        output2 = input - run_mean_exp
        output2 = output2 / run_std_exp
        init_weights = model.state_dict()['bn.weight']  # gamma
        init_bias = model.state_dict()['bn.bias']  # beta
        init_weights_exp = init_weights.view(1, input.shape[1], 1, 1).expand(input.shape)
        init_bias_exp = init_bias.view(1, input.shape[1], 1, 1).expand(input.shape)
        '''
        gamma and beta are the learnable parameters that keep being updated during
        training (in the backward pass).
        '''
        # init_weights_exp = torch.zeros((2, 3, 2, 2))
        # for i in range(3):
        #     init_weights_exp[:, i, :, :] = init_weights[i]
        #
        # init_bias_exp = torch.zeros((2, 3, 2, 2))
        # for i in range(3):
        #     init_bias_exp[:, i, :, :] = init_bias[i]
        output2 = output2 * init_weights_exp
        output2 = output2 + init_bias_exp
        return output2
    def bn_for_test(input, model):
        '''
        During testing, the BN layer's running_mean and running_var are fixed values and
        are no longer the statistics of the new validation data; in model.eval() mode
        these two buffers are frozen, and gamma and beta do not change either.
        :param input: CNN feature map of shape [batch_size, C, H, W]
        :param model: a Net whose BN parameters and buffers are used
        :return: the BN output computed by hand
        '''
        state_dict = model.state_dict()
        init_weights = state_dict['bn.weight']
        init_bias = state_dict['bn.bias']
        running_mean = state_dict['bn.running_mean']
        running_var = state_dict['bn.running_var']
        mean = running_mean.view(1, input.shape[1], 1, 1).expand(input.shape)
        var = running_var.view(1, input.shape[1], 1, 1).expand(input.shape)
        weights = init_weights.view(1, input.shape[1], 1, 1).expand(input.shape)
        bias = init_bias.view(1, input.shape[1], 1, 1).expand(input.shape)
        output = (input - mean) / torch.sqrt(var + 1e-5)
        output = output * weights + bias
        return output
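    # A minimal usage sketch (not part of the original post): run the two manual
    # implementations above against the module itself. bn_for_test should agree with
    # the module output once model.eval() freezes the running statistics, while
    # bn_simple_train follows the running-statistics path described above.
    output2 = bn_simple_train(input, model)
    print('manual (running stats) vs train-mode output:', torch.max(torch.abs(output1 - output2)))
    model.eval()
    output3 = bn_for_test(input, model)
    print('manual eval vs model.eval() output:', torch.max(torch.abs(model(input) - output3)))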