接上文
代码使用
def ResNet18ADF(noise_variance=1e-3, min_variance=1e-3):
    """Build the ADF ResNet-18 (``BasicBlock``, ``[2, 2, 2, 2]``) with 10 classes.

    Args:
        noise_variance: Forwarded to ``ResNet`` as ``noise_variance``.
        min_variance: Forwarded to ``ResNet`` as ``min_variance``.

    Returns:
        The constructed ``ResNet`` instance.
    """
    # Forward the caller's values. Previously both were hard-coded to 1e-3 in
    # the call below, so the function arguments were silently ignored.
    return ResNet(
        BasicBlock,
        [2, 2, 2, 2],
        num_classes=10,
        noise_variance=noise_variance,
        min_variance=min_variance,
        initialize_msra=False,
    )
定义模型,其中ResNet定义为:
class ResNet(nn.Module):
    """ResNet whose layers carry a (mean, variance) pair instead of one tensor.

    Args:
        block: Residual block class; called as
            ``block(in_planes, planes, stride, keep_variance_fn=...)`` and
            expected to expose an ``expansion`` attribute.
        num_blocks: Sequence of four ints, the number of blocks per stage.
        num_classes: Output size of the final linear layer.
        noise_variance: Constant variance attached to every input element.
        min_variance: Passed as ``min_variance`` to ``keep_variance``.
        initialize_msra: Accepted for interface compatibility; not read here.
    """

    def __init__(self, block, num_blocks, num_classes=10, noise_variance=1e-3,
                 min_variance=1e-3, initialize_msra=False):
        super().__init__()

        # Shared by every ADF layer below so they all use the same min_variance.
        self.keep_variance_fn = lambda x: keep_variance(x, min_variance=min_variance)
        self._noise_variance = noise_variance

        # Running input-channel count, updated by _make_layer as stages are built.
        self.in_planes = 64

        self.conv1 = adf.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
                                keep_variance_fn=self.keep_variance_fn)
        self.bn1 = adf.BatchNorm2d(64, keep_variance_fn=self.keep_variance_fn)
        self.ReLU = adf.ReLU(keep_variance_fn=self.keep_variance_fn)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2,
                                       keep_variance_fn=self.keep_variance_fn)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2,
                                       keep_variance_fn=self.keep_variance_fn)
        self.linear = adf.Linear(512 * block.expansion, num_classes,
                                 keep_variance_fn=self.keep_variance_fn)
        self.AvgPool2d = adf.AvgPool2d(keep_variance_fn=self.keep_variance_fn)

    def _make_layer(self, block, planes, num_blocks, stride, keep_variance_fn=None):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``.

        ``keep_variance_fn`` is handed to every block. When it is ``None`` the
        instance-wide ``self.keep_variance_fn`` is used instead (previously the
        argument was ignored and the instance function was always used).
        """
        if keep_variance_fn is None:
            keep_variance_fn = self.keep_variance_fn

        layers = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, block_stride,
                                keep_variance_fn=keep_variance_fn))
            # The next block consumes what this one produces.
            self.in_planes = planes * block.expansion
        return adf.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return the final linear layer's output.

        The input is used as the mean; its variance is a tensor of the same
        shape filled with ``noise_variance``.
        """
        inputs_mean = x
        inputs_variance = torch.zeros_like(inputs_mean) + self._noise_variance

        # Every ADF layer takes (mean, variance) and returns a tuple, hence the
        # repeated ``*`` unpacking.
        out = self.ReLU(*self.bn1(*self.conv1(inputs_mean, inputs_variance)))
        out = self.layer1(*out)
        out = self.layer2(*out)
        out = self.layer3(*out)
        out = self.layer4(*out)
        # The trailing 4 is passed positionally to adf.AvgPool2d
        # (presumably the pooling kernel size -- confirm against adf).
        out = self.AvgPool2d(*out, 4)

        # Flatten everything but the batch dimension before the classifier.
        out_mean = out[0].view(out[0].size(0), -1)
        out_var = out[1].view(out[1].size(0), -1)
        return self.linear(out_mean, out_var)
其中,*的作用是:
在Python中,一个星号(*)通常被用来进行解包(unpacking)操作。当一个星号出现在函数调用中的一个参数前面时,它会告诉Python将该参数解包成多个独立的值,然后再将这些值传递给函数。
当一个星号出现在函数定义的形参名前面时(例如 def f(*args)),它表示该形参接收数量可变的位置参数。这被称为可变参数列表(variable-length argument list)或者不定长参数(arbitrary argument)。这样的语法允许函数接受不定数量的参数。
下面是一个例子,展示了如何使用星号来定义一个可变参数列表:
def my_func(*args):
    """Print every positional argument on its own line."""
    # ``args`` is a tuple holding all positional arguments of the call.
    for value in args:
        print(value)


# Demo call: prints 1 to 5, one number per line.
my_func(1, 2, 3, 4, 5)
这个函数可以接受任意数量的参数,并将它们打印出来。在函数中,参数args被定义为一个元组,其中包含了传递给函数的所有参数。
(可以观察到,self.layer等模型层输出的out应该包含两部分)
ResNet中比较关键的是_make_layer,其中用到了adf.Sequential:
class Sequential(nn.Module):
    """Ordered container whose submodules each map (mean, variance) to a new pair.

    Construct it either from positional modules, which are registered under
    the names ``"0"``, ``"1"``, ..., or from a single ``OrderedDict`` that maps
    names to modules.
    """

    def __init__(self, *args):
        super().__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            entries = args[0].items()
        else:
            entries = ((str(position), module) for position, module in enumerate(args))
        for name, module in entries:
            self.add_module(name, module)

    def _get_item_by_idx(self, iterator, idx):
        """Get the idx-th item of the iterator"""
        length = len(self)
        position = operator.index(idx)
        if position < -length or position >= length:
            raise IndexError('index {} is out of range'.format(position))
        # Map a negative index onto its non-negative equivalent before skipping.
        return next(islice(iterator, position % length, None))

    def __getitem__(self, idx):
        if not isinstance(idx, slice):
            return self._get_item_by_idx(self._modules.values(), idx)
        # A slice yields a new container holding the selected (name, module) pairs.
        selected = list(self._modules.items())[idx]
        return Sequential(OrderedDict(selected))

    def __setitem__(self, idx, module):
        name = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, name, module)

    def __delitem__(self, idx):
        if isinstance(idx, slice):
            doomed = list(self._modules.keys())[idx]
        else:
            doomed = [self._get_item_by_idx(self._modules.keys(), idx)]
        for name in doomed:
            delattr(self, name)

    def __len__(self):
        return len(self._modules)

    def __dir__(self):
        # Hide the purely numeric names used for positionally added modules.
        inherited = super().__dir__()
        return [name for name in inherited if not name.isdigit()]

    def forward(self, inputs, inputs_variance):
        """Feed the (mean, variance) pair through every submodule in order."""
        mean, variance = inputs, inputs_variance
        for layer in self._modules.values():
            mean, variance = layer(mean, variance)
        return mean, variance
其中,add_module()作用是:
add_module() 是 PyTorch 中 nn.Module 类的一个方法,用于将子模块添加到当前模块中。它接受两个参数:
name:子模块的名称。 module:要添加的子模块。 下面是一个例子,展示了如何使用 add_module()
方法向一个模块中添加子模块:
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(32 * 28 * 28, 1024)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(1024, 10)
        self.softmax = nn.Softmax(dim=1)
        # 使用 add_module() 方法添加一个 BatchNorm2d 模块
        self.bn = nn.BatchNorm2d(32)
        self.add_module('batch_norm', self.bn)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn(x)
        x = self.relu2(x)
        x = x.view(-1, 32 * 28 * 28)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
在这个例子中,我们创建了一个自定义的模型 MyModel,并使用 add_module() 方法将一个 BatchNorm2d 模块添加到模型中。这个方法可以方便地管理模型中的子模块,并在需要时进行访问和修改。
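在上面的代码中,确实可以直接使用 self.bn 调用 BatchNorm2d 模块,而不需要使用 add_module() 方法:把一个 nn.Module 赋值给 self 的属性时,PyTorch 已经自动以属性名(这里是 'bn')注册了该子模块,因此示例中的 add_module('batch_norm', self.bn) 只是把同一个模块以第二个名字再注册了一次。
add_module() 真正有用的场合是子模块的名称需要在运行时以字符串形式生成,例如上面的 Sequential 用 str(idx) 作为名称逐个注册传入的模块。
无论用哪种方式注册,子模块的名称都与其实例绑定,可以在模型的其他地方按名称访问和修改,这在模型较复杂的情况下非常有用。举个例子,如果我们想要访问模型中的所有 BatchNorm2d 模块,可以使用 named_modules()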
方法来获取所有模块及其名称,并在其中筛选出 BatchNorm2d 模块:for name, module in my_model.named_modules(): if isinstance(module, nn.BatchNorm2d): print(f'{name}: {module}') 这将输出模型中所有的 BatchNorm2d 模块及其名称。
另外,add_module() 方法还可以与 register_parameter()
方法和其他方法一起使用,方便地管理模型的参数和其他属性。因此,在设计复杂的模型时,使用 add_module()
方法可以提高代码的可读性和可维护性。
在forward中,读取add后的module进行计算:
def forward(self, inputs, inputs_variance):
    """Feed the (mean, variance) pair through every submodule in order."""
    mean, variance = inputs, inputs_variance
    # self._modules holds the submodules registered through add_module().
    for layer in self._modules.values():
        mean, variance = layer(mean, variance)
    return mean, variance
ResNet中比较关键的是_make_layer,其中用到了block(),指的是:
class BasicBlock(nn.Module):
    """Residual block of two 3x3 ADF convolutions operating on (mean, variance).

    Args:
        in_planes: Number of input channels.
        planes: Number of channels produced by both convolutions.
        stride: Stride of the first convolution (and of the shortcut projection).
        keep_variance_fn: Passed unchanged to every ADF layer in the block.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1, keep_variance_fn=None):
        super().__init__()
        self.keep_variance_fn = keep_variance_fn
        out_planes = self.expansion * planes

        self.conv1 = adf.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1,
                                bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn1 = adf.BatchNorm2d(planes, keep_variance_fn=self.keep_variance_fn)
        self.conv2 = adf.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1,
                                bias=False, keep_variance_fn=self.keep_variance_fn)
        self.bn2 = adf.BatchNorm2d(planes, keep_variance_fn=self.keep_variance_fn)
        self.ReLU = adf.ReLU(keep_variance_fn=self.keep_variance_fn)

        if stride != 1 or in_planes != out_planes:
            # Shapes differ between the two branches: project the input with a
            # 1x1 convolution followed by batch norm.
            self.shortcut = adf.Sequential(
                adf.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                           bias=False, keep_variance_fn=self.keep_variance_fn),
                adf.BatchNorm2d(out_planes, keep_variance_fn=self.keep_variance_fn),
            )
        else:
            # An empty Sequential returns its (mean, variance) input unchanged.
            self.shortcut = adf.Sequential()

    def forward(self, inputs_mean, inputs_variance):
        """Return the (mean, variance) pair produced by the residual block."""
        hidden = self.conv1(inputs_mean, inputs_variance)
        hidden = self.bn1(*hidden)
        hidden = self.ReLU(*hidden)
        main_mean, main_var = self.bn2(*self.conv2(*hidden))

        skip_mean, skip_var = self.shortcut(inputs_mean, inputs_variance)

        # Means and variances of the two branches are summed element-wise.
        return self.ReLU(main_mean + skip_mean, main_var + skip_var)
BasicBlock里把mean和var合成了一个out,而Sequential选择分开传播mean和var,有什么说法吗?
可以看到BasicBlock使用了adf.Conv2d():
class Conv2d(_ConvNd):
    """2-D convolution applied to a mean tensor and a variance tensor.

    The mean is convolved with ``weight`` (plus ``bias``); the variance is
    convolved with ``weight ** 2`` and no bias. If ``keep_variance_fn`` is
    given, it is applied to the resulting variance.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True,
                 keep_variance_fn=None, padding_mode='zeros'):
        self._keep_variance_fn = keep_variance_fn
        super().__init__(
            in_channels, out_channels,
            _pair(kernel_size), _pair(stride), _pair(padding), _pair(dilation),
            False,     # transposed
            _pair(0),  # output_padding
            groups, bias, padding_mode)

    def forward(self, inputs_mean, inputs_variance):
        """Return ``(outputs_mean, outputs_variance)`` for the given pair."""
        geometry = (self.stride, self.padding, self.dilation, self.groups)
        outputs_mean = F.conv2d(inputs_mean, self.weight, self.bias, *geometry)
        # Same learnable weights, squared element-wise; the bias is left out.
        outputs_variance = F.conv2d(inputs_variance, self.weight ** 2, None, *geometry)
        if self._keep_variance_fn is None:
            return outputs_mean, outputs_variance
        return outputs_mean, self._keep_variance_fn(outputs_variance)
其中,对于均值和方差是共享权重的:
outputs_mean, outputs_variance两个卷积操作共享权重吗?
是的,outputs_mean 和 outputs_variance 两个卷积操作在这个类中是共享权重的。在这个类的 __init__() 方法中,使用父类 _ConvNd 的构造函数初始化了模型的卷积核权重和偏置,并在 forward() 方法中对这些权重和偏置进行了调用。
具体来说,forward() 中两次调用 F.conv2d():第一次以 inputs_mean 为输入、以 self.weight 为卷积核(并加上 self.bias);第二次以 inputs_variance 为输入、以 self.weight ** 2 为卷积核,且不加偏置。两次卷积使用的是同一组可学习参数 self.weight,所以没有引入额外的参数;但作用在方差上的卷积核是权重的逐元素平方,因此两次卷积实际使用的核的数值并不相同。
需要注意的是,在卷积神经网络中,共享权重是一种常见的技术,它可以减少模型中需要训练的参数数量,从而减轻过拟合的风险。但是,共享权重也可能会限制模型的表达能力,因此需要根据具体问题和数据集的情况进行权衡和选择。
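这个代码,使用 $weight^2$ 作为方差的权重,有什么道理嘛?勉强可以理解吧!卷积对输入是线性运算,输出的每个元素是 $\sum_i w_i x_i + b$;如果假设各输入元素相互独立,则 $\mathrm{Var}\left(\sum_i w_i x_i + b\right) = \sum_i w_i^2 \, \mathrm{Var}(x_i)$,所以方差要用权重的平方来传播,而偏置不影响方差。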
可以看到BasicBlock中使用了adf.BatchNorm2d():
class BatchNorm2d(nn.Module):
    """Batch normalisation for a (mean, variance) pair.

    The mean goes through ``F.batch_norm``. The variance is multiplied
    per channel by ``weight ** 2`` when the layer is affine and is passed
    through unchanged otherwise; ``keep_variance_fn`` (if given) is then
    applied to it.

    Args:
        num_features: Number of channels (size of ``weight`` / ``bias`` and of
            the running statistics).
        eps: Passed to ``F.batch_norm``.
        momentum: Running-statistics update factor; ``None`` selects a
            cumulative moving average.
        affine: Whether to create learnable ``weight`` and ``bias``.
        track_running_stats: Whether to keep running mean/variance buffers.
        keep_variance_fn: Optional callable applied to the output variance.
    """

    _version = 2
    __constants__ = ['track_running_stats', 'momentum', 'eps', 'weight', 'bias',
                     'running_mean', 'running_var', 'num_batches_tracked']

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
                 track_running_stats=True, keep_variance_fn=None):
        super().__init__()
        self._keep_variance_fn = keep_variance_fn
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats

        if self.affine:
            # Left uninitialised here; reset_parameters() fills them in.
            self.weight = Parameter(torch.Tensor(num_features))
            self.bias = Parameter(torch.Tensor(num_features))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)

        if self.track_running_stats:
            self.register_buffer('running_mean', torch.zeros(num_features))
            self.register_buffer('running_var', torch.ones(num_features))
            self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
        else:
            # Running statistics are buffers, not parameters, so their empty
            # placeholders are registered as buffers as well.
            self.register_buffer('running_mean', None)
            self.register_buffer('running_var', None)
            self.register_buffer('num_batches_tracked', None)

        self.reset_parameters()

    def reset_running_stats(self):
        """Reset the running mean/variance and the batch counter, if tracked."""
        if self.track_running_stats:
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self):
        """Reset running statistics and re-initialise ``weight`` / ``bias``."""
        self.reset_running_stats()
        if self.affine:
            nn.init.uniform_(self.weight)
            nn.init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def forward(self, inputs_mean, inputs_variance):
        """Return ``(outputs_mean, outputs_variance)`` for the given pair."""
        # Default factor; replaced below by 1/n when a cumulative moving
        # average is requested (momentum is None).
        exponential_average_factor = 0.0 if self.momentum is None else self.momentum

        if self.training and self.track_running_stats:
            if self.num_batches_tracked is not None:
                self.num_batches_tracked += 1
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)

        outputs_mean = F.batch_norm(
            inputs_mean, self.running_mean, self.running_var, self.weight, self.bias,
            self.training or not self.track_running_stats,
            exponential_average_factor, self.eps)

        # NOTE(review): only the affine scale is applied to the variance; the
        # 1/(var + eps) factor that F.batch_norm applies to the mean is not
        # applied here -- confirm this is intended.
        outputs_variance = inputs_variance
        if self.weight is not None:
            # weight is None when affine=False; unsqueezing it used to raise
            # AttributeError in that configuration.
            scale = self.weight.view(1, -1, 1, 1)
            outputs_variance = outputs_variance * scale ** 2

        if self._keep_variance_fn is not None:
            outputs_variance = self._keep_variance_fn(outputs_variance)
        return outputs_mean, outputs_variance
在这段实现中,只有均值经过 F.batch_norm 做了归一化;方差没有被归一化,只是逐通道乘以了 weight ** 2;
而本函数添加了两个可学习的参数:
在 BatchNorm2d 类的构造函数中,当 affine 参数为 True 时,会初始化 self.weight 和 self.bias 两个可学习参数。具体来说,这里的 self.weight 是一个形状为 (num_features,) 的一维张量,用于缩放归一化后的数据。而 self.bias 也是一个形状为 (num_features,) 的一维张量,用于平移归一化后的数据。
这两个参数在 forward() 方法中会被应用于输出张量上,从而进一步提高模型的表达能力和灵活性。在 forward() 方法中,self.weight 用于对归一化后的数据进行缩放,即将均值为 0、方差为 1 的数据缩放为均值为 0、标准差为 |self.weight|(方差为 self.weight ** 2)的数据。而 self.bias 则用于对归一化后的数据进行平移,即将缩放后的数据加上一个偏置项 self.bias,使其均值变为 self.bias,从而使得模型能够适应更加复杂和多样的数据分布。
需要注意的是,如果在构造函数中将 affine 参数设为 False,则不会初始化 self.weight 和 self.bias,也就不会应用缩放和平移操作。在这种情况下,BatchNorm2d 类实际上只是对输入的数据进行了均值和方差的归一化处理,而没有引入额外的可学习参数。
可以看到BasicBlock中使用了adf.ReLU():
class ReLU(nn.Module):
    """ReLU for a (mean, variance) pair.

    The output mean and variance are computed from ``normpdf`` and ``normcdf``
    evaluated at ``mean / stddev`` (both helpers are defined outside this
    block; presumably the standard normal pdf and cdf -- confirm).
    ``keep_variance_fn``, if given, is applied to the output variance.
    """

    def __init__(self, keep_variance_fn=None):
        super().__init__()
        self._keep_variance_fn = keep_variance_fn

    def forward(self, features_mean, features_variance):
        """Return ``(outputs_mean, outputs_variance)`` for the given pair."""
        stddev = torch.sqrt(features_variance)
        ratio = features_mean / stddev
        density = normpdf(ratio)
        cumulative = normcdf(ratio)

        outputs_mean = features_mean * cumulative + stddev * density
        # Variance is this second-moment term minus the squared output mean.
        second_moment = (features_mean ** 2 + features_variance) * cumulative \
            + features_mean * stddev * density
        outputs_variance = second_moment - outputs_mean ** 2

        if self._keep_variance_fn is None:
            return outputs_mean, outputs_variance
        return outputs_mean, self._keep_variance_fn(outputs_variance)
在 forward() 方法中,输入的参数包含两部分,即 features_mean 表示输入特征的均值,features_variance 表示输入特征的方差。在计算输出时,首先计算输入特征的标准差 features_stddev,然后计算出 cdf 和 pdf,这两个量分别表示标准正态分布的累积分布函数和概率密度函数。最后,根据 ReLU 函数的定义,对均值和方差分别进行处理,得到输出特征的均值 outputs_mean 和方差 outputs_variance。