import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))
add_module
PyTorch source code:

def add_module(self, name: str, module: Optional['Module']) -> None:
    r"""Adds a child module to the current module.

    The module can be accessed as an attribute using the given name.

    Args:
        name (string): name of the child module. The child module can be
            accessed from this module using the given name
        module (Module): child module to be added to the module.
    """
    if not isinstance(module, Module) and module is not None:
        raise TypeError("{} is not a Module subclass".format(
            torch.typename(module)))
    elif not isinstance(name, torch._six.string_classes):
        raise TypeError("module name should be a string. Got {}".format(
            torch.typename(name)))
    elif hasattr(self, name) and name not in self._modules:
        raise KeyError("attribute '{}' already exists".format(name))
    elif '.' in name:
        raise KeyError("module name can't contain \".\", got: {}".format(name))
    elif name == '':
        raise KeyError("module name can't be empty string \"\"")
    self._modules[name] = module
import torch
from torch import nn
from torchsummary import summary

# the pre-existing network My_Model
class My_Model(nn.Module):
    def __init__(self):
        super(My_Model, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10))

    def forward(self, X):
        X = self.flatten(X)
        logits = self.linear_relu_stack(X)
        return logits

# the fully connected layer to be added, new_one_net
new_one_net = nn.Linear(10, 5)

# define a function that appends the new layer to the sequence
def new_net():
    net = nn.Sequential(My_Model())
    net.add_module("sunwukong", new_one_net)
    return net

# define the input x
x = torch.randn(3, 28, 28)

# build the new network (after add_module)
after_add_module_net = new_net()
print(f"after_add_module_net={after_add_module_net}")

# run a forward pass
y = after_add_module_net(x)

# check the output: y.shape == torch.Size([3, 5]) means the new module was added successfully
print(f"y.shape={y.shape}")

summary(after_add_module_net, input_data=x)
after_add_module_net=Sequential(
  (0): My_Model(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear_relu_stack): Sequential(
      (0): Linear(in_features=784, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=10, bias=True)
    )
  )
  (sunwukong): Linear(in_features=10, out_features=5, bias=True)
)
y.shape=torch.Size([3, 5])
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─My_Model: 1-1                          [-1, 10]                  --
|    └─Flatten: 2-1                      [-1, 784]                 --
|    └─Sequential: 2-2                   [-1, 10]                  --
|    |    └─Linear: 3-1                  [-1, 512]                 401,920
|    |    └─ReLU: 3-2                    [-1, 512]                 --
|    |    └─Linear: 3-3                  [-1, 512]                 262,656
|    |    └─ReLU: 3-4                    [-1, 512]                 --
|    |    └─Linear: 3-5                  [-1, 10]                  5,130
├─Linear: 1-2                            [-1, 5]                   55
==========================================================================================
Total params: 669,761
Trainable params: 669,761
Non-trainable params: 0
Total mult-adds (M): 2.01
==========================================================================================
Input size (MB): 0.01
Forward/backward pass size (MB): 0.01
Params size (MB): 2.55
Estimated Total Size (MB): 2.57
==========================================================================================
apply
Applies fn recursively to every submodule (as returned by .children()) as well as to self. Typical uses include initializing the parameters of a model (see torch.nn.init). It is mainly used for network initialization.
import torch
from torch import nn

# define a small fully connected network
net = nn.Sequential(nn.Linear(4, 2), nn.Linear(2, 5))

# decorator that disables gradient tracking
@torch.no_grad()
def init_weight(m):
    print(f"m={m}")
    if type(m) == nn.Linear:
        m.weight.fill_(1)
        print(f"m.weight={m.weight}")

net.apply(init_weight)
m=Linear(in_features=4, out_features=2, bias=True)
m.weight=Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)
m=Linear(in_features=2, out_features=5, bias=True)
m.weight=Parameter containing:
tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)
m=Sequential(
  (0): Linear(in_features=4, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=5, bias=True)
)
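The description above points to torch.nn.init for typical use, so here is a minimal companion sketch (not from the original post; init_xavier is an illustrative helper name) that uses the built-in initializers instead of a hand-written fill:

import torch
from torch import nn

def init_xavier(m):
    # only touch Linear layers; other module types are left unchanged
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)  # in-place initializer from torch.nn.init
        nn.init.zeros_(m.bias)

net = nn.Sequential(nn.Linear(4, 2), nn.Linear(2, 5))
net.apply(init_xavier)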
bfloat16
Casts the module's floating-point parameters and buffers from their original floating-point type to bfloat16.
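A minimal sketch of the effect (assuming a PyTorch build with CPU bfloat16 tensor support):

import torch
from torch import nn

net = nn.Linear(4, 2)
print(net.weight.dtype)  # torch.float32
net.bfloat16()           # converts floating-point parameters and buffers in place, returns self
print(net.weight.dtype)  # torch.bfloat16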
buffers
A neural network has two important kinds of state: parameters and buffers.
parameters: state that needs to be updated by the optimizer during backpropagation.
buffers: state that does not need to be updated by the optimizer during backpropagation.
Both can be inspected on a model as follows:
# view the parameters
model.parameters()
# view the buffers
model.buffers()
(1) A parameter created via self.param = nn.Parameter(torch.randn(3, 6)) (i.e. assigned as an attribute of self) is automatically registered with the model's parameters; it can be queried through model.parameters() and is automatically saved into the OrderedDict (state_dict).
import torch
from torch import nn

class BufferModule(nn.Module):
    def __init__(self):
        super(BufferModule, self).__init__()
        buffer = torch.randn(2, 3)
        self.register_buffer('my_buffer', buffer)
        # First way: create with nn.Parameter as a member variable of the model.
        # It is registered in parameters automatically, is returned by
        # model.parameters(), and is saved into the OrderedDict automatically.
        self.param = nn.Parameter(torch.randn(3, 6))

    def forward(self, X):
        pass

model = BufferModule()
for param in model.parameters():
    print(f"param={param}")
print("*" * 50)
for buffer in model.buffers():
    print(f"buffer={buffer}")
print("*" * 50)
print(f"model.state_dict()={model.state_dict()}")
param=Parameter containing:
tensor([[-0.9245, 0.0886, -0.4979, 0.9047, -0.8856, -0.2012],
[-0.7658, -0.6367, -0.4953, 0.4815, 0.6067, -0.1094],
[-1.1385, -0.6048, 0.1718, 0.0956, 1.2832, 0.2685]],
requires_grad=True)
**************************************************
buffer=tensor([[0.6095, 1.8804, 0.7442],
[0.3063, 1.3889, 0.7409]])
**************************************************
model.state_dict()=OrderedDict([('param', tensor([[-0.9245, 0.0886, -0.4979, 0.9047, -0.8856, -0.2012],
[-0.7658, -0.6367, -0.4953, 0.4815, 0.6067, -0.1094],
[-1.1385, -0.6048, 0.1718, 0.0956, 1.2832, 0.2685]])), ('my_buffer', tensor([[0.6095, 1.8804, 0.7442],
[0.3063, 1.3889, 0.7409]]))])
(2) Alternatively, create the parameter with param = nn.Parameter(torch.randn(3, 3)) and then register it on the model with self.register_parameter('my_param', param). After these two steps the parameter is likewise saved into the OrderedDict.
import torch
from torch import nn

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # a plain tensor that will be registered as a buffer below
        buffer = torch.randn(2, 4)
        # a plain nn.Parameter that will be registered below
        param = nn.Parameter(torch.randn(3, 3))
        # a tensor that is NOT registered, for comparison
        buffer_none = torch.randn(2, 4)
        # an nn.Parameter that is NOT registered, for comparison
        param_none = nn.Parameter(torch.randn(3, 3))
        self.register_buffer('my_buffer', buffer)
        self.register_parameter('my_param', param)

    def forward(self, X):
        pass

model = MyModel()
for param in model.parameters():
    print(param)
print("*" * 10)
for buffer in model.buffers():
    print(buffer)
print("*" * 10)
print(model.state_dict())
Parameter containing:
tensor([[ 2.4219, 0.6770, -0.8113],
[-1.1523, 1.1072, -0.0615],
[ 0.5957, 0.0584, 1.3951]], requires_grad=True)
**********
tensor([[ 1.5272, -0.0775, 1.8428, -0.3205],
[ 0.4810, 0.8681, 0.4728, 1.2447]])
**********
OrderedDict([('my_param', tensor([[ 2.4219, 0.6770, -0.8113],
[-1.1523, 1.1072, -0.0615],
[ 0.5957, 0.0584, 1.3951]])), ('my_buffer', tensor([[ 1.5272, -0.0775, 1.8428, -0.3205],
[ 0.4810, 0.8681, 0.4728, 1.2447]]))])
register_buffer
PyTorch source code:

def register_buffer(self, name: str, tensor: Optional[Tensor], persistent: bool = True) -> None:
    r"""Adds a buffer to the module.

    This is typically used to register a buffer that should not to be
    considered a model parameter. For example, BatchNorm's ``running_mean``
    is not a parameter, but is part of the module's state. Buffers, by
    default, are persistent and will be saved alongside parameters. This
    behavior can be changed by setting :attr:`persistent` to ``False``. The
    only difference between a persistent buffer and a non-persistent buffer
    is that the latter will not be a part of this module's
    :attr:`state_dict`.

    Buffers can be accessed as attributes using given names.

    Args:
        name (string): name of the buffer. The buffer can be accessed
            from this module using the given name
        tensor (Tensor): buffer to be registered.
        persistent (bool): whether the buffer is part of this module's
            :attr:`state_dict`.

    Example::

        >>> self.register_buffer('running_mean', torch.zeros(num_features))
    """
    if persistent is False and isinstance(self, torch.jit.ScriptModule):
        raise RuntimeError("ScriptModule does not support non-persistent buffers")

    if '_buffers' not in self.__dict__:
        raise AttributeError(
            "cannot assign buffer before Module.__init__() call")
    elif not isinstance(name, torch._six.string_classes):
        raise TypeError("buffer name should be a string. "
                        "Got {}".format(torch.typename(name)))
    elif '.' in name:
        raise KeyError("buffer name can't contain \".\"")
    elif name == '':
        raise KeyError("buffer name can't be empty string \"\"")
    elif hasattr(self, name) and name not in self._buffers:
        raise KeyError("attribute '{}' already exists".format(name))
    elif tensor is not None and not isinstance(tensor, torch.Tensor):
        raise TypeError("cannot assign '{}' object to buffer '{}' "
                        "(torch Tensor or None required)"
                        .format(torch.typename(tensor), name))
    else:
        self._buffers[name] = tensor
        if persistent:
            self._non_persistent_buffers_set.discard(name)
        else:
            self._non_persistent_buffers_set.add(name)
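As the docstring notes, a non-persistent buffer behaves like any other buffer at runtime but is excluded from state_dict. A minimal sketch (the Demo class and its buffer names are illustrative, not from the original post):

import torch
from torch import nn

class Demo(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('kept', torch.zeros(3))                      # persistent (default)
        self.register_buffer('dropped', torch.ones(3), persistent=False)  # non-persistent

m = Demo()
print([name for name, _ in m.named_buffers()])  # ['kept', 'dropped']
print(list(m.state_dict().keys()))              # ['kept'] -- the non-persistent buffer is not saved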
In short: values that need to be updated are parameters, and values that do not need to be updated are buffers.
cuda
Purpose: moves the model's data onto the GPU. Under the hood it simply passes a lambda to _apply, which runs it over every module in the model.
def cuda(self: T, device: Optional[Union[int, device]] = None) -> T:
    r"""Moves all model parameters and buffers to the GPU.

    This also makes associated parameters and buffers different objects. So
    it should be called before constructing optimizer if the module will
    live on GPU while being optimized.

    .. note::
        This method modifies the module in-place.

    Args:
        device (int, optional): if specified, all parameters will be
            copied to that device

    Returns:
        Module: self
    """
    return self._apply(lambda t: t.cuda(device))
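A minimal usage sketch; per the docstring, call .cuda() before constructing the optimizer so the optimizer holds references to the GPU copies of the parameters (the torch.cuda.is_available() guard is only there to keep the snippet runnable without a GPU):

import torch
from torch import nn

net = nn.Linear(4, 2)
if torch.cuda.is_available():
    net.cuda()  # in-place move; returns self
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)  # built after the move
print(next(net.parameters()).device)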
get_parameter
Looks up the parameter referenced by the given string target and returns it.
def get_parameter(self, target: str) -> "Parameter":
    """
    Returns the parameter given by ``target`` if it exists,
    otherwise throws an error.

    See the docstring for ``get_submodule`` for a more detailed
    explanation of this method's functionality as well as how to
    correctly specify ``target``.

    Args:
        target: The fully-qualified string name of the Parameter
            to look for. (See ``get_submodule`` for how to specify a
            fully-qualified string.)

    Returns:
        torch.nn.Parameter: The Parameter referenced by ``target``

    Raises:
        AttributeError: If the target string references an invalid
            path or resolves to something that is not an
            ``nn.Parameter``
    """
    # split on the last ".", giving module_path on the left and param_name on the right
    module_path, _, param_name = target.rpartition(".")

    # resolve module_path to a submodule and store it in mod
    mod: torch.nn.Module = self.get_submodule(module_path)

    # raise if mod has no attribute named param_name
    if not hasattr(mod, param_name):
        raise AttributeError(mod._get_name() + " has no attribute `"
                             + param_name + "`")

    # fetch the attribute named param_name
    param: torch.nn.Parameter = getattr(mod, param_name)

    # raise if the attribute is not a torch.nn.Parameter
    if not isinstance(param, torch.nn.Parameter):
        raise AttributeError("`" + param_name + "` is not an "
                             "nn.Parameter")

    # return the parameter
    return param
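A minimal sketch of looking up a parameter by its fully-qualified name (the Sequential below is illustrative; children of a Sequential are named by their index, so '0.weight' refers to the weight of the first Linear):

import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 2), nn.ReLU(), nn.Linear(2, 5))
w = net.get_parameter('0.weight')  # "<submodule path>.<parameter name>"
print(type(w), w.shape)            # <class 'torch.nn.parameter.Parameter'> torch.Size([2, 4])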
_apply
The _apply function touches three places: the child modules, the parameters (and their gradients), and the buffers:
def _apply(self, fn):
    for module in self.children():
        module._apply(fn)

    def compute_should_use_set_data(tensor, tensor_applied):
        if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
            # If the new tensor has compatible tensor type as the existing tensor,
            # the current behavior is to change the tensor in-place using `.data =`,
            # and the future behavior is to overwrite the existing tensor. However,
            # changing the current behavior is a BC-breaking change, and we want it
            # to happen in future releases. So for now we introduce the
            # `torch.__future__.get_overwrite_module_params_on_conversion()`
            # global flag to let the user control whether they want the future
            # behavior of overwriting the existing tensor or not.
            return not torch.__future__.get_overwrite_module_params_on_conversion()
        else:
            return False

    for key, param in self._parameters.items():
        if param is not None:
            # Tensors stored in modules are graph leaves, and we don't want to
            # track autograd history of `param_applied`, so we have to use
            # `with torch.no_grad():`
            with torch.no_grad():
                param_applied = fn(param)
            should_use_set_data = compute_should_use_set_data(param, param_applied)
            if should_use_set_data:
                param.data = param_applied
            else:
                assert isinstance(param, Parameter)
                assert param.is_leaf
                self._parameters[key] = Parameter(param_applied, param.requires_grad)

            if param.grad is not None:
                with torch.no_grad():
                    grad_applied = fn(param.grad)
                should_use_set_data = compute_should_use_set_data(param.grad, grad_applied)
                if should_use_set_data:
                    param.grad.data = grad_applied
                else:
                    assert param.grad.is_leaf
                    self._parameters[key].grad = grad_applied.requires_grad_(param.grad.requires_grad)

    for key, buf in self._buffers.items():
        if buf is not None:
            self._buffers[key] = fn(buf)

    return self
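Because _apply walks child modules, parameters (and their gradients), and buffers alike, the public conversion helpers built on top of it (.float(), .double(), .cuda(), ...) convert all three. A minimal sketch (the WithBuffer class is illustrative, not from the original post):

import torch
from torch import nn

class WithBuffer(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(3, 3)                    # child module with parameters
        self.register_buffer('stat', torch.zeros(3))  # buffer

m = WithBuffer()
m.double()  # routed through _apply
print(m.lin.weight.dtype, m.stat.dtype)  # torch.float64 torch.float64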
to_empty
Moves the parameters and buffers to the specified device without copying storage. In short, the parameters and buffers end up on the specified device but their values are not preserved, because the implementation uses torch.empty_like.
def to_empty(self: T, *, device: Union[str, device]) -> T:
    r"""Moves the parameters and buffers to the specified device without copying storage.

    Args:
        device (:class:`torch.device`): The desired device of the parameters
            and buffers in this module.

    Returns:
        Module: self
    """
    return self._apply(lambda t: torch.empty_like(t, device=device))
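A minimal sketch of the typical use, materializing a module that was first built on the meta device (assumes a PyTorch version that has meta-device support and the device= factory kwarg on nn.Linear, roughly 1.9+):

import torch
from torch import nn

net = nn.Linear(4, 2, device='meta')  # parameters carry only shape/dtype, no storage
net = net.to_empty(device='cpu')      # allocate real, uninitialized storage on the CPU
print(net.weight.device, net.weight.shape)  # cpu torch.Size([2, 4])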