1. nn.Module

  • 作用:所有神经网络模块的基类
  • 官网链接:pytorch 中 nn.Module类说明
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Two stacked 5x5 convolutions, each followed by a ReLU."""

    def __init__(self):
        super().__init__()
        # 1 input channel -> 20 feature maps, then 20 -> 20; both kernels are 5x5.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=5)

    def forward(self, x):
        """Return relu(conv2(relu(conv1(x))))."""
        hidden = F.relu(self.conv1(x))
        out = self.conv2(hidden)
        return F.relu(out)

2. add_module

  • 作用:将子模块增加到模型中,可以使用给定的名称作为属性访问模块
  • add_module Python 源码
    def add_module(self, name: str, module: Optional['Module']) -> None:
        r"""Adds a child module to the current module.

        The module can be accessed as an attribute using the given name.

        Args:
            name (string): name of the child module. The child module can be
                accessed from this module using the given name
            module (Module): child module to be added to the module.

        Raises:
            TypeError: if ``module`` is neither a ``Module`` nor ``None``, or
                if ``name`` is not a string.
            KeyError: if ``name`` clashes with an existing non-module
                attribute, contains a ``"."``, or is the empty string.
        """
        if not isinstance(module, Module) and module is not None:
            raise TypeError("{} is not a Module subclass".format(
                torch.typename(module)))
        elif not isinstance(name, torch._six.string_classes):
            raise TypeError("module name should be a string. Got {}".format(
                torch.typename(name)))
        elif hasattr(self, name) and name not in self._modules:
            # Re-registering an existing child is allowed; shadowing any other
            # attribute is not.
            raise KeyError("attribute '{}' already exists".format(name))
        elif '.' in name:
            raise KeyError("module name can't contain \".\", got: {}".format(name))
        elif name == '':
            raise KeyError("module name can't be empty string \"\"")
        self._modules[name] = module
  • 实例代码:我们在以前的神经网络中通过add_module方法新增一个新的全连接层
import torch
from torch import nn
from torchsummary import summary

# 以前存在的网络My_Model
class My_Model(nn.Module):
	"""Fully connected network: flatten the input, then a Linear/ReLU stack ending in 10 outputs."""

	def __init__(self):
		super(My_Model, self).__init__()
		self.flatten = nn.Flatten()
		# ReLU between the Linear layers: without a non-linearity the three
		# Linear layers compose into a single affine map.
		self.linear_relu_stack = nn.Sequential(
			nn.Linear(28 * 28, 512),
			nn.ReLU(),
			nn.Linear(512, 512),
			nn.ReLU(),
			nn.Linear(512, 10))

	def forward(self, X):
		"""Flatten ``X`` and return the output of ``linear_relu_stack``."""
		X = self.flatten(X)
		logits = self.linear_relu_stack(X)
		return logits

# 需要增加的全连接层new_one_net
new_one_net = nn.Linear(10, 5)

# 定义一个函数,将新的网络加到序列中
def new_net():
	"""Return a Sequential with a fresh My_Model as child "0" and new_one_net as child "sunwukong"."""
	stacked = nn.Sequential()
	# Sequential names positional children by index, so the first one is "0".
	stacked.add_module("0", My_Model())
	stacked.add_module("sunwukong", new_one_net)
	return stacked

# Input x: a random tensor of shape (3, 28, 28)
x = torch.randn(3, 28, 28)

# Build the extended network (after add_module)
after_add_module_net = new_net()
# Print the module tree so the appended "sunwukong" layer is visible
print(after_add_module_net)

# Forward pass
y = after_add_module_net(x)
# Check the output: y.shape == torch.Size([3, 5]) means the new module was added
print(f"y.shape={y.shape}")
summary(after_add_module_net, input_data=x)
  • 输出结果
  (0): My_Model(
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (linear_relu_stack): Sequential(
      (0): Linear(in_features=784, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=10, bias=True)
  (sunwukong): Linear(in_features=10, out_features=5, bias=True)
y.shape=torch.Size([3, 5])
Layer (type:depth-idx)                   Output Shape              Param #
├─My_Model: 1-1                          [-1, 10]                  --
|    └─Flatten: 2-1                      [-1, 784]                 --
|    └─Sequential: 2-2                   [-1, 10]                  --
|    |    └─Linear: 3-1                  [-1, 512]                 401,920
|    |    └─ReLU: 3-2                    [-1, 512]                 --
|    |    └─Linear: 3-3                  [-1, 512]                 262,656
|    |    └─ReLU: 3-4                    [-1, 512]                 --
|    |    └─Linear: 3-5                  [-1, 10]                  5,130
├─Linear: 1-2                            [-1, 5]                   55
Total params: 669,761
Trainable params: 669,761
Non-trainable params: 0
Total mult-adds (M): 2.01
Input size (MB): 0.01
Forward/backward pass size (MB): 0.01
Params size (MB): 2.55
Estimated Total Size (MB): 2.57

3. apply


import torch
from torch import nn

# Network made of two fully connected layers
net = nn.Sequential(nn.Linear(4, 2), nn.Linear(2, 5))


# Disable gradient tracking: an in-place fill on a leaf tensor that requires
# grad is only allowed under no_grad.
@torch.no_grad()
def init_weight(m):
	"""Set every weight of an nn.Linear module to 1.0 and print the module and its weight."""
	if isinstance(m, nn.Linear):
		m.weight.fill_(1.0)
		print(f"m={m}")
		print(f"m.weight={m.weight}")


# apply() calls init_weight on every submodule and on net itself
net.apply(init_weight)
print(net)

  • 结果
m=Linear(in_features=4, out_features=2, bias=True)
m.weight=Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)
m=Linear(in_features=2, out_features=5, bias=True)
m.weight=Parameter containing:
tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)
  (0): Linear(in_features=4, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=5, bias=True)

4. bfloat16


5. buffers


  • parameters: 一种是反向传播需要optimizer更新的参数
  • buffers: 一种是反向传播不需要optimizer更新的参数

5.1 buffers & parameters 创建和注入


# parameters查看方式
# buffers查看方式
  • 注:因为我们的模型保存的是 state_dict 返回的 OrderedDict,所以这两种参数不仅要满足是否需要被更新的要求,还需要被保存到 OrderedDict;所以我们需要满足以下两步

5.2 代码

(1)通过self.param = nn.Parameter(torch.randn(3, 6))这样创建的parameters(有self.情况下)会自动注册到模型的parameter中,可以通过model.parameters()进行查询,并且这样创建的parameter自动保存到OrderedDict中

import torch
from torch import nn

class BufferModule(nn.Module):
	"""Demo module holding one registered buffer and one nn.Parameter."""

	def __init__(self):
		super(BufferModule, self).__init__()
		buffer = torch.randn(2, 3)
		# Register the tensor as a buffer under the name 'my_buffer'
		self.register_buffer('my_buffer', buffer)
		# First approach: assigning an nn.Parameter to an attribute of the
		# module registers it automatically, so it is returned by
		# model.parameters() and stored in the state_dict OrderedDict.
		self.param = nn.Parameter(torch.randn(3, 6))

	def forward(self, X):
		# The demo only inspects parameters and buffers; the original body is
		# missing from the text, so the input is passed through unchanged.
		return X

model = BufferModule()
# Parameters registered on the module
for param in model.parameters():
	print(f"param={param}")
print("*" * 50)
# Buffers registered on the module
for buffer in model.buffers():
	print(f"buffer={buffer}")

print("*" * 50)
# state_dict() contains both the parameter and the buffer
print(f"model.state_dict()={model.state_dict()}")
param=Parameter containing:
tensor([[-0.9245,  0.0886, -0.4979,  0.9047, -0.8856, -0.2012],
        [-0.7658, -0.6367, -0.4953,  0.4815,  0.6067, -0.1094],
        [-1.1385, -0.6048,  0.1718,  0.0956,  1.2832,  0.2685]],
buffer=tensor([[0.6095, 1.8804, 0.7442],
        [0.3063, 1.3889, 0.7409]])
model.state_dict()=OrderedDict([('param', tensor([[-0.9245,  0.0886, -0.4979,  0.9047, -0.8856, -0.2012],
        [-0.7658, -0.6367, -0.4953,  0.4815,  0.6067, -0.1094],
        [-1.1385, -0.6048,  0.1718,  0.0956,  1.2832,  0.2685]])), ('my_buffer', tensor([[0.6095, 1.8804, 0.7442],
        [0.3063, 1.3889, 0.7409]]))])

(2)通过param = nn.Parameter(torch.randn(3, 3))创建后再通过self.register_parameter('my_param', param)注入到模型中,通过这两个步骤后就可以实现模型参数的保存到OrderedDict中

import torch 
from torch import nn

class MyModel(nn.Module):
	"""Demo module contrasting registered and unregistered buffers/parameters."""

	def __init__(self):
		super(MyModel, self).__init__()
		# Plain tensor, registered as a buffer below
		buffer = torch.randn(2, 4)
		# Parameter, registered on the module below
		param = nn.Parameter(torch.randn(3, 3))
		# Plain tensor kept as a local for contrast: never registered
		buffer_none = torch.randn(2, 4)
		# Parameter kept as a local for contrast: never registered
		param_none = nn.Parameter(torch.randn(3, 3))
		self.register_buffer('my_buffer', buffer)
		self.register_parameter('my_param', param)

	def forward(self, X):
		# The demo only inspects parameters and buffers; the original body is
		# missing from the text, so the input is passed through unchanged.
		return X

model = MyModel()
# Only the registered parameter ('my_param') is listed
for param in model.parameters():
	print(param)

# Only the registered buffer ('my_buffer') is listed
for buffer in model.buffers():
	print(buffer)

print(model.state_dict())
  • 将my_param和my_buffer注入到模型中,buffer_none和param_none 因为没有用self.register_buffer和self.register_parameter, 所以没有注入到模型中
Parameter containing:
tensor([[ 2.4219,  0.6770, -0.8113],
        [-1.1523,  1.1072, -0.0615],
        [ 0.5957,  0.0584,  1.3951]], requires_grad=True)
tensor([[ 1.5272, -0.0775,  1.8428, -0.3205],
        [ 0.4810,  0.8681,  0.4728,  1.2447]])
OrderedDict([('my_param', tensor([[ 2.4219,  0.6770, -0.8113],
        [-1.1523,  1.1072, -0.0615],
        [ 0.5957,  0.0584,  1.3951]])), ('my_buffer', tensor([[ 1.5272, -0.0775,  1.8428, -0.3205],
        [ 0.4810,  0.8681,  0.4728,  1.2447]]))])

5.3 register_buffer & register_parameter

    def register_buffer(self, name: str, tensor: Optional[Tensor], persistent: bool = True) -> None:
        r"""Adds a buffer to the module.

        This is typically used to register a buffer that should not to be
        considered a model parameter. For example, BatchNorm's ``running_mean``
        is not a parameter, but is part of the module's state. Buffers, by
        default, are persistent and will be saved alongside parameters. This
        behavior can be changed by setting :attr:`persistent` to ``False``. The
        only difference between a persistent buffer and a non-persistent buffer
        is that the latter will not be a part of this module's
        :attr:`state_dict`.

        Buffers can be accessed as attributes using given names.

        Args:
            name (string): name of the buffer. The buffer can be accessed
                from this module using the given name
            tensor (Tensor): buffer to be registered.
            persistent (bool): whether the buffer is part of this module's
                :attr:`state_dict`.

        Example::

            >>> self.register_buffer('running_mean', torch.zeros(num_features))

        """
        if persistent is False and isinstance(self, torch.jit.ScriptModule):
            raise RuntimeError("ScriptModule does not support non-persistent buffers")

        if '_buffers' not in self.__dict__:
            raise AttributeError(
                "cannot assign buffer before Module.__init__() call")
        elif not isinstance(name, torch._six.string_classes):
            raise TypeError("buffer name should be a string. "
                            "Got {}".format(torch.typename(name)))
        elif '.' in name:
            raise KeyError("buffer name can't contain \".\"")
        elif name == '':
            raise KeyError("buffer name can't be empty string \"\"")
        elif hasattr(self, name) and name not in self._buffers:
            raise KeyError("attribute '{}' already exists".format(name))
        elif tensor is not None and not isinstance(tensor, torch.Tensor):
            raise TypeError("cannot assign '{}' object to buffer '{}' "
                            "(torch Tensor or None required)"
                            .format(torch.typename(tensor), name))
        else:
            # All checks passed: store the buffer and record whether it is
            # persistent.
            self._buffers[name] = tensor
            if persistent:
                self._non_persistent_buffers_set.discard(name)
            else:
                self._non_persistent_buffers_set.add(name)

5.4 小结

  • 模型中需要更新的参数为parameter,不需要更新的参数为buffer

6. cuda,xpu,cpu


    def cuda(self: T, device: Optional[Union[int, device]] = None) -> T:
        r"""Moves all model parameters and buffers to the GPU.

        This also makes associated parameters and buffers different objects. So
        it should be called before constructing optimizer if the module will
        live on GPU while being optimized.

        .. note::
            This method modifies the module in-place.

        Args:
            device (int, optional): if specified, all parameters will be
                copied to that device

        Returns:
            Module: self
        """
        return self._apply(lambda t: t.cuda(device))

7. get_parameter


    def get_parameter(self, target: str) -> "Parameter":
        """
        Returns the parameter given by ``target`` if it exists,
        otherwise throws an error.

        See the docstring for ``get_submodule`` for a more detailed
        explanation of this method's functionality as well as how to
        correctly specify ``target``.

        Args:
            target: The fully-qualified string name of the Parameter
                to look for. (See ``get_submodule`` for how to specify a
                fully-qualified string.)

        Returns:
            torch.nn.Parameter: The Parameter referenced by ``target``

        Raises:
            AttributeError: If the target string references an invalid
                path or resolves to something that is not an
                ``nn.Parameter``
        """
        # Split target at the last ".": the left part is module_path, the
        # right part is param_name.
        module_path, _, param_name = target.rpartition(".")
        # Resolve module_path to the module that should own the parameter.
        mod: torch.nn.Module = self.get_submodule(module_path)
        # Fail if that module has no attribute called param_name.
        if not hasattr(mod, param_name):
            raise AttributeError(mod._get_name() + " has no attribute `"
                                 + param_name + "`")
        # Fetch the attribute by name.
        param: torch.nn.Parameter = getattr(mod, param_name)
        # Fail if the attribute is not a torch.nn.Parameter.
        if not isinstance(param, torch.nn.Parameter):
            raise AttributeError("`" + param_name + "` is not an "
                                 "nn.Parameter")
        # Return the parameter.
        return param

8. _apply

_apply 函数作用三个地方:

  • module
  • parameters
  • buffers
    def _apply(self, fn):
        """Apply ``fn`` to the parameters, their gradients and the buffers of
        this module and, recursively, of all its children; return ``self``."""
        # Recurse into child modules first.
        for module in self.children():
            module._apply(fn)

        def compute_should_use_set_data(tensor, tensor_applied):
            if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
                # If the new tensor has compatible tensor type as the existing tensor,
                # the current behavior is to change the tensor in-place using `.data =`,
                # and the future behavior is to overwrite the existing tensor. However,
                # changing the current behavior is a BC-breaking change, and we want it
                # to happen in future releases. So for now we introduce the
                # `torch.__future__.get_overwrite_module_params_on_conversion()`
                # global flag to let the user control whether they want the future
                # behavior of overwriting the existing tensor or not.
                return not torch.__future__.get_overwrite_module_params_on_conversion()
            else:
                return False

        for key, param in self._parameters.items():
            if param is not None:
                # Tensors stored in modules are graph leaves, and we don't want to
                # track autograd history of `param_applied`, so we have to use
                # `with torch.no_grad():`
                with torch.no_grad():
                    param_applied = fn(param)
                should_use_set_data = compute_should_use_set_data(param, param_applied)
                if should_use_set_data:
                    # Keep the same Parameter object; swap only its data.
                    param.data = param_applied
                else:
                    # Replace the entry with a new Parameter wrapping the result.
                    assert isinstance(param, Parameter)
                    assert param.is_leaf
                    self._parameters[key] = Parameter(param_applied, param.requires_grad)

                if param.grad is not None:
                    with torch.no_grad():
                        grad_applied = fn(param.grad)
                    should_use_set_data = compute_should_use_set_data(param.grad, grad_applied)
                    if should_use_set_data:
                        param.grad.data = grad_applied
                    else:
                        assert param.grad.is_leaf
                        self._parameters[key].grad = grad_applied.requires_grad_(param.grad.requires_grad)

        # Buffers are simply replaced by the result of fn.
        for key, buf in self._buffers.items():
            if buf is not None:
                self._buffers[key] = fn(buf)

        return self

9. to_empty


    def to_empty(self: T, *, device: Union[str, device]) -> T:
        r"""Moves the parameters and buffers to the specified device without copying storage.

        Args:
            device (:class:`torch.device`): The desired device of the parameters
                and buffers in this module.

        Returns:
            Module: self
        """
        # empty_like allocates uninitialized tensors on the target device, so
        # no values are copied.
        return self._apply(lambda t: torch.empty_like(t, device=device))
