torch.nn.init.calculate_gain(nonlinearity, param=None)
Returns the recommended gain value for the given non-linear function. The gain is a scaling factor that variance-scaling initializers (such as xavier_* and kaiming_*) multiply into the standard deviation of the initial weights, chosen so that the variance of the activations is roughly preserved after passing through that nonlinearity.
Parameters
- nonlinearity ---- the name of the non-linear function. Accepted values: 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d', 'sigmoid', 'tanh', 'relu', 'leaky_relu'
- param ---- optional parameter for the non-linear function (for 'leaky_relu', its negative slope)
The returned gain values are:
'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d', 'sigmoid': 1
'tanh': $\frac{5}{3}$ (1.6666666666666667)
'relu': $\sqrt{2}$ (1.4142135623730951)
'leaky_relu': $\sqrt{\frac{2}{1+param^2}}$; with param=0.2 this is 1.3867504905630728
from torch import nn
gain = nn.init.calculate_gain('linear') # 1 (Linear / identity function)
print("linear gain", gain)
gain = nn.init.calculate_gain('conv1d') # 1
print("conv1d gain", gain)
gain = nn.init.calculate_gain('conv2d') # 1
print("conv2d gain", gain)
gain = nn.init.calculate_gain('conv3d') # 1
print("conv3d gain", gain)
gain = nn.init.calculate_gain('sigmoid') # 1
print("sigmoid gain", gain)
gain = nn.init.calculate_gain('tanh') # 1.6666666666666667
print("tanh gain", gain)
gain = nn.init.calculate_gain('relu') # 1.4142135623730951
print("relu gain", gain)
gain = nn.init.calculate_gain('leaky_relu', param=0) # 1.4142135623730951
print("leaky_relu 0 gain", gain)
gain = nn.init.calculate_gain('leaky_relu', param=0.2) # 1.3867504905630728
print("leaky_relu 0.2 gain", gain)
gain = nn.init.calculate_gain('leaky_relu', param=1) # 1
print("leaky_relu 1 gain", gain)
gain = nn.init.calculate_gain('conv_transpose1d') # 1
print("conv_transpose1d gain", gain)
# conv_transpose1d conv_transpose2d conv_transpose3d 1
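As a quick sanity check (not from the official docs), the tanh, relu and leaky_relu gains above can be reproduced by hand:
import math
print(5.0 / 3.0)                        # 1.6666666666666667 (tanh)
print(math.sqrt(2.0))                   # 1.4142135623730951 (relu)
print(math.sqrt(2.0 / (1 + 0.2 ** 2)))  # 1.3867504905630728 (leaky_relu, param=0.2)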
torch.nn.init.constant_(tensor, val)
Fills the input Tensor with the value val.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- val ---- the value to fill the tensor with
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[4.0153e-12, 4.5565e-41, 4.0086e-12, 4.5565e-41, 4.0086e-12],
# [4.5565e-41, 4.0087e-12, 4.5565e-41, 4.0087e-12, 4.5565e-41],
# [4.0019e-12, 4.5565e-41, 4.0087e-12, 4.5565e-41, 4.0088e-12]])
nn.init.constant_(w, 0.3)
print(w)
# tensor([[0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
# [0.3000, 0.3000, 0.3000, 0.3000, 0.3000],
# [0.3000, 0.3000, 0.3000, 0.3000, 0.3000]])
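A common usage pattern (a minimal sketch, not from the original text) is filling a layer's bias with a constant:
import torch
from torch import nn
layer = nn.Linear(5, 3)
nn.init.constant_(layer.bias, 0.1)
print(layer.bias)
# Parameter containing:
# tensor([0.1000, 0.1000, 0.1000], requires_grad=True)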
torch.nn.init.ones_(tensor)
Fills the input Tensor with the scalar value 1.
Parameters
- tensor ---- an n-dimensional torch.Tensor
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[9.7808e+36, 4.5660e-41, 9.7710e+36, 4.5660e-41, 9.7710e+36],
# [4.5660e-41, 9.7711e+36, 4.5660e-41, 9.7711e+36, 4.5660e-41],
# [9.7612e+36, 4.5660e-41, 9.7712e+36, 4.5660e-41, 9.7712e+36]])
nn.init.ones_(w)
print(w)
# tensor([[1., 1., 1., 1., 1.],
# [1., 1., 1., 1., 1.],
# [1., 1., 1., 1., 1.]])
torch.nn.init.zeros_(tensor)
Fills the input Tensor with the scalar value 0.
Parameters
- tensor ---- an n-dimensional torch.Tensor
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[1.3348e-21, 4.5574e-41, 1.3333e-21, 4.5574e-41, 1.3333e-21],
# [4.5574e-41, 1.3333e-21, 4.5574e-41, 1.3333e-21, 4.5574e-41],
# [1.3317e-21, 4.5574e-41, 1.3333e-21, 4.5574e-41, 1.3333e-21]])
nn.init.zeros_(w)
print(w)
# tensor([[0., 0., 0., 0., 0.],
# [0., 0., 0., 0., 0.],
# [0., 0., 0., 0., 0.]])
torch.nn.init.eye_(tensor)
Fills the 2-dimensional input Tensor with the identity matrix. This preserves the identity of the inputs in a Linear layer, where as many inputs as possible are passed through unchanged.
Parameters
- tensor ---- a 2-dimensional torch.Tensor
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[7.3228e-06, 4.5688e-41, 7.3158e-06, 4.5688e-41, 7.3158e-06],
# [4.5688e-41, 7.3159e-06, 4.5688e-41, 7.3159e-06, 4.5688e-41],
# [7.3088e-06, 4.5688e-41, 7.3159e-06, 4.5688e-41, 7.3160e-06]])
nn.init.eye_(w)
print(w)
# tensor([[1., 0., 0., 0., 0.],
# [0., 1., 0., 0., 0.],
# [0., 0., 1., 0., 0.]])
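A minimal sketch (assumed usage, not from the original text) of why eye_ "preserves the identity of the inputs" in a Linear layer: with an identity-initialized 3x5 weight, the layer simply passes through the first 3 input features.
import torch
from torch import nn
layer = nn.Linear(5, 3, bias=False)
nn.init.eye_(layer.weight)                 # weight becomes a 3x5 "identity"
x = torch.randn(2, 5)
print(torch.allclose(layer(x), x[:, :3]))  # True: the output equals the first 3 inputs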
torch.nn.init.dirac_(tensor, groups=1)
Fills the {3, 4, 5}-dimensional input Tensor with the Dirac delta function. This preserves the identity of the inputs in convolutional layers, where as many input channels as possible are passed through unchanged. When groups > 1, each group of channels preserves identity.
(The Dirac delta function is zero everywhere except at zero.)
Parameters
- tensor ---- a {3, 4, 5}-dimensional torch.Tensor
- groups ---- the number of groups in the conv layer (default: 1)
import torch
w = torch.empty(2, 2, 2)
print(w)
# tensor([[[1.5624e-24, 4.5870e-41],
# [1.5609e-24, 4.5870e-41]],
# [[1.5609e-24, 4.5870e-41],
# [1.5609e-24, 4.5870e-41]]])
nn.init.dirac_(w)
print(w)
# tensor([[[0., 1.],
# [0., 0.]],
# [[0., 0.],
# [0., 1.]]])
import torch
w = torch.empty(2, 2, 2)
print(w)
# tensor([[[-5.2552e+08, 4.5673e-41],
# [-5.2503e+08, 4.5673e-41]],
# [[-5.2503e+08, 4.5673e-41],
# [-5.2503e+08, 4.5673e-41]]])
nn.init.dirac_(w, groups=2)
print(w)
# tensor([[[0., 1.],
# [0., 0.]],
# [[0., 1.],
# [0., 0.]]])
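A minimal sketch (assumed usage) of what "preserving the identity of the inputs" means for a conv layer: a 3-in/3-out convolution initialized with dirac_ acts as an identity map on its input.
import torch
from torch import nn
conv = nn.Conv2d(3, 3, kernel_size=3, padding=1, bias=False)
nn.init.dirac_(conv.weight)          # weight[i, i, 1, 1] = 1, everything else 0
x = torch.randn(1, 3, 8, 8)
print(torch.allclose(conv(x), x))    # True: each channel passes through unchanged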
torch.nn.init.uniform_(tensor, a=0.0, b=1.0)
Fills the input Tensor with values drawn from the uniform distribution $U(a, b)$.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- a ---- the lower bound of the uniform distribution
- b ---- the upper bound of the uniform distribution
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[-1.3934e+02, 4.5893e-41, -1.3910e+02, 4.5893e-41, -1.3910e+02],
# [ 4.5893e-41, -1.3910e+02, 4.5893e-41, -1.3911e+02, 4.5893e-41],
# [-1.3887e+02, 4.5893e-41, -1.3911e+02, 4.5893e-41, -1.3911e+02]])
nn.init.uniform_(w)
print(w)
# tensor([[0.3920, 0.7401, 0.6583, 0.4253, 0.0297],
# [0.2216, 0.8703, 0.5916, 0.2376, 0.7369],
# [0.2610, 0.8456, 0.7929, 0.8837, 0.8475]])
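Sketch (assumed usage): custom bounds are passed through a and b, and all sampled values land inside [a, b].
import torch
from torch import nn
w = torch.empty(3, 5)
nn.init.uniform_(w, a=-0.1, b=0.1)
print(w.min().item() >= -0.1, w.max().item() <= 0.1)  # True True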
torch.nn.init.normal_(tensor, mean=0.0, std=1.0)
Fills the input Tensor with values drawn from the normal distribution $N(mean, std^2)$.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- mean ---- the mean of the normal distribution
- std ---- the standard deviation of the normal distribution
import torch
w = torch.empty(3, 5)
print(w)
# tensor([[3.8552e+37, 4.5825e-41, 3.8513e+37, 4.5825e-41, 3.8513e+37],
# [4.5825e-41, 3.8513e+37, 4.5825e-41, 3.8513e+37, 4.5825e-41],
# [3.8474e+37, 4.5825e-41, 3.8513e+37, 4.5825e-41, 3.8514e+37]])
nn.init.normal_(w)
print(w)
# tensor([[ 2.2514, 0.2402, 3.2568, 0.0577, 0.8869],
# [ 0.6135, -0.0227, -0.0789, -0.1025, -0.1091],
# [-0.1615, -0.5869, 0.9064, 1.0609, -0.2149]])
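Sketch (assumed usage): with a custom mean/std and a larger tensor, the sample statistics land close to the requested values.
import torch
from torch import nn
w = torch.empty(1000, 1000)
nn.init.normal_(w, mean=2.0, std=0.5)
print(w.mean().item(), w.std().item())  # roughly 2.0 and 0.5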
torch.nn.init.xavier_uniform_(tensor, gain=1.0)
Fills the input Tensor with values sampled from the uniform distribution described in Glorot, X. & Bengio, Y. (2010), "Understanding the difficulty of training deep feedforward neural networks". The resulting values are sampled from $U(-a, a)$, where
$a = gain \times \sqrt{\frac{6}{fan\_in + fan\_out}}$
fan_in is the number of input units of the layer and fan_out is the number of output units; both are computed by the source code below.
def _calculate_fan_in_and_fan_out(tensor):
    dimensions = tensor.dim()
    if dimensions < 2:
        raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
    num_input_fmaps = tensor.size(1)
    num_output_fmaps = tensor.size(0)
    receptive_field_size = 1
    if tensor.dim() > 2:
        receptive_field_size = tensor[0][0].numel()
    fan_in = num_input_fmaps * receptive_field_size
    fan_out = num_output_fmaps * receptive_field_size
    return fan_in, fan_out
Parameters
- tensor ---- an n-dimensional torch.Tensor
- gain ---- an optional scaling factor
import torch
w = torch.empty(3,5)
print(w)
# tensor([[-9.0681e-24, 4.5773e-41, -9.0559e-24, 4.5773e-41, -9.0559e-24],
# [ 4.5773e-41, -9.0560e-24, 4.5773e-41, -9.0561e-24, 4.5773e-41],
# [-9.0437e-24, 4.5773e-41, -9.0561e-24, 4.5773e-41, -9.0562e-24]])
nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('linear'))
print(w)
# tensor([[-0.6126, 0.4646, -0.6003, -0.5533, 0.4278],
# [-0.0319, 0.8347, 0.7659, -0.1286, 0.3851],
# [-0.4798, 0.3353, 0.8211, -0.8149, -0.3660]])
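Worked check of the formula above (not from the original text): for a 3x5 tensor, fan_in = 5 and fan_out = 3, so with gain = 1 the bound is $a = \sqrt{6/8} \approx 0.866$, and every sampled value stays within it.
import torch
from torch import nn
w = torch.empty(3, 5)
nn.init.xavier_uniform_(w, gain=1.0)
bound = (6.0 / (5 + 3)) ** 0.5             # ≈ 0.8660
print(w.abs().max().item() <= bound)       # True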
torch.nn.init.xavier_normal_(tensor, gain=1.0)
Fills the input Tensor with values sampled from the normal distribution described in Glorot, X. & Bengio, Y. (2010), "Understanding the difficulty of training deep feedforward neural networks". The resulting values are sampled from $N(0, std^2)$, where
$std = gain \times \sqrt{\frac{2}{fan\_in + fan\_out}}$
fan_in and fan_out are computed by the same _calculate_fan_in_and_fan_out function shown above.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- gain ---- an optional scaling factor
import torch
w = torch.empty(3,5)
print(w)
# tensor([[ 0.0000e+00, 0.0000e+00, -1.3263e+11, 3.0632e-41, 1.4013e-45],
# [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
# [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]])
nn.init.xavier_normal_(w, gain=nn.init.calculate_gain('linear'))
print(w)
# tensor([[ 0.0388, -0.8639, 0.6914, -0.7558, -0.5346],
# [-1.1560, 0.7690, -0.1938, 1.4466, 0.3110],
# [-0.2934, 0.1475, 0.1823, 0.3484, 0.0932]])
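Worked check of the formula above: for a 300x500 tensor, fan_in = 500 and fan_out = 300, so std = sqrt(2/800) = 0.05; a large tensor keeps the sample std close to that value.
import torch
from torch import nn
w = torch.empty(300, 500)
nn.init.xavier_normal_(w, gain=1.0)
print(w.std().item())  # roughly 0.05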
torch.nn.init.kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
Fills the input Tensor with values sampled from the uniform distribution described in He, K. et al. (2015), "Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification". The resulting values are sampled from $U(-bound, bound)$, where
$bound = gain \times \sqrt{\frac{3}{fan\_mode}}$
and fan_mode is fan_in or fan_out, depending on the mode argument.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- a ---- the negative slope of the rectifier used after this layer (only used with 'leaky_relu')
- mode ---- either 'fan_in' (default) or 'fan_out'. Choosing 'fan_in' preserves the magnitude of the variance of the weights in the forward pass; 'fan_out' preserves the magnitudes in the backward pass.
- nonlinearity ---- the non-linear function (its nn.functional name); it is recommended to use only 'relu' or 'leaky_relu' (default)
Together with a, nonlinearity determines the gain:
'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d', 'sigmoid': gain is 1
'tanh': gain is $\frac{5}{3}$ (1.6666666666666667)
'relu': gain is $\sqrt{2}$ (1.4142135623730951)
'leaky_relu': gain is $\sqrt{\frac{2}{1+a^2}}$; with a=0.2 the gain is 1.3867504905630728
fan_in and fan_out are computed by the same _calculate_fan_in_and_fan_out function shown earlier; mode selects which of the two enters the formula.
import torch
w = torch.empty(3,5)
print(w)
# tensor([[0.0000e+00, 0.0000e+00, 1.7492e+23, 3.0690e-41, 1.4013e-45],
# [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
# [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]])
nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')
print(w)
# tensor([[-0.3740, 0.1611, -0.2453, 0.0271, 1.0839],
# [-0.1391, -0.4964, 0.2494, -0.3590, -0.8880],
# [ 0.1929, 0.9824, 0.4459, 0.6854, -0.7735]])
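Worked check of the bound above: with mode='fan_in' on a 3x5 tensor, fan_in = 5 and the 'relu' gain is sqrt(2), so bound = sqrt(2) * sqrt(3/5) ≈ 1.095.
import torch
from torch import nn
w = torch.empty(3, 5)
nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')
bound = (2.0 ** 0.5) * (3.0 / 5) ** 0.5    # ≈ 1.0954
print(w.abs().max().item() <= bound)       # True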
torch.nn.init.kaiming_normal_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu')
Fills the input Tensor with values sampled from the normal distribution described in He, K. et al. (2015), "Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification". The resulting values are sampled from $N(0, std^2)$, where
$std = \frac{gain}{\sqrt{fan\_mode}}$
and fan_mode is fan_in or fan_out, depending on the mode argument.
Parameters
- tensor ---- an n-dimensional torch.Tensor
- a ---- the negative slope of the rectifier used after this layer (only used with 'leaky_relu')
- mode ---- either 'fan_in' (default) or 'fan_out'. Choosing 'fan_in' preserves the magnitude of the variance of the weights in the forward pass; 'fan_out' preserves the magnitudes in the backward pass.
- nonlinearity ---- the non-linear function (its nn.functional name); it is recommended to use only 'relu' or 'leaky_relu' (default)
Together with a, nonlinearity determines the gain:
'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d', 'sigmoid': gain is 1
'tanh': gain is $\frac{5}{3}$ (1.6666666666666667)
'relu': gain is $\sqrt{2}$ (1.4142135623730951)
'leaky_relu': gain is $\sqrt{\frac{2}{1+a^2}}$; with a=0.2 the gain is 1.3867504905630728
fan_in and fan_out are computed by the same _calculate_fan_in_and_fan_out function shown earlier; mode selects which of the two enters the formula.
import torch
w = torch.empty(3,5)
print(w)
# tensor([[2.8852e-12, 4.5675e-41, 2.8818e-12, 4.5675e-41, 2.8818e-12],
# [4.5675e-41, 2.8819e-12, 4.5675e-41, 2.8819e-12, 4.5675e-41],
# [2.8785e-12, 4.5675e-41, 2.8819e-12, 4.5675e-41, 2.8819e-12]])
nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
print(w)
# tensor([[-0.4572, -0.4846, -1.1398, -0.6834, 1.0847],
# [ 1.4618, 0.0101, -1.3164, 0.8425, -0.0996],
# [ 0.8427, -0.3352, -0.2115, -0.9104, 0.1057]])
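Worked check of the formula above: with mode='fan_out' on a 300x500 tensor, fan_out = 300 and the 'relu' gain is sqrt(2), so std = sqrt(2)/sqrt(300) ≈ 0.0816.
import torch
from torch import nn
w = torch.empty(300, 500)
nn.init.kaiming_normal_(w, mode='fan_out', nonlinearity='relu')
print(w.std().item())  # roughly 0.0816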
torch.nn.init.orthogonal_(tensor, gain=1)
Fills the input Tensor with a (semi) orthogonal matrix, as described in Saxe, A. et al. (2013), "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks". The input tensor must have at least 2 dimensions; for tensors with more than 2 dimensions, the trailing dimensions are flattened.
Parameters
- tensor ---- an n-dimensional torch.Tensor, where n ≥ 2
- gain ---- an optional scaling factor
import torch
w = torch.empty(3,5)
print(w)
# tensor([[1.8313e-31, 4.5745e-41, 1.8295e-31, 4.5745e-41, 1.8295e-31],
# [4.5745e-41, 1.8295e-31, 4.5745e-41, 1.8295e-31, 4.5745e-41],
# [1.8277e-31, 4.5745e-41, 1.8295e-31, 4.5745e-41, 1.8295e-31]])
nn.init.orthogonal_(w)
print(w)
# tensor([[ 0.3958, -0.8281, -0.0863, 0.1510, 0.3569],
# [ 0.6480, 0.0215, 0.1499, -0.6569, -0.3546],
# [ 0.5187, 0.2536, 0.3505, 0.7091, -0.2023]])
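Sketch (not from the original text) of what "(semi) orthogonal" means here: for a 3x5 tensor the rows are orthonormal, so w @ w.T is (numerically) the 3x3 identity.
import torch
from torch import nn
w = torch.empty(3, 5)
nn.init.orthogonal_(w)
print(torch.allclose(w @ w.t(), torch.eye(3), atol=1e-6))  # True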
torch.nn.init.sparse_(tensor, sparsity, std=0.01)
Fills the 2D input Tensor as a sparse matrix, where the non-zero elements are drawn from the normal distribution $\mathcal{N}(0, 0.01)$, as described in Martens, J. (2010), "Deep learning via Hessian-free optimization". Only 2-dimensional tensors are supported.
Parameters
- tensor ---- a 2-dimensional torch.Tensor
- sparsity ---- the fraction of elements in each column to be set to zero (with sparsity=1 every element becomes zero)
- std ---- the standard deviation of the normal distribution used to generate the non-zero values
import torch
w = torch.empty(3,5)
print(w)
# tensor([[3.8586e-23, 4.5761e-41, 3.8537e-23, 4.5761e-41, 3.8537e-23],
# [4.5761e-41, 3.8537e-23, 4.5761e-41, 3.8538e-23, 4.5761e-41],
# [3.8488e-23, 4.5761e-41, 3.8538e-23, 4.5761e-41, 3.8538e-23]])
nn.init.sparse_(w, sparsity=0.5)
print(w)
# tensor([[ 0.0046, 0.0000, -0.0100, -0.0115, 0.0000],
# [ 0.0000, -0.0082, 0.0000, 0.0000, 0.0000],
# [ 0.0000, 0.0000, 0.0000, 0.0000, -0.0019]])
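Sketch (assumed behaviour) checking the sparsity: with sparsity=0.5 and 3 rows, 2 entries per column end up as zero, matching the output above (the implementation appears to zero ceil(sparsity * rows) entries per column).
import torch
from torch import nn
w = torch.empty(3, 5)
nn.init.sparse_(w, sparsity=0.5)
print((w == 0).sum(dim=0))  # tensor([2, 2, 2, 2, 2])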