import os
import json
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import torchvision
from torchvision import models
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
import visdom
# from tensorboardX import SummaryWriter
from torch.utils.tensorboard import SummaryWriter
nn.Linear(in_features, out_features, bias=True)
>>> linear = nn.Linear(784, 10)
>>> input = torch.randn(4, 784)
>>> output = linear(input)
>>> output.shape
torch.Size([4, 10])
nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0,
dilation=1, groups=1, bias=True, padding_mode='zeros')
Use .weight and .bias to inspect the convolution kernel's weights and bias.
>>> conv = nn.Conv2d(1, 1, 3, 1, 1)
>>> conv.weight.shape
torch.Size([1, 1, 3, 3])
>>> conv.bias.shape
torch.Size([1])
The input feature map must be given in the form (N, C, H, W).
>>> input = torch.randn(1, 1, 5, 5)
>>> output = conv(input)
>>> output.shape
torch.Size([1, 1, 5, 5])
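As a quick sanity check on how kernel_size, stride and padding interact, here is a minimal sketch (the channel counts and input size below are arbitrary, chosen just for illustration; it reuses the imports at the top of these notes):
conv = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=2, padding=1)
x = torch.randn(1, 3, 32, 32)           # input must be (N, C, H, W)
y = conv(x)
# H_out = floor((H + 2*padding - dilation*(kernel_size - 1) - 1) / stride + 1)
#       = floor((32 + 2*1 - 1*(3 - 1) - 1) / 2 + 1) = 16
print(y.shape)                          # torch.Size([1, 8, 16, 16])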
nn.MaxPool2d(kernel_size, stride=None, padding=0,
dilation=1, return_indices=False, ceil_mode=False)
return_indices – if True, will return the max indices along with the outputs.
ceil_mode – when True, will use ceil instead of floor to compute the output shape.
stride – note: the default value of stride is kernel_size, not 1.
>>> max_pooling = nn.MaxPool2d(2, stride=2)
>>> input = torch.randn(1, 1, 4, 4)
>>> max_pooling(input)
tensor([[[[0.9636, 0.7075],
[1.0641, 1.1749]]]])
>>> max_pooling(input).shape
torch.Size([1, 1, 2, 2])
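To confirm the note above that stride defaults to kernel_size, a small sketch (the input size is arbitrary):
pool_default = nn.MaxPool2d(2)              # stride not given, so stride = kernel_size = 2
pool_explicit = nn.MaxPool2d(2, stride=2)
x = torch.randn(1, 1, 4, 4)
# both halve the spatial size and give identical results
print(pool_default(x).shape)                            # torch.Size([1, 1, 2, 2])
print(torch.equal(pool_default(x), pool_explicit(x)))   # True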
nn.AvgPool2d(kernel_size, stride=None, padding=0,
ceil_mode=False, count_include_pad=True, divisor_override=None)
If padding is non-zero, then the input is implicitly zero-padded on both sides for padding number of points.
ceil_mode – when True, will use ceil instead of floor to compute the output shape.
count_include_pad – when True, will include the zero-padding in the averaging calculation.
divisor_override – if specified, it will be used as the divisor, otherwise kernel_size will be used.
The parameters kernel_size, stride, padding can either be:
an int – in which case the same value is used for the height and width dimension
a tuple of two ints – in which case, the first int is used for the height dimension, and the second int for the width dimension
nn.Sequential(
    nn.AdaptiveMaxPool2d((1, 1)),
    nn.Flatten()
)
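The snippet above is a common global-pooling head: AdaptiveMaxPool2d((1, 1)) squeezes any spatial size down to 1x1, and Flatten turns the result into an (N, C) vector. A minimal sketch of it in use (the feature-map sizes are arbitrary):
head = nn.Sequential(
    nn.AdaptiveMaxPool2d((1, 1)),   # (N, C, H, W) -> (N, C, 1, 1), regardless of H and W
    nn.Flatten()                    # (N, C, 1, 1) -> (N, C)
)
features = torch.randn(4, 512, 7, 7)
print(head(features).shape)         # torch.Size([4, 512])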
Of course, the layers below can also be replaced by the corresponding functions in torch.nn.functional.
nn.Sigmoid()
>>> sigmoid = nn.Sigmoid()
>>> sigmoid(torch.Tensor([1, 1, 2, 2]))
tensor([0.7311, 0.7311, 0.8808, 0.8808])
nn.ReLU(inplace=False)
>>> relu = nn.ReLU(inplace=True)
>>> input = torch.randn(2, 2)
>>> input
tensor([[-0.4853, 2.3864],
[ 0.7122, -0.6493]])
>>> relu(input)
tensor([[0.0000, 2.3864],
[0.7122, 0.0000]])
>>> input
tensor([[0.0000, 2.3864],
[0.7122, 0.0000]])
nn.Softmax(dim=None)
>>> softmax = nn.Softmax(dim=1)
>>> score = torch.randn(1, 4)
>>> score
tensor([[ 0.3101, 3.5648, 1.0988, -1.5856]])
>>> softmax(score)
tensor([[0.0342, 0.8855, 0.0752, 0.0051]])
nn.LogSoftmax(dim=None)
Followed by an nn.NLLLoss layer, it is equivalent to a CrossEntropyLoss layer.
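A quick numerical check of this equivalence (random scores and labels, sizes chosen arbitrarily):
scores = torch.randn(3, 5)
labels = torch.tensor([1, 0, 4])
log_softmax = nn.LogSoftmax(dim=1)
nll = nn.NLLLoss()
ce = nn.CrossEntropyLoss()
# the two losses agree up to floating-point error
print(nll(log_softmax(scores), labels))
print(ce(scores, labels))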
nn.Dropout(p=0.5, inplace=False)
>>> dropout = nn.Dropout(0.5, inplace=False)
>>> input = torch.randn(1, 20)
>>> output = dropout(input)
>>> output
tensor([[-2.9413, 0.0000, 1.8461, 1.9605, 0.2774, -0.0000, -2.5381, -2.0313,
-0.1914, 0.0000, 0.5346, -0.0000, 0.0000, 4.4960, -3.8345, -1.0938,
4.3297, 2.1258, -4.1431, 0.0000]])
>>> input
tensor([[-1.4707, 0.5105, 0.9231, 0.9802, 0.1387, -0.4195, -1.2690, -1.0156,
-0.0957, 0.8108, 0.2673, -2.0898, 0.6666, 2.2480, -1.9173, -0.5469,
2.1648, 1.0629, -2.0716, 0.9974]])
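Two things worth noting in the output above: in training mode the surviving elements are scaled by 1/(1-p) (hence the doubled values), and in eval mode dropout is disabled entirely. A short sketch:
dropout = nn.Dropout(0.5)
x = torch.randn(1, 20)
dropout.eval()                      # switch to evaluation mode
print(torch.equal(dropout(x), x))   # True: dropout is a no-op in eval mode
dropout.train()                     # back to training mode: elements are zeroed and the rest scaled by 1/(1-p)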
torch.nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1,
affine=True, track_running_stats=True)
num_features – C from an expected input of size (N, C, H, W)
eps – a value added to the denominator for numerical stability. Default: 1e-5
momentum – the value used for the running_mean and running_var computation. Can be set to None for cumulative moving average (i.e. simple average). Default: 0.1
affine – a boolean value that when set to True, this module has learnable affine parameters. Default: True
track_running_stats – a boolean value that when set to True, this module tracks the running mean and variance, and when set to False, this module does not track such statistics and always uses batch statistics in both training and eval modes. Default: True
Because the Batch Normalization is done over the C dimension, computing statistics on (N, H, W) slices, it's common terminology to call this Spatial Batch Normalization.
The mean and standard deviation are calculated per dimension over the mini-batches, and γ and β are learnable parameter vectors of size C (where C is the input size). By default, the elements of γ are set to 1 and the elements of β are set to 0.
>>> bn = nn.BatchNorm2d(64)
>>> input = torch.randn(4, 64, 28, 28)
>>> output = bn(input)
>>> output.shape
torch.Size([4, 64, 28, 28])
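To see the "statistics over (N, H, W) per channel" point concretely, a small sketch that checks the normalized output has roughly zero mean and unit variance in each channel (γ=1, β=0 at initialization):
bn = nn.BatchNorm2d(64)
x = torch.randn(4, 64, 28, 28)
y = bn(x)
# statistics are computed over the (N, H, W) slice of each channel
print(y.mean(dim=(0, 2, 3))[:3])    # approximately 0 for every channel
print(y.var(dim=(0, 2, 3))[:3])     # approximately 1 for every channel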
nn.LSTM
Note the dropout argument.
input_size – The number of expected features in the input x.
hidden_size – The number of features in the hidden state h.
num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together. Default: 1
bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
batch_first – If True, then the input and output tensors are provided as (batch, seq, feature) instead of (seq, batch, feature). Note that this does not apply to hidden or cell states. Default: False
dropout – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
bidirectional – If True, becomes a bidirectional LSTM. Default: False
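A minimal sketch of the input/output shapes with batch_first=True (the sizes below are arbitrary; note that the hidden and cell states remain (num_layers, batch, hidden_size)):
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(3, 5, 10)                   # (batch, seq, feature) because batch_first=True
output, (h_n, c_n) = lstm(x)
print(output.shape)                         # torch.Size([3, 5, 20])  (batch, seq, hidden_size)
print(h_n.shape)                            # torch.Size([2, 3, 20])  (num_layers, batch, hidden_size)
print(c_n.shape)                            # torch.Size([2, 3, 20])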
nn.NLLLoss(weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
It is useful to train a classification problem with C (C = number of classes) classes.
The target that this loss expects should be a class index in the range [0, C−1] where C = number of classes; if ignore_index is specified, this loss also accepts this class index (this index may not necessarily be in the class range).
Output: scalar. If reduction is 'none', then the same size as the target: (N), or (N, d_1, d_2, ..., d_K) with K ≥ 1 in the case of K-dimensional loss.
The unreduced (i.e. with reduction set to 'none') loss can be described as
$\ell(x, y) = L = \{l_1, \dots, l_N\}^\top, \quad l_n = -w_{y_n} x_{n, y_n}, \quad w_c = \text{weight}[c] \cdot \mathbb{1}\{c \neq \text{ignore\_index}\}$
If reduction is 'mean' (default 'mean'), then $\ell(x, y) = \sum_{n=1}^{N} \frac{1}{\sum_{n=1}^{N} w_{y_n}} l_n$; if reduction is 'sum' (default 'mean'), then $\ell(x, y) = \sum_{n=1}^{N} l_n$.
Parameters
weight (Tensor, optional) – a manual rescaling weight given to each class. If given, it has to be a Tensor of size C, assigning weight to each of the classes. This is particularly useful when you have an unbalanced training set.
size_average (bool, optional) – Deprecated
ignore_index (int, optional) – Specifies a target value that is ignored and does not contribute to the input gradient.
reduce (bool, optional) – Deprecated
reduction (string, optional) – Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
m = nn.LogSoftmax(dim=1)
loss = nn.NLLLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.tensor([1, 0, 4])
output = loss(m(input), target)
N, C = 5, 4
loss = nn.NLLLoss()
# input is of size N x C x height x width
data = torch.randn(N, C, 8, 8)
m = nn.LogSoftmax(dim=1)
# each element in target has to have 0 <= value < C
target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
output = loss(m(data), target)
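To connect the formula above with the code, a sketch that recomputes the 1D case by hand: NLLLoss with reduction='mean' just picks the log-probability of the target class in each row, negates it, and averages (unit class weights assumed):
log_probs = nn.LogSoftmax(dim=1)(torch.randn(3, 5))
target = torch.tensor([1, 0, 4])
# pick log_probs[n, target[n]] for every sample, negate, then average
manual = -log_probs[torch.arange(3), target].mean()
print(manual)
print(nn.NLLLoss()(log_probs, target))      # same value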
nn.CrossEntropyLoss(weight=None, size_average=None,
ignore_index=-100, reduce=None, reduction='mean')
This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
The parameters are the same as for nn.NLLLoss, so they are not repeated here.
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()
optim.SGD(params, lr=<required parameter>, momentum=0,
dampening=0, weight_decay=0, nesterov=False)
dampening (float, optional) – dampening for momentum (default: 0)
What dampening actually does will be figured out when reading the source code.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# clear the gradients before each optimization step
optimizer.zero_grad()
loss.backward()
optimizer.step()
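On the dampening question above: in the momentum update, dampening scales down the current gradient's contribution to the momentum buffer, b ← momentum·b + (1 − dampening)·g, after which the parameter moves by −lr·b (with nesterov=False). A toy one-step sketch of that reading (made-up values, not the actual optimizer internals):
# hand-rolled single SGD-with-momentum step, for intuition only
momentum, dampening, lr = 0.9, 0.0, 0.1
param = torch.tensor([1.0])
grad = torch.tensor([0.5])
buf = torch.zeros_like(param)               # momentum buffer
buf = momentum * buf + (1 - dampening) * grad
param = param - lr * buf
print(param)                                # tensor([0.9500])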
optim.Adagrad(params, lr=0.01, lr_decay=0, weight_decay=0,
initial_accumulator_value=0, eps=1e-10)
lr (float, optional) – learning rate (default: 1e-2)
lr_decay (float, optional) – learning rate decay (default: 0)
optim.RMSprop(params, lr=0.01, alpha=0.99, eps=1e-08,
weight_decay=0, momentum=0, centered=False)
alpha (float, optional) – smoothing constant (default: 0.99)
momentum (float, optional) – momentum factor (default: 0)
centered (bool, optional) – if True, compute the centered RMSProp, the gradient is normalized by an estimation of its variance
This alpha should be the momentum-like parameter in RMSProp that forgets past gradients, so what is this momentum? Again, this can only be answered after reading the source code.
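On the alpha-vs-momentum question above: one reading is that alpha is the smoothing constant of the running average of squared gradients, while momentum adds a separate classical momentum buffer on top of the RMSProp step. A toy one-step sketch of that reading (made-up values, not the actual optimizer internals):
# hand-rolled single RMSprop-with-momentum step, for intuition only
lr, alpha, eps, momentum = 0.01, 0.99, 1e-8, 0.9
param = torch.tensor([1.0])
grad = torch.tensor([0.5])
square_avg = torch.zeros_like(param)        # running average of squared gradients (controlled by alpha)
buf = torch.zeros_like(param)               # classical momentum buffer (controlled by momentum)
square_avg = alpha * square_avg + (1 - alpha) * grad ** 2
buf = momentum * buf + grad / (square_avg.sqrt() + eps)
param = param - lr * buf
print(param)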
optim.Adadelta(params, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0)
lr (float, optional) – coefficient that scales delta before it is applied to the parameters (default: 1.0). According to the original Adadelta formulation there should be no need for lr, yet there is an lr parameter here; this also needs reading the source code to answer.
rho (float, optional) – coefficient used for computing a running average of squared gradients (default: 0.9)
optim.Adam(params, lr=0.001, betas=(0.9, 0.999),
eps=1e-08, weight_decay=0, amsgrad=False)
amsgrad (boolean, optional) – whether to use the AMSGrad variant of this algorithm from the paper On the Convergence of Adam and Beyond (default: False)
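Typical usage mirrors the SGD example above; a minimal sketch assuming some model net and a computed loss already exist (the hyperparameters shown are the defaults):
# net and loss are assumed to exist, as in the SGD example above
optimizer = optim.Adam(net.parameters(), lr=0.001, betas=(0.9, 0.999), amsgrad=False)
optimizer.zero_grad()
loss.backward()
optimizer.step()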