# 经过网络后的输出图片形状的计算公式 (formula for the output image shape after a layer): out = floor((in - kernel + 2*padding) / stride) + 1
import torch
import torchvision
from torchvision import models
from torch import nn
class CenteredLayer(nn.Module):
    """Parameter-free custom layer that subtracts the mean from its input."""

    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        # Center the input so the output mean is (numerically) zero.
        centered = x - x.mean()
        return centered
# Sanity-check CenteredLayer: applied directly, then inside an nn.Sequential.
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
y = net(torch.rand(4, 8))
y.mean().item()  # should be very close to 0
class MyDense(nn.Module):
    """Layer whose weights live in an nn.ParameterList: three 4x4 matrices plus one 4x1."""

    def __init__(self):
        super(MyDense, self).__init__()
        self.params = nn.ParameterList(
            [nn.Parameter(torch.randn(4, 4)) for _ in range(3)])
        self.params.append(nn.Parameter(torch.randn(4, 1)))

    def forward(self, x):
        # Chain matrix products through every stored parameter in order.
        for weight in self.params:
            x = torch.mm(x, weight)
        return x
# Instantiate and print the ParameterList-based layer.
net = MyDense()
print(net)
class MyDictDense(nn.Module):
    """Layer whose weights live in an nn.ParameterDict and are selected by name."""

    def __init__(self):
        super(MyDictDense, self).__init__()
        # Fixed: the original read ``torvh.randn`` (typo), which raised
        # NameError at construction time.
        self.params = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1)),
        })
        self.params.update({'linear3': nn.Parameter(torch.randn(4, 2))})

    def forward(self, x, choice='linear1'):
        # ``choice`` names the weight matrix used for the product.
        return torch.mm(x, self.params[choice])
# Exercise MyDictDense with each named weight, then chain it with MyDense.
net = MyDictDense()
print(net)
x = torch.ones(1, 4)
print(net(x, 'linear1'))
print(net(x, 'linear2'))
print(net(x, 'linear3'))
net = nn.Sequential(
    MyDictDense(),
    MyDense(),
)
print(net)
print(net(x))
# Round-trip a tensor through torch.save / torch.load.
x = torch.ones(3)
torch.save(x, 'x.pt')
x2 = torch.load('x.pt')
print(x2)
class MLP(nn.Module):
    """Tiny 3-2-1 multilayer perceptron used for the state_dict demo."""

    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        hidden_out = self.act(self.hidden(x))
        return self.output(hidden_out)
# Inspect both the model's and the optimizer's state_dict.
net = MLP()
net.state_dict()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer.state_dict()
# 保存 (save) — the recommended file suffix is .pt or .pth:
torch.save(model.state_dict(),PATH)#推荐的文件后缀是pt或者pth
# 加载 (load):
model = TheModelClass(*args,**kwargs)
model.load_state_dict(torch.load(PATH))
# 保存整个模型 (save the whole model):
torch.save(model,PATH)
# 加载 (load):
model = torch.load(PATH)
# Build the demo input: a 6x8 image of ones with columns 2..5 zeroed
# (column 6 excluded), plus the [1, -1] edge-detection kernel.
x = torch.ones(6, 8)
x[:, 2:6] = 0
print(x)
k = torch.tensor([[1, -1]])  # one row, two columns
x = torch.ones(6, 8)
x[:, 2:6] = 0
# print(x)
k = torch.tensor([[1, -1]], dtype=torch.float)
def corr2d(x, k):
    """2-D cross-correlation of input ``x`` with kernel ``k`` (no padding, stride 1)."""
    h, w = k.shape
    out_h = x.shape[0] - h + 1
    out_w = x.shape[1] - w + 1
    y = torch.zeros((out_h, out_w))
    for i in range(out_h):
        for j in range(out_w):
            window = x[i:i + h, j:j + w]
            y[i, j] = (window * k).sum()
    return y
# Detect the vertical edges: +1 at the 1->0 transition, -1 at the 0->1 transition.
y = corr2d(x, k)
print(y)
# Implementation for the follow-up exercise.
class Conv2D(nn.Module):
    """Minimal 2-D convolution layer built on corr2d, with a learnable scalar bias."""

    def __init__(self, kernel_size):
        super(Conv2D, self).__init__()
        self.weight = nn.Parameter(torch.randn(kernel_size))
        self.bias = nn.Parameter(torch.randn(1))

    def forward(self, x):
        return corr2d(x, self.weight) + self.bias
# Step 1: learn the [1, -1] kernel by gradient descent on the squared error.
# Construct a conv layer whose kernel array has shape (1, 2).
conv2D = Conv2D(kernel_size=(1, 2))
step = 20
lr = 0.01
for i in range(step):
    y_hat = conv2D(x)
    l = ((y_hat - y) ** 2).sum()
    l.backward()
    # Gradient-descent parameter update.
    conv2D.weight.data -= lr * conv2D.weight.grad
    conv2D.bias.data -= lr * conv2D.bias.grad
    # Zero the gradients, otherwise backward() accumulates across iterations.
    conv2D.weight.grad.fill_(0)
    conv2D.bias.grad.fill_(0)
    if (i + 1) % 5 == 0:
        print('Step %d,loss %.3f' % (i + 1, l.item()))
print('weight:', conv2D.weight.data)
print('bias:', conv2D.bias.data)
import time
import torch
from torch import nn,optim
import sys
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define a VGG block: ``num_convs`` 3x3 conv+ReLU pairs, then a 2x2 max pool.
def vgg_block(num_convs, in_channels, out_channels):
    """Return an nn.Sequential VGG block; the final pool halves height and width."""
    layers = []
    for i in range(num_convs):
        # Only the first conv maps from in_channels; the rest keep out_channels.
        src = in_channels if i == 0 else out_channels
        layers.append(nn.Conv2d(src, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
# VGG-11 configuration: (num_convs, in_channels, out_channels) per block.
conv_arch = ((1, 1, 64), (1, 64, 128), (2, 128, 256), (2, 256, 512), (2, 512, 512))
# After 5 vgg_blocks the 224x224 input is halved 5 times: 224 / 32 = 7.
fc_features = 512 * 7 * 7  # channels * width * height
fc_hidden_units = 4096  # arbitrary choice
# Assemble VGG-11.
def vgg(conv_arch, fc_features, fc_hidden_units=4096):
    """Build VGG-11 from ``conv_arch``; every vgg_block halves height and width."""
    net = nn.Sequential()
    # Convolutional part.
    for i, (num_convs, in_channels, out_channels) in enumerate(conv_arch):
        net.add_module('vgg_block_' + str(i + 1),
                       vgg_block(num_convs, in_channels, out_channels))
    # Fully connected part.
    # Fixed: the last Linear referenced the misspelled name ``fc_hidden_ubits``,
    # which raised NameError when the function was called.
    net.add_module('fc', nn.Sequential(d2l.FlattenLayer(),
                                       nn.Linear(fc_features, fc_hidden_units),
                                       nn.ReLU(),
                                       nn.Dropout(0.5),
                                       nn.Linear(fc_hidden_units, fc_hidden_units),
                                       nn.ReLU(),
                                       nn.Dropout(0.5),
                                       nn.Linear(fc_hidden_units, 10)
                                       ))
    return net
# Build a single-channel 224x224 sample and report each top-level block's
# output shape. named_children yields only the direct sub-modules and their
# names (named_modules would also recurse into nested sub-modules).
net = vgg(conv_arch, fc_features, fc_hidden_units)
X = torch.rand(1, 1, 224, 224)
for name, blk in net.named_children():
    # Fixed: the original called ``blk()`` without the input tensor (TypeError).
    X = blk(X)
    print(name, 'output shape:', X.shape)
'''
输出结果如下:
vgg_block_1 output shape:torch.Size([1,64,112,112])
vgg_block_2 output shape:torch.Size([1,128,56,56])
vgg_block_3 output shape:torch.Size([1,256,28,28])
vgg_block_4 output shape:torch.Size([1,512,14,14])
vgg_block_5 output shape:torch.Size([1,512,7,7])
fc output shape:torch.Size([1,10])
每次将输入的高和宽减半,直到最终高和宽变成7后传入全连接层。
与此同时,输出通道数每次翻倍,直到变成512.因为每个卷积层的窗口大小一样。
所以每层的模型参数尺寸和计算复杂度与输入高、输入宽、输入通道数和输出通道数的乘积成正比。
VGG这种高和宽减半以及通道翻倍的设计使得多数卷积层都有相同的模型参数尺寸和计算复杂度。
'''
import time
import torch
from torch import nn,optim
import sys
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    """NiN block: one full conv followed by two 1x1 convs, each with ReLU.

    Fixed: the first 1x1 conv passed the misspelled keyword ``kernel_szie``,
    which raised TypeError at construction time.
    """
    blk = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU())
    return blk
import time
import torch
from torch import nn,optim
import sys
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def nin_block(in_channels, out_channels, kernel_size, stride, padding):
    """NiN block (duplicate of the definition above): a configurable conv plus
    two 1x1 convs, ReLU after each.

    Fixed: the second conv used the misspelled keyword ``kernel_szie``
    (TypeError when the block is built).
    """
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)
class GlobalAvgPool2d(nn.Module):
    """Global average pooling: the pool window equals the input's full height/width.

    Fixed: the class was declared as ``GloabalAvgPool2d`` (typo) while both the
    ``super().__init__`` call and every caller use ``GlobalAvgPool2d``, so the
    class could never be instantiated successfully.
    """

    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, x):
        # Pool over the entire spatial extent, leaving a (N, C, 1, 1) output.
        return F.avg_pool2d(x, kernel_size=x.size()[2:])
# NiN network: nin_blocks with max pools between them, ending in a 10-channel
# block plus global average pooling instead of fully connected layers.
# Fixed: ``nn.SEquential`` (typo) -> ``nn.Sequential`` (AttributeError).
net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, stride=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(96, 256, kernel_size=5, stride=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(256, 384, kernel_size=3, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(0.5),
    # 10 label classes.
    nin_block(384, 10, kernel_size=3, stride=1, padding=1),
    GlobalAvgPool2d(),
    # Collapse the 4-D output to 2-D: (batch_size, 10).
    d2l.FlattenLayer()
)
# Pass one sample through to report every top-level layer's output shape.
X = torch.rand(1, 1, 224, 224)
for name, blk in net.named_children():
    X = blk(X)
    print(name, 'output shape:', X.shape)
'''
输出结果是:
0 output shape:torch.Size([1,96,54,54])
1 output shape:torch.Size([1,96,26,26])
2 output shape:torch.Size([1,256,26,26])
3 output shape:torch.Size([1,256,12,12])
4 output shape:torch.Size([1,384,12,12])
5 output shape:torch.Size([1,384,5,5])
6 output shape:torch.Size([1,384,5,5])
7 output shape:torch.Size([1,10,5,5])
8 output shape:torch.Size([1,10,1,1])
9 output shape:torch.Size([1,10])
'''
import time
import torch
from torch import nn,optim
import torch.nn.functional as F
import sys
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Inception(nn.Module):
    """GoogLeNet Inception block; c1..c4 are the per-path output channel counts."""

    def __init__(self, in_c, c1, c2, c3, c4):
        super(Inception, self).__init__()
        # Path 1: a single 1x1 conv.
        self.p1_1 = nn.Conv2d(in_c, c1, kernel_size=1)
        # Path 2: 1x1 conv followed by a 3x3 conv.
        # Fixed: three convs passed the misspelled keyword ``kernrl_size``
        # (TypeError at construction time).
        self.p2_1 = nn.Conv2d(in_c, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # Path 3: 1x1 conv followed by a 5x5 conv.
        # Fixed: the 5x5 conv needs padding=2 (not 1) to preserve height/width;
        # otherwise torch.cat below fails because the paths disagree on shape.
        # The pasted model summary later in the file confirms padding=(2, 2).
        self.p3_1 = nn.Conv2d(in_c, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Path 4: 3x3 max pool followed by a 1x1 conv.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_c, c4, kernel_size=1)

    def forward(self, x):
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        return torch.cat((p1, p2, p3, p4), dim=1)  # concatenate on the channel dim
# GoogLeNet: five stages b1..b5, then flatten and a 10-way linear classifier.
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1),
                   nn.Conv2d(64, 192, kernel_size=3, padding=1),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b3 = nn.Sequential(Inception(192, 64, (96, 128), (16, 32), 32),
                   Inception(256, 128, (128, 192), (32, 96), 64),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b4 = nn.Sequential(Inception(480, 192, (96, 208), (16, 48), 64),
                   Inception(512, 160, (112, 224), (24, 64), 64),
                   Inception(512, 128, (128, 256), (24, 64), 64),
                   Inception(512, 112, (144, 288), (32, 64), 64),
                   Inception(528, 256, (160, 320), (32, 128), 128),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),
                   Inception(832, 384, (192, 384), (48, 128), 128),
                   d2l.GlobalAvgPool2d())
net = nn.Sequential(b1, b2, b3, b4, b5,
                    d2l.FlattenLayer(),
                    nn.Linear(1024, 10))
net = nn.Sequential(b1, b2, b3, b4, b5, d2l.FlattenLayer(), nn.Linear(1024, 10))
# Feed a 96x96 sample through each top-level stage and print the shapes.
X = torch.rand(1, 1, 96, 96)
for stage in net.children():
    X = stage(X)
    print('output shape:', X.shape)
Sequential(
(0): Sequential(
(0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
(1): ReLU()
(2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)
(1): Sequential(
(0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
(1): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)
(2): Sequential(
(0): Inception(
(p1_1): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(192, 16, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1))
)
(1): Inception(
(p1_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(128, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(32, 96, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
)
(2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)
(3): Sequential(
(0): Inception(
(p1_1): Conv2d(480, 192, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(480, 96, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(96, 208, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(480, 16, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(16, 48, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(480, 64, kernel_size=(1, 1), stride=(1, 1))
)
(1): Inception(
(p1_1): Conv2d(512, 160, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(512, 112, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(112, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(24, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1))
)
(2): Inception(
(p1_1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(24, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1))
)
(3): Inception(
(p1_1): Conv2d(512, 112, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(512, 144, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(144, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(512, 32, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1))
)
(4): Inception(
(p1_1): Conv2d(528, 256, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(528, 160, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(528, 32, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(32, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(528, 128, kernel_size=(1, 1), stride=(1, 1))
)
(5): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)
(4): Sequential(
(0): Inception(
(p1_1): Conv2d(832, 256, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(832, 160, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(832, 32, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(32, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(832, 128, kernel_size=(1, 1), stride=(1, 1))
)
(1): Inception(
(p1_1): Conv2d(832, 384, kernel_size=(1, 1), stride=(1, 1))
(p2_1): Conv2d(832, 192, kernel_size=(1, 1), stride=(1, 1))
(p2_2): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(p3_1): Conv2d(832, 48, kernel_size=(1, 1), stride=(1, 1))
(p3_2): Conv2d(48, 128, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(p4_1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
(p4_2): Conv2d(832, 128, kernel_size=(1, 1), stride=(1, 1))
)
(2): GlobalAvgPool2d()
)
(5): FlattenLayer()
(6): Linear(in_features=1024, out_features=10, bias=True)
)
output shape: torch.Size([1, 64, 24, 24])
output shape: torch.Size([1, 192, 12, 12])
output shape: torch.Size([1, 480, 6, 6])
output shape: torch.Size([1, 832, 3, 3])
output shape: torch.Size([1, 1024, 1, 1])
output shape: torch.Size([1, 1024])
output shape: torch.Size([1, 10])
import time
import torch
from torch import nn, optim
import torch.nn.functional as F
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X``; returns (Y, updated moving_mean, updated moving_var).

    In training mode the batch statistics are used and the running statistics
    are updated; in prediction mode the supplied running statistics are used
    directly.
    """
    if not is_training:
        # Prediction: normalize with the moving-average statistics.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully connected case: per-feature statistics (dim 0 is the batch).
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Convolutional case: per-channel statistics (channel dim = 1), i.e.
            # averaged over batch, height and width. keepdim=True preserves the
            # (1, C, 1, 1) shape so the later subtraction broadcasts correctly.
            mean = (X.mean(dim=0, keepdim=True)
                     .mean(dim=2, keepdim=True)
                     .mean(dim=3, keepdim=True))
            var = (((X - mean) ** 2)
                   .mean(dim=0, keepdim=True)
                   .mean(dim=2, keepdim=True)
                   .mean(dim=3, keepdim=True))
        # Training: normalize with the current batch statistics.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the running statistics by exponential moving average.
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # learnable scale and shift
    return Y, moving_mean, moving_var
class BatchNorm(nn.Module):
    """Batch-norm layer wrapping ``batch_norm``; ``num_dims`` is 2 (FC) or 4 (conv)."""

    def __init__(self, num_features, num_dims):
        super(BatchNorm, self).__init__()
        shape = (1, num_features) if num_dims == 2 else (1, num_features, 1, 1)
        # Learnable scale/shift parameters, initialized to 1 and 0.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics: plain tensors (no gradients), initialized to 0.
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.zeros(shape)

    def forward(self, X):
        # Move the running statistics onto X's device if they differ.
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # ``self.training`` defaults to True and flips to False after .eval().
        Y, self.moving_mean, self.moving_var = batch_norm(
            self.training, X, self.gamma, self.beta,
            self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        return Y
# LeNet with the hand-written BatchNorm layer after each conv/linear layer.
net = nn.Sequential(
    nn.Conv2d(1, 6, 5),  # in_channels, out_channels, kernel_size
    BatchNorm(6, num_dims=4),  # 4-D: follows a conv layer; 6 = its output channels
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # kernel_size, stride
    nn.Conv2d(6, 16, 5),
    BatchNorm(16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),
    d2l.FlattenLayer(),
    nn.Linear(16 * 4 * 4, 120),
    BatchNorm(120, num_dims=2),  # 2-D: follows a fully connected layer
    nn.Sigmoid(),
    nn.Linear(120, 84),
    BatchNorm(84, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(84, 10)
)
# Train on Fashion-MNIST, then inspect the first batch-norm layer's parameters.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device,
              num_epochs)
print(net)
# The printed structure shows the first batch-norm layer sits at net[1].
print(net[1].gamma.view((-1,)))  # flatten the parameters to a single row
print(net[1].beta.view((-1,)))
print(net[1].gamma)
Sequential(
(0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
(1): BatchNorm()
(2): Sigmoid()
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
(5): BatchNorm()
(6): Sigmoid()
(7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(8): FlattenLayer()
(9): Linear(in_features=256, out_features=120, bias=True)
(10): BatchNorm()
(11): Sigmoid()
(12): Linear(in_features=120, out_features=84, bias=True)
(13): BatchNorm()
(14): Sigmoid()
(15): Linear(in_features=84, out_features=10, bias=True)
)
tensor([1.0703, 1.2678, 0.9472, 1.0295, 1.1669, 1.0209], device='cuda:0',
grad_fn=)
tensor([ 0.3542, 0.3791, -0.5505, -0.3212, 0.2804, -0.0223], device='cuda:0',
grad_fn=)
tensor([[[[1.0703]],
[[1.2678]],
[[0.9472]],
[[1.0295]],
[[1.1669]],
[[1.0209]]]], device='cuda:0',requires_grad=True)
# LeNet again, now using torch's built-in batch-norm layers (nn.BatchNorm2d /
# nn.BatchNorm1d) instead of the hand-written BatchNorm above.
net = nn.Sequential(
    # Input: (256, 1, 28, 28) — batch of 256 single-channel 28x28 images.
    nn.Conv2d(1, 6, 5),  # -> (256, 6, 24, 24): 28 - 5 + 1 = 24
    nn.BatchNorm2d(6),  # BatchNorm2d follows a conv; 6 = its output channels
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # -> (256, 6, 12, 12): (24 - 2 + 2) / 2 = 12
    nn.Conv2d(6, 16, 5),  # -> (256, 16, 8, 8): 12 - 5 + 1 = 8
    nn.BatchNorm2d(16),
    nn.Sigmoid(),
    nn.MaxPool2d(2, 2),  # -> (256, 16, 4, 4)
    d2l.FlattenLayer(),
    nn.Linear(16 * 4 * 4, 120),  # flattened, so the FC input is 16*4*4
    nn.BatchNorm1d(120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.BatchNorm1d(84),
    nn.Sigmoid(),
    nn.Linear(84, 10)  # 10 target classes
)
# coding=utf-8
# /usr/bin/env python
'''
Author: syy
date: 19-9-24 上午11:29
'''
import time
import torch
from torch import nn,optim
import torch.nn.functional as F
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Residual(nn.Module):
    """ResNet residual block; an optional 1x1 conv matches the skip path's
    channels and stride so that X + Y is well defined."""

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels,
                                   kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            # Project the input so it matches Y's channels and spatial size.
            X = self.conv3(X)
        return F.relu(Y + X)
# Shape checks: identity-shaped block, then a channel-doubling stride-2 block.
blk = Residual(3, 3)
X = torch.rand((4, 3, 6, 6))
blk(X).shape  # torch.Size([4, 3, 6, 6])
blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape  # torch.Size([4, 6, 3, 3])
# ResNet stem: 7x7 stride-2 conv, batch norm, ReLU, 3x3 stride-2 max pool.
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    """Stage of ``num_residuals`` Residual blocks producing ``out_channels``.

    Fixed: the original signature omitted ``out_channels`` even though the body
    and every call site (e.g. ``resnet_block(64, 128, 2)``) use it, and one
    Residual call misspelled it as ``out_channles`` — both raised NameError.
    """
    if first_block:
        # The first stage keeps the channel count: the stem's max pool has
        # already downsampled, so no further reduction is needed here.
        assert in_channels == out_channels
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # First residual of each later stage doubles the channels and
            # halves height/width (stride=2), with a 1x1 conv on the skip.
            blk.append(Residual(in_channels, out_channels,
                                use_1x1conv=True, stride=2))
        else:
            blk.append(Residual(out_channels, out_channels))
    return nn.Sequential(*blk)
# Four residual stages, then global average pooling and the classifier head.
net.add_module('resnet_block1', resnet_block(64, 64, 2, first_block=True))
net.add_module('resnet_block2', resnet_block(64, 128, 2))
net.add_module('resnet_block3', resnet_block(128, 256, 2))
net.add_module('resnet_block4', resnet_block(256, 512, 2))
net.add_module('global_avg_pool', d2l.GlobalAvgPool2d())  # -> (batch, 512, 1, 1)
net.add_module('fc', nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
# Trace one 224x224 sample through every named top-level module.
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, 'output shape:\t', X.shape)
# coding=utf-8
# /usr/bin/env python
'''
Author: syy
date: 19-9-24 上午11:29
'''
import time
import torch
from torch import nn,optim
import torch.nn.functional as F
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Residual block (second copy). The optional 1x1 conv changes the skip path's
# channel count so it matches the block's output and X + Y is computable.
class Residual(nn.Module):
    """ResNet residual block with an optional 1x1 projection on the skip path."""

    def __init__(self, in_channels, out_channels, use_1x1conv=False, stride=1):
        super(Residual, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, padding=1, stride=stride)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, padding=1)
        # X + Y requires matching shapes; when in_channels != out_channels
        # (or stride != 1) the skip path needs this 1x1 conv projection.
        if use_1x1conv:
            self.conv3 = nn.Conv2d(in_channels, out_channels,
                                   kernel_size=1, stride=stride)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            # Project the input (note: X and Y are separate tensors here).
            X = self.conv3(X)
        return F.relu(Y + X)
# Example 1: channels match (3 -> 3), so no 1x1 conv is needed for X + Y.
blk = Residual(3, 3)  # input and output channels both 3
X = torch.rand((4, 3, 6, 6))  # (batch, channels, height, width)
# print(blk(X).shape)  # torch.Size([4, 3, 6, 6])
# Example 2: channels change (3 -> 6), so the skip path needs the 1x1 conv;
# stride=2 additionally halves the height and width.
blk = Residual(3, 6, use_1x1conv=True, stride=2)
blk(X).shape  # torch.Size([4, 6, 3, 3])
# ResNet stem: like GoogLeNet's opening (64-channel 7x7 stride-2 conv followed
# by a 3x3 stride-2 max pool), but with a batch-norm layer after the conv.
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
# Build one ResNet stage; ``Residual`` above is the individual residual block.
def resnet_block(in_channels, out_channels, num_residuals, first_block=False):
    """Stage of ``num_residuals`` residual blocks sharing ``out_channels``."""
    if first_block:
        # Stage 1 keeps the channel count: the stem's max pool has already
        # reduced height and width, so no extra downsampling is needed here.
        assert in_channels == out_channels
    layers = []
    for i in range(num_residuals):
        opens_later_stage = (i == 0 and not first_block)
        if opens_later_stage:
            # Double the channels and halve height/width via stride=2,
            # projecting the skip path with a 1x1 conv.
            layers.append(Residual(in_channels, out_channels,
                                   use_1x1conv=True, stride=2))
        else:
            layers.append(Residual(out_channels, out_channels))
    return nn.Sequential(*layers)
# ResNet-18 body: where GoogLeNet appends 4 groups of Inception blocks, ResNet
# appends 4 stages of residual blocks (2 per stage, same channels per stage).
net.add_module('resnet_block1', resnet_block(64, 64, 2, first_block=True))
net.add_module('resnet_block2', resnet_block(64, 128, 2))  # doubles channels, halves H/W
net.add_module('resnet_block3', resnet_block(128, 256, 2))
net.add_module('resnet_block4', resnet_block(256, 512, 2))
# As in GoogLeNet, finish with global average pooling and a linear classifier.
net.add_module('global_avg_pool', d2l.GlobalAvgPool2d())  # -> (batch, 512, 1, 1)
net.add_module('fc', nn.Sequential(d2l.FlattenLayer(), nn.Linear(512, 10)))
X = torch.rand((1, 1, 224, 224))
for name, layer in net.named_children():
    X = layer(X)
    print(name, 'output shape:\t', X.shape)
# 4 stages x 4 convs each (ignoring the 1x1 projections) = 16 layers, plus the
# stem conv and the final FC layer: 18 learnable layers, hence "ResNet-18".
print(net)
输出结果:
这个name=0就是对应后面输出的net结构中的(0)即在四个模块之前的第一个卷积核为7x7的卷积层。
0 output shape: torch.Size([1, 64, 112, 112])
这个name=1对应的是7x7卷积层后的批量归一化层
1 output shape: torch.Size([1, 64, 112, 112])
这个name=2对应的是归一化层后的激活函数层ReLU()
2 output shape: torch.Size([1, 64, 112, 112])
这个name=3对应的是激活函数层后的最大池化层
3 output shape: torch.Size([1, 64, 56, 56])
第一个模块,包含2个残差块
计算这个模块网络中不含有1x1卷积层时的X的shape时,就是按照这个X通过模块中的每一层这样计算下去得到最终的输出形状
resnet_block1 output shape: torch.Size([1, 64, 56, 56])
第二个模块
对于这种模块网络中含有1x1卷积层的,虽然对于X也是这样通过模块中的每一层网络这样传递下去,由于(从代码可以知道,这时X和Y相当于是两个量,所以可以看成是计算这种情况下的X的shape时,就只用分析其经过1x1卷积后的shape即可,与X经过所有层后的输出结果是等同的),而上面不含有1x1卷积层的情况加,X就是代码中的Y。
resnet_block2 output shape: torch.Size([1, 128, 28, 28])
第三个模块
resnet_block3 output shape: torch.Size([1, 256, 14, 14])
第四个模块
resnet_block4 output shape: torch.Size([1, 512, 7, 7])
四个模块后面的全局平均池化层
global_avg_pool output shape: torch.Size([1, 512, 1, 1])
最后接的全连接层
fc output shape: torch.Size([1, 10])
下面输出的是net结构
Sequential(
(0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
(3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(resnet_block1): Sequential(
(0): Residual(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Residual(
(conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(resnet_block2): Sequential(
(0): Residual(
(conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv3): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2))
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Residual(
(conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(resnet_block3): Sequential(
(0): Residual(
(conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2))
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Residual(
(conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(resnet_block4): Sequential(
(0): Residual(
(conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2))
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(1): Residual(
(conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(global_avg_pool): GlobalAvgPool2d()
(fc): Sequential(
(0): FlattenLayer()
(1): Linear(in_features=512, out_features=10, bias=True)
)
)
# coding=utf-8
# /usr/bin/env python
'''
Author: syy
date: 19-9-24 下午4:43
'''
import time
import torch
from torch import nn,optim
import torch.nn.functional as F
from d2lzh_pytorch import *
import d2lzh_pytorch as d2l
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def conv_block(in_channels, out_channels):
    """DenseNet conv block: BN -> ReLU -> 3x3 conv (height/width preserved)."""
    return nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
# "blk" is short for "block".
class DenseBlock(nn.Module):
    """Dense block: each conv sees the channel-wise concatenation of the input
    and all previous conv outputs."""

    def __init__(self, num_convs, in_channels, out_channels):
        super(DenseBlock, self).__init__()
        convs = []
        for i in range(num_convs):
            # Each conv's input grows by out_channels per preceding conv.
            convs.append(conv_block(in_channels + i * out_channels, out_channels))
        self.net = nn.ModuleList(convs)
        # Final channel count after all concatenations.
        self.out_channels = in_channels + num_convs * out_channels

    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            # Concatenate input and output along the channel dimension.
            X = torch.cat((X, Y), dim=1)
        return X
# A 2-conv dense block with growth rate 10: channels grow 3 -> 3 + 2*10 = 23.
blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
Y.shape  # torch.Size([4, 23, 8, 8])
def transition_block(in_channels, out_channels):
    """DenseNet transition layer: BN -> ReLU -> 1x1 conv (shrink channels)
    -> 2x2 average pool (halve height and width)."""
    layers = [
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*layers)
# Shrink 23 -> 10 channels and halve the 8x8 maps to 4x4.
blk = transition_block(23, 10)
blk(Y).shape  # torch.Size([4, 10, 4, 4])
# DenseNet stem. NOTE(review): kernel_size=2 with padding=3 matches the output
# shapes printed below (96 -> 101), but the standard DenseNet stem uses a 7x7
# stride-2 conv — confirm which was intended.
net = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=2, padding=3),  # Sequential entries are comma-separated
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
# Four dense blocks (growth rate 32, 4 convs each), with a channel-halving
# transition layer between consecutive blocks; finish with BN -> ReLU ->
# global average pool -> flatten -> linear classifier.
num_channels, growth_rate = 64, 32  # num_channels tracks the current width
num_convs_in_dense_blocks = [4, 4, 4, 4]
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    net.add_module('DenseBlock_%d' % i, DB)
    # Channel count produced by the dense block just added.
    num_channels = DB.out_channels
    # Insert a halving transition layer between dense blocks (not after the last).
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module('transition_block_%d' % i,
                       transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2
net.add_module('BN', nn.BatchNorm2d(num_channels))
net.add_module('relu', nn.ReLU())
net.add_module('global_avg_pool', d2l.GlobalAvgPool2d())
net.add_module('fc', nn.Sequential(d2l.FlattenLayer(), nn.Linear(num_channels, 10)))
X = torch.rand((1,1,96,96))
for name,layer in net.named_children():
X = layer(X)
print(name,'output shape:\t',X.shape)
输出结果如下:
因为X:[1,1,96,96],通过卷积层(96-2+1+6)/1=101,所以输出[1, 64, 101, 101]
0 output shape: torch.Size([1, 64, 101, 101])
通过BN层,BN层的feature_num值也即是第一个参数与输入图片的通道数一致
1 output shape: torch.Size([1, 64, 101, 101])
通过激活函数层
2 output shape: torch.Size([1, 64, 101, 101])
通过最大池化层,(101-3+2+2)/2=51
3 output shape: torch.Size([1, 64, 51, 51])
通过第一个稠密块,在单个稠密块结构中,BN的feature_num值与卷积层的输入通道数值一致。这里经过稠密块后的图片的通道数与以往计算不一样,
不再是通过的最后一层卷积的输出通道数,因为在稠密块中进行了输出通道数与输入通道在通道维上相连结,所以通过的卷积层的输出通道数也是输出
图片通道数的增长率,故64+32*4=192,经过稠密块除了使得通道数增长外,其他没有变化
DenseBlock_0 output shape: torch.Size([1, 192, 51, 51])
经过第一个过渡层,使得通道数以及高宽都减半,通道数减半是因为通过的1x1卷积层发生了作用,而高宽减半是后面通过的stride值为2的平均池化层发生了作用
transition_block_0 output shape: torch.Size([1, 96, 25, 25])
通过第二个稠密块,通道数增加32*4
DenseBlock_1 output shape: torch.Size([1, 224, 25, 25])
经过第二个过渡层,通道数以及高宽减半
transition_block_1 output shape: torch.Size([1, 112, 12, 12])
第三个稠密层,通道数增加32*4
DenseBlock_2 output shape: torch.Size([1, 240, 12, 12])
第三个过渡层,通道数以及高宽减半
transition_block_2 output shape: torch.Size([1, 120, 6, 6])
第四个稠密块,通道数增加32*4
DenseBlock_3 output shape: torch.Size([1, 248, 6, 6])
最后一个稠密层后不再接过渡层,输出图片的通道数与后面接的BN层的feature_num值也即是第一个参数值相等
BN output shape: torch.Size([1, 248, 6, 6])
relu output shape: torch.Size([1, 248, 6, 6])
经过全局平均池化层后高宽都变成1
global_avg_pool output shape: torch.Size([1, 248, 1, 1])
经过全连接层,输出变成两个维度,第一个参数是样本数,第二个参数即输出通道数也即使期望分类的类别数
fc output shape: torch.Size([1, 10])
下面是整个net的结构:
Sequential(
(0): Conv2d(1, 64, kernel_size=(2, 2), stride=(1, 1), padding=(3, 3))
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU()
(3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(DenseBlock_0): DenseBlock(
(net): ModuleList(
(0): Sequential(
(0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): Sequential(
(0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(96, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): Sequential(
(0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): Sequential(
(0): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(160, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(transition_block_0): Sequential(
(0): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1))
(3): AvgPool2d(kernel_size=2, stride=2, padding=0)
)
(DenseBlock_1): DenseBlock(
(net): ModuleList(
(0): Sequential(
(0): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(96, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): Sequential(
(0): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): Sequential(
(0): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(160, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): Sequential(
(0): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(192, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(transition_block_1): Sequential(
(0): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(224, 112, kernel_size=(1, 1), stride=(1, 1))
(3): AvgPool2d(kernel_size=2, stride=2, padding=0)
)
(DenseBlock_2): DenseBlock(
(net): ModuleList(
(0): Sequential(
(0): BatchNorm2d(112, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(112, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): Sequential(
(0): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(144, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): Sequential(
(0): BatchNorm2d(176, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(176, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): Sequential(
(0): BatchNorm2d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(208, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(transition_block_2): Sequential(
(0): BatchNorm2d(240, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(240, 120, kernel_size=(1, 1), stride=(1, 1))
(3): AvgPool2d(kernel_size=2, stride=2, padding=0)
)
(DenseBlock_3): DenseBlock(
(net): ModuleList(
(0): Sequential(
(0): BatchNorm2d(120, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(120, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): Sequential(
(0): BatchNorm2d(152, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(152, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): Sequential(
(0): BatchNorm2d(184, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(184, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): Sequential(
(0): BatchNorm2d(216, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(1): ReLU()
(2): Conv2d(216, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
(BN): BatchNorm2d(248, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU()
(global_avg_pool): GlobalAvgPool2d()
(fc): Sequential(
(0): FlattenLayer()
(1): Linear(in_features=248, out_features=10, bias=True)
)
)
import torch
from torch import nn,optim
from torch.utils.data import Dataset,DataLoader
import torchvision
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torchvision import models
# Normalize the RGB channels with the ImageNet per-channel mean and std.
normalize = transforms.Normalize(mean=[0.485,0.456,0.406],std=[0.229,0.224,0.225])
# Training-time augmentation: random crop + horizontal flip.
train_augs = transforms.Compose([
    transforms.RandomResizedCrop(size=224),# entries in Compose are comma-separated
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize
])
# Test-time: deterministic resize + center crop.
test_augs = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    normalize
])
pretrained_net = models.resnet18(pretrained=True)
print(pretrained_net.fc)
#Linear(in_features=512,out_features=1000,bias=True)
# The source model's fully connected (output) layer was trained for the 1000
# ImageNet classes; replace it so it outputs the 2 classes of the target dataset.
pretrained_net.fc = nn.Linear(512,2)
# map() applies id() to each fc parameter, collecting the ids of the new output layer's parameters.
output_params = list(map(id,pretrained_net.fc.parameters()))
# filter() keeps every parameter that is NOT in the output layer,
# i.e. all the pretrained feature-extraction parameters.
feature_params = filter(lambda p:id(p) not in output_params,pretrained_net.parameters())
lr = 0.01
# Fine-tuning: small lr for the pretrained features, 10x larger lr for the new fc layer.
optimizer = optim.SGD([{'params':feature_params},
                       {'params':pretrained_net.fc.parameters(),'lr':lr*10}],
                      lr=lr,weight_decay=0.01)
# Training helper that fine-tunes `net` on the hotdog dataset.
def train_fine_tuning(net,optimizer,batch_size=128,num_epochs=5):
    # NOTE(review): relies on module-level `os`, `data_dir`, `device` and `d2l`
    # being defined elsewhere — confirm `import os` and `data_dir` exist.
    train_iter = DataLoader(ImageFolder(os.path.join(data_dir,'hotdog/train'),transform=train_augs),
                            batch_size,shuffle=True)
    test_iter = DataLoader(ImageFolder(os.path.join(data_dir,'hotdog/test'),transform=test_augs),
                           batch_size)
    loss = torch.nn.CrossEntropyLoss()
    d2l.train(train_iter,test_iter,net,loss,optimizer,device,num_epochs)
train_fine_tuning(pretrained_net,optimizer)
pretrained_net = model_zoo.vision.resnet18_v2(pretrained=True)
pretrained_net.features[-4:]#输出最后四层
pretrained_net.output
Out[6]:(HybridSequential(
(0):BatchNorm(axis=1,eps=1e-05,momentum=.9,fix_gamma=False,use_global_stats=False,in_channels=512)
(1):Activation(relu)
(2):GlobalAvgPool2D(size=(1,1),stride=(1,1),padding=(0,0),ceil_mode=True,pool_type=avg,layout=NCHW)
(3):Flatten
),Dense(512 -> 1000,linear))
net = nn.HybridSequential()
for layer in pretrained_net.features[:-2]:#除了最后两层的所有层
net.add(layer)
num_classes = 21
net.add(nn.Conv2D(num_classes,kernel_size=1),
nn.Conv2DTranspose(num_classes,kernel_size=64,padding=16,strides=32))
# Style transfer (MXNet/gluon): shallow VGG-19 layers capture style,
# layer 25 captures content.
pretrained_net = model_zoo.vision.vgg19(pretrained=True)
style_layers,content_layers = [0,5,10,19,28],[25]
net = nn.Sequential()
# Keep only the layers up to the deepest one we extract features from.
for i in range(max(content_layers+style_layers)+1):
    net.add(pretrained_net.features[i])
def extract_features(X,content_layers,style_layers):
    """Pass X through the global `net` layer by layer, collecting the
    activations of the layers listed in content_layers and style_layers."""
    contents = []
    styles = []
    for idx in range(len(net)):
        X = net[idx](X)
        if idx in content_layers:
            contents.append(X)
        if idx in style_layers:
            styles.append(X)
    return contents,styles
def get_contents(image_shape,ctx):
    # Preprocess the global `content_img`, move it to ctx, and extract
    # its content-layer features.
    content_X = preprocess(content_img,image_shape).copyto(ctx)
    contents_Y,_ = extract_features(content_X,content_layers,style_layers)
    return content_X,contents_Y
def get_styles(image_shape,ctx):
    """Preprocess the style image and return it with its style-layer features.

    Bug fix: the original returned the undefined name `style_Y`
    (NameError at call time) instead of the local `styles_Y`.
    """
    style_X = preprocess(style_img,image_shape).copyto(ctx)
    _,styles_Y = extract_features(style_X,content_layers,style_layers)
    return style_X,styles_Y
def content_loss(Y_hat,Y):
    """Squared-error content loss, averaged over all elements."""
    diff = Y_hat - Y
    return diff.square().mean()
def gram(X):
    # Gram matrix of the channel features, normalized by the element count.
    # X.size is the total element count (mxnet ndarray); // keeps n an integer.
    num_channels,n = X.shape[1], X.size // X.shape[1]
    X = X.reshape((num_channels,n))
    # nd is mxnet.ndarray; / (true division) normalizes the Gram matrix
    return nd.dot(X, X.T) / (num_channels*n)
def style_loss(Y_hat,gram_Y):
    """Squared-error loss between gram(Y_hat) and the precomputed Gram matrix gram_Y."""
    gram_Y_hat = gram(Y_hat)
    return (gram_Y_hat - gram_Y).square().mean()
class GeneratedImage(nn.Block):
    # MXNet/gluon model whose only "parameter" is the generated image itself,
    # so the image is what gets optimized.
    def __init__(self,img_shape,**kwargs):
        super(GeneratedImage,self).__init__(**kwargs)
        self.weight = self.params.get('weight',shape=img_shape)
    def forward(self):
        # The forward pass simply returns the image being optimized.
        return self.weight.data()
def get_inits(X,ctx,lr,styles_Y):
    # Create the generated image, initialize it to X (the content image),
    # and set up an Adam trainer plus the precomputed style Gram matrices.
    gen_img = GeneratedImage(X.shape)
    gen_img.initialize(init.Constant(X),ctx=ctx,force_reinit=True)
    trainer = gluon.Trainer(gen_img.collect_params(),'adam',
                            {'learning_rate':lr})
    styles_Y_gram = [gram(Y) for Y in styles_Y]
    return gen_img(),styles_Y_gram,trainer
x = torch.ones(2,4,requires_grad=True)
例如:
x=torch.tensor([[1.,2.,3.],[4.,5.,6.]],requires_grad=True)
y=x+1
z=2*y*y
J=torch.mean(z)
J.backward()
x.grad
会输出:tensor([[1.3333,2.0000,2.6667],
[3.3333,4.0000,4.6667]])
import torch.nn as nn
import torch.nn.functional as F
# NOTE(review): these methods belong to a `class Model(nn.Module):` whose
# header is missing from the notes — confirm against the original tutorial.
def __init__(self):
    # call nn.Module's own initializer
    super(Model,self).__init__()
    self.conv1=nn.Conv2d(1,20,5)
    self.conv2=nn.Conv2d(20,20,5)
# forward describes how the layers are connected (activations, pooling, ...)
def forward(self,x):
    x=F.relu(self.conv1(x))
    return F.relu(self.conv2(x))
torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
#16通道进来,64通道出去,kernel为3x3,stride为2
conv1=nn.Conv2d(16,64,3,2)
#100个16通道的32x32数据
output_data=conv1(input_data)
print(output_data.size())
#输出torch.Size([100,64,15,15])
#全连接层,32个特征输入,128个特征输出
#相当于一个有128个神经元的神经网络层
fc1=nn.Linear(32,128)
#100个样本,每个样本有32个特征值
input=torch.randn(100,32)
output=fc1(input)
print(output.size())
#输出torch.Size([100,128])
maxpool=nn.MaxPool2d(2,2)
#下面代码的第一个参数是样本个数,第二个参数:通道数,第三个参数是H,第四个参数是W
input=torch.randn(100,3,64,64)
output=maxpool(input)
print(output.size())
#输出torch.Size([100,3,32,32])
F.avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True) → Tensor
input=torch.randn(100,3,64,64)
output=F.max_pool2d(input,2,2)
print(output.size())
#输出torch.Size(100,3,32,32])
torch.nn.BatchNorm2d(num_features, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
input=torch.randn(100,64,128,128)
bn=nn.BatchNorm2d(64)
output=bn(input)
print(output.size())
#输出torch.Size([100,64,128,128])
data=torch.randn(10)
output=F.relu(data)
torch.nn.ReLU
torch.nn.Sigmoid
torch.nn.Tanh
torch.nn.Softmax
data=torch.randn(10)
softmax=nn.Softmax()
output=softmax(data)
MSE
torch.nn.MSELoss(size_average=None, reduce=None, reduction=’mean’)
Cross-Entropy
torch.nn.CrossEntropyLoss(weight=None, size_average=None, ignore_index=-100, reduce=None, reduction=’mean’)
例子:
loss_f=nn.CrossEntropyLoss()
#batch_size=8,num_classes=3
x=torch.randn(8,3,requires_grad=True)
#一组1维的long类型标签
labels=torch.ones(8,dtype=torch.long)
loss=loss_f(x,labels)
loss.backward()
x.grad
import torch
import torch.nn.functional as F# activation functions live here
# NOTE(review): `x` must be a tensor defined earlier in the notes.
y_relu = F.relu(x).data.numpy()
# Fix: torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
y_sigmoid = torch.sigmoid(x).numpy()
y_tanh = torch.tanh(x).numpy()
import torch
import torch.nn.functional as F#激励函数都在这里
class Net(torch.nn.Module):# subclass torch's Module
    """One-hidden-layer network: Linear -> ReLU -> Linear."""
    def __init__(self,n_feature,n_hidden,n_output):
        super(Net,self).__init__()# run Module's own initialization
        # declare the form of each layer
        self.hidden = torch.nn.Linear(n_feature,n_hidden)# hidden layer, linear
        self.predict = torch.nn.Linear(n_hidden,n_output)# output layer, linear
    def forward(self,x):# this is also Module's forward hook
        # forward-propagate the input and produce the network output
        hidden_out = F.relu(self.hidden(x))# activation over the hidden layer's linear value
        return self.predict(hidden_out)# raw linear output, not class predictions
net = Net(n_feature=1,n_hidden=10,n_output=1)# one output unit per class
print(net)# show the architecture
# the optimizer is the training tool
optimizer = torch.optim.SGD(net.parameters(),lr=0.2)# pass in all of net's parameters and the learning rate
loss_func = torch.nn.CrossEntropyLoss()# loss between prediction and target; target is a 1D tensor (batch), prediction is 2D (batch, n_classes)
for t in range(100):
    prediction = net(x)# feed training data x to net, get the prediction
    loss = loss_func(prediction,y)# compute the loss between the two
    optimizer.zero_grad()# clear the leftover gradients from the previous step
    loss.backward()# backpropagate the loss, compute the parameter updates
    optimizer.step()# apply the updates to net's parameters
class Net(torch.nn.Module):
    """One-hidden-layer regression network: Linear -> ReLU -> Linear."""
    def __init__(self,n_feature,n_hidden,n_output):
        super(Net,self).__init__()
        self.hidden = torch.nn.Linear(n_feature,n_hidden)
        self.predict = torch.nn.Linear(n_hidden,n_output)
    def forward(self,x):
        # Bug fix: original wrote `F.relu(self,hidden(x))` — a comma instead
        # of a dot — which raised a NameError on `hidden` at call time.
        x = F.relu(self.hidden(x))
        x = self.predict(x)
        return x
net1 = Net(1,10,1)
# The same architecture built with Sequential instead of a Module subclass.
net2 = torch.nn.Sequential(
    torch.nn.Linear(1,10),
    torch.nn.ReLU(),
    torch.nn.Linear(10,1)
)
第一种方法:
torch.save(net1,'net.pkl')#保存整个网络
第二种方法:
torch.save(net1.state_dict(),'net_params.pkl')#只保存网络中的参数
这种方式会提取整个神经网络,网络大时速度慢
def restore_net():
    # load the entire saved net1 into net2 (slow for large networks)
    net2 = torch.load('net.pkl')
    prediction = net2(x)
这种方式会提取所有的参数,然后放到你的新建网络中
def restore_params():
    # build a fresh net3 with the same architecture as the saved network
    net3 = torch.nn.Sequential(
        torch.nn.Linear(1,10),
        torch.nn.ReLU(),
        torch.nn.Linear(10,1)
    )
    # copy the saved parameters into net3
    net3.load_state_dict(torch.load('net_params.pkl'))
    prediction = net3(x)
import torch
import torch.utils.data as Data
torch.manual_seed(1)# reproducible
BATCH_SIZE = 5# number of samples per training batch
x = torch.linspace(1,10,10)# x data (torch tensor)
y = torch.linspace(10,1,10)# y data (torch tensor)
# First wrap the tensors in a Dataset torch recognizes.
# Bug fix: TensorDataset takes the tensors positionally — the old
# data_tensor=/target_tensor= keywords were removed in PyTorch 0.4.
torch_dataset = Data.TensorDataset(x,y)
# Put the dataset into a DataLoader.
loader = Data.DataLoader(
    dataset = torch_dataset,# torch TensorDataset format
    batch_size = BATCH_SIZE,# mini batch size
    shuffle = True,# shuffle the data each epoch
    num_workers = 2,# read the data with multiple workers
)
for epoch in range(3):# train over the whole dataset 3 times
    # Bug fix: the loop variable was misspelled `bacth_y` while the print
    # below used `batch_y`, raising a NameError on the first batch.
    for step,(batch_x,batch_y) in enumerate(loader):# each step the loader yields one mini-batch
        # training would happen here
        # print some of the data
        print('Epoch:',epoch,'|Step:',step,'|batch x:',
              batch_x.numpy(),'|batch y:',batch_y.numpy())
由上面代码中注释出的输出模板可以看出,每一步都导出了5个数据学习,每个epoch的导出数据都是先打乱再导出
方法一:SGD
思路是:把数据拆分成小批小批的,分批放入网络中训练。每次使用批数据,虽然不能反应整体情况,但是加速了很多,而且也不会丢失太多准确率。
方法二:Momentum
思路:大多数除了SGD外的其他加速训练过程的方法都是在更新神经网络参数那一步上动手脚,传统的W的更新是把原始W累加上一个负的学习率乘以校正值(dx),这种方法会让学习过程曲折无比,摇摇晃晃走了很多弯路。
所以把它从平地上放到一个斜坡上,只要它往下坡的方向走一点,由于向下的惯性,所以就不一直往下,这样走的弯路会变少。这就是momentum参数更新。
方法三:AdaGrad
思路:这种方法是在学习率上动手脚,使得每一个参数更新都有自己独特的学习率,作用和momentum是一样的,是给它一个不好走路的鞋子,这样使得它摇晃走路就会脚疼,这样鞋子成了走弯路的阻力,逼着它走直路。
方法四:RMSProp
思路:是把momentum和adagrad方法结合起来。但是并没有把momentum合并完全。
方法五:Adam
思路:是在RMSProp方法中没有合并完全的部分合并了,进行了补全。
#默认的network形式
class Net(torch.nn.Module):
    """Default network form: 1 input -> 20 hidden units -> 1 output."""
    def __init__(self):
        super(Net,self).__init__()
        self.hidden = torch.nn.Linear(1,20)# hidden layer
        self.predict = torch.nn.Linear(20,1)# output layer
    def forward(self,x):
        activated = F.relu(self.hidden(x))# activation function for hidden layer
        return self.predict(activated)# linear output
# one net per optimizer so each optimizer trains its own copy
net_SGD = Net()
net_Momentum = Net()
net_RMSprop = Net()
net_Adam = Net()
nets = [net_SGD,net_Momentum,net_RMSprop,net_Adam]
# NOTE(review): LR, EPOCH and loader must be defined earlier in the file.
opt_SGD = torch.optim.SGD(net_SGD.parameters(),lr=LR)
opt_Momentum = torch.optim.SGD(net_Momentum.parameters(),lr=LR,momentum=0.8)
opt_RMSprop = torch.optim.RMSprop(net_RMSprop.parameters(),lr=LR,alpha=0.9)
opt_Adam = torch.optim.Adam(net_Adam.parameters(),lr=LR,betas=(0.9,0.99))
optimizers = [opt_SGD,opt_Momentum,opt_RMSprop,opt_Adam]
loss_func = torch.nn.MSELoss()
losses_his = [[],[],[],[]]# record the loss of each net during training
for epoch in range(EPOCH):
    print('Epoch:',epoch)
    for step,(b_x,b_y)in enumerate(loader):
        # train the net belonging to each optimizer
        for net,opt,l_his in zip(nets,optimizers,losses_his):
            output = net(b_x)# get output for every net
            loss = loss_func(output,b_y)# compute loss for every net
            opt.zero_grad()# clear gradients for next train
            loss.backward()# backpropagation, compute gradients
            opt.step()# apply gradients
            l_his.append(loss.data.numpy())# loss recorder
class CNN(nn.Module):
    """Two conv blocks (conv -> ReLU -> maxpool) plus a fully connected
    classifier for 28x28 single-channel images, 10 output classes.

    Bug fixes vs. the notes: `super(CNN.self)` -> `super(CNN,self)`,
    `out_channel` -> `out_channels`, and a missing comma after the second
    nn.Conv2d inside its Sequential.
    """
    def __init__(self):
        super(CNN,self).__init__()
        self.conv1 = nn.Sequential(# input shape (1,28,28)
            nn.Conv2d(
                in_channels = 1,# input height
                out_channels = 16,# n_filters
                kernel_size = 5,# filter size
                stride = 1,# filter movement/step
                padding = 2,# with stride=1, padding=(kernel_size-1)/2 keeps H and W unchanged
            ),# output shape (16,28,28)
            nn.ReLU(),# activation
            nn.MaxPool2d(kernel_size=2)# downsample over 2x2 windows, output shape (16,14,14)
        )
        self.conv2 = nn.Sequential(# input shape (16,14,14)
            nn.Conv2d(16,32,5,1,2),# output shape (32,14,14)
            nn.ReLU(),# activation
            nn.MaxPool2d(2)# output shape (32,7,7)
        )
        self.out = nn.Linear(32*7*7,10)# fully connected layer, output 10 classes
    def forward(self,x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0),-1)# flatten the conv maps to (batch_size, 32*7*7)
        output = self.out(x)
        return output
cnn = CNN()
print(cnn)#net architecture
#最后会输出这样的结果
CNN (
(conv1): Sequential (
(0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(1): ReLU ()
(2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
)
(conv2): Sequential (
(0): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(1): ReLU ()
(2): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
)
(out): Linear (1568 -> 10) 其中1568=32*7*7
)
# NOTE(review): LR, EPOCH and train_loader must be defined earlier.
optimizer = torch.optim.Adam(cnn.parameters(),lr=LR)# optimize all cnn parameters
loss_func = nn.CrossEntropyLoss()# the target label is not one-hotted
#training and testing
for epoch in range(EPOCH):
    for step,(b_x,b_y)in enumerate(train_loader):# batch data; x is normalized while iterating train_loader
        output = cnn(b_x)# cnn output
        loss = loss_func(output,b_y)# cross entropy loss
        optimizer.zero_grad()# clear gradients for this training step
        loss.backward()# backpropagation, compute gradients
        optimizer.step()# apply gradients
class RNN(nn.Module):
    """Vanilla RNN over (batch, time_step, input_size) sequences with a
    linear head applied at every time step.

    Bug fixes vs. the notes: the missing `super().__init__()` call, a
    period instead of a comma after `hidden_size = 32`, a missing colon
    on the `for` line, and `torch.stack(outs.dim = 1)` ->
    `torch.stack(outs, dim=1)`.
    """
    def __init__(self):
        super(RNN,self).__init__()
        self.rnn = nn.RNN(# a plain RNN
            input_size = 1,
            hidden_size = 32,# rnn hidden units
            num_layers = 1,# number of stacked RNN layers
            batch_first = True,# input & output put batch size first, e.g. (batch, time_step, input_size)
        )
        self.out = nn.Linear(32,1)
    def forward(self,x,h_state):# the hidden state is carried across calls because the sequence is continuous
        # x: (batch, time_step, input_size)
        # h_state: (n_layers, batch, hidden_size)
        # r_out: (batch, time_step, hidden_size)
        r_out,h_state = self.rnn(x,h_state)# h_state is also an input to the RNN
        outs = []# predictions for every time step
        for time_step in range(r_out.size(1)):# compute the output at each time step
            outs.append(self.out(r_out[:,time_step,:]))
        return torch.stack(outs,dim=1),h_state
rnn = RNN()
print(rnn)
#输出如下
RNN (
(rnn): RNN(1, 32, batch_first=True)
(out): Linear (32 -> 1)
)
class RNN(nn.Module):
    """LSTM classifier over 28x28 images read row by row (28 steps of 28 px).

    Bug fix: the LSTM was assigned to `self.nn` while forward used
    `self.rnn`, raising an AttributeError at call time.
    """
    def __init__(self):
        super(RNN,self).__init__()
        self.rnn = nn.LSTM(
            input_size = 28,# pixels per image row
            hidden_size = 64,# rnn hidden units
            num_layers = 1,# number of stacked RNN layers
            batch_first = True,# input & output put batch size first, e.g. (batch, time_step, input_size)
        )
        self.out = nn.Linear(64,10)# output layer
    def forward(self,x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, hidden_size)
        # h_n shape (n_layers, batch, hidden_size) — LSTM has two hidden states, h_n and the cell state h_c
        # h_c shape (n_layers, batch, hidden_size)
        r_out,(h_n,h_c) = self.rnn(x,None)# None means the initial hidden state is all zeros
        # take the output at the last time step
        # r_out[:,-1,:] here equals h_n
        out = self.out(r_out[:,-1,:])
        return out
rnn = RNN()
print(rnn)
#会输出这样
RNN (
(rnn): LSTM(28, 64, batch_first=True)
(out): Linear (64 -> 10)
)
optimizer = torch.optim.Adam(rnn.parameters(),lr=LR)#optimize all parameters
loss_func = nn.CrossEntropyLoss()# the target label is not one-hotted
#training and testing
# NOTE(review): EPOCH and train_loader must be defined earlier.
for epoch in range(EPOCH):
    for step,(x,b_y)in enumerate(train_loader):# gives batch data
        b_x = x.view(-1,28,28)# reshape x to (batch, time_step, input_size)
        output = rnn(b_x)# rnn output
        loss = loss_func(output,b_y)# cross entropy loss
        optimizer.zero_grad()# clear gradients for this training step
        loss.backward()# backpropagation, compute gradients
        optimizer.step()# apply gradients
G = nn.Sequential(#Generator
nn.Linear(N_IDEAS,128),#random ideas (could from normal distribution)
nn.ReLU(),
nn.Linear(128,ART_COMPONENTS),#making a plainting from these random ideas
)
D = nn.Sequential(#Discriminator
nn.Linear(ART_COMPONENTS,128),#receive art work either from the famous artists or a newbie like G
nn.ReLU(),
nn.Linear(128,1),
nn.Sigmoid(),#tell the probability that the art work is made by artist
)
class Net(nn.Module):
    """MLP with N_HIDDEN fully connected layers, optionally wrapping each
    with BatchNorm, used to compare training with and without BN.

    Bug fix: the constructor parameter was misspelled `bacth_normalization`
    while the code below instantiates `Net(batch_normalization=...)`, which
    raised a TypeError.

    NOTE(review): relies on module-level N_HIDDEN, ACTIVATION, B_INIT and
    `init` (presumably torch.nn.init) being defined elsewhere in the file.
    """
    def __init__(self,batch_normalization=False):
        super(Net,self).__init__()
        self.do_bn = batch_normalization
        self.fcs = []
        self.bns = []
        self.bn_input = nn.BatchNorm1d(1,momentum = 0.5)# BN for the input
        for i in range(N_HIDDEN):# build the layers
            input_size = 1 if i == 0 else 10
            fc = nn.Linear(input_size,10)
            setattr(self,'fc%i' % i,fc)# layers must become attributes of the Module to be registered
            self._set_init(fc)# parameter initialization
            self.fcs.append(fc)
            if self.do_bn:
                bn = nn.BatchNorm1d(10,momentum=0.5)# momentum smooths the running batch mean and stddev
                setattr(self,'bn%i' % i,bn)
                self.bns.append(bn)
        self.predict = nn.Linear(10,1)# output layer
        self._set_init(self.predict)# parameter initialization
    def _set_init(self,layer):# parameter initialization
        init.normal_(layer.weight,mean=0.,std=.1)
        init.constant_(layer.bias,B_INIT)
    def forward(self,x):
        pre_activation = [x]
        if self.do_bn:x=self.bn_input(x)# optionally BN the input
        layer_input = [x]
        for i in range(N_HIDDEN):
            x = self.fcs[i](x)
            pre_activation.append(x)# kept for plotting later
            if self.do_bn:x = self.bns[i](x)# optionally BN before the activation
            x = ACTIVATION(x)
            layer_input.append(x)# kept for plotting later
        out = self.predict(x)
        return out,layer_input,pre_activation
#建立两个net,一个有BN。一个没有BN
nets = [Net(batch_normalization = False),Net(batch_normalization = True)]