主要贡献:
如relu函数,低维度的特征通过relu后,会有一部分被毁掉,因为维度越低分布到relu激活带的可能性就越小。那么在反向传播的时候就会出现梯度消失,那么神经元的权重就无法更新,导致特征退化。那么理想解决办法就是对冗余数据使用relu,对不含冗余信息的使用线性激活。
对现有网络A增加恒等映射为更深的B,A和B的效果应该是一样的,所以可以证明增加深度不会导致网络的效果更差。
残差块
block存在两种形式,左边适用于较浅的网络,用在ResNet34中,右图用在网络较深的时候,在ResNet50/101/152中,目的是通过1*1卷积改变维度进而降低参数量。
网络结构图中存在部分虚线,虚线是因为feature map数量发生了变化。在ShortCut connection中加1*1的卷积使维度统一。
import torch
from torch import nn
from torch.nn import functional as F
# 残差模块
class ResidualBlock(nn.Module):
    """Basic ResNet residual block: two 3x3 conv-BN layers plus a shortcut.

    The main (left) branch is conv-BN-ReLU-conv-BN; the final ReLU is applied
    after adding the shortcut. `shortcut` is an optional module (typically a
    1x1 conv + BN) used when the feature-map dimensions change; when it is
    None the input is added through unchanged (identity shortcut).
    """

    def __init__(self, in_ch, out_ch, stride=1, shortcut=None):
        super(ResidualBlock, self).__init__()
        # Main branch; bias=False because each conv is followed by BatchNorm.
        self.left = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
        )
        self.right = shortcut

    def forward(self, x):
        # Solid line in the paper's diagram: identity shortcut.
        # Dashed line: projection shortcut that matches channels/resolution.
        identity = x if self.right is None else self.right(x)
        out = self.left(x)
        out = out + identity
        return F.relu(out)
def _make_layer(in_ch, out_ch, block_num, stride=1):
    """Stack `block_num` ResidualBlocks into one ResNet stage.

    Only the first block may change resolution (stride) or channel count;
    the remaining blocks keep out_ch -> out_ch with stride 1.

    Bug fix: the original always built a 1x1 projection shortcut, even when
    in_ch == out_ch and stride == 1 (e.g. layer1 of ResNet-34). Per the ResNet
    design — and this file's own comments — the dashed-line projection is only
    needed when the feature-map dimensions actually change; otherwise the
    identity shortcut is used, saving parameters.
    """
    if stride != 1 or in_ch != out_ch:
        # Dashed line in the diagram: 1x1 conv + BN to match dimensions.
        shortcut = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 1, stride, bias=False),
            nn.BatchNorm2d(out_ch),
        )
    else:
        shortcut = None  # solid line: identity shortcut

    # Only the first block needs the projection; the rest are identity blocks.
    layers = [ResidualBlock(in_ch, out_ch, stride, shortcut)]
    for _ in range(1, block_num):
        layers.append(ResidualBlock(out_ch, out_ch))
    return nn.Sequential(*layers)
class ResNet34(nn.Module):
    """ResNet-34 for image classification (default 10 classes, e.g. CIFAR-10).

    Stem: 7x7 conv (stride 2) + max-pool, 224x224x3 -> 56x56x64.
    Body: four stages of 3/4/6/3 residual blocks, downsampling by stride-2
    at the start of stages 2-4. Head: global average pooling + linear layer.
    """

    def __init__(self, num_classes=10):
        super(ResNet34, self).__init__()
        # Stem. (224 + 2*3 - 7)/2 + 1 = 112, then max-pool -> 56x56x64.
        self.pre = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, padding=3, stride=2, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, padding=1, stride=2),
        )
        # Four stages with 3, 4, 6, 3 residual blocks respectively.
        # stride=2 in the first block of stages 2-4 performs the downsampling.
        self.layer1 = _make_layer(64, 64, 3)                # 56x56x64
        self.layer2 = _make_layer(64, 128, 4, stride=2)     # 28x28x128
        self.layer3 = _make_layer(128, 256, 6, stride=2)    # 14x14x256
        self.layer4 = _make_layer(256, 512, 3, stride=2)    # 7x7x512
        self.Conv = nn.Sequential(
            self.layer1,
            self.layer2,
            self.layer3,
            self.layer4,
        )
        # Global average pooling reduces 7x7x512 to 512 features.
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pre(x)
        x = self.Conv(x)
        # Generalization: adaptive pooling gives the same result as
        # avg_pool2d(x, 7) for a 7x7 map (224x224 input) but also accepts
        # other input resolutions instead of hard-coding the kernel size.
        x = F.adaptive_avg_pool2d(x, 1)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
if __name__ == "__main__":
    # Smoke test: push a dummy batch through the network and show the result.
    dummy = torch.rand([2, 3, 224, 224])
    net = ResNet34()
    print(net)
    out = net(dummy)
    print(out)
因为服务器中已存在CIFAR-10数据,所以这部分download设为False;如果需要下载则设置为True。
# Load the CIFAR-10 dataset (download=False: data is already on disk).
train_dataset = datasets.CIFAR10(root='./data', train=True, transform=data_transform, download=False)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

test_dataset = datasets.CIFAR10('./data', train=False, transform=data_transform, download=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
import os
import torch
from torch import nn
from model import ResNet34
from torch import optim
from torchvision import datasets, transforms
import torch.utils.data
learning_rate = 1e-3
num_epoches = 100

# ImageNet-style augmentation and normalization for 224x224 inputs.
data_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Load the CIFAR-10 dataset (download=False: data is already on disk).
train_dataset = datasets.CIFAR10(root='./data', train=True, transform=data_transform, download=False)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_dataset = datasets.CIFAR10('./data', train=False, transform=data_transform, download=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ResNet34().to(device)

# Loss, optimizer, and learning-rate schedule (decay by 10x every 10 epochs).
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
def train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch and print the average loss and accuracy.

    Args:
        dataloader: yields (inputs, labels) batches.
        model: the network to train (moved to `device` by the caller).
        loss_fn: criterion, e.g. nn.CrossEntropyLoss.
        optimizer: optimizer updating model.parameters().
    """
    # Bug fix: restore training mode. val() calls model.eval() and the
    # original never switched back, so every epoch after the first trained
    # with BatchNorm (and any dropout) frozen in eval mode.
    model.train()
    loss, current, n = 0.0, 0.0, 0
    for x, y in dataloader:
        # Forward pass.
        x, y = x.to(device), y.to(device)
        output = model(x)
        cur_loss = loss_fn(output, y)
        _, pred = torch.max(output, dim=1)
        cur_acc = torch.sum(y == pred) / output.shape[0]
        # Backward pass: clear stale gradients, backprop, update weights.
        optimizer.zero_grad()
        cur_loss.backward()
        optimizer.step()
        loss += cur_loss.item()
        current += cur_acc.item()
        n += 1
    train_loss = loss / n
    train_acc = current / n
    # Report per-epoch training loss and accuracy.
    print('train_loss:' + str(train_loss))
    print('train_acc:' + str(train_acc))
def val(dataloader, model, loss_fn):
    """Evaluate the model on `dataloader`; print and return the accuracy."""
    model.eval()
    total_loss, total_acc, batches = 0.0, 0.0, 0
    # no_grad() stops gradient tracking, saving GPU compute and memory;
    # it does NOT change dropout/BatchNorm behaviour (model.eval() does).
    with torch.no_grad():
        for x, y in dataloader:
            # Forward pass only.
            x, y = x.to(device), y.to(device)
            output = model(x)
            total_loss += loss_fn(output, y).item()
            _, pred = torch.max(output, 1)
            total_acc += (torch.sum(y == pred) / output.shape[0]).item()
            batches += 1
    # Report validation loss and accuracy.
    print("val_loss: " + str(total_loss / batches))
    print("val_acc: " + str(total_acc / batches))
    # Return the accuracy so the caller can track the best model.
    return total_acc / batches
# Train/validate for num_epoches epochs, checkpointing the best model so far.
best_acc = 0
for t in range(num_epoches):
    print(f'epoch {t + 1}\n-----------------')
    train(train_dataloader, model, loss_fn, optimizer)
    a = val(test_dataloader, model, loss_fn)
    # Bug fix: advance the learning-rate schedule once per epoch — the
    # StepLR scheduler was created but its step() was never called, so the
    # configured 10x decay every 10 epochs never took effect.
    lr_scheduler.step()
    if a > best_acc:
        folder = 'save_model'
        # makedirs with exist_ok avoids the race between the exists check
        # and the mkdir call, and reuses the folder variable consistently.
        os.makedirs(folder, exist_ok=True)
        best_acc = a
        print('save best model')
        torch.save(model.state_dict(), os.path.join(folder, 'best_model.pth'))
print('Done!')