import json
import gzip
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
import random
import torch.utils.data as data
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.init import constant_, normal_, uniform_
import time
from torchsummary import summary
from thop import profile
import torch.optim as opt
from torchvision.models import resnet18
class ResBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, use_residual=True):
        super(ResBlock, self).__init__()
        self.stride = stride
        self.use_residual = use_residual
        # First 3x3 convolution; the number of output channels and the stride are configurable
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, padding=1, stride=self.stride, bias=False)
        # Second 3x3 convolution with stride 1; it keeps the feature-map shape unchanged
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False)
        # If the output of conv2 and the input of this residual block have different shapes, use_1x1conv = True.
        # In that case a 1x1 convolution is applied to the input so that its shape matches the output of conv2.
        if in_channels != out_channels or stride != 1:
            self.use_1x1conv = True
        else:
            self.use_1x1conv = False
        # When the input and output channel counts of the layers wrapped by the residual unit differ,
        # a 1x1 convolution adjusts the channels before the element-wise addition
        if self.use_1x1conv:
            self.shortcut = nn.Conv2d(in_channels, out_channels, 1, stride=self.stride, bias=False)
        # Each convolution is followed by a batch normalization layer (batch normalization is covered in detail in Section 7.5.1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        if self.use_1x1conv:
            self.bn3 = nn.BatchNorm2d(out_channels)

    def forward(self, inputs):
        y = F.relu(self.bn1(self.conv1(inputs)))
        y = self.bn2(self.conv2(y))
        if self.use_residual:
            if self.use_1x1conv:  # apply the 1x1 convolution to inputs so its shape matches the output y of conv2
                shortcut = self.shortcut(inputs)
                shortcut = self.bn3(shortcut)
            else:  # otherwise add inputs and the output y of conv2 directly
                shortcut = inputs
            y = torch.add(shortcut, y)
        out = F.relu(y)
        return out
Next, wrap the network structure into reusable modules:
def make_first_module(in_channels):
    # Module 1: 7x7 convolution, batch normalization, pooling
    m1 = nn.Sequential(nn.Conv2d(in_channels, 64, 7, stride=2, padding=3),
                       nn.BatchNorm2d(64), nn.ReLU(),
                       nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
    return m1

# Define modules 2 to 5
def resnet_module(input_channels, out_channels, num_res_blocks, stride=1, use_residual=True):
    blk = []
    # Generate num_res_blocks residual units in a loop
    for i in range(num_res_blocks):
        if i == 0:  # the first residual unit of the module
            blk.append(ResBlock(input_channels, out_channels,
                                stride=stride, use_residual=use_residual))
        else:  # the remaining residual units of the module
            blk.append(ResBlock(out_channels, out_channels, use_residual=use_residual))
    return blk

# Wrap modules 2 to 5
def make_modules(use_residual):
    # Module 2: two residual units, 64 input channels, 64 output channels, stride 1; feature-map size unchanged
    m2 = nn.Sequential(*resnet_module(64, 64, 2, stride=1, use_residual=use_residual))
    # Module 3: two residual units, 64 input channels, 128 output channels, stride 2; feature-map size halved
    m3 = nn.Sequential(*resnet_module(64, 128, 2, stride=2, use_residual=use_residual))
    # Module 4: two residual units, 128 input channels, 256 output channels, stride 2; feature-map size halved
    m4 = nn.Sequential(*resnet_module(128, 256, 2, stride=2, use_residual=use_residual))
    # Module 5: two residual units, 256 input channels, 512 output channels, stride 2; feature-map size halved
    m5 = nn.Sequential(*resnet_module(256, 512, 2, stride=2, use_residual=use_residual))
    return m2, m3, m4, m5
# Define the complete network
class Model_ResNet18(nn.Module):
    def __init__(self, in_channels=3, num_classes=10, use_residual=True):
        super(Model_ResNet18, self).__init__()
        m1 = make_first_module(in_channels)
        m2, m3, m4, m5 = make_modules(use_residual)
        # Wrap modules 1 to 6
        self.net = nn.Sequential(m1, m2, m3, m4, m5,
                                 # Module 6: pooling layer and fully connected layer
                                 nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(512, num_classes))

    def forward(self, x):
        return self.net(x)
As you can see, the residual structure starts with a plain convolutional stem, after which residual blocks are simply stacked. Why does each residual block contain two convolutions? More layers would also work, but with fewer than two the block would barely add any non-linearity.
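A quick sanity check of the ResBlock above (a minimal sketch; the batch size, channel counts and spatial size are arbitrary): a stride-2 block that changes the channel count has to take the 1x1-convolution shortcut, and with use_residual=False the same layers run but the addition is skipped.
import torch

block = ResBlock(in_channels=64, out_channels=128, stride=2, use_residual=True)
x = torch.randn(4, 64, 8, 8)           # a batch of 4 feature maps
print(block(x).shape)                   # expected: torch.Size([4, 128, 4, 4])

plain = ResBlock(in_channels=64, out_channels=128, stride=2, use_residual=False)
print(plain(x).shape)                   # same shape, but no shortcut addition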
Inspect the network structure:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # PyTorch v0.4.0
model = Model_ResNet18(in_channels=1, num_classes=10, use_residual=True).to(device)
summary(model, (1, 32, 32))
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 64, 16, 16] 3,200
BatchNorm2d-2 [-1, 64, 16, 16] 128
ReLU-3 [-1, 64, 16, 16] 0
MaxPool2d-4 [-1, 64, 8, 8] 0
Conv2d-5 [-1, 64, 8, 8] 36,864
BatchNorm2d-6 [-1, 64, 8, 8] 128
Conv2d-7 [-1, 64, 8, 8] 36,864
BatchNorm2d-8 [-1, 64, 8, 8] 128
ResBlock-9 [-1, 64, 8, 8] 0
Conv2d-10 [-1, 64, 8, 8] 36,864
BatchNorm2d-11 [-1, 64, 8, 8] 128
Conv2d-12 [-1, 64, 8, 8] 36,864
BatchNorm2d-13 [-1, 64, 8, 8] 128
ResBlock-14 [-1, 64, 8, 8] 0
Conv2d-15 [-1, 128, 4, 4] 73,728
BatchNorm2d-16 [-1, 128, 4, 4] 256
Conv2d-17 [-1, 128, 4, 4] 147,456
BatchNorm2d-18 [-1, 128, 4, 4] 256
Conv2d-19 [-1, 128, 4, 4] 8,192
BatchNorm2d-20 [-1, 128, 4, 4] 256
ResBlock-21 [-1, 128, 4, 4] 0
Conv2d-22 [-1, 128, 4, 4] 147,456
BatchNorm2d-23 [-1, 128, 4, 4] 256
Conv2d-24 [-1, 128, 4, 4] 147,456
BatchNorm2d-25 [-1, 128, 4, 4] 256
ResBlock-26 [-1, 128, 4, 4] 0
Conv2d-27 [-1, 256, 2, 2] 294,912
BatchNorm2d-28 [-1, 256, 2, 2] 512
Conv2d-29 [-1, 256, 2, 2] 589,824
BatchNorm2d-30 [-1, 256, 2, 2] 512
Conv2d-31 [-1, 256, 2, 2] 32,768
BatchNorm2d-32 [-1, 256, 2, 2] 512
ResBlock-33 [-1, 256, 2, 2] 0
Conv2d-34 [-1, 256, 2, 2] 589,824
BatchNorm2d-35 [-1, 256, 2, 2] 512
Conv2d-36 [-1, 256, 2, 2] 589,824
BatchNorm2d-37 [-1, 256, 2, 2] 512
ResBlock-38 [-1, 256, 2, 2] 0
Conv2d-39 [-1, 512, 1, 1] 1,179,648
BatchNorm2d-40 [-1, 512, 1, 1] 1,024
Conv2d-41 [-1, 512, 1, 1] 2,359,296
BatchNorm2d-42 [-1, 512, 1, 1] 1,024
Conv2d-43 [-1, 512, 1, 1] 131,072
BatchNorm2d-44 [-1, 512, 1, 1] 1,024
ResBlock-45 [-1, 512, 1, 1] 0
Conv2d-46 [-1, 512, 1, 1] 2,359,296
BatchNorm2d-47 [-1, 512, 1, 1] 1,024
Conv2d-48 [-1, 512, 1, 1] 2,359,296
BatchNorm2d-49 [-1, 512, 1, 1] 1,024
ResBlock-50 [-1, 512, 1, 1] 0
AdaptiveAvgPool2d-51 [-1, 512, 1, 1] 0
Flatten-52 [-1, 512] 0
Linear-53 [-1, 10] 5,130
================================================================
Total params: 11,175,434
Trainable params: 11,175,434
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.05
Params size (MB): 42.63
Estimated Total Size (MB): 43.69
----------------------------------------------------------------
Load the training set:
# Print and inspect how the dataset is split
train_set, dev_set, test_set = json.load(gzip.open('mnist.json.gz'))
train_images, train_labels = train_set[0][:2000], train_set[1][:2000]
dev_images, dev_labels = dev_set[0][:200], dev_set[1][:200]
test_images, test_labels = test_set[0][:200], test_set[1][:200]
train_set, dev_set, test_set = [train_images, train_labels], [dev_images, dev_labels], [test_images, test_labels]
print('Length of train/dev/test set:{}/{}/{}'.format(len(train_set[0]), len(dev_set[0]), len(test_set[0])))

image, label = train_set[0][0], train_set[1][0]
image, label = np.array(image).astype('float32'), int(label)
# The raw image is a row vector of length 784 and needs to be reshaped into a [28, 28] image
image = np.reshape(image, [28, 28])
image = Image.fromarray(image.astype('uint8'), mode='L')
print("The number in the picture is {}".format(label))
plt.figure(figsize=(5, 5))
plt.imshow(image)
plt.savefig('conv-number5.pdf')
plt.show()
# Data preprocessing
transforms = Compose([Resize(32), ToTensor(), Normalize(mean=[1], std=[1])])

class MNIST_dataset(data.Dataset):
    def __init__(self, dataset, transforms, mode='train'):
        self.mode = mode
        self.transforms = transforms
        self.dataset = dataset

    def __getitem__(self, idx):
        # Fetch the image and its label
        image, label = self.dataset[0][idx], self.dataset[1][idx]
        image, label = np.array(image).astype('float32'), int(label)
        image = np.reshape(image, [28, 28])
        image = Image.fromarray(image.astype('uint8'), mode='L')
        image = self.transforms(image)
        return image, label

    def __len__(self):
        return len(self.dataset[0])
# Fix the random seed
random.seed(0)

# Build the MNIST datasets
train_dataset = MNIST_dataset(dataset=train_set, transforms=transforms, mode='train')
test_dataset = MNIST_dataset(dataset=test_set, transforms=transforms, mode='test')
dev_dataset = MNIST_dataset(dataset=dev_set, transforms=transforms, mode='dev')
class RunnerV3(object):
    def __init__(self, model, optimizer, loss_fn, metric, **kwargs):
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.metric = metric  # only used to compute the evaluation metric

        # Record how the evaluation metric changes during training
        self.dev_scores = []

        # Record how the loss changes during training
        self.train_epoch_losses = []  # one loss value per epoch
        self.train_step_losses = []   # one loss value per step
        self.dev_losses = []

        # Record the global best score
        self.best_score = 0

    def train(self, train_loader, dev_loader=None, **kwargs):
        # Switch the model to training mode
        self.model.train()

        # Number of training epochs; defaults to 0 if not provided
        num_epochs = kwargs.get("num_epochs", 0)
        # Logging frequency; defaults to 100 if not provided
        log_steps = kwargs.get("log_steps", 100)
        # Evaluation frequency
        eval_steps = kwargs.get("eval_steps", 0)
        # Model save path; defaults to "best_model.pdparams" if not provided
        save_path = kwargs.get("save_path", "best_model.pdparams")
        custom_print_log = kwargs.get("custom_print_log", None)

        # Total number of training steps
        num_training_steps = num_epochs * len(train_loader)

        if eval_steps:
            if self.metric is None:
                raise RuntimeError('Error: Metric can not be None!')
            if dev_loader is None:
                raise RuntimeError('Error: dev_loader can not be None!')

        # Number of steps executed so far
        global_step = 0

        # Train for num_epochs epochs
        for epoch in range(num_epochs):
            # Accumulate the training-set loss
            total_loss = 0
            for step, data in enumerate(train_loader):
                X, y = data
                # Get the model predictions
                logits = self.model(X)
                loss = self.loss_fn(logits, y)  # reduction is 'mean' by default
                total_loss += loss

                # Save the loss of every step during training
                self.train_step_losses.append((global_step, loss.item()))

                if log_steps and global_step % log_steps == 0:
                    print(
                        f"[Train] epoch: {epoch}/{num_epochs}, step: {global_step}/{num_training_steps}, loss: {loss.item():.5f}")

                # Back-propagate to compute the gradient of every parameter
                loss.backward()

                if custom_print_log:
                    custom_print_log(self)

                # Update the parameters with mini-batch gradient descent
                self.optimizer.step()
                # Reset the gradients to zero
                self.optimizer.zero_grad()

                # Decide whether an evaluation is due
                if eval_steps > 0 and global_step > 0 and \
                        (global_step % eval_steps == 0 or global_step == (num_training_steps - 1)):
                    dev_score, dev_loss = self.evaluate(dev_loader, global_step=global_step)
                    print(f"[Evaluate] dev score: {dev_score:.5f}, dev loss: {dev_loss:.5f}")
                    # Switch the model back to training mode
                    self.model.train()

                    # If the current score is the best so far, save the model
                    if dev_score > self.best_score:
                        self.save_model(save_path)
                        print(
                            f"[Evaluate] best accuracy performence has been updated: {self.best_score:.5f} --> {dev_score:.5f}")
                        self.best_score = dev_score

                global_step += 1

            # Accumulated training loss of the current epoch
            trn_loss = (total_loss / len(train_loader)).item()
            # Save the epoch-level training loss
            self.train_epoch_losses.append(trn_loss)

        print("[Train] Training done!")

    # Evaluation stage: use torch.no_grad() so that gradients are neither computed nor stored
    @torch.no_grad()
    def evaluate(self, dev_loader, **kwargs):
        assert self.metric is not None

        # Switch the model to evaluation mode
        self.model.eval()

        global_step = kwargs.get("global_step", -1)

        # Accumulate the dev-set loss
        total_loss = 0

        # Reset the metric
        self.metric.reset()

        # Iterate over every batch of the dev set
        for batch_id, data in enumerate(dev_loader):
            X, y = data
            # Compute the model output
            logits = self.model(X)
            # Compute the loss
            loss = self.loss_fn(logits, y).item()
            # Accumulate the loss
            total_loss += loss
            # Accumulate the metric
            self.metric.update(logits, y)

        dev_loss = (total_loss / len(dev_loader))
        dev_score = self.metric.accumulate()

        # Record the dev-set loss
        if global_step != -1:
            self.dev_losses.append((global_step, dev_loss))
            self.dev_scores.append(dev_score)
        return dev_score, dev_loss

    # Prediction stage: use torch.no_grad() so that gradients are neither computed nor stored
    @torch.no_grad()
    def predict(self, x, **kwargs):
        # Switch the model to evaluation mode
        self.model.eval()
        # Run a forward pass to obtain the predictions
        logits = self.model(x)
        return logits

    def save_model(self, save_path):
        torch.save(self.model.state_dict(), save_path)

    def load_model(self, model_path):
        state_dict = torch.load(model_path)
        self.model.load_state_dict(state_dict)
Accuracy metric and plotting:
class Accuracy:
    def __init__(self, is_logist=True):
        """
        Input:
            - is_logist: whether `outputs` are logits or already-activated values
        """
        # Number of correctly classified samples
        self.num_correct = 0
        # Total number of samples
        self.num_count = 0
        self.is_logist = is_logist

    def update(self, outputs, labels):
        """
        Input:
            - outputs: predictions, shape=[N, class_num]
            - labels: ground-truth labels, shape=[N, 1]
        """
        # Decide between binary and multi-class: shape[1] == 1 means binary, shape[1] > 1 means multi-class
        if outputs.shape[1] == 1:  # binary classification
            outputs = torch.squeeze(outputs, dim=-1)
            if self.is_logist:
                # For logits, check whether the value is >= 0
                preds = (outputs >= 0).to(torch.float32)
            else:
                # Otherwise check whether each probability is >= 0.5: class 1 if so, class 0 otherwise
                preds = (outputs >= 0.5).to(torch.float32)
        else:
            # For multi-class, take the index of the largest element via torch.argmax as the predicted class
            preds = torch.argmax(outputs, dim=1)

        # Count the correctly predicted samples in this batch
        labels = torch.squeeze(labels, dim=-1)
        batch_correct = torch.sum(preds == labels).clone().detach().numpy()
        batch_count = len(labels)

        # Update num_correct and num_count
        self.num_correct += batch_correct
        self.num_count += batch_count

    def accumulate(self):
        # Compute the overall metric from the accumulated counts
        if self.num_count == 0:
            return 0
        return self.num_correct / self.num_count

    def reset(self):
        # Reset the correct count and the total count
        self.num_correct = 0
        self.num_count = 0

    def name(self):
        return "Accuracy"
def plot(runner, fig_name):
    plt.figure(figsize=(10, 5))

    plt.subplot(1, 2, 1)
    train_items = runner.train_step_losses[::30]
    train_steps = [x[0] for x in train_items]
    train_losses = [x[1] for x in train_items]
    plt.plot(train_steps, train_losses, color='#8E004D', label="Train loss")
    if runner.dev_losses[0][0] != -1:
        dev_steps = [x[0] for x in runner.dev_losses]
        dev_losses = [x[1] for x in runner.dev_losses]
        plt.plot(dev_steps, dev_losses, color='#E20079', linestyle='--', label="Dev loss")
    # Axis labels and legend
    plt.ylabel("loss", fontsize='x-large')
    plt.xlabel("step", fontsize='x-large')
    plt.legend(loc='upper right', fontsize='x-large')

    plt.subplot(1, 2, 2)
    # Plot the dev-accuracy curve
    if runner.dev_losses[0][0] != -1:
        plt.plot(dev_steps, runner.dev_scores,
                 color='#E20079', linestyle="--", label="Dev accuracy")
    else:
        plt.plot(list(range(len(runner.dev_scores))), runner.dev_scores,
                 color='#E20079', linestyle="--", label="Dev accuracy")
    # Axis labels and legend
    plt.ylabel("score", fontsize='x-large')
    plt.xlabel("step", fontsize='x-large')
    plt.legend(loc='lower right', fontsize='x-large')

    plt.savefig(fig_name)
    plt.show()
Training:
# Learning rate
lr = 0.1
# Batch size
batch_size = 64
# Build the data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
# Define the network: ResNet18 built from the custom operators above,
# first without residual connections, as a baseline for the comparison below
model = Model_ResNet18(in_channels=1, num_classes=10, use_residual=False)
# Define the optimizer
optimizer = torch.optim.SGD(lr=lr, params=model.parameters())
# Define the loss function
loss_fn = F.cross_entropy
# Define the evaluation metric
metric = Accuracy(is_logist=True)
# Instantiate RunnerV3
runner = RunnerV3(model, optimizer, loss_fn, metric)
# Start training
log_steps = 15
eval_steps = 15
runner.train(train_loader, dev_loader, num_epochs=5, log_steps=log_steps,
             eval_steps=eval_steps, save_path="best_model.pdparams")
# Visualize how the training and dev losses evolve
plot(runner, 'cnn-loss2.pdf')
[Train] epoch: 0/5, step: 0/160, loss: 2.42276
[Train] epoch: 0/5, step: 15/160, loss: 1.15423
[Evaluate] dev score: 0.41000, dev loss: 1.79679
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.41000
[Train] epoch: 0/5, step: 30/160, loss: 0.86913
[Evaluate] dev score: 0.51500, dev loss: 1.48232
[Evaluate] best accuracy performence has been updated: 0.41000 --> 0.51500
[Train] epoch: 1/5, step: 45/160, loss: 0.18888
[Evaluate] dev score: 0.86500, dev loss: 0.42591
[Evaluate] best accuracy performence has been updated: 0.51500 --> 0.86500
[Train] epoch: 1/5, step: 60/160, loss: 0.31176
[Evaluate] dev score: 0.77500, dev loss: 0.72687
[Train] epoch: 2/5, step: 75/160, loss: 0.60112
[Evaluate] dev score: 0.79000, dev loss: 0.72759
[Train] epoch: 2/5, step: 90/160, loss: 0.24280
[Evaluate] dev score: 0.88000, dev loss: 0.35190
[Evaluate] best accuracy performence has been updated: 0.86500 --> 0.88000
[Train] epoch: 3/5, step: 105/160, loss: 0.39712
[Evaluate] dev score: 0.74500, dev loss: 0.70728
[Train] epoch: 3/5, step: 120/160, loss: 0.25634
[Evaluate] dev score: 0.93000, dev loss: 0.21397
[Evaluate] best accuracy performence has been updated: 0.88000 --> 0.93000
[Train] epoch: 4/5, step: 135/160, loss: 0.18822
[Evaluate] dev score: 0.95000, dev loss: 0.16862
[Evaluate] best accuracy performence has been updated: 0.93000 --> 0.95000
[Train] epoch: 4/5, step: 150/160, loss: 0.05377
[Evaluate] dev score: 0.95000, dev loss: 0.14262
[Evaluate] dev score: 0.76000, dev loss: 0.74673
[Train] Training done!
# Load the best model
runner.load_model('best_model.pdparams')
# Evaluate on the test set
score, loss = runner.evaluate(test_loader)
print("[Test] accuracy/loss: {:.4f}/{:.4f}".format(score, loss))
[Test] accuracy/loss: 0.9300/0.1816
Now enable the residual connections and compare:
model = Model_ResNet18(in_channels=1, num_classes=10, use_residual=True)
[Train] epoch: 0/5, step: 0/160, loss: 2.44479
[Train] epoch: 0/5, step: 15/160, loss: 0.59010
[Evaluate] dev score: 0.10000, dev loss: 2.33080
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.10000
[Train] epoch: 0/5, step: 30/160, loss: 0.31967
[Evaluate] dev score: 0.31500, dev loss: 1.89870
[Evaluate] best accuracy performence has been updated: 0.10000 --> 0.31500
[Train] epoch: 1/5, step: 45/160, loss: 0.14814
[Evaluate] dev score: 0.91000, dev loss: 0.38122
[Evaluate] best accuracy performence has been updated: 0.31500 --> 0.91000
[Train] epoch: 1/5, step: 60/160, loss: 0.04555
[Evaluate] dev score: 0.92500, dev loss: 0.21402
[Evaluate] best accuracy performence has been updated: 0.91000 --> 0.92500
[Train] epoch: 2/5, step: 75/160, loss: 0.09145
[Evaluate] dev score: 0.95000, dev loss: 0.13580
[Evaluate] best accuracy performence has been updated: 0.92500 --> 0.95000
[Train] epoch: 2/5, step: 90/160, loss: 0.13341
[Evaluate] dev score: 0.92500, dev loss: 0.18148
[Train] epoch: 3/5, step: 105/160, loss: 0.04355
[Evaluate] dev score: 0.95500, dev loss: 0.13861
[Evaluate] best accuracy performence has been updated: 0.95000 --> 0.95500
[Train] epoch: 3/5, step: 120/160, loss: 0.03326
[Evaluate] dev score: 0.97500, dev loss: 0.09213
[Evaluate] best accuracy performence has been updated: 0.95500 --> 0.97500
[Train] epoch: 4/5, step: 135/160, loss: 0.02350
[Evaluate] dev score: 0.96500, dev loss: 0.10141
[Train] epoch: 4/5, step: 150/160, loss: 0.00776
[Evaluate] dev score: 0.96000, dev loss: 0.09467
[Evaluate] dev score: 0.95000, dev loss: 0.11534
[Train] Training done!
[Test] accuracy/loss: 0.9850/0.0635
Testing that the two forward passes agree:
from torchvision.models import resnet18

# torchvision's reference resnet18 with pretrained ImageNet weights
hapi_model = resnet18(pretrained=True)
# The custom resnet18 model defined above
model = Model_ResNet18(in_channels=3, num_classes=1000, use_residual=True)
# Get the weights of the torchvision network
params = hapi_model.state_dict()
# Dictionary for the weights after remapping the parameter names
new_params = {}
# Map the parameter names
for key in params:
    if 'layer' in key:
        if 'downsample.0' in key:
            # the 1x1 downsample convolution maps to the shortcut convolution
            new_params['net.' + key[5:8] + '.shortcut' + key[-7:]] = params[key]
        elif 'downsample.1' in key:
            # the batch norm that follows the downsample convolution maps to bn3
            new_params['net.' + key[5:8] + '.bn3' + key[21:]] = params[key]
        else:
            new_params['net.' + key[5:]] = params[key]
    elif 'conv1.weight' == key:
        new_params['net.0.0.weight'] = params[key]
    elif 'bn1' in key:
        new_params['net.0.1' + key[3:]] = params[key]
    elif 'fc' in key:
        new_params['net.7' + key[2:]] = params[key]
# torchvision's stem convolution has no bias, while the custom one does;
# a zero bias keeps the two stems numerically identical
new_params['net.0.0.bias'] = torch.zeros_like(model.state_dict()['net.0.0.bias'])
# Load the remapped weights into the custom model
model.load_state_dict(new_params)
inputs = np.random.randn(*[3, 3, 32, 32])
inputs = inputs.astype('float32')
x = torch.tensor(inputs)
# Put both models in eval mode so batch norm uses its running statistics
model.eval()
hapi_model.eval()
output = model(x)
hapi_out = hapi_model(x)
# Difference between the outputs of the two models
diff = output - hapi_out
# Largest absolute difference
max_diff = torch.max(torch.abs(diff))
print(max_diff)
tensor(0., grad_fn=)
On why ResNet works so well:
There are several explanations for why ResNet works. If you regard the shortcut as a simple network in its own right, then ResNet is the result of repeatedly stacking an ensemble-learning module built from two weak classifiers, so it is stronger than a single classifier; meanwhile the identity mapping keeps passing the gradient down to the bottom layers, so the network can be made very deep without worrying about gradient problems.
My own take: the identity mapping acts like a kind of memory, since each block's input is simply the previous block's result. The more I look at it the more it resembles an RNN, except that an RNN does not keep the result of every loop iteration, whereas ResNet keeps it and combines it with a plain addition (an ensemble squashed together). That suggests the exact interaction matters less than the gradient that gets propagated, so one could experiment with whether ResNet is somehow sensitive to sequences (a hole I'll dig now and fill whenever I remember); fitting a sequence would then amount to repeatedly reusing the same parameter space to grow a tree. RNNs, though, truncate back-propagation and use BPTT, and a plain RNN has no identity mapping, although LSTMs seem to have something similar in the forget gate. The key property is that the mapping itself stores no parameters: its result is a simple function of the lower layer and the current layer (which is why the property feels useful). During back-propagation the output layer takes partial derivatives with respect to each layer's variables, and because the combination is additive rather than multiplicative, those derivatives do not run into the same gradient problems. The addition effectively gives every residual block a direct path through which the error can back-propagate into it, which seems mathematically sound. (At this point I also feel the usual ResNet diagrams, like many neural-network diagrams, are quite misleading; why not draw them according to their mathematical structure?)
The magic of addition:
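The formula referred to below seems to have been an image that did not survive; presumably it is the standard unrolling of a chain of residual blocks, in the spirit of He et al.'s identity-mapping analysis, roughly:

$$y_{l+1} = y_l + net_{l+1}(y_l) \;\Rightarrow\; y_L = y_l + \sum_{i=l}^{L-1} net_{i+1}(y_i), \qquad \frac{\partial \mathcal{L}}{\partial y_l} = \frac{\partial \mathcal{L}}{\partial y_L}\left(1 + \frac{\partial}{\partial y_l}\sum_{i=l}^{L-1} net_{i+1}(y_i)\right)$$

The additive $1$ contributed by the identity path means the gradient reaching $y_l$ cannot be wiped out by the residual branches alone.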
With the formula in hand we can see very clearly why the gradient does not vanish: when differentiating with respect to $out$, the terms that nest fewer functions contribute most of the gradient. The gradient flowing through the deeply nested terms may still vanish, but the dominant shallow terms remain. Moreover, the parameters of a given $net$ are only directly related to the layers above it, and how many of those layers matter depends on how many layers it takes for the gradient to die out. So my guess is that a deeper ResNet is not necessarily strictly better in practice.
For example, if the network has a hundred layers and the gradient dies out after about 31 of them, then in theory layer 30 should be influenced directly by the parameters of layer 90; in practice, layer 90 influences layers 59 to 89, and layer 59 in turn influences layer 30. In other words, after many layers the network is missing a skip edge that jumps straight across those 31 layers, and the standard ResNet does not have one (all conjecture; another hole to dig and test someday). One problem is that the depth at which the gradient vanishes is not fixed; it differs with the initialization and the dataset, so building the structure dynamically might address that.
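A rough sketch of the kind of experiment hinted at above: measure how much gradient reaches the first block of a plain deep stack versus a residual stack of the same depth. The depth, width, activation and initialization below are arbitrary choices for illustration, not anything from the text.
import torch
import torch.nn as nn

def gradient_at_first_layer(depth=50, width=32, residual=True, seed=0):
    # Build `depth` small linear+tanh blocks and return the gradient norm
    # that arrives at the very first block's weights.
    torch.manual_seed(seed)
    blocks = nn.ModuleList([nn.Linear(width, width) for _ in range(depth)])
    h = torch.randn(8, width)
    for block in blocks:
        out = torch.tanh(block(h))
        h = h + out if residual else out  # residual sum vs. plain composition
    loss = h.pow(2).mean()
    loss.backward()
    return blocks[0].weight.grad.norm().item()

print("plain    :", gradient_at_first_layer(residual=False))
print("residual :", gradient_at_first_layer(residual=True))
# Expected qualitative outcome: the residual variant delivers a much larger
# gradient to the first block, because the identity path keeps short,
# weakly nested terms alive in the sum of path contributions.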
Paper link: https://arxiv.org/pdf/1605.06431.pdf
Well, it turns out the paper already covers this, so that hole is filled:
This experiment deletes layers at random; I suspect deleting middle layers is not quite the same as deleting outer or inner ones, but the paper does not discuss it.
This one shows the lengths of the paths along which gradient actually propagates, which differs a bit from what I imagined. From path length alone you cannot tell whether it is caused by vanishing gradients; a comparison between the depth at which gradients vanish and the distribution of path lengths would be even more informative.
My only real takeaway is that designing a network is much harder than trying one out. If you were asked to design a network in which most effective path lengths stay bounded, would you have come up with this? The structure of a network determines its performance, but that is a macroscopic, hard-to-predict phenomenon that is difficult to foresee at design time; mostly we find what works and then try to explain it.
There is also the "ensemble" framing: the components of this ensemble share parameters, and that sharing is what lets one set of weights represent all of the different paths. If I trained with only a few of those paths, the resulting parameters would surely differ, so would such a network generalize better than the original one, or be more "picky"? How would the performance compare? In other words, where exactly do parameter sharing and ensemble learning differ? For a three-block network the unraveled paths include:
$$x_1 \\ net_1-net_2 \\ net_1-net_2-net_3 \\ net_2-net_3 \\ net_1-net_3$$
So there is a curious consequence: $net_1$ can be used in combination with $net_2$, with $net_2-net_3$, and with $net_3$, but the accuracy of each combination differs, because back-propagation affects its parameters differently; the more layers a path passes through, the smaller its influence on the weights (not always, but roughly: the more derivatives are chained, the smaller the values). (I don't have time to run this experiment either, so another hole to dig. Still, the data in the paper show that the error rate does not simply shoot up as different layers are combined, which suggests this kind of network structure has a certain stability. Pure speculation.)
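A toy sketch of the lesioning idea from the paper, under made-up sizes: three residual branches share their parameters, and any subset of them can be kept while the others are reduced to the identity, so every subset is a valid sub-network built from the same shared weights.
import itertools
import torch
import torch.nn as nn

torch.manual_seed(0)
width = 8
# Three shared residual branches net_1, net_2, net_3
nets = nn.ModuleList([nn.Sequential(nn.Linear(width, width), nn.ReLU()) for _ in range(3)])

def lesioned_forward(x, active):
    # Apply only the branches in `active`; the lesioned blocks pass the signal
    # through their identity path unchanged. All subsets reuse the same parameters.
    y = x
    for i, net in enumerate(nets):
        y = y + net(y) if i in active else y
    return y

x = torch.randn(1, width)
for k in range(4):
    for active in itertools.combinations(range(3), k):
        print(active, lesioned_forward(x, set(active)).norm().item())
# The full network (all three branches active) is not literally the sum of the
# smaller sub-networks, since each branch is non-linear and sees the accumulated
# signal, but every subset still computes something sensible -- the sense in
# which ResNet behaves like an ensemble of shallower networks.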
This structure looks a lot like a network generated by automatic recursion.
In the paper Residual Networks Behave Like Ensembles of Relatively Shallow Networks there is a figure that shows exactly this relationship.
Only after drawing it myself did I notice that the paper already derives the formula, much more cleanly than my version.
The conclusion of the paper also summarizes another property of ResNet: these paths do not depend strongly on one another.
7 Conclusion
What is the reason behind residual networks’ increased performance? In the most recent iteration of
residual networks, He et al. [6] provide one hypothesis: “We obtain these results via a simple but
essential concept—going deeper.” While it is true that they are deeper than previous approaches, we
present a complementary explanation. First, our unraveled view reveals that residual networks can be
viewed as a collection of many paths, instead of a single ultra deep network. Second, we perform
lesion studies to show that, although these paths are trained jointly, they do not strongly depend
on each other. Moreover, they exhibit ensemble-like behavior in the sense that their performance
smoothly correlates with the number of valid paths. Finally, we show that the paths through the
network that contribute gradient during training are shorter than expected. In fact, deep paths are
not required during training as they do not contribute any gradient. Thus, residual networks do not
resolve the vanishing gradient problem by preserving gradient flow throughout the entire depth of
the network. This insight reveals that depth is still an open research question. These promising
observations provide a new lens through which to examine neural networks.
Seeing this combination process, something looks odd: why isn't the combined formula simply
$$net_3(net_2(net_1(x)))+net_2(net_1(x))+net_1(x)+x$$
or
$$net_3(net_2(net_1(x)+b_1)+b_2)+net_2(net_1(x)+b_1)+net_1(x)+x$$
which would be more concise?
Well, the list above only contains a few of the main paths among all the combinations; I was thinking of too few <_<
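For reference, unrolling three residual blocks directly (ignoring the 1x1 shortcuts and any bias terms) gives the following; it contains the terms above plus the cross terms that appear because each block is applied to the whole accumulated signal rather than only to the previous block's output:

$$y_1 = x + net_1(x)$$
$$y_2 = y_1 + net_2(y_1) = x + net_1(x) + net_2\big(x + net_1(x)\big)$$
$$y_3 = y_2 + net_3(y_2) = x + net_1(x) + net_2\big(x + net_1(x)\big) + net_3\big(x + net_1(x) + net_2(x + net_1(x))\big)$$

Only when the blocks are linear does this expand exactly into the $2^3$ separate paths of the unraveled view.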
The paper also mentions a new kind of network inspired by this notion of depth:
Recently, an alternative training procedure for residual networks has been proposed, referred to as stochastic depth [9]
When I have time I should read it; they claim that even a thousand layers trains fine. Another hole to dig and come back to later.
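Since stochastic depth is essentially "randomly skip a residual branch during training, and scale it by its survival probability at test time", a minimal sketch on top of the ResBlock defined above might look like this. The survival probability and the exact placement of the scaling are my reading of Huang et al. [9], so treat the details as an approximation rather than the reference implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F

class StochasticDepthResBlock(ResBlock):
    """ResBlock whose residual branch is randomly dropped during training."""
    def __init__(self, in_channels, out_channels, stride=1, survival_prob=0.8):
        super().__init__(in_channels, out_channels, stride=stride, use_residual=True)
        self.survival_prob = survival_prob

    def forward(self, inputs):
        shortcut = inputs
        if self.use_1x1conv:
            shortcut = self.bn3(self.shortcut(inputs))
        if self.training and torch.rand(1).item() > self.survival_prob:
            # Branch dropped for this mini-batch: the block reduces to its shortcut
            return F.relu(shortcut)
        y = F.relu(self.bn1(self.conv1(inputs)))
        y = self.bn2(self.conv2(y))
        if not self.training:
            # At test time the branch is always active, scaled by its survival probability
            y = y * self.survival_prob
        return F.relu(shortcut + y)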