@PyTorch: A summary of the bugs I hit training VGG16 on the CIFAR10 dataset
The source code I downloaded from GitHub trains LeNet on the MNIST dataset, and I figured I would try training CIFAR10 with a VGG16 network instead. What followed was an endless stream of bugs: fix one, hit the next. Life really is one long debugging session. Fortunately, after persistent effort, VGG16 finally trained successfully on CIFAR10. Below I summarize the problems I ran into, both for my own later review and for anyone else studying this.
This problem troubled me for a long time. After consulting various references, the likely causes are the following:
1. The learning rate is set badly. A typical range is 0.01-0.1; if it is too small, learning is slow.
2. The loss function is set badly. Consider the cross-entropy loss (criterion = torch.nn.CrossEntropyLoss().cuda()); a minimal setup sketch for points 1 and 2 follows this list.
3. The network structure is wrong. (I went with VGG16 and simply copied a snippet from the internet, but it did not fit my situation at all. A reminder to everyone: adapt the code to your actual setup.)
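As a minimal sketch of points 1 and 2 (the SGD optimizer with momentum=0.9 is just one common choice, not something prescribed by this post, and VGG here refers to the class defined later below):

import torch
import torch.nn as nn

model = VGG(in_channels=3, num_classes=10).cuda()   # the VGG class shown later in this post
criterion = nn.CrossEntropyLoss().cuda()            # point 2: cross-entropy loss for classification
# point 1: learning rate in the 0.01-0.1 range; momentum=0.9 is simply a common default
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)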
Modifying the network: the convolutional layers at the front extract features in the same way and need no change. The fully connected layers must be changed to match the size of your own input data.
The official input image is 224×224; after the convolutions and poolings, the input to the first fully connected layer has size 512×7×7. With 32×32 CIFAR10 images, after the same convolutions and poolings, the input to the first fully connected layer becomes 512×1×1. So the first fully connected layer below, nn.Linear(512*7*7, 4096), has to become nn.Linear(512*1*1, 4096). The 4096 is the number of hidden units you choose; the official 4096 is an empirical value, and anything not smaller than the number of classes is fine (here num_classes = 10, and I set it to 512). A sketch of this size arithmetic and the adjusted classifier follows.
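To make the size arithmetic concrete: VGG16 contains five 2×2 max-pooling layers, each halving the spatial size, so 224 becomes 7 while 32 becomes 1. Here is a sketch of the adjusted classifier using the hidden width 512 that I chose (any value not smaller than the class count should work):

import torch.nn as nn

# 224 // 2**5 = 7  ->  flatten size 512*7*7 = 25088 (224x224 inputs)
#  32 // 2**5 = 1  ->  flatten size 512*1*1 = 512   (32x32 CIFAR10 inputs)
fc_cifar10 = nn.Sequential(
    nn.Linear(512 * 1 * 1, 512),   # was nn.Linear(512*7*7, 4096)
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, 512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, 10),            # num_classes = 10 for CIFAR10
)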
Here is the official VGG16 code:
import torch.nn as nn

# numbers are conv output channels, 'M' marks a 2x2 max-pooling layer
vgg16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']

class VGG(nn.Module):
    def __init__(self, in_channels=3, num_classes=10):
        super(VGG, self).__init__()
        self.in_channels = in_channels
        self.conv = self._make_layer(vgg16)
        self.fc = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = x.reshape(x.shape[0], -1)   # flatten, keeping the batch dimension
        x = self.fc(x)
        return x

    def _make_layer(self, architecture):
        layers = []
        in_channels = self.in_channels
        for x in architecture:
            if type(x) == int:
                out_channels = x
                layers += [
                    nn.Conv2d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=(3, 3),
                        stride=(1, 1),
                        padding=(1, 1),
                    ),
                    nn.BatchNorm2d(x),
                    nn.ReLU(),
                ]
                in_channels = x
            elif x == "M":
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
        return nn.Sequential(*layers)
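A quick way to confirm what the first Linear layer should expect is to push a dummy batch through the convolutional part of the VGG class above; with 32×32 inputs and the five 'M' pools, the output should be 512×1×1:

import torch

model = VGG().eval()                      # eval mode so BatchNorm uses its running statistics
with torch.no_grad():
    dummy = torch.randn(2, 3, 32, 32)     # a fake CIFAR10-sized batch of 2 images
    print(model.conv(dummy).shape)        # expected: torch.Size([2, 512, 1, 1]) -> flatten size 512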
For example, in the other code I used (below), I set the first fully connected layer to 512 and got the error ValueError: Expected input batch_size (3136) to match target batch_size (64). From what I could find, this is a dimension mismatch. The code and the fix are as follows:
import torch
import torch.nn as nn
import torch.nn.functional as F
from prune import PruningModule, MaskedLinear

class VGG16_torch(PruningModule):
    def __init__(self, mask=False):
        super(VGG16_torch, self).__init__()
        linear = MaskedLinear if mask else nn.Linear
        # conv2d = MaskedConv2d if mask else nn.Conv2d
        self.features = nn.Sequential(
            # conv1
            nn.Conv2d(3, 64, 3, 1, 1),
            nn.BatchNorm2d(64, 0.9),   # note: the second positional argument of BatchNorm2d is eps, not momentum
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64, 0.9),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # conv2
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128, 0.9),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128, 0.9),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # conv3
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.BatchNorm2d(256, 0.9),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256, 0.9),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256, 0.9),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # conv4
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.BatchNorm2d(512, 0.9),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512, 0.9),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # conv5
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512, 0.9),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512, 0.9),
            nn.ReLU(),
            nn.Conv2d(512, 512, 3, 1, 1),
            nn.BatchNorm2d(512, 0.9),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # fc1
        self.fc1 = linear(512, 512)
        self.relu6 = nn.ReLU()
        self.dropout1 = nn.Dropout()
        # fc2
        self.fc2 = linear(512, 512)
        self.relu7 = nn.ReLU()
        self.dropout2 = nn.Dropout()
        # fc3
        self.fc3 = linear(512, 10)

    def forward(self, x):
        x = self.features(x)
        # x = x.view(x.size(0), -1)
        # x = x.view(x.size(x), -1)
        x = x.view(-1, 512)
        x = self.fc1(x)
        x = self.relu6(x)
        x = nn.functional.dropout(x, p=0.5, training=self.training)
        x = self.fc2(x)
        x = self.relu7(x)
        x = nn.functional.dropout(x, p=0.5, training=self.training)
        x = self.fc3(x)
        return x
Add print(x.shape) before and after the line x = x.view(-1, 512) in the code above to see the tensor shape before and after the reshape.
Note: running the network-definition .py file by itself produces no output; you have to run the main program that actually uses this network.
print(x.shape)
x = x.view(-1, 512)
print(x.shape)
My output was:
torch.Size([64, 25088])
torch.Size([3136, 512])
So in my run self.features returned a tensor of shape [64, 512, 7, 7] (25088 = 512×7×7, i.e. the feature maps were 7×7; this is what you get with 224×224 inputs, for example when the data transform resizes the images). Changing self.fc1 = linear(512, 512) to self.fc1 = linear(25088, 512), and x = x.view(-1, 512) to x = x.view(-1, 25088), let the code run successfully.
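To see why the wrong view turned a batch of 64 into 3136, here is a small sanity check using a random tensor with the same shape that self.features returned in my run:

import torch

feats = torch.randn(64, 512, 7, 7)           # same shape as the output of self.features in my run
print(feats.view(-1, 512).shape)             # torch.Size([3136, 512]): the batch dimension is no longer 64
print(feats.view(-1, 512 * 7 * 7).shape)     # torch.Size([64, 25088]): the batch dimension is preserved

With the batch dimension preserved, the 64 flattened feature vectors line up with the 64 labels, which is exactly the mismatch the ValueError complained about.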
Success at last. Not easy, truly not easy. Keep on debugging, everyone!