背景:之前的网络结构做多标签分类较为复杂,我们需要用resNet进行多标签分类简单的实现相应的分类问题,测试baseline。
目录
一、网络结构及定义
二、optimizer
三、新定义网络
3.1 加载模型
3.2 模型定义
3.3 网络尺寸
3.4 fc层尺寸
四、训练内存占用
五、SENet更改
5.1 引入模型
5.2 SENet定义代码
在原来代码基础上更改,方便运行。
加入resnet101的结构
elif Config.MODEL == 'resnet101':
model = models.resnet101(pretrained=False)
print('load pretrained model...')
model.load_state_dict(torch.load('./resnet101-5d3b4d8f.pth'))
# model params
MODEL = 'resnet101' # options: hgat_conv, hgat_fc, groupnet
BACKBONE = 'resnet101'
GROUPS = 12
报错:
optimizer = torch.optim.SGD(model.parameters(), 'lr':parser.lr,
^
SyntaxError: invalid syntax
之前的optimizer
optimizer = torch.optim.SGD(model.get_config_optim(args.lr, args.lrp),
# lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
def get_config_optim(self, lr, lrp):
return [
{'params': self.features.parameters(), 'lr': lrp},
{'params': self.heads.parameters(), 'lr': lr},
]
加入resnet之后,相应的optimizer不能用之前的了。因为没有了相应的get_config_optim
# Build the optimizer for the selected model.
# torchvision's plain resnet101 has no get_config_optim(), so it gets a single
# uniform parameter group; the project's custom models expose
# get_config_optim(lr, lrp) to train the pretrained backbone with a smaller
# learning rate (lrp) than the freshly initialized head (lr).
if Config.MODEL == 'resnet101':
    param_groups = model.parameters()
else:
    param_groups = model.get_config_optim(args.lr, args.lrp)
optimizer = torch.optim.SGD(param_groups,
                            lr=args.lr,  # default; per-group 'lr' entries override it
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)
继续一层一层调试下去之后,发现最终的resnet的forward输入参数个数与之前的网络不一样
File "/home/xingxiangrui/chun-ML_GCN/engine.py", line 256, in train
self.on_forward(True, model, criterion, data_loader, optimizer)
File "/home/xingxiangrui/chun-ML_GCN/engine.py", line 446, in on_forward
self.state['output'] = model(feature_var, inp_var)
File "/home/xingxiangrui/chun-ML_GCN/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
File "/home/xingxiangrui/chun-ML_GCN/env/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 73, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
。。。
File "/home/xingxiangrui/chun-ML_GCN/env/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
result = self.forward(*input, **kwargs)
TypeError: forward() takes 2 positional arguments but 3 were given
所以,改为采用新网络,重新定义ml_resnet.py函数,然后import进来。
elif Config.MODEL == 'resnet101':
import mymodels.ml_resnet as ml_resnet
model= ml_resnet.ML_RESNET(Config.BACKBONE, groups=Config.GROUPS, nclasses=Config.NCLASSES,
nclasses_per_group=Config.NCLASSES_PER_GROUP,
group_channels=Config.GROUP_CHANNELS, class_channels=Config.CLASS_CHANNELS)
从文件中加载定义,创建新文件
class ML_RESNET(nn.Module):
    """Plain ResNet baseline for multi-label classification.

    Wraps a torchvision ResNet backbone (minus its avgpool/fc) and adds a
    global-max-pool + single Linear head emitting one logit per class,
    intended for BCE-with-logits training by the surrounding engine.
    """

    def __init__(self, backbone, groups, nclasses, nclasses_per_group,
                 group_channels, class_channels):
        # groups / nclasses_per_group / group_channels / class_channels are
        # accepted only for signature compatibility with the project's other
        # models; this baseline's head depends solely on nclasses.
        super(ML_RESNET, self).__init__()  # BUG FIX: was super(HGAT_FC, ...), a copy-paste leftover
        self.groups = groups
        self.nclasses = nclasses
        self.nclasses_per_group = nclasses_per_group
        self.group_channels = group_channels
        self.class_channels = class_channels
        if backbone == 'resnet101':
            model = models.resnet101(pretrained=False)
            print('load pretrained model...')
            model.load_state_dict(torch.load('./resnet101-5d3b4d8f.pth'))
        elif backbone == 'resnet50':
            model = models.resnet50(pretrained=False)
            print('load pretrained model...')
            model.load_state_dict(torch.load('./resnet50-5d3b4d8f.pth'))
        else:
            raise Exception()
        # Backbone: everything up to and including layer4 (drops avgpool/fc).
        self.features = nn.Sequential(
            model.conv1,
            model.bn1,
            model.relu,
            model.maxpool,
            model.layer1,
            model.layer2,
            model.layer3,
            model.layer4, )
        self.gmp = nn.AdaptiveMaxPool2d(1)  # global max pool -> [B, 2048, 1, 1]
        # BUG FIX: the head must emit `nclasses` logits (it emitted
        # class_channels=256 against 80 targets -> BCE size mismatch), and it
        # must be a bare Linear — the BN+ReLU BasicLinear stack hurt
        # convergence with this loss.
        self.fc = nn.Linear(in_features=2048, out_features=nclasses, bias=True)
        self.image_normalization_mean = [0.485, 0.456, 0.406]
        self.image_normalization_std = [0.229, 0.224, 0.225]

    def forward(self, x, inp):
        # `inp` (word-embedding input used by the GCN-style models) is
        # ignored; the parameter is kept so the engine can call every model
        # uniformly as model(feature, inp).
        x = self.features(x)                        # [B, 2048, H, W]
        x = self.gmp(x).view(x.size(0), x.size(1))  # [B, 2048]
        x = self.fc(x)                              # [B, nclasses]
        return x

    def get_config_optim(self, lr, lrp):
        # Pretrained backbone trains with the smaller lrp; the fresh head
        # with lr.  BUG FIX: the head's parameter group was commented out,
        # so fc would never have been trained via this method.
        return [
            {'params': self.features.parameters(), 'lr': lrp},
            {'params': self.fc.parameters(), 'lr': lr},
        ]
定义需要改成自己的定义,ML_RESNET
class ML_RESNET(nn.Module):
def __init__(self, backbone, groups, nclasses, nclasses_per_group, group_channels, class_channels):
super(ML_RESNET, self).__init__()
之前的网络尺寸定义为:
raise Exception()
self.features = nn.Sequential(
model.conv1,
model.bn1,
model.relu,
model.maxpool,
model.layer1,
model.layer2,
model.layer3,
model.layer4, )
self.gmp = nn.AdaptiveMaxPool2d(1)
self.fc=nn.Sequential(utils.BasicLinear(in_channels=2048, out_channels=1024),
utils.BasicLinear(in_channels=1024, out_channels=class_channels), )
def forward(self, x, inp):
x = self.features(x) # [B,2048,H,W]
x=self.gmp(x).view(x.size(0),x.size(1))
x=self.fc(x)
return x
结果产生报错
File "general_train.py", line 182, in
main_coco()
File "general_train.py", line 178, in main_coco
engine.learning(model, criterion, train_dataset, val_dataset, optimizer)
File "/home/xingxiangrui/chun-ML_GCN/engine.py", line 214, in learning
self.train(train_loader, model, criterion, optimizer, epoch)
File "/home/xingxiangrui/chun-ML_GCN/engine.py", line 256, in train
self.on_forward(True, model, criterion, data_loader, optimizer)
File "/home/xingxiangrui/chun-ML_GCN/engine.py", line 451, in on_forward
weight=torch.autograd.Variable(weights.cuda()))
File "/home/xingxiangrui/chun-ML_GCN/env/lib/python3.6/site-packages/torch/nn/functional.py", line 1227, in binary_cross_entropy_with_logits
raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
ValueError: Target size (torch.Size([32, 80])) must be the same as input size (torch.Size([32, 256]))
出现这个错误的原因是:模型输出维度与标签维度不一致,一个为80,一个为256
看报错,是交叉熵的过程中出现的错误
NCLASSES = 80
NCLASSES_PER_GROUP = [1, 8, 5, 10, 5, 10, 7, 10, 6, 6, 5, 7] # FIXME: to check
GROUP_CHANNELS = 512
CLASS_CHANNELS = 256
CLASS_CHANNELS与NCLASSES被混用:fc层输出用了CLASS_CHANNELS(256),而标签数是NCLASSES(80)
定义时进行更改
self.features = nn.Sequential(
model.conv1,
model.bn1,
model.relu,
model.maxpool,
model.layer1,
model.layer2,
model.layer3,
model.layer4, )
self.gmp = nn.AdaptiveMaxPool2d(1)
self.fc=nn.Sequential(utils.BasicLinear(in_channels=2048, out_channels=1024),
utils.BasicLinear(in_channels=1024, out_channels=nclasses), )
将最终输出结果改为nclasses
之前fc层选用错了,不该用BasicLinear
self.fc=nn.Sequential(utils.BasicLinear(in_channels=2048, out_channels=1024),
utils.BasicLinear(in_channels=1024, out_channels=nclasses), )
basiclinear是我们自己定义的。
class BasicLinear(nn.Module):
    """Linear -> BatchNorm1d -> ReLU building block.

    NOTE(review): the ReLU makes this unsuitable as the *final* layer before
    a BCE-with-logits loss (it clamps logits to be non-negative); use a bare
    nn.Linear for the output head.
    """

    def __init__(self, in_channels, out_channels):
        super(BasicLinear, self).__init__()
        # bias is folded into the following BatchNorm, so the Linear omits it.
        self.fc = nn.Linear(in_features=in_channels, out_features=out_channels, bias=False)
        self.bn = nn.BatchNorm1d(num_features=out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        # BUG FIX: forward was missing, making the module unusable inside
        # nn.Sequential.  Order follows the attribute definitions: fc -> bn -> relu.
        return self.relu(self.bn(self.fc(x)))
加了ReLU可能导致无法收敛,我们应当直接加linear层来保证模型收敛。
更改self.fc尺寸
# self.fc=nn.Sequential(utils.BasicLinear(in_channels=2048, out_channels=1024),
# utils.BasicLinear(in_channels=1024, out_channels=nclasses), )
self.fc = nn.Linear(in_features=2048, out_features=nclasses, bias=True)
训练过程依然占据四张显卡。且占用内存基本不变。
若用三张显卡训练,会报错:out of memory
[[email protected] chun-ML_GCN]$ nvidia-smi
Sun May 5 16:38:47 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.81 Driver Version: 384.81 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla P4 On | 00000000:02:00.0 Off | 0 |
| N/A 61C P0 64W / 75W | 5711MiB / 7606MiB | 47% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla P4 On | 00000000:03:00.0 Off | 0 |
| N/A 62C P0 62W / 75W | 5349MiB / 7606MiB | 71% Default |
+-------------------------------+----------------------+----------------------+
| 2 Tesla P4 On | 00000000:82:00.0 Off | 0 |
| N/A 58C P0 65W / 75W | 5375MiB / 7606MiB | 79% Default |
+-------------------------------+----------------------+----------------------+
| 3 Tesla P4 On | 00000000:83:00.0 Off | 0 |
| N/A 59C P0 63W / 75W | 5351MiB / 7606MiB | 38% Default |
+-------------------------------+----------------------+----------------------+
可以用watch -n 1 nvidia-smi指令,查看显卡动态占用。
背景:需要将模型之中加入SENet的内容,因此需要更改骨架。
SENet的PyTorch代码,此代码没有预训练:https://github.com/moskomule/senet.pytorch
加入新的有预训练的代码:
从模型引入相应的class
import models.senet_origin as senet_origin
文件代码地址见:SENet的PyTorch代码:https://github.com/moskomule/senet.pytorch
带有预训练的SENet的PyTorch代码:https://github.com/Xingxiangrui/various_pyTorch_network_structure/blob/master/senet_and_pretrained.py
# Select the SE-ResNet / CBAM backbone.  The pretrained SE-ResNet
# definitions come from senet_origin (senet.pytorch).
if backbone == 'resnet101':
    model = senet_origin.se_resnet101()
elif backbone == 'resnet50':
    model = senet_origin.se_resnet50()
elif backbone == 'resnet101_cbam':
    import mymodels.cbam as cbam
    model = cbam.resnet101_cbam()
elif backbone in ('resnet150', 'resnet152'):
    # BUG FIX (naming): the option was spelled 'resnet150' but it builds
    # se_resnet152; accept the correct spelling while keeping the old one
    # for backward compatibility.
    model = senet_origin.se_resnet152()
else:
    raise Exception()
# Unlike torchvision's ResNet (separate conv1/bn1/relu/maxpool attributes),
# this SE-ResNet implementation wraps the stem as a single `layer0`.
self.features = nn.Sequential(
    model.layer0,
    model.layer1,
    model.layer2,
    model.layer3,
    model.layer4)
https://github.com/Xingxiangrui/various_pyTorch_network_structure/blob/master/senet_and_pretrained.py
直接从这里import即可。