接到上文
# Show 10 * 3 images results each epoch
if ii % (num_img_tr // 10) == 0:
grid_image = make_grid(inputs[:3].clone().cpu().data, 3, normalize=True)
writer.add_image('Image', grid_image, global_step)
grid_image = make_grid(utils.decode_seg_map_sequence(torch.max(outputs[:3], 1)[1].detach().cpu().numpy()), 3, normalize=False,
range=(0, 255))
writer.add_image('Predicted label', grid_image, global_step)
grid_image = make_grid(utils.decode_seg_map_sequence(torch.squeeze(labels[:3], 1).detach().cpu().numpy()), 3, normalize=False, range=(0, 255))
writer.add_image('Groundtruth label', grid_image, global_step)
这部分待补充。到目前为止需要补充的知识有:模型框架、tensorboardX,以及用writer保存数据的方法。
# Save the model
if (epoch % snapshot) == snapshot - 1: # snapshot = 10
torch.save(net.state_dict(), os.path.join(save_dir, 'models', modelName + '_epoch-' + str(epoch) + '.pth'))
print("Save model at {}\n".format(os.path.join(save_dir, 'models', modelName + '_epoch-' + str(epoch) + '.pth')))
net.load_state_dict(
torch.load(os.path.join(save_dir, 'models', modelName + '_epoch-' + str(resume_epoch - 1) + '.pth'),
map_location=lambda storage, loc: storage)) # Load all tensors onto the CPU
加载模型参数
每10个epoch就保存一次模型参数,这个保存方式有待优化!torch.save(net.state_dict(), os.path.join(save_dir, 'models', modelName + '_epoch-' + str(epoch) + '.pth')) 保存的文件名以.pth结尾。关于net.state_dict()这里有一个小疑问要解答一下:根据前面可知net以及它的参数全部都在GPU上面,而保存的地点是本地路径,为什么不先转移到CPU再保存?原因并不是state_dict()做了处理(它返回的仍然是GPU上的张量),而是torch.save在序列化时会把张量的数据连同它所在的设备信息一起写入文件,所以不需要手动转移。也正因为文件里记录的是GPU张量,加载时才需要像上面那样用map_location把它们映射到CPU。
# One testing epoch
if useTest and epoch % nTestInterval == (nTestInterval - 1): # nTestInterval = 5
total_miou = 0.0
net.eval()
for ii, sample_batched in enumerate(testloader):
inputs, labels = sample_batched['image'], sample_batched['label']
# Forward pass of the mini-batch
inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
if gpu_id >= 0:
inputs, labels = inputs.cuda(), labels.cuda()
with torch.no_grad():
outputs = net.forward(inputs)
predictions = torch.max(outputs, 1)[1]
loss = criterion(outputs, labels, size_average=False, batch_average=True)
running_loss_ts += loss.item()
total_miou += utils.get_iou(predictions, labels)
# Print stuff
if ii % num_img_ts == num_img_ts - 1:
miou = total_miou / (ii * testBatch + inputs.data.shape[0])
running_loss_ts = running_loss_ts / num_img_ts
print('Validation:')
print('[Epoch: %d, numImages: %5d]' % (epoch, ii * testBatch + inputs.data.shape[0]))
writer.add_scalar('data/test_loss_epoch', running_loss_ts, epoch)
writer.add_scalar('data/test_miour', miou, epoch)
print('Loss: %f' % running_loss_ts)
print('MIoU: %f\n' % miou)
running_loss_ts = 0
上面的就是验证集部分了,也包含在了训练的epoch里面.
net.eval() #进行测试
#之前我们已经看到了如下的包含net的函数
net.load_state_dict(
torch.load(os.path.join(save_dir, 'models', modelName + '_epoch-' + str(resume_epoch - 1) + '.pth'),
map_location=lambda storage, loc: storage))
net.cuda()
optimizer = optim.SGD(net.parameters(), lr=p['lr'], momentum=p['momentum'], weight_decay=p['wd'])
net.train()
net.forward(inputs)
torch.save(net.state_dict(), os.path.join(save_dir, 'models', modelName + '_epoch-' + str(epoch) + '.pth'))
测试的时候要调用net.eval(),这和训练的时候要调用net.train()是同样的道理:两者用来切换Dropout、BatchNorm等层在训练和测试时的不同行为。
# Forward pass of the mini-batch
inputs, labels = Variable(inputs, requires_grad=True), Variable(labels)
if gpu_id >= 0:
inputs, labels = inputs.cuda(), labels.cuda()
with torch.no_grad():
outputs = net.forward(inputs)
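在这里因为我们不需要求梯度了,所以使用的是torch.no_grad()。当然这里也可以修改为以下代码(Variable的requires_grad默认是False)。不过要注意两种写法并不完全等价:网络参数本身仍然是requires_grad=True,去掉torch.no_grad()之后前向传播依然会构建计算图、多占用显存,所以验证时更推荐保留torch.no_grad()。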
# Forward pass of the mini-batch
inputs, labels = Variable(inputs, requires_grad=False), Variable(labels)
# or inputs, labels = Variable(inputs), Variable(labels)
if gpu_id >= 0:
inputs, labels = inputs.cuda(), labels.cuda()
outputs = net.forward(inputs)
predictions = torch.max(outputs, 1)[1]
torch.max函数功能参考:https://blog.csdn.net/Z_lbj/article/details/79766690
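predictions的结构和数值现在还没有弄清楚,要先弄明白网络的输出之后才能知道。不过从torch.max的定义可以确定:torch.max(outputs, 1)[1]取的是outputs在第1维上最大值所在的下标;结合后面get_iou里的pred_tmp == j可以看出,它被当作每个位置预测的类别编号来使用。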
接下来是计算miou:
total_miou += utils.get_iou(predictions, labels)
def get_iou(pred, gt, n_classes=21):
    """Return the sum of per-image mean IoU over a batch.

    For every image, the IoU of each class that actually occurs in the
    ground truth is computed as ``intersection / union`` and averaged into
    a per-image mIoU. The per-image values are summed, not averaged, so the
    caller is expected to divide by the number of images.

    Args:
        pred: Batch of predicted class indices; indexed along its first
            dimension, one entry per image.
        gt: Batch of ground-truth labels, paired with ``pred`` image by
            image. Must be broadcast-compatible with ``pred`` per image.
        n_classes: Number of classes; labels outside ``[0, n_classes)``
            are never scored.

    Returns:
        float: Sum of the per-image mIoU values (``0.0`` for an empty batch).
    """
    total_miou = 0.0
    for pred_tmp, gt_tmp in zip(pred, gt):
        # Classes absent from the ground truth are skipped so that a class
        # that only shows up in the prediction does not add a 0 to the mean.
        present = set(torch.unique(gt_tmp).tolist())

        iou = []
        for k in range(n_classes):
            if k not in present:
                continue
            pred_mask = pred_tmp == k
            gt_mask = gt_tmp == k
            # Use logical and/or instead of adding the masks: comparison
            # results are bool tensors on current PyTorch, and bool + bool
            # never equals 2, which silently zeroed every intersection.
            intersect = torch.sum(pred_mask & gt_mask).item()
            union = torch.sum(pred_mask | gt_mask).item()
            # union >= 1 here because class k occurs in gt_tmp.
            iou.append(intersect / union)

        # An image whose labels are all outside [0, n_classes) (e.g. only an
        # ignore value) has nothing to score; it contributes 0 instead of
        # raising ZeroDivisionError.
        if iou:
            total_miou += sum(iou) / len(iou)
    return total_miou
utils.get_iou(predictions, labels)是计算一个batch的miou的函数(返回的是这个batch里每张图片miou的总和)。intersect保存的是每一类中预测和标签一致的像素数量(交集);union保存的是每一类中被预测为该类或者标签为该类的像素总数(并集),所以其中也包含了被错误预测成该类的那些像素。unique_label = np.unique(gt_tmp.data.cpu().numpy())得到的是标签图中实际出现的类别编号。
if k not in unique_label:
continue
上面这句跳过了标签图中没有出现的类别:如果某一类只出现在预测结果里(例如把背景误判成了某种物体),它不会作为单独的一类计入平均,也就是说把这种“伪标签”去掉了。所以iou列表的长度不一定是21,大部分情况下小于21。并且在这里计算出来的每一类iou=交集像素数/并集像素数,即intersect[k] / union[k],而不是除以真实标签的像素数。
这个就是将所有的batch产生的miou加起来得到total_miou:
total_miou += utils.get_iou(predictions, labels)
# 总共的miou值
跑完全部验证集图片的时候:
# Print stuff
if ii % num_img_ts == num_img_ts - 1:
miou = total_miou / (ii * testBatch + inputs.data.shape[0])
running_loss_ts = running_loss_ts / num_img_ts
print('Validation:')
print('[Epoch: %d, numImages: %5d]' % (epoch, ii * testBatch + inputs.data.shape[0]))
writer.add_scalar('data/test_loss_epoch', running_loss_ts, epoch)
writer.add_scalar('data/test_miour', miou, epoch)
print('Loss: %f' % running_loss_ts)
print('MIoU: %f\n' % miou)
running_loss_ts = 0
这里最后一个batch时ii=241,testBatch=6。验证集总张数是1449,也就是说最后一个batch只有3张图片,此时inputs.data.shape[0]=3,所以ii * testBatch + inputs.data.shape[0]=241*6+3=1449,正好等于总张数(只有最后一个batch也是满的时候才会是242*6=1452)。
那么这里得到的miou就是平均每一张图片的miou值;running_loss_ts除以的是num_img_ts(从ii % num_img_ts == num_img_ts - 1的判断来看,它是batch的个数),所以得到的是平均每个batch的损失。
最终全部的epoch跑完之后就要关闭
writer.close()