Background: for our trained model, validate only reports the aggregate validation metric. We need to go down into the code, look at how the model behaves on the dataset, and print the corresponding per-example results.
Contents
1. Load and validate the model
1.1 Load the model
1.2 Change the core statement
1.3 Create a new file
2. Run validate directly
2.1 Modify on top of learning
2.2 validate
3. Prediction results
3.1 The forward pass in on_forward
3.2 Model prediction
3.3 Output
4. Convert the results to numpy
4.1 Converting between torch.FloatTensor and numpy
Wrong approaches (can be skipped)
4.2 The correct way to convert torch.cuda.FloatTensor to numpy
4.3 Write each batch into a dict
4.4 Write all batches to a file
4.5 Files written successfully
Model loading is the same as before:
# model params
MODEL = 'resnet101' # options: hgat_conv, hgat_fc, groupnet
BACKBONE = 'resnet101'
...
RESUME = './checkpoint/coco/resnet101_on_coco/model_best_81.3118.pth.tar'
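For reference, here is a minimal sketch of restoring this checkpoint by hand. It assumes the .pth.tar file is a dict with a 'state_dict' entry, which is an assumption about the file layout (common for checkpoints saved this way) rather than something shown in this post; in practice the engine may already restore it from the RESUME setting.
import torch

# Hypothetical manual restore of the checkpoint referenced by RESUME above.
checkpoint = torch.load(RESUME)
print('checkpoint keys:', list(checkpoint.keys()))   # inspect what the file actually contains
model.load_state_dict(checkpoint['state_dict'])      # assumes a 'state_dict' entry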
The model runs mainly through the engine, so the core statement to change is the engine call:
engine = GCNMultiLabelMAPEngine(state)
# engine.learning(model, criterion, train_dataset, val_dataset, optimizer)
engine.validate_model(model, criterion, train_dataset, val_dataset, optimizer)
Create a new file:
from analyse_result_engine import *
The model is loaded and run in this new file.
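A minimal sketch of what the new driver file could look like; the model/dataset/state construction is elided and assumed to be copied from the original training script, and the identifiers below simply mirror the snippets in this post.
from analyse_result_engine import *

# Build model, criterion, train_dataset, val_dataset, optimizer and the `state`
# dict exactly as in the original training script (elided here).

engine = GCNMultiLabelMAPEngine(state)
engine.validate_model(model, criterion, train_dataset, val_dataset, optimizer)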
The original engine.learning runs correctly, so we modify the validate path so that validation can be run on its own.
def validate_model(self, model, criterion, train_dataset, val_dataset, optimizer=None):
    print('validate model...')
    self.init_learning(model, criterion)
    print('init_learning done...')
    # define train and val transforms
    train_dataset.transform = self.state['train_transform']
    train_dataset.target_transform = self._state('train_target_transform')
    print('train dataset transform done...')
    val_dataset.transform = self.state['val_transform']
    val_dataset.target_transform = self._state('val_target_transform')
    print('val dataset transform done...')
    # data loading code
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=self.state['batch_size'], shuffle=True,
                                               num_workers=self.state['workers'], drop_last=True)  # fixme
    print('train loader done...')
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=self.state['batch_size'], shuffle=False,
                                             num_workers=self.state['workers'])
    print('val loader done')
That covers data loading and model setup; the validate call itself follows:
    print('start validate...')
    prec1 = self.validate(val_loader, model, criterion)
    print('validate result prec is:', prec1)
    return prec1
validate evaluates the model on the validation set.
def validate(self, data_loader, model, criterion):
    # switch to evaluate mode
    print('start model eval,fix BN and dropOut...')
    model.eval()
    self.on_start_epoch(False, model, criterion, data_loader)
    if self.state['use_pb']:
        data_loader = tqdm(data_loader, desc='Test')
    end = time.time()
    for i, (input, target) in enumerate(data_loader):
        # measure data loading time
        self.state['iteration'] = i
        self.state['data_time_batch'] = time.time() - end
        self.state['data_time'].add(self.state['data_time_batch'])
        self.state['input'] = input
        self.state['target'] = target
        self.on_start_batch(False, model, criterion, data_loader)  # pass
        if self.state['use_gpu']:
            self.state['target'] = self.state['target'].cuda(async=True)
        self.on_forward(False, model, criterion, data_loader)
        # measure elapsed time
        self.state['batch_time_current'] = time.time() - end
        self.state['batch_time'].add(self.state['batch_time_current'])
        end = time.time()
        # measure accuracy
        self.on_end_batch(False, model, criterion, data_loader)
    score = self.on_end_epoch(False, model, criterion, data_loader)
    return score
The model's predictions are computed in on_forward, which is called once per batch in this loop:
for i, (input, target) in enumerate(data_loader):
    # measure data loading time
    self.state['iteration'] = i
    self.state['data_time_batch'] = time.time() - end
    self.state['data_time'].add(self.state['data_time_batch'])
    self.state['input'] = input
    self.state['target'] = target
    self.on_start_batch(False, model, criterion, data_loader)  # pass
    if self.state['use_gpu']:
        self.state['target'] = self.state['target'].cuda(async=True)
    self.on_forward(False, model, criterion, data_loader)
The most important statement inside it is:
self.state['output'] = model(feature_var, inp_var)
class GCNMultiLabelMAPEngine(MultiLabelMAPEngine):
    def on_forward(self, training, model, criterion, data_loader, optimizer=None, display=True):
        print('on_forward in GCNMultiLabelMAPEngine...')
        feature_var = torch.autograd.Variable(self.state['feature']).float()
        target_var = torch.autograd.Variable(self.state['target']).float()
        inp_var = torch.autograd.Variable(self.state['input']).float().detach()  # one hot
        if not training:
            feature_var.volatile = True
            target_var.volatile = True
            inp_var.volatile = True
        # compute output
        self.state['output'] = model(feature_var, inp_var)
        # fixme==========================================
        if self.state['loss_type'] == 'DeepMarLoss':
            weights = self.state['DeepMarLoss'].weighted_label(target_var)
            self.state['loss'] = criterion(self.state['output'], target_var,
                                           weight=torch.autograd.Variable(weights.cuda()))
            print('DeepMarLoss,model output:', self.state['output'], 'target_var:', target_var)
        else:
            self.state['loss'] = criterion(self.state['output'], target_var)
            print('model output:', self.state['output'], 'label:', target_var)
        # fixme=========================================================
        # self.state['loss'] = criterion(self.state['output'], target_var)
        if training:
            optimizer.zero_grad()
            self.state['loss'].backward()
            nn.utils.clip_grad_norm(model.parameters(), max_norm=10.0)
            optimizer.step()
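For orientation, the two arguments to model(...) play different roles in this GCN-based multi-label setup. The shape annotations below are assumptions, inferred from the 1x80 outputs printed next and from the 1x80x300 array observed at the end of this post, not something the code above states explicitly.
# Assumed shapes for a COCO batch of size 1:
#   feature_var : the image batch fed to the CNN backbone, e.g. [1, 3, H, W]
#   inp_var     : the per-label input matrix fed to the GCN branch, e.g. [1, 80, 300]
#                 (in ML-GCN-style models this is typically the label word-embedding matrix)
#   target_var  : the multi-hot ground-truth label vector, [1, 80]
#   output      : one logit per label, [1, 80]
self.state['output'] = model(feature_var, inp_var)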
First, the model's label is target_var, a [torch.cuda.FloatTensor of size 1x80 (GPU 0)] (output truncated):
target_var: Variable containing:
Columns 52 to 64
0 0 0 0 0 0 0 0 0 0 0 0 0
Columns 65 to 77
0 0 0 0 0 0 0 0 0 0 0 0 0
Columns 78 to 79
0 0
[torch.cuda.FloatTensor of size 1x80 (GPU 0)]
The network's prediction is also a Variable of the same shape; a negative value means label 0 and a positive value means label 1 (output truncated):
Columns 56 to 63
-3.5575 -7.9067 -4.5054 -7.6983 -4.6421 -8.1611 -7.7999 -5.5161
Columns 64 to 71
-9.4290 -7.2869 -8.4399 -6.7703 -9.1136 -7.4685 -10.0482 -8.6373
Columns 72 to 79
-6.0110 -6.4669 -2.7479 -6.1014 -5.7885 -6.0999 -9.4972 -9.4879
[torch.cuda.FloatTensor of size 1x80 (GPU 0)]
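Since a negative logit means label 0 and a positive logit means label 1, the predicted multi-hot vector can be recovered by thresholding. A minimal, self-contained sketch (sigmoid(x) > 0.5 is equivalent to x > 0, so either form works; the random tensor below merely stands in for self.state['output']):
import torch

logits = torch.randn(1, 80)                    # stand-in for the 1x80 output above
preds = (torch.sigmoid(logits) > 0.5).float()  # multi-hot 0/1 prediction vector
print('predicted labels:', preds)
print('positive label indices:', preds.nonzero())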
Reference posts on converting between torch tensors and numpy:
https://blog.csdn.net/hustchenze/article/details/79154139
https://blog.csdn.net/WYXHAHAHA123/article/details/88358395
Convert to torch.FloatTensor:
cpu_labels=inp_var.type(torch.FloatTensor)
cpu_output=self.state['output'].type(torch.FloatTensor)
Converting this directly to numpy raises an error:
File "/home/xingxiangrui/chun-ML_GCN/analyse_result_engine.py", line 614, in on_forward_analyse
np_output=cpu_output.numpy()
File "/home/xingxiangrui/chun-ML_GCN/env/lib/python3.6/site-packages/torch/autograd/variable.py", line 67, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Variable' object has no attribute 'numpy'
So it seems we need np_output = cpu_output.data().numpy() instead.
torch.Tensor to numpy:
ndarray = tensor.numpy()
*A tensor on the GPU cannot be converted to numpy directly:
ndarray = tensor.data().numpy()
Printing an element:
print('target_var[0,1]',target_var[0,1])
target_var[0,1] Variable containing:
0
[torch.cuda.FloatTensor of size 1 (GPU 0)]
Attempt 1:
Convert to numpy as follows: first cast to torch.FloatTensor, then move to the CPU, then take .data, then call numpy().
# GPU---CPU---.data-----.numpy
cpu_output=self.state['output'].type(torch.FloatTensor).cpu()
cpu_output_tensor = cpu_output.data()
np_output=cpu_output_tensor.numpy()
print('np_output',np_output)
This raises:
self.on_forward_analyse(False, model, criterion, data_loader)
File "/home/xingxiangrui/chun-ML_GCN/analyse_result_engine.py", line 614, in on_forward_analyse
cpu_output_tensor = cpu_output.data()
TypeError: 'torch.FloatTensor' object is not callable
Attempt 2:
# .data-----.numpy
output_data_tensor=self.state['output'].data()
np_output=output_data_tensor.numpy()
print('np_output',np_output)
Error:
File "/home/xingxiangrui/chun-ML_GCN/analyse_result_engine.py", line 621, in on_forward_analyse
output_data_tensor=self.state['output'].data()
TypeError: 'torch.cuda.FloatTensor' object is not callable
Attempt 3:
#.cpu------.data-----.numpy
cpu_output=self.state['output'].cpu()
cpu_output_tensor = cpu_output.data()
np_output=cpu_output_tensor.numpy()
print('np_output',np_output)
Error:
File "/home/xingxiangrui/chun-ML_GCN/analyse_result_engine.py", line 627, in on_forward_analyse
cpu_output_tensor = cpu_output.data()
TypeError: 'torch.FloatTensor' object is not callable
The mistake: .data is an attribute, not a method, so it must not be followed by parentheses. A trivial error, but it cost a lot of debugging time.
In addition, the tensor has to be moved to the CPU before converting to numpy: call .cpu() first, then .numpy().
# .data-----.cpu()------.numpy
output_data_np=self.state['output'].cpu().data.numpy()
print('output_data_np',output_data_np)
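As a self-contained sanity check of the chain that finally works (a plain CPU Variable stands in for the model output; on recent PyTorch versions, .detach().cpu().numpy() achieves the same thing):
import numpy as np
import torch

output = torch.autograd.Variable(torch.randn(1, 80))  # stand-in for self.state['output']
output_np = output.cpu().data.numpy()                  # .cpu() first, then .data (no parentheses), then .numpy()
assert isinstance(output_np, np.ndarray)
print('output_np shape:', output_np.shape)             # (1, 80)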
Now we finally get a normal np.array. With batch size 1, it looks like this:
output_data_np [[ -4.905565 -7.9314375 -6.8639855 -8.622047 -8.28002
-9.482967 -6.8433294 -6.62575 -8.268795 -7.9359117
-7.8931932 -8.470847 -9.527361 -5.35377 -7.7762527
-6.008719 -6.3952923 -7.7764564 -7.4207873 -7.5054035
-9.502627 -8.539873 -11.399463 -8.71327 -7.6494164
-8.785699 -5.9663854 -6.3047256 -7.9503956 -7.047566
-8.007742 -8.517551 -8.792 -11.9106 -11.044824
-8.0702095 -8.387167 -9.257486 -8.852714 -4.3451905
4.9528904 -0.9098591 -0.13418153 -1.3261114 -2.517248
-2.8953822 -9.144631 -6.282343 -3.4643247 -6.796801
9.929454 -2.3962693 -6.760852 -6.008553 -8.769535
-5.242237 -2.0778108 -6.062523 -7.2794127 -10.359557
3.7036612 -11.142566 -6.4875765 -6.9061413 -6.9643354
-6.8476014 -8.822233 -5.1054015 -8.188503 -6.054454
-7.3732686 -7.790133 -9.048996 -4.3626633 -6.2686796
-6.826546 -6.6860323 -9.049575 -11.196688 -11.37617 ]]
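From this array, the indices of the predicted positive labels are simply the entries greater than 0 (three of them in the example above). A minimal sketch:
import numpy as np

# output_data_np is the [1, 80] array printed above.
pred_indices = np.where(output_data_np[0] > 0)[0]
print('predicted positive label indices:', pred_indices)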
To avoid fiddly numpy bookkeeping, it is simpler to write the results into a dict. After the network's forward pass, convert the output and the labels to numpy, pack them into a dict, and return it as the function's return value:
# .data-----.cpu()------.numpy
output_data_np=self.state['output'].cpu().data.numpy()
labels_np=inp_var.cpu().data.numpy()  # note: this should be target_var, see the fix at the end of this post
output_and_labels={'output_data_np':output_data_np,'labels_np':labels_np}
# print('output_data_np',output_data_np)
print('calculate output_data_np')
return output_and_labels
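Putting the fragments together, on_forward_analyse might look roughly like the sketch below. It mirrors on_forward but skips the loss/backward path and returns the dict; note that it already uses target_var for the labels, which is the fix discussed at the end of this post.
def on_forward_analyse(self, training, model, criterion, data_loader, optimizer=None, display=True):
    # Same forward pass as on_forward, but this batch's result is converted
    # to numpy and returned instead of computing a loss.
    feature_var = torch.autograd.Variable(self.state['feature']).float()
    target_var = torch.autograd.Variable(self.state['target']).float()
    inp_var = torch.autograd.Variable(self.state['input']).float().detach()
    if not training:
        feature_var.volatile = True
        target_var.volatile = True
        inp_var.volatile = True
    # compute output
    self.state['output'] = model(feature_var, inp_var)
    # convert the batch output and labels to numpy and return them
    output_data_np = self.state['output'].cpu().data.numpy()
    labels_np = target_var.cpu().data.numpy()  # target_var, not inp_var (see the fix at the end)
    print('calculate output_data_np')
    return {'output_data_np': output_data_np, 'labels_np': labels_np}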
In the outer loop, collect each batch's result into the dicts:
all_output_results = {}
all_labels = {}
for i, (input, target) in enumerate(data_loader):
    # measure data loading time
    self.state['iteration'] = i
    self.state['data_time_batch'] = time.time() - end
    self.state['data_time'].add(self.state['data_time_batch'])
    self.state['input'] = input
    self.state['target'] = target
    self.on_start_batch(False, model, criterion, data_loader)  # pass
    if self.state['use_gpu']:
        self.state['target'] = self.state['target'].cuda(async=True)
    output_and_labels = self.on_forward_analyse(False, model, criterion, data_loader)
    # output_and_labels = {'output_data_np': output_data_np, 'labels_np': labels_np}
    output_data_np = output_and_labels['output_data_np']
    labels_np = output_and_labels['labels_np']
    all_output_results[i] = output_data_np
    all_labels[i] = labels_np
After the loop finishes, write everything to files. When reading them back later, be careful with the file mode: open with 'rb', never 'wb', otherwise the files are truncated to zero bytes and have to be regenerated.
print('all_output_results',all_output_results)
print('all_labels',all_labels)
with open('checkpoint/coco/resnet101_on_coco/model_results.pkl', 'wb') as f:
    print("write model results into checkpoint/coco/resnet101_on_coco/model_results.pkl")
    pickle.dump(all_output_results, f)
with open('checkpoint/coco/resnet101_on_coco/coco_labels_in_np.pkl', 'wb') as f:
    print("write labels into checkpoint/coco/resnet101_on_coco/coco_labels_in_np.pkl")
    pickle.dump(all_labels, f)
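To load the results back later, open the files with 'rb' (the pitfall warned about above: opening them with 'wb' at this point would wipe them):
import pickle

with open('checkpoint/coco/resnet101_on_coco/model_results.pkl', 'rb') as f:
    all_output_results = pickle.load(f)
with open('checkpoint/coco/resnet101_on_coco/coco_labels_in_np.pkl', 'rb') as f:
    all_labels = pickle.load(f)
print('number of batches saved:', len(all_output_results))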
Two new .pkl files now appear in the checkpoint directory:
total 4.3G
-rw-rw-r-- 1 xingxiangrui xingxiangrui 164M May 7 16:34 checkpoint.pth.tar
-rw-rw-r-- 1 xingxiangrui xingxiangrui 3.6G May 14 21:49 coco_labels_in_np.pkl
-rw-rw-r-- 1 xingxiangrui xingxiangrui 171M May 5 12:38 model_best_5.6403.pth.tar
-rw-rw-r-- 1 xingxiangrui xingxiangrui 164M May 6 16:08 model_best_81.3118.pth.tar
-rw-rw-r-- 1 xingxiangrui xingxiangrui 164M May 6 16:08 model_best.pth.tar
-rw-rw-r-- 1 xingxiangrui xingxiangrui 15M May 14 21:49 model_results.pkl
-rw------- 1 xingxiangrui xingxiangrui 2.2M May 7 17:38 resnet_on_coco_output.out
One odd thing remained to be checked: COCO labels should only contain 0 and 1, yet the saved "label" array, after conversion to float, had shape 1x80x300. The cause is that the labels should come from target_var, but the code used inp_var (the model's 1x80x300 input); changing it to target_var fixes the problem.
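With that fix in place, the saved outputs and labels line up batch by batch, so simple offline checks become possible. A minimal sketch, assuming the saved labels are the 0/1 multi-hot vectors shown earlier; this computes a rough element-wise accuracy at the zero-logit threshold, not the mAP reported by the engine:
import numpy as np

outputs = np.concatenate([all_output_results[i] for i in sorted(all_output_results)], axis=0)
labels = np.concatenate([all_labels[i] for i in sorted(all_labels)], axis=0)
preds = (outputs > 0).astype(np.float32)
print('element-wise accuracy over all labels:', (preds == labels).mean())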