#1 尝试运行 https://github.com/L1aoXingyu/reid_baseline 的实验时,遇到如下 import 错误:
Traceback (most recent call last):
File "tools/train.py", line 15, in <module>
from data import make_data_loader
File "./data/__init__.py", line 7, in <module>
from .build import make_data_loader
File "./data/build.py", line 10, in <module>
from .datasets import init_dataset, ImageDataset
File "./data/datasets/__init__.py", line 6, in <module>
from .cuhk03 import CUHK03
File "./data/datasets/cuhk03.py", line 10, in <module>
from scipy.misc import imsave
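ImportError: cannot import name 'imsave' from 'scipy.misc' (/home/hensel/anaconda3/envs/torch/lib/python3.7/site-packages/scipy/misc/__init__.py)
原因:scipy.misc.imsave 自 SciPy 1.0 起被弃用,并在 SciPy 1.2 中被移除。解决办法:将 ./data/datasets/cuhk03.py 中的该行改为 from imageio import imwrite as imsave(需要先安装 imageio),或者安装旧版本的 SciPy(例如 pip install scipy==1.1.0)。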
#2 √ 成功运行后,训练又被中断了,报错如下:
Traceback (most recent call last):
File "tools/train.py", line 87, in <module>
main()
File "tools/train.py", line 83, in main
train(cfg)
File "tools/train.py", line 46, in train
num_query
File "./engine/trainer.py", line 150, in do_train
trainer.run(train_loader, max_epochs=epochs)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/engine/engine.py", line 359, in run
self._handle_exception(e)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/engine/engine.py", line 324, in _handle_exception
raise e
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/engine/engine.py", line 350, in run
self._fire_event(Events.EPOCH_COMPLETED)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/engine/engine.py", line 259, in _fire_event
func(self, *(event_args + args), **kwargs)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/handlers/checkpoint.py", line 172, in __call__
self._save(obj=obj, path=path)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/handlers/checkpoint.py", line 130, in _save
self._internal_save(obj, tmp.file)
File "/home/wyl/anaconda3/lib/python3.7/site-packages/ignite/handlers/checkpoint.py", line 144, in _internal_save
raise ValueError("Object should have `state_dict` method.")
ValueError: Object should have `state_dict` method.
最后发现该错误与 cfg.SOLVER.CHECKPOINT_PERIOD 有关,一步步定位到下面这一行:
checkpointer = ModelCheckpoint(output_dir, cfg.MODEL.NAME, checkpoint_period, n_saved=10, require_empty=False)
在 Google 上搜索后,发现有人遇到过相同的 issue,解决方案如下:
将
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model': model.state_dict(),
'optimizer': optimizer.state_dict()})
改为
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model': model, 'optimizer': optimizer})
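即可。原因是:从上面的报错信息可以看出,checkpointer 要求传入的对象本身具有 state_dict 方法,因此应当传入 model 和 optimizer 对象本身,而不是已经调用 state_dict() 得到的字典。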