主要照这篇博客进行训练配置,因为没有GPU所以好多坑,CPU训练可以参见这篇博客
正所谓,跑通了的都一样,错误千万样。按照教程来也是坑多
训练:
python train_faster_rcnn_alt_opt.py --net_name ZF --weights /home/lys/py-faster-rcnn/data/imagenet_models/ZF.v2.caffemodel --cfg /home/lys/py-faster-rcnn/experiments/cfgs/faster_rcnn_alt_opt.yml --imdb voc_2007_trainval
error1:
Cannot use GPU in CPU-only Caffe: check mode.
solution1:
把py-faster-rcnn/tools/下所有py文件中设置GPU的语句注释掉,并把mode设为cpu。示例如下:
# caffe.set_mode_gpu()
caffe.set_mode_cpu()
# if args.gpu_id is not None:
# caffe.set_device(args.gpu_id)
error2:
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "train_faster_rcnn_alt_opt.py", line 125, in train_rpn
roidb, imdb = get_roidb(imdb_name)
File "train_faster_rcnn_alt_opt.py", line 62, in get_roidb
imdb = get_imdb(imdb_name)
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/factory.py", line 38, in get_imdb
return __sets[name]()
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/factory.py", line 20, in
__sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/pascal_voc.py", line 39, in __init__
self._image_index = self._load_image_set_index()
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/pascal_voc.py", line 83, in _load_image_set_index
'Path does not exist: {}'.format(image_set_file)
AssertionError: Path does not exist: /home/lys/py-faster-rcnn/data/VOCdevkit2007/VOC2007/ImageSets/Main/trainval.txt
solution2:
低级错误:只顾着检查trainval.txt在不在了,却没有创建VOCdevkit2007文件夹,直接把VOC2007放在了data/下。数据集应放在data/VOCdevkit2007/VOC2007/下(与上面报错中的路径一致)。
error3:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "train_faster_rcnn_alt_opt.py", line 125, in train_rpn
roidb, imdb = get_roidb(imdb_name)
File "train_faster_rcnn_alt_opt.py", line 68, in get_roidb
roidb = get_training_roidb(imdb)
File "/home/lys/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 118, in get_training_roidb
imdb.append_flipped_images()
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/imdb.py", line 108, in append_flipped_images
boxes = self.roidb[i]['boxes'].copy()
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/imdb.py", line 67, in roidb
self._roidb = self.roidb_handler()
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/pascal_voc.py", line 112, in gt_roidb
for index in self.image_index]
File "/home/lys/py-faster-rcnn/tools/../lib/datasets/pascal_voc.py", line 217, in _load_pascal_annotation
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
KeyError: 'leftatrial'
solution3:
把pascal_voc.py中 cls = self._class_to_ind[obj.find('name').text.lower().strip()] 这一行的.lower()去掉(自定义类别名含大写字母时,转成小写后在类别表里查不到,就会报KeyError),上面提到的第一篇博客有讲
error4:
I0627 10:57:37.710443 10173 solver.cpp:81] Creating training net from train_net file: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt
F0627 10:57:37.710464 10173 io.cpp:36] Check failed: fd != -1 (-1 vs. -1) File not found: models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt
solution4:
把/home/lys/py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt下的4个solver文件(stage1_fast_rcnn_solver30k40k这一类)中的train_net改成绝对路径,例如:
train_net: "/home/lys/py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt"
(参见上文提到的第二篇博客)
error5:
F0627 11:27:37.913828 10633 smooth_L1_loss_layer.cpp:54] Not Implemented Yet
solution5:
实现这个文件(smooth_L1_loss_layer.cpp)中的两个函数Forward_cpu和Backward_cpu,然后进入到caffe-fast-rcnn下重新make一下。参照这篇博客(注意:博客代码中template处缺少<typename Dtype>,其余尖括号里的模板参数也被吞掉了,需要自行补全)
// ------------------------------------------------------------------
// Fast R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Ross Girshick
// ------------------------------------------------------------------
#include "caffe/fast_rcnn_layers.hpp"
namespace caffe {
template
void SmoothL1LossLayer::LayerSetUp(
const vector*>& bottom, const vector*>& top) {
SmoothL1LossParameter loss_param = this->layer_param_.smooth_l1_loss_param();
sigma2_ = loss_param.sigma() * loss_param.sigma();
has_weights_ = (bottom.size() >= 3);
if (has_weights_) {
CHECK_EQ(bottom.size(), 4) << "If weights are used, must specify both "
"inside and outside weights";
}
}
template
void SmoothL1LossLayer::Reshape(
const vector*>& bottom, const vector*>& top) {
LossLayer::Reshape(bottom, top);
CHECK_EQ(bottom[0]->channels(), bottom[1]->channels());
CHECK_EQ(bottom[0]->height(), bottom[1]->height());
CHECK_EQ(bottom[0]->width(), bottom[1]->width());
if (has_weights_) {
CHECK_EQ(bottom[0]->channels(), bottom[2]->channels());
CHECK_EQ(bottom[0]->height(), bottom[2]->height());
CHECK_EQ(bottom[0]->width(), bottom[2]->width());
CHECK_EQ(bottom[0]->channels(), bottom[3]->channels());
CHECK_EQ(bottom[0]->height(), bottom[3]->height());
CHECK_EQ(bottom[0]->width(), bottom[3]->width());
}
diff_.Reshape(bottom[0]->num(), bottom[0]->channels(),
bottom[0]->height(), bottom[0]->width());
errors_.Reshape(bottom[0]->num(), bottom[0]->channels(),
bottom[0]->height(), bottom[0]->width());
// vector of ones used to sum
ones_.Reshape(bottom[0]->num(), bottom[0]->channels(),
bottom[0]->height(), bottom[0]->width());
for (int i = 0; i < bottom[0]->count(); ++i) {
ones_.mutable_cpu_data()[i] = Dtype(1);
}
}
template
void SmoothL1LossLayer::Forward_cpu(const vector*>& bottom,
const vector*>& top) {
//NOT_IMPLEMENTED;
// cpu implementation
CHECK_EQ(bottom[0]->count(1), bottom[1]->count(1))
<< "Inputs must have the same dimension.";
int count = bottom[0]->count();
caffe_sub(count,
bottom[0]->cpu_data(),
bottom[1]->cpu_data(),
diff_.mutable_cpu_data());
if(has_weights_){
caffe_mul(count,
bottom[2]->cpu_data(),
diff_.cpu_data(),
diff_.mutable_cpu_data());
}
// f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma
// |x| - 0.5 / sigma / sigma otherwise
const Dtype* in = diff_.cpu_data();
Dtype* out = errors_.mutable_cpu_data();
for(int index=0; indexcpu_data(), out, errors_.mutable_cpu_data());
}
// compute loss
Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
// end cpu implementation
}
template
void SmoothL1LossLayer::Backward_cpu(const vector*>& top,
const vector& propagate_down, const vector*>& bottom) {
//NOT_IMPLEMENTED;
// cpu implementation
int count = diff_.count();
const Dtype* in = diff_.cpu_data();
Dtype* out = diff_.mutable_cpu_data();
for(int index=0; index < count; index++){
Dtype val = in[index];
Dtype abs_val = abs(val);
if(abs_val < 1.0 / sigma2_){
out[index] = sigma2_ * val;
}
else{
out[index] = (Dtype(0) < val) - (val < Dtype(0));
}
}
for(int i=0; i<2; ++i){
if(propagate_down[i]){
const Dtype sign = (i == 0) ? 1 : -1;
const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
caffe_cpu_axpby(
count,
alpha,
out,//diff_.cpu_data(),
Dtype(0),
bottom[i]->mutable_cpu_diff());
if(has_weights_){
caffe_mul(
count,
bottom[2]->cpu_data(),
bottom[i]->cpu_diff(),
bottom[i]->mutable_cpu_data());
caffe_mul(
count,
bottom[3]->cpu_data(),
bottom[i]->cpu_diff(),
bottom[i]->mutable_cpu_data());
}
}
}
// end cpu implementation
}
// When CPU_ONLY is defined, STUB_GPU is applied to this layer.
// NOTE(review): presumably this supplies placeholder Forward_gpu/Backward_gpu
// definitions so the class links without CUDA code — confirm in Caffe's headers.
#ifdef CPU_ONLY
STUB_GPU(SmoothL1LossLayer);
#endif
// NOTE(review): these macros are defined outside this file; presumably they
// instantiate the template for the supported Dtype(s) and register the layer
// under the type name "SmoothL1Loss" — confirm against Caffe's definitions.
INSTANTIATE_CLASS(SmoothL1LossLayer);
REGISTER_LAYER_CLASS(SmoothL1Loss);
} // namespace caffe
error6:一天了。。。。我已经从第一个错误犯到第六个了。。。。加油。。。。。。
Process Process-3:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "train_faster_rcnn_alt_opt.py", line 198, in train_fast_rcnn
max_iters=max_iters)
File "/home/lys/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 160, in train_net
model_paths = sw.train_model(max_iters)
File "/home/lys/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 101, in train_model
self.solver.step(1)
File "/home/lys/py-faster-rcnn/tools/../lib/roi_data_layer/layer.py", line 144, in forward
blobs = self._get_next_minibatch()
File "/home/lys/py-faster-rcnn/tools/../lib/roi_data_layer/layer.py", line 63, in _get_next_minibatch
return get_minibatch(minibatch_db, self._num_classes)
File "/home/lys/py-faster-rcnn/tools/../lib/roi_data_layer/minibatch.py", line 55, in get_minibatch
num_classes)
File "/home/lys/py-faster-rcnn/tools/../lib/roi_data_layer/minibatch.py", line 100, in _sample_rois
fg_inds, size=fg_rois_per_this_image, replace=False)
File "mtrand.pyx", line 1176, in mtrand.RandomState.choice (numpy/random/mtrand/mtrand.c:18822)
TypeError: 'numpy.float64' object cannot be interpreted as an index
solution6:
调整numpy版本,博客
python -c "import numpy;print numpy.version.version"  # 查看numpy版本,我这里是1.12.1
sudo pip install -U numpy==1.11.0
再训练 又出错,不过可以避免的,我明明记得删掉pkl了。。。。
error7:
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "train_faster_rcnn_alt_opt.py", line 125, in train_rpn
roidb, imdb = get_roidb(imdb_name)
File "train_faster_rcnn_alt_opt.py", line 68, in get_roidb
roidb = get_training_roidb(imdb)
File "/home/lys/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 122, in get_training_roidb
rdl_roidb.prepare_roidb(imdb)
File "/home/lys/py-faster-rcnn/tools/../lib/roi_data_layer/roidb.py", line 27, in prepare_roidb
roidb[i]['image'] = imdb.image_path_at(i)
IndexError: list index out of range
solution7:
删除py-faster-rcnn/data/cache/ 文件夹下的.pkl文件,或者改名备份,重新训练即可。博客最后的error2
error8:
obj for obj in objs if int(obj.find('difficult').text) == 0]
solution8:
注释掉(即把上面这段按difficult标签过滤目标的代码注释掉)
error9:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "train_faster_rcnn_alt_opt.py", line 195, in train_fast_rcnn
max_iters=max_iters)
File "/home/amax/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 160, in train_net
model_paths = sw.train_model(max_iters)
File "/home/amax/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 111, in train_model
model_paths.append(self.snapshot())
File "/home/amax/py-faster-rcnn/tools/../lib/fast_rcnn/train.py", line 73, in snapshot
self.bbox_stds[:, np.newaxis])
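ValueError: operands could not be broadcast together with shapes (84,4096) (8,1)
solution9:(原文未记录解决办法,以下为根据报错形状的推测,待确认)84=4×21说明网络的bbox_pred层仍按VOC的21类输出,而bbox_stds只有8=4×2项,即当前数据集是2类(含背景)。应把models/pascal_voc/ZF/faster_rcnn_alt_opt下各prototxt中与类别数相关的输出维度(如cls_score、bbox_pred的num_output以及num_classes)改成与自己的类别数一致,再重新训练。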
测试:
将py-faster-rcnn/output/faster_rcnn_alt_opt/voc_2007_trainval/下的ZF_faster_rcnn_final.caffemodel复制到/py-faster-rcnn/data/faster_rcnn_models/下,在tools下执行
python demo.py --net zf --cpu
学习率之类:
在py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt中的solver文件里设置
迭代次数
py-faster-rcnn/tools/train_faster_rcnn_alt_opt.py中修改
py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt里对应的solver文件(有4个)也修改,stepsize小于上面修改的数值