# Here we take Faster R-CNN and RetinaNet as examples.
# model settings
model = dict(
    type='FasterRCNN',  # detector type
    backbone=dict(  # feature-extraction network
        type='ResNet',  # backbone name
        depth=50,  # the fields below are the backbone constructor's arguments;
        num_stages=4,  # the defaults can be overridden here
        out_indices=(0, 1, 2, 3),  # indices of the output stages
        frozen_stages=1,  # stages to freeze; 1 freezes the first ResNet stage
        norm_cfg=dict(type='BN', requires_grad=True),  # norm layer config
        norm_eval=True,  # keep BN layers in eval mode during training:
        # the BN running statistics (running_mean / running_var) are not updated,
        # while the learnable weight and bias parameters are still trained.
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),  # pretrained weights to load
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],  # channels of each backbone stage fed into FPN
        out_channels=256,  # output channels
        num_outs=5),  # total number of output feature levels
    rpn_head=dict(
        type='RPNHead',  # single-level dense prediction head
        in_channels=256,  # input channels
        feat_channels=256,  # channels of the intermediate feature layer
        anchor_generator=dict(  # anchor generation settings
            type='AnchorGenerator',
            scales=[8],  # anchor base scale
            ratios=[0.5, 1.0, 2.0],  # anchor height/width ratios
            strides=[4, 8, 16, 32, 64]),  # strides w.r.t. the input image
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',  # bbox encoding/decoding scheme
            target_means=[.0, .0, .0, .0],  # regression target means
            target_stds=[1.0, 1.0, 1.0, 1.0]),  # regression target standard deviations
        loss_cls=dict(  # classification loss: cross entropy with sigmoid
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    roi_head=dict(  # second-stage RoI head
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),  # RoI extraction over the four feature levels
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,  # input channels after RoI extraction
            fc_out_channels=1024,  # fully-connected prediction layers
            roi_feat_size=7,
            num_classes=80,  # number of classes
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),  # smaller stds than the RPN stage
            reg_class_agnostic=False,  # if True, bbox regression only distinguishes
            loss_cls=dict(  # foreground vs background, not per-class boxes
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),  # use_sigmoid=False -> softmax
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',  # max-IoU assignment
                pos_iou_thr=0.7,  # positive-sample IoU threshold
                neg_iou_thr=0.3,  # negative-sample IoU threshold
                min_pos_iou=0.3,  # minimum IoU when supplementing positives
                match_low_quality=True,  # allow low-quality matches to keep recall high
                ignore_iof_thr=-1),  # IoF threshold for ignoring boxes; -1 disables
            sampler=dict(
                type='RandomSampler',
                num=256,  # number of positive+negative samples to draw
                pos_fraction=0.5,  # fraction of positive samples
                neg_pos_ub=-1,  # upper bound on the neg/pos ratio; -1 means unbounded
                add_gt_as_proposals=False),  # whether ground truth is added to proposals as positives
            allowed_border=-1,  # pixels an anchor may extend beyond the image border
            pos_weight=-1,  # positive-sample weight; -1 keeps the original weight
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,  # number of predicted boxes kept before NMS
            max_per_img=1000,  # maximum boxes kept per image
            nms=dict(type='nms', iou_threshold=0.7),  # NMS IoU threshold
            min_bbox_size=0),  # minimum bbox size
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',  # same assigner type as the RPN stage
                pos_iou_thr=0.5,  # relaxed positive-sample threshold
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,  # disable low-quality matches for precision
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,  # number of samples kept for the second stage
                pos_fraction=0.25,  # fraction of positive samples
                neg_pos_ub=-1,  # no bound on the neg/pos ratio
                add_gt_as_proposals=True),  # add ground truth to the proposals
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)  # keep at most 100 predicted boxes per image
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ))
# model settings
model = dict(
    type='RetinaNet',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch',
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_input',
        num_outs=5),
    bbox_head=dict(  # the single-stage detector differs from Faster R-CNN here
        type='RetinaHead',
        num_classes=80,
        in_channels=256,  # input channels
        stacked_convs=4,  # number of stacked conv layers in the head
        feat_channels=256,  # channels of the head's feature layers
        # norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
        anchor_generator=dict(
            type='AnchorGenerator',  # anchor generation
            octave_base_scale=4,  # base scale; further processed by the generator
            scales_per_octave=3,  # number of anchor scales per level
            ratios=[0.5, 1.0, 2.0],  # aspect ratios
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='FocalLoss',  # focal loss for classification
            use_sigmoid=True,
            gamma=2.0,  # focal-loss hyper-parameters
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    # model training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.4,
            min_pos_iou=0,
            ignore_iof_thr=-1),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    test_cfg=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        nms=dict(type='nms', iou_threshold=0.5),
        max_per_img=100))
# NOTE: the original notes include figures here showing how the RetinaNet
# anchor sizes are generated, and the corresponding anchor generation.
# dataset settings (Pascal VOC)
dataset_type = 'VOCDataset'
data_root = '/data/yzycode/datasets/VOCdevkit/'
# data_root = '/data/datasets/VOCdevkit/'
img_norm_cfg = dict(  # input normalization: subtract mean, divide by std; to_rgb converts BGR to RGB
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1000, 600), keep_ratio=True),  # (1000, 600) for VOC; COCO uses (1333, 800)
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Pad', size_divisor=32),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),  # fix: was (((1333, 800))) — redundant nested parentheses removed
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='RepeatDataset',  # wrapper to avoid reloading datasets frequently
        times=3,  # repetition factor: effective epochs = max_epochs * times
        dataset=dict(
            type=dataset_type,  # trainval.txt trains on train+val together; train/val can also be kept separate
            ann_file=data_root + 'VOC2007/ImageSets/Main/trainval.txt',  # VOC2007 (optionally also VOC2012)
            img_prefix=data_root + 'VOC2007/',  # if the folder is not named VOC2007/VOC2012, edit voc.py, see:
            # https://www.pythonheidong.com/blog/article/1134278/5d81d8615a9fd6e9ca40/
            pipeline=train_pipeline)),  # easiest to follow the expected naming to avoid code changes
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',  # test split
        img_prefix=data_root + 'VOC2007/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'VOC2007/ImageSets/Main/test.txt',
        img_prefix=data_root + 'VOC2007/',
        pipeline=test_pipeline))
evaluation = dict(interval=1, metric='mAP')  # evaluate mAP every epoch
# dataset settings (COCO)
dataset_type = 'CocoDataset'  # dataset class name
data_root = 'data/coco/'  # dataset root path
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)  # input normalization: subtract mean, divide by std, convert BGR to RGB
train_pipeline = [  # data preprocessing and augmentation steps
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,  # batch size per GPU
    workers_per_gpu=2,  # number of data-loading workers per GPU
    train=dict(
        type=dataset_type,  # the paths below may also be given as absolute paths
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/' ,
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/' ,
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        # ann_file=data_root + 'annotations/image_info_test-dev2017.json',
        # img_prefix=data_root + 'test2017/',
        pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')  # evaluation metric; configurable
# This may be covered in more detail later, since data augmentation relies on these settings.
# optimizer
# NOTE: adjust the initial lr to your GPU count and total batch size.
# The first assignment below was dead code (immediately overwritten by the
# second), so it is kept only as a commented-out alternative — presumably
# the reduced lr for a smaller total batch size (0.02 / 8 = 0.0025).
# optimizer = dict(type='SGD', lr=0.0025, momentum=0.9, weight_decay=0.0001)
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',  # lr schedule policy
    warmup='linear',  # warm-up strategy: lr increases linearly at the start
    warmup_iters=500,  # lr ramps up over the first 500 iterations
    warmup_ratio=0.001,  # starting lr ratio; lr settles at lr=0.02 after warm-up
    step=[8, 11])  # decay the lr at epoch 8 and epoch 11
runner = dict(type='EpochBasedRunner', max_epochs=12)
checkpoint_config = dict(interval=1)  # save a checkpoint every epoch (configurable)
# yapf:disable
log_config = dict(
    interval=50,  # log every 50 iterations
    hooks=[
        dict(type='TextLoggerHook'),  # plain-text logging
        # dict(type='TensorboardLoggerHook')  # tensorboard logging
    ])
# yapf:enable
custom_hooks = [dict(type='NumClassCheckHook')]
dist_params = dict(backend='nccl')  # distributed-training parameters
log_level = 'INFO'
work_dir = './work_dirs/faster_rcnn_r50_fpn_1x'  # directory for logs and checkpoints
load_from = None  # path of a model to load; None means start from the pretrained backbone
resume_from = None  # path of a checkpoint to resume training from
workflow = [('train', 1)]
# workflow = [('train', 2), ('val', 1)]  # train for n epochs, then validate for 1 epoch
# Some finer details are not covered here; they will be filled in later when the related topics are explained.