Most of this post follows the MMDetection official documentation and nekokiku's work.
pip install torch torchvision torchaudio
P.S. The GPU build of torch seems to work even without a separate CUDA/cuDNN install; Anaconda already ships both.
pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
P.S. torch_version only takes major releases such as 1.7.0, 1.8.0, 1.9.0; for torch 1.8.1 you still enter 1.8.0.
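For example, with CUDA 11.1 and PyTorch 1.8.1, the filled-in command would be (note the torch1.8.0 in the URL):
pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html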
git clone https://github.com/open-mmlab/mmdetection.git
# other versions can be downloaded with wget, e.g. v2.13.0:
# https://github.com/open-mmlab/mmdetection/archive/refs/tags/v2.13.0.zip
# cd ~/downloads
# unzip -q v2.13.0.zip
# cd mmdetection-2.13.0
cd mmdetection
pip install -r requirements/build.txt
pip install -v -e . # or "python setup.py develop"
pip install wandb
# requests had some incompatibility in my environment; upgrade it manually
# pip install requests==2.26.0
The dataset comes from the Kaggle CowboyOutfits competition. It covers 5 cowboy-outfit classes: belt, sunglasses, boot, cowboy_hat, jacket. The training set has 3062 images with COCO-format annotations; the dev-phase test set has 999 images and the final-phase test set 818.
The biggest challenge is class imbalance: the rarest class, belt, has only 21 instances.
The split script below is adapted from the competition discussion forum. Unlike the original, I commented out all the valid parts and use the held-out test split as my validation set, with test_n=150 and a minimum ratio of 0.3. Each class therefore contributes min(150, int(0.3 * n_images_in_class)) images to validation; for belt with 21 images, that is min(150, 6) = 6.
Altogether this yields 2609 images for training and 453 for validation.
import json
import random
import copy
from pycocotools.coco import COCO
def create_subset(c, cats, test_n=150):
    new_coco = dict()
    new_coco['info'] = {"description": "CowboySuit",
                        "url": "http://github.com/dmlc/gluon-cv",
                        "version": "1.0", "year": 2021,
                        "contributor": "GluonCV/AutoGluon",
                        "date_created": "2021/07/01"}
    new_coco["licenses"] = [
        {"url": "http://creativecommons.org/licenses/by/2.0/", "id": 4, "name": "Attribution License"}]
    cat_ids = c.getCatIds(cats)
    train_img_ids = set()
    test_img_ids = set()
    for cat in cat_ids[::-1]:
        img_ids = copy.copy(c.getImgIds(catIds=[cat]))
        random.shuffle(img_ids)
        # per class: at most test_n images, and at most 30% of the class
        tn = min(test_n, int(len(img_ids) * 0.3))
        new_test = set(img_ids[:tn])
        exist_test_ids = new_test.intersection(train_img_ids)
        test_ids = new_test.difference(exist_test_ids)
        train_ids = set(img_ids).difference(test_ids)
        print(tn, len(img_ids), len(new_test), len(test_ids), len(train_ids))
        train_img_ids.update(train_ids)
        test_img_ids.update(test_ids)
    # prune duplicates
    dup = train_img_ids.intersection(test_img_ids)
    train_img_ids = train_img_ids - dup
    train_anno_ids = set()
    test_anno_ids = set()
    for cat in cat_ids:
        train_anno_ids.update(c.getAnnIds(imgIds=list(train_img_ids), catIds=[cat]))
        test_anno_ids.update(c.getAnnIds(imgIds=list(test_img_ids), catIds=[cat]))
    assert len(train_img_ids.intersection(test_img_ids)) == 0, 'img id conflicts, {} '.format(
        train_img_ids.intersection(test_img_ids))
    assert len(train_anno_ids.intersection(test_anno_ids)) == 0, 'anno id conflicts'
    print('train img ids #:', len(train_img_ids), 'train anno #:', len(train_anno_ids))
    print('test img ids #:', len(test_img_ids), 'test anno #:', len(test_anno_ids))
    new_coco_test = copy.deepcopy(new_coco)
    new_coco["images"] = c.loadImgs(list(train_img_ids))
    new_coco["annotations"] = c.loadAnns(list(train_anno_ids))
    for ann in new_coco["annotations"]:
        ann.pop('segmentation', None)
    new_coco["categories"] = c.loadCats(cat_ids)
    new_coco_test["images"] = c.loadImgs(list(test_img_ids))
    new_coco_test["annotations"] = c.loadAnns(list(test_anno_ids))
    for ann in new_coco_test["annotations"]:
        ann.pop('segmentation', None)
    new_coco_test["categories"] = c.loadCats(cat_ids)
    print('new train split, images:', len(new_coco["images"]), 'annos:', len(new_coco["annotations"]))
    print('new test split, images:', len(new_coco_test["images"]), 'annos:', len(new_coco_test["annotations"]))
    return new_coco, new_coco_test


if __name__ == '__main__':
    anno = 'cowboy/annotations/train.json'
    classes = ['jacket', 'sunglasses', 'boot', 'cowboy_hat', 'belt']
    c = COCO(anno)
    new_train, new_val = create_subset(c, classes)
    with open('cowboy/annotations/new_train.json', 'w') as f:
        json.dump(new_train, f)
    with open('cowboy/annotations/new_val.json', 'w') as f:
        json.dump(new_val, f)
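To sanity-check the split, a minimal sketch (same paths as above) that prints per-class annotation counts in the new validation set:

import json
from collections import Counter

with open('cowboy/annotations/new_val.json') as f:
    val = json.load(f)

# map category ids to names, then count annotations per class
id2name = {c['id']: c['name'] for c in val['categories']}
print(Counter(id2name[a['category_id']] for a in val['annotations']))
# belt should show only a handful of boxes, reflecting the imbalance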
Many of the blog posts I read all tell you to edit the COCO class list in mmdet/datasets/coco.py; in fact, overriding classes in the config file is enough, as sketched below.
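A minimal sketch of that override (CocoDataset accepts a classes field, so it can be set per split on top of a _base_ dataset config):

classes = ('belt', 'sunglasses', 'boot', 'cowboy_hat', 'jacket')
data = dict(
    train=dict(classes=classes),
    val=dict(classes=classes),
    test=dict(classes=classes))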
mkdir data
cd data
ln -s ~/det/cowboy/images images
ln -s ~/det/cowboy/annotations anno
mkdir checkpoints
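The resulting layout inside the mmdetection root (assuming the dataset was unpacked to ~/det/cowboy):

mmdetection/
├── data/
│   ├── images -> ~/det/cowboy/images
│   └── anno -> ~/det/cowboy/annotations
└── checkpoints/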
_base_ = [
    '../_base_/models/cascade_rcnn_r50_fpn.py',
    '../_base_/datasets/coco_detection.py',
    '../_base_/schedules/schedule_1x.py',
    '../_base_/default_runtime.py'
]
I use Cascade R-CNN with a ResNet-50 backbone (cascade-r50) as the baseline, paired with a cosine learning-rate schedule. Since cosine annealing takes different parameters from the multi-step policy, I created schedule_Cosine.py under configs/_base_/schedules/:
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='CosineAnnealing',
    by_epoch=False,
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    min_lr=1e-7)
runner = dict(type='EpochBasedRunner', max_epochs=12)
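As a rough sketch (not mmcv's exact code), the schedule this produces is a linear ramp from base_lr * warmup_ratio over the first warmup_iters iterations, then a per-iteration cosine decay down to min_lr:

import math

def lr_at(step, max_steps, base_lr=0.02,
          warmup_iters=500, warmup_ratio=0.001, min_lr=1e-7):
    # cosine-annealed "regular" lr at this iteration (by_epoch=False)
    t = min(step / max_steps, 1.0)
    regular = min_lr + (base_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * t))
    if step >= warmup_iters:
        return regular
    # mmcv-style linear warmup towards the regular lr
    k = (1 - step / warmup_iters) * (1 - warmup_ratio)
    return regular * (1 - k)

# lr_at(0, 20000) ~= base_lr * warmup_ratio; lr_at(20000, 20000) ~= min_lr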
To make later changes to the network structure and various tricks easier (none of which ended up working), I inherit only the schedule and the runtime. The full config is as follows:
_base_ = [
    '_base_/schedules/schedule_Cosine.py',
    '_base_/default_runtime.py'
]
# model config
classes = ('belt', 'sunglasses', 'boot', 'cowboy_hat', 'jacket')
num_classes = len(classes)
max_epochs = 20
batch_size = 4
# linear scaling rule: the base lr 0.02 assumes 8 GPUs x 2 imgs/GPU
init_lr = 0.02 * batch_size / 2 / 8
wandb_project = 'kaggle_cowboy_outfits'
wandb_usrname = 'snowtyan'
wandb_runner = 'cascade-rs50-v2'  # model name
'''
Overview of the model config:
model = dict(
    type='CascadeRCNN'
    pretrained
    backbone: resnet50 / resnest50
    neck: FPN / PAFPN / BFP
    rpn_head
        anchor_generator
        bbox_coder
        |- loss_cls: CrossEntropyLoss / FocalLoss (X)
        |- loss_bbox: SmoothL1Loss / IoULoss
    roi_head
        |- bbox_head [head1 head2 head3]
            |- num_classes
            |- loss_cls
            |- loss_bbox
    train_cfg
        |- rpn
        |- rpn_proposal
            |- nms
        |- rcnn [1 2 3]
    test_cfg
        |- rpn
            |- nms
        |- rcnn
            |- nms
)
'''
model = dict(
    type='CascadeRCNN',
    pretrained='open-mmlab://resnest50',
    backbone=dict(
        type='ResNeSt',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_eval=False,
        style='pytorch',
        stem_channels=64,
        radix=2,
        reduction_factor=4,
        avg_down_stride=True),
    neck=dict(
        type='FPN',
        # type='PAFPN',  # no matching pretrained weights in load_from
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        # loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
        reg_decoded_bbox=True,
        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=num_classes,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                # loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
                reg_decoded_bbox=True,
                loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=num_classes,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                # loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
                reg_decoded_bbox=True,
                loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=num_classes,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0., 0., 0., 0.],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                # loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0),
                reg_decoded_bbox=True,
                loss_bbox=dict(type='GIoULoss', loss_weight=1.0))
        ]),
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=0,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=2000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    # type='OHEMSampler',  # OHEM: online hard example mining
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    # type='OHEMSampler',  # OHEM: online hard example mining
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    # type='OHEMSampler',  # OHEM: online hard example mining
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False)
        ]),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.1,
            nms=dict(type='nms', iou_threshold=0.5),
            # nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.001),
            max_per_img=100)))
# Mixed Precision Training
fp16 = dict(loss_scale=512.)
# load data
dataset_type = 'CocoDataset'
data_root = 'data/'
# use ResNeSt img_norm (GluonCV-style ImageNet mean/std)
img_norm_cfg = dict(
    mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True)
albu_train_transforms = [
    # dict(type='Flip', p=0.5),
    dict(type='RandomRotate90', p=0.5),
    dict(
        type='ShiftScaleRotate',
        shift_limit=0.0625,
        scale_limit=0.0,
        rotate_limit=0,
        interpolation=1,
        p=0.5),
    dict(
        type='RandomBrightnessContrast',
        brightness_limit=[0.1, 0.3],
        contrast_limit=[0.1, 0.3],
        p=0.2),
    dict(
        type='OneOf',
        transforms=[
            dict(
                type='RGBShift',
                r_shift_limit=10,
                g_shift_limit=10,
                b_shift_limit=10,
                p=1.0),
            dict(
                type='HueSaturationValue',
                hue_shift_limit=20,
                sat_shift_limit=30,
                val_shift_limit=20,
                p=1.0)
        ],
        p=0.1),
    dict(type='ChannelShuffle', p=0.1),
    dict(
        type='OneOf',
        transforms=[
            dict(type='Blur', blur_limit=3, p=1.0),
            dict(type='MedianBlur', blur_limit=3, p=1.0)
        ],
        p=0.1),
]
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Resize',
        img_scale=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
                   (1333, 768), (1333, 800)],
        multiscale_mode='value',  # 'value' picks from the list; 'range' samples between two scales
        keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Albu', transforms=albu_train_transforms),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
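One caveat: the Albu list above contains geometric transforms (RandomRotate90, ShiftScaleRotate), and a bare dict(type='Albu', transforms=...) does not pass bbox_params, so albumentations may not move the ground-truth boxes along with the image. Following mmdet's albu_example config, the step would look roughly like:

dict(
    type='Albu',
    transforms=albu_train_transforms,
    bbox_params=dict(
        type='BboxParams',
        format='pascal_voc',
        label_fields=['gt_labels'],
        min_visibility=0.0,
        filter_lost_elements=True),
    keymap={'img': 'image', 'gt_bboxes': 'bboxes'},
    update_pad_shape=False,
    skip_img_without_anno=True)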
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 640),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
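For test-time augmentation, MultiScaleFlipAug also accepts a list of scales plus flipping; a sketch (not used in the baseline):

test_pipeline_tta = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=[(1333, 640), (1333, 800)],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]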
data = dict(
    samples_per_gpu=batch_size,  # batch size per GPU
    workers_per_gpu=2,  # number of dataloader workers per GPU
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'anno/new_train.json',
        img_prefix=data_root + 'images/',
        pipeline=train_pipeline,
        classes=classes),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'anno/new_val.json',  # validate on the held-out split
        img_prefix=data_root + 'images/',
        pipeline=test_pipeline,
        classes=classes),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'anno/valid.json',  # dev phase: valid.json (999 imgs); final phase: test.json (818 imgs)
        img_prefix=data_root + 'images/',
        pipeline=test_pipeline,
        classes=classes))
evaluation = dict(interval=1, metric='bbox', save_best='bbox_mAP')
# lr schedule
optimizer = dict(type='SGD', lr=init_lr, momentum=0.9, weight_decay=0.0001)
lr_config = dict(
    policy='CosineAnnealing',
    by_epoch=False,
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    min_lr=1e-7)
runner = dict(max_epochs=max_epochs)  # type is inherited from the base schedule
# logging
checkpoint_config = dict(interval=1)  # save a checkpoint every epoch
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(
            type='WandbLoggerHook',
            init_kwargs=dict(
                project=wandb_project,
                name=wandb_runner,
                entity=wandb_usrname))
    ])
log_level = 'INFO'
load_from = 'checkpoints/resnest/cascade_rcnn_rs50_fpn_classes_5.pth'
resume_from = None
work_dir = f'output/{wandb_runner}' # mkdir output
Weight-loading priority: resume_from > load_from > the backbone's own pretrained weights.
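The load_from checkpoint above already has 5-class heads. A hedged sketch of one common way to build such a file from a COCO-pretrained checkpoint (the source filename here is an assumption):

import torch

# drop the class-dependent head weights so mmdet re-initializes them
# (load_from loads non-strictly, so missing keys are tolerated)
ckpt = torch.load('checkpoints/cascade_rcnn_s50_fpn_coco.pth', map_location='cpu')
state = ckpt['state_dict']
for key in list(state):
    if 'fc_cls' in key or 'fc_reg' in key:  # shapes depend on num_classes
        state.pop(key)
torch.save(ckpt, 'checkpoints/resnest/cascade_rcnn_rs50_fpn_classes_5.pth')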
fp16 = dict(loss_scale=512.)
This enables mixed-precision training.

To run inference on the competition's test images, first build a COCO-format test.json from the provided test.csv:

import os
import json
import pandas as pd
import PIL.Image as Image
root = './'
test_dir = os.path.join(root, 'images')  # test image root
out_file = os.path.join(root, 'annotations/test.json')  # test json output path
df = pd.read_csv(os.path.join(root, 'test.csv'))
data = dict()
data['categories'] = [
    {"id": 87, "name": "belt", "supercategory": "none"},
    {"id": 1034, "name": "sunglasses", "supercategory": "none"},
    {"id": 131, "name": "boot", "supercategory": "none"},
    {"id": 318, "name": "cowboy_hat", "supercategory": "none"},
    {"id": 588, "name": "jacket", "supercategory": "none"},
]
images = []
for idx, row in df.iterrows():
    img_id, file_name = row
    file_path = os.path.join(test_dir, file_name)
    file = Image.open(file_path)
    tmp = dict()
    tmp['id'] = img_id
    tmp['width'] = file.size[0]
    tmp['height'] = file.size[1]
    tmp['file_name'] = file_name
    images.append(tmp)
print('len(test) =', len(images))
data['images'] = images
with open(out_file, 'w') as f:
    json.dump(data, f)
print('done!')
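Before launching training, authenticate once so the WandbLoggerHook can sync:

wandb login

Then train and generate predictions: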
python tools/train.py configs/cascade-r50-baseline.py
# tools/test.py also needs the trained checkpoint as its second argument;
# the checkpoint filename below is an assumption, use whatever save_best wrote into work_dir
python tools/test.py \
    configs/cascade-r50-baseline.py \
    output/cascade-r50-baseline/best_bbox_mAP.pth \
    --format-only \
    --eval-options 'jsonfile_prefix=./output/cascade-r50-baseline/best'
# --format-only writes the predictions to ./output/cascade-r50-baseline/best.bbox.json
After four attempts I stopped tweaking. In the end, ResNeXt-101 scored best offline, while the plain no-tricks baseline did best on the leaderboard… XD
Training logs and hooks are on wandb.