The original code trains on coco_2014_train (together with valminusminival):

DATASETS:
  TRAIN: ("coco_2014_train", "coco_2014_valminusminival")
  TEST: ("coco_2014_minival",)

and uses ImageNet-pretrained ResNet weights:

WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50"
def main():
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
"--config-file",
default="",
metavar="FILE",
help="path to config file",
type=str,
)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument(
"--skip-test",
dest="skip_test",
help="Do not test the final model",
action="store_true",
)
parser.add_argument(
"opts",
help="Modify config options using the command-line",
default=None,
nargs=argparse.REMAINDER,
)
args = parser.parse_args()
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 # read the "WORLD_SIZE" environment variable set by torch.distributed.launch
args.distributed = num_gpus > 1
if args.distributed:
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(
backend="nccl", init_method="env://"
)
synchronize()
cfg.merge_from_file(args.config_file) # load the YAML config file into cfg
cfg.merge_from_list(args.opts)
cfg.freeze()
output_dir = cfg.OUTPUT_DIR # directory where output files are saved
if output_dir:
mkdir(output_dir)
logger = setup_logger("fcos_core", output_dir, get_rank()) # 创建一个输出日志方法
logger.info("Using {} GPUs".format(num_gpus))
logger.info(args)
logger.info("Collecting env info (might take some time)")
logger.info("\n" + collect_env_info())
logger.info("Loaded configuration file {}".format(args.config_file))
with open(args.config_file, "r") as cf:
config_str = "\n" + cf.read()
logger.info(config_str)
logger.info("Running with config:\n{}".format(cfg))
model = train(cfg, args.local_rank, args.distributed)
if not args.skip_test:
run_test(cfg, model, args.distributed)
For the argparse calls, see the Python documentation on parser.add_argument() (command-line options, arguments, and sub-command parsers).

args.distributed = num_gpus > 1

This ties in with the apex package installed while setting up the environment; in short, apex is a PyTorch extension for mixed-precision training that speeds things up (see any of the Apex installation-and-usage tutorials). The training command python -m torch.distributed.launch \ is part of the same distributed setup.

For cfg.merge_from_file(args.config_file), see the yacs library, which is used to configure the network hyper-parameters; a sketch follows below.
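A minimal, self-contained yacs sketch of the merge/freeze flow used above. The node layout here is a toy stand-in for illustration, not the real fcos_core/config/defaults.py:

from yacs.config import CfgNode as CN

cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.WEIGHT = ""
cfg.DATASETS = CN()
cfg.DATASETS.TRAIN = ()
cfg.OUTPUT_DIR = "."

# merge_from_file overlays a YAML file on top of the defaults;
# merge_from_list overlays "KEY value" pairs from the command line (the opts argument)
# cfg.merge_from_file("configs/fcos/fcos_R_50_FPN_1x.yaml")
cfg.merge_from_list(["OUTPUT_DIR", "training_dir/demo"])
cfg.freeze() # the config is immutable from here on
print(cfg.OUTPUT_DIR) # -> training_dir/demo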
For logger = setup_logger("fcos_core", output_dir, get_rank()), see the logger.py section below.
To summarize the annotations above and in the code that follows:

train(): build the model, build the optimizer, set the learning-rate schedule, then build the data loader: apply the transforms (data augmentation), resolve each dataset name to its paths, build and return the datasets, take each individual dataset, shuffle it, and group it into batches.

do_train(): set the model to train mode, record the start and end times, then iterate one batch per step: feed the batch through the model to get the loss dict, sum the losses, zero the gradients, backpropagate, step the optimizer, and update the timing and loss statistics.
def do_train(
model,
data_loader,
optimizer,
scheduler,
checkpointer,
device,
checkpoint_period,
arguments,
):
logger = logging.getLogger("fcos_core.trainer")
logger.info("Start training")
meters = MetricLogger(delimiter=" ")
max_iter = len(data_loader)
start_iter = arguments["iteration"]
model.train() # put the model in training mode
start_training_time = time.time() # overall training start time
end = time.time() # timestamp of the end of the previous iteration
pytorch_1_1_0_or_later = is_pytorch_1_1_0_or_later()
for iteration, (images, targets, _) in enumerate(data_loader, start_iter): # one batch per iteration
data_time = time.time() - end
iteration = iteration + 1
arguments["iteration"] = iteration
# in pytorch >= 1.1.0, scheduler.step() should be run after optimizer.step()
if not pytorch_1_1_0_or_later:
scheduler.step()
images = images.to(device)
targets = [target.to(device) for target in targets]
loss_dict = model(images, targets) # forward one batch through the model to get the loss dict
losses = sum(loss for loss in loss_dict.values()) # sum the individual losses
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = reduce_loss_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
meters.update(loss=losses_reduced, **loss_dict_reduced)
optimizer.zero_grad() # clear accumulated gradients
losses.backward() # backpropagate to compute gradients
optimizer.step() # update the parameters
if pytorch_1_1_0_or_later:
scheduler.step()
batch_time = time.time() - end
end = time.time()
meters.update(time=batch_time, data=data_time) # update timing and loss statistics
eta_seconds = meters.time.global_avg * (max_iter - iteration)
eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
if iteration % 20 == 0 or iteration == max_iter:
logger.info(
meters.delimiter.join(
[
"eta: {eta}",
"iter: {iter}",
"{meters}",
"lr: {lr:.6f}",
"max mem: {memory:.0f}",
]
).format(
eta=eta_string,
iter=iteration,
meters=str(meters),
lr=optimizer.param_groups[0]["lr"],
memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
)
)
if iteration % checkpoint_period == 0:
checkpointer.save("model_{:07d}".format(iteration), **arguments)
if iteration == max_iter:
checkpointer.save("model_final", **arguments)
total_training_time = time.time() - start_training_time
total_time_str = str(datetime.timedelta(seconds=total_training_time))
logger.info(
"Total training time: {} ({:.4f} s / it)".format(
total_time_str, total_training_time / (max_iter)
)
)
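do_train calls reduce_loss_dict to average the per-loss values across GPUs before logging. A sketch of what that helper does, modeled on the maskrcnn-benchmark implementation (reconstructed from memory, so treat the details as approximate):

import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    """Average loss values over all processes so that rank 0 logs the
    true (averaged) losses instead of its local copy."""
    if not (dist.is_available() and dist.is_initialized()):
        return loss_dict # single process: nothing to reduce
    world_size = dist.get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = sorted(loss_dict.keys()) # fixed order on every rank
        all_losses = torch.stack([loss_dict[k] for k in loss_names], dim=0)
        dist.reduce(all_losses, dst=0) # sum the stacked losses onto rank 0
        if dist.get_rank() == 0:
            all_losses /= world_size # turn the sum into a mean
        return dict(zip(loss_names, all_losses))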
For the logging module, see the Python logging documentation; its main job is to emit logs (debugging output, runtime information, and so on).
def setup_logger(name, save_dir, distributed_rank, filename="log.txt"):
logger = logging.getLogger(name) # create a logger
logger.setLevel(logging.DEBUG) # by default only WARNING and above are emitted; lower the threshold to DEBUG
# don't log results for the non-master process
if distributed_rank > 0:
return logger
ch = logging.StreamHandler(stream=sys.stdout) # a StreamHandler writes log records to the console (stdout)
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") # create a log formatter
ch.setFormatter(formatter) # wire logger -> handler -> formatter for console output
logger.addHandler(ch)
if save_dir:
fh = logging.FileHandler(os.path.join(save_dir, filename))
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)
return logger
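A quick usage sketch (the save directory is illustrative and must already exist). Because of the early return above, non-master ranks get a logger with no handlers attached, so they stay silent:

logger = setup_logger("fcos_core", save_dir="training_dir", distributed_rank=0)
logger.info("Using 1 GPUs") # printed to stdout and appended to training_dir/log.txt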
def get_rank():
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def get_world_size():
if not dist.is_available():
return 1
if not dist.is_initialized():
return 1
return dist.get_world_size()
def make_optimizer(cfg, model):
logger = logging.getLogger("fcos_core.trainer")
params = []
for key, value in model.named_parameters():
if not value.requires_grad:
continue
lr = cfg.SOLVER.BASE_LR # 0.001
weight_decay = cfg.SOLVER.WEIGHT_DECAY # 0.0005
if "bias" in key:
lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR # 0.002
weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS # 0
if key.endswith(".offset.weight") or key.endswith(".offset.bias"):
logger.info("set lr factor of {} as {}".format(
key, cfg.SOLVER.DCONV_OFFSETS_LR_FACTOR
))
lr *= cfg.SOLVER.DCONV_OFFSETS_LR_FACTOR
params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM)
return optimizer
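Since make_optimizer creates one param group per trainable parameter, inspecting the optimizer shows one learning rate per parameter tensor. This is presumably how the list in the next subsection was printed, though the original note does not say:

optimizer = make_optimizer(cfg, model)
lrs = [group["lr"] for group in optimizer.param_groups]
print(lrs) # weight groups at BASE_LR, bias groups at BASE_LR * BIAS_LR_FACTOR
print(len(lrs)) # one entry per trainable parameter tensor
# once the warmup scheduler has stepped, every value is additionally
# scaled by the current warmup factor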
(2) The make_lr_scheduler function

Builds the learning-rate decay schedule. See the PyTorch torch.optim.lr_scheduler documentation (LambdaLR, StepLR, MultiStepLR, ExponentialLR) and the usual write-ups on PyTorch's six learning-rate adjustment strategies.
The learning-rate values printed during warmup look like this:

[0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000664, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000332, 0.000664, 0.000332, 0.000664]
89

Process finished with exit code 0

At first it was not clear what these numbers mean. The most plausible reading: make_optimizer above puts every trainable parameter in its own param group, so this is one learning rate per group (the 89 is a separate print, presumably the group count). Bias groups run at twice the base rate (BIAS_LR_FACTOR = 2), and during warmup every rate is scaled by the warmup factor of roughly 1/3, hence the two values 0.001 / 3 ≈ 0.000332 and 0.002 / 3 ≈ 0.000664.
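The scheduler behind make_lr_scheduler is fcos_core's WarmupMultiStepLR. A condensed sketch of its get_lr logic, reconstructed from the maskrcnn-benchmark code (treat the details as approximate):

from bisect import bisect_right
import torch

class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    """Multi-step decay preceded by a warmup phase."""

    def __init__(self, optimizer, milestones, gamma=0.1, warmup_factor=1.0 / 3,
                 warmup_iters=500, warmup_method="linear", last_epoch=-1):
        self.milestones = list(milestones)
        self.gamma = gamma
        self.warmup_factor = warmup_factor
        self.warmup_iters = warmup_iters
        self.warmup_method = warmup_method
        super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        warmup_factor = 1
        if self.last_epoch < self.warmup_iters:
            if self.warmup_method == "constant":
                warmup_factor = self.warmup_factor
            else: # "linear": ramp from warmup_factor up to 1
                alpha = float(self.last_epoch) / self.warmup_iters
                warmup_factor = self.warmup_factor * (1 - alpha) + alpha
        # after warmup: multiply by gamma once for every milestone passed
        return [base_lr * warmup_factor *
                self.gamma ** bisect_right(self.milestones, self.last_epoch)
                for base_lr in self.base_lrs]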
def build_dataset(dataset_list, transforms, dataset_catalog, is_train=True):
"""
Arguments:
dataset_list (list[str]): Contains the names of the datasets, e.g.,
coco_2014_train, coco_2014_val, etc.
transforms (callable): transforms to apply to each (image, target) sample
dataset_catalog (DatasetCatalog): contains the information on how to
construct a dataset.
is_train (bool): whether to setup the dataset for training or testing
"""
if not isinstance(dataset_list, (list, tuple)):
raise RuntimeError(
"dataset_list should be a list of strings, got {}".format(dataset_list)
)
datasets = []
for dataset_name in dataset_list: # dataset names, e.g. "coco_2014_train"
data = dataset_catalog.get(dataset_name) # resolve the name to a factory and file paths
factory = getattr(D, data["factory"])
args = data["args"]
# for COCODataset, we want to remove images without annotations
# during training
if data["factory"] == "COCODataset":
args["remove_images_without_annotations"] = is_train
if data["factory"] == "PascalVOCDataset":
args["use_difficult"] = not is_train
args["transforms"] = transforms
# make dataset from factory
dataset = factory(**args)
datasets.append(dataset)
# for testing, return a list of datasets
if not is_train:
return datasets
# for training, concatenate all datasets into a single one
dataset = datasets[0]
if len(datasets) > 1:
dataset = D.ConcatDataset(datasets)
return [dataset]
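An illustrative call, assuming the datasets/ directory is populated as described by DatasetCatalog below (transforms=None is only for demonstration):

datasets = build_dataset(
    dataset_list=("coco_2014_train", "coco_2014_valminusminival"),
    transforms=None,
    dataset_catalog=DatasetCatalog,
    is_train=True,
)
# is_train=True -> [ConcatDataset(train + valminusminival)] (a one-element list)
# is_train=False -> [dataset_1, dataset_2, ...] (one entry per dataset name)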
def make_data_sampler(dataset, shuffle, distributed):
if distributed:
return samplers.DistributedSampler(dataset, shuffle=shuffle)
if shuffle: # True during training
sampler = torch.utils.data.sampler.RandomSampler(dataset)
else:
sampler = torch.utils.data.sampler.SequentialSampler(dataset)
return sampler
class DatasetCatalog(object):
DATA_DIR = "datasets"
DATASETS = {
"coco_2017_train": {
"img_dir": "coco/train2017",
"ann_file": "coco/annotations/instances_train2017.json"
},
"coco_2017_val": {
"img_dir": "coco/val2017",
"ann_file": "coco/annotations/instances_val2017.json"
},
"coco_2017_test_dev": {
"img_dir": "coco/test2017",
"ann_file": "coco/annotations/image_info_test-dev2017.json"
},
"coco_2014_train": {
"img_dir": "coco/train2014",
"ann_file": "coco/annotations/instances_train2014.json"
},
"coco_2014_val": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_val2014.json"
},
"coco_2014_minival": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_minival2014.json"
},
"coco_2014_valminusminival": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/instances_valminusminival2014.json"
},
"keypoints_coco_2014_train": {
"img_dir": "coco/train2014",
"ann_file": "coco/annotations/person_keypoints_train2014.json",
},
"keypoints_coco_2014_val": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/person_keypoints_val2014.json"
},
"keypoints_coco_2014_minival": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/person_keypoints_minival2014.json",
},
"keypoints_coco_2014_valminusminival": {
"img_dir": "coco/val2014",
"ann_file": "coco/annotations/person_keypoints_valminusminival2014.json",
},
"voc_2007_train": {
"data_dir": "voc/VOC2007",
"split": "train"
},
"voc_2007_train_cocostyle": {
"img_dir": "voc/VOC2007/JPEGImages",
"ann_file": "voc/VOC2007/Annotations/pascal_train2007.json"
},
"voc_2007_val": {
"data_dir": "voc/VOC2007",
"split": "val"
},
"voc_2007_val_cocostyle": {
"img_dir": "voc/VOC2007/JPEGImages",
"ann_file": "voc/VOC2007/Annotations/pascal_val2007.json"
},
"voc_2007_test": {
"data_dir": "voc/VOC2007",
"split": "test"
},
"voc_2007_test_cocostyle": {
"img_dir": "voc/VOC2007/JPEGImages",
"ann_file": "voc/VOC2007/Annotations/pascal_test2007.json"
},
"voc_2012_train": {
"data_dir": "voc/VOC2012",
"split": "train"
},
"voc_2012_train_cocostyle": {
"img_dir": "voc/VOC2012/JPEGImages",
"ann_file": "voc/VOC2012/Annotations/pascal_train2012.json"
},
"voc_2012_val": {
"data_dir": "voc/VOC2012",
"split": "val"
},
"voc_2012_val_cocostyle": {
"img_dir": "voc/VOC2012/JPEGImages",
"ann_file": "voc/VOC2012/Annotations/pascal_val2012.json"
},
"voc_2012_test": {
"data_dir": "voc/VOC2012",
"split": "test"
# PASCAL VOC2012 did not make the test annotations available, so there is no json annotation
},
"cityscapes_fine_instanceonly_seg_train_cocostyle": {
"img_dir": "cityscapes/images",
"ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json"
},
"cityscapes_fine_instanceonly_seg_val_cocostyle": {
"img_dir": "cityscapes/images",
"ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json"
},
"cityscapes_fine_instanceonly_seg_test_cocostyle": {
"img_dir": "cityscapes/images",
"ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json"
}
}
@staticmethod
def get(name):
if "coco" in name:
data_dir = DatasetCatalog.DATA_DIR
attrs = DatasetCatalog.DATASETS[name]
args = dict(
root=os.path.join(data_dir, attrs["img_dir"]),
ann_file=os.path.join(data_dir, attrs["ann_file"]),
)
return dict(
factory="COCODataset",
args=args,
)
elif "voc" in name:
data_dir = DatasetCatalog.DATA_DIR
attrs = DatasetCatalog.DATASETS[name]
args = dict(
data_dir=os.path.join(data_dir, attrs["data_dir"]),
split=attrs["split"],
)
return dict(
factory="PascalVOCDataset",
args=args,
)
raise RuntimeError("Dataset not available: {}".format(name))
class ModelCatalog(object):
S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
C2_IMAGENET_MODELS = {
"MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
"MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
"MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
"MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
"FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
"FAIR/20171220/X-101-64x4d": "ImageNetPretrained/20171220/X-101-64x4d.pkl",
}
C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl"
C2_DETECTRON_MODELS = {
"35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
"35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
"35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
"36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
"35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
"35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
"35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
"36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
"37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK",
# keypoints
"37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao"
}
@staticmethod
def get(name):
if name.startswith("Caffe2Detectron/COCO"):
return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
if name.startswith("ImageNetPretrained"):
return ModelCatalog.get_c2_imagenet_pretrained(name)
raise RuntimeError("model not present in the catalog {}".format(name))
@staticmethod
def get_c2_imagenet_pretrained(name):
prefix = ModelCatalog.S3_C2_DETECTRON_URL
name = name[len("ImageNetPretrained/"):]
name = ModelCatalog.C2_IMAGENET_MODELS[name]
url = "/".join([prefix, name])
return url
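This is how the WEIGHT value from the config at the top of this section is resolved (the catalog:// prefix is stripped earlier, in the checkpoint-loading code):

>>> ModelCatalog.get("ImageNetPretrained/MSRA/R-50")
'https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl'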
@staticmethod
def get_c2_detectron_12_2017_baselines(name):
# Detectron C2 models are stored following the structure
# prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
# we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
prefix = ModelCatalog.S3_C2_DETECTRON_URL
dataset_tag = "keypoints_" if "keypoint" in name else ""
suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag)
# remove identification prefix
name = name[len("Caffe2Detectron/COCO/"):]
# split in <model_id> and <model_name>
model_id, model_name = name.split("/")
# parsing to make it match the url address from the Caffe2 models
model_name = "{}.yaml".format(model_name)
signature = ModelCatalog.C2_DETECTRON_MODELS[name]
unique_name = ".".join([model_name, signature])
url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
return url
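Putting the pieces together for one of the entries above, the URL is assembled mechanically:

>>> ModelCatalog.get("Caffe2Detectron/COCO/35857345/e2e_faster_rcnn_R-50-FPN_1x")
'https://dl.fbaipublicfiles.com/detectron/35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I/output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl'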