yolov5-5的train函数简单流程
1、开始运行
2、创建opt对象
# Build the `opt` namespace from the command line; it is passed to main() below.
opt = parse_opt()
def parse_opt(known=False):
    """Build the training argument parser and parse ``sys.argv``.

    Args:
        known: When true, unrecognised command-line arguments are ignored
            (``parse_known_args``); otherwise they cause argparse to exit
            with an error (``parse_args``).

    Returns:
        argparse.Namespace holding one attribute per option declared below.
    """
    print('创建opt对象')
    parser = argparse.ArgumentParser()

    # Model / data / hyper-parameter files.
    parser.add_argument('--weights', type=str, default='yolov5s.pt', help='选择训练的权重文件')
    parser.add_argument('--cfg', type=str, default='', help='模型配置文件,例子:yolov5s.yaml')
    parser.add_argument('--data', type=str, default='data/coco128.yaml', help='# 数据集配置文件,fruit.yaml所在位置')
    parser.add_argument('--hyp', type=str, default='data/hyps/hyp.scratch.yaml', help='初始超参文件')

    # Basic training schedule and output location.
    parser.add_argument('--epochs', type=int, default=300, help='训练轮次')
    parser.add_argument('--batch-size', type=int, default=16, help='训练批次大小')
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=640, help='训练、测试图片分辨率大小')
    parser.add_argument('--project', default='runs/train', help='训练结果保存的根目录 默认是runs/train')
    parser.add_argument('--name', default='exp', help='训练结果保存的目录 默认是runs/train/exp')

    # `--resume` alone yields True; `--resume path` yields the path string.
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='是否接着上次的结果接着训练,默认False')
    parser.add_argument('--rect', action='store_true', help='是否采用矩形训练,默认False')
    parser.add_argument('--noautoanchor', action='store_true', help='不自动调整anchor 默认False(自动调整anchor)')
    parser.add_argument('--multi-scale', action='store_true', help='是否进行多尺度训练 默认False')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='标签平滑增强 默认0.0不增强 要增强一般就设为0.1')
    parser.add_argument('--linear-lr', action='store_true', help='是否使用linear lr 线性学习率 默认False 使用cosine lr')

    # `--evolve` alone yields 300 generations; when omitted the value is None.
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='是否进行超参进化 默认False')
    parser.add_argument('--cache-images', action='store_true', help='是否提前缓存图片到内存,以加快训练速度 默认False')
    parser.add_argument('--image-weights', action='store_true', help='使用加权图像选择进行训练')
    parser.add_argument('--single-cls', action='store_true', help='数据集是否只有一个类别,默认False')
    parser.add_argument('--adam', action='store_true', help='是否使用adam优化器 默认False(使用SGD)')
    parser.add_argument('--sync-bn', action='store_true', help='是否使用跨卡同步BN,在DDP模式使用 默认False')
    parser.add_argument('--nosave', action='store_true', help='仅保存最后一个模型')
    parser.add_argument('--noval', action='store_true', help='是否只测试最后一轮 默认False True: 只测试最后一轮 False: 每轮训练完都测试mAP')
    parser.add_argument('--bucket', type=str, default='', help='谷歌云盘bucket,一般不会用到')
    parser.add_argument('--device', default='', help='选择训练设备(GPUorCPU)')
    parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers')
    parser.add_argument('--entity', default=None, help='W&B entity')
    parser.add_argument('--exist-ok', action='store_true', help='如果文件不存在就新建或increment name 默认False(默认文件都是不存在的)')
    parser.add_argument('--quad', action='store_true', help='dataloader获取数据时, 是否使用collate_fn4代替collate_fn 默认False')

    # Weights & Biases logging options.
    parser.add_argument('--upload_dataset', action='store_true', help='Upload dataset as W&B artifact table')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B')
    parser.add_argument('--save_period', type=int, default=-1, help='每一个“保存期”后的日志模型')
    parser.add_argument('--artifact_alias', type=str, default="latest", help='要使用的数据集工件的版本')
    parser.add_argument('--local_rank', type=int, default=-1, help='rank为进程编号, -1且gpu=1时不进行分布式')

    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt
3、main函数
# Entry point of the flow described in sections 3.1-3.6; receives the parsed options.
main(opt)
3.1 日志初始化
# Configure logging for this process; RANK is a module-level value defined outside this excerpt.
set_logging(RANK)
3.2 打印所有训练opt参数
# Only ranks -1 and 0 print the options and run the environment checks
# (presumably -1 = non-distributed run, 0 = DDP master; confirm where RANK is set).
if RANK in [-1, 0]:
    print('3.2.1:本次训练设置的参数有: ')
    # Dump every parsed option as "name = value".
    for i, j in vars(opt).items():
        print(i, '=', j)
    print('3.2.1:打印完毕--------------------')
    print('3.2.2:检查代码版本是否是最新的--------------------')
    check_git_status()
    print('3.2.3:检查requirements.txt所需包是否都满足---------------')
    # 'thop' is excluded from the requirements check.
    check_requirements(exclude=['thop'])
3.3 wandb logging初始化
# Truthy result makes the next step skip the local-checkpoint resume branch
# (presumably the run is then resumed through W&B; confirm in check_wandb_resume).
wandb_run = check_wandb_resume(opt)
3.4 判断是继续上回打断的训练还是重新训练
if opt.resume and not wandb_run:
    # Resume an interrupted run from a local checkpoint.
    print('3.4.1:继续中断的训练, 一般是这个')
    # An explicit path wins; a bare --resume falls back to the most recent run.
    ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()
    assert os.path.isfile(ckpt), '错误:--恢复检查点不存在'
    # Replace the CLI options with the ones saved two directories above the checkpoint.
    with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
        opt = argparse.Namespace(**yaml.safe_load(f))
    opt.cfg, opt.weights, opt.resume = '', ckpt, True
    LOGGER.info(f'Resuming training from {ckpt}')
else:
    # Fresh run: resolve the config files and create a new output directory.
    print('3.4.2:不继续中断的训练,重新开始-----------------')
    opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)
    assert len(opt.cfg) or len(opt.weights), '必须指定--cfg或--weights'
    # Evolution runs always use the fixed name 'evolve' and reuse the same directory.
    opt.name = 'evolve' if opt.evolve else opt.name
    opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok or opt.evolve))
3.5 ddp模式
device = select_device(opt.device, batch_size=opt.batch_size)
print('3.5.1:选择设备,设备为:', device)
if LOCAL_RANK != -1:
    # Distributed (DDP) run: bind this process to its own GPU and join the process group.
    print('3.5.2.1:进行多GPU训练------------------------------------------------')
    from datetime import timedelta
    assert torch.cuda.device_count() > LOCAL_RANK, '用于DDP命令的CUDA设备不足'
    torch.cuda.set_device(LOCAL_RANK)
    device = torch.device('cuda', LOCAL_RANK)
    # Prefer NCCL when available, otherwise fall back to gloo; 60 s group timeout.
    dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo",
                            timeout=timedelta(seconds=60))
    assert opt.batch_size % WORLD_SIZE == 0, '--batch-size必须是CUDA设备计数的倍数'
    assert not opt.image_weights, '--图像权重参数(--image-weights argument)与DDP训练不兼容'
else:
    print('3.5.2.2:不进行多GPU训练------------------------------------------------')
3.6 进化算法
if not opt.evolve:
    # Normal path: a single training run.
    print('main3.6.1:不使用进化算法 正常开始训练,直接调用train()函数,开始训练------------------------------')
    train(opt.hyp, opt, device)
    if WORLD_SIZE > 1 and RANK == 0:
        print('销毁进程组... ', end='')
        dist.destroy_process_group()
        print('Done.')
else:
    print('main3.6.2:使用进化算法----------------------------------------')
    # Per hyper-parameter: (mutation gain, lower limit, upper limit).
    # The gain feeds the mutation factor below; the limits clamp the mutated value.
    meta = {'lr0': (1, 1e-5, 1e-1),
            'lrf': (1, 0.01, 1.0),
            'momentum': (0.3, 0.6, 0.98),
            'weight_decay': (1, 0.0, 0.001),
            'warmup_epochs': (1, 0.0, 5.0),
            'warmup_momentum': (1, 0.0, 0.95),
            'warmup_bias_lr': (1, 0.0, 0.2),
            'box': (1, 0.02, 0.2),
            'cls': (1, 0.2, 4.0),
            'cls_pw': (1, 0.5, 2.0),
            'obj': (1, 0.2, 4.0),
            'obj_pw': (1, 0.5, 2.0),
            'iou_t': (0, 0.1, 0.7),
            'anchor_t': (1, 2.0, 8.0),
            'anchors': (2, 2.0, 10.0),
            'fl_gamma': (0, 0.0, 2.0),
            'hsv_h': (1, 0.0, 0.1),
            'hsv_s': (1, 0.0, 0.9),
            'hsv_v': (1, 0.0, 0.9),
            'degrees': (1, 0.0, 45.0),
            'translate': (1, 0.0, 0.9),
            'scale': (1, 0.0, 0.9),
            'shear': (1, 0.0, 10.0),
            'perspective': (0, 0.0, 0.001),
            'flipud': (1, 0.0, 1.0),
            'fliplr': (0, 0.0, 1.0),
            'mosaic': (1, 0.0, 1.0),
            'mixup': (1, 0.0, 1.0),
            'copy_paste': (1, 0.0, 1.0)}

    with open(opt.hyp) as f:
        hyp = yaml.safe_load(f)
        if 'anchors' not in hyp:
            hyp['anchors'] = 3
    assert LOCAL_RANK == -1, '未为--evolve实现DDP模式e'
    # During evolution, validation and checkpointing are restricted to the final epoch.
    opt.noval, opt.nosave = True, True
    yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml'
    if opt.bucket:
        # Fetch previous evolution results from the bucket, if one is configured.
        os.system(f'gsutil cp gs://{opt.bucket}/evolve.txt .')

    # Evolution strategy: pick a base hyp from previous generations, then mutate it.
    # evolve.txt stores results + hyp for every earlier generation. The rows are
    # sorted by fitness (best first), the top n are weighted by fitness, and the
    # base hyp is obtained in one of two ways:
    #   'single'   - choose one previous hyp at random, weighted by fitness:
    #                random.choices(range(n), weights=w)
    #   'weighted' - blend all previous hyps by their weights:
    #                (x * w.reshape(n, 1)).sum(0) / w.sum()
    for _ in range(opt.evolve):
        if Path('evolve.txt').exists():
            # Select the parent(s) from earlier generations.
            parent = 'single'
            x = np.loadtxt('evolve.txt', ndmin=2)
            n = min(5, len(x))  # consider at most the 5 best previous results
            x = x[np.argsort(-fitness(x))][:n]
            w = fitness(x) - fitness(x).min() + 1E-6  # strictly positive weights
            if parent == 'single' or len(x) == 1:
                x = x[random.choices(range(n), weights=w)[0]]
            elif parent == 'weighted':
                x = (x * w.reshape(n, 1)).sum(0) / w.sum()

            # Mutate: mp is the per-parameter mutation probability, s the sigma.
            mp, s = 0.8, 0.2
            npr = np.random
            npr.seed(int(time.time()))
            g = np.array([x[0] for x in meta.values()])  # gains
            ng = len(meta)
            v = np.ones(ng)
            while all(v == 1):  # retry until at least one factor changes
                v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
            # Hyper-parameter columns start at index 7 of each evolve.txt row
            # (presumably the first 7 columns are the results; confirm against print_mutation).
            for i, k in enumerate(hyp.keys()):
                hyp[k] = float(x[i + 7] * v[i])

        # Clamp every hyper-parameter to its limits and round to 5 decimals.
        for k, v in meta.items():
            hyp[k] = max(hyp[k], v[1])
            hyp[k] = min(hyp[k], v[2])
            hyp[k] = round(hyp[k], 5)

        # Train with the mutated hyp and record the outcome.
        results = train(hyp.copy(), opt, device)
        print_mutation(hyp.copy(), results, yaml_file, opt.bucket)

    plot_evolution(yaml_file)
    print(f'超参数演化完成。最佳结果另存为: {yaml_file}\n'
          f'命令使用这些超参数训练新模型: $ python train.py --hyp {yaml_file}')
4、train()函数
def train(hyp, opt, device):
    """Run one training session (sections 4.1-4.9 of these notes).

    :params hyp: path to a hyper-parameter yaml (e.g. data/hyps/hyp.scratch.yaml)
                 or an already-loaded hyp dictionary
    :params opt: the opt namespace prepared in main
    :params device: the device currently in use
    """
    print('进入train函数--------------------------------------------------')
4.1 初始化参数和配置信息
print('train4.1.1:初始化opt参数-----------------------------------')
# Pull the frequently used options into local names.
save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, = \
    opt.save_dir, opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
    opt.resume, opt.noval, opt.nosave, opt.workers

print('train4.1.2:设置保存pt文件结果的路径------------------------------')
save_dir = Path(save_dir)
wdir = save_dir / 'weights'
wdir.mkdir(parents=True, exist_ok=True)
last = wdir / 'last.pt'
best = wdir / 'best.pt'
results_file = save_dir / 'results.txt'

print('train4.1.3:加载超参信息---------------------------------------------')
# hyp may arrive as a yaml path (normal run) or as a dict (evolve run).
if isinstance(hyp, str):
    with open(hyp) as f:
        hyp = yaml.safe_load(f)

print('train4.1.4:日志输出超参信息------------------------------------------')
print('超参设置为:')
for i, j in hyp.items():
    print(i, '==', j)
print('train4.1.4:输出完毕------------------------------------------')

print('train4.1.5:保存运行设置---------------------------------------------------------------')
with open(save_dir / 'hyp.yaml', 'w') as f:
    yaml.safe_dump(hyp, f, sort_keys=False)
print('train4.1.6:保存opt--------------------------------------------------------------')
with open(save_dir / 'opt.yaml', 'w') as f:
    yaml.safe_dump(vars(opt), f, sort_keys=False)

print('train4.1.7:判断是否需要画图')
plots = not evolve  # no plots while evolving hyper-parameters
cuda = device.type != 'cpu'
print('train4.1.8:设置一系列的随机数种子')
init_seeds(1 + RANK)
print('train4.1.9:加载VOC.yaml中的数据配置信息')
with open(data, 'rb') as f:
    data_dict = yaml.safe_load(f)

# Logger handles; populated only on ranks -1 and 0.
loggers = {'wandb': None, 'tb': None}
if RANK in [-1, 0]:
    if not evolve:
        prefix = colorstr('tensorboard: ')
        LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
        loggers['tb'] = SummaryWriter(str(save_dir))

    opt.hyp = hyp
    # Reuse the W&B run id stored in the checkpoint, but only when resuming.
    run_id = torch.load(weights).get('wandb_id') if weights.endswith('.pt') and os.path.isfile(weights) else None
    run_id = run_id if opt.resume else None
    wandb_logger = WandbLogger(opt, save_dir.stem, run_id, data_dict)
    loggers['wandb'] = wandb_logger.wandb
    if loggers['wandb']:
        data_dict = wandb_logger.data_dict
        # Re-read these from opt (presumably WandbLogger may update opt; confirm there).
        weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

nc = 1 if single_cls else int(data_dict['nc'])
names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']
assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, data)
print('数据集类别数(自己设置的) == ', nc)
print('数据集所有类别(自己设置的) == ', names)
print('train4.1.10:判断当前数据集是否是coco数据集(80个类别) ')
is_coco = data.endswith('coco.yaml') and nc == 80
4.2 模型模块
print('train4.2.1:载入模型-------------------------------------------------')
pretrained = weights.endswith('.pt')
if pretrained:
    print('train4.2.1.1:使用预训练(一般是这个------------------------------------------------')
    print('train4.2.1.1.1:同步不同进程对数据读取的上下文管理器-------------------------------')
    with torch_distributed_zero_first(RANK):
        weights = attempt_download(weights)
    ckpt = torch.load(weights, map_location=device)
    print('train4.2.1.1.2:加载模型及参数-------------------------------')
    # Build from --cfg when given, otherwise from the yaml stored in the checkpoint.
    model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)
    # Skip anchor tensors when a cfg or hyp anchors are supplied, unless resuming.
    exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []
    state_dict = ckpt['model'].float().state_dict()
    state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)
    # strict=False because only the intersecting entries are loaded.
    model.load_state_dict(state_dict, strict=False)
    LOGGER.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))
else:
    print('train4.2.1.2:不使用预训练-------------------------------------------------')
    model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)

print('train4.2.2:检查数据集 如果本地没有则从torch库中下载并解压数据集---------------------')
with torch_distributed_zero_first(RANK):
    check_dataset(data_dict)

print('train4.2.3:数据集参数-----------------------------------------')
train_path = data_dict['train']
val_path = data_dict['val']
# Example values from the note author's own dataset:
#   train_path == D:/yolo5-5/yolov5/paper_data/train.txt
#   val_path   == D:/yolo5-5/yolov5/paper_data/test.txt
#   data_dict  == {'train': 'D:/yolo5-5/yolov5/paper_data/train.txt',
#                  'val': 'D:/yolo5-5/yolov5/paper_data/test.txt',
#                  'nc': 10,
#                  'names': ['AlligatorCrack', 'TransverseCrack', 'LongitudinalCrack',
#                            'Sealling', 'SeallingCrack', 'Patch', 'Loose', 'LaneMarking',
#                            'Joint', 'IndicatingArrow']}

print('train4.2.4:冻结权重层------------------------------------')
# Name fragments of parameters to freeze; empty means every parameter is trained.
freeze = []
for k, v in model.named_parameters():
    v.requires_grad = True
    if any(x in k for x in freeze):
        print('freezing %s' % k)
        v.requires_grad = False
4.3 优化器设置
nbs = 64  # nominal batch size
# Number of batches to accumulate before each optimizer step, so that
# batch_size * accumulate approximates nbs.
accumulate = max(round(nbs / batch_size), 1)
print('累积损耗优化(accumulate) == ', accumulate)
print('train4.3.1:根据accumulate设置超参: 权重衰减参数------------------------------------')
# Scale weight decay by the effective batch size relative to nbs.
hyp['weight_decay'] *= batch_size * accumulate / nbs
LOGGER.info(f"权重衰减 = {hyp['weight_decay']}")

print('train4.3.2:将模型参数分为三组(weights、biases、bn)来进行分组优化------------------------------------')
# pg0: BatchNorm weights, pg1: other weights (the only group given weight decay), pg2: biases.
pg0, pg1, pg2 = [], [], []
for k, v in model.named_modules():
    if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
        pg2.append(v.bias)
    if isinstance(v, nn.BatchNorm2d):
        pg0.append(v.weight)
    elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
        pg1.append(v.weight)

print('train4.3.3:选择优化器 并设置pg0(bn参数)的优化方式------------')
if opt.adam:
    print('train4.3.3.1:opt.adam--------------------------')
    # hyp['momentum'] is reused as Adam's beta1.
    optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))
else:
    print('train4.3.3.2:no opt.adam--------------------------')
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

print('train4.3.4:设置pg1(weights)的优化方式-------------------------------')
optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})
print('train4.3.5:设置pg2(biases)的优化方式-------------------------------')
optimizer.add_param_group({'params': pg2})
print('train4.3.6:打印log日志 优化信息-------------------------------')
LOGGER.info('优化器组: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
print('train4.3.7:删除三个变量 优化代码-------------------------------')
del pg0, pg1, pg2
4.4 学习率模块
print('train4.4.1:学习率方式选择----------------------')
if opt.linear_lr:
    print('train4.4.1.1:使用线性学习率----------------------------------')
    # Linear decay of the LR multiplier from 1.0 (epoch 0) to hyp['lrf'] (last epoch).
    lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']
else:
    print('train4.4.1.2:使用one_cycle学习率(一般是这个)-------------------------')
    lf = one_cycle(1, hyp['lrf'], epochs)
print('train4.4.2:实例化-------------------------')
# lf maps the epoch index to a multiplier applied to each group's initial LR.
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
4.5 训练前最后准备模块
print('train4.5.1:单卡训练: 使用EMA(指数移动平均)对模型的参数做平均, 一种给予近期数据更高权重的平均方法--------------')
# The EMA copy of the model is kept only on ranks -1 and 0.
ema = ModelEMA(model) if RANK in [-1, 0] else None

start_epoch, best_fitness = 0, 0.0
if pretrained:
    print('train4.5.2:单使用预训练--------------------')
    # Restore optimizer state and best fitness.
    if ckpt['optimizer'] is not None:
        optimizer.load_state_dict(ckpt['optimizer'])
        best_fitness = ckpt['best_fitness']

    # Restore EMA weights and update counter.
    if ema and ckpt.get('ema'):
        ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
        ema.updates = ckpt['updates']

    # Restore the results log text.
    if ckpt.get('training_results') is not None:
        results_file.write_text(ckpt['training_results'])

    # Continue from the epoch after the one stored in the checkpoint.
    start_epoch = ckpt['epoch'] + 1
    if resume:
        assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
    if epochs < start_epoch:
        # Checkpoint already passed the requested epoch count: treat `epochs` as additional epochs.
        LOGGER.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
                    (weights, ckpt['epoch'], epochs))
        epochs += ckpt['epoch']

    del ckpt, state_dict

print('train4.5.3:获取模型最大stride(步长) [32 16 8]-----------')
gs = max(int(model.stride.max()), 32)  # grid size: largest stride, at least 32
print('gs == ', gs)
print('train4.5.4:有多少个detect(检测层数)--------------------')
nl = model.model[-1].nl  # read from the last module of the model
print('nl == ', nl)
print('train4.5.5:获取训练图片和测试图片分辨率--------------------------')
imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)
print('imgsz == ', imgsz)

print('train4.5.6:是否使用DP mode(单机多卡模式 )-------------------------')
# DataParallel is used only for a non-distributed run that sees several GPUs.
if cuda and RANK == -1 and torch.cuda.device_count() > 1:
    print('train4.5.6.1:使用DP mode----------------')
    logging.warning('DP not recommended, instead use torch.distributed.run for best DDP Multi-GPU results.\n'
                    'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
    model = torch.nn.DataParallel(model)
else:
    print('train4.5.6.2:不使用DP mode-------------------')

print('train4.5.7:是否使用跨卡BN----------------------')
if opt.sync_bn and cuda and RANK != -1:
    print('train5.7.1:使用跨卡BN-----------------------------')
    # NOTE(review): SyncBatchNorm is disabled by this raise; the two statements
    # after it are unreachable and kept only for reference.
    raise Exception('can not train with --sync-bn, known issue https://github.com/ultralytics/yolov5/issues/3998')
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
    LOGGER.info('Using SyncBatchNorm()')
else:
    print('train5.7.2:不使用跨卡BN-----------------------------')
4.6 数据加载模块
print('train4.6.1:create_dataloader???------------------------')
# The per-process batch size is the global batch size divided by WORLD_SIZE.
train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
                                          hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=RANK,
                                          workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                          prefix=colorstr('train: '))
print('train4.6.2:获取标签中最大类别值,与类别数作比较,如果大于等于类别数则表示有问题--')
# Column 0 of each label row is read as the class index.
mlc = np.concatenate(dataset.labels, 0)[:, 0].max()
nb = len(train_loader)  # number of batches per epoch
print('标签中最大类别值 == ', mlc)
print('每轮batch数(nb) == ', nb)
assert mlc < nc, '标签中最大类别值= %g ,超过了 设置的类别数=%g in %s. 可能的类标签是 0-%g' % (mlc, nc, data, nc - 1)

if RANK in [-1, 0]:
    # Validation loader: double batch size, rectangular batches, rank=-1.
    val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
                                   hyp=hyp, cache=opt.cache_images and not noval, rect=True, rank=-1,
                                   workers=workers, pad=0.5,
                                   prefix=colorstr('val: '))[0]

    if not resume:
        print('train4.6.3.1:如果不使用断点续训----------------')
        print('train4.6.3.1.1:统计dataset的label信息--------------')
        labels = np.concatenate(dataset.labels, 0)
        if plots:
            print('train4.6.3.1.2:plots可视化数据集labels信息------')
            plot_labels(labels, names, save_dir, loggers)

        if not opt.noautoanchor:
            print('train4.6.3.1.3:计算默认锚框anchor与数据集标签框的高宽比------')
            check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
        print('train4.6.3.1.4:预降锚精度---------------------')
        # Round-trip through half precision (the trace message above calls this
        # pre-reducing anchor precision).
        model.half().float()

# Wrap the model for distributed training.
if cuda and RANK != -1:
    model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)
4.7 训练开始模块
# Rescale the loss gains: box/cls/obj by 3/nl, cls additionally by nc/80 and
# obj by (imgsz/640)^2.
hyp['box'] *= 3. / nl
hyp['cls'] *= nc / 80. * 3. / nl
hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl
hyp['label_smoothing'] = opt.label_smoothing
# Attach training metadata to the model.
model.nc = nc
model.hyp = hyp
model.gr = 1.0
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc
model.names = names

t0 = time.time()
# Warm-up length in iterations: warmup_epochs worth of batches, at least 1000.
nw = max(round(hyp['warmup_epochs'] * nb), 1000)
last_opt_step = -1
maps = np.zeros(nc)  # per-class mAP, used for image weighting
results = (0, 0, 0, 0, 0, 0, 0)
scheduler.last_epoch = start_epoch - 1  # align the scheduler with the starting epoch
scaler = amp.GradScaler(enabled=cuda)
compute_loss = ComputeLoss(model)
LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
            f'Using {train_loader.num_workers} dataloader workers\n'
            f'Logging results to {save_dir}\n'
            f'Starting training for {epochs} epochs...')
print('开始啦!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
for epoch in range(start_epoch, epochs):
    print('这是第', epoch, '轮')
    model.train()

    # Optional: resample image indices so poorly performing classes are seen more often.
    if opt.image_weights:
        if RANK in [-1, 0]:
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)
        # In distributed mode rank 0 broadcasts its indices to the other ranks.
        if RANK != -1:
            indices = (torch.tensor(dataset.indices) if RANK == 0 else torch.zeros(dataset.n)).int()
            dist.broadcast(indices, 0)
            if RANK != 0:
                dataset.indices = indices.cpu().numpy()

    mloss = torch.zeros(4, device=device)  # running mean of the loss items
    if RANK != -1:
        train_loader.sampler.set_epoch(epoch)
    pbar = enumerate(train_loader)
    LOGGER.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'labels', 'img_size'))
    if RANK in [-1, 0]:
        pbar = tqdm(pbar, total=nb)
    optimizer.zero_grad()
    for i, (imgs, targets, paths, _) in pbar:
        ni = i + nb * epoch  # batches processed since the start of training
        imgs = imgs.to(device, non_blocking=True).float() / 255.0  # scale pixel values by 1/255

        # Warm-up: interpolate accumulate, LR and momentum over the first nw iterations.
        if ni <= nw:
            xi = [0, nw]
            accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
            for j, x in enumerate(optimizer.param_groups):
                # Group 2 (biases) starts at warmup_bias_lr, the others at 0.0;
                # all end at initial_lr * lf(epoch).
                x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                if 'momentum' in x:
                    x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

        # Multi-scale: pick a random size in [0.5, 1.5] * imgsz, as a multiple of gs.
        if opt.multi_scale:
            # randrange needs integer bounds; float arguments are rejected on current Python.
            sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs
            sf = sz / max(imgs.shape[2:])  # scale factor
            if sf != 1:
                ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape, stretched to gs multiples
                imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

        # Forward pass under autocast (enabled only on CUDA).
        with amp.autocast(enabled=cuda):
            pred = model(imgs)
            loss, loss_items = compute_loss(pred, targets.to(device))
            if RANK != -1:
                loss *= WORLD_SIZE  # scale the loss by the number of processes in DDP mode
            if opt.quad:
                loss *= 4.

        # Backward pass.
        scaler.scale(loss).backward()

        # Step the optimizer once every `accumulate` batches.
        if ni - last_opt_step >= accumulate:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if ema:
                ema.update(model)
            last_opt_step = ni

        # Progress bar, sample-batch plots and graph logging (ranks -1 and 0 only).
        if RANK in [-1, 0]:
            mloss = (mloss * i + loss_items) / (i + 1)  # update the running mean
            mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)
            s = ('%10s' * 2 + '%10.4g' * 6) % (
                f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])
            pbar.set_description(s)

            if plots and ni < 3:
                # Plot the first three training batches in background threads.
                f = save_dir / f'train_batch{ni}.jpg'
                Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
                if loggers['tb'] and ni == 0:
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore')  # silence warnings raised while tracing
                        loggers['tb'].add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
            elif plots and ni == 10 and loggers['wandb']:
                wandb_logger.log({'Mosaics': [loggers['wandb'].Image(str(x), caption=x.name) for x in
                                              save_dir.glob('train*.jpg') if x.exists()]})

    # End of epoch: record the LRs for logging, then advance the scheduler.
    lr = [x['lr'] for x in optimizer.param_groups]
    scheduler.step()

    if RANK in [-1, 0]:
        # Copy model attributes onto the EMA model, then validate with it.
        ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
        final_epoch = epoch + 1 == epochs
        if not noval or final_epoch:
            wandb_logger.current_epoch = epoch + 1
            results, maps, _ = val.run(data_dict,
                                       batch_size=batch_size // WORLD_SIZE * 2,
                                       imgsz=imgsz,
                                       model=ema.ema,
                                       single_cls=single_cls,
                                       dataloader=val_loader,
                                       save_dir=save_dir,
                                       save_json=is_coco and final_epoch,
                                       verbose=nc < 50 and final_epoch,
                                       plots=plots and final_epoch,
                                       wandb_logger=wandb_logger,
                                       compute_loss=compute_loss)

        # Append the progress line plus the 7 result values to results.txt.
        with open(results_file, 'a') as f:
            f.write(s + '%10.4g' * 7 % results + '\n')

        # Log scalars to TensorBoard / W&B.
        tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',
                'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                'val/box_loss', 'val/obj_loss', 'val/cls_loss',
                'x/lr0', 'x/lr1', 'x/lr2']
        for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
            if loggers['tb']:
                loggers['tb'].add_scalar(tag, x, epoch)
            if loggers['wandb']:
                wandb_logger.log({tag: x})

        # Track the best fitness seen so far.
        fi = fitness(np.array(results).reshape(1, -1))
        if fi > best_fitness:
            best_fitness = fi
        wandb_logger.end_epoch(best_result=best_fitness == fi)

        # Save checkpoints: every epoch unless --nosave, and always on the
        # final epoch of a non-evolve run.
        if (not nosave) or (final_epoch and not evolve):
            ckpt = {'epoch': epoch,
                    'best_fitness': best_fitness,
                    'training_results': results_file.read_text(),
                    'model': deepcopy(de_parallel(model)).half(),
                    'ema': deepcopy(ema.ema).half(),
                    'updates': ema.updates,
                    'optimizer': optimizer.state_dict(),
                    'wandb_id': wandb_logger.wandb_run.id if loggers['wandb'] else None}

            torch.save(ckpt, last)
            if best_fitness == fi:
                torch.save(ckpt, best)
            if loggers['wandb']:
                # Periodic W&B model upload every save_period epochs (never on the final epoch).
                if ((epoch + 1) % opt.save_period == 0 and not final_epoch) and opt.save_period != -1:
                    wandb_logger.log_model(last.parent, opt, epoch, fi, best_model=best_fitness == fi)
            del ckpt
4.8 结尾,打印一些信息
if RANK in [-1, 0]:
    LOGGER.info(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.\n')
    if plots:
        plot_results(save_dir=save_dir)
        if loggers['wandb']:
            # Upload whichever of the result images exist in save_dir.
            files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]]
            wandb_logger.log({"Results": [loggers['wandb'].Image(str(save_dir / f), caption=f) for f in files
                                          if (save_dir / f).exists()]})

    if not evolve:
        if is_coco:
            # COCO only: validate the saved checkpoint(s) again with save_json=True.
            for m in [last, best] if best.exists() else [last]:
                results, _, _ = val.run(data_dict,
                                        batch_size=batch_size // WORLD_SIZE * 2,
                                        imgsz=imgsz,
                                        model=attempt_load(m, device).half(),
                                        single_cls=single_cls,
                                        dataloader=val_loader,
                                        save_dir=save_dir,
                                        save_json=True,
                                        plots=False)

        # Run strip_optimizer on each checkpoint file that exists.
        for f in last, best:
            if f.exists():
                strip_optimizer(f)
        if loggers['wandb']:
            # Log the stripped model (best if available, else last) as a W&B artifact.
            loggers['wandb'].log_artifact(str(best if best.exists() else last), type='model',
                                          name='run_' + wandb_logger.wandb_run.id + '_model',
                                          aliases=['latest', 'best', 'stripped'])
    wandb_logger.finish_run()

torch.cuda.empty_cache()
4.9 返回results
# Hand the latest validation results back to the caller (the evolve loop passes them to print_mutation).
return results