def train(hyp,
          opt,
          device,
          callbacks=None
          ):
    """Run the full training loop and return the last validation results.

    Args:
        hyp: Hyperparameters, either a dict or a ``str`` path to a YAML file
            that is loaded with ``yaml.safe_load``.
        opt: Options object; its attributes (``save_dir``, ``epochs``,
            ``batch_size``, ``weights``, ``change_nc``, ``restart``, ...) are
            read throughout and ``vars(opt)`` is dumped to ``opt.yaml``.
        device: Device object exposing ``.type``; also used as
            ``map_location`` when loading the checkpoint.
        callbacks: Hook registry. When ``None`` (the default) a fresh
            ``Callbacks()`` is created per call, so hooks registered in one
            call never leak into the next.

    Returns:
        The ``results`` sequence from the most recent ``val.run`` call, or the
        initial seven zeros if validation never ran.

    Raises:
        AssertionError: if the number of class names differs from ``nc``, if a
            label class index is >= ``nc``, or if ``resume`` is requested for a
            checkpoint whose next epoch is not positive.
    """
    # A default of `Callbacks()` in the signature would be built once at
    # definition time and shared by every call; build it lazily instead.
    if callbacks is None:
        callbacks = Callbacks()

    # Unpack the incoming options (author's note: callbacks are callback hooks, not needed by me for now)
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze, = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze

    # Save root: <save_dir>/weights holds last.pt and best.pt
    w = save_dir / 'weights'
    w.mkdir(parents=True, exist_ok=True)
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters: load from YAML when a path was given, then log them (colorstr colours the prefix)
    if isinstance(hyp, str):
        with open(hyp) as f:
            hyp = yaml.safe_load(f)
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save hyp / opt next to the run outputs
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.safe_dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.safe_dump(vars(opt), f, sort_keys=False)
    data_dict = None

    # Loggers (author's note: not studied yet)
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

        # Expose every logger method as a callback action
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config and dataset check (author's note: "check the data with multiple workers";
    # torch_distributed_zero_first presumably lets rank 0 go first -- confirm against its definition)
    plots = not evolve
    cuda = device.type != 'cpu'
    init_seeds(1 + RANK)
    with torch_distributed_zero_first(RANK):
        data_dict = data_dict or check_dataset(data)
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'
    is_coco = data.endswith('coco.yaml') and nc == 80

    # Load model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(RANK):
            weights = attempt_download(weights)
        ckpt = torch.load(weights, map_location=device)
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)
        # Anchor entries are not transferred when a cfg / custom anchors are given, unless resuming
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []
        csd = ckpt['model'].float().state_dict()
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)
        model.load_state_dict(csd, strict=False)
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)

    # Freeze: parameters whose name contains 'model.<i>.' for i < freeze stop receiving gradients
    freeze = [f'model.{x}.' for x in range(freeze)]
    for k, v in model.named_parameters():
        v.requires_grad = True
        if any(x in k for x in freeze):
            print(f'freezing {k}')
            v.requires_grad = False

    # Optimizer
    nbs = 64  # reference batch size that gradient accumulation aims for
    accumulate = max(round(nbs / batch_size), 1)
    hyp['weight_decay'] *= batch_size * accumulate / nbs
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    # g0: BatchNorm weights (no decay), g1: other weights (decayed), g2: biases
    g0, g1, g2 = [], [], []
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            g1.append(v.weight)

    if opt.adam:
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})
    optimizer.add_param_group({'params': g2})
    # g0 is the group without weight decay and g1 the decayed one; label them accordingly.
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
    del g0, g1, g2

    # Scheduler
    if opt.linear_lr:
        def lf(x):
            # max(..., 1) keeps a single-epoch run from dividing by zero
            return (1 - x / max(epochs - 1, 1)) * (1.0 - hyp['lrf']) + hyp['lrf']
    else:
        lf = one_cycle(1, hyp['lrf'], epochs)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # EMA is only kept on the main process
    ema = ModelEMA(model) if RANK in [-1, 0] else None

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        if opt.change_nc:
            # Author's note: load the parameters, but here I want to change the output layer's dimensions.
            # Optimizer state and best_fitness are deliberately not restored in this branch.
            # Guard first: the checkpoint may carry no EMA, and non-main ranks have no EMA to fill.
            if ema and ckpt.get('ema'):
                ema.ema.load_state_dict(
                    _reinit_detect_head(ckpt['ema'].float().state_dict(), ema.ema.state_dict()))
                ema.updates = ckpt['updates']
        else:
            # Author's note: the original (unmodified) path
            if ckpt['optimizer'] is not None:
                optimizer.load_state_dict(ckpt['optimizer'])
                best_fitness = ckpt['best_fitness']

            if ema and ckpt.get('ema'):
                ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
                ema.updates = ckpt['updates']

        # Epochs
        if opt.restart:
            start_epoch = 0
        else:
            start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']

        del ckpt, csd

    # Image sizes
    gs = max(int(model.stride.max()), 32)  # grid size: largest model stride, at least 32
    nl = model.model[-1].nl  # presumably the number of detection layers -- confirm against the head class
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)

    # DP mode
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        logging.warning('DP not recommended, instead use torch.distributed.run for best DDP Multi-GPU results.\n'
                        'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Train loader
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
                                              hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=RANK,
                                              workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '))
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # largest value in column 0 of the labels
    nb = len(train_loader)
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Main process only: validation loader, label plots, anchor check
    if RANK in [-1, 0]:
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
                                       workers=workers, pad=0.5,
                                       prefix=colorstr('val: '))[0]

        if not resume:
            labels = np.concatenate(dataset.labels, 0)
            if plots:
                plot_labels(labels, names, save_dir)

            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            # Round-trip through FP16 (presumably to pre-reduce anchor precision -- confirm)
            model.half().float()

        callbacks.on_pretrain_routine_end()

    # DDP mode
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model parameters: rescale loss gains by layer count, class count and image size
    hyp['box'] *= 3. / nl
    hyp['cls'] *= nc / 80. * 3. / nl
    hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl
    hyp['label_smoothing'] = opt.label_smoothing
    model.nc = nc
    model.hyp = hyp
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc
    model.names = names

    # Start training
    t0 = time.time()
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, at least 1000
    last_opt_step = -1
    maps = np.zeros(nc)
    results = (0, 0, 0, 0, 0, 0, 0)
    scheduler.last_epoch = start_epoch - 1
    scaler = amp.GradScaler(enabled=cuda)
    compute_loss = ComputeLoss(model)
    # Logged here: imgsz, dataloader workers, save_dir, epochs
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers} dataloader workers\n'
                f'Logging results to {save_dir}\n'
                f'Starting training for {epochs} epochs...')

    # `epoch` is read after the loop; give it a value even when the range below is empty.
    epoch = start_epoch - 1
    for epoch in range(start_epoch, epochs):  # start_epoch -> epochs
        model.train()

        # Optionally resample the images using weights
        if opt.image_weights:
            if RANK in [-1, 0]:  # author's note: RANK is -1 in my runs
                # Class weights, damped by the per-class maps from the last validation
                cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc
                # Image weights derived from the class weights
                iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)
                # Draw dataset.n indices according to the image weights
                dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)
            # Broadcast the indices from rank 0 under DDP (author's note: not needed by me)
            if RANK != -1:
                indices = (torch.tensor(dataset.indices) if RANK == 0 else torch.zeros(dataset.n)).int()
                dist.broadcast(indices, 0)
                if RANK != 0:
                    dataset.indices = indices.cpu().numpy()

        mloss = torch.zeros(3, device=device)  # running mean of the three loss items
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)
        pbar = enumerate(train_loader)
        LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in [-1, 0]:  # author's note: RANK is -1 in my runs
            pbar = tqdm(pbar, total=nb)  # wrap the iterator in a progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # one batch per iteration
            ni = i + nb * epoch  # global batch index across epochs
            imgs = imgs.to(device, non_blocking=True).float() / 255.0

            # Warmup: interpolate accumulate, lr and momentum over the first nw iterations
            if ni <= nw:
                xi = [0, nw]
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # Group 2 (biases) starts from warmup_bias_lr, the others from 0.0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale: pick a random size that is a multiple of gs and resize the batch
            if opt.multi_scale:
                # randrange needs integer bounds (float arguments raise TypeError on Python 3.12+)
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs
                sf = sz / max(imgs.shape[2:])
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)
                loss, loss_items = compute_loss(pred, targets.to(device))
                if RANK != -1:
                    loss *= WORLD_SIZE
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimizer step once `accumulate` batches have been accumulated
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Progress bar and per-batch callback
            if RANK in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1)
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.on_train_batch_end(ni, model, imgs, targets, paths, plots)

        # Scheduler: record the learning rates for logging before stepping
        lr = [x['lr'] for x in optimizer.param_groups]
        scheduler.step()

        if RANK in [-1, 0]:
            callbacks.on_train_epoch_end(epoch=epoch)
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
            if not noval or final_epoch:
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           save_json=is_coco and final_epoch,
                                           verbose=nc < 50 and final_epoch,
                                           plots=plots and final_epoch,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Fitness is computed every epoch (from the latest results) because it is used unconditionally below
            fi = fitness(np.array(results).reshape(1, -1))
            if fi > best_fitness:
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.on_fit_epoch_end(log_vals, epoch, best_fitness, fi)

            # Save checkpoint
            if (not nosave) or (final_epoch and not evolve):
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None}

                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt
                callbacks.on_model_save(last, epoch, final_epoch, best_fitness, fi)

    # End of training
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        if not evolve:
            if is_coco:
                # Re-validate the saved checkpoints with JSON output
                for m in [last, best] if best.exists() else [last]:
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(m, device).half(),
                                            iou_thres=0.7,
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=True,
                                            plots=False)
            for f in last, best:
                if f.exists():
                    strip_optimizer(f)
        callbacks.on_train_end(last, best, plots, epoch)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    torch.cuda.empty_cache()
    return results


def _reinit_detect_head(ckpt_state, target_state, prefix='model.24.m'):
    """Copy ``ckpt_state``, replacing head tensors with random ones shaped like the target's.

    Every entry whose key starts with ``prefix`` and also exists in
    ``target_state`` is replaced by ``torch.rand`` of the target tensor's
    shape, so the result loads into a model whose output dimension differs
    from the checkpoint's. All other entries are passed through unchanged and
    the key order of ``ckpt_state`` is kept.
    """
    rebuilt = {}
    for key, tensor in ckpt_state.items():
        target = target_state.get(key)
        if key.startswith(prefix) and target is not None:
            rebuilt[key] = torch.rand(tuple(target.shape))
        else:
            rebuilt[key] = tensor
    return rebuilt