2.1 training.py
首先从main函数开始。
def main():
    """Load the params module named on the command line and start training.

    Usage: ``python training.py params.py``. The params file must expose a
    ``TRAINING_PARAMS`` dict. This function configures logging, scales the
    batch size by the number of configured GPUs, creates a timestamped output
    directory, attaches a TensorBoard ``SummaryWriter`` to the config, and
    hands the config to ``train``.

    Exits with status 1 when the argument count is wrong or the params file
    does not exist, so that shell callers can detect the failure.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="[%(asctime)s %(filename)s] %(message)s")

    if len(sys.argv) != 2:
        logging.error("Usage: python training.py params.py")
        sys.exit(1)
    params_path = sys.argv[1]
    if not os.path.isfile(params_path):
        logging.error("no params file found! path: {}".format(params_path))
        sys.exit(1)

    # The last three characters (".py") are dropped and the remainder is
    # imported as a module name, so the file has to be importable under that
    # name (e.g. a bare "params.py" on the import path).
    config = importlib.import_module(params_path[:-3]).TRAINING_PARAMS
    # "batch_size" in the params file is multiplied by the number of GPUs.
    config["batch_size"] *= len(config["parallels"])

    # Create sub_working_dir. Layout:
    #   <working_dir>/<backbone_name>/size<img_w>x<img_h>_try<try>/<timestamp>
    # Checkpoints and TensorBoard event files are written below this path.
    sub_working_dir = '{}/{}/size{}x{}_try{}/{}'.format(
        config['working_dir'], config['model_params']['backbone_name'],
        config['img_w'], config['img_h'], config['try'],
        time.strftime("%Y%m%d%H%M%S", time.localtime()))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(sub_working_dir, exist_ok=True)
    config["sub_working_dir"] = sub_working_dir
    logging.info("sub working dir: %s" % sub_working_dir)

    # Create the TensorBoard writer; train() logs lr, throughput and the loss
    # terms through it so the curves can be inspected with TensorBoard.
    config["tensorboard_writer"] = SummaryWriter(sub_working_dir)
    logging.info("Please using 'python -m tensorboard.main --logdir={}'".format(sub_working_dir))

    # Start training on the GPUs listed in config["parallels"].
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, config["parallels"]))
    train(config)


if __name__ == "__main__":
    main()
在此插入params.py,定义网络超参数。
TRAINING_PARAMS = \
{
"model_params": {
"backbone_name": "darknet_53",
"backbone_pretrained": "../weights/darknet53_weights_pytorch.pth", # set empty to disable
},
"yolo": {
"anchors": [[[116, 90], [156, 198], [373, 326]],
[[30, 61], [62, 45], [59, 119]],
[[10, 13], [16, 30], [33, 23]]],
"classes": 20, #该网络在voc2012上训练
},
"lr": {
"backbone_lr": 0.001,
"other_lr": 0.01,
"freeze_backbone": False, # freeze backbone wegiths to finetune
"decay_gamma": 0.1, #衰减指数,
"decay_step": 20, # 衰减速度,即每迭代多少轮就衰减的度量值。值为20就代表当前迭代轮数达到20时就给学习率乘上0.1(衰减指数)的1次方,达到40时就给学习率乘上0.1(衰减指数)的2次方。
},
"optimizer": {
"type": "sgd",
"weight_decay": 4e-05,
},
"batch_size": 4,
"train_path": "../data/coco/trainvalno5k.txt",
"epochs": 100,
"img_h": 416,
"img_w": 416,
"parallels": [0], # config GPU device
"working_dir": "YOUR_WORKING_DIR", # replace with your working dir
"pretrain_snapshot": "", # load checkpoint
"evaluate_type": "",
"try": 0,
"export_onnx": False,
}
2.1.1 train(config)函数
def train(config):
    """Run the whole training loop described by ``config``.

    Args:
        config: the ``TRAINING_PARAMS`` dict, already extended by ``main``
            with ``sub_working_dir`` and ``tensorboard_writer``. The
            ``global_step`` entry is created here, seeded from the optional
            ``start_step`` key (0 when absent).

    Returns:
        None. Progress is reported through logging and the TensorBoard
        writer; weights are persisted through ``_save_checkpoint``.
    """
    config["global_step"] = config.get("start_step", 0)
    is_training = not config.get("export_onnx")

    # Load and initialize network
    net = ModelMain(config, is_training=is_training)
    net.train(is_training)

    # Optimizer and learning-rate schedule. StepLR multiplies every group's
    # lr by decay_gamma once per decay_step calls to lr_scheduler.step(),
    # which happens once per epoch below.
    optimizer = _get_optimizer(config, net)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config["lr"]["decay_step"],
        gamma=config["lr"]["decay_gamma"])

    # Set data parallel
    net = nn.DataParallel(net).cuda()

    # Restore pretrain model
    snapshot = config["pretrain_snapshot"]
    if snapshot:
        logging.info("Load pretrained weights from {}".format(snapshot))
        net.load_state_dict(torch.load(snapshot))

    # One YOLOLoss per detection scale, each built with that scale's anchors.
    input_size = (config["img_w"], config["img_h"])
    yolo_losses = [
        YOLOLoss(config["yolo"]["anchors"][i], config["yolo"]["classes"], input_size)
        for i in range(3)
    ]

    # DataLoader
    dataset = COCODataset(config["train_path"], input_size, is_training=True)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=config["batch_size"],
                                             shuffle=True, num_workers=32,
                                             pin_memory=True)

    writer = config["tensorboard_writer"]

    # Start the training loop. Each epoch walks the dataloader once, i.e.
    # roughly len(dataset) / batch_size steps.
    logging.info("Start training.")
    for epoch in range(config["epochs"]):
        # Each batch is a dict; only its "image" and "label" entries are
        # used here.
        for step, samples in enumerate(dataloader):
            images, labels = samples["image"], samples["label"]
            start_time = time.time()
            # global_step is not part of params.py; it was created at the
            # top of this function.
            config["global_step"] += 1

            # Forward and backward
            optimizer.zero_grad()
            outputs = net(images)

            # Gather every loss term across the three scales, then sum per
            # term. Index 0 is the value that is back-propagated.
            # presumably YOLOLoss returns its terms in this order - confirm
            losses_name = ["total_loss", "x", "y", "w", "h", "conf", "cls"]
            losses = [[] for _ in losses_name]
            for scale, criterion in enumerate(yolo_losses):
                for slot, term in enumerate(criterion(outputs[scale], labels)):
                    losses[slot].append(term)
            losses = [sum(terms) for terms in losses]
            loss = losses[0]
            loss.backward()
            optimizer.step()

            # Report every 10th step (step 0 excluded).
            if step > 0 and step % 10 == 0:
                _loss = loss.item()
                duration = float(time.time() - start_time)
                example_per_second = config["batch_size"] / duration
                lr = optimizer.param_groups[0]['lr']
                logging.info(
                    "epoch [%.3d] iter = %d loss = %.2f example/sec = %.3f lr = %.5f "%
                    (epoch, step, _loss, example_per_second, lr)
                )
                global_step = config["global_step"]
                writer.add_scalar("lr", lr, global_step)
                writer.add_scalar("example/sec", example_per_second, global_step)
                # The total is logged as a plain float; the remaining terms
                # are logged as collected above.
                writer.add_scalar(losses_name[0], _loss, global_step)
                for name, value in zip(losses_name[1:], losses[1:]):
                    writer.add_scalar(name, value, global_step)

            # Checkpoint every 1000th step (step 0 excluded).
            if step > 0 and step % 1000 == 0:
                _save_checkpoint(net.state_dict(), config)

        # Advance the learning-rate schedule once per epoch.
        lr_scheduler.step()

    # Final checkpoint after the last epoch.
    _save_checkpoint(net.state_dict(), config)
    logging.info("Bye~")
# best_eval_result = 0.0
def _save_checkpoint(state_dict, config, evaluate_func=None):
# global best_eval_result
checkpoint_path = os.path.join(config["sub_working_dir"], "model.pth")
torch.save(state_dict, checkpoint_path)
logging.info("Model checkpoint saved to %s" % checkpoint_path)
# eval_result = evaluate_func(config)
# if eval_result > best_eval_result:
# best_eval_result = eval_result
# logging.info("New best result: {}".format(best_eval_result))
# best_checkpoint_path = os.path.join(config["sub_working_dir"], 'model_best.pth')
# shutil.copyfile(checkpoint_path, best_checkpoint_path)
# logging.info("Best checkpoint saved to {}".format(best_checkpoint_path))
# else:
# logging.info("Best result: {}".format(best_eval_result))
(1)_get_optimizer()函数:
def _get_optimizer(config, net):
'''
params:params超参数,net网络
return:优化器。
'''
optimizer = None
# Assign different lr for each layer
params = None
base_params = list(
map(id, net.backbone.parameters())
)
logits_params = filter(lambda p: id(p) not in base_params, net.parameters())
'''
freeze_backbone即冻结主干网络微调,即不允许网络在训练的时候微调darknet53网络。
在本代码params.py中,freeze_backbone = false,即允许微调。
实现学习率的调整。
'''
if not config["lr"]["freeze_backbone"]:
params = [
{"params": logits_params, "lr": config["lr"]["other_lr"]},
{"params": net.backbone.parameters(), "lr": config["lr"]["backbone_lr"]},
]
else:
logging.info("freeze backbone's parameters.")
for p in net.backbone.parameters():
p.requires_grad = False
params = [
{"params": logits_params, "lr": config["lr"]["other_lr"]},
]
# Initialize optimizer class
if config["optimizer"]["type"] == "adam":
optimizer = optim.Adam(params, weight_decay=config["optimizer"]["weight_decay"])
elif config["optimizer"]["type"] == "amsgrad":
optimizer = optim.Adam(params, weight_decay=config["optimizer"]["weight_decay"],
amsgrad=True)
elif config["optimizer"]["type"] == "rmsprop":
optimizer = optim.RMSprop(params, weight_decay=config["optimizer"]["weight_decay"])
else:
# Default to sgd
logging.info("Using SGD optimizer.")
optimizer = optim.SGD(params, momentum=0.9,
weight_decay=config["optimizer"]["weight_decay"],
nesterov=(config["optimizer"]["type"] == "nesterov"))
return optimizer
(2) 在创建dataloader处,用到了torch.utils.data.DataLoader类,它负责把数据集按batch_size分成一小批一小批地送入神经网络进行训练。它的第一个参数是COCODataset类的实例,因此接下来讲解common/coco_dataset.py中的COCODataset类。
class COCODataset(Dataset):
    """Dataset of images with one label text file per image.

    ``__init__`` builds the file lists and the transform pipeline,
    ``__getitem__`` loads and transforms one sample, and ``__len__`` reports
    the sample count. ``__getitem__`` and ``__len__`` are the methods a
    ``torch.utils.data.DataLoader`` uses to index the dataset and to know
    how many samples it holds.
    """

    def __init__(self, list_path, img_size, is_training, is_debug=False):
        """Collect image/label paths and set up the transforms.

        Args:
            list_path: text file with one image path per line.
            img_size: target ``(w, h)`` every image is resized to.
            is_training: when true, image augmentation is added before the
                resize step.
            is_debug: forwarded to ``data_transforms.ToTensor``.
        """
        self.img_files = []
        self.label_files = []
        # Read the list inside a context manager so the file handle is
        # closed deterministically.
        with open(list_path, 'r') as list_file:
            for path in list_file:
                # The label path is derived from the image path: "images"
                # becomes "labels" and the extension becomes ".txt".
                label_path = path.replace('images', 'labels').replace('.png', '.txt').replace(
                    '.jpg', '.txt').strip()
                if os.path.isfile(label_path):
                    # The raw line is stored; __getitem__ strips it on use.
                    self.img_files.append(path)
                    self.label_files.append(label_path)
                else:
                    logging.info("no label found. skip it: {}".format(path))
        logging.info("Total images: {}".format(len(self.img_files)))
        self.img_size = img_size  # (w, h)
        self.max_objects = 50
        self.is_debug = is_debug

        # transforms and augmentation
        # A fresh list is passed explicitly so this pipeline never shares its
        # transform list with another Compose instance.
        self.transforms = data_transforms.Compose([])
        if is_training:
            self.transforms.add(data_transforms.ImageBaseAug())
        # self.transforms.add(data_transforms.KeepAspect())
        self.transforms.add(data_transforms.ResizeImage(self.img_size))
        self.transforms.add(data_transforms.ToTensor(self.max_objects, self.is_debug))

    def __getitem__(self, index):
        """Load, convert and transform the sample at ``index``.

        Returns:
            The dict produced by the transform pipeline (keys ``image`` and
            ``label``), plus ``image_path`` and ``origin_size`` (the string
            form of ``[original_width, original_height]``).

        Raises:
            Exception: if the image cannot be read.
        """
        img_path = self.img_files[index % len(self.img_files)].rstrip()
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if img is None:
            raise Exception("Read image error: {}".format(img_path))
        ori_h, ori_w = img.shape[:2]
        # OpenCV loads BGR; the rest of the pipeline works on RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        label_path = self.label_files[index % len(self.img_files)].rstrip()
        if os.path.exists(label_path):
            # Five values per object.
            # presumably (class, cx, cy, w, h), normalised - confirm
            labels = np.loadtxt(label_path).reshape(-1, 5)
        else:
            # The label file vanished after __init__ ran: use one all-zero row.
            logging.info("label does not exist: {}".format(label_path))
            labels = np.zeros((1, 5), np.float32)

        sample = {'image': img, 'label': labels}
        if self.transforms is not None:
            sample = self.transforms(sample)
        sample["image_path"] = img_path
        sample["origin_size"] = str([ori_w, ori_h])
        return sample

    def __len__(self):
        """Return the number of images that have a label file."""
        return len(self.img_files)
(3)COCODataset中先读取了数据集中images和对应的labels,之后进行数据增强,对common/data_transforms.py进行讲解。
class Compose(object):
    """Chain several transforms and apply them in order.

    Args:
        transforms: optional iterable of callables. Each one receives the
            output of the previous one. Omitting it starts an empty pipeline
            that can be filled with ``add``.
    """

    def __init__(self, transforms=None):
        # Every instance owns its own list. A mutable default argument would
        # be shared by all instances created without arguments, so add() on
        # one pipeline would silently extend all the others.
        self.transforms = [] if transforms is None else list(transforms)

    def __call__(self, img):
        """Run ``img`` through every transform and return the final result."""
        for t in self.transforms:
            img = t(img)
        return img

    def add(self, transform):
        """Append ``transform`` to the end of the pipeline."""
        self.transforms.append(transform)
class ToTensor(object):
    """Convert a numpy sample into torch tensors of fixed label size.

    Unless ``is_debug`` is set, the image is cast to float32, divided by 255
    and transposed with axes ``(2, 0, 1)``, which moves the last axis to the
    front. The labels are copied into a zero-filled
    ``(max_objects, 5)`` float32 array: extra rows are dropped, missing rows
    stay zero, so every sample yields a label tensor of the same shape.
    """

    def __init__(self, max_objects=50, is_debug=False):
        """
        Args:
            max_objects: number of label rows in the output tensor.
            is_debug: when true the image is passed through unchanged
                (no cast, scaling or transpose).
        """
        self.max_objects = max_objects
        self.is_debug = is_debug

    def __call__(self, sample):
        """Return ``{'image': Tensor, 'label': Tensor}`` built from ``sample``."""
        image, labels = sample['image'], sample['label']
        if not self.is_debug:
            image = image.astype(np.float32)
            image /= 255.0
            image = np.transpose(image, (2, 0, 1))
            image = image.astype(np.float32)

        # Pad with zero rows or truncate so the label block always has
        # max_objects rows.
        filled_labels = np.zeros((self.max_objects, 5), np.float32)
        count = min(len(labels), self.max_objects)
        filled_labels[:count] = labels[:count]
        return {'image': torch.from_numpy(image), 'label': torch.from_numpy(filled_labels)}
class KeepAspect(object):
    """Pad the image to a square and re-normalise the boxes accordingly.

    The shorter side is padded on both ends with the constant value 128, so
    the content keeps its aspect ratio. Label columns 1-4 are read as
    centre-x, centre-y, width and height relative to the image size and are
    rewritten relative to the padded size. The label array is updated in
    place.
    """

    def __init__(self):
        pass

    def __call__(self, sample):
        """Return ``{'image': padded_image, 'label': adjusted_label}``."""
        image, label = sample['image'], sample['label']
        h, w, _ = image.shape

        # Split the height/width gap into a leading and a trailing pad.
        gap = np.abs(h - w)
        lead, trail = gap // 2, gap - gap // 2
        if h <= w:
            # Wider than tall (or square): pad rows above and below.
            pad = ((lead, trail), (0, 0), (0, 0))
        else:
            # Taller than wide: pad columns left and right.
            pad = ((0, 0), (lead, trail), (0, 0))
        image_new = np.pad(image, pad, 'constant', constant_values=128)
        padded_h, padded_w, _ = image_new.shape

        # Box corners in pixels of the original image, shifted by the
        # leading pad on each axis.
        shift_y, shift_x = pad[0][0], pad[1][0]
        x1 = w * (label[:, 1] - label[:, 3]/2) + shift_x
        y1 = h * (label[:, 2] - label[:, 4]/2) + shift_y
        x2 = w * (label[:, 1] + label[:, 3]/2) + shift_x
        y2 = h * (label[:, 2] + label[:, 4]/2) + shift_y

        # Back to ratios, now relative to the padded image.
        label[:, 1] = ((x1 + x2) / 2) / padded_w
        label[:, 2] = ((y1 + y2) / 2) / padded_h
        label[:, 3] *= w / padded_w
        label[:, 4] *= h / padded_h
        return {'image': image_new, 'label': label}
class ResizeImage(object):
    """Resize the image to a fixed size; the label is passed through untouched.

    Args:
        new_size: target size as ``(w, h)``.
        interpolation: OpenCV interpolation flag handed to ``cv2.resize``.
    """

    def __init__(self, new_size, interpolation=cv2.INTER_LINEAR):
        self.new_size = tuple(new_size)  # (w, h)
        self.interpolation = interpolation

    def __call__(self, sample):
        """Return a new sample dict with the resized image and the same label."""
        resized = cv2.resize(sample['image'], self.new_size,
                             interpolation=self.interpolation)
        # The label is not rescaled here.
        # presumably box coordinates are ratios of the image size - confirm
        return {'image': resized, 'label': sample['label']}
class ImageBaseAug(object):
    """Pixel-level image augmentation built on imgaug.

    Only the image is changed; the label is returned as received.
    """

    def __init__(self):
        def sometimes(augmenter):
            # Wrap an augmenter in iaa.Sometimes with p = 0.5.
            return iaa.Sometimes(0.5, augmenter)

        # Exactly one blur is picked per image:
        #   gaussian blur (sigma between 0 and 3.0),
        #   average/uniform blur (kernel size between 2x2 and 7x7),
        #   median blur (kernel size between 3x3 and 11x11).
        blur = iaa.OneOf([
            iaa.GaussianBlur((0, 3.0)),
            iaa.AverageBlur(k=(2, 7)),
            iaa.MedianBlur(k=(3, 11)),
        ])

        # Sharpen and overlay the result on the original image, using an
        # alpha between 0 (no sharpening) and 0.5.
        sharpen = sometimes(iaa.Sharpen(alpha=(0, 0.5), lightness=(0.75, 1.5)))

        # Add gaussian noise.
        noise = sometimes(
            iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5))

        # Add a value between -5 and 5 to each pixel.
        offset = sometimes(iaa.Add((-5, 5), per_channel=0.5))

        # Change brightness (80-120% of the original value).
        brightness = sometimes(iaa.Multiply((0.8, 1.2), per_channel=0.5))

        # Adjust the contrast.
        # NOTE(review): newer imgaug releases may deprecate
        # ContrastNormalization - confirm against the installed version.
        contrast = sometimes(iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5))

        # All of the above run in random order.
        self.seq = iaa.Sequential(
            [blur, sharpen, noise, offset, brightness, contrast],
            random_order=True
        )

    def __call__(self, sample):
        """Return a new sample dict with the augmented image and same label."""
        seq_det = self.seq.to_deterministic()
        augmented = seq_det.augment_images([sample['image']])[0]
        return {'image': augmented, 'label': sample['label']}