Preparation Phase
# Distributed training setup: initialize the process rank and process count.
rank, world_size = dist_init()
# world_size: number of processes / tasks / GPUs
# rank: process ID
# Load the parameters: merge the args table (parameters defined in code plus the matching config.yaml under experiments/) into the cfg node.
cfg.merge_from_file(args.cfg)
# merge_from_file(*.yaml): load a yaml config file and merge it into the CfgNode
# Use the local process (rank 0) to set up the log files.
if rank == 0:
    if not os.path.exists(cfg.TRAIN.LOG_DIR):
        os.makedirs(cfg.TRAIN.LOG_DIR)
    init_log('global', logging.INFO)
    if cfg.TRAIN.LOG_DIR:
        add_file_handler('global',
                         os.path.join(cfg.TRAIN.LOG_DIR, 'logs.txt'),
                         logging.INFO)
    logger.info("Version Information: \n{}\n".format(commit()))
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))
# Build the model: ModelBuilder assembles the individual modules; its constructor calls get_backbone / get_neck / get_rpn_head / get_mask_head (optional) in turn.
# ModelBuilder implements the training-time forward(data) pass. The input data holds the template patch and search patch of a training pair, the classification label label_cls, the box-regression label label_loc, and the regression weights label_loc_weight. The returned outputs dict holds the total loss total_loss, the classification loss cls_loss, and the localization loss loc_loss.
# ModelBuilder also implements the inference-time methods: template(z) computes the template branch (backbone and neck), and track(x) computes the search branch (backbone and neck, whose output is fed into rpn_head together with the template-branch result), returning the classification and regression results cls / loc / mask (optional).
model = ModelBuilder().cuda().train()
dist_model = DistModule(model)
# model.cuda().train(): move the model onto the GPU and set it to training mode;
# DistModule(model): wrap the model for distributed training, which copies a model replica onto each GPU;
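Before walking through each module, here is a condensed sketch of the two inference-time entry points just described (simplified from model_builder.py; the optional mask branch is omitted):

def template(self, z):                  # template branch: backbone (+ neck)
    zf = self.backbone(z)
    if cfg.ADJUST.ADJUST:
        zf = self.neck(zf)
    self.zf = zf                        # cache the template features for tracking

def track(self, x):                     # search branch: backbone (+ neck) + rpn_head
    xf = self.backbone(x)
    if cfg.ADJUST.ADJUST:
        xf = self.neck(xf)
    cls, loc = self.rpn_head(self.zf, xf)
    return {'cls': cls, 'loc': loc}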
Below are the implementations of the modules that ModelBuilder assembles.
1. Backbone construction
self.backbone = get_backbone(cfg.BACKBONE.TYPE, **cfg.BACKBONE.KWARGS) #model_builder.py
BACKBONES = {
    'alexnetlegacy': alexnetlegacy,
    'mobilenetv2': mobilenetv2,
    'resnet18': resnet18,
    'resnet34': resnet34,
    'resnet50': resnet50,
    'alexnet': alexnet,
}  # __init__.py
def resnet50(**kwargs):
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    return model  # resnet_atrous.py
Here, building ResNet50 consists of two parts: assembling the overall ResNet and building the Bottleneck block.
* Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1)
input channels = inplanes; output channels = 4*planes; stride=1; dilation=1 (default)
input x => conv1(inplanes, planes) => bn1 => relu => conv2(planes, planes) => bn2 => relu => conv3(planes, planes*4) => bn3 => out + residual (x itself, or x after the downsample layer) => relu, where conv2 may be a dilated (atrous) convolution; see the sketch below.
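A minimal sketch of this forward pass (following the flow above; downsample is the optional residual-path projection):

def forward(self, x):
    residual = x
    out = self.relu(self.bn1(self.conv1(x)))    # 1x1: inplanes -> planes
    out = self.relu(self.bn2(self.conv2(out)))  # 3x3, possibly dilated: planes -> planes
    out = self.bn3(self.conv3(out))             # 1x1: planes -> planes*4
    if self.downsample is not None:
        residual = self.downsample(x)           # match shape/channels of the residual
    out += residual
    return self.relu(out)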
* ResNet(block, layers, used_layers)
input image => conv1(3,64) => bn1 => x_:relu => maxpool(ksz=3,s=2,p=1) => p1:layer1 => p2:layer2 => p3:layer3 => p4:layer4 => out:[x_,p1,p2,p3,p4] => out:[p2,p3,p4] (only the used_layers are returned)
layer1 = _make_layer(block, planes=64, blocks=layers[0], stride=1, dilation=1): block is the Bottleneck class, layers[0]=3
downsample = nn.Sequential(nn.Conv2d(self.inplanes=64, planes * block.expansion=64*4, kernel_size=1, stride=1, bias=False), nn.BatchNorm2d(planes * block.expansion=64*4),)
block(64, 64, 1, downsample, 1) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=1) => in 64 channels, out 256 channels
self.inplanes = 64*4 = 256
block(256, 64, dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in 256 channels, out 256 channels
block(256, 64, dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in 256 channels, out 256 channels
Returns: nn.Sequential(Bottleneck*3)
Role: channel expansion 64 => 256 (stride=1, so the 'downsample' branch here is only a 1x1 projection that matches channels for the residual; there is no spatial downsampling)
layer2 = _make_layer(block, planes=128, blocks=layers[1], stride=2, dilation=1): block is the Bottleneck class, layers[1]=4
dd = 1, padding = 0
downsample = nn.Sequential(nn.Conv2d(self.inplanes=256, planes * block.expansion=128*4, kernel_size=3, stride=2, bias=False, padding=0, dilation=1), nn.BatchNorm2d(planes * block.expansion=128*4),)
block(256, 128, 2, downsample, 1) => Bottleneck(inplanes, planes, stride=2, downsample, dilation=1) => in 256 channels, out 512 channels
self.inplanes = 128*4 = 512
block(512, 128, dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in 512 channels, out 512 channels
block(512, 128, dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in 512 channels, out 512 channels
block(512, 128, dilation=1) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=1) => in 512 channels, out 512 channels
Returns: nn.Sequential(Bottleneck*4)
Role: spatial downsampling (stride 2) and channel expansion 256 => 512
self.feature_size = 128 * block.expansion = 128*4 = 512
layer3 = _make_layer(block, planes=256, blocks=layers[2], stride=1, dilation=2): block is the Bottleneck class, layers[2]=6
dd = dilation // 2 = 1, padding = dd = 1
downsample = nn.Sequential(nn.Conv2d(self.inplanes=512, planes * block.expansion=256*4, kernel_size=3, stride=1, bias=False, padding=1, dilation=1), nn.BatchNorm2d(planes * block.expansion=256*4),)
block(512, 256, 1, downsample, 2) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=2) => in 512 channels, out 1024 channels
self.inplanes = 256*4 = 1024
block(1024, 256, dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in 1024 channels, out 1024 channels
block(1024, 256, dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in 1024 channels, out 1024 channels
block(1024, 256, dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in 1024 channels, out 1024 channels
block(1024, 256, dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in 1024 channels, out 1024 channels
block(1024, 256, dilation=2) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=2) => in 1024 channels, out 1024 channels
Returns: nn.Sequential(Bottleneck*6)
Role: channel expansion 512 => 1024 and an enlarged receptive field via dilation=2 (stride=1, so unlike the original ResNet there is no further spatial downsampling)
self.feature_size = (256 + 128) * 4 = 1536
layer4 = _make_layer(block, planes=512, blocks=layers[3], stride=1, dilation=4): block is the Bottleneck class, layers[3]=3
dd = dilation // 2 = 2, padding = dd = 2
downsample = nn.Sequential(nn.Conv2d(self.inplanes=1024, planes * block.expansion=512*4, kernel_size=3, stride=1, bias=False, padding=2, dilation=2), nn.BatchNorm2d(planes * block.expansion=512*4),)
block(1024, 512, 1, downsample, 4) => Bottleneck(inplanes, planes, stride=1, downsample, dilation=4) => in 1024 channels, out 2048 channels
self.inplanes = 512*4 = 2048
block(2048, 512, dilation=4) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=4) => in 2048 channels, out 2048 channels
block(2048, 512, dilation=4) => Bottleneck(inplanes, planes, stride=1, downsample=None, dilation=4) => in 2048 channels, out 2048 channels
Returns: nn.Sequential(Bottleneck*3)
Role: channel expansion 1024 => 2048 and an enlarged receptive field via dilation=4 (again stride=1, so the spatial size is preserved)
self.feature_size = 512 * 4 = 2048
[Note] dilation=1 is the ordinary convolution with no holes; dilation=2 means one cell is skipped between adjacent kernel cells.
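A quick standalone check of the effective kernel size: with kernel_size=3 and dilation=2 the kernel covers a 5x5 window, since k_eff = k + (k-1)(d-1):

import torch
import torch.nn as nn

conv = nn.Conv2d(1, 1, kernel_size=3, dilation=2, padding=2)  # padding = dilation
y = conv(torch.zeros(1, 1, 25, 25))
print(y.shape)  # torch.Size([1, 1, 25, 25]) -- spatial size preserved, receptive field enlarged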
2. Neck construction
if cfg.ADJUST.ADJUST:
    self.neck = get_neck(cfg.ADJUST.TYPE, **cfg.ADJUST.KWARGS)  # model_builder.py
NECKS = {
    'AdjustLayer': AdjustLayer,
    'AdjustAllLayer': AdjustAllLayer
}  # __init__.py
ADJUST:
    ADJUST: true
    TYPE: "AdjustAllLayer"
    KWARGS:
        in_channels: [512, 1024, 2048]
        out_channels: [256, 256, 256]  # config.yaml
The neck is of type AdjustAllLayer, but it too is built from the AdjustLayer class.
As config.yaml shows, the input/output channels are both length-3 lists, so the neck holds three AdjustLayer modules, registered via self.add_module() and named downsample2 / downsample3 / downsample4, as sketched below;
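A condensed sketch of how AdjustAllLayer wires this up (based on neck.py; the single-layer branch and the center_size plumbing are omitted for brevity):

class AdjustAllLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(AdjustAllLayer, self).__init__()
        self.num = len(out_channels)
        for i in range(self.num):
            # registers downsample2 / downsample3 / downsample4
            self.add_module('downsample' + str(i + 2),
                            AdjustLayer(in_channels[i], out_channels[i]))

    def forward(self, features):
        # features is the [p2, p3, p4] list produced by the backbone
        return [getattr(self, 'downsample' + str(i + 2))(features[i])
                for i in range(self.num)]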
Now look at the implementation of the AdjustLayer class:
AdjustLayer(in_channels, out_channels, center_size=7), where in_channels and out_channels take the corresponding value pairs from config.yaml: (512,256) / (1024,256) / (2048,256)
center_size corresponds to the crop mentioned in the SiamRPN++ paper ("to cut the computation cost, crop the center region of the features"); center_size=7 is the default defined in the code.
The AdjustLayer itself is just a single downsample layer:
self.downsample = nn.Sequential(
    nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
    nn.BatchNorm2d(out_channels),
)
When forward is called during training, the feature coming out of the downsample layer is checked to decide whether to crop:
def forward(self, x):
    x = self.downsample(x)
    if x.size(3) < 20:
        l = (x.size(3) - self.center_size) // 2
        r = l + self.center_size
        x = x[:, :, l:r, l:r]
    return x
3. RPN head construction
self.rpn_head = get_rpn_head(cfg.RPN.TYPE, **cfg.RPN.KWARGS) #model_builder.py
RPNS = {
    'UPChannelRPN': UPChannelRPN,
    'DepthwiseRPN': DepthwiseRPN,
    'MultiRPN': MultiRPN
}  # __init__.py
RPN:
    TYPE: 'MultiRPN'
    KWARGS:
        anchor_num: 5
        in_channels: [256, 256, 256]
        weighted: true  # config.yaml
MultiRPN(anchor_num, in_channels, weighted=False) #rpn.py
Here every RPN input has 256 channels, and SiamRPN++ carries three rpn_heads: rpn2 / rpn3 / rpn4. After the depthwise correlation, their outputs are fused with learned weights (computed as F.softmax(cls_weight) / F.softmax(loc_weight)), as sketched below;
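A condensed sketch of MultiRPN.forward (based on rpn.py; cls_weight and loc_weight are learnable nn.Parameter vectors of length 3, only used when weighted=True):

def forward(self, z_fs, x_fs):
    cls, loc = [], []
    for idx, (z_f, x_f) in enumerate(zip(z_fs, x_fs), start=2):
        rpn = getattr(self, 'rpn' + str(idx))       # rpn2 / rpn3 / rpn4
        c, l = rpn(z_f, x_f)
        cls.append(c)
        loc.append(l)
    if self.weighted:
        cls_weight = F.softmax(self.cls_weight, 0)  # normalized fusion weights
        loc_weight = F.softmax(self.loc_weight, 0)
        return (sum(c * w for c, w in zip(cls, cls_weight)),
                sum(l * w for l, w in zip(loc, loc_weight)))
    return sum(cls) / len(cls), sum(loc) / len(loc)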
The DepthwiseRPN class runs the depthwise cross-correlation separately for classification and regression:
DepthwiseRPN(anchor_num, in_channels[i], in_channels[i])
self.cls = DepthwiseXCorr(in_channels, out_channels, 2 * anchor_num)
self.loc = DepthwiseXCorr(in_channels, out_channels, 4 * anchor_num)
where in_channels=256 and out_channels=256; this corresponds to DepthwiseXCorr.__init__(in_channels=256, hidden=256, out_channels=10 (or 20), kernel_size=3)
For the template branch (in: 256, out: 256):
self.conv_kernel = nn.Sequential(
    nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
    nn.BatchNorm2d(hidden),
    nn.ReLU(inplace=True),
)
For the search branch (in: 256, out: 256):
self.conv_search = nn.Sequential(
    nn.Conv2d(in_channels, hidden, kernel_size=kernel_size, bias=False),
    nn.BatchNorm2d(hidden),
    nn.ReLU(inplace=True),
)
depth-wise correlation:
def xcorr_depthwise(x, kernel):
    batch = kernel.size(0)
    channel = kernel.size(1)
    x = x.view(1, batch*channel, x.size(2), x.size(3))
    kernel = kernel.view(batch*channel, 1, kernel.size(2), kernel.size(3))
    out = F.conv2d(x, kernel, groups=batch*channel)
    out = out.view(batch, channel, out.size(2), out.size(3))
    return out
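A quick shape check of xcorr_depthwise (the batch dimension is folded into the channel dimension so that F.conv2d with groups=batch*channel correlates each channel of each sample independently). The sizes below assume the usual SiamRPN++ setting, where the template and search features measure 5x5 and 29x29 after the 3x3 conv_kernel / conv_search layers:

import torch
x = torch.randn(8, 256, 29, 29)      # search features after conv_search
kernel = torch.randn(8, 256, 5, 5)   # template features after conv_kernel
out = xcorr_depthwise(x, kernel)     # the function defined above
print(out.shape)                     # torch.Size([8, 256, 25, 25]) -- 29 - 5 + 1 = 25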
Finally, the head layer produces the output (in: 256, out: 10/20):
self.head = nn.Sequential(
    nn.Conv2d(hidden, hidden, kernel_size=1, bias=False),
    nn.BatchNorm2d(hidden),
    nn.ReLU(inplace=True),
    nn.Conv2d(hidden, out_channels, kernel_size=1)
)
Finally, look at ModelBuilder's forward pass forward(data): the incoming data holds the training image pairs plus label_cls, label_loc, and label_loc_weight (these key-value pairs are written in when the dataset is built).
The template/search pair is first fed through the model built above to get the classification result cls and the localization result loc; cls is passed through log_softmax to get its softmax values; cls and label_cls go into select_cross_entropy_loss to produce the classification loss cls_loss; and loc, label_loc, and label_loc_weight go into weight_l1_loss to produce the regression loss loc_loss. The returned outputs dict holds total_loss, cls_loss, and loc_loss.
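Condensed, the forward pass looks roughly like this (simplified from model_builder.py; CLS_WEIGHT and LOC_WEIGHT are the loss weights from the config, and the mask branch is omitted):

def forward(self, data):
    template = data['template'].cuda()
    search = data['search'].cuda()
    label_cls = data['label_cls'].cuda()
    label_loc = data['label_loc'].cuda()
    label_loc_weight = data['label_loc_weight'].cuda()

    zf = self.backbone(template)
    xf = self.backbone(search)
    if cfg.ADJUST.ADJUST:
        zf = self.neck(zf)
        xf = self.neck(xf)
    cls, loc = self.rpn_head(zf, xf)

    cls = self.log_softmax(cls)  # softmax over the foreground/background channels
    cls_loss = select_cross_entropy_loss(cls, label_cls)
    loc_loss = weight_l1_loss(loc, label_loc, label_loc_weight)

    outputs = {
        'total_loss': cfg.TRAIN.CLS_WEIGHT * cls_loss + cfg.TRAIN.LOC_WEIGHT * loc_loss,
        'cls_loss': cls_loss,
        'loc_loss': loc_loss,
    }
    return outputs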
That concludes the construction of the network model.
Back to the main flow.
# Load the pretrained backbone weights into the model we just built
if cfg.BACKBONE.PRETRAINED:
    cur_path = os.path.dirname(os.path.realpath(__file__))
    backbone_path = os.path.join(cur_path, '../', cfg.BACKBONE.PRETRAINED)
    load_pretrain(model.backbone, backbone_path)
# An aside: loading a model generally takes the following 5 steps (assembled into a single sketch after the list):
1. Initialize the device
device = torch.device('cuda' if cfg.CUDA else 'cpu')  # use GPU or CPU
or device = torch.cuda.current_device()  # use the GPU directly
2. Load the checkpoint
pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device))  # load onto GPU
or ckpt = torch.load(args.snapshot, map_location=lambda storage, loc: storage.cpu())  # load onto CPU
3. Remove the prefix (usually needed for checkpoints saved after multi-GPU distributed training)
pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.')  # if "state_dict" in pretrained_dict.keys()
or pretrained_dict = remove_prefix(pretrained_dict, 'module.')  # if "state_dict" not in pretrained_dict.keys()
4. Check key integrity: verify that the parameters of the freshly built model match those of the checkpoint
check_keys(model, pretrained_dict)
5. Load the parameters
model.load_state_dict(pretrained_dict, strict=False)
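Put together, these five steps are essentially what load_pretrain does (a sketch of the helper in pysot's model_load.py):

def load_pretrain(model, pretrained_path):
    device = torch.cuda.current_device()                         # 1. device
    pretrained_dict = torch.load(pretrained_path,
        map_location=lambda storage, loc: storage.cuda(device))  # 2. load checkpoint
    if "state_dict" in pretrained_dict.keys():                   # 3. strip the prefix
        pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.')
    else:
        pretrained_dict = remove_prefix(pretrained_dict, 'module.')
    check_keys(model, pretrained_dict)                           # 4. verify the keys
    model.load_state_dict(pretrained_dict, strict=False)         # 5. load parameters
    return model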
# Build the data loader
# 1. Build the training set
train_dataset = TrkDataset()
Now let's see how the dataset is built.
One thing to note first: a Dataset class must implement the indexing method __getitem__ so the dataloader can fetch samples, and must expose the dataset length by implementing __len__; only then can the DataLoader automatically pull mini-batches out of the Dataset for training.
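A minimal, self-contained illustration of that contract (a toy example, not pysot code):

from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, n=10):
        self.data = list(range(n))

    def __len__(self):             # tells DataLoader how many samples exist
        return len(self.data)

    def __getitem__(self, index):  # lets DataLoader fetch sample #index
        return self.data[index]

loader = DataLoader(ToyDataset(), batch_size=4, shuffle=True)
for batch in loader:
    print(batch)                   # mini-batches assembled from __getitem__ calls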
dataset.py => class TrkDataset => the constructor __init__
# constructor
def __init__(self,):
    super(TrkDataset, self).__init__()
    desired_size = (cfg.TRAIN.SEARCH_SIZE - cfg.TRAIN.EXEMPLAR_SIZE) / cfg.ANCHOR.STRIDE + 1 + cfg.TRAIN.BASE_SIZE
    if desired_size != cfg.TRAIN.OUTPUT_SIZE:
        raise Exception('size not match!')
    # create anchor target (purpose: generate the anchors)
    # 1. Generate the anchors (5,4) from stride/ratios/scales; 5: number of anchors, 4: x1,y1,x2,y2.
    # 2. At every position of the 25x25 grid, generate 5 anchor boxes, each written as (x1,y1,x2,y2), where the coordinates are the grid points mapped back onto the original image.
    # 3. Keep the anchors from the previous step as the Anchors instance attribute all_anchors.
    self.anchor_target = AnchorTarget()  # TrkDataset:anchor_target => AnchorTarget:anchors => Anchors:all_anchors
    # create sub dataset
    self.all_dataset = []
    start = 0
    self.num = 0
    for name in cfg.DATASET.NAMES:
        subdata_cfg = getattr(cfg.DATASET, name)
        # SubDataset is the class that builds each concrete dataset. It:
        # 1. loads the annotation file train.json;
        # 2. filters out mislabeled entries and empty directories;
        # 3. regroups the data so the frames of one clip are collected under 'frames': meta_data[video][track]['frames'] = frames
        # 4. keeps the filtered valid data as the label attribute: self.labels = meta_data
        # 5. exposes: self.num (dataset length) / self.num_use (number of samples actually used; equals self.num when -1 is passed) / self.videos (list of data paths) / self.pick (index list, randomly shuffled and truncated to num_use) / self.start_idx (the dataset's starting index) / self.frame_range (frame sampling interval)
        sub_dataset = SubDataset(
            name,
            subdata_cfg.ROOT,
            subdata_cfg.ANNO,
            subdata_cfg.FRAME_RANGE,
            subdata_cfg.NUM_USE,  # VID=10000, others=-1
            start  # from 0, advancing by +num_VID/+num_COCO/+num_YTB/+num_DET
        )
        start += sub_dataset.num  # starting index of each dataset
        self.num += sub_dataset.num_use  # from 0, accumulating +num_use_VID/+num_use_COCO/+num_use_YTB/+num_use_DET
        sub_dataset.log()
        self.all_dataset.append(sub_dataset)  # self.all_dataset keeps the loaded dataset objects
    # data augmentation
    self.template_aug = Augmentation(
        cfg.DATASET.TEMPLATE.SHIFT,
        cfg.DATASET.TEMPLATE.SCALE,
        cfg.DATASET.TEMPLATE.BLUR,
        cfg.DATASET.TEMPLATE.FLIP,
        cfg.DATASET.TEMPLATE.COLOR
    )
    self.search_aug = Augmentation(
        cfg.DATASET.SEARCH.SHIFT,
        cfg.DATASET.SEARCH.SCALE,
        cfg.DATASET.SEARCH.BLUR,
        cfg.DATASET.SEARCH.FLIP,
        cfg.DATASET.SEARCH.COLOR
    )
    videos_per_epoch = cfg.DATASET.VIDEOS_PER_EPOCH  # 600000
    self.num = videos_per_epoch if videos_per_epoch > 0 else self.num
    self.num *= cfg.TRAIN.EPOCH  # 600000 * 20 = 12,000,000
    self.pick = self.shuffle()  # shuffled index list (see the shuffle() sketch below)
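self.shuffle() roughly concatenates every sub-dataset's pick list, shuffles, and repeats until self.num indices have been collected (a sketch of the method in dataset.py):

def shuffle(self):
    pick = []
    while len(pick) < self.num:
        p = []
        for sub_dataset in self.all_dataset:
            p += sub_dataset.pick   # the global indices owned by this sub-dataset
        np.random.shuffle(p)
        pick += p                   # loop again when self.num exceeds one pass
    return pick[:self.num]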
dataset.py => class TrkDataset => the data-fetching method __getitem__
def __getitem__(self, index):
    index = self.pick[index]  # the index of the sample about to be read
    dataset, index = self._find_dataset(index)  # returns the dataset holding index, plus the relative index within it (sketched after this method)
    # randomly apply gray augmentation (gray) and inject randomness via negative pairs (neg)
    gray = cfg.DATASET.GRAY and cfg.DATASET.GRAY > np.random.random()
    neg = cfg.DATASET.NEG and cfg.DATASET.NEG > np.random.random()
    # get one dataset
    if neg:  # build the patch pair at random for extra variety; these can be regarded as negative pairs
        # pick one entry from this dataset at the given index, yielding img_path & img_anno
        template = dataset.get_random_target(index)
        # pick a random dataset, then a random index within it, yielding img_path & img_anno
        search = np.random.choice(self.all_dataset).get_random_target()
    else:  # take images from the same video as the patch pair; these can be regarded as positive pairs
        template, search = dataset.get_positive_pair(index)  # img_path & img_anno for the template and search branches
    # get image
    template_image = cv2.imread(template[0])
    search_image = cv2.imread(search[0])
    # get bounding box (BBox ground truth)
    template_box = self._get_bbox(template_image, template[1])  # x1,y1,x2,y2
    search_box = self._get_bbox(search_image, search[1])
    # augmentation
    # template/search are both images
    template, _ = self.template_aug(template_image,
                                    template_box,
                                    cfg.TRAIN.EXEMPLAR_SIZE,
                                    gray=gray)
    search, bbox = self.search_aug(search_image,
                                   search_box,
                                   cfg.TRAIN.SEARCH_SIZE,
                                   gray=gray)
    # get labels
    # invokes AnchorTarget's __call__ method, which produces:
    # 1. the classification label cls
    # 2. the offset label delta
    # 3. the offset weight label delta_weight
    # 4. overlap, the IoU of every anchor with bbox
    cls, delta, delta_weight, overlap = self.anchor_target(
        bbox, cfg.TRAIN.OUTPUT_SIZE, neg)
    template = template.transpose((2, 0, 1)).astype(np.float32)
    search = search.transpose((2, 0, 1)).astype(np.float32)
    return {
        'template': template,
        'search': search,
        'label_cls': cls,
        'label_loc': delta,
        'label_loc_weight': delta_weight,
        'bbox': np.array(bbox)
    }
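For completeness, _find_dataset maps a global index back to the sub-dataset that owns it, using the start_idx and num attributes noted above (a sketch):

def _find_dataset(self, index):
    for dataset in self.all_dataset:
        if dataset.start_idx + dataset.num > index:
            return dataset, index - dataset.start_idx  # index relative to that dataset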
# 2 (optional, for distributed runs). Build the dataset sampler
train_sampler = None  # stays None in single-process runs
if get_world_size() > 1:
    train_sampler = DistributedSampler(train_dataset)
# 3. Build the data loader
train_loader = DataLoader(train_dataset,
                          batch_size=cfg.TRAIN.BATCH_SIZE,
                          num_workers=cfg.TRAIN.NUM_WORKERS,
                          pin_memory=True,
                          sampler=train_sampler)
# Build the optimizer and the learning-rate scheduler
Building the SGD optimizer requires: the list of trainable parameters, the momentum, and the weight decay.
Building the lr_scheduler requires: the optimizer and the total number of training epochs.
The construction goes through the following steps:
1. Collect the trainable parameters into a Python list
trainable_params = []
trainable_params += [{
    'params': filter(lambda x: x.requires_grad, model.backbone.parameters()),
    'lr': cfg.BACKBONE.LAYERS_LR * cfg.TRAIN.BASE_LR}]
2. Build the optimizer
optimizer = torch.optim.SGD(trainable_params, momentum=cfg.TRAIN.MOMENTUM, weight_decay=cfg.TRAIN.WEIGHT_DECAY)
3. Build the learning-rate scheduler
lr_scheduler = build_lr_scheduler(optimizer, epochs=cfg.TRAIN.EPOCH)
4. Update the learning rate
lr_scheduler.step(cfg.TRAIN.START_EPOCH)
Training Phase
Core code: def train(train_loader, model, optimizer, lr_scheduler, tb_writer)
for idx, data in enumerate(train_loader):
    if epoch != idx // num_per_epoch + start_epoch:
        epoch = idx // num_per_epoch + start_epoch
        if get_rank() == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'state_dict': model.module.state_dict(),
                    'optimizer': optimizer.state_dict()},
                cfg.TRAIN.SNAPSHOT_DIR + '/checkpoint_e%d.pth' % (epoch))
        if epoch == cfg.TRAIN.EPOCH:
            return
        if cfg.BACKBONE.TRAIN_EPOCH == epoch:
            optimizer, lr_scheduler = build_opt_lr(model.module, epoch)
        # update the learning rate once per epoch
        lr_scheduler.step(epoch)
        cur_lr = lr_scheduler.get_cur_lr()
    outputs = model(data)
    loss = outputs['total_loss']
    if is_valid_number(loss.data.item()):
        optimizer.zero_grad()
        loss.backward()
        reduce_gradients(model)
        if rank == 0 and cfg.TRAIN.LOG_GRADS:
            log_grads(model.module, tb_writer, tb_idx)
        # clip gradient
        clip_grad_norm_(model.parameters(), cfg.TRAIN.GRAD_CLIP)
        optimizer.step()
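One detail worth spelling out: num_per_epoch above is the number of iterations per epoch. Because the dataset length was multiplied by cfg.TRAIN.EPOCH when TrkDataset was built, train.py divides that factor back out along with the global batch size, roughly:

num_per_epoch = len(train_loader.dataset) // \
    cfg.TRAIN.EPOCH // (cfg.TRAIN.BATCH_SIZE * get_world_size())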
This completes the walkthrough of pysot's training process. Training runs through the entire pipeline, from data loading through optimization to checkpointing; once these stages are clear, you have essentially understood the whole project. If anything is still unclear, please raise it so we can discuss it together; I hope this is of some help.