本节是 DETR流程及 构建backbone和position embedding 相关部分的代码解析
STEP 1: Create model and criterion #构建模型和标准
STEP 2: Create train and val dataloader #构建训练和测试数据集
STEP 3: Define lr_scheduler #定义学习策略
STEP 4: Define optimizer #定义优化器
STEP 5: Load pretrained model or load resume model and optimizer states 加载预训练模型
STEP 6: Validation
STEP 7: Start training and validation
STEP 8:Model save
STEP 1: Create model and criterion #构建模型和标准
model, criterion, postprocessors = build_detr(config)
def build_detr(config):
    """Build the DETR model, its training criterion and output post-processors.

    Args:
        config: global config object holding model hyper-parameters
    Returns:
        tuple: (DETR model, SetCriterion loss module, dict of postprocessors)
    """
    # Backbone (resnet + position embedding) and transformer are built first,
    # then assembled into the full DETR detector.
    backbone = build_backbone(config)
    transformer = build_transformer(config)
    # Auxiliary decoder losses are only computed while training.
    use_aux_loss = not config.EVAL
    detr = DETR(backbone=backbone,
                transformer=transformer,
                num_classes=config.MODEL.NUM_CLASSES,
                num_queries=config.MODEL.NUM_QUERIES,
                aux_loss=use_aux_loss)
    # Hungarian matcher pairs predictions with ground-truth targets.
    matcher = build_matcher()
    # Base loss weights: classification / L1 box / generalized IoU.
    weight_dict = {'loss_ce': 1., 'loss_bbox': 5., 'loss_giou': 2.}
    if use_aux_loss:
        # Replicate the weights once per intermediate decoder layer,
        # suffixing each key with the layer index.
        aux_weights = {}
        for layer_idx in range(config.MODEL.TRANS.NUM_DECODERS - 1):
            aux_weights.update({f'{key}_{layer_idx}': val
                                for key, val in weight_dict.items()})
        weight_dict.update(aux_weights)
    criterion = SetCriterion(config.MODEL.NUM_CLASSES,
                             matcher=matcher,
                             weight_dict=weight_dict,
                             eos_coef=0.1,
                             losses=['labels', 'boxes', 'cardinality'])
    # PostProcess converts raw model outputs to the COCO-api format.
    postprocessors = {'bbox': PostProcess()}
    return detr, criterion, postprocessors
build_detr 函数涉及 build_backbone、build_transformer、构建 DETR model、build_matcher,
设置 aux_loss,aux_loss代表是否要对Transformer中Decoder的每层输出都计算loss。
构建 postprocessors:PostProcess() 这个模块是将模型 output 转换为 COCO api 的格式。
用的是常见的resnet网络, backbone调用resnet50中把主干网络提取出来。
pretrained=paddle.distributed.get_rank() == 0,表示只在分布式训练的 0 号(主)进程中加载预训练权重,避免每个进程都重复下载权重。
paddle.distributed分布式训练,get_rank()获取当前进程值。
norm_layer=FrozenBatchNorm2D 归一化层,类似BN,将统计量(均值与方差)和可学习的仿射参数固定住。在实现的时候,需要将以上4个量注册到buffer,以便阻止梯度反向传播而更新它们,同时又能够记录在模型的state_dict中。
replace_stride_with_dilation=[False, False, dilation], 替换步长用膨胀率,如果为None,设置默认值为[False, False, False]
class Backbone(BackboneBase):
    """Get resnet backbone from resnet.py with multiple settings and return BackboneBase instance"""
    def __init__(self, name, train_backbone, return_interm_layers, dilation, backbone_lr):
        # Only the rank-0 process loads pretrained weights. BatchNorm is
        # replaced by FrozenBatchNorm2D (stats and affine params fixed), and
        # dilation may replace the stride of the last resnet stage.
        resnet_ctor = getattr(resnet, name)
        net = resnet_ctor(pretrained=paddle.distributed.get_rank() == 0,
                          norm_layer=FrozenBatchNorm2D,
                          replace_stride_with_dilation=[False, False, dilation],
                          backbone_lr=backbone_lr)
        # Shallow resnets end with 512 channels, deeper ones with 2048.
        feat_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(net, train_backbone, feat_channels, return_interm_layers)
在BackboneBase中,若 return_interm_layers = true,则需要记录resnet每一层的输出。
self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
获取网络中间几层的结果IntermediateLayerGetter(),它的输出是一个dict,对应了每层的输出,key是用户自定义的赋予输出特征的名字。
思想:先创建一个model ,然后把它传入IntermediateLayerGetter中,并传入一个字典,传入字典的key是model的直接的层,传入字典的value是返回字典中的key,返回字典的value对应的是model运行的中间结果。
在定义它的时候注明作用的模型和要返回的layer,得到new_m。使用时喂输入变量,返回的就是对应的layer。
mask = F.interpolate(m, size=x.shape[-2:])[0] #[batch_size, feat_h, fea_w]
将mask插值到与输出特征图尺寸一致。
NestedTensor()函数,包括tensor和mask两个成员,tensor就是输入的图像。mask跟tensor同高宽但是单通道。
BackboneBase 的前向方法中输入是NestedTensor 这个类的实例,其实质就是将图像张量和对应的mask封装到一起。
class BackboneBase(nn.Layer):
    """Backbone Base class for NestedTensor input and multiple outputs
    This class handles the NestedTensor as input, run inference through backbone
    and return multiple output tensors(NestedTensors) from selected layers
    """
    def __init__(self,
                 backbone: nn.Layer,
                 train_backbone: bool,
                 num_channels: int,
                 return_interm_layers: bool):
        super().__init__()
        # Freeze all parameters when train_backbone is False; otherwise
        # freeze everything except layer2/3/4 (the early stem stays fixed).
        # NOTE precedence: condition reads as
        #   not train_backbone or (no 'layer2'/'layer3'/'layer4' in name)
        for name, parameter in backbone.named_parameters():
            if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name:
                parameter.stop_gradient = True
        if return_interm_layers:
            # Record outputs of multiple resnet stages (needed e.g. for
            # segmentation heads).
            # NOTE(review): keys start at 'layer0' here while the reference
            # DETR uses 'layer1' — confirm against the layer names in resnet.py.
            return_layers = {'layer0': '0', 'layer2': '1', 'layer3':'2', 'layer4':'3'}
        else:
            # Only the final stage output is returned.
            return_layers = {'layer4': '0'}
        # IntermediateLayerGetter returns a dict mapping the values above to
        # the corresponding stage outputs.
        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        self.num_channels = num_channels
    def forward(self, tensor_list):
        #Inference through resnet backbone, which takes the paddle.Tensor as input
        #tensor_list contains .tensor(paddle.Tensor) and .mask(paddle.Tensor) for batch inputs
        xs = self.body(tensor_list.tensors)
        out = {}
        for name, x in xs.items():
            # x.shape: [batch_size, feat_dim, feat_h, feat_w]
            m = tensor_list.mask # [batch_size, orig_h, orig_w]
            assert m is not None
            m = m.unsqueeze(0).astype('float32') # [1, batch_size, orig_h, orig_w]
            mask = F.interpolate(m, size=x.shape[-2:])[0] #[batch_size, feat_h, fea_w]
            # Resize the padding mask to match the feature-map resolution.
            out[name] = NestedTensor(x, mask)
        return out
DETR将位置编码模块与backbone 通过Joiner()合在一起作为modle,在backbone 输出特征图的同时对其进行位置编码,以供后续Transformer 使用。
此处的 Joiner 并非某个通用的第三方库,而是 DETR 自定义的模块:它将 backbone 和 position encoding 集成到同一个 nn.Layer 中,使得前向过程中可以一次性得到两者的输出。
Joiner 是 nn.Sequential 的子类,通过初始化,使得 self[0] 是 backbone,self[1] 是 position-encoding。前向过程就是对 backbone 的每层输出都进行位置编码,最终返回 backbone 的输出及对应的位置编码结果。
class Joiner(nn.Sequential):
    """Couple a backbone with a position-embedding layer.

    After __init__, self[0] is the backbone and self[1] the position
    embedding; forward runs the backbone, then computes a position
    encoding for every returned feature map.
    Arguments:
        backbone: nn.Layer, backbone layer (resnet)
        position_embedding: nn.Layer, position_embedding(learned, or sine)
    """
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)
    def forward(self, x):
        features = []
        embeddings = []
        # Backbone returns a dict of NestedTensors; encode positions for each.
        for _, feat in self[0](x).items():
            features.append(feat)
            # Cast the encoding to the feature tensor's dtype.
            embeddings.append(self[1](feat).astype(feat.tensors.dtype))
        return features, embeddings
def build_backbone(config):
    """ build resnet backbone and position embedding according to config """
    assert config.MODEL.BACKBONE in ['resnet50', 'resnet101'], "backbone name is not supported!"
    # Train the backbone unless running in eval mode.
    train_backbone = not config.EVAL
    # Intermediate-layer outputs are only needed for segmentation.
    return_interm_layers = False  # TODO: impl case True for segmentation
    # Position encoding operates on the backbone feature maps and feeds
    # the transformer later in the pipeline.
    position_embedding = build_position_encoding(config.MODEL.TRANS.EMBED_DIM)
    backbone = Backbone(config.MODEL.BACKBONE,
                        train_backbone,
                        return_interm_layers,
                        False,  # dilation disabled
                        config.MODEL.BACKBONE_LR)
    # Bundle backbone and position embedding into a single module.
    model = Joiner(backbone, position_embedding)
    model.num_channels = backbone.num_channels
    return model
常见的Position Embedding一般有两种,一个基于sine函数的位置编码,一个是可以学习位置编码。
DETR 中分别对这两种编码方式,都写成了继承 Module 的 layer:PositionEmbeddingSine 和 PositionEmbeddingLearned。
class PositionEmbeddingSine(nn.Layer):
    # Sine-based 2D positional encoding: each spatial axis (y, then x) gets
    # num_position_features channels of interleaved sin/cos values at
    # geometrically spaced frequencies, concatenated channel-wise.
    def __init__(self, num_position_features=64, temp=10000, norm=False, scale=None):
        super().__init__()
        self.num_position_features = num_position_features  # channels per axis
        self.temp = temp  # temperature base of the frequency ladder
        self.norm = norm  # if True, normalize coords to [0, scale]
        if scale is not None and norm is False:
            raise ValueError('norm should be true if scale is passed')
        self.scale = 2 * math.pi if scale is None else scale
    def forward(self, tensor_list):
        # tensor_list: NestedTensor with .tensors [B, C, H, W] and .mask
        # [B, H, W]; mask values >= 0.5 are treated as padding.
        x = tensor_list.tensors
        mask = tensor_list.mask
        # Valid (non-padded) positions become 1.0, padded positions 0.0.
        not_mask = (mask < 0.5).astype('float32')
        # Cumulative sums assign each valid pixel its 1-based row/col index.
        y_embed = not_mask.cumsum(1, dtype='float32')
        x_embed = not_mask.cumsum(2, dtype='float32')
        if self.norm:
            eps = 1e-6
            # Divide by the last (largest) index so coords span [0, scale].
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
        dim_t = paddle.arange(self.num_position_features, dtype='int32')
        # temp^(2*floor(i/2)/d): consecutive channel pairs share a frequency.
        dim_t = self.temp ** (2 * (dim_t // 2) / self.num_position_features)
        pos_y = y_embed.unsqueeze(-1) / dim_t
        pos_x = x_embed.unsqueeze(-1) / dim_t
        # Interleave sin on even channels with cos on odd channels.
        pos_y = paddle.stack((pos_y[:, :, :, 0::2].sin(),
                              pos_y[:, :, :, 1::2].cos()), axis=4).flatten(3)
        pos_x = paddle.stack((pos_x[:, :, :, 0::2].sin(),
                              pos_x[:, :, :, 1::2].cos()), axis=4).flatten(3)
        # Concatenate y/x encodings then move channels first: [B, 2*d, H, W].
        pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2])
        return pos
class PositionEmbeddingLearned(nn.Layer):
    """Learned 2D positional encoding.

    One embedding table per axis (up to 50 rows / 50 columns). The forward
    pass looks up a vector per row and per column, broadcasts them over the
    feature-map grid, and concatenates them along the channel dimension.
    """
    def __init__(self, num_position_features=256):
        super().__init__()
        w_attr_1 = self.init_weights()
        self.row_embed = nn.Embedding(50, num_position_features, weight_attr=w_attr_1)
        w_attr_2 = self.init_weights()
        self.col_embed = nn.Embedding(50, num_position_features, weight_attr=w_attr_2)
    def init_weights(self):
        # Uniform[0, 1) initialization for the embedding tables.
        return paddle.ParamAttr(initializer=nn.initializer.Uniform(low=0., high=1.))
    def forward(self, tensor_list):
        # tensor_list.tensors: [batch, feat_dim (2048 for R50), H, W]
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = paddle.arange(w)  # column indices
        j = paddle.arange(h)  # row indices
        x_embed = self.col_embed(i)  # [w, dim]
        y_embed = self.row_embed(j)  # [h, dim]
        # Broadcast each axis embedding over the full h x w grid, then concat.
        pos = paddle.concat([
            x_embed.unsqueeze(0).expand((h, x_embed.shape[0], x_embed.shape[1])),
            y_embed.unsqueeze(1).expand((y_embed.shape[0], w, y_embed.shape[1])),
        ], axis=-1)                     # [h, w, 2*dim]
        pos = pos.transpose([2, 0, 1])  # [2*dim, h, w]
        pos = pos.unsqueeze(0)          # [1, 2*dim, h, w]
        # BUG FIX: original read `pose.shape` — an undefined name, raising
        # NameError at runtime; expand `pos` itself over the batch dimension.
        pos = pos.expand([x.shape[0]] + pos.shape[1:])  # [batch, 2*dim, h, w]
        return pos
下一节介绍 build transformer、build DETR 部分。