源代码来源:
https://github.com/tianzhi0549/FCOS
使用默认配置开始debug fcos_demo.py
前面都是参数设置:
thresholds_for_classes
demo_im_names = os.listdir(args.images_dir)
coco_demo = COCODemo(
cfg,
confidence_thresholds_for_classes=thresholds_for_classes,
min_image_size=args.min_image_size
)
模型建立以后再看,先debug流程
composite = coco_demo.run_on_opencv_image(img) # img为传入图像
重要的是网络怎么运行得到的结果
# 得到预测结果
predictions = self.compute_prediction(image)
# 挑选出合适的label
top_predictions = self.select_top_predictions(predictions)
result = image.copy()
if self.show_mask_heatmaps:
return self.create_mask_montage(result, top_predictions)
# 画框
result = self.overlay_boxes(result, top_predictions)
if self.cfg.MODEL.MASK_ON:
result = self.overlay_mask(result, top_predictions)
if self.cfg.MODEL.KEYPOINT_ON:
result = self.overlay_keypoints(result, top_predictions)
# 画className和score
result = self.overlay_class_names(result, top_predictions)
# 返回结果
return result
首先对传入的图像进行transforms转换,其定义如下:
transform = T.Compose(
[
T.ToPILImage(), # 转换图像为PIL格式,因为transforms支持的格式为PIL
T.Resize(self.min_image_size), #和论文中一致,将短边resize到800
T.ToTensor(), # 转换到tensor格式
to_bgr_transform, # to_bgr_transform = T.Lambda(lambda x: x * 255)每个像素都乘以255
normalize_transform, # 像素归一化(标准化),均值为cfg.INPUT.PIXEL_MEAN, 标准差为cfg.INPUT.PIXEL_STD
]
)
# 调用:image = self.transforms(original_image)
# shape变化:
# 输入图像shape(427,640,3) ->(3,800,1199)
将所有的图片都补充到同一个大小。由于输入的图片大小可能不一致,在经过了transforms后短边的长度相同,但长边的长度各不相同,而同一个batch中的图像要拼成一个tensor就必须大小相同,所以它以batch中最大的长边为基准,给其他长边较短的图像补充0到同样的大小。由于我们在demo阶段输入的是一张图片,所以这一步的对齐相当于什么都没有做。注意在这里还需要把图像填充到能被32整除的大小(例如长边1199被填充到1216 = 32 x 38;如果只要求被8整除,得到的会是1200),防止下采样时出现不能整除(小数)的情况。
images = to_image_list(images)
传入模型得到输出
predictions = self.model(image_list)
predictions = [o.to(self.cpu_device) for o in predictions]
# always single image is passed at a time
prediction = predictions[0]
# reshape prediction (a BoxList) into the original image size
# 得到原始图像大小
height, width = original_image.shape[:-1]
prediction = prediction.resize((width, height))
if prediction.has_field("mask"):
# if we have masks, paste the masks in the right position
# in the image, as defined by the bounding boxes
masks = prediction.get_field("mask")
# always single image is passed at a time
masks = self.masker([masks], [prediction])[0]
prediction.add_field("mask", masks)
return prediction
scores = predictions.get_field("scores")
labels = predictions.get_field("labels")
thresholds = self.confidence_thresholds_for_classes[(labels - 1).long()]
keep = torch.nonzero(scores > thresholds).squeeze(1)
predictions = predictions[keep]
scores = predictions.get_field("scores")
_, idx = scores.sort(0, descending=True)
# 只保留得分大于所属类别阈值的预测,并按得分降序排列后返回
return predictions[idx]
得到backbone(主干特征提取网络)的输出,默认是resnet
features = self.backbone(images.tensors)
features包含5个特征层对应的shape分别为:
[(1,256,100,152), (1,256,50,76), (1,256,25,38), (1,256,13,19), (1,256,7,10)]最后将features传入rpn网络
# 这里的targets为None
proposals, proposal_losses = self.rpn(images, features, targets)
if self.roi_heads:
x, result, detector_losses = self.roi_heads(features, proposals, targets)
else:
# 只使用rpn而没有使用roi-head
x = features
result = proposals
detector_losses = {}
if self.training:
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
return result
# 得到预测
box_cls, box_regression, centerness = self.head(features)
# 计算位置
locations = self.compute_locations(features)
if self.training:
return self._forward_train(
locations, box_cls,
box_regression,
centerness, targets
)
else:
# 返回值
return self._forward_test(
locations, box_cls, box_regression,
centerness, images.image_sizes
)
将backbone输出的特征层features传入head中(head内部的参数名为x),进入for循环对每个特征图进行遍历:
for l, feature in enumerate(x):
# cls_tower = 3x3卷积(保持空间尺寸和通道数不变), GroupNorm(组归一化), relu
#(GroupNorm分为32个组,由于特征图的通道数都是256,所以每组包含256//32 = 8个通道)
cls_tower = self.cls_tower(feature)
# box_tower 与cls_tower相同处理(分别)
box_tower = self.bbox_tower(feature)
# cls_logits = 3x3卷积(保证维度,通道数压缩为num_classes, 数据集的类别个数)
# 也即在这一步对每个类别做出了预测
logits.append(self.cls_logits(cls_tower))
# 按照论文所说,中心预测分支可选择在回归分支和分类分支, 这里使用回归分支
# centerness = 3x3卷积(保证维度,输出通道为1)
if self.centerness_on_reg:
centerness.append(self.centerness(box_tower))
else:
centerness.append(self.centerness(cls_tower))
# 给出位置预测
# bbox_pred 3x3卷积(保证维度,输出通道为4)
# scales引入可以学习的参数(默认为1.0),对self.bbox_pred(box_tower)的输出
# 进行逐元素乘,也就是在这一步,得到了(l, r, t, b)预测
bbox_pred = self.scales[l](self.bbox_pred(box_tower))
if self.norm_reg_targets:
# 剔除负值
bbox_pred = F.relu(bbox_pred)
if self.training:
# 训练时直接使用预测值,不乘以步长
bbox_reg.append(bbox_pred)
else:
# 推理(非训练)时乘以步长,还原到输入图像的尺度
# 这里的fpn_strides对应相对于输入图像的下采样率[8, 16, 32, 64, 128]
# 8的得来:第一层bbox_pred的shape为(4, 100, 152),填充后的输入图像为(3, 800, 1216)
# 800/100 == 1216/152 == 8(这里的1216是长边1199在前面to_image_list中填充到能被32整除的结果)
bbox_reg.append(bbox_pred * self.fpn_strides[l])
else:
bbox_reg.append(torch.exp(bbox_pred))
# 返回
return logits, bbox_reg, centerness
# logits, bbox_reg, centerness都是长度为5的list(每个特征层对应一个元素),每个元素的shape分别为
# (N, 80, h, w), (N, 4, h, w), (N, 1, h, w)
这里和论文有点不相同。论文中说:通过exp(x)函数保证回归值为正,同时为每个特征层引入各不相同的可学习参数s_i,即exp(s_i·x)。而代码中当norm_reg_targets为True时,改为直接使用relu函数剔除负值(推理时再乘以对应特征层的步长);当norm_reg_targets为False时,仍然使用exp函数,与论文一致。
locations = []
# 对每个特征层进行遍历
for level, feature in enumerate(features):
# 得到特征层的尺寸(高h, 宽w)
h, w = feature.size()[-2:]
# 特征层定位
locations_per_level = self.compute_locations_per_level(
h, w, self.fpn_strides[level],
feature.device
)
locations.append(locations_per_level)
return locations
# locations是长度为5的list(5的得来是由于有5个特征层),每个元素的shape为(h x w, 2)
# 以步长为间隔生成网格的x
shifts_x = torch.arange(
0, w * stride, step=stride,
dtype=torch.float32, device=device
)
# 以步长为间隔生成网格的y
shifts_y = torch.arange(
0, h * stride, step=stride,
dtype=torch.float32, device=device
)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
# 为每一个网格生成中心点坐标(网格左上角坐标加上stride // 2)
locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
return locations
# shape 为(h x w, 2),每一行是一个网格的(x, y)
# 根据位置和各分支的预测,计算出图像上的预测框
boxes = self.box_selector_test(
locations, box_cls, box_regression,
centerness, image_sizes
)
return boxes, {}
sampled_boxes = []
for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)):
sampled_boxes.append(
# 这里的image_sizes为resize之后、填充之前的图像大小(800, 1199),而不是最初输入图像的(427, 640)
self.forward_for_single_feature_map(
l, o, b, c, image_sizes
)
)
boxlists = list(zip(*sampled_boxes))
# 合并特征层的输出
boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]
if not self.bbox_aug_enabled:
boxlists = self.select_over_all_levels(boxlists)
return boxlists
# batchsize, channel, height, width
N, C, H, W = box_cls.shape
# 和locations调整为相同的格式
box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
box_cls = box_cls.reshape(N, -1, C).sigmoid()
# shape变换 (N, C, H, W)-> (N, HxW, C)
box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1)
box_regression = box_regression.reshape(N, -1, 4)
centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1)
centerness = centerness.reshape(N, -1).sigmoid()
# 同上
# 筛选出大于self.pre_nms_thresh(0.05)分值的类别
candidate_inds = box_cls > self.pre_nms_thresh
# 统计每张图中大于0.05的(网格, 类别)候选个数,同一个网格有多个类别大于0.05时会被分别计数(因为是对bool值求和)
# shape = (batchsize,),每个元素表示该图在经过第一轮筛选(self.pre_nms_thresh)之后剩下的候选数
#(包含一个网格中有多个通道留下的情况)
pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
# 将候选个数多于self.pre_nms_top_n(1000)的截断为self.pre_nms_top_n,即一张图中最多保留1000个候选
# clamp(min, max)
pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)
# multiply the classification scores with centerness scores
# 得到预测得分,其中将none作为索引是为了增加维度
box_cls = box_cls * centerness[:, :, None]
results = []
# 对每个图像分别遍历
for i in range(N):
per_box_cls = box_cls[i] # 第i张图像,以下都是对第i张图像。预测类别得分
per_candidate_inds = candidate_inds[i] # 大于self.pre_nms_thresh分值的bool数组
per_box_cls = per_box_cls[per_candidate_inds] # bool索引得到类别分值
# 得到分值不是0位置的索引
per_candidate_nonzeros = per_candidate_inds.nonzero()
per_box_loc = per_candidate_nonzeros[:, 0] # 得到分值不是0位置的网格编号
per_class = per_candidate_nonzeros[:, 1] + 1 # 得到每个网格所属的类别(由1开始)
per_box_regression = box_regression[i] # 得到第i张图片的bbox参数
per_box_regression = per_box_regression[per_box_loc] # 按照第一维的网格编号得到目标网格
per_locations = locations[per_box_loc] # 得到满足条件的网格对应到图像上的位置,用整数索引(网格编号)实现
# 满足条件即经过初筛选
per_pre_nms_top_n = pre_nms_top_n[i] # 得到第i张图像满足条件的网格个数
# 如果出现一张图中预测了超过self.pre_nms_top_n数量的目标,那么就只取前per_pre_nms_top_n个
if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
per_box_cls, top_k_indices = \
per_box_cls.topk(per_pre_nms_top_n, sorted=False)
per_class = per_class[top_k_indices]
per_box_regression = per_box_regression[top_k_indices]
per_locations = per_locations[top_k_indices]
# 得到原图中预测的真实位置
detections = torch.stack([
per_locations[:, 0] - per_box_regression[:, 0],
per_locations[:, 1] - per_box_regression[:, 1],
per_locations[:, 0] + per_box_regression[:, 2],
per_locations[:, 1] + per_box_regression[:, 3],
], dim=1)
h, w = image_sizes[i]
# 将结果包装为BoxList对象,含有属性bbox=detections, size(图像大小,宽在前、高在后), mode(坐标的格式,如'xyxy')
boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy")
# 添加类别
boxlist.add_field("labels", per_class)
# 添加分数:对(分类得分 x centerness)开平方作为最终得分;开平方是逐元素运算,不会打乱顺序
boxlist.add_field("scores", torch.sqrt(per_box_cls))
# 限制框的大小,防止框超出了边界
boxlist = boxlist.clip_to_image(remove_empty=False)
# 移除较小的框
boxlist = remove_small_boxes(boxlist, self.min_size)
results.append(boxlist)
return results
num_images = len(boxlists)
results = []
for i in range(num_images):
# 遍历所有的图片
# 非极大值抑制,得到所有的检测框
result = boxlist_ml_nms(boxlists[i], self.nms_thresh)
# 目标数量
number_of_detections = len(result)
# Limit to max_per_image detections **over all classes**
# 限制每张图片输出的目标数量,最多为self.fpn_post_nms_top_n(100)
if number_of_detections > self.fpn_post_nms_top_n > 0:
cls_scores = result.get_field("scores")
image_thresh, _ = torch.kthvalue(
cls_scores.cpu(),
number_of_detections - self.fpn_post_nms_top_n + 1
)
keep = cls_scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
result = result[keep]
results.append(result)
return results