Continuing from part one of this series on the CornerNet-Lite source code, which covered the overall architecture of the project and the call chain during training, we left off where the data flow reaches the model definitions under py_utils.py. This post covers (1) the three files under py_utils, i.e. the model definitions, and (2) the three files under sample, i.e. how the ground truth is built (the encode step).
The best way to read source code is component by component, so I strongly recommend first reading an explanation of the Hourglass network and its code.
With that background, the code below should be easy to follow.
import torch
import torch.nn as nn
from .py_utils import TopPool, BottomPool, LeftPool, RightPool
#the four pooling ops (TopPool/BottomPool/LeftPool/RightPool) implemented by the authors as C++ extensions
from .py_utils.utils import convolution, residual, corner_pool
from .py_utils.losses import CornerNet_Loss
from .py_utils.modules import hg_module, hg, hg_net
def make_pool_layer(dim):
    # the downsampling is done by the stride-2 residual in make_hg_layer below,
    # so the pool layer here is an empty Sequential
    return nn.Sequential()

# a stack of residual modules: the first one uses stride 2, so it halves the
# feature-map size and changes the channel count, i.e. for a (B, N, W, H) input
# both N and (W, H) change; the remaining modules - 1 blocks keep the shape unchanged
def make_hg_layer(inp_dim, out_dim, modules):
    layers  = [residual(inp_dim, out_dim, stride=2)]
    layers += [residual(out_dim, out_dim) for _ in range(1, modules)]
    return nn.Sequential(*layers)
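To see what make_hg_layer does to the shapes, here is a minimal standalone sketch. The Residual class is a simplified stand-in I wrote for the repo's residual block (the real one appears in the CornerNet_Squeeze section below); the dims (256 -> 384) match one of the levels hg_module would pass in.

import torch
import torch.nn as nn

# simplified stand-in for the repo's residual block: two 3x3 convs plus a projected skip
class Residual(nn.Module):
    def __init__(self, inp_dim, out_dim, stride=1):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(inp_dim, out_dim, 3, padding=1, stride=stride, bias=False),
            nn.BatchNorm2d(out_dim), nn.ReLU(inplace=True),
            nn.Conv2d(out_dim, out_dim, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_dim))
        self.skip = nn.Sequential(
            nn.Conv2d(inp_dim, out_dim, 1, stride=stride, bias=False),
            nn.BatchNorm2d(out_dim)) if stride != 1 or inp_dim != out_dim else nn.Sequential()

    def forward(self, x):
        return nn.functional.relu(self.conv(x) + self.skip(x))

layer = nn.Sequential(Residual(256, 384, stride=2), Residual(384, 384))
x = torch.randn(1, 256, 128, 128)
print(layer(x).shape)  # torch.Size([1, 384, 64, 64]): size halved, channels changed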
class model(hg_net):
    # inherits hg_net, so all of hg_net's definitions can be called directly;
    # this `model` class is the complete CornerNet structure
    def _pred_mod(self, dim):
        # prediction head: a 3x3 conv block, then a 1x1 conv that raises or
        # lowers the channel count to dim
        return nn.Sequential(
            convolution(3, 256, 256, with_bn=False),
            nn.Conv2d(256, dim, (1, 1))
        )

    def _merge_mod(self):
        return nn.Sequential(
            # a 1x1 conv mapping 256 channels to 256 channels (no bias, BN follows)
            nn.Conv2d(256, 256, (1, 1), bias=False),
            nn.BatchNorm2d(256)
        )
    def __init__(self):
        stacks = 2  # two hourglass networks stacked together
        pre = nn.Sequential(
            convolution(7, 3, 128, stride=2),
            residual(128, 256, stride=2)
        )
        # given a (B, N, W, H) input (B = batch, N = channels, W, H = feature-map size),
        # this pre module "warms up" the input: the two stride-2 stages shrink the
        # 511x511 image to 128x128 and bring the channel count to 256, so that it
        # connects cleanly to the hg_net module. hg_net then raises the channels from
        # 256 up to 512 and back down to 256 (all of this operates on the feature-map
        # channels: the more channels, the smaller the map, and vice versa -- hence
        # the name "hourglass")
        hg_mods = nn.ModuleList([
            hg_module(
                5, [256, 256, 384, 384, 384, 512], [2, 2, 2, 2, 2, 4],
                make_pool_layer=make_pool_layer,
                make_hg_layer=make_hg_layer
            ) for _ in range(stacks)  # stack the hourglass module twice
        ])

        cnvs    = nn.ModuleList([convolution(3, 256, 256) for _ in range(stacks)])
        inters  = nn.ModuleList([residual(256, 256) for _ in range(stacks - 1)])
        cnvs_   = nn.ModuleList([self._merge_mod() for _ in range(stacks - 1)])
        inters_ = nn.ModuleList([self._merge_mod() for _ in range(stacks - 1)])
        # cnvs, inters, cnvs_, inters_ simply bundle these operations into ModuleLists

        hgs = hg(pre, hg_mods, cnvs, inters, cnvs_, inters_)
        # this builds the hourglass backbone; note again that hg_mods holds 2 hourglasses
        tl_modules = nn.ModuleList([corner_pool(256, TopPool, LeftPool) for _ in range(stacks)])
        br_modules = nn.ModuleList([corner_pool(256, BottomPool, RightPool) for _ in range(stacks)])

        tl_heats = nn.ModuleList([self._pred_mod(80) for _ in range(stacks)])  # 80 = COCO categories
        br_heats = nn.ModuleList([self._pred_mod(80) for _ in range(stacks)])
        # tl_modules, br_modules, tl_heats, br_heats extract information from the feature
        # maps to build the predictions (pred is the prediction; target, i.e. ground truth,
        # is the label after encoding)
        for tl_heat, br_heat in zip(tl_heats, br_heats):
            torch.nn.init.constant_(tl_heat[-1].bias, -2.19)
            torch.nn.init.constant_(br_heat[-1].bias, -2.19)
        # -2.19 = -log((1 - 0.1) / 0.1), the RetinaNet-style prior that starts the
        # heatmaps near probability 0.1

        tl_tags = nn.ModuleList([self._pred_mod(1) for _ in range(stacks)])
        br_tags = nn.ModuleList([self._pred_mod(1) for _ in range(stacks)])
        tl_offs = nn.ModuleList([self._pred_mod(2) for _ in range(stacks)])
        br_offs = nn.ModuleList([self._pred_mod(2) for _ in range(stacks)])
        # same idea: tl_tags/br_tags are the 1-d embeddings, tl_offs/br_offs the 2-d offsets
        super(model, self).__init__(
            hgs, tl_modules, br_modules, tl_heats, br_heats,
            tl_tags, br_tags, tl_offs, br_offs
        )
        # super() runs the parent hg_net's __init__ so its attributes are inherited

        self.loss = CornerNet_Loss(pull_weight=1e-1, push_weight=1e-1)
        # the loss
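As a sanity check on the shape comments above, here is a small sketch. conv_bn_relu is a hypothetical stand-in for the repo's convolution module (assumed to be conv + BN + ReLU with pad = (k - 1) // 2), and the second stage approximates the stride-2 residual by its 3x3 stride-2 conv, since only the spatial arithmetic matters here:

import torch
import torch.nn as nn

# stand-in for the repo's convolution module (assumption: conv + BN + ReLU, pad = (k - 1) // 2)
def conv_bn_relu(k, inp, out, stride=1):
    return nn.Sequential(
        nn.Conv2d(inp, out, k, padding=(k - 1) // 2, stride=stride, bias=False),
        nn.BatchNorm2d(out), nn.ReLU(inplace=True))

pre = nn.Sequential(
    conv_bn_relu(7, 3, 128, stride=2),    # 511x511 -> 256x256
    conv_bn_relu(3, 128, 256, stride=2))  # 256x256 -> 128x128, channels to 256
print(pre(torch.randn(1, 3, 511, 511)).shape)  # torch.Size([1, 256, 128, 128])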
CornerNet_Saccade structure
This is the improved version of CornerNet. The changes: it uses three simplified hourglass networks, plus an attention mechanism. I won't walk through the rest of the source, since it differs little from CornerNet.
The attention mechanism splits the objects to detect into large, medium, and small by the longer side of the box (the ranges 96-256, 32-96, and 0-32 below). Here the labels are encoded into target form, and the network has to produce the matching attention predictions; these come from the att_mods module defined in model/CornerNet_Saccade.py. The split is tied to the position of each feature-map level, and ratio is the downscale factor of that map relative to the input.

import numpy as np

def create_attention_mask(atts, ratios, sizes, detections):
    for det in detections:
        width  = det[2] - det[0]
        height = det[3] - det[1]

        max_hw = max(width, height)
        for att, ratio, size in zip(atts, ratios, sizes):
            # atts: the att_map sizes, roughly [[16, 16], [32, 32], [64, 64]] -- I don't
            # remember exactly, debug to confirm, but it should be about right
            # ratios: [16, 8, 4], the downscale factor of each intermediate map
            # relative to the input image
            # sizes: [[96, 256], [32, 96], [0, 32]], the ranges that route objects:
            # e.g. a large object (longer side in [96, 256]) lands on the 16x16 map
            if max_hw >= size[0] and max_hw <= size[1]:
                x = (det[0] + det[2]) / 2
                y = (det[1] + det[3]) / 2
                x = (x / ratio).astype(np.int32)
                y = (y / ratio).astype(np.int32)
                att[y, x] = 1
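A quick usage sketch of create_attention_mask; the three maps, ratios, and size ranges mirror the comments above, and the single detection box is made up:

import numpy as np

atts   = [np.zeros((16, 16)), np.zeros((32, 32)), np.zeros((64, 64))]
ratios = [16, 8, 4]
sizes  = [[96, 256], [32, 96], [0, 32]]
# one (x0, y0, x1, y1) box with longer side 40 -> medium object -> the 32x32 map
detections = np.array([[100., 100., 140., 140.]], dtype=np.float32)

create_attention_mask(atts, ratios, sizes, detections)
print(atts[1][15, 15])  # 1.0: the center (120, 120) / 8 lands at (15, 15) on the stride-8 map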
Here stacks = 3, and attention = [att_mods1, att_mods2, att_mods3]: three att_mod modules, each attached to the up layers of one of the three stacked hourglasses (each module has 3 up outputs, so 9 attention maps in total).
att_mods = nn.ModuleList([
    nn.ModuleList([
        nn.Sequential(
            convolution(3, 384, 256, with_bn=False),  # attached after the 512 -> 384 up layer
            nn.Conv2d(256, 1, (1, 1))
        ),
        nn.Sequential(
            convolution(3, 384, 256, with_bn=False),  # attached after the 384 -> 384 up layer
            nn.Conv2d(256, 1, (1, 1))
        ),
        nn.Sequential(
            convolution(3, 256, 256, with_bn=False),  # attached after the 384 -> 256 up layer
            nn.Conv2d(256, 1, (1, 1))
        )
    ]) for _ in range(stacks)
])
The line below lives in the saccade_net class in modules.py. It attaches each att_mod to the up outputs of the hourglass. What are the ups? hg_modules here is a saccade module, and ups collects the up-layer outputs it returns: ups = [all up outputs of hourglass 1, all up outputs of hourglass 2, all up outputs of hourglass 3]. Note each hourglass has 3 up layers, i.e. the upsampling steps from the deepest 512 -> 384, then 384 -> 384, then 384 -> 256, each producing one up output (see the comments on att_mods above).
atts = [[att_mod_(u) for att_mod_, u in zip(att_mods, up)] for att_mods, up in zip(self.att_modules, ups)]
class saccade(nn.Module):
    def __init__(self, pre, hg_modules, cnvs, inters, cnvs_, inters_):
        super(saccade, self).__init__()

        self.pre     = pre
        self.hgs     = hg_modules
        self.cnvs    = cnvs
        self.inters  = inters
        self.inters_ = inters_
        self.cnvs_   = cnvs_

    def forward(self, x):
        inter = self.pre(x)

        cnvs = []
        atts = []
        for ind, (hg_, cnv_) in enumerate(zip(self.hgs, self.cnvs)):
            hg, ups = hg_(inter)
            cnv = cnv_(hg)
            cnvs.append(cnv)
            atts.append(ups)

            if ind < len(self.hgs) - 1:
                inter = self.inters_[ind](inter) + self.cnvs_[ind](cnv)
                inter = nn.functional.relu_(inter)
                inter = self.inters[ind](inter)
        return cnvs, atts
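To trace what forward returns, here is a toy sketch that runs the saccade class above with identity modules; ToyHG is a hypothetical stand-in that mimics a saccade hourglass by returning its input plus three fake up maps:

import torch
import torch.nn as nn

class ToyHG(nn.Module):
    def forward(self, x):
        # a real saccade hourglass returns (feature, list_of_up_outputs)
        return x, [torch.randn(1, 384, 4, 4), torch.randn(1, 384, 8, 8), torch.randn(1, 256, 16, 16)]

stacks = 3
net = saccade(
    pre=nn.Identity(),
    hg_modules=nn.ModuleList([ToyHG() for _ in range(stacks)]),
    cnvs=nn.ModuleList([nn.Identity() for _ in range(stacks)]),
    inters=nn.ModuleList([nn.Identity() for _ in range(stacks - 1)]),
    cnvs_=nn.ModuleList([nn.Identity() for _ in range(stacks - 1)]),
    inters_=nn.ModuleList([nn.Identity() for _ in range(stacks - 1)]),
)
cnvs, atts = net(torch.randn(1, 256, 16, 16))
print(len(cnvs), len(atts), len(atts[0]))  # 3 3 3: three stacks, three up maps per stack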
CornerNet_Squeeze structure
This is the lightweight network of the family. The two networks above are very large and expensive to train; training on a single GPU is basically infeasible. CornerNet_Squeeze's improvement is to replace the residual module with a fire_module, a structure taken from the lightweight SqueezeNet architecture (see the SqueezeNet v1 and v2 papers).
It is a clever substitution. First look at the residual module:
class residual(nn.Module):
    def __init__(self, inp_dim, out_dim, k=3, stride=1):
        super(residual, self).__init__()
        p = (k - 1) // 2

        self.conv1 = nn.Conv2d(inp_dim, out_dim, (k, k), padding=(p, p), stride=(stride, stride), bias=False)
        self.bn1   = nn.BatchNorm2d(out_dim)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(out_dim, out_dim, (k, k), padding=(p, p), bias=False)
        self.bn2   = nn.BatchNorm2d(out_dim)

        self.skip = nn.Sequential(
            nn.Conv2d(inp_dim, out_dim, (1, 1), stride=(stride, stride), bias=False),
            nn.BatchNorm2d(out_dim)
        ) if stride != 1 or inp_dim != out_dim else nn.Sequential()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        conv1 = self.conv1(x)
        bn1   = self.bn1(conv1)
        relu1 = self.relu1(bn1)

        conv2 = self.conv2(relu1)
        bn2   = self.bn2(conv2)

        skip = self.skip(x)
        return self.relu(bn2 + skip)
The fire_module structure is as follows:
class fire_module(nn.Module):
    def __init__(self, inp_dim, out_dim, sr=2, stride=1):
        super(fire_module, self).__init__()
        self.conv1 = nn.Conv2d(inp_dim, out_dim // sr, kernel_size=1, stride=1, bias=False)
        self.bn1   = nn.BatchNorm2d(out_dim // sr)
        self.conv_1x1 = nn.Conv2d(out_dim // sr, out_dim // 2, kernel_size=1, stride=stride, bias=False)
        # groups=out_dim // sr is a grouped convolution (sr must divide out_dim): the
        # channels are split into groups that are convolved independently, which saves a
        # lot of computation -- a kernel no longer convolves over all input channels.
        # torch.cat below stacks the two halves back to the full out_dim channel count.
        self.conv_3x3 = nn.Conv2d(out_dim // sr, out_dim // 2, kernel_size=3, padding=1,
                                  stride=stride, groups=out_dim // sr, bias=False)
        self.bn2  = nn.BatchNorm2d(out_dim)
        self.skip = (stride == 1 and inp_dim == out_dim)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        conv1 = self.conv1(x)
        bn1   = self.bn1(conv1)
        conv2 = torch.cat((self.conv_1x1(bn1), self.conv_3x3(bn1)), 1)
        bn2   = self.bn2(conv2)
        if self.skip:
            return self.relu(bn2 + x)
        else:
            return self.relu(bn2)
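To see why the swap pays off, here is a quick comparison (using the residual and fire_module classes defined above) of parameter counts at 256 channels; the 1x1 squeeze plus the grouped 3x3 cuts the parameters by more than an order of magnitude while keeping the output shape:

import torch

res  = residual(256, 256)
fire = fire_module(256, 256)
count = lambda m: sum(p.numel() for p in m.parameters())
print(count(res), count(fire))  # roughly 1.18M vs about 0.05M parameters

x = torch.randn(1, 256, 64, 64)
print(res(x).shape, fire(x).shape)  # both keep torch.Size([1, 256, 64, 64])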
Building the ground truth
These are the files under sample/, using cornernet as the example:
# a batch of images; shuffling, random cropping, and flipping all happen here
images = np.zeros((batch_size, 3, input_size[0], input_size[1]), dtype=np.float32)
# top-left corner heatmaps
tl_heatmaps = np.zeros((batch_size, categories, output_size[0], output_size[1]), dtype=np.float32)
# bottom-right corner heatmaps
br_heatmaps = np.zeros((batch_size, categories, output_size[0], output_size[1]), dtype=np.float32)
# top-left coordinate offsets: the sub-pixel offset between a ground-truth box coordinate
# on the final output_size (64, 64) map and the same coordinate in the input_size
# (511, 511) image; these supervise the precision lost by downsampling
tl_regrs = np.zeros((batch_size, max_tag_len, 2), dtype=np.float32)
# bottom-right offsets
br_regrs = np.zeros((batch_size, max_tag_len, 2), dtype=np.float32)
# tl_tags[b_ind, tag_ind] = ytl * output_size[1] + xtl: the top-left corner's flattened
# position on the (64, 64) output map
tl_tags = np.zeros((batch_size, max_tag_len), dtype=np.int64)
# same as above
br_tags = np.zeros((batch_size, max_tag_len), dtype=np.int64)
# tag_masks[b_ind, :tag_len] = 1, where b_ind is the image index within the batch and
# tag_len is the number of (tl, br) pairs in that image
tag_masks = np.zeros((batch_size, max_tag_len), dtype=np.uint8)
# how many (tl, br) pairs each image in the batch has
tag_lens = np.zeros((batch_size, ), dtype=np.int32)
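A hedged sketch of the encode step for a single box (the corner coordinates are made up; the names mirror the arrays above). An input coordinate is scaled to the (64, 64) output map, floored to an integer cell, and the lost fraction becomes the regression target while the flattened cell index becomes the tag:

import numpy as np

width_ratio, height_ratio = 64 / 511, 64 / 511   # output map / input size
xtl, ytl = 100., 150.                            # a made-up top-left corner in input coords

fxtl, fytl = xtl * width_ratio, ytl * height_ratio
ixtl, iytl = int(fxtl), int(fytl)                # integer cell on the 64x64 map

b_ind, tag_ind = 0, 0
tl_regrs = np.zeros((4, 128, 2), dtype=np.float32)
tl_tags  = np.zeros((4, 128), dtype=np.int64)
tl_regrs[b_ind, tag_ind] = [fxtl - ixtl, fytl - iytl]  # sub-pixel offset lost by flooring
tl_tags[b_ind, tag_ind]  = iytl * 64 + ixtl            # flattened position, as in the comment above
print(tl_regrs[0, 0], tl_tags[0, 0])                   # roughly [0.52 0.79] and 1164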
Heatmaps
# radius comes from radius = gaussian_radius((height, width), gaussian_iou)
# the heatmap is rendered with draw_gaussian(tl_heatmaps[b_ind, category], [xtl, ytl], radius),
# which writes into tl_heatmaps through a view (shallow copy) -- see the last lines below
import numpy as np

def gaussian2D(shape, sigma=1):
    m, n = [(ss - 1.) / 2. for ss in shape]
    y, x = np.ogrid[-m:m+1, -n:n+1]

    h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
    h[h < np.finfo(h.dtype).eps * h.max()] = 0
    return h

def draw_gaussian(heatmap, center, radius, k=1):
    diameter = 2 * radius + 1
    gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)

    x, y = center

    height, width = heatmap.shape[0:2]

    left, right = min(x, radius), min(width - x, radius + 1)
    top, bottom = min(y, radius), min(height - y, radius + 1)

    masked_heatmap  = heatmap[y - top:y + bottom, x - left:x + right]  # a view into heatmap (shallow copy)
    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
    # writing through out=masked_heatmap mutates heatmap in place
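A quick check of draw_gaussian on an empty map (center and radius made up):

import numpy as np

heatmap = np.zeros((64, 64), dtype=np.float32)
draw_gaussian(heatmap, (30, 20), 3)  # center (x=30, y=20), radius 3
print(heatmap[20, 30])  # 1.0 at the center (note heatmap is indexed [y, x])
print(heatmap[20, 33])  # ~0.04: decays with distance; heatmap was mutated in place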
focal loss
This is a classic piece of code; preds and gt here are generic and reusable.
preds has the form (N, W); gt has the form (1, W).
The focal-loss code follows RetinaNet:
def _focal_loss(preds, gt):
    pos_inds = gt.eq(1)
    neg_inds = gt.lt(1)

    neg_weights = torch.pow(1 - gt[neg_inds], 4)

    loss = 0
    for pred in preds:
        pos_pred = pred[pos_inds]
        neg_pred = pred[neg_inds]

        pos_loss = torch.log(pos_pred) * torch.pow(1 - pos_pred, 2)
        neg_loss = torch.log(1 - neg_pred) * torch.pow(neg_pred, 2) * neg_weights

        num_pos  = pos_inds.float().sum()
        pos_loss = pos_loss.sum()
        neg_loss = neg_loss.sum()

        if pos_pred.nelement() == 0:
            loss = loss - neg_loss
        else:
            loss = loss - (pos_loss + neg_loss) / num_pos
    return loss
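A toy run of _focal_loss (values made up): one location is a true corner (gt exactly 1), its neighbor carries a soft gaussian label 0.8 that down-weights the negative term via (1 - gt)^4:

import torch

gt   = torch.tensor([[1.0, 0.8, 0.1, 0.0]])   # 1 positive, 3 (soft) negatives
pred = torch.tensor([[0.9, 0.7, 0.2, 0.1]])   # a fairly good prediction
print(_focal_loss([pred], gt))                # small scalar loss; preds is a list, one entry per stack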
pull and push
pull and push are "unsupervised" in the sense that they have no explicit labels: an image predicts many tl and br corners, and which matches which follows a single principle, pull matching pairs as close together as possible and push different objects as far apart as possible. The code is a bit abstract; the idea comes from the paper "Pixels to graphs by associative embedding".
tl_tags = [_tranpose_and_gather_feat(tl_tag, gt_tl_ind) for tl_tag in tl_tags]
br_tags = [_tranpose_and_gather_feat(br_tag, gt_br_ind) for br_tag in br_tags]
# alignment: these two lines align the tl_tag embeddings produced on the final (64, 64)
# feature map with the dimensions of the label indices gt_tl_ind
def _gather_feat(feat, ind, mask=None):
    dim  = feat.size(2)
    ind  = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
    feat = feat.gather(1, ind)
    if mask is not None:
        mask = mask.unsqueeze(2).expand_as(feat)
        feat = feat[mask]
        feat = feat.view(-1, dim)
    return feat

def _tranpose_and_gather_feat(feat, ind):
    feat = feat.permute(0, 2, 3, 1).contiguous()
    feat = feat.view(feat.size(0), -1, feat.size(3))
    feat = _gather_feat(feat, ind)
    return feat
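A toy check of the alignment (shapes made up): feat plays the role of a (B, C, H, W) tag map and ind holds the flattened ground-truth positions y * W + x, exactly the tl_tags encoding above:

import torch

feat = torch.arange(2 * 1 * 4 * 4, dtype=torch.float32).view(2, 1, 4, 4)  # B=2, C=1, 4x4 map
ind  = torch.tensor([[5], [10]])  # one corner per image: (y=1, x=1) and (y=2, x=2)
out  = _tranpose_and_gather_feat(feat, ind)
print(out.shape)                  # torch.Size([2, 1, 1]): (B, max_tags, C)
print(out[0, 0, 0].item(), out[1, 0, 0].item())  # 5.0 and 26.0, i.e. feat[0,0,1,1] and feat[1,0,2,2]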
def _ae_loss(tag0, tag1, mask):
    num  = mask.sum(dim=1, keepdim=True).float()

    tag0 = tag0.squeeze()
    tag1 = tag1.squeeze()

    tag_mean = (tag0 + tag1) / 2

    tag0 = torch.pow(tag0 - tag_mean, 2) / (num + 1e-4)
    tag0 = tag0[mask].sum()
    tag1 = torch.pow(tag1 - tag_mean, 2) / (num + 1e-4)
    tag1 = tag1[mask].sum()
    pull = tag0 + tag1

    mask = mask.unsqueeze(1) + mask.unsqueeze(2)
    mask = mask.eq(2)
    num  = num.unsqueeze(2)
    num2 = (num - 1) * num
    dist = tag_mean.unsqueeze(1) - tag_mean.unsqueeze(2)
    dist = 1 - torch.abs(dist)
    dist = nn.functional.relu(dist, inplace=True)
    dist = dist - 1 / (num + 1e-4)
    dist = dist / (num2 + 1e-4)
    dist = dist[mask]
    push = dist.sum()
    return pull, push
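A toy run of _ae_loss (values made up). Two images with two objects each; every object's tl and br embeddings already agree, so pull is 0, while in image 1 the two object means (2.0 and 2.5) are closer than the margin of 1, so push comes out positive. The mask is a byte tensor, matching the np.uint8 dtype of tag_masks above (newer PyTorch warns on uint8 indexing but still treats it as a boolean mask):

import torch

tl_tag = torch.tensor([[[1.0], [5.0]], [[2.0], [2.5]]])  # (B=2, max_objs=2, 1)
br_tag = torch.tensor([[[1.0], [5.0]], [[2.0], [2.5]]])  # identical to tl_tag -> pull = 0
mask   = torch.ones(2, 2, dtype=torch.uint8)

pull, push = _ae_loss(tl_tag, br_tag, mask)
print(pull)  # 0: every tl/br pair already shares its mean
print(push)  # > 0: image 1's object means are only 0.5 apart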
To be continued...