class VGGBase(nn.Module):
    """
    VGG base convolutions to produce feature maps.

    Contains the 13 convolutional layers and 5 max-pooling layers of VGG-16
    (no fully connected layers). Each pooling layer halves the spatial size,
    so a 224x224 input yields a 7x7 feature map with 512 channels.
    """

    def __init__(self, pretrained=False):
        """
        :param pretrained: if True, copy ImageNet-pretrained VGG-16 weights into the
            convolutions right after they are created (see load_pretrained_layers).
            Defaults to False, in which case the layers keep PyTorch's default
            initialization until load_pretrained_layers() is called explicitly.
        """
        super(VGGBase, self).__init__()

        # Standard convolutional layers in VGG16
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)  # stride = 1, by default
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # 224->112

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # 112->56

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)  # 56->28

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)  # 28->14

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=2, stride=2)  # 14->7

        # Load pretrained weights on ImageNet (opt-in, since it may download them)
        if pretrained:
            self.load_pretrained_layers()

    def forward(self, image):
        """
        Forward propagation.

        :param image: images, a tensor of dimensions (N, 3, 224, 224)
        :return: feature maps pool5, a tensor of dimensions (N, 512, 7, 7)
        """
        out = F.relu(self.conv1_1(image))  # (N, 64, 224, 224)
        out = F.relu(self.conv1_2(out))  # (N, 64, 224, 224)
        out = self.pool1(out)  # (N, 64, 112, 112)

        out = F.relu(self.conv2_1(out))  # (N, 128, 112, 112)
        out = F.relu(self.conv2_2(out))  # (N, 128, 112, 112)
        out = self.pool2(out)  # (N, 128, 56, 56)

        out = F.relu(self.conv3_1(out))  # (N, 256, 56, 56)
        out = F.relu(self.conv3_2(out))  # (N, 256, 56, 56)
        out = F.relu(self.conv3_3(out))  # (N, 256, 56, 56)
        out = self.pool3(out)  # (N, 256, 28, 28)

        out = F.relu(self.conv4_1(out))  # (N, 512, 28, 28)
        out = F.relu(self.conv4_2(out))  # (N, 512, 28, 28)
        out = F.relu(self.conv4_3(out))  # (N, 512, 28, 28)
        out = self.pool4(out)  # (N, 512, 14, 14)

        out = F.relu(self.conv5_1(out))  # (N, 512, 14, 14)
        out = F.relu(self.conv5_2(out))  # (N, 512, 14, 14)
        out = F.relu(self.conv5_3(out))  # (N, 512, 14, 14)
        out = self.pool5(out)  # (N, 512, 7, 7)

        # return 7*7 feature map
        return out

    def load_pretrained_layers(self, pretrained_state_dict=None):
        """
        Copy pretrained VGG-16 parameters into this network.

        We use a VGG-16 pretrained on the ImageNet task as the base network; one is
        available through torchvision. Parameters are matched by position: the i-th
        entry of this module's state dict receives the i-th entry of the pretrained
        state dict, which is straightforward for conv1 to conv5. Any further
        pretrained entries are ignored.

        :param pretrained_state_dict: optional state dict to copy from, e.g. one read
            from a local checkpoint. When None, the ImageNet-pretrained VGG-16 from
            torchvision is used.
        """
        if pretrained_state_dict is None:
            # Pretrained VGG base
            # NOTE(review): recent torchvision releases prefer the `weights=` argument over
            # `pretrained=True`; kept as-is for the torchvision version this file targets.
            pretrained_state_dict = torchvision.models.vgg16(pretrained=True).state_dict()
        pretrained_param_names = list(pretrained_state_dict.keys())

        # Current state of base
        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # Transfer conv. parameters from pretrained model to current model
        for i, param in enumerate(param_names):
            state_dict[param] = pretrained_state_dict[pretrained_param_names[i]]

        # Rebinding entries of `state_dict` only changes that dictionary; the module's
        # own parameters are updated only when the dictionary is loaded back.
        self.load_state_dict(state_dict)

        print("\nLoaded base model.\n")
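# Notes on prior-box generation:
# 1. Grid resolution fmap_dims = 7: the final VGG16 feature map is 7*7.
# 2. The example above assumed three prior-box sizes and then iterated over the coordinates. During prior-box generation, the sizes of the prior boxes are fixed in advance.
# 0. cx, cy denote the coordinates of the box center.
# 1. Iterate over every cell of the feature map; i + 0.5 moves from the cell's grid coordinate to the cell center, and dividing by fmap_dims normalizes the coordinate over the feature map.
# 2. At this point one box could already be generated per cell, but that is not what we need; we call it the base_prior_bbox (base box).
# 3. Starting from the 1:1 aspect-ratio base box of each cell, combining the 3 scales in obj_scales with the 3 aspect ratios in aspect_ratios gives the 9 prior boxes of each cell.
# 4. The final result is stored in prior_boxes and returned.
# img_prior_boxes = prior_boxes * image size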
def create_prior_boxes():
    """
    Create the 441 prior (default) boxes for the network, as described in the tutorial.

    The final VGG16 feature map is 7*7 and every cell receives 9 boxes
    (3 scales x 3 aspect ratios), so the total is 7 * 7 * 9 = 441.
    Boxes are ordered row by row, then column by column, then by scale, then by ratio.

    :return: prior boxes in center-size coordinates, a tensor of dimensions (441, 4),
        with every value clamped to [0, 1]
    """
    grid_size = 7
    scales = [0.2, 0.4, 0.6]
    ratios = [1., 2., 0.5]

    # (width, height) of the 9 boxes that every cell shares
    box_shapes = [(scale * sqrt(ratio), scale / sqrt(ratio))
                  for scale in scales
                  for ratio in ratios]

    boxes = []
    for row in range(grid_size):
        # + 0.5 moves to the cell center; dividing by the grid size normalizes to [0, 1]
        center_y = (row + 0.5) / grid_size
        for col in range(grid_size):
            center_x = (col + 0.5) / grid_size
            boxes.extend([center_x, center_y, width, height] for width, height in box_shapes)

    boxes = torch.FloatTensor(boxes).to(device)  # (441, 4)
    boxes.clamp_(0, 1)  # (441, 4)

    return boxes
def cxcy_to_gcxgcy(cxcy, priors_cxcy):
    """
    Encode bounding boxes (that are in center-size form) w.r.t. the corresponding prior boxes (that are in center-size form).

    For the center coordinates, find the offset with respect to the prior box, and scale by the size of the prior box.
    For the size coordinates, scale by the size of the prior box, and convert to the log-space.

    In the model, we are predicting bounding box coordinates in this encoded form.

    :param cxcy: bounding boxes in center-size coordinates, a tensor of size (n_priors, 4)
    :param priors_cxcy: prior boxes with respect to which the encoding must be performed, a tensor of size (n_priors, 4)
    :return: encoded bounding boxes, a tensor of size (n_priors, 4)
    """
    # The 10 and 5 below are referred to as 'variances' in the original SSD Caffe repo, completely empirical
    # They are for some sort of numerical conditioning, for 'scaling the localization gradient'
    return torch.cat([(cxcy[:, :2] - priors_cxcy[:, :2]) / (priors_cxcy[:, 2:] / 10),  # g_c_x, g_c_y
                      torch.log(cxcy[:, 2:] / priors_cxcy[:, 2:]) * 5], 1)  # g_w, g_h
def gcxgcy_to_cxcy(gcxgcy, priors_cxcy):
    """
    Decode bounding box coordinates predicted by the model, which are encoded as offsets
    relative to the prior boxes.

    They are decoded into center-size coordinates: centers are rescaled by the prior size
    (divided by 10) and shifted by the prior center; sizes are exponentiated (after dividing
    by 5) and multiplied by the prior size.

    This is the inverse of cxcy_to_gcxgcy.

    :param gcxgcy: encoded bounding boxes, i.e. output of the model, a tensor of size (n_priors, 4)
    :param priors_cxcy: prior boxes with respect to which the encoding is defined, a tensor of size (n_priors, 4)
    :return: decoded bounding boxes in center-size form, a tensor of size (n_priors, 4)
    """
    return torch.cat([gcxgcy[:, :2] * priors_cxcy[:, 2:] / 10 + priors_cxcy[:, :2],  # c_x, c_y
                      torch.exp(gcxgcy[:, 2:] / 5) * priors_cxcy[:, 2:]], 1)  # w, h
class PredictionConvolutions(nn.Module):
    """
    Convolutions to predict class scores and bounding boxes using feature maps.

    The bounding boxes (locations) are predicted as encoded offsets w.r.t each of the 441 prior (default) boxes.
    See 'cxcy_to_gcxgcy' for the encoding definition.

    The class scores represent the scores of each object class in each of the 441 bounding boxes located.
    A high score for 'background' = no object.
    """

    def __init__(self, n_classes):
        """
        :param n_classes: number of different types of objects
        """
        super(PredictionConvolutions, self).__init__()

        self.n_classes = n_classes

        # Number of prior-boxes we are considering per position in the feature map
        # (9 boxes per position, e.g. 3 scales x 3 aspect ratios)
        n_boxes = 9

        # Localization prediction convolutions (predict offsets w.r.t prior-boxes)
        self.loc_conv = nn.Conv2d(512, n_boxes * 4, kernel_size=3, padding=1)

        # Class prediction convolutions (predict classes in localization boxes)
        self.cl_conv = nn.Conv2d(512, n_boxes * n_classes, kernel_size=3, padding=1)

        # Initialize convolutions' parameters
        self.init_conv2d()

    def init_conv2d(self):
        """
        Initialize convolution parameters: the bias of every child Conv2d is set to zero.
        """
        for c in self.children():
            if isinstance(c, nn.Conv2d):
                nn.init.constant_(c.bias, 0.)

    def forward(self, pool5_feats):
        """
        Forward propagation.

        :param pool5_feats: pool5 feature map, a tensor of dimensions (N, 512, 7, 7)
        :return: 441 locations and class scores (i.e. w.r.t each prior box) for each image,
            tensors of dimensions (N, 441, 4) and (N, 441, n_classes)
        """
        batch_size = pool5_feats.size(0)

        # Predict localization boxes' bounds (as offsets w.r.t prior-boxes)
        l_conv = self.loc_conv(pool5_feats)  # (N, n_boxes * 4, 7, 7)
        # Move channels last so that, after .view(), boxes are ordered cell by cell, matching the prior-box order
        # (.contiguous() ensures it is stored in a contiguous chunk of memory, needed for .view() below)
        l_conv = l_conv.permute(0, 2, 3, 1).contiguous()  # (N, 7, 7, n_boxes * 4)
        locs = l_conv.view(batch_size, -1, 4)  # (N, 441, 4), there are a total 441 boxes on this feature map

        # Predict classes in localization boxes
        c_conv = self.cl_conv(pool5_feats)  # (N, n_boxes * n_classes, 7, 7)
        c_conv = c_conv.permute(0, 2, 3, 1).contiguous()  # (N, 7, 7, n_boxes * n_classes), to match prior-box order (after .view())
        classes_scores = c_conv.view(batch_size, -1, self.n_classes)  # (N, 441, n_classes), there are a total 441 boxes on this feature map

        return locs, classes_scores