Mask R-CNN explained in detail

I. Before studying Mask R-CNN, it is recommended to first review Faster R-CNN (see the Faster R-CNN code walkthrough).

Key techniques in Mask R-CNN:

1. Multi-scale detection built on FPN (the Feature Pyramid Network); a similar multi-scale idea also appears in YOLOv3

2. RPN

3. ROI Align

II. For a systematic walkthrough of Mask R-CNN, see the video explanation on Bilibili.

III. Annotated code

model.py

"""
Mask R-CNN
The main Mask R-CNN model implementation.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""

import datetime
import math
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

import utils
import visualize
#from nms.nms_wrapper import nms
from roialign.roi_align.crop_and_resize import CropAndResizeFunction

############################################################
# nms
############################################################
# boxes=np.array([[100,100,210,210,0.72],
        # [250,250,420,420,0.8],
        # [220,220,320,330,0.92],
        # [100,100,210,210,0.72],
        # [230,240,325,330,0.81],
        # [220,230,315,340,0.9]]) 
def nms(dets, thresh):
    # dets: (m,5)  thresh: scalar
    x1 = dets[:,0]
    y1 = dets[:,1]
    x2 = dets[:,2]
    y2 = dets[:,3]
    areas = (y2-y1+1) * (x2-x1+1)
    scores = dets[:,4]
    keep = []
    index = scores.argsort()[::-1]
    while index.size >0:
        i = index[0]       # the first index always has the highest remaining score; keep it directly
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])    # calculate the points of overlap 
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22-x11+1)    # width of the overlap
        h = np.maximum(0, y22-y11+1)    # height of the overlap
        overlaps = w*h
        ious = overlaps / (areas[i]+areas[index[1:]] - overlaps)
        idx = np.where(ious<=thresh)[0]
        index = index[idx+1]   # offset by +1 because ious was computed against index[1:]
    return keep
# import matplotlib.pyplot as plt
# def plot_bbox(dets, c='k'):
    
    # x1 = dets[:,0]
    # y1 = dets[:,1]
    # x2 = dets[:,2]
    # y2 = dets[:,3]
    
    # plt.plot([x1,x2], [y1,y1], c)
    # plt.plot([x1,x1], [y1,y2], c)
    # plt.plot([x1,x2], [y2,y2], c)
    # plt.plot([x2,x2], [y1,y2], c)
    # plt.title("after nms")  
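
# A minimal usage sketch of the nms() above, using the sample `boxes` array from the comment
# block before the function (the threshold value is illustrative only):
# keep = nms(boxes, thresh=0.7)
# print(keep)         # -> [2, 1, 0]: box 2 suppresses boxes 4 and 5, box 0 suppresses its duplicate box 3
# print(boxes[keep])  # the surviving boxes, highest score first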

############################################################
#  Logging Utility Functions
############################################################

def log(text, array=None):
    """Prints a text message. And, optionally, if a Numpy array is provided it
    prints its shape, min, and max values.
    if array is not None  shape: (m,n)
    """
    if array is not None:
        text = text.ljust(25) # left-justify the string, padding it with spaces up to the given width;
                              # if the width is smaller than the original length, the string is returned unchanged
        text += ("shape: {:20}  min: {:10.5f}  max: {:10.5f}".format(
            str(array.shape), # turn the shape tuple into a string: (m,n) -> '(m,n)'
            array.min() if array.size else "",  # array.size is the number of elements (m*n); min()/max() are the extrema
            array.max() if array.size else ""))
    print(text)

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\n')
    # Print New Line on Complete
    if iteration == total:
        print()
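
# A minimal usage sketch (illustrative values): call printProgressBar once per step inside a
# loop so the bar is redrawn as `iteration` advances towards `total`.
# total = 50
# for i in range(total + 1):
#     printProgressBar(i, total, prefix='Progress:', suffix='Complete', length=40)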


############################################################
#  Pytorch Utility Functions
############################################################

def unique1d(tensor):
    if tensor.size()[0] == 0 or tensor.size()[0] == 1:
        return tensor
    tensor = tensor.sort()[0]  # sort the values in ascending order
    unique_bool = tensor[1:] != tensor[:-1]  # compare each element with its predecessor; True marks a new value
    first_element = Variable(torch.ByteTensor([True]), requires_grad=False).bool()
    if tensor.is_cuda:
        first_element = first_element.cuda()
    unique_bool = torch.cat((first_element, unique_bool),dim=0)
    return tensor[unique_bool.data]

# intersection of two 1-D tensors (each assumed to contain no duplicates)
def intersect1d(tensor1, tensor2):  
    assert len(tensor1.shape)==1 and len(tensor2.shape)==1 and len(tensor1)>1 and len(tensor2)>1,"inputs must be 1-D with length > 1"
    aux = torch.cat((tensor1, tensor2),dim=0)
    aux = aux.sort()[0]
    return aux[:-1][(aux[1:] == aux[:-1]).data]
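
# A minimal sketch of what the two helpers above return (illustrative values;
# intersect1d assumes neither input contains internal duplicates):
# unique1d(torch.LongTensor([3, 1, 3, 7]))                               # -> tensor([1, 3, 7])
# intersect1d(torch.LongTensor([1, 3, 7]), torch.LongTensor([7, 2, 1]))  # -> tensor([1, 7])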

def log2(x):
    """Implementatin of Log2. Pytorch doesn't have a native implemenation."""
    ln2 = Variable(torch.log(torch.FloatTensor([2.0])), requires_grad=False)
    if x.is_cuda:
        ln2 = ln2.cuda()
    return torch.log(x) / ln2

class SamePad2d(nn.Module):
    """Mimics tensorflow's 'SAME' padding.
    """

    def __init__(self, kernel_size, stride):
        super(SamePad2d, self).__init__()
        self.kernel_size = torch.nn.modules.utils._pair(kernel_size)  # _pair() turns a scalar into a (h, w) tuple
        self.stride = torch.nn.modules.utils._pair(stride) 

    def forward(self, input):
        #input (batch,c,h,w)
        in_width = input.size()[3]
        in_height = input.size()[2]
        out_width = math.ceil(float(in_width) / float(self.stride[0]))  # ceil: round up
        out_height = math.ceil(float(in_height) / float(self.stride[1]))
        pad_along_width = ((out_width - 1) * self.stride[0] +
                           self.kernel_size[0] - in_width)
        pad_along_height = ((out_height - 1) * self.stride[1] +
                            self.kernel_size[1] - in_height)       # derived from out = (in + pad - k + s) / s, solved for pad
        pad_left = math.floor(pad_along_width / 2)  # floor: round down
        pad_top = math.floor(pad_along_height / 2)
        pad_right = pad_along_width - pad_left
        pad_bottom = pad_along_height - pad_top
        return F.pad(input, (pad_left, pad_right, pad_top, pad_bottom), 'constant', 0)

    def __repr__(self):
        return self.__class__.__name__
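
# A worked example of the 'SAME' padding arithmetic above (illustrative numbers):
# for a 7x7 input with kernel_size=3, stride=2: out = ceil(7/2) = 4,
# pad_along = (4-1)*2 + 3 - 7 = 2, split as pad_left=1, pad_right=1 (same vertically),
# so the padded 9x9 input convolved with a 3x3 / stride-2 kernel gives exactly a 4x4 output.
# pad = SamePad2d(kernel_size=3, stride=2)
# pad(torch.randn(1, 3, 7, 7)).shape   # -> (1, 3, 9, 9)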


############################################################
#  FPN Graph
############################################################


class TopDownLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TopDownLayer, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)  # 1x1 conv to reduce the channel count
        self.padding2 = SamePad2d(kernel_size=3, stride=1)   # zero-pad the feature map so the following conv keeps its size
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,stride=1)

    def forward(self, x, y):
        y = F.upsample(y, scale_factor=2) # upsample y to twice its height and width
        x = self.conv1(x)                 # adjust the channel count of x
        return self.conv2(self.padding2(x+y))  # fuse x and y, zero-pad, then 3x3 conv so the spatial size is unchanged
          
class FPN(nn.Module):
    def __init__(self, C1, C2, C3, C4, C5, out_channels):
        super(FPN, self).__init__()
        self.out_channels = out_channels  # every pyramid level outputs the same number of channels
        self.C1 = C1    # backbone stages (convolutional blocks)
        self.C2 = C2
        self.C3 = C3
        self.C4 = C4
        self.C5 = C5
        self.P6 = nn.MaxPool2d(kernel_size=1, stride=2)
        
        self.P5_conv1 = nn.Conv2d(2048, self.out_channels, kernel_size=1, stride=1)
        self.P5_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1), # pad first so the following 3x3 conv keeps the spatial size
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1), # 3x3 conv to reduce upsampling aliasing
        )
        self.P4_conv1 =  nn.Conv2d(1024, self.out_channels, kernel_size=1, stride=1)
        self.P4_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P3_conv1 = nn.Conv2d(512, self.out_channels, kernel_size=1, stride=1)
        self.P3_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P2_conv1 = nn.Conv2d(256, self.out_channels, kernel_size=1, stride=1)
        self.P2_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        c2_out = x
        x = self.C3(x)
        c3_out = x
        x = self.C4(x)
        c4_out = x
        x = self.C5(x)
        p5_out = self.P5_conv1(x)
        p4_out = self.P4_conv1(c4_out) + F.upsample(p5_out, scale_factor=2)
        p3_out = self.P3_conv1(c3_out) + F.upsample(p4_out, scale_factor=2)
        p2_out = self.P2_conv1(c2_out) + F.upsample(p3_out, scale_factor=2)

        p5_out = self.P5_conv2(p5_out)
        p4_out = self.P4_conv2(p4_out)
        p3_out = self.P3_conv2(p3_out)
        p2_out = self.P2_conv2(p2_out)

        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6_out = self.P6(p5_out)

        return [p2_out, p3_out, p4_out, p5_out, p6_out]
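
# A sketch of the shapes FPN.forward produces (assuming a 1024x1024 input and out_channels=256,
# the default configuration):
# p2_out: [batch, 256, 256, 256]   stride 4
# p3_out: [batch, 256, 128, 128]   stride 8
# p4_out: [batch, 256,  64,  64]   stride 16
# p5_out: [batch, 256,  32,  32]   stride 32
# p6_out: [batch, 256,  16,  16]   stride 64 (P5 subsampled; used only by the RPN)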


############################################################
#  Resnet Graph
############################################################
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride)  
        self.bn1 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01) 
        self.padding2 = SamePad2d(kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1)   
        self.bn3 = nn.BatchNorm2d(planes * 4, eps=0.001, momentum=0.01) 
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        # assume x has shape (n, inplanes, h, w)
        out = self.conv1(x)  # 1x1 conv to compress channels  (n, planes, h, w)
        out = self.bn1(out)  # batch norm                     (n, planes, h, w)
        out = self.relu(out) # non-linearity                  (n, planes, h, w)

        out = self.padding2(out)  # add SAME padding          (n, planes, h+pad, w+pad)
        out = self.conv2(out)     # 3x3 conv, size preserved  (n, planes, h, w)
        out = self.bn2(out)       # (n, planes, h, w)
        out = self.relu(out)      # (n, planes, h, w)

        out = self.conv3(out)     # (n, 4*planes, h, w)
        out = self.bn3(out)       # (n, 4*planes, h, w)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
        
class ResNet(nn.Module):

    def __init__(self, architecture, stage5=False):
        super(ResNet, self).__init__()
        assert architecture in ["resnet50", "resnet101"]
        self.inplanes = 64
        self.layers = [3, 4, {"resnet50": 6, "resnet101": 23}[architecture], 3]
        self.block = Bottleneck
        self.stage5 = stage5

        self.C1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64, eps=0.001, momentum=0.01),
            nn.ReLU(inplace=True),
            SamePad2d(kernel_size=3, stride=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.C2 = self.make_layer(self.block, 64, self.layers[0])
        self.C3 = self.make_layer(self.block, 128, self.layers[1], stride=2)
        self.C4 = self.make_layer(self.block, 256, self.layers[2], stride=2)
        if self.stage5:
            self.C5 = self.make_layer(self.block, 512, self.layers[3], stride=2)
        else:
            self.C5 = None

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        x = self.C3(x)
        x = self.C4(x)
        x = self.C5(x)
        return x


    def stages(self):
        return [self.C1, self.C2, self.C3, self.C4, self.C5]

    def make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes * block.expansion, eps=0.001, momentum=0.01),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
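
# A minimal sketch of how the backbone and FPN above fit together (this mirrors how the full
# MaskRCNN model, not shown in this excerpt, wires them up):
# resnet = ResNet("resnet101", stage5=True)
# C1, C2, C3, C4, C5 = resnet.stages()
# fpn = FPN(C1, C2, C3, C4, C5, out_channels=256)
# p2, p3, p4, p5, p6 = fpn(torch.randn(1, 3, 1024, 1024))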


############################################################
#  Proposal Layer
############################################################

def apply_box_deltas(boxes, deltas):
    """Applies the given deltas to the given boxes.
    boxes: [N, 4] where each row is y1, x1, y2, x2
    deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
    """
    # Convert to y, x, h, w
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= torch.exp(deltas[:, 2])
    width *= torch.exp(deltas[:, 3])
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = torch.stack([y1, x1, y2, x2], dim=1)
    return result
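
# A worked example of apply_box_deltas (illustrative numbers): a 100x100 box at the origin with
# deltas [0.1, 0.1, log(2), log(2)] has its center shifted by (10, 10) and its size doubled:
# boxes  = torch.FloatTensor([[0., 0., 100., 100.]])
# deltas = torch.FloatTensor([[0.1, 0.1, math.log(2), math.log(2)]])
# apply_box_deltas(boxes, deltas)   # -> tensor([[-40., -40., 160., 160.]])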

def clip_boxes(boxes, window):
    """
    boxes: [N, 4] each col is y1, x1, y2, x2
    window: [4] in the form y1, x1, y2, x2
    """
    boxes = torch.stack( \
        [boxes[:, 0].clamp(float(window[0]), float(window[2])),
         boxes[:, 1].clamp(float(window[1]), float(window[3])),
         boxes[:, 2].clamp(float(window[0]), float(window[2])),
         boxes[:, 3].clamp(float(window[1]), float(window[3]))], 1)
    return boxes

def proposal_layer(inputs, proposal_count, nms_threshold, anchors, config=None):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.
    anchors: (anchors_num, 4), given in input-image pixel coordinates
    Inputs:
        rpn_probs: [batch, anchors_num, (bg prob, fg prob)]
        rpn_bbox: [batch, anchors_num, (dy, dx, log(dh), log(dw))]
    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """

    # Currently only supports batchsize 1
    # inputs is a list: [rpn_probs, rpn_bbox]
    # inputs[0] is rpn_probs (batch, anchors_num, 2)
    # inputs[1] is rpn_bbox  (batch, anchors_num, 4)

    inputs[0] = inputs[0].squeeze(0)  # drop the batch dimension; only one image per batch is supported
    inputs[1] = inputs[1].squeeze(0)

    # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
    scores = inputs[0][:, 1]

    # Box deltas [batch, num_rois, 4]
    deltas = inputs[1] # the per-anchor box deltas predicted by the RPN for this image
    #RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:   # GPU_COUNT=1 means run on the GPU, 0 means run on the CPU
        std_dev = std_dev.cuda()
    deltas = deltas * std_dev

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = min(6000, anchors.size()[0])   # how many anchors to keep before NMS
    scores, order = scores.sort(descending=True)   # sort the scores in descending order; order holds their original indices
    order = order[:pre_nms_limit]                  # indices of the top pre_nms_limit scores
    scores = scores[:pre_nms_limit]                # (pre_nms_limit,)
    deltas = deltas[order.data, :] # TODO: Support batch size > 1 ff.  # deltas of the top pre_nms_limit anchors
    anchors = anchors[order.data, :]               # likewise for the anchors

    # Apply deltas to anchors to get refined anchors.
    # [batch, N, (y1, x1, y2, x2)]
    boxes = apply_box_deltas(anchors, deltas)      # refine the anchors with the predicted deltas,
                                                   # turning them into corner coordinates
                                                   # (pre_nms_limit, (y1, x1, y2, x2))

    # Clip to image boundaries. [batch, N, (y1, x1, y2, x2)]
    height, width = config.IMAGE_SHAPE[:2]         # the image boundaries
    window = np.array([0, 0, height, width]).astype(np.float32)  
    boxes = clip_boxes(boxes, window)       # clamp the boxes to lie inside the image

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    # torch.cat((boxes, scores.unsqueeze(1)), 1) expands scores to (pre_nms_limit, 1) and concatenates it
    # with boxes (pre_nms_limit, 4) along dim=1, giving a (pre_nms_limit, 5) tensor
    keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold) # keep is a list of box indices that survive NMS
    if len(keep)>proposal_count:
        keep = keep[:proposal_count]
    boxes = boxes[keep, :]

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        norm = norm.cuda()
    normalized_boxes = boxes / norm  # normalize the box coordinates to [0, 1]

    # Add back batch dimension
    normalized_boxes = normalized_boxes.unsqueeze(0)  # (proposal_count, 4) -> (1, proposal_count, 4)

    return normalized_boxes


############################################################
#  ROIAlign Layer
############################################################

def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.
    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [channels,height, width]. Shape of input image in pixels  
    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
             coordinates.
    - Feature maps: List of feature maps from different levels of the pyramid.
                    Each is [batch, channels, height, width]
    Output:
    Pooled regions in the shape: [num_boxes, channels, height, width].
    The width and height are those specified in pool_size.
    """
    """

    # Currently only supports batchsize 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)  # drop the batch dimension

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]  # (num_boxes, 4)

    # Feature Maps. List of feature maps from different level of the
    # feature pyramid. Each is [batch,channels, height, width]
    feature_maps = inputs[1:]  # [p2, p3, p4, p5, p6]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)  # split the boxes into four column chunks
    h = y2 - y1        
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # a 224x224 ROI (in pixels) maps to P4
    image_area = Variable(torch.FloatTensor([float(image_shape[1]*image_shape[2])]), requires_grad=False)
    if boxes.is_cuda:  
        image_area = image_area.cuda()
    roi_level = 4 + log2(torch.sqrt(h*w)/(224.0/torch.sqrt(image_area)))  # choose which pyramid level each ROI is pooled from
    roi_level = roi_level.round().int()   # round() rounds to the nearest integer
    roi_level = roi_level.clamp(2,5)   # the FPN produces [P2,P3,P4,P5,P6], but only [P2,P3,P4,P5] are used for ROI pooling
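
    # A worked example of the level-assignment rule above (Equation 1 of the FPN paper with k0 = 4;
    # illustrative numbers): with a 1024x1024 image, a ROI covering 224x224 pixels has
    # sqrt(h*w) = 224/1024 in normalized coordinates, so
    # roi_level = 4 + log2((224/1024) / (224/1024)) = 4  -> pooled from P4;
    # a 112x112 ROI gives 4 + log2(1/2) = 3 -> P3, and a 448x448 ROI gives 5 -> P5.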


    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):  # i in [0,1,2,3], level in [2,3,4,5]
        ix  = roi_level==level   # bool mask
        if not ix.any(): # any() returns True if at least one element is True, otherwise False
            continue
        assert len(ix.shape)>1,"ix has ndim <= 1"
        ix = torch.nonzero(ix)[:,0] # torch.nonzero() returns the indices of the non-zero elements, shape (m, n1, n2, ...):
                                    # m is the number of non-zero elements; the remaining dims index into the input
                                    # e.g. ix=tensor([0,1,0,1])             -> tensor([[1],[3]])                  shape (2,1)
                                    #      ix=tensor([[0,1,0,1],[0,1,0,1]]) -> tensor([[0,1],[0,3],[1,0],[1,3]])  shape (4,2)
        # since ix has shape [n,1], torch.nonzero(ix) has shape (m,2); taking [:,0] gives the indices of the non-zero rows
        level_boxes = boxes[ix.data, :]   # the ROIs assigned to this level (2, 3, 4 or 5)
        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data) # record the original indices of the ROIs pooled at this level

        # Stop gradient propogation to ROI proposals
        level_boxes = level_boxes.detach()  # detach from the graph so no gradient flows back into the proposals

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        per_level_boxes_num=level_boxes.size()[0]  # how many boxes fall on this pyramid level
        ind = Variable(torch.zeros(per_level_boxes_num),requires_grad=False).int() # image index per box (all zeros: batch size 1)
        if level_boxes.is_cuda:
            ind = ind.cuda()
        feature_maps[i] = feature_maps[i].unsqueeze(0) # take this level's feature map and add a batch dimension [1,c,h,w]
                                                       # CropAndResizeFunction needs batch dimension

        pooled_features = CropAndResizeFunction(pool_size, pool_size, 0)(feature_maps[i], level_boxes, ind) 
                                                       # output shape (per_level_boxes_num, c, pool_size, pool_size)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)   # concatenate along dim 0 -> (num_rois, c, pool_size, pool_size)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)    

    # Rearrange pooled features to match the order of the original boxes
    _, box_to_level = torch.sort(box_to_level)  # pooling was done level by level, which scrambled the original
                                                # (score-sorted) order of the ROIs; sorting the mapping lets us
                                                # restore that original order

    pooled = pooled[box_to_level, :, :, :]  # first dim is the number of boxes: (boxes_num, c, 7, 7)

    return pooled


############################################################
#  Detection Target Layer
############################################################
def bbox_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    """
    # 1. Tile boxes2 and repeate boxes1. This allows us to compare
    # every boxes1 against every boxes2 without loops.
    # TF doesn't have an equivalent to np.repeate() so simulate it
    # using tf.tile() and tf.reshape.
    boxes1_repeat = boxes2.size()[0]   # each box in boxes1 will be repeated once per box in boxes2
    boxes2_repeat = boxes1.size()[0]   # boxes2 will be tiled once per box in boxes1
    boxes1 = boxes1.repeat(1,boxes1_repeat).view(-1,4)  # repeat row-wise: row 0 is repeated boxes1_repeat times,
                                                        # then row 1 boxes1_repeat times, and so on

    boxes2 = boxes2.repeat(boxes2_repeat,1)  # tile the whole of boxes2 boxes2_repeat times

    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1.chunk(4, dim=1) # chunk keeps the number of dimensions of the input
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2.chunk(4, dim=1)  
    y1 = torch.max(b1_y1, b2_y1)[:, 0]  # element-wise max; [:, 0] turns the (N,1) column back into a 1-D vector
    x1 = torch.max(b1_x1, b2_x1)[:, 0]
    y2 = torch.min(b1_y2, b2_y2)[:, 0]
    x2 = torch.min(b1_x2, b2_x2)[:, 0]
    zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
    if y1.is_cuda:
        zeros = zeros.cuda()
    intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros)

    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area[:,0] + b2_area[:,0] - intersection

    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = iou.view(boxes2_repeat, boxes1_repeat)

    return overlaps
    
  
def ious(boxes1,boxes2):
    '''
    inputs:
        boxes1: [n,4]
        boxes2: [m,4]
    outputs:
        [n,m]
    '''
    
    b1_y1,b1_x1,b1_y2,b1_x2=boxes1[:,0],boxes1[:,1],boxes1[:,2],boxes1[:,3]
    b2_y1,b2_x1,b2_y2,b2_x2=boxes2[:,0],boxes2[:,1],boxes2[:,2],boxes2[:,3]
   
    y1=torch.max(b1_y1.unsqueeze(1),b2_y1)
    x1=torch.max(b1_x1.unsqueeze(1),b2_x1)
    y2=torch.min(b1_y2.unsqueeze(1),b2_y2)
    x2=torch.min(b1_x2.unsqueeze(1),b2_x2)
   
    intersection = torch.max(x2 - x1, torch.zeros(x1.size()))* torch.max(y2 - y1, torch.zeros(y1.size()))
   
    b1_area = ((b1_y2 - b1_y1) * (b1_x2 - b1_x1))
    b2_area = ((b2_y2 - b2_y1) * (b2_x2 - b2_x1))
    areas=b1_area.unsqueeze(1)+b2_area
    
    ious=intersection/(areas-intersection)
    return ious
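
# A worked example of ious() above (illustrative numbers): two 10x10 boxes that share half
# their area have IoU = 50 / (100 + 100 - 50) = 1/3:
# b1 = torch.FloatTensor([[0., 0., 10., 10.]])
# b2 = torch.FloatTensor([[0., 5., 10., 15.]])
# ious(b1, b2)   # -> tensor([[0.3333]])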
                         
    
    
    

def detection_target_layer(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Subsamples proposals and generates target box refinment, class_ids,
    and masks for each.
    Inputs:
    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.   rpn_rois
    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
              coordinates.
    gt_masks: [batch,MAX_GT_INSTANCES, height, width,] of boolean type
    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
    rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
          coordinates
    target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, NUM_CLASSES,
                    (dy, dx, log(dh), log(dw), class_id)]
                   Class-specific bbox refinements.
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width)
                 Masks cropped to bbox boundaries and resized to neural
                 network output size.
    """

    # Currently only supports batchsize 1
    proposals = proposals.squeeze(0) # drop the batch dimension
    gt_class_ids = gt_class_ids.squeeze(0)
    gt_boxes = gt_boxes.squeeze(0)
    gt_masks = gt_masks.squeeze(0)

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    # gt_class_ids < 0 marks crowd boxes (to be excluded from training),
    # gt_class_ids > 0 marks ordinary instances.
    if torch.nonzero(gt_class_ids < 0).size():   # there are crowd boxes (negative class IDs)
        crowd_ix = torch.nonzero(gt_class_ids < 0)[:, 0]  # indices where gt_class_ids < 0
        non_crowd_ix = torch.nonzero(gt_class_ids > 0)[:, 0] # indices where gt_class_ids > 0

        crowd_boxes = gt_boxes[crowd_ix.data, :]             # boxes of the crowd regions (gt_class_ids < 0)
        crowd_masks = gt_masks[:,:,crowd_ix.data]            # masks of the crowd regions
        gt_class_ids = gt_class_ids[non_crowd_ix.data]
        gt_boxes = gt_boxes[non_crowd_ix.data, :]            # keep only the ordinary (non-crowd) GT boxes
        gt_masks = gt_masks[:,:,non_crowd_ix.data]

        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = bbox_overlaps(proposals, crowd_boxes) # IoU of every proposal with every crowd box
        crowd_iou_max = torch.max(crowd_overlaps, dim=1)[0]    # per-proposal maximum IoU with any crowd box
        no_crowd_bool = crowd_iou_max < 0.001                  # True for proposals that barely touch any crowd box
    else:
        # no crowd boxes: every proposal counts as non-crowd
        no_crowd_bool =  Variable(torch.ByteTensor(proposals.size()[0]*[True]), requires_grad=False)
        if config.GPU_COUNT:
            no_crowd_bool = no_crowd_bool.cuda()

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = bbox_overlaps(proposals, gt_boxes)   # IoU of every proposal with every GT box

    # Determine positive and negative ROIs
    roi_iou_max = torch.max(overlaps, dim=1)[0]

    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = roi_iou_max >= 0.5        # proposals whose best IoU with any GT box is >= 0.5 are positives

    # Subsample ROIs. Aim for 33% positive
    # Positive ROIs
    if torch.nonzero(positive_roi_bool).size():
        positive_indices = torch.nonzero(positive_roi_bool)[:, 0]  # indices of the positive proposals

        positive_count = int(config.TRAIN_ROIS_PER_IMAGE *   
                             config.ROI_POSITIVE_RATIO)         # desired number of positives, e.g. int(200 * 0.33)
        rand_idx = torch.randperm(positive_indices.size()[0])   # a random permutation of 0 .. n-1,
                                                                # i.e. shuffled positions into positive_indices
        rand_idx = rand_idx[:positive_count]                    # take the first positive_count shuffled positions
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        positive_indices = positive_indices[rand_idx] # the randomly chosen subset; these values index into proposals
        positive_count = positive_indices.size()[0]   # the number of positives actually selected
        positive_rois = proposals[positive_indices.data,:]  # the selected positive proposals

        # Assign positive ROIs to GT boxes.
        positive_overlaps = overlaps[positive_indices.data,:]  # IoU of the selected positives with every GT box
        roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1] # column index of each row's maximum,
                                                                       # i.e. which GT box best matches each positive
        roi_gt_boxes = gt_boxes[roi_gt_box_assignment.data,:] # the best-matching GT box is assigned to each positive,
                                                              # and its label (box coordinates and class)
                                                              # becomes that positive's training target
        roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment.data]

        # Compute bbox refinement for positive ROIs
        # targets for refining the positive proposals: the offsets between each positive and its assigned GT box
        deltas = Variable(utils.box_refinement(positive_rois.data, roi_gt_boxes.data), requires_grad=False)
        # BBOX_STD_DEV=np.array([0.1, 0.1, 0.2, 0.2])
        std_dev = Variable(torch.from_numpy(config.BBOX_STD_DEV).float(), requires_grad=False)
        if config.GPU_COUNT:
            std_dev = std_dev.cuda()
        deltas /= std_dev         # the targets are divided by tensor([0.1, 0.1, 0.2, 0.2]) during training, which is
                                  # why proposal_layer multiplies the network's predicted deltas by the same values

        # Assign positive ROIs to GT masks
        roi_masks = gt_masks[roi_gt_box_assignment.data,:,:]  # the GT mask assigned to each positive

        # Compute mask targets
        boxes = positive_rois      
        if config.USE_MINI_MASK:
            # Transform ROI corrdinates from normalized image space
            # to normalized mini-mask space.
            y1, x1, y2, x2 = positive_rois.chunk(4, dim=1)   # corners of the positive proposals
            gt_y1, gt_x1, gt_y2, gt_x2 = roi_gt_boxes.chunk(4, dim=1) # corners of the GT boxes assigned to them
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            dy1 = (y1 - gt_y1) / gt_h
            dx1 = (x1 - gt_x1) / gt_w
            dy2 = (y2 - gt_y1) / gt_h
            dx2 = (x2 - gt_x1) / gt_w
            boxes = torch.cat([dy1, dx1, dy2, dx2], dim=1)  # the proposal corners expressed relative to the assigned GT box
        box_ids = Variable(torch.arange(roi_masks.size()[0]), requires_grad=False).int()
        if config.GPU_COUNT:
            box_ids = box_ids.cuda()
        masks = Variable(CropAndResizeFunction(config.MASK_SHAPE[0], config.MASK_SHAPE[1], 0)(roi_masks.unsqueeze(1), boxes, box_ids).data, requires_grad=False)
                # align each mask to its box, i.e. crop and resize it from (56,56) down to MASK_SHAPE (28,28)
        masks = masks.squeeze(1)  

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = torch.round(masks)
    else:
        positive_count = 0

    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_roi_bool = roi_iou_max < 0.5     # proposals whose best IoU with any GT box is < 0.5 are negative candidates
    negative_roi_bool = negative_roi_bool & no_crowd_bool    # a negative must satisfy both conditions: IoU < 0.5 with
                                                             # every GT box and IoU < 0.001 with every crowd box
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    if torch.nonzero(negative_roi_bool).size() and positive_count>0:
        negative_indices = torch.nonzero(negative_roi_bool)[:, 0]    # indices of the negative candidates
        r = 1.0 / config.ROI_POSITIVE_RATIO
        negative_count = int(r * positive_count - positive_count)
        rand_idx = torch.randperm(negative_indices.size()[0])
        rand_idx = rand_idx[:negative_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        negative_indices = negative_indices[rand_idx]
        negative_count = negative_indices.size()[0]
        negative_rois = proposals[negative_indices.data, :]
    else:
        negative_count = 0

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    if positive_count > 0 and negative_count > 0:
        rois = torch.cat((positive_rois, negative_rois), dim=0)  # stack positives and negatives
        zeros = Variable(torch.zeros(negative_count), requires_grad=False).int()   # class label 0 (background) for the negatives
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = torch.cat([roi_gt_class_ids, zeros], dim=0)        # stack the class labels
        zeros = Variable(torch.zeros(negative_count,4), requires_grad=False)  # (dy1,dx1,dy2,dx2) targets of negatives are zero
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = torch.cat([deltas, zeros], dim=0)     # stack the box-refinement targets
        zeros = Variable(torch.zeros(negative_count,config.MASK_SHAPE[0],config.MASK_SHAPE[1]), requires_grad=False)
                                                       # mask targets of negatives are all zero
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = torch.cat([masks, zeros], dim=0)  # stack the mask targets
    elif positive_count > 0:  # only positives, no negatives
        rois = positive_rois
    elif negative_count > 0:  # only negatives, no positives
        rois = negative_rois
        zeros = Variable(torch.zeros(negative_count), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = zeros
        zeros = Variable(torch.zeros(negative_count,4), requires_grad=False).int()
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = zeros
        zeros = Variable(torch.zeros(negative_count,config.MASK_SHAPE[0],config.MASK_SHAPE[1]), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = zeros
    else:   # neither positives nor negatives
        rois = Variable(torch.FloatTensor(), requires_grad=False)
        roi_gt_class_ids = Variable(torch.IntTensor(), requires_grad=False)
        deltas = Variable(torch.FloatTensor(), requires_grad=False)
        masks = Variable(torch.FloatTensor(), requires_grad=False)
        if config.GPU_COUNT:
            rois = rois.cuda()
            roi_gt_class_ids = roi_gt_class_ids.cuda()
            deltas = deltas.cuda()
            masks = masks.cuda()

    return rois, roi_gt_class_ids, deltas, masks

############################################################
#  Detection Layer
############################################################

def clip_to_window(window, boxes):
    """
        window: (y1, x1, y2, x2). The window in the image we want to clip to.
        boxes: [N, (y1, x1, y2, x2)]
    """
    boxes[:, 0] = boxes[:, 0].clamp(float(window[0]), float(window[2]))
    boxes[:, 1] = boxes[:, 1].clamp(float(window[1]), float(window[3]))
    boxes[:, 2] = boxes[:, 2].clamp(float(window[0]), float(window[2]))
    boxes[:, 3] = boxes[:, 3].clamp(float(window[1]), float(window[3]))

    return boxes

def refine_detections(rois, probs, deltas, window, config):
    """Refine classified proposals and filter overlaps and return final
    detections.
    Inputs:
        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
        probs: [N, num_classes]. Class probabilities.
        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
                bounding box deltas.
        window: (y1, x1, y2, x2) in image coordinates. The part of the image
            that contains the image excluding the padding.
    Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)]
    """

    # Class IDs per ROI
    class_scores, class_ids = torch.max(probs, dim=1) # per-row maximum value and its column index;
                                                      # each row holds one ROI's scores over all classes, so the argmax
                                                      # is that ROI's predicted class and the max is its confidence

    # Class probability of the top class of each ROI
    # Class-specific bounding box deltas
    idx=torch.arange(class_ids.size()[0]).long()
    if config.GPU_COUNT:
        idx = idx.cuda()
    #class_scores = probs[idx, class_ids.data]
    deltas_specific = deltas[idx, class_ids.data,:]  # idx picks exactly one delta per ROI:
                                                     # the delta belonging to that ROI's highest-scoring class

    # Apply bounding box deltas
    # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:
        std_dev = std_dev.cuda()
    refined_rois = apply_box_deltas(rois, deltas_specific * std_dev) # refine each ROI with the delta of its predicted class

    # Convert coordinates to image domain
    height, width = config.IMAGE_SHAPE[:2] # image shape
    scale = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        scale = scale.cuda()
    refined_rois *= scale     # scale the refined ROIs from normalized coordinates back to image pixels

    # Clip boxes to image window
    refined_rois = clip_to_window(window, refined_rois)  # clamp the scaled ROIs to the image window

    # Round and cast to int since we're dealing with pixels now
    refined_rois = torch.round(refined_rois) # round the coordinates to whole pixels

    # TODO: Filter out boxes with zero area

    # Filter out background boxes
    # class_id 0 is the background class
    keep_bool = class_ids>0   

    # Filter out low confidence boxes
    if config.DETECTION_MIN_CONFIDENCE:
        keep_bool = keep_bool & (class_scores >= config.DETECTION_MIN_CONFIDENCE) 
    keep = torch.nonzero(keep_bool)[:,0]  # indices of foreground ROIs whose class score passes the confidence threshold

    # Apply per-class NMS
    pre_nms_class_ids = class_ids[keep.data]  # classes of the kept ROIs  (m,)
    pre_nms_scores = class_scores[keep.data]  # (m,)
    pre_nms_rois = refined_rois[keep.data]   # (m,4)

    # run NMS separately for each class present in the image;
    # pre_nms_class_ids has one entry per ROI, so class IDs repeat, and
    # unique1d (much like set() on a list) reduces them to one entry per class
    
    for i, class_id in enumerate(unique1d(pre_nms_class_ids)):
        # Pick detections of this class
        ixs = torch.nonzero(pre_nms_class_ids == class_id)[:,0]

        # Sort
        ix_rois = pre_nms_rois[ixs.data]
        ix_scores = pre_nms_scores[ixs]
        ix_scores, order = ix_scores.sort(descending=True)
        ix_rois = ix_rois[order.data,:]  # sort this class's ROIs by their class score

        class_keep = nms(torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1).data, config.DETECTION_NMS_THRESHOLD) # NMS
                                         # class_keep holds indices into ix_rois

        # Map indices
        # keep:       indices of foreground ROIs above the confidence threshold
        # class_keep: indices (within this class, after sorting) that survive NMS
        # order:      this class's indices sorted by score, descending
        # ixs:        this class's indices in their original (unsorted) order
        class_keep = keep[ixs[order[class_keep].data].data]  # order[class_keep] maps the NMS survivors back to positions in ixs

        if i==0:
            nms_keep = class_keep
        else:
            nms_keep = unique1d(torch.cat((nms_keep, class_keep)))# unique1d is largely redundant here: each class_keep holds
                                                                  # the post-NMS indices of a single class, and indices of
                                                                  # different classes never overlap
    
    keep = intersect1d(keep, nms_keep)  # intersection of the two index sets (neither contains duplicates); also mostly
                                        # redundant, since every class_keep was taken from keep; note that if the for
                                        # loop body never runs, nms_keep is undefined and this line raises an error
                                 

    # Keep top detections
    roi_count = config.DETECTION_MAX_INSTANCES  # keep at most DETECTION_MAX_INSTANCES (default 100) final detections
    rest_sample=len(class_scores[keep.data])
    if roi_count < rest_sample:
        # keep only the roi_count highest-scoring detections
        keep = keep[class_scores[keep.data].sort(descending=True)[1][:roi_count].data]
    # ... (the remainder of refine_detections, the Detection Layer, the RPN / classifier / mask
    #      head modules and the RPN and classifier loss functions are omitted in this excerpt) ...


def compute_mrcnn_bbox_loss(target_bbox, target_class_ids, pred_bbox):
    """Loss for Mask R-CNN bounding box refinement.
    target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
    target_class_ids: [batch, num_rois]. Integer class IDs.
    pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
    """
    if target_class_ids.size():
        # Only positive ROIs contribute to the loss.
        positive_roi_ix = torch.nonzero(target_class_ids > 0)[:,0]
        # Gather the deltas (predicted and true) that contribute to loss
        target_bbox = target_bbox[positive_roi_ix,:]
        pred_bbox = pred_bbox[positive_roi_ix,:,:]

        # Smooth L1 loss
        loss = F.smooth_l1_loss(pred_bbox, target_bbox)
    else:
        loss = Variable(torch.FloatTensor([0]), requires_grad=False)
        if target_class_ids.is_cuda:
            loss = loss.cuda()
    return loss
    
def compute_mrcnn_mask_loss(target_masks, target_class_ids, pred_masks):
    """Mask binary cross-entropy loss for the masks head.

    target_masks: [batch, num_rois, height, width].
        A float32 tensor of values 0 or 1. Uses zero padding to fill array.
    target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
    pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
                with values from 0 to 1.
    """
    if target_class_ids.size():
        # Only positive ROIs contribute to the loss. And only
        # the class specific mask of each ROI.
        positive_ix = torch.nonzero(target_class_ids > 0)[:, 0]
        positive_class_ids = target_class_ids[positive_ix.data].long()
        indices = torch.stack((positive_ix, positive_class_ids), dim=1)

        # Gather the masks (predicted and true) that contribute to loss
        y_true = target_masks[indices[:,0].data,:,:]
        y_pred = pred_masks[indices[:,0].data,indices[:,1].data,:,:]

        # Binary cross entropy
        loss = F.binary_cross_entropy(y_pred, y_true)
    else:
        loss = Variable(torch.FloatTensor([0]), requires_grad=False)
        if target_class_ids.is_cuda:
            loss = loss.cuda()

    return loss
    
# An alternative, vectorized implementation of compute_mrcnn_mask_loss (disabled; kept for reference):
# def compute_mrcnn_mask_loss(target_masks, target_class_ids, pred_masks):
#     """Mask binary cross-entropy loss for the masks head.
#
#     target_masks: [batch, num_rois, height, width].
#         A float32 tensor of values 0 or 1. Uses zero padding to fill array.
#     target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
#     pred_masks: [batch, num_rois, height, width, num_classes] float32 tensor
#                 with values from 0 to 1.
#     """
#     if target_class_ids.size():
#         target_masks=target_masks.view(-1,target_masks.size(2),target_masks.size()[3]) #(m,height,width)
#         target_class_ids=target_class_ids.view(-1)  #(m,)
#         pred_masks=pred_masks.view(-1,pred_masks.size()[2],pred_masks.size()[3],pred_masks.size()[4]) #(N,height,width,num_classes)
#         #pred_masks=pred_masks.permute(0, 3, 1, 2) #(N,num_classes,height,width)
#
#         positive_ix=torch.nonzero(target_class_ids>0)[:,0]
#         y_true=target_masks[positive_ix,:,:].view(-1)
#         y_pred=pred_masks[positive_ix,:,:,:].view(-1,pred_masks.size()[-1])
#         print(y_true.shape)
#         print(y_pred.shape)
#         loss = F.binary_cross_entropy(y_pred, y_true)
#     else:
#         loss = Variable(torch.FloatTensor([0]), requires_grad=False)
#         if target_class_ids.is_cuda:
#             loss = loss.cuda()
#     return loss
    
    

def compute_losses(rpn_match,rpn_class_logits, rpn_target_bbox, rpn_pred_bbox,
                  target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask):
    # rpn_match: the label of every anchor (-1 negative, 0 neutral / ignored, 1 positive)
    # rpn_target_bbox: target deltas for the positive anchors
    # rpn_class_logits: scores predicted by the RPN (b, num_anchor, 2)
    # rpn_pred_bbox: boxes predicted by the RPN (b, num_anchor, 4)
    # target_class_ids: (batch, num_rois)
    # mrcnn_class_logits: (batch, num_rois, num_classes)
    # target_deltas: (batch, num_rois, 4)
    # mrcnn_bbox: (batch, num_rois, num_classes, 4)
    rpn_class_loss = compute_rpn_class_loss(rpn_match, rpn_class_logits)
    rpn_bbox_loss = compute_rpn_bbox_loss(rpn_target_bbox, rpn_match, rpn_pred_bbox)
    mrcnn_class_loss = compute_mrcnn_class_loss(target_class_ids, mrcnn_class_logits)
    mrcnn_bbox_loss = compute_mrcnn_bbox_loss(target_deltas, target_class_ids, mrcnn_bbox)
    mrcnn_mask_loss = compute_mrcnn_mask_loss(target_mask, target_class_ids, mrcnn_mask)
    return [rpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss]


############################################################
#  Data Generator
############################################################

def load_image_gt(dataset, config, image_id, augment=False,
                  use_mini_mask=False):
    """Load and return ground truth data for an image (image, mask, bounding boxes).
    augment: If true, apply random image augmentation. Currently, only
        horizontal flipping is offered.
    use_mini_mask: If False, returns full-size masks that are the same height
        and width as the original image. These can be big, for example
        1024x1024x100 (for 100 instances). Mini masks are smaller, typically,
        224x224 and are generated by extracting the bounding box of the
        object and resizing it to MINI_MASK_SHAPE.
    Returns:
    image: [3,height, width]
    shape: the original shape of the image before resizing and cropping.
    class_ids: [instance_count] Integer class IDs
    bbox: [instance_count, (y1, x1, y2, x2)]
    mask: [height, width, instance_count]. The height and width are those
        of the image unless use_mini_mask is True, in which case they are
        defined in MINI_MASK_SHAPE.
    """
    # Load image and mask
    image = dataset.load_image(image_id)  # image_id is an index; this returns one image as a numpy (BGR) array.
                                          # For training, every image is registered in the dataset object
                                          # (think of dataset as a container) and one image is fetched from it here.
    mask, class_ids = dataset.load_mask(image_id) # load_mask is meant to be overridden for your own data; it returns
                                                  # the instance masks of this image, shape (height, width, num_instance),
                                                  # where num_instance is the number of objects to detect in the image.
                                                  # For each instance, pixels belonging to the object are 1, the rest 0.
                                                  # How the masks are produced is dataset-specific; what matters here
                                                  # is what a mask represents.
    shape = image.shape   # the original image shape
    image, window, scale, padding = utils.resize_image(
        image,
        min_dim=config.IMAGE_MIN_DIM,  #800
        max_dim=config.IMAGE_MAX_DIM,  #1024
        padding=config.IMAGE_PADDING)  #True
    
    # image:   the resized (and padded) image
    # window:  (y1, x1, y2, x2) of the region inside the padded image that contains the original image
    # scale:   the resize factor applied to the image
    # padding: the padding added to each dimension
    
    mask = utils.resize_mask(mask, scale, padding) # apply the same resizing and padding to the masks
    # mask shape: (height, width, num_instance); the first two dims match the resized image

    # Random horizontal flips.
    if augment:
        if random.randint(0, 1):
            image = np.fliplr(image)
            mask = np.fliplr(mask)  # random horizontal flip

    # Bounding boxes. Note that some boxes might be all zeros
    # if the corresponding mask got cropped out.
    # bbox: [num_instances, (y1, x1, y2, x2)]
    bbox = utils.extract_bboxes(mask) 
    # To understand this, first be clear about what a mask is: it marks the image region occupied by one object.
    # extract_bboxes takes, for each instance, the tight bounding rectangle of its mask's 1-pixels;
    # inside that rectangle, pixels belonging to the object are 1 and the rest are 0.
    # Every instance has its own mask.

    # Active classes
    # Different datasets have different classes, so track the
    # classes supported in the dataset of this image.
    active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)  # num_classes counts the classes of every dataset used in training
    source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]] 
                       # source_class_ids groups class IDs by the dataset ("source") they come from,
                       # e.g. source_class_ids = {'coco': [0,1,3,6], 'dfa': [2,4,5]}
                       # dataset.image_info[image_id]["source"] is the source this image belongs to,
                       # so this line fetches the list of class IDs of that source
                      
    active_class_ids[source_class_ids] = 1 # mark the classes of this image's source as active
    # Intuition: training may mix several datasets whose class sets differ.
    # Suppose three datasets are used:
    #   set 1: person, car, house
    #   set 2: apple, banana, person, dog
    #   set 3: plane, phone, rice
    # Then dataset.num_classes = 3+4+3 = 10 and active_class_ids = np.array([0,0,0,0,0,0,0,0,0,0]).
    # When the current image comes from set 2, only its four classes (apple, banana, person, dog) are in use,
    # so positions 3,4,5,6 of active_class_ids are set to 1:
    # active_class_ids = np.array([0,0,0,1,1,1,1,0,0,0])

    # Resize masks to smaller size to reduce memory usage
    if use_mini_mask:
        mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE) # config.MINI_MASK_SHAPE is (56, 56)

    # Image meta data
    image_meta = compose_image_meta(image_id, shape, window, active_class_ids)
    # class_ids: [instance_count,] the class of every instance in this image
    # (active_class_ids == 1).sum() is the number of classes in the dataset this image comes from

    return image, image_meta, class_ids, bbox, mask 

def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
    """Given the anchors and GT boxes, compute overlaps and identify positive
    anchors and deltas to refine them to match their corresponding GT boxes.
    anchors: [num_anchors, (y1, x1, y2, x2)]
    gt_class_ids: [num_gt_boxes] Integer class IDs.
    gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
    Returns:
    rpn_match: [N] (int32) matches between anchors and GT boxes.
               1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    """
    # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
    rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
    # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
    rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    crowd_ix = np.where(gt_class_ids < 0)[0]   # indices of the crowd boxes (negative class IDs)
    if crowd_ix.shape[0] > 0: # there are crowd boxes
        # Filter out crowds from ground truth class IDs and boxes
        non_crowd_ix = np.where(gt_class_ids > 0)[0]  # indices of the ordinary (non-crowd) instances
        crowd_boxes = gt_boxes[crowd_ix]  # the crowd boxes
        gt_class_ids = gt_class_ids[non_crowd_ix] # keep only the non-crowd class IDs
        gt_boxes = gt_boxes[non_crowd_ix]  # keep only the non-crowd boxes
        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = utils.compute_overlaps(anchors, crowd_boxes) # IoU of every anchor with every crowd box
        crowd_iou_max = np.amax(crowd_overlaps, axis=1) 
        no_crowd_bool = (crowd_iou_max < 0.001) # anchors whose IoU with every crowd box is < 0.001 may become negatives
    else: # there are no crowd boxes
        # All anchors don't intersect a crowd
        no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)  # every anchor is a potential negative

    # Compute overlaps [num_anchors, num_gt_boxes]
    overlaps = utils.compute_overlaps(anchors, gt_boxes) # IoU of every anchor with every (non-crowd) GT box

    # Match anchors to GT Boxes
    # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
    # If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
    # Neutral anchors are those that don't match the conditions above,
    # and they don't influence the loss function.
    # However, don't keep any GT box unmatched (rare, but happens). Instead,
    # match it to the closest anchor (even if its max IoU is < 0.3).
    #
    # 1. Set negative anchors first. They get overwritten below if a GT box is
    # matched to them. Skip boxes in crowd areas.
    anchor_iou_argmax = np.argmax(overlaps, axis=1)  # for each anchor, the index of the GT box with the highest IoU
    anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
    rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1 # negatives: anchors whose best IoU is < 0.3
                                                             # and which do not overlap any crowd box
    # 2. Set an anchor for each GT box (regardless of IoU value).
    # TODO: If multiple anchors have the same IoU match all of them
    gt_iou_argmax = np.argmax(overlaps, axis=0)  # for each GT box (column), the anchor with the highest IoU
    rpn_match[gt_iou_argmax] = 1      # that anchor becomes a positive
    # 3. Set anchors with high overlap as positive.
    rpn_match[anchor_iou_max >= 0.7] = 1  # anchors whose best IoU with any GT box is >= 0.7 are positives

    # Subsample to balance positive and negative anchors
    # Don't let positives be more than half the anchors
    ids = np.where(rpn_match == 1)[0] # indices of the positive anchors
    #RPN_TRAIN_ANCHORS_PER_IMAGE=256
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2) # how many positives exceed the quota (half of 256)
    if extra > 0: # more positives than needed
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False) # randomly demote the surplus positives to neutral
        rpn_match[ids] = 0
    # Same for negative proposals
    ids = np.where(rpn_match == -1)[0]
    extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
                        np.sum(rpn_match == 1))
    if extra > 0:
        # Reset the extra ones to neutral
        ids = np.random.choice(ids, extra, replace=False) # likewise for the surplus negatives
        rpn_match[ids] = 0

    # For positive anchors, compute shift and scale needed to transform them
    # to match the corresponding GT boxes.
    ids = np.where(rpn_match == 1)[0]
    ix = 0  # index into rpn_bbox
    # TODO: use box_refinment() rather than duplicating the code here
    gt=gt_boxes[anchor_iou_argmax[ids]]   # the GT box best matching each positive anchor (cf. the commented-out loop below)
    a=anchors[ids]
    gt_h=gt[:,2]-gt[:,0]
    gt_w=gt[:,3]-gt[:,1]
    gt_center_y=gt[:,0]+0.5*gt_h
    gt_center_x=gt[:,1]+0.5*gt_w
    a_h = a[:,2] - a[:,0]
    a_w = a[:,3] - a[:,1]
    a_center_y = a[:,0] + 0.5 * a_h
    a_center_x = a[:,1] + 0.5 * a_w
    rpn_bbox[:len(ids),0]=(gt_center_y - a_center_y) / a_h   # only the first len(ids) rows are filled, because only
                                                             # the positive anchors get non-zero regression targets
    rpn_bbox[:len(ids),1]=(gt_center_x - a_center_x) / a_w
    rpn_bbox[:len(ids),2]=np.log(gt_h / a_h)
    rpn_bbox[:len(ids),3]=np.log(gt_w / a_w)
    # Normalize
    rpn_bbox /= config.RPN_BBOX_STD_DEV
    
    # rpn_match holds -1/0/1 and records which anchors are negative, neutral, or positive
    # rpn_bbox: the first len(ids) rows are the deltas between the positive anchors and their GT boxes;
    #           the remaining rows stay zero
    return rpn_match, rpn_bbox
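
    # A worked example of the regression targets above (illustrative numbers):
    # anchor (y1,x1,y2,x2) = (0, 0, 100, 100)   -> center (50, 50), h = w = 100
    # GT box (y1,x1,y2,x2) = (10, 10, 110, 130) -> center (60, 70), h = 100, w = 120
    # dy = (60-50)/100 = 0.1,  dx = (70-50)/100 = 0.2,
    # dh = log(100/100) = 0.0, dw = log(120/100) ≈ 0.182,
    # and the whole row is finally divided element-wise by RPN_BBOX_STD_DEV = [0.1, 0.1, 0.2, 0.2].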
    
    # for i, a in zip(ids, anchors[ids]):
        # # Closest gt box (it might have IoU < 0.7)
        # gt = gt_boxes[anchor_iou_argmax[i]]  # anchor_iou_argmax[i] is the column index of row i's maximum IoU,
                                             # # i.e. which GT box best matches anchor i
        # # Convert coordinates to center plus width/height.
        # # GT Box
        # gt_h = gt[2] - gt[0]
        # gt_w = gt[3] - gt[1]
        # gt_center_y = gt[0] + 0.5 * gt_h
        # gt_center_x = gt[1] + 0.5 * gt_w
        # # Anchor
        # a_h = a[2] - a[0]
        # a_w = a[3] - a[1]
        # a_center_y = a[0] + 0.5 * a_h
        # a_center_x = a[1] + 0.5 * a_w

        # # Compute the bbox refinement that the RPN should predict.
        # rpn_bbox[ix] = [
            # (gt_center_y - a_center_y) / a_h,
            # (gt_center_x - a_center_x) / a_w,
            # np.log(gt_h / a_h),
            # np.log(gt_w / a_w),
        # ]
        # # Normalize
        # rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
        # ix += 1
        
               

class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset, config, augment=True):
        """A generator that returns images and corresponding target class ids,
            bounding box deltas, and masks.
            dataset: The Dataset object to pick data from
            config: The model config object
            shuffle: If True, shuffles the samples before every epoch
            augment: If True, applies image augmentation to images (currently only
                     horizontal flips are supported)
            Returns a Python generator. Upon calling next() on it, the
            generator returns two lists, inputs and outputs. The contents
            of the lists differs depending on the received arguments:
            inputs list:
            - images: [batch, H, W, C]
            - image_metas: [batch, size of image meta]
            - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
            - rpn_bbox: [batch, M, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
            - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
            - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
            - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
                        are those of the image unless use_mini_mask is True, in which
                        case they are defined in MINI_MASK_SHAPE.
            outputs list: Usually empty in regular training. But if detection_targets
                is True then the outputs list contains target class_ids, bbox deltas,
                and masks.
            """
        self.b = 0  # batch item index
        self.image_index = -1
        self.image_ids = np.copy(dataset.image_ids)
        self.error_count = 0

        self.dataset = dataset 
        self.config = config
        self.augment = augment  #图像增强方式

        # Anchors
        # [anchor_count, (y1, x1, y2, x2)]
        self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                                 config.RPN_ANCHOR_RATIOS,
                                                 config.BACKBONE_SHAPES,
                                                 config.BACKBONE_STRIDES,
                                                 config.RPN_ANCHOR_STRIDE)
            # there are 5 pyramid feature maps (P2-P6); a set of anchors is generated for each one,
            # and all anchors are expressed in the coordinate frame of the (resized) input image
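        # Illustrative sketch: anchor counts per level, assuming the default 1024x1024 input,
        # 3 anchor ratios and RPN_ANCHOR_STRIDE = 1 (numbers hold only for these assumed defaults).
        # feature_shapes = [(256, 256), (128, 128), (64, 64), (32, 32), (16, 16)]
        # anchors_per_level = [h * w * 3 for h, w in feature_shapes]
        # print(anchors_per_level)       # [196608, 49152, 12288, 3072, 768]
        # print(sum(anchors_per_level))  # 261888 anchors in total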

    def __getitem__(self, image_index):  # image_index: index of the sample within the dataset
        # Get GT bounding boxes and masks for image.
        image_id = self.image_ids[image_index]
        
        image, image_metas, gt_class_ids, gt_boxes, gt_masks = \
            load_image_gt(self.dataset, self.config, image_id, augment=self.augment,
                          use_mini_mask=self.config.USE_MINI_MASK)

        # Skip images that have no instances. This can happen in cases
        # where we train on a subset of classes and the image doesn't
        # have any of the classes we care about.
        #跳过图片上没有实例样本
        if not np.any(gt_class_ids > 0):
            return None

        # RPN Targets
        rpn_match, rpn_bbox = build_rpn_targets(image.shape, self.anchors,
                                                gt_class_ids, gt_boxes, self.config)

        # If more instances than fits in the array, sub-sample from them.
        if gt_boxes.shape[0] > self.config.MAX_GT_INSTANCES:  #如果图片上的box超过了100个,则只从中挑选100个
                                                              #图片上要检测的物体不能太多
            ids = np.random.choice(
                np.arange(gt_boxes.shape[0]), self.config.MAX_GT_INSTANCES, replace=False)
            gt_class_ids = gt_class_ids[ids]
            gt_boxes = gt_boxes[ids]
            gt_masks = gt_masks[:, :, ids]

        # Add to batch
        rpn_match = rpn_match[:, np.newaxis] #将其变成二维的 (N,1)
        images = mold_image(image.astype(np.float32), self.config)#减去平均值

        # Convert
        images = torch.from_numpy(images.transpose(2, 0, 1)).float()
        image_metas = torch.from_numpy(image_metas)
        rpn_match = torch.from_numpy(rpn_match)
        rpn_bbox = torch.from_numpy(rpn_bbox).float()
        gt_class_ids = torch.from_numpy(gt_class_ids)
        gt_boxes = torch.from_numpy(gt_boxes).float()
        gt_masks = torch.from_numpy(gt_masks.astype(int).transpose(2, 0, 1)).float() # (height, width, num_instances) -> (num_instances, height, width)

        return images, image_metas, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks

    def __len__(self):
        return self.image_ids.shape[0]


############################################################
#  MaskRCNN Class
############################################################

class MaskRCNN(nn.Module):
    """Encapsulates the Mask RCNN model functionality.
    """

    def __init__(self, config, model_dir):
        """
        config: A Sub-class of the Config class
        model_dir: Directory to save training logs and trained weights
        """
        super(MaskRCNN, self).__init__()
        self.config = config
        self.model_dir = model_dir  
        self.set_log_dir()  #设置log路径,以及每个eopch保存模型参数路径,并且self.epoch=0
                            # 'C://Desktop/coco20171029T2315\\mask_rcnn_coco_{:04d}.pth'
                            # 'C://Desktop' 是model_dir
                            # 'coco' 是 config.NAME 数据集名字
                            # '20171029T2315'  程序运行该条代码的时间
                            # 'mask_rcnn_coco_{:04d}.pth' 固定字符
                            #  如果set_log_dir传入参数(model_path),则生成的地址中的时间部分是model_path中的时间
                            #  并且将self.epoch改为model_path('.../**.pth')的**部分的值
                            #  该代码衍生物self.log_dir='C://Desktop/coco20171029T2315/'
                            #  self.checkpoint_path='C://Desktop/coco20171029T2315\\mask_rcnn_coco_{:04d}.pth'
                            
        self.build(config=config) # build the network architecture
        self.initialize_weights() # initialize the model parameters
        self.loss_history = []
        self.val_loss_history = []

    def build(self, config):
        """Build Mask R-CNN architecture.
        """

        # Image size must be divisible by 2 multiple times
        h, w = config.IMAGE_SHAPE[:2]
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): #如果h,w不是64的整数倍
            raise Exception("Image size must be dividable by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling."
                            "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
        # Here stage 5 is created as well (stage5=True), so stages() returns C1 through C5.
        resnet = ResNet("resnet101", stage5=True)
        C1, C2, C3, C4, C5 = resnet.stages() 

        # Top-down Layers
        # TODO: add assert to verify feature map sizes match what's in config
        self.fpn = FPN(C1, C2, C3, C4, C5, out_channels=256) #输出[p2_out, p3_out, p4_out, p5_out, p6_out]
                                                             #特征融合再输出
                                                             #p2_out(1,256,256,256)  4
                                                             #p3_out(1,256,128,128)  8
                                                             #p4_out(1,256,64,64)    16
                                                             #p5_out(1,256,32,32)    32 
                                                             #p6_out(1,256,16,16)                                                           
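        # Illustrative sketch: the feature map sizes above follow from the input size and the backbone
        # strides (assuming the default 1024x1024 input and strides [4, 8, 16, 32, 64]); this mirrors
        # how config.BACKBONE_SHAPES is typically derived. math and numpy are imported in this file.
        # backbone_shapes = np.array(
        #     [[int(math.ceil(1024 / s)), int(math.ceil(1024 / s))] for s in [4, 8, 16, 32, 64]])
        # print(backbone_shapes)  # [[256 256] [128 128] [64 64] [32 32] [16 16]]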
                                                                                                                         
        # Generate Anchors
        #
        self.anchors = Variable(torch.from_numpy(utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                                                                config.RPN_ANCHOR_RATIOS,
                                                                                config.BACKBONE_SHAPES,
                                                                                config.BACKBONE_STRIDES,
                                                                                config.RPN_ANCHOR_STRIDE)).float(), requires_grad=False)
                                              
        if self.config.GPU_COUNT:
            self.anchors = self.anchors.cuda()

        # RPN          
        #len(config.RPN_ANCHOR_RATIOS)=3
        #config.RPN_ANCHOR_STRIDE=1
        #output: [rpn_class_logits, rpn_probs,   rpn_bbox] 
        #        (1,256*256*3,2) (1,256*256*3,2) (1,256*256*3,4)
        #        (1,128*128*3,2) (1,128*128*3,2) (1,128*128*3,4)
        #        (1,64*64*3,2)   (1,64*64*3,2)   (1,64*64*3,4)
        #        (1,32*32*3,2)   (1,32*32*3,2)   (1,32*32*3,4)
        #        (1,16*16*3,2)   (1,16*16*3,2)   (1,16*16*3,4)
        self.rpn = RPN(len(config.RPN_ANCHOR_RATIOS), config.RPN_ANCHOR_STRIDE, 256)
        

        # FPN Classifier
        self.classifier = Classifier(256, config.POOL_SIZE, config.IMAGE_SHAPE, config.NUM_CLASSES)

        # FPN Mask
        # config.MASK_POOL_SIZE = 14 (ROI Align output size for the mask head)
        # config.IMAGE_SHAPE = [1024, 1024, 3]
        # config.NUM_CLASSES = number of classes including background (81 for COCO)
        # the mask head predicts a 28x28 mask per class for each roi
        self.mask = Mask(256, config.MASK_POOL_SIZE, config.IMAGE_SHAPE, config.NUM_CLASSES)

        # Fix batch norm layers
        # 固定BatchNorm层的参数
        def set_bn_fix(m):
            classname = m.__class__.__name__
            if classname.find('BatchNorm') != -1:
                for p in m.parameters(): 
                    p.requires_grad = False

        self.apply(set_bn_fix)

    def initialize_weights(self):
        """Initialize model weights.
        """

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_uniform(m.weight) # Xavier uniform initialization for Conv2d weights
                if m.bias is not None:
                    m.bias.data.zero_()    #将nn.Conv2d层的bias置0
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)     #对于nn.BatchNorm2d层w初始化参数置1
                m.bias.data.zero_()        #对于nn.BatchNorm2d层b初始化参数置0
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01) # normal distribution with mean 0 and std 0.01
                m.bias.data.zero_()

    def set_trainable(self, layer_regex, model=None, indent=0, verbose=1):
        """Sets model layers as trainable if their names match
        the given regular expression.
        """

        for param in self.named_parameters():
            layer_name = param[0] #每层网络中的每个参数的名字
            trainable = bool(re.fullmatch(layer_regex, layer_name))
                            #re.fullmatch(pattern, string) 完成匹配string
                            #只有当pattern与string一模一样的时候
                            #bool(re.fullmatch(pattern, string))返回true
            if not trainable:
                param[1].requires_grad = False  # freeze this parameter: no gradients, so it is not updated
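        # Illustrative sketch: how the predefined "heads" regex decides which parameters stay trainable
        # (the parameter names below are made-up examples; re is imported at the top of this file).
        # heads_regex = r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)"
        # for name in ["fpn.C1.0.weight", "rpn.conv_shared.weight", "mask.conv1.bias"]:
        #     print(name, bool(re.fullmatch(heads_regex, name)))
        # # fpn.C1.0.weight False        -> frozen (backbone)
        # # rpn.conv_shared.weight True  -> trainable
        # # mask.conv1.bias True         -> trainable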

    def set_log_dir(self, model_path=None):
        """Sets the model log directory and epoch counter.
        model_path: If None, or a format different from what this code uses
            then set a new log directory and start epochs from 0. Otherwise,
            extract the log directory and the epoch counter from the file
            name.
        """

        # Set date and epoch counter as if starting a new model
        self.epoch = 0
        now = datetime.datetime.now() #当前日期时间(精确到秒)

        # If we have a model path with date and epochs use them
        if model_path:  #区分是保存模型参数的路径还是保存其他的路径
            # Continue from we left of. Get epoch and date from the file name
            # A sample model path might look like:
            # /path/to/logs/coco20171029T2315/mask_rcnn_coco_0001.h5
            regex = r".*/\w+(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})/mask\_rcnn\_\w+(\d{4})\.pth"
            m = re.match(regex, model_path)  #查看model_path是不是regex这样的模式
            if m:
                now = datetime.datetime(int(m.group(1)), int(m.group(2)), int(m.group(3)),
                                        int(m.group(4)), int(m.group(5)))
                #这里计算now主要保证模型的路径不会被下面的代码改变
                self.epoch = int(m.group(6)) #获取模型参数是哪个epoch的模型

        # Directory for training logs
        # 训练过程需要保存的路径
        self.log_dir = os.path.join(self.model_dir, "{}{:%Y%m%dT%H%M}".format(
            self.config.NAME.lower(), now))  # now is the timestamp used to name the log directory

        # Path to save after each epoch. Include placeholders that get filled by Keras.
        self.checkpoint_path = os.path.join(self.log_dir, "mask_rcnn_{}_*epoch*.pth".format(
            self.config.NAME.lower()))
        self.checkpoint_path = self.checkpoint_path.replace(
            "*epoch*", "{:04d}")

    def find_last(self):
        """Finds the last checkpoint file of the last trained model in the
        model directory.
        Returns:
            log_dir: The directory where events and weights are saved
            checkpoint_path: the path to the last checkpoint file
        """
        # Get directory names. Each directory corresponds to a model
        dir_names = next(os.walk(self.model_dir))[1] #获得self.model_dir路径下文件夹的名字
        key = self.config.NAME.lower()
        dir_names = filter(lambda f: f.startswith(key), dir_names)#过滤掉不是以key字符为开始的文件夹
        dir_names = sorted(dir_names)  #对dir_names进行排序
        if not dir_names:
            return None, None
        # Pick last directory
        dir_name = os.path.join(self.model_dir, dir_names[-1]) # pick the most recent training directory
        # Find the last checkpoint
        checkpoints = next(os.walk(dir_name))[2] #选择文件夹dir_name下面的文件
        checkpoints = filter(lambda f: f.startswith("mask_rcnn"), checkpoints) # drop files that don't start with "mask_rcnn"
        checkpoints = sorted(checkpoints) #对过滤后的文件进行排序
        if not checkpoints:
            return dir_name, None
        checkpoint = os.path.join(dir_name, checkpoints[-1]) #选择排序后的文件的最后一个文件
        return dir_name, checkpoint

    def load_weights(self, filepath):
        """Modified version of the correspoding Keras function with
        the addition of multi-GPU support and the ability to exclude
        some layers from loading.
        exclude: list of layer names to exclude
        """
        if os.path.exists(filepath):
            state_dict = torch.load(filepath)
            self.load_state_dict(state_dict, strict=False)
        else:
            print("Weight file not found ...")

        # Update the log directory
        self.set_log_dir(filepath)  # re-derive log_dir, checkpoint_path and the epoch counter from the weight file path
        if not os.path.exists(self.log_dir):  #如果log_dir不存在,则创建一个
            os.makedirs(self.log_dir)

    def detect(self, images):      
        """Runs the detection pipeline.
        images: List of images, potentially of different sizes.
        Returns a list of dicts, one dict per image. The dict contains:
        rois: [N, (y1, x1, y2, x2)] detection bounding boxes
        class_ids: [N] int class IDs
        scores: [N] float probability scores for the class IDs
        masks: [H, W, N] instance binary masks
        """
        '''
        meta = np.array(
        [image_id] +            # size=1
        list(image_shape) +     # size=3
        list(window) +          # size=4 (y1, x1, y2, x2) in image cooredinates
        list(active_class_ids)  # size=num_classes
        )
        '''
        # Mold inputs to format expected by the neural network
        # resize every image in the list (they may have different sizes) to (1024,1024,3), normalize it,
        # then stack the results into one 4D array
        # molded_images: (N,1024,1024,3), where N = len(images)
        # the meta of a single image is (image_id, image_shape, window, active_class_ids)
        # here every image_id is 0
        # image_shape is the original shape of the image before resizing
        # window is (y1, x1, y2, x2) of the real image inside the padded image
        # active_class_ids marks which classes are active for this image
        molded_images, image_metas, windows = self.mold_inputs(images)
        

        # Convert images to torch tensor
        #将图片变成tensor
        molded_images = torch.from_numpy(molded_images.transpose(0, 3, 1, 2)).float()
        
        # To GPU
        if self.config.GPU_COUNT:
            molded_images = molded_images.cuda()

        # Wrap in variable
        # 现在tensor和Variable合并了,下面这句可以省略
        molded_images = Variable(molded_images, volatile=True)  

        # Run object detection
        #detections (1,num_detections,6)
        #mrcnn_mask (1,num_detections,num_class,28,28)
        detections, mrcnn_mask = self.predict([molded_images, image_metas], mode='inference')

        # Convert to numpy
        detections = detections.data.cpu().numpy()
        mrcnn_mask = mrcnn_mask.permute(0, 1, 3, 4, 2).data.cpu().numpy() #(1,num_detections,28,28,num_class)

        ################################################################################
        # Process detections
        results = []
        #images是一个list
        #这里的images里面只能是一张图片
        # unmold_detections maps the detection boxes and mrcnn_mask back onto the original image
        for i, image in enumerate(images):
            final_rois, final_class_ids, final_scores, final_masks =\
                self.unmold_detections(detections[i], mrcnn_mask[i],
                                       image.shape, windows[i])
            results.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
            })
        return results
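        # Illustrative usage sketch (the config class, weight file and image path are hypothetical;
        # a Config subclass such as the one in the repo's config.py and cv2 are assumed):
        # config = InferenceConfig()
        # model = MaskRCNN(config=config, model_dir="logs")
        # model.load_weights("logs/coco20171029T2315/mask_rcnn_coco_0003.pth")
        # if config.GPU_COUNT:
        #     model = model.cuda()
        # image = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
        # results = model.detect([image])   # one dict per input image
        # r = results[0]
        # print(r["rois"].shape, r["class_ids"], r["scores"], r["masks"].shape)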
        
        

    def predict(self, input, mode):
        molded_images = input[0]  
        image_metas = input[1]

        if mode == 'inference':
            self.eval()
        elif mode == 'training':
            self.train()

            # Set batchnorm always in eval mode during training
            # 在训练模式时冻结BatchNorm
            def set_bn_eval(m):
                classname = m.__class__.__name__
                if classname.find('BatchNorm') != -1:
                    m.eval()

            self.apply(set_bn_eval)

        # Feature extraction
        [p2_out, p3_out, p4_out, p5_out, p6_out] = self.fpn(molded_images)
        #p2_out (N,256,256,256)
        #p3_out (N,256,128,128)
        #p4_out (N,256,64,64)
        #p5_out (N,256,32,32)
        #p6_out (N,256,16,16)
        
        # Note that P6 is used in RPN, but not in the classifier heads.
        rpn_feature_maps = [p2_out, p3_out, p4_out, p5_out, p6_out]
        mrcnn_feature_maps = [p2_out, p3_out, p4_out, p5_out]

        # Loop through pyramid layers
        # rpn 输出
        layer_outputs = []  # list of lists
        for p in rpn_feature_maps:
            layer_outputs.append(self.rpn(p))
        # Concatenate layer outputs
        # Convert from list of lists of level outputs to list of lists
        # of outputs across levels.
        # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
        outputs = list(zip(*layer_outputs))
        outputs = [torch.cat(list(o), dim=1) for o in outputs]  #这里的o是一个三维的,dim=1,按照这里的第1维进行拼接
        rpn_class_logits, rpn_class, rpn_bbox = outputs
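        # Illustrative sketch of the zip/cat trick above with two toy levels (shapes correspond to the
        # P2 and P3 anchor counts for a 1024x1024 input):
        # level1 = (torch.zeros(1, 196608, 2), torch.zeros(1, 196608, 2), torch.zeros(1, 196608, 4))
        # level2 = (torch.zeros(1, 49152, 2),  torch.zeros(1, 49152, 2),  torch.zeros(1, 49152, 4))
        # merged = [torch.cat(list(o), dim=1) for o in zip(*[level1, level2])]
        # print([t.shape for t in merged])
        # # [torch.Size([1, 245760, 2]), torch.Size([1, 245760, 2]), torch.Size([1, 245760, 4])]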
        # Generate proposals
        # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
        # and zero padded.
        #config.POST_NMS_ROIS_TRAINING=2000
        #config.POST_NMS_ROIS_INFERENCE=1000
        proposal_count = self.config.POST_NMS_ROIS_TRAINING if mode == "training" \
            else self.config.POST_NMS_ROIS_INFERENCE
        rpn_rois = proposal_layer([rpn_class, rpn_bbox],
                                 proposal_count=proposal_count,
                                 nms_threshold=self.config.RPN_NMS_THRESHOLD,
                                 anchors=self.anchors,
                                 config=self.config)

        if mode == 'inference':
            # Network Heads
            # Proposal classifier and BBox regressor heads
            # self.classifier ROI-Aligns each of rpn_rois into an (N, C, 7, 7) feature,
            # then feeds it to the mrcnn classification and box-regression heads
            # mrcnn_class_logits  (num_rpn_rois,num_class)
            # mrcnn_class         (num_rpn_rois,num_class)
            # mrcnn_bbox          (num_rpn_rois,num_class,4)
            mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.classifier(mrcnn_feature_maps, rpn_rois)

            # Detections
            # detections [num_detections, (y1, x1, y2, x2, class_id, score)] in image coordinates
            # refine rpn_rois with the predicted deltas, filter by score and per-class NMS,
            # and keep at most DETECTION_MAX_INSTANCES (100) of the highest-scoring boxes
            # detections (<=100, 6)
            detections = detection_layer(self.config, rpn_rois, mrcnn_class, mrcnn_bbox, image_metas)
            

            # Convert boxes to normalized coordinates
            # TODO: let DetectionLayer return normalized coordinates to avoid
            #       unnecessary conversions
            h, w = self.config.IMAGE_SHAPE[:2]
            scale = Variable(torch.from_numpy(np.array([h, w, h, w])).float(), requires_grad=False)
            if self.config.GPU_COUNT:
                scale = scale.cuda()
            detection_boxes = detections[:, :4] / scale  #将坐标归一化

            # Add back batch dimension
            detection_boxes = detection_boxes.unsqueeze(0) #[1,num_detections,4]

            # Create masks for detections
            # mrcnn_feature_maps=[p2_out, p3_out, p4_out, p5_out]
            # mrcnn_mask (rois_num,num_classes,28,28)
            mrcnn_mask = self.mask(mrcnn_feature_maps, detection_boxes)

            # Add back batch dimension
            detections = detections.unsqueeze(0)  #(1,num_detections,6)
            mrcnn_mask = mrcnn_mask.unsqueeze(0)  #(1,rois_num ,num_classes,28,28)

            return [detections, mrcnn_mask]

        elif mode == 'training':

            gt_class_ids = input[2]
            gt_boxes = input[3]
            gt_masks = input[4]

            # Normalize coordinates
            h, w = self.config.IMAGE_SHAPE[:2]
            scale = Variable(torch.from_numpy(np.array([h, w, h, w])).float(), requires_grad=False)
            if self.config.GPU_COUNT:
                scale = scale.cuda()
            gt_boxes = gt_boxes / scale   #gt_boxes归一化

            # Generate detection targets
            # Subsamples proposals and generates target outputs for training
            # Note that proposal class IDs, gt_boxes, and gt_masks are zero
            # padded. Equally, returned rois and targets are zero padded.
            # rois: a sample of TRAIN_ROIS_PER_IMAGE proposals, with roughly ROI_POSITIVE_RATIO of them positive
            # target_class_ids: for positive rois, the class of the best-matching GT box; 0 for negative rois
            # target_deltas: box refinements toward the matched GT box (zeros for negatives)
            # target_mask: the matched GT mask cropped/resized to the roi (zeros for negatives)
            rois, target_class_ids, target_deltas, target_mask = \
                detection_target_layer(rpn_rois, gt_class_ids, gt_boxes, gt_masks, self.config)

            if not rois.size():
                mrcnn_class_logits = Variable(torch.FloatTensor())
                mrcnn_class = Variable(torch.IntTensor())
                mrcnn_bbox = Variable(torch.FloatTensor())
                mrcnn_mask = Variable(torch.FloatTensor())
                if self.config.GPU_COUNT:
                    mrcnn_class_logits = mrcnn_class_logits.cuda()
                    mrcnn_class = mrcnn_class.cuda()
                    mrcnn_bbox = mrcnn_bbox.cuda()
                    mrcnn_mask = mrcnn_mask.cuda()
            else:
                # Network Heads
                # Proposal classifier and BBox regressor heads
                mrcnn_class_logits, mrcnn_class, mrcnn_bbox = self.classifier(mrcnn_feature_maps, rois)

                # Create masks for detections
                mrcnn_mask = self.mask(mrcnn_feature_maps, rois)

            return [rpn_class_logits, rpn_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask]

    def train_model(self, train_dataset, val_dataset, learning_rate, epochs, layers):
        """Train the model.
        train_dataset, val_dataset: Training and validation Dataset objects.
        learning_rate: The learning rate to train with
        epochs: Number of training epochs. Note that previous training epochs
                are considered to be done already, so this actually determines
                the epochs to train in total rather than in this particular
                call.
        layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
              all: All the layers
              3+: Train Resnet stage 3 and up
              4+: Train Resnet stage 4 and up
              5+: Train Resnet stage 5 and up
        """

        # Pre-defined layer regular expressions
        layer_regex = {
            # all layers but the backbone
            "heads": r"(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # From a specific Resnet stage and up
            "3+": r"(fpn.C3.*)|(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "4+": r"(fpn.C4.*)|(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            "5+": r"(fpn.C5.*)|(fpn.P5\_.*)|(fpn.P4\_.*)|(fpn.P3\_.*)|(fpn.P2\_.*)|(rpn.*)|(classifier.*)|(mask.*)",
            # All layers
            "all": ".*",
        }
        if layers in layer_regex.keys():
            layers = layer_regex[layers]

        # Data generators
        # results of Dataset
        # images, image_metas, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks
        train_set = Dataset(train_dataset, self.config, augment=True)
        train_generator = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True, num_workers=4)
        val_set = Dataset(val_dataset, self.config, augment=True)
        val_generator = torch.utils.data.DataLoader(val_set, batch_size=1, shuffle=True, num_workers=4)

        # Train
        # self.epoch初始为0
        log("\nStarting at epoch {}. LR={}\n".format(self.epoch+1, learning_rate))
        
        #Checkpoint Path: C://desktop/coco20171029T2315\mask_rcnn_coco_{:04d}.pth
        log("Checkpoint Path: {}".format(self.checkpoint_path))
        self.set_trainable(layers)

        # Optimizer object
        # Add L2 Regularization
        # Skip gamma and beta weights of batch normalization layers.
        # trainables_wo_bn 非BatchNorm层的参数
        # trainables_only_bn BatchNorm层的参数
        trainables_wo_bn = [param for name, param in self.named_parameters() if param.requires_grad and not 'bn' in name]
        trainables_only_bn = [param for name, param in self.named_parameters() if param.requires_grad and 'bn' in name]
        optimizer = optim.SGD([
            {'params': trainables_wo_bn, 'weight_decay': self.config.WEIGHT_DECAY}, # L2 weight decay on non-BatchNorm parameters
            {'params': trainables_only_bn} # no weight decay on BatchNorm parameters
        ], lr=learning_rate, momentum=self.config.LEARNING_MOMENTUM)

        # self.epoch is the starting epoch; it is updated whenever set_log_dir(model_path) is called,
        # based on the epoch number embedded in model_path
        # to resume training from a given epoch, simply load the weights saved at that epoch:
        # e.g. loading '.../mask_rcnn_coco_0003.pth' makes training continue from epoch 4
        for epoch in range(self.epoch+1, epochs+1):
            log("Epoch {}/{}.".format(epoch,epochs))

            # Training
            # 里面进行了正向传播求loss和打印loss,并进行反向传播更新参数,且执行1000次
            loss, loss_rpn_class, loss_rpn_bbox, loss_mrcnn_class, loss_mrcnn_bbox, loss_mrcnn_mask = self.train_epoch(train_generator, optimizer, self.config.STEPS_PER_EPOCH)

            # Validation
            # 里面进行了正向传播求loss和打印loss,且执行50次
            val_loss, val_loss_rpn_class, val_loss_rpn_bbox, val_loss_mrcnn_class, val_loss_mrcnn_bbox, val_loss_mrcnn_mask = self.valid_epoch(val_generator, self.config.VALIDATION_STEPS)

            # Statistics
            self.loss_history.append([loss, loss_rpn_class, loss_rpn_bbox, loss_mrcnn_class, loss_mrcnn_bbox, loss_mrcnn_mask])
            self.val_loss_history.append([val_loss, val_loss_rpn_class, val_loss_rpn_bbox, val_loss_mrcnn_class, val_loss_mrcnn_bbox, val_loss_mrcnn_mask])
            visualize.plot_loss(self.loss_history, self.val_loss_history, save=True, log_dir=self.log_dir)

            # Save model
            torch.save(self.state_dict(), self.checkpoint_path.format(epoch))

        self.epoch = epochs
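        # Illustrative usage sketch (dataset class, data path and epoch counts are hypothetical;
        # CocoDataset stands for a Dataset subclass such as the one in the repo's coco.py):
        # dataset_train = CocoDataset(); dataset_train.load_coco("/data/coco", "train"); dataset_train.prepare()
        # dataset_val = CocoDataset();   dataset_val.load_coco("/data/coco", "minival"); dataset_val.prepare()
        # model = MaskRCNN(config=config, model_dir="logs")
        # if config.GPU_COUNT:
        #     model = model.cuda()
        # # stage 1: train only the heads, backbone frozen
        # model.train_model(dataset_train, dataset_val, learning_rate=config.LEARNING_RATE, epochs=40, layers='heads')
        # # stage 2: fine-tune ResNet stage 4 and up with a lower learning rate
        # model.train_model(dataset_train, dataset_val, learning_rate=config.LEARNING_RATE / 10, epochs=120, layers='4+')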

    def train_epoch(self, datagenerator, optimizer, steps):
        batch_count = 0
        loss_sum = 0
        loss_rpn_class_sum = 0
        loss_rpn_bbox_sum = 0
        loss_mrcnn_class_sum = 0
        loss_mrcnn_bbox_sum = 0
        loss_mrcnn_mask_sum = 0
        step = 0

        optimizer.zero_grad()   #将梯度初始化为零
                                #(因为一个batch的loss关于weight的导数是所有sample的loss关于weight的导数的累加和)

        for inputs in datagenerator:
            batch_count += 1

            images = inputs[0]
            image_metas = inputs[1]
            rpn_match = inputs[2]
            rpn_bbox = inputs[3]
            gt_class_ids = inputs[4]
            gt_boxes = inputs[5]
            gt_masks = inputs[6]

            # image_metas as numpy array
            image_metas = image_metas.numpy()

            # Wrap in variables
            images = Variable(images)
            rpn_match = Variable(rpn_match)
            rpn_bbox = Variable(rpn_bbox)
            gt_class_ids = Variable(gt_class_ids)
            gt_boxes = Variable(gt_boxes)
            gt_masks = Variable(gt_masks)

            # To GPU
            if self.config.GPU_COUNT:
                images = images.cuda()
                rpn_match = rpn_match.cuda()
                rpn_bbox = rpn_bbox.cuda()
                gt_class_ids = gt_class_ids.cuda()
                gt_boxes = gt_boxes.cuda()
                gt_masks = gt_masks.cuda()

            # Run object detection
            rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask = \
                self.predict([images, image_metas, gt_class_ids, gt_boxes, gt_masks], mode='training')

            # Compute losses
            rpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss = compute_losses(rpn_match, rpn_bbox, rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask)
            loss = rpn_class_loss + rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss

            # Backpropagation
            loss.backward()   #loss反向传播
            torch.nn.utils.clip_grad_norm(self.parameters(), 5.0) #梯度裁剪
            if (batch_count % self.config.BATCH_SIZE) == 0:  #每隔config.BATCH_SIZE个batch进行一次参数更新
                optimizer.step()   #参数更新
                optimizer.zero_grad()  #梯度清零
                batch_count = 0  

            # Progress
            printProgressBar(step + 1, steps, prefix="\t{}/{}".format(step + 1, steps),
                             suffix="Complete - loss: {:.5f} - rpn_class_loss: {:.5f} - rpn_bbox_loss: {:.5f} - mrcnn_class_loss: {:.5f} - mrcnn_bbox_loss: {:.5f} - mrcnn_mask_loss: {:.5f}".format(
                                 loss.data.cpu()[0], rpn_class_loss.data.cpu()[0], rpn_bbox_loss.data.cpu()[0],
                                 mrcnn_class_loss.data.cpu()[0], mrcnn_bbox_loss.data.cpu()[0],
                                 mrcnn_mask_loss.data.cpu()[0]), length=10)

            # Statistics
            loss_sum += loss.data.cpu()[0]/steps
            loss_rpn_class_sum += rpn_class_loss.data.cpu()[0]/steps
            loss_rpn_bbox_sum += rpn_bbox_loss.data.cpu()[0]/steps
            loss_mrcnn_class_sum += mrcnn_class_loss.data.cpu()[0]/steps
            loss_mrcnn_bbox_sum += mrcnn_bbox_loss.data.cpu()[0]/steps
            loss_mrcnn_mask_sum += mrcnn_mask_loss.data.cpu()[0]/steps

            # Break after 'steps' steps
            if step==steps-1:
                break
            step += 1 #训练步数加1

        return loss_sum, loss_rpn_class_sum, loss_rpn_bbox_sum, loss_mrcnn_class_sum, loss_mrcnn_bbox_sum, loss_mrcnn_mask_sum
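    # The update rule above is plain gradient accumulation: the DataLoader yields one image at a time,
    # so gradients from config.BATCH_SIZE consecutive images are summed before a single optimizer step.
    # A self-contained toy sketch of the same pattern (not the Mask R-CNN pipeline; nn, optim and F are
    # already imported at the top of this file):
    # model = nn.Linear(10, 1)
    # optimizer = optim.SGD(model.parameters(), lr=0.01)
    # accum_steps = 4                                    # plays the role of config.BATCH_SIZE
    # optimizer.zero_grad()
    # for step in range(16):
    #     x, y = torch.randn(1, 10), torch.randn(1, 1)   # one sample per iteration
    #     loss = F.mse_loss(model(x), y)
    #     loss.backward()                                # gradients accumulate across iterations
    #     if (step + 1) % accum_steps == 0:
    #         optimizer.step()                           # one update per accum_steps samples
    #         optimizer.zero_grad()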

    def valid_epoch(self, datagenerator, steps):

        step = 0
        loss_sum = 0
        loss_rpn_class_sum = 0
        loss_rpn_bbox_sum = 0
        loss_mrcnn_class_sum = 0
        loss_mrcnn_bbox_sum = 0
        loss_mrcnn_mask_sum = 0

        for inputs in datagenerator:
            images = inputs[0]
            image_metas = inputs[1]
            rpn_match = inputs[2]
            rpn_bbox = inputs[3]
            gt_class_ids = inputs[4]
            gt_boxes = inputs[5]
            gt_masks = inputs[6]

            # image_metas as numpy array
            image_metas = image_metas.numpy()

            # Wrap in variables
            images = Variable(images, volatile=True)
            rpn_match = Variable(rpn_match, volatile=True)
            rpn_bbox = Variable(rpn_bbox, volatile=True)
            gt_class_ids = Variable(gt_class_ids, volatile=True)
            gt_boxes = Variable(gt_boxes, volatile=True)
            gt_masks = Variable(gt_masks, volatile=True)

            # To GPU
            if self.config.GPU_COUNT:
                images = images.cuda()
                rpn_match = rpn_match.cuda()
                rpn_bbox = rpn_bbox.cuda()
                gt_class_ids = gt_class_ids.cuda()
                gt_boxes = gt_boxes.cuda()
                gt_masks = gt_masks.cuda()

            # Run object detection
            rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask = \
                self.predict([images, image_metas, gt_class_ids, gt_boxes, gt_masks], mode='training')

            if not target_class_ids.size():
                continue

            # Compute losses
            rpn_class_loss, rpn_bbox_loss, mrcnn_class_loss, mrcnn_bbox_loss, mrcnn_mask_loss = compute_losses(rpn_match, rpn_bbox, rpn_class_logits, rpn_pred_bbox, target_class_ids, mrcnn_class_logits, target_deltas, mrcnn_bbox, target_mask, mrcnn_mask)
            loss = rpn_class_loss + rpn_bbox_loss + mrcnn_class_loss + mrcnn_bbox_loss + mrcnn_mask_loss

            # Progress
            # the call below just prints the per-step losses on a progress bar
            printProgressBar(step + 1, steps, prefix="\t{}/{}".format(step + 1, steps),
                             suffix="Complete - loss: {:.5f} - rpn_class_loss: {:.5f} - rpn_bbox_loss: {:.5f} - mrcnn_class_loss: {:.5f} - mrcnn_bbox_loss: {:.5f} - mrcnn_mask_loss: {:.5f}".format(
                                 loss.data.cpu()[0], rpn_class_loss.data.cpu()[0], rpn_bbox_loss.data.cpu()[0],
                                 mrcnn_class_loss.data.cpu()[0], mrcnn_bbox_loss.data.cpu()[0],
                                 mrcnn_mask_loss.data.cpu()[0]), length=10)

            # Statistics
            loss_sum += loss.data.cpu()[0]/steps
            loss_rpn_class_sum += rpn_class_loss.data.cpu()[0]/steps
            loss_rpn_bbox_sum += rpn_bbox_loss.data.cpu()[0]/steps
            loss_mrcnn_class_sum += mrcnn_class_loss.data.cpu()[0]/steps
            loss_mrcnn_bbox_sum += mrcnn_bbox_loss.data.cpu()[0]/steps
            loss_mrcnn_mask_sum += mrcnn_mask_loss.data.cpu()[0]/steps

            # Break after 'steps' steps
            if step==steps-1:
                break
            step += 1

        return loss_sum, loss_rpn_class_sum, loss_rpn_bbox_sum, loss_mrcnn_class_sum, loss_mrcnn_bbox_sum, loss_mrcnn_mask_sum

    def mold_inputs(self, images):
        """Takes a list of images and modifies them to the format expected
        as an input to the neural network.
        images: List of image matrices [height,width,depth]. Images can have
            different sizes.
        Returns 3 Numpy matrices:
        molded_images: [N, h, w, 3]. Images resized and normalized.
        image_metas: [N, length of meta data]. Details about each image.
        windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the
            original image (padding excluded).
        """
        molded_images = []
        image_metas = []
        windows = []
        for image in images:
            '''
                依次将每张图片进行尺寸处理,然后做平均值平移
            '''
            # Resize image to fit the model expected size
            # TODO: move resizing to mold_image()
            molded_image, window, scale, padding = utils.resize_image(
                image,
                min_dim=self.config.IMAGE_MIN_DIM,
                max_dim=self.config.IMAGE_MAX_DIM,
                padding=self.config.IMAGE_PADDING)
             
            molded_image = mold_image(molded_image, self.config)#将图片减去平均值
            # Build image_meta
            image_meta = compose_image_meta(
                0, image.shape, window,
                np.zeros([self.config.NUM_CLASSES], dtype=np.int32))
                #compose_image_meta()将括号内的数据打包成一个元组
            # Append
            molded_images.append(molded_image)
            windows.append(window)
            image_metas.append(image_meta)
        # Pack into arrays
        molded_images = np.stack(molded_images) #将每张图片拼成一个大矩阵
        image_metas = np.stack(image_metas)
        windows = np.stack(windows)
        return molded_images, image_metas, windows

    def unmold_detections(self, detections, mrcnn_mask, image_shape, window):
        """Reformats the detections of one image from the format of the neural
        network output to a format suitable for use in the rest of the
        application.
        detections: [N, (y1, x1, y2, x2, class_id, score)]  #(N,6)
        mrcnn_mask: [N, height, width, num_classes]         
        image_shape: [height, width, depth] Original size of the image before resizing
        window: [y1, x1, y2, x2] Box in the image where the real image is
                excluding the padding.
        Returns:
        boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
        class_ids: [N] Integer class IDs for each bounding box
        scores: [N] Float probability scores of the class_id
        masks: [height, width, num_instances] Instance masks
        """
        # How many detections do we have?
        # Detections array is padded with zeros. Find the first class_id == 0.
        zero_ix = np.where(detections[:, 4] == 0)[0]   # indices of the zero-padded rows (class_id == 0)
        # if padded rows exist, N is the index of the first one (detections are sorted by score, so all
        # rows before it are valid detections); otherwise N is the number of detection rows
        N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

        # Extract boxes, class_ids, scores, and class-specific masks
        boxes = detections[:N, :4]    #取出前N个框
        class_ids = detections[:N, 4].astype(np.int32)  #取出前N个类别
        scores = detections[:N, 5]   # take the scores of the first N detections
        # mrcnn_mask: (N, height, width, num_classes)
        # each detection produces one mask per class
        # which class does a mask belong to? The mask head predicts a mask for every class independently
        # (a nice property: it removes competition between classes), and the class chosen for the
        # detection box (class_ids) decides which of those per-class masks we keep
        # e.g. with classes apple / banana / pear: if the box is classified as apple,
        # we only keep the mask channel that corresponds to "apple"
        
        masks = mrcnn_mask[np.arange(N), :, :, class_ids]  # for each detection, pick the mask channel of its class_id
                                                           # -> (N, height, width)
                                                           
                                                           

        # Compute scale and shift to translate coordinates to image domain.
        # img1: the original image
        # img2: the resized image
        # img3: the resized image after zero padding; window gives the coordinates of img2 inside img3
        #       (the padding is centered, so window = (top_pad, left_pad, h + top_pad, w + left_pad))
        # in other words, window covers exactly the same pixels as img2, just shifted by the padding,
        # so window has the same height and width as img2
        h_scale = image_shape[0] / (window[2] - window[0]) #这里是原图在高度上进行放缩的比例因子
        w_scale = image_shape[1] / (window[3] - window[1]) #这里是原图在宽度上进行放缩的比例因子  
        scale = min(h_scale, w_scale) #比较两个放缩因子谁更小
        shift = window[:2]  # y, x    #获取top_pad,left_pad
        scales = np.array([scale, scale, scale, scale])
        shifts = np.array([shift[0], shift[1], shift[0], shift[1]])

        # Translate bounding boxes to image domain
        # map the boxes back to the original image with the inverse shift and scale
        boxes = np.multiply(boxes - shifts, scales).astype(np.int32)

        # Filter out detections with zero area. Often only happens in early
        # stages of training when the network weights are still a bit random.
        # 进一步筛选取出合格的边框
        exclude_ix = np.where(
            (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
        if exclude_ix.shape[0] > 0:
            boxes = np.delete(boxes, exclude_ix, axis=0)
            class_ids = np.delete(class_ids, exclude_ix, axis=0)
            scores = np.delete(scores, exclude_ix, axis=0)
            masks = np.delete(masks, exclude_ix, axis=0)
            N = class_ids.shape[0]

        # Resize masks to original image size and set boundary threshold.
        full_masks = []
        for i in range(N):
            # Convert neural network mask to full size mask
            full_mask = utils.unmold_mask(masks[i], boxes[i], image_shape)
            full_masks.append(full_mask)
        if full_masks:
            full_masks = np.stack(full_masks, axis=-1)
        else:
            full_masks = np.empty((0,) + masks.shape[1:3])
        
        return boxes, class_ids, scores, full_masks
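        # Illustrative numeric sketch of the back-projection above: a 512x768 original image resized to
        # 683x1024 and padded to 1024x1024 gives window = (170, 0, 853, 1024), so
        # scale = min(512/683, 768/1024) ≈ 0.7496 and shifts = [170, 0, 170, 0].
        # box = np.array([270, 100, 470, 300])                       # in molded-image coordinates
        # print(np.multiply(box - np.array([170, 0, 170, 0]), 0.7496).astype(np.int32))
        # # [ 74  74 224 224]  -> the same box in original-image coordinates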


############################################################
#  Data Formatting
############################################################

def compose_image_meta(image_id, image_shape, window, active_class_ids):
    """Takes attributes of an image and puts them in one 1D array. Use
    parse_image_meta() to parse the values back.
    image_id: An int ID of the image. Useful for debugging.
    image_shape: [height, width, channels]
    window: (y1, x1, y2, x2) in pixels. The area of the image where the real
            image is (excluding the padding)
    active_class_ids: List of class_ids available in the dataset from which
        the image came. Useful if training on images from multiple datasets
        where not all classes are present in all datasets.
    """
    meta = np.array(
        [image_id] +            # size=1
        list(image_shape) +     # size=3
        list(window) +          # size=4 (y1, x1, y2, x2) in image cooredinates
        list(active_class_ids)  # size=num_classes
    )
    return meta


# Two functions (for Numpy and TF) to parse image_meta tensors.
def parse_image_meta(meta):
    """Parses an image info Numpy array to its components.
    See compose_image_meta() for more details.
    """
    image_id = meta[:, 0]
    image_shape = meta[:, 1:4]
    window = meta[:, 4:8]   # (y1, x1, y2, x2) window of image in in pixels
    active_class_ids = meta[:, 8:]
    return image_id, image_shape, window, active_class_ids


def parse_image_meta_graph(meta):
    """Parses a tensor that contains image attributes to its components.
    See compose_image_meta() for more details.
    meta: [batch, meta length] where meta length depends on NUM_CLASSES
    """
    image_id = meta[:, 0]
    image_shape = meta[:, 1:4]
    window = meta[:, 4:8]
    active_class_ids = meta[:, 8:]
    return [image_id, image_shape, window, active_class_ids]


def mold_image(images, config):
    """Takes RGB images with 0-255 values and subtraces
    the mean pixel and converts it to float. Expects image
    colors in RGB order.
    """
    return images.astype(np.float32) - config.MEAN_PIXEL


def unmold_image(normalized_images, config):
    """Takes a image normalized with mold() and returns the original."""
    return (normalized_images + config.MEAN_PIXEL).astype(np.uint8)
    
    
    

utils.py

"""
Mask R-CNN
Common utility functions and classes.

Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""

import sys
import os
import math
import random
import numpy as np
import scipy.misc
import scipy.ndimage
import skimage.color
import skimage.io
import torch
import cv2

############################################################
#  Bounding Boxes
############################################################

def extract_bboxes(mask):
    """Compute bounding boxes from masks.
    mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
          num_instances is the number of object instances in the image;
          a pixel value of 1 means the pixel belongs to the instance, 0 means it does not
    Returns: bbox array [num_instances, (y1, x1, y2, x2)].
    """
    # each mask channel is a binary map; the bounding box is the tightest rectangle enclosing its nonzero pixels
    
    boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)  #num_instances,4)
    for i in range(mask.shape[-1]):
        m = mask[:, :, i]  #[height,width]
        # Bounding box.
        horizontal_indicies = np.where(np.any(m, axis=0))[0] # indices of columns that contain at least one nonzero pixel
        vertical_indicies = np.where(np.any(m, axis=1))[0]   # indices of rows that contain at least one nonzero pixel
        if horizontal_indicies.shape[0]:
            x1, x2 = horizontal_indicies[[0, -1]] #返回horizontal_indicies的第一个数和最后一个数
            y1, y2 = vertical_indicies[[0, -1]]   #返回vertical_indicies的第一个数和最后一个数(这里的数代表第几个像素)
            # x2 and y2 should not be part of the box. Increment by 1.
            x2 += 1
            y2 += 1                    # the steps above find the tight bounding box of the mask's nonzero pixels
        else:
            # No mask for this instance. Might happen due to
            # resizing or cropping. Set bbox to zeros
            x1, x2, y1, y2 = 0, 0, 0, 0
        boxes[i] = np.array([y1, x1, y2, x2])
    return boxes.astype(np.int32)
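# Illustrative check of extract_bboxes with a toy mask (one instance covering rows 1..3, columns 2..4):
# mask = np.zeros((6, 6, 1), dtype=np.uint8)
# mask[1:4, 2:5, 0] = 1
# print(extract_bboxes(mask))   # [[1 2 4 5]] -> (y1, x1, y2, x2), where y2/x2 are exclusive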


def compute_iou(box, boxes, box_area, boxes_area):
    """Calculates IoU of the given box with the array of the given boxes.
    box: 1D vector [y1, x1, y2, x2]
    boxes: [boxes_count, (y1, x1, y2, x2)]
    box_area: float. the area of 'box'
    boxes_area: array of length boxes_count.

    Note: the areas are passed in rather than calculated here for
          efficiency. Calculate once in the caller to avoid duplicate work.
    """
    # Calculate intersection areas
    y1 = np.maximum(box[0], boxes[:, 0])
    y2 = np.minimum(box[2], boxes[:, 2])
    x1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[3], boxes[:, 3])
    intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
    union = box_area + boxes_area[:] - intersection[:]
    iou = intersection / union
    return iou


def compute_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].

    For better performance, pass the largest set first and the smaller second.
    """
    # Areas of anchors and GT boxes
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Compute overlaps to generate matrix [boxes1 count, boxes2 count]
    # Each cell contains the IoU value.
    overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
    for i in range(overlaps.shape[1]):
        box2 = boxes2[i]
        overlaps[:, i] = compute_iou(box2, boxes1, area2[i], area1)
    return overlaps
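# Illustrative check of compute_overlaps with toy boxes:
# boxes1 = np.array([[0, 0, 10, 10]], dtype=np.float32)
# boxes2 = np.array([[0, 0, 10, 10], [5, 5, 15, 15]], dtype=np.float32)
# print(compute_overlaps(boxes1, boxes2))   # [[1.  0.14285715]] -> identical boxes, then IoU = 25/175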

def box_refinement(box, gt_box):
    """Compute refinement needed to transform box to gt_box.
    box and gt_box are [N, (y1, x1, y2, x2)]
    """

    height = box[:, 2] - box[:, 0]
    width = box[:, 3] - box[:, 1]
    center_y = box[:, 0] + 0.5 * height
    center_x = box[:, 1] + 0.5 * width

    gt_height = gt_box[:, 2] - gt_box[:, 0]
    gt_width = gt_box[:, 3] - gt_box[:, 1]
    gt_center_y = gt_box[:, 0] + 0.5 * gt_height
    gt_center_x = gt_box[:, 1] + 0.5 * gt_width

    dy = (gt_center_y - center_y) / height
    dx = (gt_center_x - center_x) / width
    dh = torch.log(gt_height / height)
    dw = torch.log(gt_width / width)

    result = torch.stack([dy, dx, dh, dw], dim=1)
    return result


############################################################
#  Dataset
############################################################

class Dataset(object):
    """The base class for dataset classes.
    To use it, create a new class that adds functions specific to the dataset
    you want to use. For example:

    class CatsAndDogsDataset(Dataset):
        def load_cats_and_dogs(self):
            ...
        def load_mask(self, image_id):
            ...
        def image_reference(self, image_id):
            ...

    See COCODataset and ShapesDataset as examples.
    """

    def __init__(self, class_map=None):
        self._image_ids = []
        self.image_info = []
        # Background is always the first class
        self.class_info = [{"source": "", "id": 0, "name": "BG"}]
        self.source_class_ids = {}

    def add_class(self, source, class_id, class_name):
        assert "." not in source, "Source name cannot contain a dot"
        # Does the class exist already?
        for info in self.class_info:
            if info['source'] == source and info["id"] == class_id: #如果增加的source为空,id==0,则放弃这次添加
                # source.class_id combination already available, skip
                return
        # Add the class
        self.class_info.append({
            "source": source,
            "id": class_id,
            "name": class_name,
        })   
        # add_class is called once for every class in the dataset (the BG class is already added in __init__)
       

    def add_image(self, source, image_id, path, **kwargs):
        image_info = {
            "id": image_id,    #图片读进来的序号
            "source": source,  #图片属于什么数据集
            "path": path,
        }
        image_info.update(kwargs)  #这里可以出传入多个dict
        self.image_info.append(image_info)     #add_image数据集有多少图片就执行多少次

    def image_reference(self, image_id):
        """Return a link to the image in its source Website or details about
        the image that help looking it up or debugging it.

        Override for your dataset, but pass to this function
        if you encounter images not in your dataset.
        """
        return ""

    def prepare(self, class_map=None):
        """Prepares the Dataset class for use.

        TODO: class map is not supported yet. When done, it should handle mapping
              classes from different datasets to the same class ID.
        """
        def clean_name(name):
            """Returns a shorter version of object names for cleaner display."""
            return ",".join(name.split(",")[:1])

        # Build (or rebuild) everything else from the info dicts.
        self.num_classes = len(self.class_info)   #添加进dataset中的有类别的种类,假设为20类
        self.class_ids = np.arange(self.num_classes)  #生成一个序列 [0,1,2,3,....19]
        self.class_names = [clean_name(c["name"]) for c in self.class_info]  #将类别名字简化用于显示
        self.num_images = len(self.image_info)  #添加进dataset中的图片数量
        self._image_ids = np.arange(self.num_images)   #将添加的图片按照顺序生成一个id序列

        self.class_from_source_map = {"{}.{}".format(info['source'], info['id']): id
                                      for info, id in zip(self.class_info, self.class_ids)} 
                                    # builds a map from "source.class_id" strings to internal class ids

        # Map sources to class_ids they support
        self.sources = list(set([i['source'] for i in self.class_info]))  # unique list of source dataset names
        self.source_class_ids = {} # maps each source dataset to the class ids it contains
        # Loop over datasets
        for source in self.sources:
            self.source_class_ids[source] = []
            # Find classes that belong to this dataset
            for i, info in enumerate(self.class_info):
                # Include BG class in all datasets
                #if i == 0 or source == info['source']: #源码是这样的,感觉应该改成下面的这个
                if source == info['source']:            #理由:看model.py在调用self.source_class_ids中的注解,
                    self.source_class_ids[source].append(i)

    def map_source_class_id(self, source_class_id):
        """Takes a source class ID and returns the int class ID assigned to it.

        For example:
        dataset.map_source_class_id("coco.12") -> 23
        """
        return self.class_from_source_map[source_class_id]

    def get_source_class_id(self, class_id, source):
        """Map an internal class ID to the corresponding class ID in the source dataset."""
        info = self.class_info[class_id]
        assert info['source'] == source
        return info['id']

    def append_data(self, class_info, image_info):
        self.external_to_class_id = {}
        for i, c in enumerate(self.class_info):
            for ds, id in c["map"]:
                self.external_to_class_id[ds + str(id)] = i

        # Map external image IDs to internal ones.
        self.external_to_image_id = {}
        for i, info in enumerate(self.image_info):
            self.external_to_image_id[info["ds"] + str(info["id"])] = i

    @property
    def image_ids(self):
        return self._image_ids

    def source_image_link(self, image_id):
        """Returns the path or URL to the image.
        Override this to return a URL to the image if it's available online for easy
        debugging.
        """
        return self.image_info[image_id]["path"]

    def load_image(self, image_id):
        """Load the specified image and return a [H,W,3] Numpy array.
        """
        # Load image
        image = cv2.imread(self.image_info[image_id]['path'])  # numpy array, (H, W, 3), BGR order
        # convert BGR to RGB so it matches what mold_image() expects, and keep the [H, W, 3] layout
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        return image

    def load_mask(self, image_id):
        """Load instance masks for the given image.

        Different datasets use different ways to store masks. Override this
        method to load instance masks and return them in the form of an
        array of binary masks of shape [height, width, instances].

        Returns:
            masks: A bool array of shape [height, width, instance count] with
                a binary mask per instance.
            class_ids: a 1D array of class IDs of the instance masks.
        """
        # Override this function to load a mask from your dataset.
        # Otherwise, it returns an empty mask.
        # override this method to load the masks from your own dataset
        mask = np.empty([0, 0, 0])
        class_ids = np.empty([0], np.int32)
        return mask, class_ids


def resize_image(image, min_dim=None, max_dim=None, padding=False):
    """
    Resizes an image keeping the aspect ratio.

    min_dim: if provided, resizes the image such that its smaller
        dimension == min_dim
    max_dim: if provided, ensures that the image's longest side doesn't
        exceed this value.
    padding: If true, pads image with zeros so its size is max_dim x max_dim

    Returns:
    image: the resized image
    window: (y1, x1, y2, x2). If max_dim is provided, padding might
        be inserted in the returned image. If so, this window is the
        coordinates of the image part of the full image (excluding
        the padding). The x2, y2 pixels are not included.
    scale: The scale factor used to resize the image
    padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
    """
    # Default window (y1, x1, y2, x2) and default scale == 1.
    h, w = image.shape[:2]  # original height and width of the image
    window = (0, 0, h, w)
    scale = 1

    # Scale?
    if min_dim: # e.g. 800
        # Scale up but not down
        scale = max(1, min_dim / min(h, w)) # if the smaller side is already >= min_dim, no upscaling is needed
    # Does it exceed max dim?
    if max_dim: # e.g. 1024
        image_max = max(h, w)
        if round(image_max * scale) > max_dim: # if the larger side would exceed max_dim, shrink instead
            scale = max_dim / image_max
    # Resize image and mask
    if scale != 1:
        image = scipy.misc.imresize(  #scipy.misc.imresize
            image, (round(h * scale), round(w * scale)))
    # Need padding?
    if padding:
        # Get new height and width
        h, w = image.shape[:2]
        top_pad = (max_dim - h) // 2
        bottom_pad = max_dim - h - top_pad
        left_pad = (max_dim - w) // 2
        right_pad = max_dim - w - left_pad
        padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)] 
        image = np.pad(image, padding, mode='constant', constant_values=0)
        window = (top_pad, left_pad, h + top_pad, w + left_pad) #在padding之后原始图像的左上角和右下角的位置
    return image, window, scale, padding
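# Illustrative check of resize_image (assumes min_dim=800, max_dim=1024 and the scipy version this
# file already relies on for scipy.misc.imresize):
# image = np.zeros((512, 768, 3), dtype=np.uint8)
# resized, window, scale, padding = resize_image(image, min_dim=800, max_dim=1024, padding=True)
# print(resized.shape)    # (1024, 1024, 3)
# print(window)           # (170, 0, 853, 1024) -> where the real image sits inside the padded one
# print(round(scale, 3))  # 1.333 -> limited by max_dim (1024 / 768)
# print(padding)          # [(170, 171), (0, 0), (0, 0)]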


def resize_mask(mask, scale, padding):
    """Resizes a mask using the given scale and padding.
    Typically, you get the scale and padding from resize_image() to
    ensure both, the image and the mask, are resized consistently.

    scale: mask scaling factor
    padding: Padding to add to the mask in the form
            [(top, bottom), (left, right), (0, 0)]
    """
    h, w = mask.shape[:2]
    mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0) #order=0最临近插值
                                    #zoom 各个维度的缩放系数         #order=1双线性插值
                                                                     #order=3立方体插值
    mask = np.pad(mask, padding, mode='constant', constant_values=0)  
    return mask


def minimize_mask(bbox, mask, mini_shape):
    """Resize masks to a smaller version to cut memory load.
    Mini-masks can then resized back to image scale using expand_masks()

    See inspect_data.ipynb notebook for more details.
    """
    #mini_shape=(56,56)
    #mask.shape[-1] 一张图片有几个掩膜
    mini_mask = np.zeros(mini_shape + (mask.shape[-1],), dtype=bool)
    for i in range(mask.shape[-1]):
        m = mask[:, :, i]
        y1, x1, y2, x2 = bbox[i][:4]
        m = m[y1:y2, x1:x2] #先将mask第一次缩小到box的高宽
        if m.size == 0:
            raise Exception("Invalid bounding box with area of zero")
        m = scipy.misc.imresize(m.astype(float), mini_shape, interp='bilinear')
        mini_mask[:, :, i] = np.where(m >= 128, 1, 0) #图像二值化
    return mini_mask


def expand_mask(bbox, mini_mask, image_shape):
    """Resizes mini masks back to image size. Reverses the change
    of minimize_mask().

    See inspect_data.ipynb notebook for more details.
    """
    mask = np.zeros(image_shape[:2] + (mini_mask.shape[-1],), dtype=bool)
    for i in range(mask.shape[-1]):
        m = mini_mask[:, :, i]
        y1, x1, y2, x2 = bbox[i][:4]
        h = y2 - y1
        w = x2 - x1
        m = scipy.misc.imresize(m.astype(float), (h, w), interp='bilinear')
        mask[y1:y2, x1:x2, i] = np.where(m >= 128, 1, 0)
    return mask
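
A minimal sketch of the round trip between minimize_mask and expand_mask. The box, mask and shapes below are made-up illustration values, and the sketch assumes the same old SciPy environment (scipy.misc.imresize) used by the code above:

import numpy as np

image_shape = (1024, 1024, 3)
bbox = np.array([[100, 200, 300, 400]])     # one instance: (y1, x1, y2, x2) enclosing the object
mask = np.zeros((1024, 1024, 1), dtype=bool)
mask[120:280, 220:380, 0] = True            # the object fills only part of its box

mini = minimize_mask(bbox, mask, (56, 56))  # -> (56, 56, 1), small enough to keep in memory
full = expand_mask(bbox, mini, image_shape) # -> (1024, 1024, 1), back at image scale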


# TODO: Build and use this function to reduce code duplication
def mold_mask(mask, config):
    pass


def unmold_mask(mask, bbox, image_shape):
    """Converts a mask generated by the neural network into a format similar
    to it's original shape.
    mask: [height, width] of type float. A small, typically 28x28 mask.
    bbox: [y1, x1, y2, x2]. The box to fit the mask in.

    Returns a binary mask with the same size as the original image.
    """
    threshold = 0.5
    y1, x1, y2, x2 = bbox
    # resize the small fixed-size mask (typically 28x28) back to the box's height and width
    mask = scipy.misc.imresize(
        mask, (y2 - y1, x2 - x1), interp='bilinear').astype(np.float32) / 255.0
    mask = np.where(mask >= threshold, 1, 0).astype(np.uint8)

    # Put the mask in the right location.
    # Create an all-zero canvas at the original image size, then paste the mask onto it at the box's location
    full_mask = np.zeros(image_shape[:2], dtype=np.uint8)
    full_mask[y1:y2, x1:x2] = mask
    return full_mask
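
A hypothetical call of unmold_mask with made-up shapes, showing the flow from a small predicted mask to a full-image binary mask (again assuming the old scipy.misc.imresize is available):

import numpy as np

pred_mask = np.random.rand(28, 28).astype(np.float32)  # stand-in for a mask-head output
box = np.array([40, 60, 140, 260])                     # (y1, x1, y2, x2) in image coordinates
full = unmold_mask(pred_mask, box, (512, 512, 3))
# full.shape == (512, 512), dtype uint8, non-zero only inside the box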


############################################################
#  Anchors
############################################################

def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
    """
    scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
    ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
    shape: [height, width] spatial shape of the feature map over which
            to generate anchors.
    feature_stride: Stride of the feature map relative to the image in pixels.
    anchor_stride: Stride of anchors on the feature map. For example, if the
        value is 2 then generate anchors for every other feature map pixel.
    """
    # Get all combinations of scales and ratios
    scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
    # e.g. with scales=[32] and ratios=[0.5, 1, 2]:
    # scales -> array([[32],        ratios -> array([[0.5],
    #                  [32],                         [1. ],
    #                  [32]])                        [2. ]])
    
    scales = scales.flatten() #array([ 32,32,32])
    ratios = ratios.flatten() #array([0.5,1.,2.])

    # Enumerate heights and widths from scales and ratios
    heights = scales / np.sqrt(ratios) #(3,)
    widths = scales * np.sqrt(ratios)  #(3,)

    # Enumerate shifts in feature space
    '''
       shape is the feature map's (height, width); anchor_stride is the step used to sample
       grid points on the feature map, so one feature map has
       (shape[0] // anchor_stride) * (shape[1] // anchor_stride) grid points.
       Multiplying by feature_stride maps these grid-point coordinates back onto the input image.
    '''
    shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
    shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
    shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)  

    # Enumerate combinations of shifts, widths, and heights
    # Tile widths vertically, once per shift position, and repeat each element of
    # shifts_x horizontally len(widths) times, so every (center, width) pair is enumerated
    box_widths, box_centers_x = np.meshgrid(widths, shifts_x) 
    box_heights, box_centers_y = np.meshgrid(heights, shifts_y)

    # Reshape to get a list of (y, x) and a list of (h, w)
    box_centers = np.stack(
        [box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
    box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])

    # Convert to corner coordinates (y1, x1, y2, x2)
    boxes = np.concatenate([box_centers - 0.5 * box_sizes,
                            box_centers + 0.5 * box_sizes], axis=1)
    return boxes
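
As a rough hand-worked check of the anchor count for a single pyramid level (the numbers assume the usual 1024x1024 configuration and were not produced by running the code):

# Hypothetical check for one level:
#   shape = (256, 256), feature_stride = 4, anchor_stride = 1, 3 ratios
#   -> 256 * 256 grid points * 3 ratios = 196,608 anchors
#   boxes.shape == (196608, 4), each row a (y1, x1, y2, x2) box in image coordinates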


def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,
                             anchor_stride):
    """Generate anchors at different levels of a feature pyramid. Each scale
    is associated with a level of the pyramid, but each ratio is used in
    all levels of the pyramid.

    Returns:
    anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
        with the same order of the given scales. So, anchors of scale[0] come
        first, then anchors of scale[1], and so on.
    """
    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    anchors = []
    for i in range(len(scales)):
        anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i],
                                        feature_strides[i], anchor_stride))
    return np.concatenate(anchors, axis=0)
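
And a hand-worked total over the whole pyramid, assuming the common configuration of scales (32, 64, 128, 256, 512), feature strides (4, 8, 16, 32, 64), a 1024x1024 input, anchor_stride=1 and 3 ratios:

# Hypothetical totals per level (feature map side = 1024 / stride):
#   P2: 256*256*3 = 196,608    P3: 128*128*3 = 49,152
#   P4:  64* 64*3 =  12,288    P5:  32* 32*3 =  3,072
#   P6:  16* 16*3 =     768
#   total = 261,888 anchors -> anchors.shape == (261888, 4)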






Full code (PyTorch version, with Chinese commentary): due to my limited ability, the GPU version of ROI Align is left unannotated; please bear with me.

Code mind map

GitHub source code (Keras version)