Today I'm starting to learn object detection. These blog posts are only my own study notes; if anything is lacking, corrections from more experienced readers are welcome.

Object detection is classification plus regression of a bounding-box location. A detection framework is first trained, and the trained model is then used for detection; the training pipeline and the inference pipeline are not the same (the parts used only for training are dropped at inference time). A diagram drawn by an expert would go here; I am still in the learning stage.

All image tasks are built on learning target features from large amounts of data. The Dataset provides the images the detector is trained on; features are extracted from each input image, and detection outputs a class label plus the coordinates of the enclosing box.

Object detection here uses the PASCAL VOC2007/2012 dataset.
from torch.utils.data import Dataset
import os
import json
import torch
from PIL import Image
from lxml import etree
class VOCDataSet(Dataset):
    def __init__(self, voc_root, year="2012", transforms=None, txt_name: str = "train.txt"):
        assert year in ["2007", "2012"], "year must be in ['2007', '2012']"
        self.root = os.path.join(voc_root, "VOCdevkit", f"VOC{year}")
        self.img_root = os.path.join(self.root, "JPEGImages")
        self.annotations_root = os.path.join(self.root, "Annotations")
        # read train.txt or val.txt to get the list of annotation files
        txt_path = os.path.join(self.root, "ImageSets", "Main", txt_name)
        assert os.path.exists(txt_path), "not found {} file.".format(txt_name)
        with open(txt_path) as read:
            self.xml_list = [os.path.join(self.annotations_root, line.strip() + ".xml")
                             for line in read.readlines() if len(line.strip()) > 0]
        assert len(self.xml_list) > 0, "in '{}' file does not find any information.".format(txt_path)
        # read the class-name -> index mapping
        json_file = './pascal_voc_classes.json'
        assert os.path.exists(json_file), "{} file not exist.".format(json_file)
        with open(json_file, 'r') as f:
            self.class_dict = json.load(f)
        self.transforms = transforms
    def __len__(self):
        return len(self.xml_list)

    def __getitem__(self, idx):
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        img_path = os.path.join(self.img_root, data["filename"])
        image = Image.open(img_path)
        if image.format != "JPEG":
            raise ValueError("Image '{}' format not JPEG".format(img_path))
        boxes = []
        labels = []
        iscrowd = []
        assert "object" in data, "{} lack of object information.".format(xml_path)
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])
            # skip degenerate annotations with non-positive width or height
            if xmax <= xmin or ymax <= ymin:
                print("warning: in '{}' xml, there are some bbox w/h<=0".format(xml_path))
                continue
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            if "difficult" in obj:
                iscrowd.append(int(obj["difficult"]))
            else:
                iscrowd.append(0)
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        if self.transforms is not None:
            image, target = self.transforms(image, target)
        return image, target
    def parse_xml_to_dict(self, xml):
        # recursively convert an lxml Element tree into a nested dict
        if len(xml) == 0:
            return {xml.tag: xml.text}
        result = {}
        for child in xml:
            child_result = self.parse_xml_to_dict(child)
            if child.tag != 'object':
                result[child.tag] = child_result[child.tag]
            else:
                # there can be multiple 'object' nodes, so collect them in a list
                if child.tag not in result:
                    result[child.tag] = []
                result[child.tag].append(child_result[child.tag])
        return {xml.tag: result}
    def coco_index(self, idx):
        # like __getitem__, but reads only the annotation (no image decoding);
        # returns the image size and the target dict
        xml_path = self.xml_list[idx]
        with open(xml_path) as fid:
            xml_str = fid.read()
        xml = etree.fromstring(xml_str)
        data = self.parse_xml_to_dict(xml)["annotation"]
        data_height = int(data["size"]["height"])
        data_width = int(data["size"]["width"])
        boxes = []
        labels = []
        iscrowd = []
        for obj in data["object"]:
            xmin = float(obj["bndbox"]["xmin"])
            xmax = float(obj["bndbox"]["xmax"])
            ymin = float(obj["bndbox"]["ymin"])
            ymax = float(obj["bndbox"]["ymax"])
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(self.class_dict[obj["name"]])
            iscrowd.append(int(obj["difficult"]))
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        iscrowd = torch.as_tensor(iscrowd, dtype=torch.int64)
        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        return (data_height, data_width), target

    @staticmethod
    def collate_fn(batch):
        return tuple(zip(*batch))
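Since images in a detection batch can have different shapes, they cannot be stacked into one tensor by the default collate; tuple(zip(*batch)) simply regroups the samples into a tuple of images and a tuple of targets. A tiny illustration (placeholder strings stand in for real images/targets):

batch = [("img0", "target0"), ("img1", "target1"), ("img2", "target2")]
images, targets = tuple(zip(*batch))
print(images)   # ('img0', 'img1', 'img2')
print(targets)  # ('target0', 'target1', 'target2')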
After the training images are read, they are processed by transforms. The transforms have to be re-implemented here: in detection, when the image is transformed, the box coordinates must be transformed accordingly, so the transforms handle both the image and the coordinates.
import random
from torchvision.transforms import functional as F
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        for t in self.transforms:
            image, target = t(image, target)
        return image, target

class ToTensor(object):
    def __call__(self, image, target):
        image = F.to_tensor(image)
        return image, target

class RandomHorizontalFlip(object):
    def __init__(self, prob=0.5):
        self.prob = prob

    def __call__(self, image, target):
        if random.random() < self.prob:
            height, width = image.shape[-2:]
            image = image.flip(-1)  # flip the image horizontally
            bbox = target["boxes"]
            # flip the x coordinates: new xmin = width - old xmax, new xmax = width - old xmin
            bbox[:, [0, 2]] = width - bbox[:, [2, 0]]
            target["boxes"] = bbox
        return image, target
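The coordinate flip can be sanity-checked with made-up numbers: for an image of width 100 and a box [xmin, ymin, xmax, ymax] = [10, 5, 30, 40], the horizontally flipped box should be [70, 5, 90, 40]:

import torch
bbox = torch.tensor([[10., 5., 30., 40.]])  # [xmin, ymin, xmax, ymax]
width = 100
bbox[:, [0, 2]] = width - bbox[:, [2, 0]]   # new xmin = width - old xmax, etc.
print(bbox)  # tensor([[70.,  5., 90., 40.]])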
Next comes feature extraction.

MobileNetV2 is used as the feature-extraction network. Faster R-CNN consists of three main parts: backbone, rpn, and roi_heads. The MobileNetV2 used here is a pretrained feature extractor; the network itself is a classification network, whose forward pass is:

x = self.features(x)    # convolutional feature extractor
x = self.avgpool(x)     # global average pooling
x = torch.flatten(x, 1)
x = self.classifier(x)  # classification head

Only the features part is used; its output is a feature map of size C×M×M. On this M×M grid, anchors are generated: M×M×k of them, where k covers all size/aspect-ratio combinations (9 in the original paper; 15 here, from 5 sizes × 3 ratios). The RPN head slides a 3×3 convolution over the feature map to predict, for each anchor, whether it is foreground or background plus four relative coordinate offsets. Of all anchors predicted as foreground, the top 2000 are kept as proposals; each proposal is pooled into a 7×7 feature matrix, from which its class and box coordinates are predicted.

The feature-extraction network is just a classification network, so I won't go over it again here.
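For comparison, torchvision's stock mobilenet_v2 can be turned into a detection backbone the same way (a minimal sketch; the code below uses a custom MobileNetV2 module instead):

import torchvision

# keep only the convolutional feature extractor; drop avgpool and classifier
backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280  # Faster R-CNN reads this attribute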
Training the whole network:
import os
import datetime
import torch
import torchvision
import transforms
from network_files import FasterRCNN, AnchorsGenerator
from backbone import MobileNetV2
from my_dataset import VOCDataSet
from train_utils import GroupedBatchSampler, create_aspect_ratio_groups
from train_utils import train_eval_utils as utils

def main():  # called at the bottom of the script via: if __name__ == "__main__": main()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device training.".format(device.type))
    result_file = "result{}.txt".format(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    if not os.path.exists("save_weights"):
        os.makedirs("save_weights")
    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }
    VOC_root = "./"
    aspect_ratio_group_factor = 3
    batch_size = 4
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError("VOCdevkit does not exist in path:'{}'.".format(VOC_root))
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"], "train.txt")
    train_sampler = None
    if aspect_ratio_group_factor >= 0:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        # group the image indices by aspect-ratio interval
        group_ids = create_aspect_ratio_groups(train_dataset, k=aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, batch_size)
        # so that every batch is drawn from images of similar aspect ratio
nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8]) # number of workers
print('Using %g dataloader workers' % nw)
This function groups the training images by aspect-ratio interval and returns, for every image, the index of the interval its ratio falls into:

def create_aspect_ratio_groups(dataset, k=0):
    # compute the aspect ratio of every training image (5717 images in total)
    aspect_ratios = compute_aspect_ratios(dataset)
    # split [0.5, 2] on a log scale into 2k intervals (with k=3: 7 points, 6 intervals)
    bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0]
    # for every image, the index of the bin its aspect ratio falls into
    groups = _quantize(aspect_ratios, bins)
    # count how many images fall into each bin
    counts = np.unique(groups, return_counts=True)[1]
    return groups

# Using [0, 0.5, 0.6299605249474366, 0.7937005259840997, 1.0, 1.2599210498948732, 1.5874010519681994, 2.0, inf] as bins for aspect ratio quantization
# Count of instances per bin: [   5   25  929  117  260 4198  135   48]
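_quantize itself is not shown above; a minimal sketch of it, modeled on torchvision's detection reference utilities (an assumption, the exact code here may differ): each ratio is mapped to a bin index by binary search.

import bisect
import copy

def _quantize(x, bins):
    bins = sorted(copy.deepcopy(bins))
    # bisect_right returns, for each value, the index of the bin it falls into
    return list(map(lambda y: bisect.bisect_right(bins, y), x))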
When sampling batches by aspect ratio, the DataLoader takes a batch_sampler instead of batch_size/shuffle:

    if train_sampler:
        # every batch comes from one aspect-ratio group
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_sampler=train_batch_sampler,
                                                        num_workers=nw,
                                                        collate_fn=train_dataset.collate_fn)
    else:
        train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=batch_size,
                                                        shuffle=True,
                                                        pin_memory=True,
                                                        num_workers=nw,
                                                        collate_fn=train_dataset.collate_fn)
    model = create_model(num_classes=21)

After the data pipeline is set up, the network itself is built:
def create_model(num_classes):
    # take only the feature-extractor part of the pretrained MobileNetV2
    backbone = MobileNetV2(weights_path="./backbone/mobilenet_v2.pth").features

Printing this backbone gives:
Sequential(
(0): ConvBNReLU(
(0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(2): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
(1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(3): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(144, 144, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=144, bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(144, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(4): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(144, 144, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=144, bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(144, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(5): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(6): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(7): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(8): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(9): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(10): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(11): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(12): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(13): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(14): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(15): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(16): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(17): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(18): ConvBNReLU(
(0): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
)
The printout shows modules 0 through 18; the final ConvBNReLU outputs 1280 channels through a 1×1 convolution.

    backbone.out_channels = 1280

With the backbone ready, the next step is generating anchors; an anchor is simply a set of box coordinates. The AnchorsGenerator produces anchors of the given sizes and aspect ratios on the feature map:

    anchor_generator = AnchorsGenerator(sizes=((32, 64, 128, 256, 512),),
                                        aspect_ratios=((0.5, 1.0, 2.0),))
class AnchorsGenerator(nn.Module):
    __annotations__ = {
        "cell_anchors": Optional[List[torch.Tensor]],
        "_cache": Dict[str, List[torch.Tensor]]
    }
    # anchors are generated on the feature map

    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()
        if not isinstance(sizes[0], (list, tuple)):
            sizes = tuple((s,) for s in sizes)
        if not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)
        assert len(sizes) == len(aspect_ratios)
        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        self.cell_anchors = None
        self._cache = {}
    # this is only initialization; the anchors themselves are built in forward
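The number of anchors per feature-map position follows directly from sizes and aspect_ratios; the helper called later as rpn_anchor_generator.num_anchors_per_location() presumably looks like torchvision's version (a sketch, assumed equivalent here):

    def num_anchors_per_location(self):
        # one anchor per (size, aspect_ratio) pair on each feature level
        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]

# with sizes=((32, 64, 128, 256, 512),) and aspect_ratios=((0.5, 1.0, 2.0),)
# this returns [15]: 5 sizes x 3 ratios per position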
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                    output_size=[7, 7],
                                                    sampling_ratio=2)
    # extracts a fixed 7x7 feature matrix for each proposal produced by the RPN
    model = FasterRCNN(backbone=backbone,
                       num_classes=num_classes,
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)
class FasterRCNN(FasterRCNNBase):
    def __init__(self, backbone, num_classes=None,
                 # transform parameters
                 min_size=800, max_size=1333,
                 image_mean=None, image_std=None,
                 # RPN parameters
                 rpn_anchor_generator=None, rpn_head=None,
                 rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,   # proposals kept before NMS in the RPN (sorted by score)
                 rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000, # proposals kept after NMS in the RPN
                 rpn_nms_thresh=0.7,  # IoU threshold used for NMS in the RPN
                 rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,  # IoU thresholds for sampling positive/negative anchors in the RPN loss
                 rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,  # anchors sampled per image for the RPN loss, and the positive fraction
                 rpn_score_thresh=0.0,
                 # Box parameters
                 box_roi_pool=None, box_head=None, box_predictor=None,
                 # remove low-scoring boxes; NMS threshold in Fast R-CNN; keep the top 100 detections by score
                 box_score_thresh=0.05, box_nms_thresh=0.5, box_detections_per_img=100,
                 box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,  # IoU thresholds for sampling positive/negative proposals in the Fast R-CNN loss
                 box_batch_size_per_image=512, box_positive_fraction=0.25,  # proposals sampled per image for the Fast R-CNN loss, and the positive fraction
                 bbox_reg_weights=None):
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )
        assert isinstance(rpn_anchor_generator, (AnchorsGenerator, type(None)))
        assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))
        if num_classes is not None:
            if box_predictor is not None:
                raise ValueError("num_classes should be None when box_predictor "
                                 "is specified")
        else:
            if box_predictor is None:
                raise ValueError("num_classes should not be None when box_predictor "
                                 "is not specified")
        out_channels = backbone.out_channels
        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorsGenerator(anchor_sizes, aspect_ratios)
        # build the RPN head, which predicts via a sliding window over the feature map
        if rpn_head is None:
            rpn_head = RPNHead(out_channels, rpn_anchor_generator.num_anchors_per_location()[0])
class RPNHead(nn.Module):
    # the "sliding window" is just a 3x3 convolution producing, for every position,
    # objectness scores and bbox regression parameters
    def __init__(self, in_channels, num_anchors):  # 1280, 15
        super(RPNHead, self).__init__()
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

Back in FasterRCNN.__init__:

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)
These hold the number of anchors to keep before NMS and the number to keep after NMS, per mode:
# rpn_pre_nms_top_n = {'training': 2000, 'testing': 1000}
        rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
            score_thresh=rpn_score_thresh)
class RegionProposalNetwork(torch.nn.Module):
    __annotations__ = {
        'box_coder': det_utils.BoxCoder,
        'proposal_matcher': det_utils.Matcher,
        'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
        'pre_nms_top_n': Dict[str, int],
        'post_nms_top_n': Dict[str, int],
    }

    def __init__(self, anchor_generator, head,
                 fg_iou_thresh, bg_iou_thresh,
                 batch_size_per_image, positive_fraction,
                 pre_nms_top_n, post_nms_top_n, nms_thresh, score_thresh=0.0):
        # batch_size_per_image=256, positive_fraction=0.5
        # fg_iou_thresh=0.7, bg_iou_thresh=0.3: anchors whose IoU with a ground-truth box
        # exceeds 0.7 are positives, those below 0.3 are negatives
        # pre_nms_top_n / post_nms_top_n: number of proposals kept before / after NMS
        super(RegionProposalNetwork, self).__init__()
        self.anchor_generator = anchor_generator
        self.head = head
        self.box_coder = det_utils.BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
        self.box_similarity = box_ops.box_iou  # computes the IoU between anchors and gt boxes
        self.proposal_matcher = det_utils.Matcher(
            fg_iou_thresh,  # IoU above fg_iou_thresh (0.7): positive sample
            bg_iou_thresh,  # IoU below bg_iou_thresh (0.3): negative sample
            allow_low_quality_matches=True
        )
        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
            batch_size_per_image, positive_fraction  # 256, 0.5
        )
        # used during testing
        self._pre_nms_top_n = pre_nms_top_n
        self._post_nms_top_n = post_nms_top_n
        self.nms_thresh = nms_thresh
        self.score_thresh = score_thresh
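The dicts stored above are resolved per mode through small helpers (as in torchvision's RegionProposalNetwork; assumed identical here):

    def pre_nms_top_n(self):
        if self.training:
            return self._pre_nms_top_n['training']   # 2000
        return self._pre_nms_top_n['testing']        # 1000

    def post_nms_top_n(self):
        if self.training:
            return self._post_nms_top_n['training']  # 2000
        return self._post_nms_top_n['testing']       # 1000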
Back in FasterRCNN.__init__, the box head flattens each 7×7 RoI feature and passes it through two fully connected layers:

        if box_head is None:
            resolution = box_roi_pool.output_size[0]  # 7 (the RoI pooling output is 7x7)
            representation_size = 1024
            box_head = TwoMLPHead(out_channels * resolution ** 2,
                                  representation_size)

class TwoMLPHead(nn.Module):
    def __init__(self, in_channels, representation_size):
        # in_channels = 1280 * 7 * 7 = 62720, representation_size = 1024
        super(TwoMLPHead, self).__init__()
        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)
        if box_predictor is None:
            representation_size = 1024
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

class FastRCNNPredictor(nn.Module):
    def __init__(self, in_channels, num_classes):
        # in_channels = 1024, num_classes = 21
        super(FastRCNNPredictor, self).__init__()
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, num_classes * 4)
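Its forward pass (a sketch following torchvision's FastRCNNPredictor, assumed to match the code here) produces per-class scores and per-class box regressions:

    def forward(self, x):
        if x.dim() == 4:
            assert list(x.shape[2:]) == [1, 1]
        x = x.flatten(start_dim=1)
        scores = self.cls_score(x)       # [num_proposals, 21]
        bbox_deltas = self.bbox_pred(x)  # [num_proposals, 84] = 21 classes * 4
        return scores, bbox_deltas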
        roi_heads = RoIHeads(
            # box
            box_roi_pool, box_head, box_predictor,
            box_fg_iou_thresh, box_bg_iou_thresh,  # 0.5, 0.5
            box_batch_size_per_image, box_positive_fraction,  # 512, 0.25
            bbox_reg_weights,
            box_score_thresh, box_nms_thresh, box_detections_per_img)  # 0.05, 0.5, 100
class RoIHeads(torch.nn.Module):
__annotations__ = {
'box_coder': det_utils.BoxCoder,
'proposal_matcher': det_utils.Matcher,
'fg_bg_sampler': det_utils.BalancedPositiveNegativeSampler,
}
def __init__(self,
box_roi_pool, # Multi-scale RoIAlign pooling
box_head, # TwoMLPHead
box_predictor, # FastRCNNPredictor
# Faster R-CNN training
fg_iou_thresh, bg_iou_thresh, # default: 0.5, 0.5
batch_size_per_image, positive_fraction, # default: 512, 0.25
bbox_reg_weights, # None
# Faster R-CNN inference
score_thresh, # default: 0.05
nms_thresh, # default: 0.5
detection_per_img): # default: 100
super(RoIHeads, self).__init__()
self.box_similarity=box_ops.box_iou
self.proposal_matcher = det_utils.Matcher(
fg_iou_thresh, # default: 0.5
bg_iou_thresh, # default: 0.5
allow_low_quality_matches=False)
self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
batch_size_per_image, # default: 512
positive_fraction) # default: 0.25
if bbox_reg_weights is None:
bbox_reg_weights = (10., 10., 5., 5.)
self.box_coder = det_utils.BoxCoder(bbox_reg_weights)
self.box_roi_pool = box_roi_pool # Multi-scale RoIAlign pooling
self.box_head = box_head # TwoMLPHead
self.box_predictor = box_predictor # FastRCNNPredictor
self.score_thresh = score_thresh # default: 0.05
self.nms_thresh = nms_thresh # default: 0.5
self.detection_per_img = detection_per_img # default: 100
if image_mean is None:
image_mean = [0.485, 0.456, 0.406]
if image_std is None:
image_std = [0.229, 0.224, 0.225]
# normalizes and resizes the input images, then batches them
transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
The structure of the whole network:
FasterRCNN(
(transform): GeneralizedRCNNTransform(
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
)
(backbone): Sequential(
(0): ConvBNReLU(
(0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
(2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(2): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=96, bias=False)
(1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(3): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(144, 144, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=144, bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(144, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(4): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(144, 144, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=144, bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(144, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(5): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(6): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(7): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(32, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(192, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=192, bias=False)
(1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(8): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(9): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(10): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(11): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=384, bias=False)
(1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(12): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(13): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(14): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(96, 576, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(576, 576, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=576, bias=False)
(1): BatchNorm2d(576, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(576, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(15): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(16): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(17): InvertedResidual(
(conv): Sequential(
(0): ConvBNReLU(
(0): Conv2d(160, 960, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(1): ConvBNReLU(
(0): Conv2d(960, 960, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=960, bias=False)
(1): BatchNorm2d(960, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
(2): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
(3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(18): ConvBNReLU(
(0): Conv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(1280, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(2): ReLU6(inplace=True)
)
)
(rpn): RegionProposalNetwork(
(anchor_generator): AnchorsGenerator()
(head): RPNHead(
(conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cls_logits): Conv2d(1280, 15, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(1280, 60, kernel_size=(1, 1), stride=(1, 1))
)
)
(roi_heads): RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=62720, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=21, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=84, bias=True)
)
)
)
# 21 classes in total (20 VOC classes plus background)
Now start training the network:
model.to(device)
train_loss=[]
learning_rate=[]
val_map=[]
First freeze the weights of the feature-extraction network (the backbone) and train only the RPN and the final prediction layers:
for param in model.backbone.parameters():
param.requires_grad = False
# define optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
init_epochs = 1
for epoch in range(init_epochs):
mean_loss, lr = utils.train_one_epoch(model, optimizer, train_data_loader, device, epoch, print_freq=50, warmup=True)
Inside the training function:
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=50, warmup=False):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    lr_scheduler = None
    if epoch == 0 and warmup is True:  # in the first epoch, use warmup training (slowly ramp up the learning rate)
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)
    mloss = torch.zeros(1).to(device)  # mean losses
    enable_amp = True if "cuda" in device.type else False
    for i, [images, targets] in enumerate(metric_logger.log_every(data_loader, print_freq, header)):
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        with torch.cuda.amp.autocast(enabled=enable_amp):
            loss_dict = model(images, targets)
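model(images, targets) returns a dict of losses in training mode; before the backward pass shown further below, the reference training loop reduces it to one scalar, roughly (a sketch following the torchvision reference utilities):

            # sum the RPN and Fast R-CNN losses into a single scalar for backward()
            losses = sum(loss for loss in loss_dict.values())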
Training now enters the model's forward pass:

class FasterRCNNBase(nn.Module):
    def forward(self, images, targets=None):
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        if self.training:
            assert targets is not None
            for target in targets:
                boxes = target["boxes"]  # e.g. tensor([[  4.,   4., 438., 375.]])
                if isinstance(boxes, torch.Tensor):
                    if len(boxes.shape) != 2 or boxes.shape[-1] != 4:
                        raise ValueError("Expected target boxes to be a tensor "
                                         "of shape [N, 4], got {:}.".format(boxes.shape))
        # record the original image sizes so boxes can be mapped back after resizing
        original_image_sizes = torch.jit.annotate(List[Tuple[int, int]], [])
        for img in images:
            val = img.shape[-2:]
            assert len(val) == 2  # guard against 1-D inputs
            original_image_sizes.append((val[0], val[1]))
        # e.g. [(375, 500), (333, 500), (385, 500), (375, 500)] with batch_size=4

        features = self.backbone(images.tensors)  # feed the images through the backbone
        # torch.Size([4, 1280, 25, 38])
        proposals, proposal_losses = self.rpn(images, features, targets)

The RPN forward:
    def forward(self,
                images,       # type: ImageList
                features,     # type: Dict[str, Tensor]
                targets=None  # type: Optional[List[Dict[str, Tensor]]]
                ):
        features = list(features.values())
        # objectness: torch.Size([4, 15, 25, 38]); pred_bbox_deltas: torch.Size([4, 60, 25, 38])
        objectness, pred_bbox_deltas = self.head(features)
-----
    def __init__(self, in_channels, num_anchors):
        super(RPNHead, self).__init__()
        # 3x3 sliding window
        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        # predicted objectness scores ("object" here only means foreground vs background)
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
        # predicted bbox regression parameters
        self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)
        for layer in self.children():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        # type: (List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        logits = []
        bbox_reg = []
        for i, feature in enumerate(x):  # only one feature level here, so one iteration
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))   # objectness scores
            bbox_reg.append(self.bbox_pred(t))  # regression parameters
        return logits, bbox_reg
-----
        anchors = self.anchor_generator(images, features)

Generating the anchors:
-----
    def forward(self, image_list, feature_maps):
        # type: (ImageList, List[Tensor]) -> List[Tensor]
        # size (height, width) of each prediction feature map
        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
        # [torch.Size([25, 38])]
        # height and width of the batched input images
        image_size = image_list.tensors.shape[-2:]
        # e.g. torch.Size([800, 1216])
        dtype, device = feature_maps[0].dtype, feature_maps[0].device
        # one step in the feature map equals n pixels of stride in the original image
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
        # [[tensor(32), tensor(32)]]
-----
    def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
        # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
        """
        compute anchor sizes
        Arguments:
            scales: sqrt(anchor_area)
            aspect_ratios: h/w ratios
            dtype: float32
            device: cpu/gpu
        """
        scales = torch.as_tensor(scales, dtype=dtype, device=device)
        # tensor([ 32.,  64., 128., 256., 512.])
        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
        # tensor([0.5000, 1.0000, 2.0000])
        h_ratios = torch.sqrt(aspect_ratios)  # tensor([0.7071, 1.0000, 1.4142])
        w_ratios = 1.0 / h_ratios             # tensor([1.4142, 1.0000, 0.7071])
        # widths and heights of all 15 anchor templates:
        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales[None, :]).view(-1)
        # ws: tensor([ 45.2548,  90.5097, 181.0193, 362.0387, 724.0773,  32.0000,  64.0000,
        #             128.0000, 256.0000, 512.0000,  22.6274,  45.2548,  90.5097, 181.0193,
        #             362.0387])
        # hs: tensor([ 22.6274,  45.2548,  90.5097, 181.0193, 362.0387,  32.0000,  64.0000,
        #             128.0000, 256.0000, 512.0000,  45.2548,  90.5097, 181.0193, 362.0387,
        #             724.0773])
        # left-top / right-bottom coordinates relative to the anchor center (0, 0);
        # the anchor templates are all centered at (0, 0), shape [len(ratios)*len(scales), 4]
        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
        # tensor([[ -22.6274,  -11.3137,   22.6274,   11.3137],
        #         [ -45.2548,  -22.6274,   45.2548,   22.6274],
        #         [ -90.5097,  -45.2548,   90.5097,   45.2548],
        #         [-181.0193,  -90.5097,  181.0193,   90.5097],
        #         [-362.0387, -181.0193,  362.0387,  181.0193],
        #         [ -16.0000,  -16.0000,   16.0000,   16.0000],
        #         [ -32.0000,  -32.0000,   32.0000,   32.0000],
        #         [ -64.0000,  -64.0000,   64.0000,   64.0000],
        #         [-128.0000, -128.0000,  128.0000,  128.0000],
        #         [-256.0000, -256.0000,  256.0000,  256.0000],
        #         [ -11.3137,  -22.6274,   11.3137,   22.6274],
        #         [ -22.6274,  -45.2548,   22.6274,   45.2548],
        #         [ -45.2548,  -90.5097,   45.2548,   90.5097],
        #         [ -90.5097, -181.0193,   90.5097,  181.0193],
        #         [-181.0193, -362.0387,  181.0193,  362.0387]])
        return base_anchors.round()
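The shift_x / shift_y used next come from grid_anchors, which is not shown in the excerpt; torchvision builds them roughly like this (a sketch; the variable names grid_width, grid_height, stride_width, stride_height are illustrative):

        # one shift per feature-map cell, multiplied by the stride (32 here)
        shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
        shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)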
        shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
        # the coordinates on the original image that each feature-map cell maps to:
        # tensor([[   0.,    0.,    0.,    0.],
        #         [  32.,    0.,   32.,    0.],
        #         [  64.,    0.,   64.,    0.],
        #         ...,
        #         [1120.,  768., 1120.,  768.],
        #         [1152.,  768., 1152.,  768.],
        #         [1184.,  768., 1184.,  768.]])
        shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
        # torch.Size([950, 15, 4]): 25*38 = 950 positions, 15 anchors each
        anchors.append(shifts_anchor.reshape(-1, 4))
        # torch.Size([14250, 4])
        return anchors  # List[Tensor(all_num_anchors, 4)]
Back in the RPN forward:

        num_images = len(anchors)
        # numel() returns the total number of elements in the input tensor
        # number of anchors on each prediction feature map
        num_anchors_per_level_shape_tensors = [o[0].shape for o in objectness]
        # [torch.Size([15, 25, 38])]
        num_anchors_per_level = [s[0] * s[1] * s[2] for s in num_anchors_per_level_shape_tensors]
        # [14250]
        objectness, pred_bbox_deltas = concat_box_prediction_layers(objectness, pred_bbox_deltas)
        # objectness: [57000] (4 images * 14250), pred_bbox_deltas: [57000, 4]
        # apply the predicted regression parameters to the anchors to get the proposal coordinates
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
        # torch.Size([57000, 1, 4])
        proposals = proposals.view(num_images, -1, 4)  # -> [4, 14250, 4]
        # remove small boxes, apply NMS, keep the post_nms_top_n highest-scoring proposals
        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)
        # roughly 2000 proposals are kept per image
---------------------------
    def filter_proposals(self, proposals, objectness, image_shapes, num_anchors_per_level):
        # type: (Tensor, Tensor, List[Tuple[int, int]], List[int]) -> Tuple[List[Tensor], List[Tensor]]
        """
        Remove small boxes, apply NMS, and keep the post_nms_top_n highest-scoring proposals.
        Args:
            proposals: predicted bbox coordinates
            objectness: predicted objectness scores (57000)
            image_shapes: size of every image in the batch
            num_anchors_per_level: number of anchors on each prediction feature map (14250)
        Returns:
        """
        num_images = proposals.shape[0]
        # proposals: [4, 14250, 4]
        device = proposals.device
        # do not backprop through objectness
        objectness = objectness.detach()
        objectness = objectness.reshape(num_images, -1)
        # levels records which prediction feature map each anchor comes from
        levels = [torch.full((n, ), idx, dtype=torch.int64, device=device)
                  for idx, n in enumerate(num_anchors_per_level)]
        levels = torch.cat(levels, 0)
        # expand this tensor to the same size as objectness
        levels = levels.reshape(1, -1).expand_as(objectness)
        # select top_n boxes independently per level before applying nms:
        # indices of the pre_nms_top_n highest-scoring anchors on each feature map
        top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level)
        image_range = torch.arange(num_images, device=device)
        batch_idx = image_range[:, None]  # [batch_size, 1]
        # gather the scores of the pre_nms_top_n anchors
        objectness = objectness[batch_idx, top_n_idx]
        levels = levels[batch_idx, top_n_idx]
        # gather the bbox coordinates of the pre_nms_top_n anchors
        proposals = proposals[batch_idx, top_n_idx]
        objectness_prob = torch.sigmoid(objectness)
        final_boxes = []
        final_scores = []
        # iterate over the predictions of each image
        for boxes, scores, lvl, img_shape in zip(proposals, objectness_prob, levels, image_shapes):
            # clip boxes that cross the image boundary back to the boundary
            boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
            # keep only boxes whose width and height are both larger than min_size
            keep = box_ops.remove_small_boxes(boxes, self.min_size)
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
            # remove low-scoring boxes, see https://github.com/pytorch/vision/pull/3205
            keep = torch.where(torch.ge(scores, self.score_thresh))[0]  # ge: >=
            boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
            # non-maximum suppression, independently done per level
            keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.post_nms_top_n()]
            boxes, scores = boxes[keep], scores[keep]
            final_boxes.append(boxes)
            final_scores.append(scores)
        return final_boxes, final_scores
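filter_proposals calls _get_top_n_idx; a sketch of it, following torchvision (assumed equivalent here): it takes the top-k anchors per feature level, offsetting the indices so they address the concatenated anchor list.

    def _get_top_n_idx(self, objectness, num_anchors_per_level):
        # type: (Tensor, List[int]) -> Tensor
        r = []
        offset = 0
        # split the scores back into per-level chunks and take top-k in each
        for ob in objectness.split(num_anchors_per_level, 1):
            num_anchors = ob.shape[1]
            pre_nms_top_n = min(self.pre_nms_top_n(), num_anchors)
            _, top_n_idx = ob.topk(pre_nms_top_n, dim=1)
            r.append(top_n_idx + offset)
            offset += num_anchors
        return torch.cat(r, dim=1)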
---------
        losses = {}
        if self.training:
            assert targets is not None
            # find the best-matching gt box for every anchor and classify anchors
            # into foreground, background and discarded
            labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets)
            # compute the regression targets from the anchors and their matched gt boxes
            regression_targets = self.box_coder.encode(matched_gt_boxes, anchors)
            loss_objectness, loss_rpn_box_reg = self.compute_loss(
                objectness, pred_bbox_deltas, labels, regression_targets
            )
            losses = {
                "loss_objectness": loss_objectness,
                "loss_rpn_box_reg": loss_rpn_box_reg
            }
        return boxes, losses
-----
Back in FasterRCNNBase.forward, the proposals go into the RoI heads:

        proposals, proposal_losses = self.rpn(images, features, targets)
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)

Inside RoIHeads, each proposal is pooled and passed through the two-FC box head:

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        # torch.Size([2048, 1280, 7, 7]): 4 images * 512 sampled proposals

    def forward(self, x):
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        return x
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
To summarize: the training images are fed through the feature-extraction network, producing a feature map of shape 4 × 1280 × 25 × 38. Anchors are generated on this feature map and matched against the ground-truth boxes to assign anchor labels. A 3×3 convolution slides over the feature map to produce each anchor's objectness score and regression parameters; the top 2000 foreground anchors become proposals. Each proposal crops the corresponding region of the feature map, which is resized to 7×7, flattened, and passed through fully connected layers to produce class probabilities and box regressions. Finally, non-maximum suppression (IoU threshold 0.5) removes duplicates and low-scoring boxes are dropped, leaving the final bounding boxes as the output.
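As mentioned at the beginning, the inference pipeline differs from training: the model takes only images and returns detections. A minimal sketch (the weights file name is a hypothetical placeholder, and I assume the state dict was saved directly):

import torch
from torchvision.transforms import functional as F
from PIL import Image

model = create_model(num_classes=21)
model.load_state_dict(torch.load("./save_weights/model.pth", map_location="cpu"))  # hypothetical path
model.eval()

img = F.to_tensor(Image.open("test.jpg"))
with torch.no_grad():
    predictions = model([img])[0]  # dict with "boxes", "labels", "scores"
print(predictions["boxes"], predictions["scores"])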