Yolov3
- VOC dataset construction script
- VOC PyTorch Dataset file
- VOC2CSV
- Yolov3 configuration file
- Model backbone construction
- YOLO head prediction file
- Conv layer module
- Residual block module
- Activation function module
- Loss function module
- YOLO feature pyramid (FPN)
- Yolov3 network construction
VOC dataset construction script
import sys
sys.path.append("..")
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import config.yolov3_config_voc as cfg
def parse_voc_annotation(data_path, file_type, anno_path, use_difficult_bbox=False):
"""
解析 pascal voc数据集的annotation, 表示的形式为[image_global_path xmin,ymin,xmax,ymax,cls_id]
:param data_path: 数据集的路径 , 如 "./data/VOC"
:param file_type: 文件的类型, 'trainval''train''val'
:param anno_path: 标签存储路径
:param use_difficult_bbox: 是否适用difficult==1的bbox
:return: 数据集大小
"""
classes = cfg.DATA["CLASSES"]
img_inds_file = os.path.join(data_path, 'ImageSets', 'Main', file_type+'.txt')
with open(img_inds_file, 'r') as f:
lines = f.readlines()
image_ids = [line.strip() for line in lines]
with open(anno_path, 'a') as f:
for image_id in tqdm(image_ids):
image_path = os.path.join(data_path, 'JPEGImages', image_id + '.jpg')
annotation = image_path
label_path = os.path.join(data_path, 'Annotations', image_id + '.xml')
root = ET.parse(label_path).getroot()
objects = root.findall('object')
for obj in objects:
difficult = obj.find("difficult").text.strip()
if (not use_difficult_bbox) and (int(difficult) == 1):
continue
bbox = obj.find('bndbox')
class_id = classes.index(obj.find("name").text.lower().strip())
xmin = bbox.find('xmin').text.strip()
ymin = bbox.find('ymin').text.strip()
xmax = bbox.find('xmax').text.strip()
ymax = bbox.find('ymax').text.strip()
annotation += ' ' + ','.join([xmin, ymin, xmax, ymax, str(class_id)])
annotation += '\n'
f.write(annotation)
return len(image_ids)
if __name__ == "__main__":
train_data_path_2007 = os.path.join(cfg.DATA_PATH, 'VOCtrainval-2007', 'VOCdevkit', 'VOC2007')
train_data_path_2012 = os.path.join(cfg.DATA_PATH, 'VOCtrainval-2012', 'VOCdevkit', 'VOC2012')
train_annotation_path = os.path.join('../data', 'train_annotation.txt')
if os.path.exists(train_annotation_path):
os.remove(train_annotation_path)
test_data_path_2007 = os.path.join(cfg.DATA_PATH, 'VOCtest-2007', 'VOCdevkit', 'VOC2007')
test_annotation_path = os.path.join('../data', 'test_annotation.txt')
if os.path.exists(test_annotation_path):
os.remove(test_annotation_path)
len_train = parse_voc_annotation(train_data_path_2007,
"trainval",
train_annotation_path,
use_difficult_bbox=False) + \
parse_voc_annotation(train_data_path_2012,
"trainval",
train_annotation_path,
use_difficult_bbox=False)
len_test = parse_voc_annotation(test_data_path_2007, "test", test_annotation_path, use_difficult_bbox=False)
print("The number of images for train and test are :train : {0} | test : {1}".format(len_train, len_test))
VOC PyTorch Dataset file
import os
import sys
sys.path.append("..")
sys.path.append("../utils")
import torch
from torch.utils.data import Dataset, DataLoader
import cv2
import numpy as np
import random
import config.yolov3_config_voc as cfg
import utils.data_augment as dataAug
import utils.tools as tools
class VocDataset(Dataset):
def __init__(self, anno_file_type, img_size=416):
self.img_size = img_size
self.classes = cfg.DATA["CLASSES"]
self.num_classes = len(self.classes)
self.class_to_id = dict(zip(self.classes, range(self.num_classes)))
self.__annotations = self.__load_annotations(anno_file_type)
def __len__(self):
return len(self.__annotations)
def __getitem__(self, item):
img_org, bboxes_org = self.__parse_annotation(self.__annotations[item])
img_org = img_org.transpose(2, 0, 1)
item_mix = random.randint(0, len(self.__annotations)-1)
img_mix, bboxes_mix = self.__parse_annotation(self.__annotations[item_mix])
img_mix = img_mix.transpose(2, 0, 1)
img, bboxes = dataAug.Mixup()(img_org, bboxes_org, img_mix, bboxes_mix)
del img_org, bboxes_org, img_mix, bboxes_mix
label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = self.__create_label(bboxes)
img = torch.from_numpy(img).float()
label_sbbox = torch.from_numpy(label_sbbox).float()
label_mbbox = torch.from_numpy(label_mbbox).float()
label_lbbox = torch.from_numpy(label_lbbox).float()
sbboxes = torch.from_numpy(sbboxes).float()
mbboxes = torch.from_numpy(mbboxes).float()
lbboxes = torch.from_numpy(lbboxes).float()
return img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
def __load_annotations(self, anno_type):
assert anno_type in ['train', 'test'], "anno_type must be 'train' or 'test'"
anno_path = os.path.join(cfg.PROJECT_PATH, 'data', anno_type+"_annotation.txt")
with open(anno_path, 'r') as f:
annotations = list(filter(lambda x: len(x) > 0, f.readlines()))
assert len(annotations) > 0, "No images found in {}".format(anno_path)
return annotations
def __parse_annotation(self, annotation):
"""
Data augument.
:param annotation: Image' path and bboxes' coordinates, categories.
ex. [image_path xmin,ymin,xmax,ymax,class_ind xmin,ymin,xmax,ymax,class_ind ...]
:return: Return the enhanced image and bboxes. bbox'shape is [xmin, ymin, xmax, ymax, class_ind]
"""
anno = annotation.strip().split(' ')
img_path = anno[0]
img = cv2.imread(img_path)
assert img is not None, 'File Not Found ' + img_path
bboxes = np.array([list(map(float, box.split(','))) for box in anno[1:]])
img, bboxes = dataAug.RandomHorizontalFilp()(np.copy(img), np.copy(bboxes))
img, bboxes = dataAug.RandomCrop()(np.copy(img), np.copy(bboxes))
img, bboxes = dataAug.RandomAffine()(np.copy(img), np.copy(bboxes))
img, bboxes = dataAug.Resize((self.img_size, self.img_size), True)(np.copy(img), np.copy(bboxes))
return img, bboxes
def __create_label(self, bboxes):
"""
Label assignment. For a single picture all GT box bboxes are assigned anchor.
1、Select a bbox in order, convert its coordinates("xyxy") to "xywh"; and scale bbox'
xywh by the strides.
2、Calculate the iou between the each detection layer'anchors and the bbox in turn, and select the largest
anchor to predict the bbox.If the ious of all detection layers are smaller than 0.3, select the largest
of all detection layers' anchors to predict the bbox.
Note :
1、The same GT may be assigned to multiple anchors. And the anchors may be on the same or different layer.
2、The total number of bboxes may be more than it is, because the same GT may be assigned to multiple layers
of detection.
"""
anchors = np.array(cfg.MODEL["ANCHORS"])
strides = np.array(cfg.MODEL["STRIDES"])
train_output_size = self.img_size / strides
anchors_per_scale = cfg.MODEL["ANCHORS_PER_SCALE"]
label = [np.zeros((int(train_output_size[i]),
int(train_output_size[i]),
anchors_per_scale,
6+self.num_classes)) for i in range(3)]
for i in range(3):
label[i][..., 5] = 1.0
bboxes_xywh = [np.zeros((150, 4)) for _ in range(3)]
bbox_count = np.zeros((3,))
for bbox in bboxes:
bbox_coor = bbox[:4]
bbox_class_ind = int(bbox[4])
bbox_mix = bbox[5]
one_hot = np.zeros(self.num_classes, dtype=np.float32)
one_hot[bbox_class_ind] = 1.0
one_hot_smooth = dataAug.LabelSmooth()(one_hot, self.num_classes)
bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
bbox_coor[2:] - bbox_coor[:2]], axis=-1)
bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]
iou = []
exist_positive = False
for i in range(3):
anchors_xywh = np.zeros((anchors_per_scale, 4))
anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
anchors_xywh[:, 2:4] = anchors[i]
iou_scale = tools.iou_xywh_numpy(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
iou.append(iou_scale)
iou_mask = iou_scale > 0.3
if np.any(iou_mask):
xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)
label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
label[i][yind, xind, iou_mask, 4:5] = 1.0
label[i][yind, xind, iou_mask, 5:6] = bbox_mix
label[i][yind, xind, iou_mask, 6:] = one_hot_smooth
bbox_ind = int(bbox_count[i] % 150)
bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
bbox_count[i] += 1
exist_positive = True
if not exist_positive:
best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
best_detect = int(best_anchor_ind / anchors_per_scale)
best_anchor = int(best_anchor_ind % anchors_per_scale)
xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)
label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
label[best_detect][yind, xind, best_anchor, 5:6] = bbox_mix
label[best_detect][yind, xind, best_anchor, 6:] = one_hot_smooth
bbox_ind = int(bbox_count[best_detect] % 150)
bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
bbox_count[best_detect] += 1
label_sbbox, label_mbbox, label_lbbox = label
sbboxes, mbboxes, lbboxes = bboxes_xywh
return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
if __name__ == "__main__":
voc_dataset = VocDataset(anno_file_type="train", img_size=448)
dataloader = DataLoader(voc_dataset, shuffle=True, batch_size=1, num_workers=0)
for i, (img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes) in enumerate(dataloader):
if i==0:
print(img.shape)
print(label_sbbox.shape)
print(label_mbbox.shape)
print(label_lbbox.shape)
print(sbboxes.shape)
print(mbboxes.shape)
print(lbboxes.shape)
if img.shape[0] == 1:
labels = np.concatenate([label_sbbox.reshape(-1, 26), label_mbbox.reshape(-1, 26),
label_lbbox.reshape(-1, 26)], axis=0)
labels_mask = labels[..., 4] > 0
labels = np.concatenate([labels[labels_mask][..., :4], np.argmax(labels[labels_mask][..., 6:],
axis=-1).reshape(-1, 1)], axis=-1)
print(labels.shape)
tools.plot_box(labels, img, id=1)
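The trickiest step above is the anchor matching in __create_label. Below is a standalone toy check of the IoU computation; as a simplification it assumes the bbox and the anchors share the same center, which approximates the cell-center alignment used above:

import numpy as np

def iou_wh(box_wh, anchors_wh):
    # IoU of width/height pairs under the shared-center assumption
    inter = np.minimum(box_wh, anchors_wh).prod(-1)
    union = box_wh.prod(-1) + anchors_wh.prod(-1) - inter
    return inter / union

# A 120x90-pixel GT on the stride-16 layer is 7.5 x 5.625 in grid units.
box = np.array([7.5, 5.625])
anchors = np.array([(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)])  # medium-scale anchors from the config
print(iou_wh(box, anchors))  # ~[0.17, 0.26, 0.42] -> only the third anchor exceeds the 0.3 threshold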
VOC2CSV
import os
import random
import math
import argparse
from tqdm import tqdm
import xml.etree.ElementTree as ET
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--indir", type="str", default="")
parser.add_argument("-p", "--percent", type=float, default=0.2)
parser.add_argument("-t", "--train", type=str, default="")
parser.add_argument("-v", "--val", type=str, default="")
parser.add_argument("-c", "--classes", type=str, default="")
args = parser.parse_args()
return args
def get_file_index(indir, postfix):
print(indir)
file_list = []
for root, dirs, files in os.walk(indir):
for name in files:
if postfix in name:
file_list.append(os.path.join(root, name))
return file_list
def convert_annotation(csv, address_list):
cls_list = []
with open(csv, "w") as f:
for i, address in enumerate(tqdm(address_list)):
in_file = open(address, encoding="utf-8")
xml_str = in_file.read()
in_file.close()
root = ET.XML(xml_str)
for obj in root.iter("object"):
cls = obj.find("name").text
cls_list.append(cls)
xmlbox = obj.find("bndbox")
b = (int(float(xmlbox.find("xmin").text)), int(float(xmlbox.find("ymin").text)),
int(float(xmlbox.find("xmax").text)), int(float(xmlbox.find("ymax").text)))
f.write(file_dict[address_list[i]])
f.write(","+",".join([str(a) for a in b]) + ","+cls)
f.write("\n")
return cls_list
if __name__ == "__main__":
args = parse_args()
file_address = args.indir
test_percent = args.percent
train_csv = args.train
test_csv = args.val
class_csv = args.classes
Annotations = get_file_index(file_address+"/Annotations", ".xml")
Annotations.sort()
JPEGfiles = get_file_index(file_address+"/JPEGImages", ".jpg")
JPEGfiles.sort()
assert len(Annotations) == len(JPEGfiles)
file_dict = dict(zip(Annotations, JPEGfiles))
num = len(Annotations)
test = random.sample(population=Annotations, k=math.ceil(num*test_percent))
train = list(set(Annotations) - set(test))
cls_list1 = convert_annotation(train_csv, train)
cls_list2 = convert_annotation(test_csv, test)
cls_unique = list(set(cls_list1+cls_list2))
with open(class_csv, "w") as f:
for i, cls in enumerate(cls_unique):
f.write(cls+","+str(i)+"\n")
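A hypothetical invocation and the resulting CSV row format (all paths below are placeholders):

# python voc2csv.py -i ./data/VOC2007 -p 0.2 -t train.csv -v val.csv -c classes.csv
#
# Each row of train.csv / val.csv then has the form:
#   ./data/VOC2007/JPEGImages/000001.jpg,48,240,195,371,dog
# and classes.csv maps each class name to an index:
#   dog,0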
Yolov3 configuration file
DATA_PATH = "./data/VOC"
PROJECT_PATH = r"E:/CV/CV-图像检测/yolov3"
DATA = {"CLASSES": ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep',
'sofa', 'train', 'tvmonitor'], "NUM": 20}
MODEL = {"ANCHORS": [[(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)],
[(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)],
[(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]],
"STRIDES": [8, 16, 32],
"ANCHORS_PER_SCLAE": 3
}
TRAIN = {
"TRAIN_IMG_SIZE": 448,
"AUGMENT": True,
"BATCH_SIZE":4,
"MULTI_SCALE_TRAIN": True,
"IOU_THRESHOLD_LOSS": 0.5,
"EPOCHS": 50,
"NUMBER_WORKERS": 4,
"MOMENTUM": 0.9,
"WEIGHT_DECAY": 0.0005,
"LR_INIT": 1e-4,
"LR_END": 1e-6,
"WARMUP_EPOCHS": 2
}
TEST = {
"TEST_IMG_SIZE": 448,
"BATCH_SIZE": 4,
"NUMBER_WORKERS": 2,
"CONF_THRESH": 0.01,
"NMS_THRESH": 0.5,
"MULTI_SCALE_TEST": False,
"FLIP_TEST": False
}
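Note that ANCHORS are stored in grid units, i.e. already divided by the matching stride. A quick sanity check (a sketch, runnable if appended to this file) recovers the familiar pixel-space YOLOv3 anchors:

if __name__ == "__main__":
    import numpy as np
    anchors = np.array(MODEL["ANCHORS"])                  # shape (3, 3, 2), in grid units
    strides = np.array(MODEL["STRIDES"]).reshape(3, 1, 1)
    # (1.25, 1.625)*8 -> (10, 13), ..., (11.65625, 10.1875)*32 -> (373, 326)
    print(anchors * strides)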
Model backbone construction
import torch.nn as nn
from ..layers.conv_module import Convolutional
from ..layers.blocks_module import Residual_block
class Darknet53(nn.Module):
def __init__(self):
super(Darknet53, self).__init__()
self.__conv = Convolutional(filters_in=3, filters_out=32, kernel_size=3, stride=1, pad=1, norm='bn',
activate='leaky')
self.__conv_5_0 = Convolutional(filters_in=32, filters_out=64, kernel_size=3, stride=2, pad=1, norm='bn',
activate='leaky')
self.__rb_5_0 = Residual_block(filters_in=64, filters_out=64, filters_medium=32)
self.__conv_5_1 = Convolutional(filters_in=64, filters_out=128, kernel_size=3, stride=2, pad=1, norm='bn',
activate='leaky')
self.__rb_5_1_0 = Residual_block(filters_in=128, filters_out=128, filters_medium=64)
self.__rb_5_1_1 = Residual_block(filters_in=128, filters_out=128, filters_medium=64)
self.__conv_5_2 = Convolutional(filters_in=128, filters_out=256, kernel_size=3, stride=2, pad=1, norm='bn',
activate='leaky')
self.__rb_5_2_0 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_1 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_2 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_3 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_4 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_5 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_6 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__rb_5_2_7 = Residual_block(filters_in=256, filters_out=256, filters_medium=128)
self.__conv_5_3 = Convolutional(filters_in=256, filters_out=512, kernel_size=3, stride=2, pad=1, norm='bn',
activate='leaky')
self.__rb_5_3_0 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_1 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_2 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_3 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_4 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_5 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_6 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__rb_5_3_7 = Residual_block(filters_in=512, filters_out=512, filters_medium=256)
self.__conv_5_4 = Convolutional(filters_in=512, filters_out=1024, kernel_size=3, stride=2, pad=1, norm='bn',
activate='leaky')
self.__rb_5_4_0 = Residual_block(filters_in=1024, filters_out=1024, filters_medium=512)
self.__rb_5_4_1 = Residual_block(filters_in=1024, filters_out=1024, filters_medium=512)
self.__rb_5_4_2 = Residual_block(filters_in=1024, filters_out=1024, filters_medium=512)
self.__rb_5_4_3 = Residual_block(filters_in=1024, filters_out=1024, filters_medium=512)
def forward(self, x):
x = self.__conv(x)
x0_0 = self.__conv_5_0(x)
x0_1 = self.__rb_5_0(x0_0)
x1_0 = self.__conv_5_1(x0_1)
x1_1 = self.__rb_5_1_0(x1_0)
x1_2 = self.__rb_5_1_1(x1_1)
x2_0 = self.__conv_5_2(x1_2)
x2_1 = self.__rb_5_2_0(x2_0)
x2_2 = self.__rb_5_2_1(x2_1)
x2_3 = self.__rb_5_2_2(x2_2)
x2_4 = self.__rb_5_2_3(x2_3)
x2_5 = self.__rb_5_2_4(x2_4)
x2_6 = self.__rb_5_2_5(x2_5)
x2_7 = self.__rb_5_2_6(x2_6)
x2_8 = self.__rb_5_2_7(x2_7)
x3_0 = self.__conv_5_3(x2_8)
x3_1 = self.__rb_5_3_0(x3_0)
x3_2 = self.__rb_5_3_1(x3_1)
x3_3 = self.__rb_5_3_2(x3_2)
x3_4 = self.__rb_5_3_3(x3_3)
x3_5 = self.__rb_5_3_4(x3_4)
x3_6 = self.__rb_5_3_5(x3_5)
x3_7 = self.__rb_5_3_6(x3_6)
x3_8 = self.__rb_5_3_7(x3_7)
x4_0 = self.__conv_5_4(x3_8)
x4_1 = self.__rb_5_4_0(x4_0)
x4_2 = self.__rb_5_4_1(x4_1)
x4_3 = self.__rb_5_4_2(x4_2)
x4_4 = self.__rb_5_4_3(x4_3)
return x2_8, x3_8, x4_4
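A quick shape check (a sketch, assuming the Convolutional and Residual_block modules defined later behave as shown) confirms that the three returned feature maps sit at strides 8, 16 and 32:

if __name__ == "__main__":
    import torch
    net = Darknet53()
    x_s, x_m, x_l = net(torch.randn(1, 3, 416, 416))
    print(x_s.shape, x_m.shape, x_l.shape)
    # expected: (1, 256, 52, 52), (1, 512, 26, 26), (1, 1024, 13, 13)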
YOLO head prediction file
import torch.nn as nn
import torch
class Yolo_head(nn.Module):
def __init__(self, nC, anchors, stride):
super(Yolo_head, self).__init__()
self.__anchors = anchors
self.__nA = len(anchors)
self.__nC = nC
self.__stride = stride
def forward(self, p):
bs, nG = p.shape[0], p.shape[-1]
p = p.view(bs, self.__nA, 5 + self.__nC, nG, nG).permute(0, 3, 4, 1, 2)
p_de = self.__decode(p.clone())
return (p, p_de)
def __decode(self, p):
batch_size, output_size = p.shape[:2]
device = p.device
stride = self.__stride
anchors = (1.0 * self.__anchors).to(device)
conv_raw_dxdy = p[:, :, :, :, 0:2]
conv_raw_dwdh = p[:, :, :, :, 2:4]
conv_raw_conf = p[:, :, :, :, 4:5]
conv_raw_prob = p[:, :, :, :, 5:]
y = torch.arange(0, output_size).unsqueeze(1).repeat(1, output_size)
x = torch.arange(0, output_size).unsqueeze(0).repeat(output_size, 1)
grid_xy = torch.stack([x, y], dim=-1)
grid_xy = grid_xy.unsqueeze(0).unsqueeze(3).repeat(batch_size, 1, 1, 3, 1).float().to(device)
pred_xy = (torch.sigmoid(conv_raw_dxdy) + grid_xy) * stride
pred_wh = (torch.exp(conv_raw_dwdh) * anchors) * stride
pred_xywh = torch.cat([pred_xy, pred_wh], dim=-1)
pred_conf = torch.sigmoid(conv_raw_conf)
pred_prob = torch.sigmoid(conv_raw_prob)
pred_bbox = torch.cat([pred_xywh, pred_conf, pred_prob], dim=-1)
return pred_bbox.view(-1, 5 + self.__nC) if not self.training else pred_bbox
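To make the reshaping concrete: with 20 classes and 3 anchors per scale, the raw conv output carries 3*(5+20)=75 channels, which forward() views as (bs, nG, nG, 3, 25) before decoding. A minimal sketch in training mode (where the decoded tensor keeps the grid shape):

if __name__ == "__main__":
    anchors = torch.FloatTensor([(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)])
    head = Yolo_head(nC=20, anchors=anchors, stride=32)
    head.train()  # keep the grid-shaped decoded output
    p, p_de = head(torch.randn(2, 75, 13, 13))
    print(p.shape, p_de.shape)  # both (2, 13, 13, 3, 25)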
Conv layer module
import torch
import torch.nn as nn
import torch.nn.functional as F
from .activate import *
norm_name = {"bn": nn.BatchNorm2d}
activate_name = {
"relu": nn.ReLU,
"leaky": nn.LeakyReLU,
"mish": Mish}
class Convolutional(nn.Module):
def __init__(self, filters_in, filters_out, kernel_size, stride, pad, norm=None, activate=None):
super(Convolutional, self).__init__()
self.norm = norm
self.activate = activate
self.__conv = nn.Conv2d(in_channels=filters_in, out_channels=filters_out, kernel_size=kernel_size,
stride=stride, padding=pad, bias=not norm)
if norm:
assert norm in norm_name.keys()
if norm == "bn":
self.__norm = norm_name[norm](num_features=filters_out)
if activate:
assert activate in activate_name.keys()
if activate == "leaky":
self.__activate = activate_name[activate](negative_slope=0.1, inplace=True)
if activate == "relu":
self.__activate = activate_name[activate](inplace=True)
def forward(self, x):
x = self.__conv(x)
if self.norm:
x = self.__norm(x)
if self.activate:
x = self.__activate(x)
return x
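For example (a small sketch), the 3x3 stride-2 configuration used for downsampling in Darknet53 halves the spatial resolution:

if __name__ == "__main__":
    conv = Convolutional(filters_in=32, filters_out=64, kernel_size=3, stride=2,
                         pad=1, norm="bn", activate="leaky")
    print(conv(torch.randn(1, 32, 416, 416)).shape)  # torch.Size([1, 64, 208, 208])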
Residual block module
import torch.nn as nn
from ..layers.conv_module import Convolutional
class Residual_block(nn.Module):
def __init__(self, filters_in, filters_out, filters_medium):
super(Residual_block, self).__init__()
self.__conv1 = Convolutional(filters_in=filters_in, filters_out=filters_medium, kernel_size=1, stride=1, pad=0,
norm="bn", activate="leaky")
self.__conv2 = Convolutional(filters_in=filters_medium, filters_out=filters_out, kernel_size=3, stride=1, pad=1,
norm="bn", activate="leaky")
def forward(self, x):
r = self.__conv1(x)
r = self.__conv2(r)
out = x + r
return out
Activation function module
import torch
import torch.nn as nn
import torch.nn.functional as F
class Mish(nn.Module):
def __init__(self):
super(Mish, self).__init__()
def forward(self, x):
x = x * (torch.tanh(F.softplus(x)))
return x
class Swish(nn.Module):
def __init__(self):
super(Swish, self).__init__()
def forward(self, x):
x = x * torch.sigmoid(x)
return x
Loss function module
import sys
sys.path.append("../utils")
import torch
import torch.nn as nn
from utils import tools
import config.yolov3_config_voc as cfg
class FocalLoss(nn.Module):
def __init__(self, gamma=2.0, alpha=1.0, reduction="mean"):
super(FocalLoss, self).__init__()
self.__gamma = gamma
self.__alpha = alpha
self.__loss = nn.BCEWithLogitsLoss(reduction=reduction)
def forward(self, input, target):
loss = self.__loss(input=input, target=target)
loss *= self.__alpha * torch.pow(torch.abs(target - torch.sigmoid(input)), self.__gamma)
return loss
class YoloV3Loss(nn.Module):
def __init__(self, anchors, strides, iou_threshold_loss=0.5):
super(YoloV3Loss, self).__init__()
self.__iou_threshold_loss = iou_threshold_loss
self.__strides = strides
def forward(self, p, p_d, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes):
"""
:param p: Predicted offset values for three detection layers.
The shape is [p0, p1, p2], ex. p0=[bs, grid, grid, anchors, tx+ty+tw+th+conf+cls_20]
:param p_d: Decodeed predicted value. The size of value is for image size.
ex. p_d0=[bs, grid, grid, anchors, x+y+w+h+conf+cls_20]
:param label_sbbox: Small detection layer's label. The size of value is for original image size.
shape is [bs, grid, grid, anchors, x+y+w+h+conf+mix+cls_20]
:param label_mbbox: Same as label_sbbox.
:param label_lbbox: Same as label_sbbox.
:param sbboxes: Small detection layer bboxes.The size of value is for original image size.
shape is [bs, 150, x+y+w+h]
:param mbboxes: Same as sbboxes.
:param lbboxes: Same as sbboxes
"""
strides = self.__strides
loss_s, loss_s_giou, loss_s_conf, loss_s_cls = self.__cal_loss_per_layer(p[0], p_d[0], label_sbbox,
sbboxes, strides[0])
loss_m, loss_m_giou, loss_m_conf, loss_m_cls = self.__cal_loss_per_layer(p[1], p_d[1], label_mbbox,
mbboxes, strides[1])
loss_l, loss_l_giou, loss_l_conf, loss_l_cls = self.__cal_loss_per_layer(p[2], p_d[2], label_lbbox,
lbboxes, strides[2])
loss = loss_l + loss_m + loss_s
loss_giou = loss_s_giou + loss_m_giou + loss_l_giou
loss_conf = loss_s_conf + loss_m_conf + loss_l_conf
loss_cls = loss_s_cls + loss_m_cls + loss_l_cls
return loss, loss_giou, loss_conf, loss_cls
def __cal_loss_per_layer(self, p, p_d, label, bboxes, stride):
"""
(1)The loss of regression of boxes.
GIOU loss is defined in https://arxiv.org/abs/1902.09630.
Note: The loss factor is 2-w*h/(img_size**2), which is used to influence the
balance of the loss value at different scales.
(2)The loss of confidence.
Includes confidence loss values for foreground and background.
Note: The backgroud loss is calculated when the maximum iou of the box predicted
by the feature point and all GTs is less than the threshold.
(3)The loss of classes。
The category loss is BCE, which is the binary value of each class.
:param stride: The scale of the feature map relative to the original image
:return: The average loss(loss_giou, loss_conf, loss_cls) of all batches of this detection layer.
"""
BCE = nn.BCEWithLogitsLoss(reduction="none")
FOCAL = FocalLoss(gamma=2, alpha=1.0, reduction="none")
batch_size, grid = p.shape[:2]
img_size = stride * grid
p_conf = p[..., 4:5]
p_cls = p[..., 5:]
p_d_xywh = p_d[..., :4]
label_xywh = label[..., :4]
label_obj_mask = label[..., 4:5]
label_cls = label[..., 6:]
label_mix = label[..., 5:6]
giou = tools.GIOU_xywh_torch(p_d_xywh, label_xywh).unsqueeze(-1)
bbox_loss_scale = 2.0 - 1.0 * label_xywh[..., 2:3] * label_xywh[..., 3:4] / (img_size ** 2)
loss_giou = label_obj_mask * bbox_loss_scale * (1.0 - giou) * label_mix
iou = tools.iou_xywh_torch(p_d_xywh.unsqueeze(4), bboxes.unsqueeze(1).unsqueeze(1).unsqueeze(1))
iou_max = iou.max(-1, keepdim=True)[0]
label_noobj_mask = (1.0 - label_obj_mask) * (iou_max < self.__iou_threshold_loss).float()
loss_conf = (label_obj_mask * FOCAL(input=p_conf, target=label_obj_mask) +
label_noobj_mask * FOCAL(input=p_conf, target=label_obj_mask)) * label_mix
loss_cls = label_obj_mask * BCE(input=p_cls, target=label_cls) * label_mix
loss_giou = (torch.sum(loss_giou)) / batch_size
loss_conf = (torch.sum(loss_conf)) / batch_size
loss_cls = (torch.sum(loss_cls)) / batch_size
loss = loss_giou + loss_conf + loss_cls
return loss, loss_giou, loss_conf, loss_cls
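A toy illustration (a sketch) of the focal modulation used above: with gamma=2, a confidently correct prediction contributes almost nothing, while a hard example keeps most of its BCE loss:

if __name__ == "__main__":
    focal = FocalLoss(gamma=2, alpha=1.0, reduction="none")
    logits = torch.tensor([[3.0], [-1.0]])  # an easy positive and a hard positive
    target = torch.ones_like(logits)
    print(focal(input=logits, target=target))  # ~[[0.0001], [0.70]]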
YOLO feature pyramid (FPN)
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..layers.conv_module import Convolutional
class Upsample(nn.Module):
def __init__(self, scale_factor=1, mode='nearest'):
super(Upsample, self).__init__()
self.scale_factor = scale_factor
self.mode = mode
def forward(self, x):
return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
class Route(nn.Module):
def __init__(self):
super(Route, self).__init__()
def forward(self, x1, x2):
"""
x1 means previous output; x2 means current output
"""
out = torch.cat((x2, x1), dim=1)
return out
class FPN_YOLOV3(nn.Module):
"""
FPN for yolov3, and is different from original FPN or retinanet' FPN.
"""
def __init__(self, filters_in, filters_out):
super(FPN_YOLOV3, self).__init__()
fi_0, fi_1, fi_2 = filters_in
fo_0, fo_1, fo_2 = filters_out
self.__conv_set_0 = nn.Sequential(
Convolutional(filters_in=fi_0, filters_out=512, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=512, filters_out=1024, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=1024, filters_out=512, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=512, filters_out=1024, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=1024, filters_out=512, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
)
self.__conv0_0 = Convolutional(filters_in=512, filters_out=1024, kernel_size=3, stride=1,
pad=1, norm="bn", activate="leaky")
self.__conv0_1 = Convolutional(filters_in=1024, filters_out=fo_0, kernel_size=1,
stride=1, pad=0)
self.__conv0 = Convolutional(filters_in=512, filters_out=256, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky")
self.__upsample0 = Upsample(scale_factor=2)
self.__route0 = Route()
self.__conv_set_1 = nn.Sequential(
Convolutional(filters_in=fi_1+256, filters_out=256, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=256, filters_out=512, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=512, filters_out=256, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=256, filters_out=512, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=512, filters_out=256, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
)
self.__conv1_0 = Convolutional(filters_in=256, filters_out=512, kernel_size=3, stride=1,
pad=1, norm="bn", activate="leaky")
self.__conv1_1 = Convolutional(filters_in=512, filters_out=fo_1, kernel_size=1,
stride=1, pad=0)
self.__conv1 = Convolutional(filters_in=256, filters_out=128, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky")
self.__upsample1 = Upsample(scale_factor=2)
self.__route1 = Route()
self.__conv_set_2 = nn.Sequential(
Convolutional(filters_in=fi_2+128, filters_out=128, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=128, filters_out=256, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=256, filters_out=128, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
Convolutional(filters_in=128, filters_out=256, kernel_size=3, stride=1, pad=1, norm="bn",
activate="leaky"),
Convolutional(filters_in=256, filters_out=128, kernel_size=1, stride=1, pad=0, norm="bn",
activate="leaky"),
)
self.__conv2_0 = Convolutional(filters_in=128, filters_out=256, kernel_size=3, stride=1,
pad=1, norm="bn", activate="leaky")
self.__conv2_1 = Convolutional(filters_in=256, filters_out=fo_2, kernel_size=1,
stride=1, pad=0)
def forward(self, x0, x1, x2):
r0 = self.__conv_set_0(x0)
out0 = self.__conv0_0(r0)
out0 = self.__conv0_1(out0)
r1 = self.__conv0(r0)
r1 = self.__upsample0(r1)
x1 = self.__route0(x1, r1)
r1 = self.__conv_set_1(x1)
out1 = self.__conv1_0(r1)
out1 = self.__conv1_1(out1)
r2 = self.__conv1(r1)
r2 = self.__upsample1(r2)
x2 = self.__route1(x2, r2)
r2 = self.__conv_set_2(x2)
out2 = self.__conv2_0(r2)
out2 = self.__conv2_1(out2)
return out2, out1, out0
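A shape sketch (assuming 20 classes, so each scale outputs 3*(5+20)=75 channels): feeding in the three Darknet53 feature maps returns the small-, medium- and large-scale prediction maps, in that order:

if __name__ == "__main__":
    fpn = FPN_YOLOV3(filters_in=[1024, 512, 256], filters_out=[75, 75, 75])
    x_l = torch.randn(1, 1024, 13, 13)  # stride-32 backbone output
    x_m = torch.randn(1, 512, 26, 26)   # stride-16
    x_s = torch.randn(1, 256, 52, 52)   # stride-8
    out_s, out_m, out_l = fpn(x_l, x_m, x_s)
    print(out_s.shape, out_m.shape, out_l.shape)
    # expected: (1, 75, 52, 52), (1, 75, 26, 26), (1, 75, 13, 13)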
Yolov3 network construction
import sys
sys.path.append("..")
import torch
import torch.nn as nn
import numpy as np
import config.yolov3_config_voc as cfg
from model.backbones.darknet53 import Darknet53
from model.necks.yolo_fpn import FPN_YOLOV3
from model.head.yolo_head import Yolo_head
from model.layers.conv_module import Convolutional
from utils.tools import *
class Yolov3(nn.Module):
"""
Note : int the __init__(), to define the modules should be in order, because of the weight file is order
"""
def __init__(self, init_weights=True):
super(Yolov3, self).__init__()
self.__anchors = torch.FloatTensor(cfg.MODEL["ANCHORS"])
self.__strides = torch.FloatTensor(cfg.MODEL["STRIDES"])
self.__nC = cfg.DATA["NUM"]
self.__out_channel = cfg.MODEL["ANCHORS_PER_SCALE"] * (self.__nC + 5)
self.__backbone = Darknet53()
self.__fpn = FPN_YOLOV3(filters_in=[1024, 512, 256],
filters_out=[self.__out_channel, self.__out_channel, self.__out_channel])
self.__head_s = Yolo_head(nC=self.__nC, anchors=self.__anchors[0], stride=self.__strides[0])
self.__head_m = Yolo_head(nC=self.__nC, anchors=self.__anchors[1], stride=self.__strides[1])
self.__head_l = Yolo_head(nC=self.__nC, anchors=self.__anchors[2], stride=self.__strides[2])
if init_weights:
self.__init_weights()
def forward(self, x):
out = []
x_s, x_m, x_l = self.__backbone(x)
x_s, x_m, x_l = self.__fpn(x_l, x_m, x_s)
out.append(self.__head_s(x_s))
out.append(self.__head_m(x_m))
out.append(self.__head_l(x_l))
if self.training:
p, p_d = list(zip(*out))
return p, p_d
else:
p, p_d = list(zip(*out))
return p, torch.cat(p_d, 0)
def __init_weights(self):
" Note :nn.Conv2d nn.BatchNorm2d'initing modes are uniform "
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch.nn.init.normal_(m.weight.data, 0.0, 0.01)
if m.bias is not None:
m.bias.data.zero_()
print("initing {}".format(m))
elif isinstance(m, nn.BatchNorm2d):
torch.nn.init.constant_(m.weight.data, 1.0)
torch.nn.init.constant_(m.bias.data, 0.0)
print("initing {}".format(m))
def load_darknet_weights(self, weight_file, cutoff=52):
"https://github.com/ultralytics/yolov3/blob/master/models.py"
print("load darknet weights : ", weight_file)
with open(weight_file, 'rb') as f:
_ = np.fromfile(f, dtype=np.int32, count=5)
weights = np.fromfile(f, dtype=np.float32)
count = 0
ptr = 0
for m in self.modules():
if isinstance(m, Convolutional):
if count == cutoff:
break
count += 1
conv_layer = m._Convolutional__conv
if m.norm == "bn":
bn_layer = m._Convolutional__norm
num_b = bn_layer.bias.numel()
bn_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.bias.data)
bn_layer.bias.data.copy_(bn_b)
ptr += num_b
bn_w = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.weight.data)
bn_layer.weight.data.copy_(bn_w)
ptr += num_b
bn_rm = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_mean)
bn_layer.running_mean.data.copy_(bn_rm)
ptr += num_b
bn_rv = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(bn_layer.running_var)
bn_layer.running_var.data.copy_(bn_rv)
ptr += num_b
print("loading weight {}".format(bn_layer))
else:
num_b = conv_layer.bias.numel()
conv_b = torch.from_numpy(weights[ptr:ptr + num_b]).view_as(conv_layer.bias.data)
conv_layer.bias.data.copy_(conv_b)
ptr += num_b
num_w = conv_layer.weight.numel()
conv_w = torch.from_numpy(weights[ptr:ptr + num_w]).view_as(conv_layer.weight.data)
conv_layer.weight.data.copy_(conv_w)
ptr += num_w
print("loading weight {}".format(conv_layer))