Table of Contents
1. Data processing
1.1 Dataset split
1.2 Converting the data to HDF5
1.3 Encoding
2. Network model
2.1 DarkNet19
2.2 yolo_body + decoder
3. Loss function
3.1 Positive-sample loss
3.2 Negative-sample loss
3.3 Class loss
3.4 Box loss
4. Training
4.1 Loading the data
4.2 Loading the model
4.3 Loss function
4.4 Updating the parameters
5. Prediction
5.1 Data processing
5.2 Prediction
5.3 Filtering
5.4 Drawing boxes
data_process/datasets_split_1.py
aim : split the dataset into training, test, and validation sets. Each split stores image names, one per line.
input : xml_path, base_path, trainval_radio, train_radio
output : base_path/trainval.txt, base_path/train.txt, base_path/val.txt, base_path/test.txt
process:
1. Collect the names of all samples from the files under xml_path.
2. Compute the size of each split from trainval_radio and train_radio, then randomly sample that many indices from the full sample list.
3. Write each sample name into the file of the split its index belongs to.
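For example, with N = 1000 annotation files and both ratios at 0.9, the script writes 900 names to trainval.txt (of which 810 go to train.txt and 90 to val.txt) and the remaining 100 to test.txt.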
import random, os

xml_path = '../VOCdevkit/VOC2007/Annotations'  # all annotation files, one per sample
base_path = '../VOCdevkit/VOC2007/ImageSets/Main'
trainval_radio = 0.9  # fraction of all samples that goes to train+val
train_radio = 0.9     # fraction of train+val that goes to training

names_list = []
img_names = os.listdir(xml_path)
for name in img_names:
    if name.endswith('.xml'):
        names_list.append(name[:-4])

N = len(names_list)                           # total number of samples
trainval_num = int(N * trainval_radio)        # number of train+val samples
train_num = int(trainval_num * train_radio)   # number of training samples
trainval_idx = random.sample(range(N), trainval_num)  # indices of the train+val samples
train_idx = random.sample(trainval_idx, train_num)    # indices of the training samples

# output files for the four splits
ftrain_val = open(os.path.join(base_path, 'trainval.txt'), 'w')
ftrain = open(os.path.join(base_path, 'train.txt'), 'w')
fval = open(os.path.join(base_path, 'val.txt'), 'w')
ftest = open(os.path.join(base_path, 'test.txt'), 'w')

# write each sample name into the file of the split its index belongs to
for i in range(N):
    name = names_list[i] + '\n'
    if i in trainval_idx:
        ftrain_val.write(name)
        if i in train_idx:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)

ftrain_val.close()
ftrain.close()
fval.close()
ftest.close()
data_process/data2hdf5_2.py
input : the dataset splits
output : pascal_voc_07_12_LS.hdf5
process:
1. Collect the sample ids of each split. train_set --> get_ids(voc_path,train_set) --> train_ids
2. Create voc_h5file and define the dtypes for the stored images and boxes. Create one group per split, store 'classes' as a file attribute, and create the images and boxes datasets inside each group.
3. train_ids, train_images, train_boxes --> add_to_dataset();
   img_id --> get_img(voc_path,year,img_id); get_boxes(voc_path,year,img_id) --> img_data; img_box
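Note that get_boxes flattens all annotations of an image into one 1-D int array laid out as [x1, y1, x2, y2, cls, x1, y1, ...]; an image with two objects is stored as ten integers, and the data loader later recovers the per-box structure with reshape(-1, 5).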
Code
import numpy as np
import os, h5py, argparse
import xml.etree.ElementTree as ElementTree

sets_from_2007 = [('2007', 'train'), ('2007', 'val')]
train_set = [('2007', 'train')]
val_set = [('2007', 'val')]
test_set = [('2007', 'test')]

classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

parser = argparse.ArgumentParser(description='Convert Pascal VOC 2007 detection dataset to HDF5')
parser.add_argument('-p', '--path_to_voc', help='path to VOCdevkit directory',
                    default='../VOCdevkit')

def get_ids(voc_path, datasets):
    '''Collect the sample ids contained in the given splits.'''
    ids = []
    for year, set in datasets:
        id_path = os.path.join(voc_path, 'VOC%s/ImageSets/Main/%s.txt' % (year, set))
        print(id_path)
        with open(id_path, 'r') as f:
            ids.extend(f.read().strip().split())
    return ids

def get_img(voc_path, year, img_id):
    '''Read one image as raw JPEG bytes.'''
    img_path = os.path.join(voc_path, 'VOC%s/JPEGImages/%s.jpg' % (year, img_id))
    with open(img_path, 'rb') as f:
        data = f.read()
    return np.frombuffer(data, dtype='uint8')  # [n,]

def get_boxes(voc_path, year, img_id):
    '''Read the bounding boxes of one image as a flat int array.'''
    boxes_path = os.path.join(voc_path, 'VOC%s/Annotations/%s.xml' % (year, img_id))
    with open(boxes_path, 'r') as f:
        xml_tree = ElementTree.parse(f)
    root = xml_tree.getroot()
    boxes = []
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        xml_box = obj.find('bndbox')
        bbox = (int(xml_box.find('xmin').text),
                int(xml_box.find('ymin').text),
                int(xml_box.find('xmax').text),
                int(xml_box.find('ymax').text),
                classes.index(cls))
        boxes.extend(bbox)
    return np.array(boxes)  # flat [n*5,]

def add_to_dataset(voc_path, year, ids, images, boxes, start=0):
    '''Iterate over the samples and store each image and its boxes.'''
    for i, img_id in enumerate(ids):
        img_data = get_img(voc_path, year, img_id)
        img_box = get_boxes(voc_path, year, img_id)
        images[start + i] = img_data
        boxes[start + i] = img_box
    return i

def _main(args):
    voc_path = os.path.expanduser(args.path_to_voc)
    # 1 collect the sample ids of each split
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)
    # 2 create voc_h5file, the dtypes and the groups
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12_LS.hdf5')
    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(vlen=np.dtype('uint8'))  # variable length uint8
    int_dt = h5py.special_dtype(vlen=np.dtype(int))
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('val')
    test_group = voc_h5file.create_group('test')
    # store the class names (not used elsewhere in this project)
    voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))
    # 3 create the images and boxes containers
    train_images = train_group.create_dataset('images', shape=(total_train_ids,), dtype=uint8_dt)
    val_images = val_group.create_dataset('images', shape=(len(val_ids),), dtype=uint8_dt)
    test_images = test_group.create_dataset('images', shape=(len(test_ids),), dtype=uint8_dt)
    train_boxes = train_group.create_dataset('boxes', shape=(total_train_ids,), dtype=int_dt)
    val_boxes = val_group.create_dataset('boxes', shape=(len(val_ids),), dtype=int_dt)
    test_boxes = test_group.create_dataset('boxes', shape=(len(test_ids),), dtype=int_dt)
    # 4 load the data (everything comes from VOC 2007 here)
    print('Processing Pascal VOC 2007 train+val set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, train_boxes)
    print('Processing Pascal VOC 2007 training set.')
    add_to_dataset(voc_path, '2007', train_ids, train_images, train_boxes, start=last_2007 + 1)
    print('Processing Pascal VOC 2007 val set.')
    add_to_dataset(voc_path, '2007', val_ids, val_images, val_boxes)
    print('Processing Pascal VOC 2007 test set.')
    add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)
    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')

if __name__ == '__main__':
    _main(parser.parse_args())
    # voc_path = parser.parse_args().path_to_voc
    # datasets = [('2007','train')]
    # ids = get_ids(voc_path,datasets)
    # # print(ids)
    # img = get_img(voc_path,year='2007',img_id='000025')
    # box = get_boxes(voc_path,year='2007',img_id='000025')
    # print(box.reshape(-1,5))
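As a quick sanity check — a minimal sketch, assuming the file was generated at the default path — the stored variable-length records can be read back and decoded like this:

import io
import h5py
from PIL import Image

with h5py.File('../VOCdevkit/pascal_voc_07_12_LS.hdf5', 'r') as f:
    raw = f['train/images'][0]                   # 1-D uint8 array of JPEG bytes
    boxes = f['train/boxes'][0].reshape(-1, 5)   # (n, 5): x1, y1, x2, y2, class index
    img = Image.open(io.BytesIO(raw.tobytes()))
    print(img.size, boxes)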
data_process/data_encoder_3.py
input : data_path, anchors_path, idx
output : processed_images[n,3,416,416], out[n,13,13,5,4+1+5]
process:
1. Read the image, box, and class data: processed_images, processed_boxes = self.process_data(idx)
2. Encode the boxes to get the true offsets and cls: out = self.encoder(processed_boxes)
Code
import numpy as np
import io, os, PIL, h5py, argparse, warnings
from PIL import Image
import torch
import torch.utils.data as data

YOLO_ANCHORS = np.array(
    ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
     (7.88282, 3.52778), (9.77052, 9.16828)))

def get_classes(classes_path):
    with open(classes_path) as f:
        class_name = f.read().strip().split()
    return class_name

def get_anchors(anchors_path):
    if os.path.isfile(anchors_path):
        with open(anchors_path) as f:
            anchors = f.read().strip().split()
        return np.array(list(map(float, anchors))).reshape(-1, 2)
    else:
        warnings.warn('Could not open anchors file, using default.')
        return YOLO_ANCHORS

class yoloDataset(data.Dataset):
    image_size = [416, 416]

    def __init__(self, data_path, anchors_path):
        self.anchors = self.get_anchors(anchors_path)
        data = h5py.File(data_path, 'r')
        self.images = data['train/images'][:]
        self.boxes = data['train/boxes'][:]
        # 1 the largest number of boxes found in any single image
        self.max_num = 0
        self.num_samples = len(self.boxes)
        self.flag = self.boxes is not None
        if self.flag:
            for i in range(self.num_samples):
                self.boxes[i] = self.boxes[i].reshape(-1, 5)
                if self.max_num < self.boxes[i].shape[0]:
                    self.max_num = self.boxes[i].shape[0]

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        processed_images, processed_boxes = self.process_data(idx)
        out = self.encoder(processed_boxes)
        return torch.tensor(processed_images), torch.tensor(out)

    def get_anchors(self, anchors_path):
        if os.path.isfile(anchors_path):
            with open(anchors_path) as f:
                anchors = f.read().strip().split()
            return np.array(list(map(float, anchors))).reshape(-1, 2)
        else:
            warnings.warn('Could not open anchors file, using default.')
            return YOLO_ANCHORS
    def process_data(self, idx):
        '''
        aim : 1. normalize the image to 0~1 and move channels first.
              2. box [x1,y1,x2,y2] --> [cx,cy,w,h], as fractions of the original image;
                 the boxes of each image are zero-padded to shape [max_num, 5].
        inputs: idx
        outputs: np.array(img), np.array(new_box)
        '''
        images = self.images[idx]
        boxes = self.boxes[idx]
        img = Image.open(io.BytesIO(images))
        img_shape = np.array(img.size)  # (w, h) of the original image
        img = img.resize(self.image_size, PIL.Image.BICUBIC)  # (416, 416)
        img = np.array(img, np.float32) / 255.
        img = np.transpose(img, (2, 0, 1))
        if self.flag:
            box = np.concatenate([(boxes[:, 2:4] + boxes[:, :2]) * 0.5 / img_shape,
                                  (boxes[:, 2:4] - boxes[:, :2]) / img_shape,
                                  boxes[:, 4:5]], 1)
            new_box = np.zeros((self.max_num, 5), dtype=np.float32)
            new_box[:len(box), :] = box  # box (cx, cy, w, h, cls)
            return np.array(img), np.array(new_box)
        else:
            return np.array(img), None
    def encoder(self, boxes):
        ''' one picture
        aim : map the ground-truth boxes onto the 13x13 feature map.
        1. the values of the true boxes on the feature map;
        2. the grid cell and anchor each true box is assigned to;
        3. the regression offsets the network should predict.
        inputs:
            boxes[max_num, 5(cx,cy,w,h,cls)], anchors[5,2]; max_num=10; image_size=[416,416]
        outputs (concatenated along the last axis):
            true_boxes:          (h, w, num_anchors, 4)  eg: (13, 13, 5, 4)
            detectors_mask:      (h, w, num_anchors, 1)  eg: (13, 13, 5, 1)
            matching_true_boxes: (h, w, num_anchors, 5)  eg: (13, 13, 5, 5)
        '''
        # 1 build the empty targets
        h, w = self.image_size
        num_anchors = len(self.anchors)
        num_box_params = boxes.shape[1]
        assert h % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        assert w % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.'
        grid_h = h // 32  # 13
        grid_w = w // 32
        true_boxes = np.zeros([grid_h, grid_w, num_anchors, 4], dtype=np.float32)
        detectors_mask = np.zeros([grid_h, grid_w, num_anchors, 1], dtype=np.float32)  # (13, 13, 5, 1)
        matching_true_boxes = np.zeros([grid_h, grid_w, num_anchors, num_box_params], dtype=np.float32)  # (13, 13, 5, 5)
        # 2 encode
        boxes = boxes[boxes[:, 2] > 0]  # drop the zero-padded rows so they set no mask
        box_class = boxes[:, 4]  # [n,]
        box = boxes[:, :4] * np.array([grid_w, grid_h, grid_w, grid_h])  # grid-cell units
        # note: the first spatial index holds x here; grid() in the decoder
        # follows the same convention, so encoder and decoder stay consistent
        i, j = list(map(int, box[:, 0])), list(map(int, box[:, 1]))
        best_idx = self.iou_wh(box[:, 2:4], self.anchors)  # best anchor per box, by shape IoU on (w, h)
        true_boxes[i, j, best_idx] = boxes[:, :4]  # normalized (cx,cy,w,h), used for the IoU in the loss
        detectors_mask[i, j, best_idx] = 1
        adjusted_box = np.array(
            [
                box[:, 0] - i, box[:, 1] - j,  # offsets inside the cell: targets of sigmoid(tx), sigmoid(ty)
                np.log(box[:, 2] / self.anchors[best_idx][:, 0]),  # target of tw
                np.log(box[:, 3] / self.anchors[best_idx][:, 1]),  # target of th
                box_class
            ],
            dtype=np.float32).T
        matching_true_boxes[i, j, best_idx] = adjusted_box
        out = np.concatenate([true_boxes, detectors_mask, matching_true_boxes], -1)
        return out  # (13, 13, 5, 4+1+5)
    def iou_wh(self, boxes_wh, anchors_wh):
        '''Shape-only IoU: boxes_wh [n,2], anchors_wh [m,2], both centred at the origin.
        Returns the index of the best-matching anchor for each box.'''
        boxes_wh = np.expand_dims(boxes_wh, 1)      # [n,1,2]
        anchors_wh = np.expand_dims(anchors_wh, 0)  # [1,m,2]
        box_max = boxes_wh / 2.
        box_min = -box_max
        anchor_max = anchors_wh / 2.
        anchor_min = -anchor_max
        inter_mins = np.maximum(box_min, anchor_min)  # [n,m,2]
        inter_maxs = np.minimum(box_max, anchor_max)
        inter_wh = np.maximum(inter_maxs - inter_mins, 0.)
        inter_area = inter_wh[..., 0] * inter_wh[..., 1]  # [n,m]
        boxes_area = boxes_wh[..., 0] * boxes_wh[..., 1]
        anchors_area = anchors_wh[..., 0] * anchors_wh[..., 1]  # [1,m]
        iou = inter_area / (boxes_area + anchors_area - inter_area)  # [n,m]
        best_idx = np.argmax(iou, 1)
        return list(best_idx)
if __name__ == '__main__':
    from torch.utils.data import DataLoader
    data_path = '../VOCdevkit/pascal_voc_07_12_LS.hdf5'
    anchors_path = '../model_data/anchors.txt'
    train_dataset = yoloDataset(data_path, anchors_path)  # [3, 416, 416], [13, 13, 5, 10]
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=0)
    for i, (img, boxes) in enumerate(train_loader):
        print(img.shape)    # torch.Size([1, 3, 416, 416])
        print(boxes.shape)  # torch.Size([1, 13, 13, 5, 10])  4+1+5
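To make the encoding concrete, here is a hand-worked example using the default YOLO_ANCHORS for a single box with normalized (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.3):

# On the 13x13 grid: (cx, cy, w, h) * 13 = (6.5, 6.5, 2.6, 3.9) --> cell (i, j) = (6, 6).
# The shape IoU against the five anchors is maximized by anchor 2 = (3.33843, 5.47434):
#   iou = (2.6 * 3.9) / (3.33843 * 5.47434) ≈ 0.555  (the box lies inside the anchor).
# Stored targets, matching_true_boxes[6, 6, 2]:
#   tx = 6.5 - 6 = 0.5,  ty = 6.5 - 6 = 0.5
#   tw = log(2.6 / 3.33843) ≈ -0.250,  th = log(3.9 / 5.47434) ≈ -0.339
# detectors_mask[6, 6, 2] = 1, and true_boxes[6, 6, 2] keeps (0.5, 0.5, 0.2, 0.3).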
nets/darketnet19.py
input : img[b,3,416,416]
output : feas[b,1024,13,13]
process:
1. features_26 = (cov_bn_leaky3 --> maxpool)*2 -->
   (bottleneck_block --> maxpool)*2 -->
   bottleneck_x2_block
2. features_13 = features_26 --> maxpool --> bbx22
Code
import torch
import torch.nn as nn
import math

def cov_bn_leaky3(inplanes, outplanes):
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=3, padding=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def cov_bn_leaky1(inplanes, outplanes):
    return nn.Sequential(
        nn.Conv2d(inplanes, outplanes, kernel_size=1),
        nn.BatchNorm2d(outplanes),
        nn.LeakyReLU(0.1)
    )

def bottleneck_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        cov_bn_leaky3(inplanes, outplanes),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

def bottleneck_x2_block(inplanes, outplanes, bottleneck_filters):
    return nn.Sequential(
        bottleneck_block(inplanes, outplanes, bottleneck_filters),
        cov_bn_leaky1(outplanes, bottleneck_filters),
        cov_bn_leaky3(bottleneck_filters, outplanes)
    )

class darknet_body(nn.Module):
    def __init__(self):
        super(darknet_body, self).__init__()
        self.cbl1 = cov_bn_leaky3(3, 32)
        self.cbl2 = cov_bn_leaky3(32, 64)
        self.bb1 = bottleneck_block(64, 128, 64)
        self.bb2 = bottleneck_block(128, 256, 128)
        self.bbx21 = bottleneck_x2_block(256, 512, 256)
        self.bbx22 = bottleneck_x2_block(512, 1024, 512)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.features_26 = nn.Sequential(self.cbl1, self.maxpool, self.cbl2, self.maxpool, self.bb1,
                                         self.maxpool, self.bb2, self.maxpool, self.bbx21)
        self.features_13 = nn.Sequential(self.features_26, self.maxpool, self.bbx22)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def forward(self, x):
        # step-by-step shapes:
        # out = self.cbl1(x)       # [1, 32, 416, 416]
        # out = self.maxpool(out)  # [1, 32, 208, 208]
        # out = self.cbl2(out)     # [1, 64, 208, 208]
        # out = self.maxpool(out)  # [1, 64, 104, 104]
        # out = self.bb1(out)      # [1, 128, 104, 104]
        # out = self.maxpool(out)  # [1, 128, 52, 52]
        # out = self.bb2(out)      # [1, 256, 52, 52]
        # out = self.maxpool(out)  # [1, 256, 26, 26]
        # out = self.bbx21(out)    # [1, 512, 26, 26]
        # out = self.maxpool(out)  # [1, 512, 13, 13]
        # out = self.bbx22(out)    # [1, 1024, 13, 13]
        x = self.features_13(x)
        return x

def darknet19(inputs):
    """Generate a Darknet-19 head for ImageNet classification."""
    body = darknet_body()(inputs)                 # [b, 1024, 13, 13]
    logits = nn.Conv2d(1024, 1000, (1, 1))(body)  # [b, 1000, 13, 13]
    logits = nn.AdaptiveAvgPool2d(1)(logits)      # global average pooling (Darknet-19 pools before softmax)
    logits = nn.Softmax(1)(logits.flatten(1))     # [b, 1000]
    return logits

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    # y = cov_bn_leaky1(3,10)(x)
    # y = bottleneck_block(3,30,20)
    # y = bottleneck_x2_block(3,30,20)(x)
    # net = darknet_body()
    # y = net(x)
    y = darknet_body()
    print('y.features_26 :', y.features_26)
    print('\n')
    print('y.bbx22 :', y.bbx22)
    # for i in y.children():
    #     print(i)
nets/yolo_model.py
(1) yolo_body
input : [1,3,416,416]
output : [1, 13, 13, 125]
process:
1. fea_26, fea_13
2. torch.cat([fea_26, fea_13], 1) -->
   cov_bn_leaky3, cov_bn_leaky1 -->
   transpose
(2) yolo_decoder
inputs:
feats: tensor, [None,13,13,125]
anchors: array-like, anchor box widths and heights. (5,2)
num_classes: int, number of target classes. 20
outputs:
box_xy[1, 13, 13, 5, 2]
box_wh[1, 13, 13, 5, 2]
box_conf[1, 13, 13, 5, 1]
box_class_pred[1, 13, 13, 5, 20]
process:
Apply the decoding formulas; this is the inverse of the encoding step.
Code
import sys
sys.path.append('..')  # make the project root importable when running this file directly
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
from nets.darketnet19 import cov_bn_leaky1, cov_bn_leaky3, darknet_body

voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41],
                        [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def grid(h, w):
    '''Cell indices of an h x w grid, flattened to [h*w, 2].'''
    cx = torch.repeat_interleave(torch.arange(h), w).view(-1, 1)
    cy = torch.Tensor.repeat(torch.arange(w), h).view(-1, 1)
    return torch.cat([cx, cy], 1)
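# For example (values worked out by hand):
#   grid(2, 2) --> tensor([[0, 0],
#                          [0, 1],
#                          [1, 0],
#                          [1, 1]])
# i.e. one (cx, cy) pair per cell, in the same order as a reshaped (h, w) map.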
class yolo_body(nn.Module):
    def __init__(self, num_anchors=5, num_classes=20):
        super(yolo_body, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.darknet = darknet_body()
        self.fea_13 = nn.Sequential(self.darknet.features_13, cov_bn_leaky3(1024, 1024),
                                    cov_bn_leaky3(1024, 1024))
        self.fea_26 = nn.Sequential(self.darknet.features_26, cov_bn_leaky1(512, 64))
        # the head must be built here, not inside forward(), so that its
        # parameters are registered with the module and actually get trained
        self.head = nn.Sequential(cov_bn_leaky3(1280, 1024),
                                  cov_bn_leaky1(1024, self.num_anchors * (self.num_classes + 5)))

    def pass_through(self, x):
        '''Space-to-depth: [b, c, 2h, 2w] --> [b, 4c, h, w].'''
        return torch.cat([x[:, :, ::2, ::2], x[:, :, ::2, 1::2], x[:, :, 1::2, ::2], x[:, :, 1::2, 1::2]], 1)

    def forward(self, x):
        fea_13 = self.fea_13(x)               # [b, 1024, 13, 13]
        fea_26 = self.fea_26(x)               # [b, 64, 26, 26]
        fea_26 = self.pass_through(fea_26)    # [b, 256, 13, 13]
        out = torch.cat([fea_26, fea_13], 1)  # [b, 1280, 13, 13]
        out = self.head(out)                  # [b, 125, 13, 13]
        out = torch.transpose(out, 1, 3)      # [b, 13, 13, 125]
        return out  # inputs: [1,3,416,416] --> outputs: [1, 13, 13, 125]

'''
A functional version of the same head:
def yolo_body(inputs, num_anchors=5, num_classes=20):
    darknet = darknet_body()
    features_26 = darknet.features_26
    features_13 = darknet.features_13
    fea_13 = nn.Sequential(features_13, cov_bn_leaky3(1024,1024),
                           cov_bn_leaky3(1024,1024))(inputs)
    fea_26 = nn.Sequential(features_26, cov_bn_leaky1(512,64))(inputs)
    fea_26 = pass_through(fea_26)
    out = torch.cat([fea_26, fea_13], 1)
    out = cov_bn_leaky3(1280,1024)(out)
    out = cov_bn_leaky1(1024, num_anchors*(num_classes+5))(out)
    out = torch.transpose(out, 1, 3)
    print('out.shape:', out.shape)
    return out  # inputs: [1,3,416,416] --> outputs: [1, 13, 13, 125]
'''

def yolo_decoder(feats, anchors, num_classes):
    ''' Convert final layer features to bounding box parameters.
    inputs:
        feats: tensor, [None,13,13,125]
        anchors: array-like, anchor box widths and heights.
        num_classes: int, number of target classes.
    outputs:
        box_xy, box_wh, box_confidence, box_class_probs
    '''
    grids = feats.shape[1:3]  # torch.Size([13, 13])
    num_anchors = len(anchors)  # 5
    anchors_wh = Variable(torch.from_numpy(anchors)).view(1, 1, 1, num_anchors, 2)  # [1, 1, 1, 5, 2]
    anchors_cxy = grid(grids[0], grids[1]).view(-1, grids[0], grids[1], 1, 2)  # [1, 13, 13, 1, 2]
    feats = feats.view(-1, grids[0], grids[1], num_anchors, num_classes + 5)  # [1,13,13,125] --> [1,13,13,5,25]
    box_xy = torch.sigmoid(feats[..., :2])               # [1,13,13,5,2]
    box_wh = torch.exp(feats[..., 2:4])                  # [1,13,13,5,2]
    box_confidence = torch.sigmoid(feats[..., 4:5])      # [1,13,13,5,1]
    box_class_probs = torch.softmax(feats[..., 5:], -1)  # [1,13,13,5,20]
    # invert the encoding: cell offsets --> fractions of the image
    box_xy = (box_xy + anchors_cxy) / torch.tensor(list(grids))  # [1, 13, 13, 5, 2]
    box_wh = box_wh * anchors_wh / torch.tensor(list(grids))     # [1, 13, 13, 5, 2]
    return box_xy, box_wh, box_confidence, box_class_probs

if __name__ == '__main__':
    x = torch.randn([1, 3, 416, 416])
    net = yolo_body()
    params = []
    params_dict = dict(net.named_parameters())
    print(net(x).shape)  # torch.Size([1, 13, 13, 125])
    # box_xy, box_wh, box_confidence, box_class_probs = yolo_decoder(feats=net(x), anchors=voc_anchors, num_classes=20)
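A minimal sketch of where the 1280 channels come from: pass_through is a space-to-depth rearrangement that turns the 26x26 map into four 13x13 maps stacked on the channel axis, which are then concatenated with the 1024-channel trunk:

import torch

x26 = torch.randn(1, 64, 26, 26)  # output of fea_26
parts = [x26[:, :, ::2, ::2], x26[:, :, ::2, 1::2],
         x26[:, :, 1::2, ::2], x26[:, :, 1::2, 1::2]]
x13 = torch.cat(parts, 1)
print(x13.shape)  # torch.Size([1, 256, 13, 13]); 256 + 1024 = 1280 after the cat in forward()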
loss.py
input : pred(b, 13, 13, 125), target(b, 13, 13, 5, 10)
output : total_loss
process:
1. Data preparation
   target --> true_boxes, detectors_mask, matching_true_boxes
   pred --> sigmoid --> pred_d_boxes
   pred --> yolo_decoder() --> pred_xy, pred_wh, pred_confidence, pred_class_prob
2. Positive-sample loss: best_iou/1 - pred_confidence, detectors_mask --> objects_loss
3. Negative-sample loss: (pred_xy, pred_wh), true_boxes --> iou --> object_detections;
   object_detections, detectors_mask, pred_confidence --> no_objects_loss
4. Class loss: matching_true_boxes[...,-1], pred_class_prob, detectors_mask --> classification_loss
5. Box loss: matching_true_boxes[...,:4], pred_d_boxes, detectors_mask --> coordinates_loss
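Putting the pieces together, the total loss implemented below can be written as (m = detectors_mask, d = object_detections, i.e. the IoU > 0.6 indicator; hats mark predictions; the λ are the constructor scales):

L = \frac{1}{2}\Big[\lambda_{obj}\sum m\,(1-\hat{C})^2 + \lambda_{noobj}\sum (1-d)(1-m)\,\hat{C}^2 + \lambda_{cls}\sum m\,(p-\hat{p})^2 + \lambda_{coord}\sum m\,(t-\hat{t})^2\Big]

With rescore_confidence=True the first term uses (IoU - \hat{C}) instead of (1 - \hat{C}).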
Code
import torch
import numpy as np
import torch.nn as nn
from nets.yolo_model import yolo_decoder

voc_anchors = np.array([[1.08, 1.19], [3.42, 4.41],
                        [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]])
'''
model_body.output     (b, 13, 13, 125)
detectors_mask_input  (b, 13, 13, 5, 1)
matching_boxes_input  (b, 13, 13, 5, 5)
'''

class yoloLoss(nn.Module):
    def __init__(self, object_scale, no_object_scale, class_scale,
                 coordinates_scale, anchors, num_classes,
                 rescore_confidence=False, print_loss=False):  # e.g. criterion = yoloLoss(5,1,1,1,anchors,20)
        super(yoloLoss, self).__init__()
        self.object_scale = object_scale
        self.no_object_scale = no_object_scale
        self.class_scale = class_scale
        self.coordinates_scale = coordinates_scale
        self.rescore_confidence = rescore_confidence
        self.print_loss = print_loss
        self.anchors = anchors
        self.num_classes = num_classes

    def compute_iou(self, box_t, box_p):
        '''Element-wise IoU. box_t [b,13,13,5,4], box_p [b,13,13,5,4], corners (x1,y1,x2,y2).'''
        # 1 lt, rd --> wh --> inter + areas --> iou
        lt = torch.maximum(box_t[..., :2], box_p[..., :2])
        rd = torch.minimum(box_t[..., 2:], box_p[..., 2:])
        wh = (rd - lt).clamp(min=0)      # [b,13,13,5,2]
        inter = wh[..., 0] * wh[..., 1]  # [b,13,13,5]
        area_t = (box_t[..., 3] - box_t[..., 1]) * (box_t[..., 2] - box_t[..., 0])
        area_p = (box_p[..., 3] - box_p[..., 1]) * (box_p[..., 2] - box_p[..., 0])
        iou = inter / (area_t + area_p - inter + 1e-10)
        return iou  # [b,13,13,5]

    def yolo_loss(self, pred, target):
        # 1 data preparation
        num_anchors = len(self.anchors)
        yolo_output = pred                     # [b, 13, 13, 125]
        true_boxes = target[..., :4]           # [b, 13, 13, 5, 4]
        detectors_mask = target[..., 4:5]      # [b, 13, 13, 5, 1]
        matching_true_boxes = target[..., 5:]  # [b, 13, 13, 5, 5]
        pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_decoder(
            yolo_output, anchors=self.anchors, num_classes=self.num_classes)
        # predicted offsets
        yolo_output_shape = yolo_output.shape[1:3]  # torch.Size([13, 13])
        feats = yolo_output.view(-1, yolo_output_shape[0], yolo_output_shape[1],
                                 num_anchors, self.num_classes + 5)  # [b, 13, 13, 5, 25]
        pred_d_boxes = torch.cat((torch.sigmoid(feats[..., 0:2]), feats[..., 2:4]), -1)  # [b, 13, 13, 5, 4]
        # 2 IoU between true_boxes and (pred_xy, pred_wh)
        true_box = torch.cat([(true_boxes[..., :2] - true_boxes[..., 2:4] / 2.),
                              (true_boxes[..., :2] + true_boxes[..., 2:4] / 2.)], -1)
        pred_box = torch.cat([(pred_xy - pred_wh / 2.), (pred_xy + pred_wh / 2.)], -1)  # [b, 13, 13, 5, 4]
        iou = self.compute_iou(true_box, pred_box)    # [b, 13, 13, 5]
        best_iou = iou.unsqueeze(-1)                  # [b, 13, 13, 5, 1]
        object_detections = (best_iou > 0.6).float()  # [b, 13, 13, 5, 1]
        # 3 loss
        # 3.1 no-object loss
        no_objects_loss = self.no_object_scale * (1 - object_detections) * (1 - detectors_mask) \
                          * torch.square(pred_confidence)
        # 3.2 object loss
        if self.rescore_confidence:
            objects_loss = self.object_scale * detectors_mask * torch.square(best_iou - pred_confidence)
        else:
            objects_loss = self.object_scale * detectors_mask * torch.square(1 - pred_confidence)
        # 3.3 confidence loss = object loss + no-object loss
        confidence_loss = (objects_loss + no_objects_loss).sum()
        # 3.4 class loss; pred_class_prob [b,13,13,5,20]
        matching_classes = matching_true_boxes[..., 4]  # [b, 13, 13, 5]
        s1, s2, s3, s4 = matching_classes.shape
        one_hot = torch.eye(self.num_classes)
        matching_classes = one_hot[matching_classes.flatten().long()].view(s1, s2, s3, s4, self.num_classes)
        classification_loss = (self.class_scale * detectors_mask *
                               torch.square(matching_classes - pred_class_prob)).sum()
        # 3.5 box loss
        matching_boxes = matching_true_boxes[..., 0:4]
        coordinates_loss = (self.coordinates_scale * detectors_mask *
                            torch.square(matching_boxes - pred_d_boxes)).sum()
        total_loss = 0.5 * (confidence_loss + classification_loss + coordinates_loss)
        return total_loss
train.py
process:
1. Load the data
2. Load the model
3. Loss function
4. Update the parameters
Code
import os
import torch
import numpy as np
from loss import yoloLoss
from torch.autograd import Variable
from nets.yolo_model import yolo_body
from torch.utils.data import DataLoader
from data_process.data_encoder_3 import get_classes, get_anchors, yoloDataset

# 1 parameters
use_gpu = False
learning_rate = 0.001
num_epochs = 1
batch_size = 1
# 2 model
net = yolo_body()
params = []
params_dict = dict(net.named_parameters())
for k, v in params_dict.items():
    # both branches use the same lr here; the split is kept so the backbone
    # lr can be tuned separately later
    if k.startswith('features'):
        params += [{'params': [v], 'lr': learning_rate * 1}]
    else:
        params += [{'params': [v], 'lr': learning_rate * 1}]
# 3 loss + optimizer
anchors_path = 'model_data/anchors.txt'
classes_path = 'model_data/pascal_classes.txt'
anchors = get_anchors(anchors_path)
classes = get_classes(classes_path)
num_classes = len(classes)
cost = yoloLoss(5, 1, 1, 1, anchors, num_classes)
optimizer = torch.optim.SGD(params, lr=learning_rate, momentum=0.9, weight_decay=5e-4)
# 4 data
data_path = 'VOCdevkit/pascal_voc_07_12_LS.hdf5'
train_dataset = yoloDataset(data_path, anchors_path)  # [3, 416, 416], [13, 13, 5, 10]
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
# 5 train
num_iter = 0
best_test_loss = np.inf
for epoch in range(num_epochs):
    net.train()
    if epoch == 30:
        learning_rate = 0.0001
    if epoch == 40:
        learning_rate = 0.00001
    for params_group in optimizer.param_groups:
        params_group['lr'] = learning_rate
    print('\n\nStarting epoch %d / %d' % (epoch + 1, num_epochs))
    print('Learning Rate for this epoch: {}'.format(learning_rate))
    total_loss = 0.
    for i, (img, targets) in enumerate(train_loader):
        imgs = Variable(img).to(torch.float32)  # [b, 3, 416, 416]
        targets = Variable(targets)             # [b, 13, 13, 5, 10]
        pred = net(imgs)
        loss = cost.yolo_loss(pred, targets)
        total_loss += loss.item()  # accumulate so average_loss is meaningful
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 5 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f, average_loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.data.item(), total_loss / (i + 1)))
        num_iter += 1
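One practical addition, not in the original script: save the weights once training finishes so that predict.py has a checkpoint to load (the yolo.pth filename is an assumption):

torch.save(net.state_dict(), 'yolo.pth')  # hypothetical checkpoint path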
predict.py
process:
1. Data processing
2. Prediction
3. Filtering
4. Drawing boxes
Code
'''
# 1 img process
# 2 predict --> decoder
# 3 filter_boxes
# 4 draw
'''
from torch.autograd import Variable
import torchvision.transforms as transforms
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import colorsys, os, torch, cv2
from nets.yolo_model import yolo_body, yolo_decoder
from data_process.data_encoder_3 import get_classes, get_anchors

def yolo_boxes_to_corners(box_xy, box_wh):
    '''(cx, cy, w, h) --> corners in (y1, x1, y2, x2) order.'''
    box_mins = box_xy - (box_wh / 2.)
    box_maxes = box_xy + (box_wh / 2.)
    return torch.cat([box_mins[..., 1:2], box_mins[..., 0:1],
                      box_maxes[..., 1:2], box_maxes[..., 0:1]], -1)

def yolo_filter_boxes(boxes, box_confidence, box_class_probs, threshold=.6):
    '''
    inputs:
        boxes            [1,13,13,5,4]
        box_confidence   [1,13,13,5,1]
        box_class_probs  [1,13,13,5,20]
    outputs:
        boxes[n,4], scores[n], classes[n]
    '''
    box_scores = box_confidence * box_class_probs  # [1,13,13,5,20]
    box_class_scores, box_classes = torch.max(box_scores, -1)  # [1,13,13,5], [1,13,13,5]
    prediction_mask = box_class_scores >= threshold  # [1,13,13,5]
    boxes = boxes[prediction_mask]              # [n,4]
    scores = box_class_scores[prediction_mask]  # [n]
    classes = box_classes[prediction_mask]      # [n]
    return boxes, scores, classes

def nms(bboxes, scores, threshold=0.5):
    '''Greedy non-maximum suppression; returns the indices of the kept boxes.'''
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    _, order = scores.sort(0, descending=True)
    keep = []
    while order.numel() > 0:
        i = order.item() if order.numel() == 1 else order[0].item()
        keep.append(i)
        if order.numel() == 1:
            break
        xx1 = x1[order[1:]].clamp(min=x1[i])
        yy1 = y1[order[1:]].clamp(min=y1[i])
        xx2 = x2[order[1:]].clamp(max=x2[i])  # was clamp(max=x1[i])
        yy2 = y2[order[1:]].clamp(max=y2[i])  # was clamp(max=y1[i])
        w = (xx2 - xx1).clamp(min=0)
        h = (yy2 - yy1).clamp(min=0)
        inter = w * h
        ove = inter / (areas[i] + areas[order[1:]] - inter)
        ids = torch.nonzero(ove <= threshold).squeeze()
        if ids.numel() == 0:
            break
        order = order[ids + 1]
    return torch.LongTensor(keep)
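# Quick sanity check (hand-computed values): two heavily-overlapping boxes and
# one far away; the lower-scoring overlapped box is suppressed.
#   bboxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.], [20., 20., 30., 30.]])
#   scores = torch.tensor([0.9, 0.8, 0.1])
#   nms(bboxes, scores, threshold=0.5)  # -> tensor([0, 2]); IoU(box0, box1) = 0.81 > 0.5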
def yolo_eval(yolo_outputs, image_shape=[416, 416],
              score_threshold=.6, iou_threshold=.5):
    ''' score filter + NMS
    box_xy          [1,13,13,5,2]
    box_wh          [1,13,13,5,2]
    box_confidence  [1,13,13,5,1]
    box_class_probs [1,13,13,5,20]
    '''
    box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs
    boxes = yolo_boxes_to_corners(box_xy, box_wh)  # [1, 13, 13, 5, 4], (y1, x1, y2, x2)
    # 1 score filter
    boxes, scores, classes = yolo_filter_boxes(
        boxes, box_confidence, box_class_probs, threshold=score_threshold)
    # map the boxes back onto the original image: (y, x, y, x) * (h, w, h, w)
    boxes = boxes * torch.tensor([image_shape[0], image_shape[1], image_shape[0], image_shape[1]])
    # 2 NMS
    keep = nms(boxes, scores, iou_threshold)
    return boxes[keep], scores[keep], classes[keep]

def detect_img():
    # 1 img process
    image_name = '000015.jpg'
    image = cv2.imread('VOCdevkit/VOC2007/JPEGImages/' + image_name)  # (375, 500, 3), BGR
    h, w, _ = image.shape
    img = cv2.resize(image, (416, 416))       # (416, 416, 3)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.array(img, np.float32) / 255.    # float32 so ToTensor matches the model weights
    transform = transforms.Compose([transforms.ToTensor(), ])
    img = transform(img)                      # torch.Size([3, 416, 416])
    img = img[None, :, :, :]                  # add the batch dimension
    # 2 predict --> decoder
    net = yolo_body()
    net.eval()
    print('load model...')
    # NOTE: the original script never loads trained weights; in practice load a
    # checkpoint here, e.g. net.load_state_dict(torch.load('yolo.pth'))
    print('predicting...')
    with torch.no_grad():  # replaces the deprecated volatile=True
        feas = net(img)
    anchors_path = 'model_data/anchors.txt'
    classes_path = 'model_data/pascal_classes.txt'
    anchors = get_anchors(anchors_path)
    class_names = get_classes(classes_path)
    num_classes = len(class_names)
    pred = yolo_decoder(feas, anchors, num_classes)
    # box_xy[1,13,13,5,2], box_wh[1,13,13,5,2], confidence[1,13,13,5,1], box_class_probs[1,13,13,5,20]
    # 3 filter_boxes
    boxes, scores, classes = yolo_eval(pred, image_shape=[h, w])  # [n,4],[n],[n], on the original image
    print(boxes.shape, scores.shape, classes.shape)
    # 4 draw
    image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))  # ImageDraw needs a PIL image
    hsv_tuples = [(x / len(class_names), 1., 1.)
                  for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(
        map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
            colors))
    font = ImageFont.truetype(
        font='font/FiraMono-Medium.otf',
        size=np.floor(3e-2 * h + 0.5).astype('int32'))
    thickness = (h + w) // 300
    for i, c in reversed(list(enumerate(classes))):
        c = int(c)
        predicted_class = class_names[c]
        box = boxes[i].detach().numpy()
        score = scores[i].item()
        label = '{} {:.2f}'.format(predicted_class, score)
        draw = ImageDraw.Draw(image)
        label_size = draw.textsize(label, font)
        top, left, bottom, right = box
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
        print(label, (left, top), (right, bottom))
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])
        # My kingdom for a good redistributable image drawing library.
        for t in range(thickness):  # 't' so the box index 'i' is not shadowed
            draw.rectangle(
                [left + t, top + t, right - t, bottom - t],
                outline=colors[c])
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=colors[c])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw
    image.save(os.path.join('image', image_name), quality=90)
if __name__ == '__main__':
    detect_img()
Code download for this article:
Link: link
pwd=123a
References:
https://github.com/abeardear/pytorch-YOLO-v1
https://github.com/allanzelener/yad2k