YOLOv5对小目标检测效果不佳的原因之一是:小目标样本尺寸较小,而YOLOv5的下采样倍数较大,较深的特征图很难学习到小目标的特征。因此本文提出增加一个小目标检测层,在较浅的特征图上进行检测,具体流程如图5所示。YOLOv5原本只对最后三个C3层的输出进行特征预测,但小目标在连续下采样的过程中丢失了特征信息,导致小目标检测效果不理想。因此,我们添加了一层特征预测:新增的预测层下采样次数更少,小目标在特征图上的分辨率更高,有助于模型学习小目标的特征。
在head中,增加了小目标的检测层Detect3
对应的配置文件为yolov5-master\models\yolov5x.yaml,原始内容如下:
# YOLOv5 by Ultralytics, AGPL-3.0 license
# Stock yolov5x model definition: the Detect layer at the end of `head` reads
# three feature maps (layers 17, 20 and 23), labelled P3/8, P4/16 and P5/32.
# Parameters
nc: 80 # number of classes
depth_multiple: 1.33 # model depth multiple
width_multiple: 1.25 # layer channel multiple
# One anchor row per Detect input; each row holds three width,height pairs.
anchors:
- [10,13, 16,30, 33,23] # P3/8
- [30,61, 62,45, 59,119] # P4/16
- [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]], # 2
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]], # 4
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]], # 6
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]], # 8
[-1, 1, SPPF, [1024, 5]], # 9
]
# YOLOv5 v6.0 head
# Layer indices continue from the backbone, so the first head entry is layer 10.
head:
[[-1, 1, Conv, [512, 1, 1]], # 10
[-1, 1, nn.Upsample, [None, 2, 'nearest']], # 11
[[-1, 6], 1, Concat, [1]], # 12 cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]], # 14
[-1, 1, nn.Upsample, [None, 2, 'nearest']], # 15
[[-1, 4], 1, Concat, [1]], # 16 cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]], # 18
[[-1, 14], 1, Concat, [1]], # 19 cat head P4
[-1, 3, C3, [512, False]], # 20 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]], # 21
[[-1, 10], 1, Concat, [1]], # 22 cat head P5
[-1, 3, C3, [1024, False]], # 23 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
]
使用下面的k-means脚本重新聚类先验框(anchors)时,需要填入以下内容:
输入图像大小,input_shape = [640, 640]。
生成的先验框数量,anchors_num = 12(共4个检测层,每层3个先验框)。
输入(xml标注文件所在目录)和输出(yolo_anchors.txt)的路径。
# -------------------------------------------------------------------------------------------------------#
# kmeans虽然会对数据集中的框进行聚类,但是很多数据集由于框的大小相近,聚类出来的9个框相差不大,
# 这样的框反而不利于模型的训练。因为不同的特征层适合不同大小的先验框,shape越小的特征层适合越大的先验框
# 原始网络的先验框已经按大中小比例分配好了,不进行聚类也会有非常好的效果。
# -------------------------------------------------------------------------------------------------------#
import glob
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
def cas_ratio(box, cluster):
    """Return the worst-case side ratio between boxes and anchor shapes.

    For every box/anchor pairing the width ratio and the height ratio are
    taken in both directions (``box / cluster`` and ``cluster / box``) and
    the largest of those four numbers is kept.  Identical shapes give 1.0;
    the value grows as the shapes diverge.

    Args:
        box: Array-like whose last axis holds the box sizes (the callers in
            this file pass width, height).
        cluster: Array-like whose last axis holds the anchor sizes; it must
            broadcast against ``box``.

    Returns:
        The maximum ratio, with the last axis reduced away.  For a single
        box of shape ``(2,)`` and ``k`` anchors of shape ``(k, 2)`` the
        result has shape ``(k,)``.
    """
    box = np.asarray(box, dtype=np.float64)
    cluster = np.asarray(cluster, dtype=np.float64)
    # The element-wise maximum of the two directions followed by a max over
    # the last axis equals the max over the concatenated ratios.
    return np.maximum(box / cluster, cluster / box).max(axis=-1)
def avg_ratio(box, cluster):
    """Return the mean best-anchor ratio over all boxes.

    Each box is scored by the smallest ``cas_ratio`` it achieves against any
    anchor (its best match); the scores are then averaged.  Lower is better,
    and 1.0 would mean every box coincides with some anchor.

    Args:
        box: Array of shape ``(n, 2)`` with one box size per row.
        cluster: Array of shape ``(k, 2)`` with one anchor size per row.

    Returns:
        The average, over the ``n`` boxes, of the minimum ratio to any of
        the ``k`` anchors.
    """
    box = np.asarray(box, dtype=np.float64)
    # Broadcasting (n, 1, 2) against (k, 2) yields the whole (n, k) ratio
    # matrix in one call instead of one Python-level call per box.
    ratios = cas_ratio(box[:, np.newaxis, :], cluster)
    return np.mean(np.min(ratios, axis=1))
def kmeans(box, k):
    """Cluster box sizes into ``k`` anchor shapes.

    This is a k-medians style loop that uses ``cas_ratio`` as the distance:
    every box is assigned to the anchor with the smallest ratio, then each
    anchor is moved to the per-axis median of its members.  The loop stops
    when the assignment no longer changes between two iterations.

    Randomness comes from NumPy's global generator.  The function does not
    re-seed it, so a ``np.random.seed(...)`` call made by the caller gives
    reproducible anchors.

    Args:
        box: Array of shape ``(n, 2)`` with one box size per row.
        k: Number of anchors to produce.

    Returns:
        A tuple ``(cluster, near)`` where ``cluster`` is the ``(k, 2)``
        array of anchor sizes and ``near`` is the length-``n`` array giving
        the index of the anchor each box was assigned to.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``k`` is larger than
            the number of boxes.
    """
    box = np.asarray(box, dtype=np.float64)
    # Total number of boxes.
    row = box.shape[0]

    # Use k distinct boxes as the initial centres.  Fancy indexing copies,
    # so updating ``cluster`` below never modifies ``box``.
    cluster = box[np.random.choice(row, k, replace=False)]

    # No assignment exists before the first pass.  Starting from None (not
    # an array of zeros) keeps the loop from stopping early when the very
    # first assignment happens to put every box in cluster 0.
    last_clu = None
    iteration = 0
    while True:
        # (n, k) matrix of ratios between every box and every centre.
        distance = cas_ratio(box[:, np.newaxis, :], cluster)
        # Index of the closest centre for each box.
        near = np.argmin(distance, axis=1)
        if last_clu is not None and (last_clu == near).all():
            break
        # Move each centre to the median of the boxes assigned to it.
        for j in range(k):
            members = box[near == j]
            # A centre that attracted no boxes keeps its previous position;
            # the median of an empty selection would turn it into NaN.
            if len(members):
                cluster[j] = np.median(members, axis=0)
        last_clu = near
        if iteration % 5 == 0:
            print('iter: {:d}. avg_ratio:{:.2f}'.format(iteration, avg_ratio(box, cluster)))
        iteration += 1
    return cluster, near
def load_data(path):
    """Collect normalised box sizes from the annotation XML files in a folder.

    Every file matched by ``<path>/*xml`` is parsed.  For each ``object``
    element the ``bndbox`` corners are divided by the image size read from
    ``size/width`` and ``size/height``, and the resulting width and height
    are recorded.

    Files whose size is missing, non-numeric or not positive are skipped, as
    are objects whose ``bndbox`` is missing or malformed, or whose width or
    height is not positive (such boxes would cause a division by zero in the
    ratio metric used for clustering).  The number skipped is printed.

    Args:
        path: Directory containing the annotation XML files.

    Returns:
        A float array of shape ``(n, 2)`` with one ``[width, height]`` row
        per accepted box; shape ``(0, 2)`` when nothing was found.
    """
    data = []
    skipped_files = 0
    skipped_boxes = 0
    # Look for boxes in every XML file.
    for xml_file in tqdm(glob.glob('{}/*xml'.format(path))):
        tree = ET.parse(xml_file)
        try:
            height = int(tree.findtext('./size/height'))
            width = int(tree.findtext('./size/width'))
        except (TypeError, ValueError):
            # findtext returned None (tag missing) or the text is not an int.
            skipped_files += 1
            continue
        if height <= 0 or width <= 0:
            skipped_files += 1
            continue
        # Record the normalised width and height of every object.
        for obj in tree.iter('object'):
            try:
                xmin = int(float(obj.findtext('bndbox/xmin'))) / width
                ymin = int(float(obj.findtext('bndbox/ymin'))) / height
                xmax = int(float(obj.findtext('bndbox/xmax'))) / width
                ymax = int(float(obj.findtext('bndbox/ymax'))) / height
            except (TypeError, ValueError):
                skipped_boxes += 1
                continue
            box_w = xmax - xmin
            box_h = ymax - ymin
            if box_w <= 0 or box_h <= 0:
                skipped_boxes += 1
                continue
            data.append([box_w, box_h])
    if skipped_files or skipped_boxes:
        print('Skipped {:d} xml files and {:d} boxes with invalid sizes.'.format(
            skipped_files, skipped_boxes))
    # reshape keeps the (n, 2) layout even when no box was collected.
    return np.array(data, dtype=np.float64).reshape(-1, 2)
if __name__ == '__main__':
    # Seed NumPy's global random generator before clustering.
    np.random.seed(0)
    # -------------------------------------------------------------#
    #   Running this script reads the annotation xml files under
    #   `path`, clusters the box sizes and writes yolo_anchors.txt.
    # -------------------------------------------------------------#
    # Network input size as [height, width].
    input_shape = [640, 640]
    # Number of anchors to generate.
    anchors_num = 12
    # -------------------------------------------------------------#
    #   Dataset location (VOC-style xml annotations) and the file
    #   the anchors are written to.
    # -------------------------------------------------------------#
    path = r'D:\learn\sdxx\mbjc\dataset\训练用数据集\714data\jx\jx882'
    anchors_path = r"D:\learn\sdxx\mbjc\dataset\训练用数据集\714data\yolo_anchors.txt"
    # -------------------------------------------------------------#
    #   Load every xml; boxes are stored as width, height
    #   normalised by the image size.
    # -------------------------------------------------------------#
    print('Load xmls.')
    data = load_data(path)
    print('Load xmls done.')
    # Clustering draws anchors_num distinct boxes as initial centres, so it
    # cannot start with fewer boxes than anchors.
    if len(data) < anchors_num:
        raise SystemExit('Found {:d} boxes in {}, need at least {:d}.'.format(
            len(data), path, anchors_num))
    # -------------------------------------------------------------#
    #   Run the clustering.
    # -------------------------------------------------------------#
    print('K-means boxes.')
    cluster, near = kmeans(data, anchors_num)
    print('K-means boxes done.')
    # Scale the normalised sizes to pixels of the network input (width, height).
    scale = np.array([input_shape[1], input_shape[0]])
    data = data * scale
    cluster = cluster * scale
    # -------------------------------------------------------------#
    #   Plot every cluster's boxes together with its anchor.
    # -------------------------------------------------------------#
    for j in range(anchors_num):
        members = data[near == j]
        plt.scatter(members[:, 0], members[:, 1])
        plt.scatter(cluster[j][0], cluster[j][1], marker='x', c='black')
    plt.savefig("kmeans_for_anchors.jpg")
    plt.show()
    print('Save kmeans_for_anchors.jpg in root dir.')
    # Order the anchors by area, smallest first.
    cluster = cluster[np.argsort(cluster[:, 0] * cluster[:, 1])]
    print('avg_ratio:{:.2f}'.format(avg_ratio(data, cluster)))
    print(cluster)
    # Write the anchors as "w,h, w,h, ..." with sizes truncated to integers.
    # The context manager closes the file even if the write fails.
    with open(anchors_path, 'w') as f:
        f.write(", ".join("%d,%d" % (w, h) for w, h in cluster))
复制一份yolov5x.yaml,并重命名为yolov5x_xmb.yaml
用生成的先验框数值直接替换anchors中的原有数值,并在head中加入小目标检测层:
# YOLOv5 by Ultralytics, AGPL-3.0 license
# yolov5x variant with an extra detection scale at stride 4 for small objects:
# the Detect layer at the end of `head` reads four feature maps
# (layers 21, 24, 27 and 30) instead of three.
# Parameters
nc: 80 # number of classes
depth_multiple: 1.33 # model depth multiple
width_multiple: 1.25 # layer channel multiple
# Four anchor rows, one per Detect input, each with three width,height pairs.
# Scale labels assume rows map to the Detect inputs in order, as in the stock config.
anchors:
- [10,18, 14,25, 20,33] # P2/4
- [15,44, 25,45, 17,72] # P3/8
- [26,78, 35,58, 21,135] # P4/16
- [47,96, 33,215, 52,378] # P5/32
# Stock yolov5x anchors, kept for reference:
# - [10,13, 16,30, 33,23] # P3/8
# - [30,61, 62,45, 59,119] # P4/16
# - [116,90, 156,198, 373,326] # P5/32
# YOLOv5 v6.0 backbone
backbone:
# [from, number, module, args]
[[-1, 1, Conv, [64, 6, 2, 2]], # 0-P1/2
[-1, 1, Conv, [128, 3, 2]], # 1-P2/4
[-1, 3, C3, [128]], # 2
[-1, 1, Conv, [256, 3, 2]], # 3-P3/8
[-1, 6, C3, [256]], # 4
[-1, 1, Conv, [512, 3, 2]], # 5-P4/16
[-1, 9, C3, [512]], # 6
[-1, 1, Conv, [1024, 3, 2]], # 7-P5/32
[-1, 3, C3, [1024]], # 8
[-1, 1, SPPF, [1024, 5]], # 9
]
# YOLOv5 v6.0 head
# Layer indices continue from the backbone, so the first head entry is layer 10.
head:
[[-1, 1, Conv, [512, 1, 1]], # 10
[-1, 1, nn.Upsample, [None, 2, 'nearest']], # 11
[[-1, 6], 1, Concat, [1]], # 12 cat backbone P4
[-1, 3, C3, [512, False]], # 13
[-1, 1, Conv, [256, 1, 1]], # 14
[-1, 1, nn.Upsample, [None, 2, 'nearest']], # 15
[[-1, 4], 1, Concat, [1]], # 16 cat backbone P3
[-1, 3, C3, [256, False]], # 17 (P3/8 top-down feature, no longer a Detect input)
# Added: one more upsampling stage that merges backbone layer 2 (stride 4).
[-1, 1, Conv, [128, 1, 1]], # 18
[-1, 1, nn.Upsample, [None, 2, 'nearest']], # 19
[[-1, 2], 1, Concat, [1]], # 20 cat backbone P2
[-1, 3, C3, [128, False]], # 21 (P2/4-xsmall), new Detect input
[-1, 1, Conv, [256, 3, 2]], # 22
[[-1, 18], 1, Concat, [1]], # 23 cat head P3
[-1, 3, C3, [256, False]], # 24 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]], # 25
[[-1, 14], 1, Concat, [1]], # 26 cat head P4
[-1, 3, C3, [512, False]], # 27 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]], # 28
[[-1, 10], 1, Concat, [1]], # 29 cat head P5
[-1, 3, C3, [1024, False]], # 30 (P5/32-large)
[[21, 24, 27,30], 1, Detect, [nc, anchors]], # Detect(P2, P3, P4, P5)
]