2013年,Ross Girshick 等人于首次将CNN的方法应用在目标检测任务上,他们使用传统图像算法Selective Search产生候选区域,取得了极大的成功,这就是对目标检测领域影响深远的区域卷积神经网络(R-CNN)模型。
2015年,Ross Girshick 对此方法进行了改进,提出了Fast R-CNN模型。通过将不同区域的物体共用卷积层的计算,大大缩减了计算量,提高了处理速度,而且还引入了调整目标物体位置的回归方法,进一步提高了位置预测的准确性。
2015年,Shaoqing Ren 等人提出了Faster R-CNN模型,提出了RPN的方法来产生物体的候选区域,这一方法不再需要使用传统的图像处理算法来产生候选区域,进一步提升了处理速度。
2017年,Kaiming He 等人提出了Mask R-CNN模型,只需要在Faster R-CNN模型上添加比较少的计算量,就可以同时实现目标检测和物体实例分割两个任务。
以上都是基于R-CNN系列的著名模型,对目标检测方向的发展有着较大的影响力。此外,还有一些其他模型,比如SSD、YOLO(1, 2, 3)、R-FCN等也都是目标检测领域流行的模型结构。
边界框(bounding box)
锚框(Anchor box)
交并比(IoU)
size:图片尺寸
object:图片中包含的物体,一张图片可能中包含多个物体
name:昆虫名称
bndbox:物体真实框
difficult:识别是否困难
import cv2
def get_bbox(gt_bbox, gt_class):
# 对于一般的检测任务来说,一张图片上往往会有多个目标物体
# 设置参数MAX_NUM = 50, 即一张图片最多取50个真实框;如果真实
# 框的数目少于50个,则将不足部分的gt_bbox, gt_class和gt_score的各项数值全设置为0
MAX_NUM = 50
gt_bbox2 = np.zeros((MAX_NUM, 4))
gt_class2 = np.zeros((MAX_NUM,))
for i in range(len(gt_bbox)):
gt_bbox2[i, :] = gt_bbox[i, :]
gt_class2[i] = gt_class[i]
if i >= MAX_NUM:
break
return gt_bbox2, gt_class2
def get_img_data_from_file(record):
“”"
record is a dict as following,
record = {
‘im_file’: img_file,
‘im_id’: im_id,
‘h’: im_h,
‘w’: im_w,
‘is_crowd’: is_crowd,
‘gt_class’: gt_class,
‘gt_bbox’: gt_bbox,
‘gt_poly’: [],
‘difficult’: difficult
}
“”"
im_file = record[‘im_file’]
h = record[‘h’]
w = record[‘w’]
is_crowd = record[‘is_crowd’]
gt_class = record[‘gt_class’]
gt_bbox = record[‘gt_bbox’]
difficult = record[‘difficult’]
img = cv2.imread(im_file)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# check if h and w in record equals that read from img
assert img.shape[0] == int(h), \
"image height of {} inconsistent in record({}) and img file({})".format(
im_file, h, img.shape[0])
assert img.shape[1] == int(w), \
"image width of {} inconsistent in record({}) and img file({})".format(
im_file, w, img.shape[1])
gt_boxes, gt_labels = get_bbox(gt_bbox, gt_class)
# gt_bbox 用相对值
gt_boxes[:, 0] = gt_boxes[:, 0] / float(w)
gt_boxes[:, 1] = gt_boxes[:, 1] / float(h)
gt_boxes[:, 2] = gt_boxes[:, 2] / float(w)
gt_boxes[:, 3] = gt_boxes[:, 3] / float(h)
return img, gt_boxes, gt_labels, (h, w)
图像增广方法汇总:
随机改变亮暗、对比度和颜色等
随机填充
随机裁剪
随机缩放
随机翻转
随机打乱真实框排列顺序
图像增广主要作用:扩大训练数据集,抑制过拟合,提升模型的泛化能力,常用的方法见下面的程序。
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random
def image_augment(img, gtboxes, gtlabels, size, means=None):
# 随机改变亮暗、对比度和颜色等
img = random_distort(img)
# 随机填充
img, gtboxes = random_expand(img, gtboxes, fill=means)
# 随机裁剪
img, gtboxes, gtlabels, = random_crop(img, gtboxes, gtlabels)
# 随机缩放
img = random_interp(img, size)
# 随机翻转
img, gtboxes = random_flip(img, gtboxes)
# 随机打乱真实框排列顺序
gtboxes, gtlabels = shuffle_gtbox(gtboxes, gtlabels)
return img.astype('float32'), gtboxes.astype('float32'), gtlabels.astype('int32')
#随机改变亮暗、对比度和颜色等
def random_distort(img):
# 随机改变亮度
def random_brightness(img, lower=0.5, upper=1.5):
e = np.random.uniform(lower, upper)
return ImageEnhance.Brightness(img).enhance(e)
# 随机改变对比度
def random_contrast(img, lower=0.5, upper=1.5):
e = np.random.uniform(lower, upper)
return ImageEnhance.Contrast(img).enhance(e)
# 随机改变颜色
def random_color(img, lower=0.5, upper=1.5):
e = np.random.uniform(lower, upper)
return ImageEnhance.Color(img).enhance(e)
ops = [random_brightness, random_contrast, random_color]
np.random.shuffle(ops)
img = Image.fromarray(img)
img = ops[0](img)
img = ops[1](img)
img = ops[2](img)
img = np.asarray(img)
return img
#随机填充
def random_expand(img,
gtboxes,
max_ratio=4.,
fill=None,
keep_ratio=True,
thresh=0.5):
if random.random() > thresh:
return img, gtboxes
if max_ratio < 1.0:
return img, gtboxes
h, w, c = img.shape
ratio_x = random.uniform(1, max_ratio)
if keep_ratio:
ratio_y = ratio_x
else:
ratio_y = random.uniform(1, max_ratio)
oh = int(h * ratio_y)
ow = int(w * ratio_x)
off_x = random.randint(0, ow - w)
off_y = random.randint(0, oh - h)
out_img = np.zeros((oh, ow, c))
if fill and len(fill) == c:
for i in range(c):
out_img[:, :, i] = fill[i] * 255.0
out_img[off_y:off_y + h, off_x:off_x + w, :] = img
gtboxes[:, 0] = ((gtboxes[:, 0] * w) + off_x) / float(ow)
gtboxes[:, 1] = ((gtboxes[:, 1] * h) + off_y) / float(oh)
gtboxes[:, 2] = gtboxes[:, 2] / ratio_x
gtboxes[:, 3] = gtboxes[:, 3] / ratio_y
return out_img.astype('uint8'), gtboxes
import numpy as np
def multi_box_iou_xywh(box1, box2):
“”"
In this case, box1 or box2 can contain multi boxes.
Only two cases can be processed in this method:
1, box1 and box2 have the same shape, box1.shape == box2.shape
2, either box1 or box2 contains only one box, len(box1) == 1 or len(box2) == 1
If the shape of box1 and box2 does not match, and both of them contain multi boxes, it will be wrong.
“”"
assert box1.shape[-1] == 4, “Box1 shape[-1] should be 4.”
assert box2.shape[-1] == 4, “Box2 shape[-1] should be 4.”
b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
inter_x1 = np.maximum(b1_x1, b2_x1)
inter_x2 = np.minimum(b1_x2, b2_x2)
inter_y1 = np.maximum(b1_y1, b2_y1)
inter_y2 = np.minimum(b1_y2, b2_y2)
inter_w = inter_x2 - inter_x1
inter_h = inter_y2 - inter_y1
inter_w = np.clip(inter_w, a_min=0., a_max=None)
inter_h = np.clip(inter_h, a_min=0., a_max=None)
inter_area = inter_w * inter_h
b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
return inter_area / (b1_area + b2_area - inter_area)
def box_crop(boxes, labels, crop, img_shape):
x, y, w, h = map(float, crop)
im_w, im_h = map(float, img_shape)
boxes = boxes.copy()
boxes[:, 0], boxes[:, 2] = (boxes[:, 0] - boxes[:, 2] / 2) * im_w, (
boxes[:, 0] + boxes[:, 2] / 2) * im_w
boxes[:, 1], boxes[:, 3] = (boxes[:, 1] - boxes[:, 3] / 2) * im_h, (
boxes[:, 1] + boxes[:, 3] / 2) * im_h
crop_box = np.array([x, y, x + w, y + h])
centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
mask = np.logical_and(crop_box[:2] <= centers, centers <= crop_box[2:]).all(
axis=1)
boxes[:, :2] = np.maximum(boxes[:, :2], crop_box[:2])
boxes[:, 2:] = np.minimum(boxes[:, 2:], crop_box[2:])
boxes[:, :2] -= crop_box[:2]
boxes[:, 2:] -= crop_box[:2]
mask = np.logical_and(mask, (boxes[:, :2] < boxes[:, 2:]).all(axis=1))
boxes = boxes * np.expand_dims(mask.astype('float32'), axis=1)
labels = labels * mask.astype('float32')
boxes[:, 0], boxes[:, 2] = (boxes[:, 0] + boxes[:, 2]) / 2 / w, (
boxes[:, 2] - boxes[:, 0]) / w
boxes[:, 1], boxes[:, 3] = (boxes[:, 1] + boxes[:, 3]) / 2 / h, (
boxes[:, 3] - boxes[:, 1]) / h
return boxes, labels, mask.sum()
#随机裁剪
def random_crop(img,
boxes,
labels,
scales=[0.3, 1.0],
max_ratio=2.0,
constraints=None,
max_trial=50):
if len(boxes) == 0:
return img, boxes
if not constraints:
constraints = [(0.1, 1.0), (0.3, 1.0), (0.5, 1.0), (0.7, 1.0),
(0.9, 1.0), (0.0, 1.0)]
img = Image.fromarray(img)
w, h = img.size
crops = [(0, 0, w, h)]
for min_iou, max_iou in constraints:
for _ in range(max_trial):
scale = random.uniform(scales[0], scales[1])
aspect_ratio = random.uniform(max(1 / max_ratio, scale * scale), \
min(max_ratio, 1 / scale / scale))
crop_h = int(h * scale / np.sqrt(aspect_ratio))
crop_w = int(w * scale * np.sqrt(aspect_ratio))
crop_x = random.randrange(w - crop_w)
crop_y = random.randrange(h - crop_h)
crop_box = np.array([[(crop_x + crop_w / 2.0) / w,
(crop_y + crop_h / 2.0) / h,
crop_w / float(w), crop_h / float(h)]])
iou = multi_box_iou_xywh(crop_box, boxes)
if min_iou <= iou.min() and max_iou >= iou.max():
crops.append((crop_x, crop_y, crop_w, crop_h))
break
while crops:
crop = crops.pop(np.random.randint(0, len(crops)))
crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h))
if box_num < 1:
continue
img = img.crop((crop[0], crop[1], crop[0] + crop[2],
crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
img = np.asarray(img)
return img, crop_boxes, crop_labels
img = np.asarray(img)
return img, boxes, labels
#随机缩放
def random_interp(img, size, interp=None):
interp_method = [
cv2.INTER_NEAREST,
cv2.INTER_LINEAR,
cv2.INTER_AREA,
cv2.INTER_CUBIC,
cv2.INTER_LANCZOS4,
]
if not interp or interp not in interp_method:
interp = interp_method[random.randint(0, len(interp_method) - 1)]
h, w, _ = img.shape
im_scale_x = size / float(w)
im_scale_y = size / float(h)
img = cv2.resize(
img, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=interp)
return img
#随机翻转
def random_flip(img, gtboxes, thresh=0.5):
if random.random() > thresh:
img = img[:, ::-1, :]
gtboxes[:, 0] = 1.0 - gtboxes[:, 0]
return img, gtboxes
#随机打乱真实框排列顺序
def shuffle_gtbox(gtbox, gtlabel):
gt = np.concatenate(
[gtbox, gtlabel[:, np.newaxis]], axis=1)
idx = np.arange(gt.shape[0])
np.random.shuffle(idx)
gt = gt[idx, :]
return gt[:, :4], gt[:, 4]
通过使用飞桨提供的API paddle.reader.xmap_readers可以开启多线程读取数据.
import functools
import paddle
def multithread_loader(datadir, batch_size= 10, mode=‘train’):
cname2cid = get_insect_names()
records = get_annotations(cname2cid, datadir)
def reader():
if mode == ‘train’:
np.random.shuffle(records)
img_size = get_img_size(mode)
batch_data = []
for record in records:
batch_data.append((record, img_size))
if len(batch_data) == batch_size:
yield batch_data
batch_data = []
img_size = get_img_size(mode)
if len(batch_data) > 0:
yield batch_data
def get_data(samples):
batch_data = []
for sample in samples:
record = sample[0]
img_size = sample[1]
img, gt_bbox, gt_labels, im_shape = get_img_data(record, size=img_size)
batch_data.append((img, gt_bbox, gt_labels, im_shape))
return make_array(batch_data)
mapper = functools.partial(get_data, )
return paddle.reader.xmap_readers(mapper, reader, 8, 10)
按一定规则在图片上产生一系列的候选区域,然后根据这些候选区域与图片上物体真实框之间的位置关系对候选区域进行标注。跟真实框足够接近的那些候选区域会被标注为正样本,同时将真实框的位置作为正样本的位置目标。偏离真实框较大的那些候选区域则会被标注为负样本,负样本不需要预测位置或者类别。
使用卷积神经网络提取图片特征并对候选区域的位置和类别进行预测。这样每个预测框就可以看成是一个样本,根据真实框相对它的位置和类别进行了标注而获得标签值,通过网络模型预测其位置和类别,将网络预测值和标签值进行比较,就可以建立起损失函数。
标注锚框的具体程序
#标注预测框的objectness
def get_objectness_label(img, gt_boxes, gt_labels, iou_threshold = 0.7,
anchors = [116, 90, 156, 198, 373, 326],
num_classes=7, downsample=32):
“”"
img 是输入的图像数据,形状是[N, C, H, W]
gt_boxes,真实框,维度是[N, 50, 4],其中50是真实框数目的上限,当图片中真实框不足50个时,不足部分的坐标全为0
真实框坐标格式是xywh,这里使用相对值
gt_labels,真实框所属类别,维度是[N, 50]
iou_threshold,当预测框与真实框的iou大于iou_threshold时不将其看作是负样本
anchors,锚框可选的尺寸
anchor_masks,通过与anchors一起确定本层级的特征图应该选用多大尺寸的锚框
num_classes,类别数目
downsample,特征图相对于输入网络的图片尺寸变化的比例
“”"
img_shape = img.shape
batchsize = img_shape[0]
num_anchors = len(anchors) // 2
input_h = img_shape[2]
input_w = img_shape[3]
# 将输入图片划分成num_rows x num_cols个小方块区域,每个小方块的边长是 downsample
# 计算一共有多少行小方块
num_rows = input_h // downsample
# 计算一共有多少列小方块
num_cols = input_w // downsample
label_objectness = np.zeros([batchsize, num_anchors, num_rows, num_cols])
label_classification = np.zeros([batchsize, num_anchors, num_classes, num_rows, num_cols])
label_location = np.zeros([batchsize, num_anchors, 4, num_rows, num_cols])
scale_location = np.ones([batchsize, num_anchors, num_rows, num_cols])
# 对batchsize进行循环,依次处理每张图片
for n in range(batchsize):
# 对图片上的真实框进行循环,依次找出跟真实框形状最匹配的锚框
for n_gt in range(len(gt_boxes[n])):
gt = gt_boxes[n][n_gt]
gt_cls = gt_labels[n][n_gt]
gt_center_x = gt[0]
gt_center_y = gt[1]
gt_width = gt[2]
gt_height = gt[3]
if (gt_height < 1e-3) or (gt_height < 1e-3):
continue
i = int(gt_center_y * num_rows)
j = int(gt_center_x * num_cols)
ious = []
for ka in range(num_anchors):
bbox1 = [0., 0., float(gt_width), float(gt_height)]
anchor_w = anchors[ka * 2]
anchor_h = anchors[ka * 2 + 1]
bbox2 = [0., 0., anchor_w/float(input_w), anchor_h/float(input_h)]
# 计算iou
iou = box_iou_xywh(bbox1, bbox2)
ious.append(iou)
ious = np.array(ious)
inds = np.argsort(ious)
k = inds[-1]
label_objectness[n, k, i, j] = 1
c = gt_cls
label_classification[n, k, c, i, j] = 1.
# for those prediction bbox with objectness =1, set label of location
dx_label = gt_center_x * num_cols - j
dy_label = gt_center_y * num_rows - i
dw_label = np.log(gt_width * input_w / anchors[k*2])
dh_label = np.log(gt_height * input_h / anchors[k*2 + 1])
label_location[n, k, 0, i, j] = dx_label
label_location[n, k, 1, i, j] = dy_label
label_location[n, k, 2, i, j] = dw_label
label_location[n, k, 3, i, j] = dh_label
# scale_location用来调节不同尺寸的锚框对损失函数的贡献,作为加权系数和位置损失函数相乘
scale_location[n, k, i, j] = 2.0 - gt_width * gt_height
# 目前根据每张图片上所有出现过的gt box,都标注出了objectness为正的预测框,剩下的预测框则默认objectness为0
# 对于objectness为1的预测框,标出了他们所包含的物体类别,以及位置回归的目标
return label_objectness.astype('float32'), label_location.astype('float32'), label_classification.astype('float32'), \
scale_location.astype('float32')
YOLO-V3算法使用的骨干网络是Darknet53。
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
from paddle.fluid.dygraph.base import to_variable
class ConvBNLayer(fluid.dygraph.Layer):
“”"
卷积 + 批归一化,BN层之后激活函数默认用leaky_relu
“”"
def init(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=“leaky”,
is_test=True):
super(ConvBNLayer, self).init()
self.conv = Conv2D(
num_channels=ch_in,
num_filters=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=False,
act=None)
self.batch_norm = BatchNorm(
num_channels=ch_out,
is_test=is_test,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02),
regularizer=L2Decay(0.)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.)))
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = fluid.layers.leaky_relu(x=out, alpha=0.1)
return out
class DownSample(fluid.dygraph.Layer):
“”"
下采样,图片尺寸减半,具体实现方式是使用stirde=2的卷积
“”"
def init(self,
ch_in,
ch_out,
filter_size=3,
stride=2,
padding=1,
is_test=True):
super(DownSample, self).__init__()
self.conv_bn_layer = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
is_test=is_test)
self.ch_out = ch_out
def forward(self, inputs):
out = self.conv_bn_layer(inputs)
return out
class BasicBlock(fluid.dygraph.Layer):
“”"
基本残差块的定义,输入x经过两层卷积,然后接第二层卷积的输出和输入x相加
“”"
def init(self, ch_in, ch_out, is_test=True):
super(BasicBlock, self).init()
self.conv1 = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
padding=0,
is_test=is_test
)
self.conv2 = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out*2,
filter_size=3,
stride=1,
padding=1,
is_test=is_test
)
def forward(self, inputs):
conv1 = self.conv1(inputs)
conv2 = self.conv2(conv1)
out = fluid.layers.elementwise_add(x=inputs, y=conv2, act=None)
return out
class LayerWarp(fluid.dygraph.Layer):
“”"
添加多层残差块,组成Darknet53网络的一个层级
“”"
def init(self, ch_in, ch_out, count, is_test=True):
super(LayerWarp,self).init()
self.basicblock0 = BasicBlock(ch_in,
ch_out,
is_test=is_test)
self.res_out_list = []
for i in range(1, count):
res_out = self.add_sublayer("basic_block_%d" % (i), #使用add_sublayer添加子层
BasicBlock(ch_out*2,
ch_out,
is_test=is_test))
self.res_out_list.append(res_out)
def forward(self,inputs):
y = self.basicblock0(inputs)
for basic_block_i in self.res_out_list:
y = basic_block_i(y)
return y
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
class DarkNet53_conv_body(fluid.dygraph.Layer):
def init(self,
is_test=True):
super(DarkNet53_conv_body, self).__init__()
self.stages = DarkNet_cfg[53]
self.stages = self.stages[0:5]
# 第一层卷积
self.conv0 = ConvBNLayer(
ch_in=3,
ch_out=32,
filter_size=3,
stride=1,
padding=1,
is_test=is_test)
# 下采样,使用stride=2的卷积来实现
self.downsample0 = DownSample(
ch_in=32,
ch_out=32 * 2,
is_test=is_test)
# 添加各个层级的实现
self.darknet53_conv_block_list = []
self.downsample_list = []
for i, stage in enumerate(self.stages):
conv_block = self.add_sublayer(
"stage_%d" % (i),
LayerWarp(32*(2**(i+1)),
32*(2**i),
stage,
is_test=is_test))
self.darknet53_conv_block_list.append(conv_block)
# 两个层级之间使用DownSample将尺寸减半
for i in range(len(self.stages) - 1):
downsample = self.add_sublayer(
"stage_%d_downsample" % i,
DownSample(ch_in=32*(2**(i+1)),
ch_out=32*(2**(i+2)),
is_test=is_test))
self.downsample_list.append(downsample)
def forward(self,inputs):
out = self.conv0(inputs)
#print("conv1:",out.numpy())
out = self.downsample0(out)
#print("dy:",out.numpy())
blocks = []
for i, conv_block_i in enumerate(self.darknet53_conv_block_list): #依次将各个层级作用在输入上面
out = conv_block_i(out)
blocks.append(out)
if i < len(self.stages) - 1:
out = self.downsample_list[i](out)
return blocks[-1:-4:-1] # 将C0, C1, C2作为返回值
#从骨干网络输出特征图C0得到跟预测相关的特征图P0
class YoloDetectionBlock(fluid.dygraph.Layer):
# define YOLO-V3 detection head
# 使用多层卷积和BN提取特征
def init(self,ch_in,ch_out,is_test=True):
super(YoloDetectionBlock, self).init()
assert ch_out % 2 == 0, \
"channel {} cannot be divided by 2".format(ch_out)
self.conv0 = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
padding=0,
is_test=is_test
)
self.conv1 = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out*2,
filter_size=3,
stride=1,
padding=1,
is_test=is_test
)
self.conv2 = ConvBNLayer(
ch_in=ch_out*2,
ch_out=ch_out,
filter_size=1,
stride=1,
padding=0,
is_test=is_test
)
self.conv3 = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out*2,
filter_size=3,
stride=1,
padding=1,
is_test=is_test
)
self.route = ConvBNLayer(
ch_in=ch_out*2,
ch_out=ch_out,
filter_size=1,
stride=1,
padding=0,
is_test=is_test
)
self.tip = ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out*2,
filter_size=3,
stride=1,
padding=1,
is_test=is_test
)
def forward(self, inputs):
out = self.conv0(inputs)
out = self.conv1(out)
out = self.conv2(out)
out = self.conv3(out)
route = self.route(out)
tip = self.tip(route)
return route, tip
#定义Sigmoid函数
def sigmoid(x):
return 1./(1.0 + np.exp(-x))
#将网络特征图输出的[tx, ty, th, tw]转化成预测框的坐标[x1, y1, x2, y2]
def get_yolo_box_xxyy(pred, anchors, num_classes, downsample):
“”"
pred是网络输出特征图转化成的numpy.ndarray
anchors 是一个list。表示锚框的大小,
例如 anchors = [116, 90, 156, 198, 373, 326],表示有三个锚框,
第一个锚框大小[w, h]是[116, 90],第二个锚框大小是[156, 198],第三个锚框大小是[373, 326]
“”"
batchsize = pred.shape[0]
num_rows = pred.shape[-2]
num_cols = pred.shape[-1]
input_h = num_rows * downsample
input_w = num_cols * downsample
num_anchors = len(anchors) // 2
# pred的形状是[N, C, H, W],其中C = NUM_ANCHORS * (5 + NUM_CLASSES)
# 对pred进行reshape
pred = pred.reshape([-1, num_anchors, 5+num_classes, num_rows, num_cols])
pred_location = pred[:, :, 0:4, :, :]
pred_location = np.transpose(pred_location, (0,3,4,1,2))
anchors_this = []
for ind in range(num_anchors):
anchors_this.append([anchors[ind*2], anchors[ind*2+1]])
anchors_this = np.array(anchors_this).astype('float32')
# 最终输出数据保存在pred_box中,其形状是[N, H, W, NUM_ANCHORS, 4],
# 其中最后一个维度4代表位置的4个坐标
pred_box = np.zeros(pred_location.shape)
for n in range(batchsize):
for i in range(num_rows):
for j in range(num_cols):
for k in range(num_anchors):
pred_box[n, i, j, k, 0] = j
pred_box[n, i, j, k, 1] = i
pred_box[n, i, j, k, 2] = anchors_this[k][0]
pred_box[n, i, j, k, 3] = anchors_this[k][1]
# 这里使用相对坐标,pred_box的输出元素数值在0.~1.0之间
pred_box[:, :, :, :, 0] = (sigmoid(pred_location[:, :, :, :, 0]) + pred_box[:, :, :, :, 0]) / num_cols
pred_box[:, :, :, :, 1] = (sigmoid(pred_location[:, :, :, :, 1]) + pred_box[:, :, :, :, 1]) / num_rows
pred_box[:, :, :, :, 2] = np.exp(pred_location[:, :, :, :, 2]) * pred_box[:, :, :, :, 2] / input_w
pred_box[:, :, :, :, 3] = np.exp(pred_location[:, :, :, :, 3]) * pred_box[:, :, :, :, 3] / input_h
# 将坐标从xywh转化成xyxy
pred_box[:, :, :, :, 0] = pred_box[:, :, :, :, 0] - pred_box[:, :, :, :, 2] / 2.
pred_box[:, :, :, :, 1] = pred_box[:, :, :, :, 1] - pred_box[:, :, :, :, 3] / 2.
pred_box[:, :, :, :, 2] = pred_box[:, :, :, :, 0] + pred_box[:, :, :, :, 2]
pred_box[:, :, :, :, 3] = pred_box[:, :, :, :, 1] + pred_box[:, :, :, :, 3]
pred_box = np.clip(pred_box, 0., 1.0)
return pred_box
paddle.fluid.layers.yolov3_loss(x, gt_box, gt_label, anchors, anchor_mask, class_num, ignore_thresh, downsample_ratio, gt_score=None, use_label_smooth=False, name=None)
x: 输出特征图。
gt_box: 真实框。
gt_label: 真实框标签。
ignore_thresh,预测框与真实框IoU阈值超过ignore_thresh时,不作为负样本,YOLO-V3模型里设置为0.7。
downsample_ratio,特征图P0的下采样比例,使用Darknet53骨干网络时为32。
gt_score,真实框的置信度,在使用了mixup技巧时用到。
use_label_smooth,一种训练技巧,如不使用,设置为False。
name,该层的名字,比如’yolov3_loss’,默认值为None,一般无需设置。
class Upsample(fluid.dygraph.Layer):
def init(self, scale=2):
super(Upsample,self).init()
self.scale = scale
def forward(self, inputs):
# get dynamic upsample output shape
shape_nchw = fluid.layers.shape(inputs)
shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
shape_hw.stop_gradient = True
in_shape = fluid.layers.cast(shape_hw, dtype='int32')
out_shape = in_shape * self.scale
out_shape.stop_gradient = True
# reisze by actual_shape
out = fluid.layers.resize_nearest(
input=inputs, scale=self.scale, actual_shape=out_shape)
return out
```python
class YOLOv3(fluid.dygraph.Layer):
def __init__(self, num_classes=7, is_train=True):
super(YOLOv3,self).__init__()
self.is_train = is_train
self.num_classes = num_classes
# 提取图像特征的骨干代码
self.block = DarkNet53_conv_body(
is_test = not self.is_train)
self.block_outputs = []
self.yolo_blocks = []
self.route_blocks_2 = []
# 生成3个层级的特征图P0, P1, P2
for i in range(3):
# 添加从ci生成ri和ti的模块
yolo_block = self.add_sublayer(
"yolo_detecton_block_%d" % (i),
YoloDetectionBlock(
ch_in=512//(2**i)*2 if i==0 else 512//(2**i)*2 + 512//(2**i),
ch_out = 512//(2**i),
is_test = not self.is_train))
self.yolo_blocks.append(yolo_block)
num_filters = 3 * (self.num_classes + 5)
# 添加从ti生成pi的模块,这是一个Conv2D操作,输出通道数为3 * (num_classes + 5)
block_out = self.add_sublayer(
"block_out_%d" % (i),
Conv2D(num_channels=512//(2**i)*2,
num_filters=num_filters,
filter_size=1,
stride=1,
padding=0,
act=None,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.))))
self.block_outputs.append(block_out)
if i < 2:
# 对ri进行卷积
route = self.add_sublayer("route2_%d"%i,
ConvBNLayer(ch_in=512//(2**i),
ch_out=256//(2**i),
filter_size=1,
stride=1,
padding=0,
is_test=(not self.is_train)))
self.route_blocks_2.append(route)
# 将ri放大以便跟c_{i+1}保持同样的尺寸
self.upsample = Upsample()
def forward(self, inputs):
outputs = []
blocks = self.block(inputs)
for i, block in enumerate(blocks):
if i > 0:
# 将r_{i-1}经过卷积和上采样之后得到特征图,与这一级的ci进行拼接
block = fluid.layers.concat(input=[route, block], axis=1)
# 从ci生成ti和ri
route, tip = self.yolo_blocks[i](block)
# 从ti生成pi
block_out = self.block_outputs[i](tip)
# 将pi放入列表
outputs.append(block_out)
if i < 2:
# 对ri进行卷积调整通道数
route = self.route_blocks_2[i](route)
# 对ri进行放大,使其尺寸和c_{i+1}保持一致
route = self.upsample(route)
return outputs
def get_loss(self, outputs, gtbox, gtlabel, gtscore=None,
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326],
anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
ignore_thresh=0.7,
use_label_smooth=False):
"""
使用fluid.layers.yolov3_loss,直接计算损失函数,过程更简洁,速度也更快
"""
self.losses = []
downsample = 32
for i, out in enumerate(outputs): # 对三个层级分别求损失函数
anchor_mask_i = anchor_masks[i]
loss = fluid.layers.yolov3_loss(
x=out, # out是P0, P1, P2中的一个
gt_box=gtbox, # 真实框坐标
gt_label=gtlabel, # 真实框类别
gt_score=gtscore, # 真实框得分,使用mixup训练技巧时需要,不使用该技巧时直接设置为1,形状与gtlabel相同
anchors=anchors, # 锚框尺寸,包含[w0, h0, w1, h1, ..., w8, h8]共9个锚框的尺寸
anchor_mask=anchor_mask_i, # 筛选锚框的mask,例如anchor_mask_i=[3, 4, 5],将anchors中第3、4、5个锚框挑选出来给该层级使用
class_num=self.num_classes, # 分类类别数
ignore_thresh=ignore_thresh, # 当预测框与真实框IoU > ignore_thresh,标注objectness = -1
downsample_ratio=downsample, # 特征图相对于原图缩小的倍数,例如P0是32, P1是16,P2是8
use_label_smooth=False) # 使用label_smooth训练技巧时会用到,这里没用此技巧,直接设置为False
self.losses.append(fluid.layers.reduce_mean(loss)) #reduce_mean对每张图片求和
downsample = downsample // 2 # 下一级特征图的缩放倍数会减半
return sum(self.losses) # 对每个层级求和
def get_pred(self,
outputs,
im_shape=None,
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326],
anchor_masks = [[6, 7, 8], [3, 4, 5], [0, 1, 2]],
valid_thresh = 0.01):
downsample = 32
total_boxes = []
total_scores = []
for i, out in enumerate(outputs):
anchor_mask = anchor_masks[i]
anchors_this_level = []
for m in anchor_mask:
anchors_this_level.append(anchors[2 * m])
anchors_this_level.append(anchors[2 * m + 1])
boxes, scores = fluid.layers.yolo_box(
x=out,
img_size=im_shape,
anchors=anchors_this_level,
class_num=self.num_classes,
conf_thresh=valid_thresh,
downsample_ratio=downsample,
name="yolo_box" + str(i))
total_boxes.append(boxes)
total_scores.append(
fluid.layers.transpose(
scores, perm=[0, 2, 1]))
downsample = downsample // 2
yolo_boxes = fluid.layers.concat(total_boxes, axis=1)
yolo_scores = fluid.layers.concat(total_scores, axis=2)
return yolo_boxes, yolo_scores
import time
import os
import paddle
import paddle.fluid as fluid
ANCHORS = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
ANCHOR_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
IGNORE_THRESH = .7
NUM_CLASSES = 7
def get_lr(base_lr = 0.0001, lr_decay = 0.1):
bd = [10000, 20000]
lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay]
learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
return learning_rate
if __name__ == '__main__':
TRAINDIR = '/home/aistudio/work/insects/train'
TESTDIR = '/home/aistudio/work/insects/test'
VALIDDIR = '/home/aistudio/work/insects/val'
with fluid.dygraph.guard():
model = YOLOv3(num_classes = NUM_CLASSES, is_train=True) #创建模型
learning_rate = get_lr()
opt = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.0005),
parameter_list=model.parameters()) #创建优化器
train_loader = multithread_loader(TRAINDIR, batch_size= 10, mode='train') #创建训练数据读取器
valid_loader = multithread_loader(VALIDDIR, batch_size= 10, mode='valid') #创建验证数据读取器
MAX_EPOCH = 200
for epoch in range(MAX_EPOCH):
for i, data in enumerate(train_loader()):
img, gt_boxes, gt_labels, img_scale = data
gt_scores = np.ones(gt_labels.shape).astype('float32')
gt_scores = to_variable(gt_scores)
img = to_variable(img)
gt_boxes = to_variable(gt_boxes)
gt_labels = to_variable(gt_labels)
outputs = model(img) #前向传播,输出[P0, P1, P2]
loss = model.get_loss(outputs, gt_boxes, gt_labels, gtscore=gt_scores,
anchors = ANCHORS,
anchor_masks = ANCHOR_MASKS,
ignore_thresh=IGNORE_THRESH,
use_label_smooth=False) # 计算损失函数
loss.backward() # 反向传播计算梯度
opt.minimize(loss) # 更新参数
model.clear_gradients()
if i % 1 == 0:
timestring = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
print('{}[TRAIN]epoch {}, iter {}, output loss: {}'.format(timestring, epoch, i, loss.numpy()))
# save params of model
if (epoch % 5 == 0) or (epoch == MAX_EPOCH -1):
fluid.save_dygraph(model.state_dict(), 'yolo_epoch{}'.format(epoch))
# 每个epoch结束之后在验证集上进行测试
model.eval()
for i, data in enumerate(valid_loader()):
img, gt_boxes, gt_labels, img_scale = data
gt_scores = np.ones(gt_labels.shape).astype('float32')
gt_scores = to_variable(gt_scores)
img = to_variable(img)
gt_boxes = to_variable(gt_boxes)
gt_labels = to_variable(gt_labels)
outputs = model(img)
loss = model.get_loss(outputs, gt_boxes, gt_labels, gtscore=gt_scores,
anchors = ANCHORS,
anchor_masks = ANCHOR_MASKS,
ignore_thresh=IGNORE_THRESH,
use_label_smooth=False)
if i % 1 == 0:
timestring = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time()))
print('{}[VALID]epoch {}, iter {}, output loss: {}'.format(timestring, epoch, i, loss.numpy()))
model.train()
paddle.fluid.layers.yolo_box(x, imgsize, anchors, classnum, confthresh, downsampleratio, clipbbox=True, name=None)
x,网络输出特征图,例如上面提到的P0或者P1、P2。
img_size,输入图片尺寸。
anchors,使用到的anchor的尺寸,如[10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask: 每个层级上使用的anchor的掩码,[[6, 7, 8], [3, 4, 5], [0, 1, 2]]
class_num,物体类别数。
conf_thresh, 置信度阈值,得分低于该阈值的预测框位置数值不用计算直接设置为0.0。
downsample_ratio, 特征图的下采样比例,例如P0是32,P1是16,P2是8。
name=None,名字,例如’yolo_box’,一般无需设置,默认值为None。
返回值包括两项,boxes和scores,其中boxes是所有预测框的坐标值,scores是所有预测框的得分。
#画图展示目标物体边界框
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.image import imread
import math
#非极大值抑制
def nms(bboxes, scores, score_thresh, nms_thresh, pre_nms_topk, i=0, c=0):
"""
nms
"""
inds = np.argsort(scores)
inds = inds[::-1]
keep_inds = []
while(len(inds) > 0):
cur_ind = inds[0]
cur_score = scores[cur_ind]
# if score of the box is less than score_thresh, just drop it
if cur_score < score_thresh:
break
keep = True
for ind in keep_inds:
current_box = bboxes[cur_ind]
remain_box = bboxes[ind]
iou = box_iou_xyxy(current_box, remain_box)
if iou > nms_thresh:
keep = False
break
if i == 0 and c == 4 and cur_ind == 951:
print('suppressed, ', keep, i, c, cur_ind, ind, iou)
if keep:
keep_inds.append(cur_ind)
inds = inds[1:]
return np.array(keep_inds)
#多分类非极大值抑制
def multiclass_nms(bboxes, scores, score_thresh=0.01, nms_thresh=0.45, pre_nms_topk=1000, pos_nms_topk=100):
"""
This is for multiclass_nms
"""
batch_size = bboxes.shape[0]
class_num = scores.shape[1]
rets = []
for i in range(batch_size):
bboxes_i = bboxes[i]
scores_i = scores[i]
ret = []
for c in range(class_num):
scores_i_c = scores_i[c]
keep_inds = nms(bboxes_i, scores_i_c, score_thresh, nms_thresh, pre_nms_topk, i=i, c=c)
if len(keep_inds) < 1:
continue
keep_bboxes = bboxes_i[keep_inds]
keep_scores = scores_i_c[keep_inds]
keep_results = np.zeros([keep_scores.shape[0], 6])
keep_results[:, 0] = c
keep_results[:, 1] = keep_scores[:]
keep_results[:, 2:6] = keep_bboxes[:, :]
ret.append(keep_results)
if len(ret) < 1:
rets.append(ret)
continue
ret_i = np.concatenate(ret, axis=0)
scores_i = ret_i[:, 1]
if len(scores_i) > pos_nms_topk:
inds = np.argsort(scores_i)[::-1]
inds = inds[:pos_nms_topk]
ret_i = ret_i[inds]
rets.append(ret_i)
return rets
import json
import os
ANCHORS = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
ANCHOR_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
VALID_THRESH = 0.01
NMS_TOPK = 400
NMS_POSK = 100
NMS_THRESH = 0.45
NUM_CLASSES = 7
if __name__ == '__main__':
TRAINDIR = '/home/aistudio/work/insects/train/images'
TESTDIR = '/home/aistudio/work/insects/test/images'
VALIDDIR = '/home/aistudio/work/insects/val'
with fluid.dygraph.guard():
model = YOLOv3(num_classes=NUM_CLASSES, is_train=False)
params_file_path = '/home/aistudio/work/yolo_epoch50'
model_state_dict, _ = fluid.load_dygraph(params_file_path)
model.load_dict(model_state_dict)
model.eval()
total_results = []
test_loader = test_data_loader(TESTDIR, batch_size= 1, mode='test')
for i, data in enumerate(test_loader()):
img_name, img_data, img_scale_data = data
img = to_variable(img_data)
img_scale = to_variable(img_scale_data)
outputs = model.forward(img)
bboxes, scores = model.get_pred(outputs,
im_shape=img_scale,
anchors=ANCHORS,
anchor_masks=ANCHOR_MASKS,
valid_thresh = VALID_THRESH)
bboxes_data = bboxes.numpy()
scores_data = scores.numpy()
result = multiclass_nms(bboxes_data, scores_data,
score_thresh=VALID_THRESH,
nms_thresh=NMS_THRESH,
pre_nms_topk=NMS_TOPK,
pos_nms_topk=NMS_POSK)
for j in range(len(result)):
result_j = result[j]
img_name_j = img_name[j]
total_results.append([img_name_j, result_j.tolist()])
print('processed {} pictures'.format(len(total_results)))
print('')
json.dump(total_results, open('pred_results.json', 'w'))
def single_image_data_loader(filename, test_image_size=608, mode='test'):
"""
加载测试用的图片,测试数据没有groundtruth标签
"""
batch_size= 1
def reader():
batch_data = []
img_size = test_image_size
file_path = os.path.join(filename)
img = cv2.imread(file_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
H = img.shape[0]
W = img.shape[1]
img = cv2.resize(img, (img_size, img_size))
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
mean = np.array(mean).reshape((1, 1, -1))
std = np.array(std).reshape((1, 1, -1))
out_img = (img / 255.0 - mean) / std
out_img = out_img.astype('float32').transpose((2, 0, 1))
img = out_img #np.transpose(out_img, (2,0,1))
im_shape = [H, W]
batch_data.append((image_name.split('.')[0], img, im_shape))
if len(batch_data) == batch_size:
yield make_test_array(batch_data)
batch_data = []
return reader
定义绘制预测框的画图函数,代
INSECT_NAMES = ['Boerner', 'Leconte', 'Linnaeus',
'acuminatus', 'armandi', 'coleoptera', 'linnaeus']
# 定义画矩形框的函数
def draw_rectangle(currentAxis, bbox, edgecolor = 'k', facecolor = 'y', fill=False, linestyle='-'):
# currentAxis,坐标轴,通过plt.gca()获取
# bbox,边界框,包含四个数值的list, [x1, y1, x2, y2]
# edgecolor,边框线条颜色
# facecolor,填充颜色
# fill, 是否填充
# linestype,边框线型
# patches.Rectangle需要传入左上角坐标、矩形区域的宽度、高度等参数
rect=patches.Rectangle((bbox[0], bbox[1]), bbox[2]-bbox[0]+1, bbox[3]-bbox[1]+1, linewidth=1,
edgecolor=edgecolor,facecolor=facecolor,fill=fill, linestyle=linestyle)
currentAxis.add_patch(rect)
# 定义绘制预测结果的函数
def draw_results(result, filename, draw_thresh=0.5):
plt.figure(figsize=(10, 10))
im = imread(filename)
plt.imshow(im)
currentAxis=plt.gca()
colors = ['r', 'g', 'b', 'k', 'y', 'c', 'purple']
for item in result:
box = item[2:6]
label = int(item[0])
name = INSECT_NAMES[label]
if item[1] > draw_thresh:
draw_rectangle(currentAxis, box, edgecolor = colors[label])
plt.text(box[0], box[1], name, fontsize=12, color=colors[label])
import json
import paddle
import paddle.fluid as fluid
ANCHORS = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
ANCHOR_MASKS = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
VALID_THRESH = 0.01
NMS_TOPK = 400
NMS_POSK = 100
NMS_THRESH = 0.45
NUM_CLASSES = 7
if __name__ == '__main__':
image_name = '/home/aistudio/work/insects/test/images/2599.jpeg'
params_file_path = '/home/aistudio/work/yolo_epoch50'
with fluid.dygraph.guard():
model = YOLOv3(num_classes=NUM_CLASSES, is_train=False)
model_state_dict, _ = fluid.load_dygraph(params_file_path)
model.load_dict(model_state_dict)
model.eval()
total_results = []
test_loader = single_image_data_loader(image_name, mode='test')
for i, data in enumerate(test_loader()):
img_name, img_data, img_scale_data = data
img = to_variable(img_data)
img_scale = to_variable(img_scale_data)
outputs = model.forward(img)
bboxes, scores = model.get_pred(outputs,
im_shape=img_scale,
anchors=ANCHORS,
anchor_masks=ANCHOR_MASKS,
valid_thresh = VALID_THRESH)
bboxes_data = bboxes.numpy()
scores_data = scores.numpy()
results = multiclass_nms(bboxes_data, scores_data,
score_thresh=VALID_THRESH,
nms_thresh=NMS_THRESH,
pre_nms_topk=NMS_TOPK,
pos_nms_topk=NMS_POSK)
result = results[0]
draw_results(result, image_name, draw_thresh=0.5)