This article explains how to implement object detection with the MXNet deep learning framework; the model implemented is YoloV5.
Environment:
python 3.8
mxnet 1.7.0
cuda 10.1
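For reference, a matching GPU build of MXNet can be installed via pip (that the CUDA 10.1 wheel is named mxnet-cu101 is an assumption based on the environment above):
pip install mxnet-cu101==1.7.0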
Image classification tells us roughly what kinds of objects an image contains, but not where each object is located or any further details about it. For application scenarios such as license plate recognition, traffic violation detection, face recognition, and motion capture, plain image classification is not enough.
This is where another important task in the image domain comes in: object detection and recognition. In traditional machine learning, a classic approach is to build an object-specific "filter" from HOG (Histogram of Oriented Gradients) features. A HOG filter records an object's edge and contour information; sliding this filter over different positions of different images and thresholding the response magnitude tells us where the filter matches the image content well, which completes the detection.
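As a minimal sketch of that classical pipeline, OpenCV ships a HOG descriptor with a pretrained pedestrian SVM acting as the "filter"; the file name and threshold below are illustrative assumptions:
import cv2

hog = cv2.HOGDescriptor()
hog.setSVMDetector(cv2.HOGDescriptor_getDefaultPeopleDetector())  # pretrained pedestrian "filter"
img = cv2.imread('street.jpg')                                    # hypothetical input image
rects, weights = hog.detectMultiScale(img, winStride=(8, 8))      # slide the filter over the image
for (x, y, w, h), score in zip(rects, weights):
    if float(score) > 0.5:  # keep locations whose response exceeds the threshold
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)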
I used the pill images from the HALCON example dataset: the first 100 images are annotated and the remaining 300 are kept for testing. Of the 100 annotated images, 90 go into the training set and 10 into the validation set.
pip install labelimg
Open cmd and run labelimg; the annotation tool shown below will appear:
First, create three folders:
DataImage: the 100 images to be annotated
DataLabel: an empty folder that will receive the annotation files generated by labelimg
test: the remaining 300 images; these need no annotation
The contents of DataImage and test look like this (DataImage as the example):
In labelimg, first set the image directory and the label save directory:
Three hotkeys are all you need with this tool: w starts drawing a box, a goes to the previous image, d goes to the next.
To annotate, press w to enter box-drawing mode, drag a box on the image, and type its label (the class the box belongs to); that completes one object. An image may contain multiple boxes and multiple classes, but be consistent: if an object is annotated in one image, the same kind of object must be annotated wherever else it appears, and it must always receive the same label (don't mark object A as label A in one image and as label B in the next). The final result looks like this:
When annotation is done, the label files appear in DataLabel in Pascal VOC XML format:
Of the XML label file we only need the object elements; parsing them is enough.
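A minimal parsing sketch using the standard library (the function name and example path are mine; the field layout is the standard Pascal VOC one):
import xml.etree.ElementTree as ET

def parse_voc_xml(xml_path):
    # collect (class_name, xmin, ymin, xmax, ymax) for every <object> element
    boxes = []
    root = ET.parse(xml_path).getroot()
    for obj in root.iter('object'):
        name = obj.find('name').text
        bnd = obj.find('bndbox')
        coords = [int(float(bnd.find(k).text)) for k in ('xmin', 'ymin', 'xmax', 'ymax')]
        boxes.append((name, *coords))
    return boxes

print(parse_voc_xml('./DataLabel/0001.xml'))  # hypothetical label file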
Paper link:
Network structure:
core: the loss computation and other core numerical routines
data: data-loading functions and classes
nets: the backbone and the standard YoloV5 network definition
utils: data pre- and post-processing helpers
Ctu_YoloV5.py: the YoloV5 training and inference classes; the main entry point of the project
import os,sys,cv2,json,colorsys,time
sys.path.append('.')
import mxnet as mx
import numpy as np
from mxnet import nd, autograd, gluon
from data.data_params import Create_pascol_voc_Data
from data.data_loader import YoloV5DataSet
from core.loss import ComputeLoss, build_targets
from nets.yolo import yolov5
from utils.utils_data import non_max_suppression, scale_coords, xywh2xyxy, process_batch, ap_per_class
from PIL import Image,ImageDraw,ImageFont
self.ctx = [mx.gpu(int(i)) for i in USEGPU.split(',') if i.strip()]  # e.g. USEGPU='0,1' -> [gpu(0), gpu(1)]
self.ctx = self.ctx if self.ctx else [mx.cpu()]                      # fall back to CPU when no GPU is given
The dataset class below is the input; the training DataLoader will be built from it later.
import os, cv2, random
import numpy as np
from mxnet import gluon
# xywhn2xyxy, xyxy2xywhn, random_perspective and augment_hsv are assumed to live
# in the project's utils module
from utils.utils_data import xywhn2xyxy, xyxy2xywhn, random_perspective, augment_hsv

class YoloV5DataSet(gluon.data.Dataset):
    def __init__(self, DataDir='./dataset/train', classes=80, img_sizes=640, shuffle=True, mode='train'):
super(YoloV5DataSet, self).__init__()
self.img_size=img_sizes
self.img_files = []
self.lbl_files = []
for f in os.listdir(os.path.join(DataDir,"images",mode + "_data")):
if not os.path.isfile(os.path.join(DataDir,"images",mode + "_data",f)):
continue
self.img_files.append(os.path.join(DataDir, "images",mode + "_data", f))
            self.lbl_files.append(os.path.join(DataDir, "labels", mode + "_data", os.path.splitext(f)[0] + ".txt"))
self.len = len(self.img_files)
self.images = []
self.labels = []
        for i in range(self.len):
            lbl_name = self.lbl_files[i]
            if os.path.exists(lbl_name):
                # each row: class, x_center, y_center, w, h (normalized); reshape so a
                # single-object file still yields a 2-D (1, 5) array
                lbl_data = np.loadtxt(lbl_name).reshape(-1, 5)
            else:
                lbl_data = np.zeros((0, 5), dtype="float64")
            self.labels.append(lbl_data)
#self.batch = batch_size
        self.shape = (img_sizes, img_sizes)
        self.classes = classes
        self.mosaic_border = [-img_sizes // 2, -img_sizes // 2]  # canvas border for mosaic augmentation
        self.path = DataDir
        self.shuffle = shuffle
        # per-class weights from the label statistics (currently overridden by
        # the uniform weights in the last line)
        class_ids = np.concatenate(self.labels, axis=0)[:, 0]
        bincount = np.sqrt(np.bincount(class_ids.astype("int32"), minlength=classes) + 10)
        self.weight = 1. / bincount
        self.weight = self.weight * classes / np.sum(self.weight)
        self.weight = np.ones((classes,), dtype="float32")
def __len__(self):
return len(self.img_files)
    def load_img(self, i):
        # load image i and resize it so its longest side equals img_size
        img = cv2.imread(self.img_files[i])
        h0, w0, _ = img.shape
        r = self.img_size / max(h0, w0)
        if r != 1:
            img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_CUBIC)
        return img, (h0, w0), img.shape[:2]
    def __getitem__(self, index):
        # 4-image mosaic augmentation: pick a random mosaic center, then combine
        # this image with 3 randomly chosen ones
        yc, xc = (int(random.uniform(-x, 2 * self.img_size + x)) for x in self.mosaic_border)
        indices = [index] + [random.choice(range(self.len)) for _ in range(3)]
        random.shuffle(indices)
labels4 = []
for i, index in enumerate(indices):
#img, _, (h,w) = self.images[index]
img, _, (h,w) = self.load_img(index)
# place img in img4
if i == 0: # top left
img4 = np.full((self.img_size * 2, self.img_size * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles
x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image)
x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image)
elif i == 1: # top right
x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, self.img_size * 2), yc
x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
elif i == 2: # bottom left
x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(self.img_size * 2, yc + h)
x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
elif i == 3: # bottom right
x1a, y1a, x2a, y2a = xc, yc, min(xc + w, self.img_size * 2), min(self.img_size * 2, yc + h)
x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax]
padw = x1a - x1b
padh = y1a - y1b
            labels = self.labels[index].copy()
            if labels.size > 0:
                labels[:, 1:] = xywhn2xyxy(labels[:, 1:], w, h, padw, padh)  # normalized xywh -> pixel xyxy
            labels4.append(labels)
# Concat/clip labels
labels4 = np.concatenate(labels4, 0)
labels4[:, 1:] = np.clip(labels4[:, 1:], 0, 2 * self.img_size) # clip when using random_perspective()
img, labels = random_perspective(img4, labels4, degrees=0.0, translate=0.1, scale=0.5, shear=0.0, perspective=0.0, border=self.mosaic_border)
nl = len(labels)
if nl:
labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3)
# Albumentations None
# HSV color-space
augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4)
# Flip left-right
if random.random() < 0.5:
img = np.fliplr(img)
if nl:
labels[:, 1] = 1 - labels[:, 1]
        labels_out = np.zeros((nl, 6))
        if nl:
            labels_out[:, 1:] = np.array(labels)  # column 0 is reserved for the image index within a batch
        # Convert HWC -> CHW
        img = img.transpose((2, 0, 1))
        img = np.ascontiguousarray(img)
        return img, labels_out
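Because each image yields a variable number of label rows, the DataLoader needs a custom batchify function. Below is a sketch of the usual YoloV5-style collate (not necessarily the project's exact loader): images are stacked, and label arrays are concatenated after writing each image's batch index into column 0.
def yolo_batchify(samples):
    imgs, labels = zip(*samples)
    for i, lbl in enumerate(labels):
        lbl[:, 0] = i  # image index within the batch
    return np.stack(imgs, 0), np.concatenate(labels, 0)

train_dataset = YoloV5DataSet(DataDir='./dataset/train', classes=1, img_sizes=640)
train_dataloader = gluon.data.DataLoader(train_dataset, batch_size=2, shuffle=True, batchify_fn=yolo_batchify)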
The YoloV5 network code used in this project:
class yolov5(HybridBlock):
def __init__(self,num_classes, batch_size = 16, mode="train", ctx=mx.cpu(), act="silu", gd=1, gw=1):
super(yolov5, self).__init__()
self.ctx = ctx
self.mode = mode
self.batch_size = batch_size
self.act = act
        # ---- backbone ----
        self.conv1 = conv(3, 16*gw, 6, 2, 2, act=self.act)
self.conv2 = conv(16*gw,32*gw,3,2,1, act=self.act)
self.c3_1 = eval(f'c3_rep{gd*1}')(32*gw,32*gw,1,True,group=1,e=0.5, act=self.act)
self.conv3 = conv(32*gw,64*gw,3,2,1, act=self.act)
self.c3_2 = eval(f'c3_rep{gd*2}')(64*gw,64*gw,2,True,group=1,e=0.5, act=self.act)
self.conv4 = conv(64*gw,128*gw,3,2,1, act=self.act)
self.c3_3 = eval(f'c3_rep{gd*3}')(128*gw,128*gw,3,True,group=1,e=0.5, act=self.act)
self.conv5 = conv(128*gw,256*gw,3,2,1, act=self.act)
self.c3_4 = eval(f'c3_rep{gd*1}')(256*gw,256*gw,1,True,group=1,e=0.5, act=self.act)
self.sppf = sppf(256*gw,256*gw,5, act=self.act)
        # ---- neck: FPN top-down path ----
        self.conv6 = conv(256*gw, 128*gw, 1, 1, act=self.act)
self.upsample1 = resize()
self.cat1 = cat(dim=1)
self.c3_5 = eval(f'c3_rep{gd*1}')(256*gw,128*gw,1,False,group=1,e=0.5, act=self.act)
self.conv7 = conv(128*gw,64*gw,1,1, act=self.act)
self.upsample2 = resize()
self.cat2 = cat(dim=1)
self.c3_6 = eval(f'c3_rep{gd*1}')(128*gw,64*gw,1,False,group=1,e=0.5, act=self.act)
        # ---- neck: PAN bottom-up path ----
        self.conv8 = conv(64*gw, 64*gw, 3, 2, 1, act=self.act)
self.cat3 = cat(dim=1)
self.c3_7 = eval(f'c3_rep{gd*1}')(128*gw,128*gw,1,False,group=1,e=0.5, act=self.act)
self.conv9 = conv(128*gw,128*gw,3,2,1, act=self.act)
self.cat4 = cat(dim=1)
self.c3_8 = eval(f'c3_rep{gd*1}')(256*gw,256*gw,1,False,group=1,e=0.5, act=self.act)
        # default YoloV5 anchors for the three detection scales (P3, P4, P5)
        anchors = [[10,13, 16,30, 33,23], [30,61, 62,45, 59,119], [116,90, 156,198, 373,326]]
        self.det = detect(self.batch_size, nc=num_classes, anchors=anchors, ch=[64*gw, 128*gw, 256*gw], inplace=True, mode=self.mode, ctx=self.ctx)
def hybrid_forward(self, F, x):
x = self.conv1(x) #0
x = self.conv2(x) #1
x = self.c3_1(x) #2
x = self.conv3(x) #3
c3_2 = self.c3_2(x) #4
x = self.conv4(c3_2) #5
c3_3 = self.c3_3(x) #6
x = self.conv5(c3_3) #7
x = self.c3_4(x) #8
x = self.sppf(x) #9
conv6 = self.conv6(x) #10
x = self.upsample1(conv6) #11
x = self.cat1(x,c3_3) #12
x = self.c3_5(x) #13
conv7 = self.conv7(x) #14
x = self.upsample2(conv7) #15
x = self.cat2(x,c3_2) #16
c3_6 = self.c3_6(x) #17
x = self.conv8(c3_6) #18
x = self.cat3(x,conv7) #19
c3_7 = self.c3_7(x) #20
x = self.conv9(c3_7) #21
x = self.cat4(x,conv6) #22
c3_8 = self.c3_8(x) #23
out = self.det(c3_6,c3_7,c3_8) #24
return out
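A quick smoke test of the network definition (a sketch; that gd=1, gw=1 corresponds to yolov5s here is an assumption):
net = yolov5(num_classes=1, batch_size=1, mode="train", ctx=mx.cpu(), gd=1, gw=1)
net.initialize(ctx=mx.cpu())
dummy = nd.zeros((1, 3, 640, 640))
outs = net(dummy)  # in train mode: one prediction tensor per detection scale (P3/P4/P5)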
# convert epoch milestones like '50,100,150,200' into iteration indices
lr_steps = sorted(int(ls) for ls in lr_decay_epoch.split(',') if ls.strip())
iters_per_epoch = len(self.train_dataset) // self.batch_size
lr_decay_iters = [e * iters_per_epoch for e in lr_steps]
schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_decay_iters, factor=lr_decay)
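# Worked example (assuming the 90 training images and batch_size=2 used later):
# there are 45 iterations per epoch, so lr_decay_epoch='50,100,150,200' becomes
# step=[2250, 4500, 6750, 9000], and the learning rate is multiplied by
# lr_decay at each of those iterations.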
optimizer = mx.optimizer.Adam(learning_rate=learning_rate, lr_scheduler=schedule)
trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer)
final_loss = ComputeLoss(len(self.classes_names),ctx=self.ctx[0], pos_weight=nd.array(self.train_dataset.weight, ctx=self.ctx[0]))
for i, batch in enumerate(self.train_dataloader):
    imgs = batch[0]
    labels = batch[1]
    # normalize to [0, 1] and move to the training device
    imgs = nd.array(imgs.astype("float32") / 255.).as_in_context(self.ctx[0])
    with autograd.record():
        pred = self.model(imgs)
    # assign ground-truth boxes to anchors/grid cells (targets carry no gradient)
    tcls, tbox, indices, anchors = build_targets(pred, labels, ctx=self.ctx[0], num_classes=len(self.classes_names))
    with autograd.record():
        loss, lbox, lobj, lcls = final_loss(pred, tcls, tbox, indices, anchors)
    loss.backward()
    trainer.step(self.batch_size, ignore_stale_grad=True)
    with autograd.pause():
        lbox_np = lbox.asscalar()
        lobj_np = lobj.asscalar()
        lcls_np = lcls.asscalar()
    print("[{}/{}:{}]: loss = {:.4f}, lbox = {:.4f}, lobj = {:.4f}, lcls = {:.4f}, lr = {:.5f}".format(
        epoch + 1, TrainNum, i + 1, lbox_np + lobj_np + lcls_np, lbox_np, lobj_np, lcls_np, schedule.base_lr))
if not os.path.exists(ModelPath + '_yoloV5'):
    os.makedirs(ModelPath + '_yoloV5')
# save the metadata inference needs (input size, class names, model name)
ClassDict = {}
ClassDict["image_size"] = self.image_size
ClassDict["classes_names"] = self.classes_names
ClassDict["model_name"] = self.model_name
with open(os.path.join(ModelPath+'_yoloV5', "class_index.json"), 'w', encoding='utf-8') as f:
f.write(json.dumps(ClassDict, sort_keys=True, indent=4, separators=(',', ': ')))
self.model.save_parameters(os.path.join(ModelPath+'_yoloV5', "final_model.dat"))
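Restoring the model later presumably mirrors this layout; a minimal loading sketch (the paths follow the save code above, the rest is an assumption about what LoadModel does):
with open(os.path.join(ModelPath + '_yoloV5', 'class_index.json'), 'r', encoding='utf-8') as f:
    meta = json.load(f)
model = yolov5(num_classes=len(meta['classes_names']), mode='test', ctx=mx.cpu())
model.load_parameters(os.path.join(ModelPath + '_yoloV5', 'final_model.dat'), ctx=mx.cpu())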
def predict(self, img_cv):
    start_time = time.time()
    img0s = img_cv.copy()
    height, width = img_cv.shape[0:2]
    # resize so the longest side fits image_size, rounded to a multiple of 32
    scale = min(self.image_size / height, self.image_size / width)
    h0, w0 = height * scale, width * scale
    img0 = cv2.resize(img_cv, (round(w0 / 32.) * 32, round(h0 / 32.) * 32))
    img = img0.astype("float32") / 255.
    img = nd.array(img.transpose((2, 0, 1))[None], ctx=self.ctx[0])  # HWC -> NCHW
pred = self.model(img).asnumpy()
pred = non_max_suppression(pred, self.conf_thres, self.iou_thres, None, self.agnostic_nms, max_det=self.max_det)
origin_img_pillow = self.cv2_pillow(img_cv)
font = ImageFont.truetype(font='./model_data/simhei.ttf', size=np.floor(3e-2 * np.shape(origin_img_pillow)[1] + 0.5).astype('int32'))
thickness = max((np.shape(origin_img_pillow)[0] + np.shape(origin_img_pillow)[1]) // self.image_size, 1)
# annotator = Annotator(img0s, line_width=1, example=str(self.classes_names))
    det = pred[0]
    print(det.shape[0])  # number of detections
    if det.shape[0] > 0:
        # map boxes from the network input size back to the original image
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0s.shape).round()
    else:
        return None
imgbox = []
for *xyxy, conf, cls in reversed(det):
cls_id =int(cls)
# label = f'{self.classes_names[cls_id]} {conf:.2f}'
# annotator.box_label(xyxy, label, color=Colors()(cls_id, True))
xmin, ymin, xmax, ymax = int(xyxy[0]), int(xyxy[1]), int(xyxy[2]), int(xyxy[3])
imgbox.append([(xmin, ymin, xmax, ymax), cls_id, self.classes_names[cls_id], float(f'{conf:.2f}')])
top, left, bottom, right = ymin, xmin, ymax, xmax
label = '{}-{}'.format(self.classes_names[cls_id], float(f'{conf:.2f}'))
draw = ImageDraw.Draw(origin_img_pillow)
label_size = draw.textsize(label, font)
label = label.encode('utf-8')
if top - label_size[1] >= 0:
text_origin = np.array([left, top - label_size[1]])
else:
text_origin = np.array([left, top + 1])
for i in range(thickness):
draw.rectangle([left + i, top + i, right - i, bottom - i], outline=self.colors[cls_id])
draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[cls_id])
draw.text(text_origin, str(label,'UTF-8'), fill=(0, 0, 0), font=font)
del draw
# img0s = annotator.result()
# cv2.imwrite('1.bmp', img0s)
result_value = {
"image_result": self.pillow_cv2(origin_img_pillow),
"bbox": imgbox,
"time": (time.time() - start_time) * 1000
}
return result_value
if __name__ == '__main__':
ctu = Ctu_YoloV5(USEGPU='0', image_size=640)
    ctu.InitModel(DataDir=r'E:\DL_Project\DataSet\DataSet_Detection\DataSet_Halcon_YaoPian', split_train=0.9, batch_size=2, model_name='yolov5s', Pre_Model='./Model_yoloV5/final_model.dat', valFlag=True)
    ctu.train(TrainNum=500, learning_rate=0.00005, lr_decay_epoch='50,100,150,200', lr_decay=0.9, ModelPath='./Model_x')
# ctu = Ctu_YoloV5(USEGPU='0')
# ctu.LoadModel(r'./Model_x_yoloV5')
# cv2.namedWindow("result", 0)
# cv2.resizeWindow("result", 640, 480)
# index = 0
# for root, dirs, files in os.walk(r'D:/Ctu/Ctu_Project_DL/DataSet/DataSet_Detection_YaoPian/test'):
# for f in files:
# img_cv = ctu.read_image(os.path.join(root, f))
# if img_cv is None:
# continue
# res = ctu.predict(img_cv)
# for each in res['bbox']:
# print(each)
# print("耗时:" + str(res['time']) + ' ms')
# # cv2.imwrite(str(index + 1)+'.bmp',res['image_result'])
# cv2.imshow("result", res['image_result'])
# cv2.waitKey()
# # index +=1