YOLO is one of the most widely used deep learning algorithms in machine vision, offering real-time detection with high accuracy. This article describes how to set up a YOLO environment from scratch and run object detection on a custom dataset.
1. Hardware Environment Setup
If you have no GPU, this step can be skipped.
OpenCV is optional (it can later load the YOLO weight model, providing image detection and video output interfaces).
For setting up the GPU environment under Linux, refer to Chapters 2 and 3.
On Windows, download and install the matching driver from the NVIDIA driver website.
Once you know which CUDA version your driver supports, download and install it from the CUDA website.
Run cmd and enter nvcc --version to print the version number and confirm that the installation succeeded.
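If you prefer, the same check can be scripted. A minimal Python sketch, which only assumes that nvidia-smi and nvcc are on the PATH after the driver and CUDA installs:

import shutil
import subprocess

for tool, args in (('nvidia-smi', ['-L']), ('nvcc', ['--version'])):
    path = shutil.which(tool)
    print(tool, '->', path if path else 'not found')
    if path:
        # -L lists the detected GPUs; --version prints the CUDA compiler version
        subprocess.run([tool] + args, check=False)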
2. Darknet Environment Setup
Darknet, YOLO's native framework, is lightweight, transparent in its code, and easy to extend, which makes it well suited for learning and for embedded deployment.
$ git clone https://github.com/pjreddie/darknet.git
$ cd darknet
$ vim Makefile   # edit the first few lines of the Makefile in the darknet directory to match your environment:
GPU=1      # set to 1 if the GPU driver is installed
CUDNN=1    # set to 1 if CUDA and cuDNN are installed
OPENCV=1   # set to 1 if OpenCV is installed
...
ifeq ($(GPU), 1)   # adjust the CUDA paths below if necessary
COMMON+= -DGPU -I/usr/local/cuda/include/   # default CUDA install location
CFLAGS+= -DGPU
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
...
$ make
After the build completes, you can test it with the official pretrained weights:
$ wget https://pjreddie.com/media/files/yolo.weights   # download the weight model
$ ./darknet detect cfg/yolo.cfg yolo.weights data/dog.jpg   # run a test detection
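As noted in Section 1, recent OpenCV builds can also load Darknet cfg/weight files through the cv2.dnn module, which is convenient for image detection and video pipelines. A minimal sketch, assuming the files downloaded above and an OpenCV build that includes the dnn module; it only prints the raw output shapes instead of drawing boxes:

import cv2

# load the Darknet network (paths as in the test command above)
net = cv2.dnn.readNetFromDarknet('cfg/yolo.cfg', 'yolo.weights')
img = cv2.imread('data/dog.jpg')
# scale pixels to [0,1], resize to the network input, swap BGR -> RGB
blob = cv2.dnn.blobFromImage(img, 1 / 255.0, (416, 416), swapRB=True, crop=False)
net.setInput(blob)
outs = net.forward(net.getUnconnectedOutLayersNames())
# each output row is [cx, cy, w, h, objectness, class scores...]
print([o.shape for o in outs])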
3. Building a Custom VOC Dataset
If you run into problems, see the companion article on building the sea cucumber image dataset for details.
1 Batch-rename the JPG image files
# -*- coding:utf8 -*-
import os

class BatchRename():
    '''
    Batch-rename the image files in a folder.
    '''
    def __init__(self):
        # path to the image folder
        self.path = '/home/lbf/BIM-VOC/VOCdevkit/VOC2019/JPEGImages'

    def rename(self):
        filelist = os.listdir(self.path)
        total_num = len(filelist)
        i = 1   # starting index
        n = 6   # total number of digits, e.g. 000001.jpg has 6
        for item in filelist:
            if item.endswith('.jpg'):
                n = 6 - len(str(i))
                src = os.path.join(os.path.abspath(self.path), item)
                dst = os.path.join(os.path.abspath(self.path), str(0) * n + str(i) + '.jpg')
                try:
                    os.rename(src, dst)
                    print('converting %s to %s ...' % (src, dst))
                    i = i + 1
                except:
                    continue
        print('total %d to rename & converted %d jpgs' % (total_num, i - 1))

if __name__ == '__main__':
    demo = BatchRename()
    demo.rename()
2 Create the following directory tree and move the image files into the JPEGImages directory (a short script for creating the tree is sketched after the listing):
--VOC2019            (the year can be customized)
    --Annotations    (XML files)
    --ImageSets      (txt files)
        --Main
    --JPEGImages     (images)
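A minimal Python sketch for creating the tree (the base path 'VOCdevkit/VOC2019' is an assumption; adjust it to your layout):

import os

base = 'VOCdevkit/VOC2019'
for sub in ('Annotations', 'ImageSets/Main', 'JPEGImages'):
    # exist_ok avoids an error if part of the tree already exists
    os.makedirs(os.path.join(base, sub), exist_ok=True)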
3 Use the labelImg annotation tool to mark the target regions in the images
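labelImg writes one PASCAL VOC style XML file per image into the Annotations folder. An abridged example of roughly what such a file contains (the file name, image size, class name, and box coordinates are illustrative):

<annotation>
    <folder>JPEGImages</folder>
    <filename>000001.jpg</filename>
    <path>/home/lbf/BIM-VOC/VOCdevkit/VOC2019/JPEGImages/000001.jpg</path>
    <size>
        <width>640</width>
        <height>480</height>
        <depth>3</depth>
    </size>
    <object>
        <name>dog</name>
        <difficult>0</difficult>
        <bndbox>
            <xmin>48</xmin>
            <ymin>240</ymin>
            <xmax>195</xmax>
            <ymax>371</ymax>
        </bndbox>
    </object>
</annotation>

The scripts in step 5 and in the P.S. below read the size, object, filename, and path elements of these files.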
4 Generate the TXT split files (run the following script from inside the VOC2019 directory):
import os
import random

trainval_percent = 0.66   # fraction of all samples used for trainval (the rest go to test)
train_percent = 0.95      # fraction of trainval used for train (the rest go to val)
xmlfilepath = 'Annotations'
txtsavepath = 'ImageSets/Main'
total_xml = os.listdir(xmlfilepath)

num = len(total_xml)
list = range(num)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
trainval = random.sample(list, tv)
train = random.sample(trainval, tr)

ftrainval = open('ImageSets/Main/trainval.txt', 'w')
ftest = open('ImageSets/Main/test.txt', 'w')
ftrain = open('ImageSets/Main/train.txt', 'w')
fval = open('ImageSets/Main/val.txt', 'w')

for i in list:
    name = total_xml[i][:-4] + '\n'
    if i in trainval:
        ftrainval.write(name)
        if i in train:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)

ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
5 Modify voc_label.py according to your dataset:
import xml.etree.ElementTree as ET
import pickle
import os
from os import listdir, getcwd
from os.path import join

# the year here must match the folder name chosen earlier
sets = [('2019', 'train'), ('2019', 'val'), ('2019', 'test')]
# replace this list with the classes you want to detect
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def convert(size, box):
    # convert a VOC box (xmin, xmax, ymin, ymax) to a normalized YOLO box (x, y, w, h)
    dw = 1. / (size[0])
    dh = 1. / (size[1])
    x = (box[0] + box[1]) / 2.0 - 1
    y = (box[2] + box[3]) / 2.0 - 1
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return (x, y, w, h)

def convert_annotation(year, image_id):
    in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml' % (year, image_id))
    out_file = open('VOCdevkit/VOC%s/labels/%s.txt' % (year, image_id), 'w')
    tree = ET.parse(in_file)
    root = tree.getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)
    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        bb = convert((w, h), b)
        out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

wd = getcwd()
for year, image_set in sets:
    if not os.path.exists('VOCdevkit/VOC%s/labels/' % (year)):
        os.makedirs('VOCdevkit/VOC%s/labels/' % (year))
    image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt' % (year, image_set)).read().strip().split()
    list_file = open('%s_%s.txt' % (year, image_set), 'w')
    for image_id in image_ids:
        list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n' % (wd, year, image_id))
        convert_annotation(year, image_id)
    list_file.close()

# os.system("cat 2019_train.txt 2019_val.txt 2012_train.txt 2012_val.txt > train.txt")
# os.system("cat 2019_train.txt 2019_val.txt 2019_test.txt 2012_train.txt 2012_val.txt > train.all.txt")
6 Modify the voc.names file in the data folder and the voc.data file in the cfg folder to match your own dataset (the // annotations below are explanatory and should not be copied into the actual files).
voc.names
person   // label name of a detection class
dog
voc.data
classes= 2   // number of classes
train = /home/ben/Extra/darknet/scripts/2019_train.txt   // file listing the absolute paths of the training images
valid = /home/ben/Extra/darknet/scripts/2019_test.txt   // file listing the absolute paths of the test images (used mainly when measuring mAP)
names = /home/ben/Extra/darknet/data/voc.names   // the voc.names file edited above, i.e. the detection class names
backup = /home/ben/Extra/darknet/backup   // directory where the weights produced during training are saved
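A small Python sketch that writes both files from a class list (the class names and the darknet checkout path are placeholders; adjust them to your setup):

classes = ['person', 'dog']
base = '/home/ben/Extra/darknet'

# voc.names: one class name per line
with open(base + '/data/voc.names', 'w') as f:
    f.write('\n'.join(classes) + '\n')

# voc.data: class count plus the paths shown above
with open(base + '/cfg/voc.data', 'w') as f:
    f.write('classes= %d\n' % len(classes))
    f.write('train = %s/scripts/2019_train.txt\n' % base)
    f.write('valid = %s/scripts/2019_test.txt\n' % base)
    f.write('names = %s/data/voc.names\n' % base)
    f.write('backup = %s/backup\n' % base)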
P.S. If you move the dataset later, the path field in the XML files needs to be updated; the following code can be used as a reference.
# coding=utf-8
import os
import os.path
import xml.dom.minidom
import re

# directory containing the XML files to update
file_path = "/home/ben/Extra/darknet/scripts/VOCdeckit/VOC2019/labelImg-master/NEU-DET/ANNOTATIONS"
# new image directory to write into the XML files
picture_path = "/home/ben/Extra/darknet/scripts/VOCdeckit/VOC2019/labelImg-master/NEU-DET/IMAGES/"

class ChangePath():
    def __init__(self):
        self.path = './ANNOTATIONS/'   # directory of the original XML files

    def chpath(self):
        LINES = []
        total_num = 0
        filelist = os.listdir(self.path)
        for item in filelist:
            if item.endswith('.xml'):
                with open((self.path + item), 'r') as fp:
                    findpath = False
                    ### This version handles XML files that contain no <path> element.
                    ### To overwrite an existing <path> element instead, change the if branch to:
                    ###   if line.strip().startswith('<path>') and findpath == False:
                    ###       LINES.append('\t<path>' + picture_path + os.path.splitext(item)[0] + '.jpg</path>\n')
                    ###       findpath = True
                    for line in fp:
                        if line.strip().startswith('<filename>') and findpath == False:   # locate the <filename> line
                            LINES.append(line)   # keep the original <filename> line
                            LINES.append('\t<path>' + picture_path + os.path.splitext(item)[0] + '.jpg</path>\n')   # add a <path> line after it
                            findpath = True
                        else:
                            LINES.append(line)
                with open((self.path + item), 'w') as fp:
                    fp.writelines(LINES)
                while len(LINES) > 0:
                    LINES.pop(0)
                print('%s updated' % item)
                total_num = total_num + 1
        print('%d files processed in total' % total_num)

if __name__ == '__main__':
    change = ChangePath()
    change.chpath()
4. Network Parameter Modification
For a detailed explanation of each parameter, refer to a YOLOv3 parameter guide.
The main changes are shown below (they appear in three places in the YOLOv3 network and in two places in YOLO-Tiny):
...
[convolutional]
size=1
stride=1
pad=1
filters=45   // number of filters in the last convolutional layer before each [yolo] layer;
             // for YOLOv3 the formula is filters = mask_count x (classes + coords + 1) = 3 x (classes + 4 + 1),
             // where the 4 coords are tx, ty, tw, th and the 1 is the objectness score to from the paper
activation=linear
[yolo]
mask = 6,7,8
anchors = XXXX
classes=10   // number of classes to detect; e.g. set this to 1 if you only detect pedestrians
num=9        // total number of anchors listed under "anchors="; each [yolo] layer uses the 3 selected by its mask
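For example, a single-class detector (classes=1) needs filters = 3 x (1 + 4 + 1) = 18 in each of these convolutional layers, while the 10-class example above needs filters = 3 x (10 + 5) = 45.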
5. Visualizing Training Progress
The main things to watch are how the IOU and loss values evolve.
Visualizing these intermediate values requires the log file saved during training; the training command below both starts training and tees its output to a log file (adjust the paths to your own setup):
./darknet detector train pds/fish/cfg/fish.data pds/fish/cfg/yolov3-fish.cfg darknet53.conv.74 2>&1 | tee visualization/train_yolov3.log
For more detail, see the article on visualizing YOLOv3 training parameters and plotting curves for loss, IOU, avg recall, and so on.
The reference code is given below.
extract_log.py
# coding=utf-8
# Extract lines from the training log: remove unparsable lines and write a formatted
# log file for the plotting scripts below.
import inspect
import os
import random
import sys

def extract_log(log_file, new_log_file, key_word):
    with open(log_file, 'r') as f:
        with open(new_log_file, 'w') as train_log:
            for line in f:
                # skip the multi-GPU synchronization lines
                if 'Syncing' in line:
                    continue
                # skip lines containing nan (division-by-zero results)
                if 'nan' in line:
                    continue
                if key_word in line:
                    train_log.write(line)

extract_log('train_yolov3.log', 'train_log_loss.txt', 'images')
extract_log('train_yolov3.log', 'train_log_iou.txt', 'IOU')
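For reference, the extracted log lines look roughly like the following (the values are illustrative and the exact wording can vary between darknet versions); this is why the scripts filter on the keywords 'images' and 'IOU' and then split each field on ': ' or on spaces:
Region Avg IOU: 0.794182, Class: 0.998968, Obj: 0.777714, No Obj: 0.018090, Avg Recall: 1.000000, count: 8
9798: 0.370096, 0.451929 avg, 0.001000 rate, 3.300000 seconds, 627072 images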
IOU plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

lines = 319815   # set this to the number of lines in train_log_iou.txt
# result = pd.read_csv('train_log_iou.txt', skiprows=[x for x in range(lines) if (x % 10 == 0 or x % 10 == 9)], error_bad_lines=False, names=['Region Avg IOU', 'Class', 'Obj', 'No Obj', 'Avg Recall', 'count'])
result = pd.read_csv('train_log_iou.txt', skiprows=[x for x in range(lines) if (x % 100 != 99)], error_bad_lines=False, names=['Region Avg IOU', 'Class', 'Obj', 'No Obj', 'Avg Recall', 'count'])
# note: newer pandas versions replace error_bad_lines=False with on_bad_lines='skip'
result.head()

result['Region Avg IOU'] = result['Region Avg IOU'].str.split(': ').str.get(1)
result['Class'] = result['Class'].str.split(': ').str.get(1)
result['Obj'] = result['Obj'].str.split(': ').str.get(1)
result['No Obj'] = result['No Obj'].str.split(': ').str.get(1)
result['Avg Recall'] = result['Avg Recall'].str.split(': ').str.get(1)
result['count'] = result['count'].str.split(': ').str.get(1)
result.head()
result.tail()

# print(result.head())
# print(result.tail())
# print(result.dtypes)
print(result['Region Avg IOU'])

result['Region Avg IOU'] = pd.to_numeric(result['Region Avg IOU'])
result['Class'] = pd.to_numeric(result['Class'])
result['Obj'] = pd.to_numeric(result['Obj'])
result['No Obj'] = pd.to_numeric(result['No Obj'])
result['Avg Recall'] = pd.to_numeric(result['Avg Recall'])
result['count'] = pd.to_numeric(result['count'])
result.dtypes

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(result['Region Avg IOU'].values, label='Region Avg IOU')
# ax.plot(result['Class'].values, label='Class')
# ax.plot(result['Obj'].values, label='Obj')
# ax.plot(result['No Obj'].values, label='No Obj')
# ax.plot(result['Avg Recall'].values, label='Avg Recall')
# ax.plot(result['count'].values, label='count')
ax.legend(loc='best')
ax.set_title('Region Avg IOU curve')
ax.set_xlabel('Batches')
ax.set_ylabel('IOU')
# fig.savefig('Avg IOU')
fig.savefig('Region Avg IOU', dpi=256)
Loss plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline

lines = 20000   # set this to the number of lines in your train_log_loss.txt
result = pd.read_csv('train_log_loss.txt', skiprows=[x for x in range(lines) if ((x % 10 != 9) or (x < 1000))], error_bad_lines=False, names=['loss', 'avg', 'rate', 'seconds', 'images'])
# result = pd.read_csv('train_log_loss.txt', skiprows=[x for x in range(lines) if ((x % 10 != 9 and x % 10 != 5) or x < 1000)], error_bad_lines=False, names=['loss', 'avg', 'rate', 'seconds', 'images'])
result.head()

result['loss'] = result['loss'].str.split(' ').str.get(1)
result['avg'] = result['avg'].str.split(' ').str.get(1)
result['rate'] = result['rate'].str.split(' ').str.get(1)
result['seconds'] = result['seconds'].str.split(' ').str.get(1)
result['images'] = result['images'].str.split(' ').str.get(1)
result.head()
result.tail()

# print(result.head())
# print(result.tail())
# print(result.dtypes)
print(result['loss'])
print(result['avg'])
print(result['rate'])
print(result['seconds'])
print(result['images'])

result['loss'] = pd.to_numeric(result['loss'])
result['avg'] = pd.to_numeric(result['avg'])
result['rate'] = pd.to_numeric(result['rate'])
result['seconds'] = pd.to_numeric(result['seconds'])
result['images'] = pd.to_numeric(result['images'])
result.dtypes

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(result['avg'].values, label='avg loss')
# ax.plot(result['loss'].values, label='loss')
ax.legend(loc='best')   # place the legend automatically
ax.set_title('Loss curve')
ax.set_xlabel('Batches')
ax.set_ylabel('Loss')
fig.savefig('avg_loss', dpi=256)
# fig.savefig('loss')
6. Miscellaneous
Neural network background notes
Learning rate tuning tips
Anchor boxes
How to use k-means: change the number of clusters in k-means-yolo.py, compare how the average IOU changes, and pick the k at the elbow of the curve.
k-means-yolo.py
import glob
import os
import sys
import xml.etree.ElementTree as ET
import numpy as np
from kmeans import kmeans, avg_iou

# dataset root folder
ROOT_PATH = '/root/darknet/scripts/VOCdevkit/VOC2007/'
# number of clusters: try different values, compare the resulting average IOU,
# and choose the value at the elbow of the curve as k
CLUSTERS = 6
# network input size (width and height are assumed equal)
SIZE = 640

# load the YOLO-format annotations
def load_dataset(path):
    jpegimages = os.path.join(path, 'JPEGImages')
    if not os.path.exists(jpegimages):
        print('no JPEGImages folders, program abort')
        sys.exit(0)
    labels_txt = os.path.join(path, 'labels')
    if not os.path.exists(labels_txt):
        print('no labels folders, program abort')
        sys.exit(0)

    label_file = os.listdir(labels_txt)
    print('label count: {}'.format(len(label_file)))
    dataset = []
    for label in label_file:
        with open(os.path.join(labels_txt, label), 'r') as f:
            txt_content = f.readlines()
        for line in txt_content:
            line_split = line.split(' ')
            roi_with = float(line_split[len(line_split) - 2])
            roi_height = float(line_split[len(line_split) - 1])
            if roi_with == 0 or roi_height == 0:
                continue
            dataset.append([roi_with, roi_height])
            # print([roi_with, roi_height])
    return np.array(dataset)

data = load_dataset(ROOT_PATH)
out = kmeans(data, k=CLUSTERS)
print(out)
print("Accuracy: {:.2f}%".format(avg_iou(data, out) * 100))
print("Boxes:\n {}-{}".format(out[:, 0] * SIZE, out[:, 1] * SIZE))
ratios = np.around(out[:, 0] / out[:, 1], decimals=2).tolist()
print("Ratios:\n {}".format(sorted(ratios)))
kmeans.py (saved under this name so that the "from kmeans import kmeans, avg_iou" line above works)
import numpy as np

def iou(box, clusters):
    """
    Calculates the Intersection over Union (IoU) between a box and k clusters.
    :param box: tuple or array, shifted to the origin (i.e. width and height)
    :param clusters: numpy array of shape (k, 2) where k is the number of clusters
    :return: numpy array of shape (k, 0) where k is the number of clusters
    """
    x = np.minimum(clusters[:, 0], box[0])
    y = np.minimum(clusters[:, 1], box[1])
    if np.count_nonzero(x == 0) > 0 or np.count_nonzero(y == 0) > 0:
        raise ValueError("Box has no area")

    intersection = x * y
    box_area = box[0] * box[1]
    cluster_area = clusters[:, 0] * clusters[:, 1]

    iou_ = intersection / (box_area + cluster_area - intersection)
    return iou_

def avg_iou(boxes, clusters):
    """
    Calculates the average Intersection over Union (IoU) between a numpy array of boxes and k clusters.
    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param clusters: numpy array of shape (k, 2) where k is the number of clusters
    :return: average IoU as a single float
    """
    return np.mean([np.max(iou(boxes[i], clusters)) for i in range(boxes.shape[0])])

def translate_boxes(boxes):
    """
    Translates all the boxes to the origin.
    :param boxes: numpy array of shape (r, 4)
    :return: numpy array of shape (r, 2)
    """
    new_boxes = boxes.copy()
    for row in range(new_boxes.shape[0]):
        new_boxes[row][2] = np.abs(new_boxes[row][2] - new_boxes[row][0])
        new_boxes[row][3] = np.abs(new_boxes[row][3] - new_boxes[row][1])
    return np.delete(new_boxes, [0, 1], axis=1)

def kmeans(boxes, k, dist=np.median):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.
    :param boxes: numpy array of shape (r, 2), where r is the number of rows
    :param k: number of clusters
    :param dist: distance function
    :return: numpy array of shape (k, 2)
    """
    rows = boxes.shape[0]

    distances = np.empty((rows, k))
    last_clusters = np.zeros((rows,))

    np.random.seed()

    # the Forgy method will fail if the whole array contains the same rows
    clusters = boxes[np.random.choice(rows, k, replace=False)]

    while True:
        for row in range(rows):
            distances[row] = 1 - iou(boxes[row], clusters)

        nearest_clusters = np.argmin(distances, axis=1)

        if (last_clusters == nearest_clusters).all():
            break

        for cluster in range(k):
            clusters[cluster] = dist(boxes[nearest_clusters == cluster], axis=0)

        last_clusters = nearest_clusters

    return clusters