源码:
在网上挑了一个资料比较详细的源码入手:
github地址:https://github.com/hizhangp
[百度云源码地址](https://pan.baidu.com/s/1XMNzOp5E1F218AqsCw8mBg)(提取码:4tzd)
流程:
详细内容点这
目录
解读各个文件
- yolo/yolo_net.py : YOLO网络的主体框架
- yolo/config.py : 这是YOLO检测模型的配置文件
- test.py : 对训练好的模型进行测试,读取ckpt文件,测试image。这里只用到了YOLO模型的self.logits,注意self.logits每张图的输出长度为7x7x30=1470。这里有两种检测模式,摄像头和图片格式(我下载的是最新版)。
- train.py : 进行训练用的文件,这里只用到了YOLO的损失函数。
菜鸟吐槽:看了这么久的模型代码,发现模型都通俗易懂,就是数据处理总是让人一头雾水,而且数据处理占据整个模型文件70%-90%的代码量,增加了理解整个模型的难度。最后发现,面向对象的代码才是最简洁的,代码可以方便地调试,但也是最抽象的。
。。。。。。。。。。。。谁来拯救菜鸟。。。。。。。。。。
模型构建
为了进行流程追踪,还是从train.py开始。
在train.py中的主函数中,如下:
def main():
    """Parse the command-line options, build the network and data set, and train.

    Options that are not given on the command line keep the defaults
    declared below.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    parser.add_argument('--data_dir', default="data", type=str)
    parser.add_argument('--threshold', default=0.2, type=float)
    parser.add_argument('--iou_threshold', default=0.5, type=float)
    parser.add_argument('--gpu', default='', type=str)
    args = parser.parse_args()

    # NOTE(review): --gpu defaults to '' (never None), so this condition is
    # always true and cfg.GPU is overwritten on every run.
    if args.gpu is not None:
        cfg.GPU = args.gpu

    # A data directory different from the configured one triggers an update
    # of the configured paths.
    if args.data_dir != cfg.DATA_PATH:
        update_config_paths(args.data_dir, args.weights)

    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU

    yolo = YOLONet()               # network object
    pascal = pascal_voc('train')   # data source for the 'train' phase
    solver = Solver(yolo, pascal)  # ties the network and the data together

    print('Start training ...')
    solver.train()
    print('Done training.')
在main()的前面就是,参数设置,就是需要在命令行窗口设置,不然就会使用默认的参数。
这里YOLONet() 就是构建YOLO模型
yolo = YOLONet()
获取训练用的数据:
pascal = pascal_voc('train')
这里直接进行训练
solver = Solver(yolo, pascal)
这里的Solver不进行细讲,主要讲几个重要的步骤:
这里是初始化函数,这主要设置一些参数:
class Solver(object):
    """Holds a network and a data source and builds the training operation."""

    def __init__(self, net, data):
        """Read the solver settings from ``cfg`` and build the train op.

        Args:
            net: network object; its ``total_loss`` tensor is what gets
                optimised.
            data: data source object, kept for use during training.
        """
        self.net = net
        self.data = data
        self.weights_file = cfg.WEIGHTS_FILE            # checkpoint to start from
        self.max_iter = cfg.MAX_ITER                    # number of training steps
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        self.summary_iter = cfg.SUMMARY_ITER            # summary interval, in steps
        self.save_iter = cfg.SAVE_ITER                  # checkpoint interval, in steps

        # One output directory per run: <OUTPUT_DIR>/<year_month_day_hour_minute>
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.save_cfg()

        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        self.ckpt_file = os.path.join(self.output_dir, 'yolo')
        self.summary_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)

        self.global_step = tf.train.create_global_step()
        # Exponentially decayed learning rate:
        # initial_learning_rate * decay_rate ^ (global_step / decay_steps)
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        # Operation that minimises the network's total loss
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)
Solver(object)主要结构如下:
其他的不要去纠结:
主要关注应该是loss
去了哪?????
class Solver(object):
def __init__(self, net, data):
self.net = net
self.data = data
.......
# 对YOLO的损失函数进行优化
self.train_op = slim.learning.create_train_op(
self.net.total_loss, self.optimizer, global_step=self.global_step)
........
def train(self):
........
images, labels = self.data.get() #从pascal_voc数据集读取图像和实际标签信息
feed_dict = {self.net.images: images, #生成一个图像和label对应的字典
self.net.labels: labels}
for step in range(1, self.max_iter + 1):
........
summary_str, loss, _ = self.sess.run(
[self.summary_op, self.net.total_loss, self.train_op], #模型训练,返回 loss
feed_dict=feed_dict)
........
def save_cfg(self):
    """Write the current configuration to ``<output_dir>/config.txt``.

    Every attribute of ``cfg`` whose name starts with an upper-case letter
    is written as one ``NAME: value`` line, sorted by name.
    """
    with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:
        cfg_dict = cfg.__dict__
        for key in sorted(cfg_dict):
            # Skip anything that is not an upper-case setting name
            if key[0].isupper():
                f.write('{}: {}\n'.format(key, cfg_dict[key]))
Solver的主要结构可以总结如下:
- 拿到YOLO的框架
- 拿到数据读取的框架
- 对YOLO进行loss计算,
- 使用slim进行优化
这里的slim.learning.create_train_op不清楚可以点-->TensorFlow-Slim API 官方教程
config.py
在开始yolo_net.py之前,这里还有一个专门设置参数的文件。
config.py就是设置各种训练参数的文件。
import os
#
# path and dataset parameter
#
DATA_PATH = 'data'
PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')  # data/pascal_voc, relative to the working directory
CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')  # data/pascal_voc/cache
OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')  # data/pascal_voc/output
WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')  # data/pascal_voc/weights
WEIGHTS_FILE = None  # no checkpoint is selected by default
# WEIGHTS_FILE = os.path.join(DATA_PATH, 'weights', 'YOLO_small.ckpt')
CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',  # object class names
'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
'train', 'tvmonitor']
FLIPPED = True  # whether horizontally flipped copies of the samples are added
#
# model parameter
#
IMAGE_SIZE = 448
CELL_SIZE = 7
BOXES_PER_CELL = 2
ALPHA = 0.1
DISP_CONSOLE = False
# The four values below are named after the loss terms
# (object, no-object, class, coordinate) -- presumably their weights.
OBJECT_SCALE = 1.0
NOOBJECT_SCALE = 1.0
CLASS_SCALE = 2.0
COORD_SCALE = 5.0
#
# solver parameter
#
GPU = ''
LEARNING_RATE = 0.0001
DECAY_STEPS = 30000
DECAY_RATE = 0.1
STAIRCASE = True
BATCH_SIZE = 30
MAX_ITER = 15000
SUMMARY_ITER = 10
SAVE_ITER = 1000
#
# test parameter
#
THRESHOLD = 0.2
IOU_THRESHOLD = 0.5
yolo_net.py
现在来到yolo_net.py文件。在train.py中用到了net的net.total_loss。
# train.py
self.train_op = slim.learning.create_train_op(
self.net.total_loss, self.optimizer, global_step=self.global_step)
在yolo的初始化函数:
class YOLONet(object):
    """YOLO network: input placeholder, forward graph and, for training, the loss."""

    def __init__(self, is_training=True):
        """Build the graph.

        Args:
            is_training: when true, a label placeholder and the loss are
                created in addition to the forward network.
        """
        # (the hyper-parameter setup that precedes this point is omitted
        # in this excerpt)
        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        # Raw network output for every input image
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            # Per-cell ground truth: 5 values followed by one slot per class
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            # loss_layer registers its terms through tf.losses.add_loss;
            # get_total_loss collects them.
            self.loss_layer(self.logits, self.labels)
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)
从上面可以看出,在init()就已经计算定义了模型的各种参数
这句代码
self.logits = self.build_network(
self.images, num_outputs=self.output_size, alpha=self.alpha,
is_training=is_training)
build_network把所有的结果输出了,包括分类和坐标
shape=(2*(x, y, w, h, c), C)。logits=[None, 1470] ,1470=7x7x30.C=20.
如果进行训练:
if is_training:
self.labels = tf.placeholder(
tf.float32,
[None, self.cell_size, self.cell_size, 5 + self.num_class])
self.loss_layer(self.logits, self.labels)
self.total_loss = tf.losses.get_total_loss()
tf.summary.scalar('total_loss', self.total_loss)
选择训练,就会计算所有的loss。还需要提供labels。
build_network()
def build_network(self,
                  images,
                  num_outputs,
                  alpha,
                  keep_prob=0.5,
                  is_training=True,
                  scope='yolo'):
    """Build the convolutional network and return its final linear layer.

    Args:
        images: input image tensor.
        num_outputs: number of units of the last fully connected layer.
        alpha: parameter passed to ``leaky_relu``, the activation used by
            every conv / fully connected layer except the last.
        keep_prob: keep probability of the dropout layer.
        is_training: forwarded to the dropout layer.
        scope: variable scope that wraps the whole network.

    Returns:
        Output of the last fully connected layer (no activation).
    """
    with tf.variable_scope(scope):
        with slim.arg_scope(
            [slim.conv2d, slim.fully_connected],
            activation_fn=leaky_relu(alpha),
            weights_regularizer=slim.l2_regularizer(0.0005),
            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
        ):
            # Explicit 3-pixel spatial padding followed by a VALID 7x7/2 conv
            net = tf.pad(
                images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                name='pad_1')
            net = slim.conv2d(
                net, 64, 7, 2, padding='VALID', scope='conv_2')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
            net = slim.conv2d(net, 192, 3, scope='conv_4')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
            net = slim.conv2d(net, 128, 1, scope='conv_6')
            net = slim.conv2d(net, 256, 3, scope='conv_7')
            net = slim.conv2d(net, 256, 1, scope='conv_8')
            net = slim.conv2d(net, 512, 3, scope='conv_9')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
            net = slim.conv2d(net, 256, 1, scope='conv_11')
            net = slim.conv2d(net, 512, 3, scope='conv_12')
            net = slim.conv2d(net, 256, 1, scope='conv_13')
            net = slim.conv2d(net, 512, 3, scope='conv_14')
            net = slim.conv2d(net, 256, 1, scope='conv_15')
            net = slim.conv2d(net, 512, 3, scope='conv_16')
            net = slim.conv2d(net, 256, 1, scope='conv_17')
            net = slim.conv2d(net, 512, 3, scope='conv_18')
            net = slim.conv2d(net, 512, 1, scope='conv_19')
            net = slim.conv2d(net, 1024, 3, scope='conv_20')
            net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
            net = slim.conv2d(net, 512, 1, scope='conv_22')
            net = slim.conv2d(net, 1024, 3, scope='conv_23')
            net = slim.conv2d(net, 512, 1, scope='conv_24')
            net = slim.conv2d(net, 1024, 3, scope='conv_25')
            net = slim.conv2d(net, 1024, 3, scope='conv_26')
            # Explicit 1-pixel spatial padding followed by a VALID 3x3/2 conv
            net = tf.pad(
                net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                name='pad_27')
            net = slim.conv2d(
                net, 1024, 3, 2, padding='VALID', scope='conv_28')
            net = slim.conv2d(net, 1024, 3, scope='conv_29')
            net = slim.conv2d(net, 1024, 3, scope='conv_30')
            # Move the channel axis in front of the spatial axes before
            # flattening
            net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
            net = slim.flatten(net, scope='flat_32')
            net = slim.fully_connected(net, 512, scope='fc_33')
            net = slim.fully_connected(net, 4096, scope='fc_34')
            net = slim.dropout(
                net, keep_prob=keep_prob, is_training=is_training,
                scope='dropout_35')
            # Final linear layer: no activation
            net = slim.fully_connected(
                net, num_outputs, activation_fn=None, scope='fc_36')
    return net
build_network()函数的操作很简单,就是对image进行CNN计算,最后
得出box和class,最后的输出是[None, 1470]。如果这不是YOLO,我会认为这是一个分类模型,说明神经网络的特征提取能力实在是太强大了。
calc_iou()
在进行loss计算的时候需要计算IOU。
如下:
def calc_iou(self, boxes1, boxes2, scope='iou'):
    """Calculate the IoU of two sets of boxes, element by element.

    Args:
        boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]
            holding (x_center, y_center, w, h)
        boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]
            holding (x_center, y_center, w, h)
    Return:
        iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL],
            clipped to [0, 1]
    """
    with tf.variable_scope(scope):
        # transform (x_center, y_center, w, h) to (x1, y1, x2, y2)
        boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                             boxes1[..., 1] - boxes1[..., 3] / 2.0,
                             boxes1[..., 0] + boxes1[..., 2] / 2.0,
                             boxes1[..., 1] + boxes1[..., 3] / 2.0],
                            axis=-1)

        boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                             boxes2[..., 1] - boxes2[..., 3] / 2.0,
                             boxes2[..., 0] + boxes2[..., 2] / 2.0,
                             boxes2[..., 1] + boxes2[..., 3] / 2.0],
                            axis=-1)

        # calculate the left up point & right down point of the overlap
        lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
        rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

        # intersection; a negative extent means no overlap, hence max(0, .)
        intersection = tf.maximum(0.0, rd - lu)
        inter_square = intersection[..., 0] * intersection[..., 1]

        # calculate the boxes1 square and boxes2 square
        square1 = boxes1[..., 2] * boxes1[..., 3]
        square2 = boxes2[..., 2] * boxes2[..., 3]

        # lower bound on the union avoids a division by zero
        union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)

    return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
calc_iou计算两个box1和box2的交并比。tf.clip_by_value把交并比在限制在[0,1]。
loss_layer()
def loss_layer(self, predicts, labels, scope='loss_layer'):
    """Build the four loss terms and register them with ``tf.losses``.

    Shapes below use B = batch_size, S = cell_size, K = boxes_per_cell and
    C = num_class.

    Args:
        predicts: flat network output, sliced at ``boundary1`` and
            ``boundary2`` into class scores, box confidences and box
            coordinates.
        labels: ground truth; the last axis holds the object indicator
            (index 0), the box (1:5) and the class slots (5:).
        scope: variable scope wrapping the loss computation.
    """
    with tf.variable_scope(scope):
        # Split the flat output into its three parts.
        # Class scores, (B, S, S, C)
        predict_classes = tf.reshape(
            predicts[:, :self.boundary1],
            [self.batch_size, self.cell_size, self.cell_size, self.num_class])
        # Per-box confidence, (B, S, S, K)
        predict_scales = tf.reshape(
            predicts[:, self.boundary1:self.boundary2],
            [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
        # Per-box coordinates, (B, S, S, K, 4)
        predict_boxes = tf.reshape(
            predicts[:, self.boundary2:],
            [self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

        # Object indicator taken from the label, (B, S, S, 1)
        response = tf.reshape(
            labels[..., 0],
            [self.batch_size, self.cell_size, self.cell_size, 1])
        # Ground-truth box, (B, S, S, 1, 4)
        boxes = tf.reshape(
            labels[..., 1:5],
            [self.batch_size, self.cell_size, self.cell_size, 1, 4])
        # Repeated for each predicted box and divided by the image size,
        # (B, S, S, K, 4)
        boxes = tf.tile(
            boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
        # Ground-truth class slots, (B, S, S, C)
        classes = labels[..., 5:]

        # Cell offsets, (1, S, S, K), then tiled to (B, S, S, K)
        offset = tf.reshape(
            tf.constant(self.offset, dtype=tf.float32),
            [1, self.cell_size, self.cell_size, self.boxes_per_cell])
        offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
        # Same offsets with the two grid axes swapped, (B, S, S, K)
        offset_tran = tf.transpose(offset, (0, 2, 1, 3))

        # Predictions converted to the form of `boxes`: cell offset added
        # and divided by S for x / y, squared for the last two components.
        # Shape (B, S, S, K, 4).
        predict_boxes_tran = tf.stack(
            [(predict_boxes[..., 0] + offset) / self.cell_size,
             (predict_boxes[..., 1] + offset_tran) / self.cell_size,
             tf.square(predict_boxes[..., 2]),
             tf.square(predict_boxes[..., 3])], axis=-1)

        # IoU of every predicted box with the ground truth, (B, S, S, K)
        iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

        # calculate I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
        # Largest IoU inside each cell, (B, S, S, 1)
        object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
        # 1 for the box(es) reaching that maximum in a cell whose object
        # indicator is set, 0 elsewhere; (B, S, S, K)
        object_mask = tf.cast(
            (iou_predict_truth >= object_mask), tf.float32) * response

        # calculate no_I tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
        # Complement of object_mask, (B, S, S, K)
        noobject_mask = tf.ones_like(
            object_mask, dtype=tf.float32) - object_mask

        # Ground truth converted to the form of `predict_boxes` (inverse of
        # the transform above), (B, S, S, K, 4)
        boxes_tran = tf.stack(
            [boxes[..., 0] * self.cell_size - offset,
             boxes[..., 1] * self.cell_size - offset_tran,
             tf.sqrt(boxes[..., 2]),
             tf.sqrt(boxes[..., 3])], axis=-1)

        # class_loss
        class_delta = response * (predict_classes - classes)
        class_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
            name='class_loss') * self.class_scale

        # object_loss
        object_delta = object_mask * (predict_scales - iou_predict_truth)
        object_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
            name='object_loss') * self.object_scale

        # noobject_loss
        noobject_delta = noobject_mask * predict_scales
        noobject_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
            name='noobject_loss') * self.noobject_scale

        # coord_loss
        coord_mask = tf.expand_dims(object_mask, 4)
        boxes_delta = coord_mask * (predict_boxes - boxes_tran)
        coord_loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
            name='coord_loss') * self.coord_scale

        tf.losses.add_loss(class_loss)
        tf.losses.add_loss(object_loss)
        tf.losses.add_loss(noobject_loss)
        tf.losses.add_loss(coord_loss)
上面的注释已经详细说明了过程,但为了防止遗忘,我还是需要着重讲一下。再次感慨一下神经网络的强大之处。随便设置的1470个输出,再设计一个loss函数,就决定了每个输出的用途。单单看1470个输出,完全不知道哪个输出类别,哪个输出box坐标。注意box的坐标是(x, y, sqrt(w), sqrt(h))。也就是说x,y是box的中心点坐标,宽高其实是w和h的开根号。
现在拆开1470输出,看看到底每个参数是干嘛的。
首先给出一些值:
self.boundary1 = 7x7x20=980
self.boundary2 = 7x7x20 + 7x7x2=1078
- predict_classes
#类别,shape为(45, 7, 7, 20)
predict_classes = tf.reshape(
predicts[:, :self.boundary1],
[self.batch_size, self.cell_size, self.cell_size, self.num_class])
1470的前980是输出每个框的类别。
- predict_scales
#背景,shape为(45, 7, 7, 2)
predict_scales = tf.reshape(
predicts[:, self.boundary1:self.boundary2],
[self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell])
1470的980:1078是输出每个框的是否有物体的置信度值。这里每个点有两个框,每个框均有一个值。
- predict_boxes
#box大小,长宽等 shape为(45, 7, 7, 2, 4)
predict_boxes = tf.reshape(
predicts[:, self.boundary2:],
[self.batch_size, self.cell_size, self.cell_size, self.boxes_per_cell, 4])
1470的1078:之后就是每个框的坐标。每个点有2个框。
再次佩服网络的强大。
把坐标对应
为什么这么做?
坐标归一化:
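(x,y,w,h)是网络的输出,其实是归一化后的输出。(xc,yc)是物体在图片上的实际中心,图像的宽为width_image,高为height_image。S是图片的切分数目,(col,row)是中心所落在的(SxS)网格中那个cell的坐标。归一化方式为:x = xc*S/width_image - col,y = yc*S/height_image - row,w = w_box/width_image,h = h_box/height_image。怎么归一化点这里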
在训练时,需要构造每个单元的输出意义。虽然YOLO预测的是物体中心坐标,但这里需要告诉网络每个box应该预测哪个部分,把所有的坐标归到不同的box中。这里的代码是进行(x, y, w, h)的归一化。
#shape为 (1,7, 7, 2)
offset = tf.tile(offset, [self.batch_size, 1, 1, 1])
#shape为(45, 7, 7, 2)
offset_tran = tf.transpose(offset, (0, 2, 1, 3))
#shape为(45, 7, 7, 2, 4)
predict_boxes_tran = tf.stack(
[(predict_boxes[..., 0] + offset) / self.cell_size,
(predict_boxes[..., 1] + offset_tran) / self.cell_size,
tf.square(predict_boxes[..., 2]),
tf.square(predict_boxes[..., 3])], axis=-1)
这里同样将标签进行归一化 [x, y, w, h] -> [xc, yc, sqrt(w), sqrt(h)]
boxes_tran = tf.stack([boxes[:, :, :, :, 0] * self.cell_size - offset,
boxes[:, :, :, :, 1] * self.cell_size - tf.transpose(offset, (0, 2, 1, 3)),
tf.sqrt(boxes[:, :, :, :, 2]),
tf.sqrt(boxes[:, :, :, :, 3])])
boxes_tran = tf.transpose(boxes_tran, [1, 2, 3, 4, 0])
上面只是相减了,其实在数据读取的时候已经进行过相除操作:
这里插入pascal_voc.py的load_pascal_annotation函数:
- load_pascal_annotation()如下:
def load_pascal_annotation(self, index):
    """
    Load image and bounding boxes info from XML file in the PASCAL VOC
    format.

    Args:
        index: image identifier; ``<index>.jpg`` and ``<index>.xml`` are
            read from the JPEGImages / Annotations folders of ``data_path``.

    Returns:
        (label, number of objects in the XML file). ``label`` is a
        (cell_size, cell_size, 25) array; for a cell that contains an
        object centre, index 0 is set to 1, indices 1:5 hold
        (x_center, y_center, w, h) scaled to ``image_size`` and
        index 5 + class is set to 1.
    """
    imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
    im = cv2.imread(imname)
    # The image is only read for its size: the boxes are rescaled to
    # image_size, the image itself is not resized here.
    h_ratio = 1.0 * self.image_size / im.shape[0]
    w_ratio = 1.0 * self.image_size / im.shape[1]

    label = np.zeros((self.cell_size, self.cell_size, 25))
    filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
    tree = ET.parse(filename)
    objs = tree.findall('object')

    for obj in objs:
        bbox = obj.find('bndbox')
        # Make pixel indexes 0-based, rescale, and clamp to the image
        x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
        y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
        x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
        y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
        cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
        # Corner form -> (x_center, y_center, w, h)
        boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
        # Grid cell that contains the box centre
        x_ind = int(boxes[0] * self.cell_size / self.image_size)
        y_ind = int(boxes[1] * self.cell_size / self.image_size)
        # One object per cell: the first one found wins
        if label[y_ind, x_ind, 0] == 1:
            continue
        label[y_ind, x_ind, 0] = 1
        label[y_ind, x_ind, 1:5] = boxes
        label[y_ind, x_ind, 5 + cls_ind] = 1

    return label, len(objs)
下面:
计算iou:
iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)
#找出iou_predict_truth 第 3维度(即box_per_cell)维度计算得到的最大值构成一个tensor
object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
#object_mask:表示有目标 以及 目标与gt的IOU
object_mask = tf.cast(
(iou_predict_truth >= object_mask), tf.float32) * response
- 计算每个cell中预测box与标签box的iou
- 取每个cell内各个box中最大的iou值
- iou等于该最大值、并且该cell确实有物体(response为1)的box,被标记为负责预测该物体(这里并没有使用阈值)
- 计算类别loss
class_delta = response * (predict_classes - classes)
class_loss = tf.reduce_mean(
tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
name='class_loss') * self.class_scale
- 计算box的坐标loss
#需要判断第i个cell中第j个box会否负责这个object
#坐标四个维度对应求差,平方和
coord_mask = tf.expand_dims(object_mask, 4)
boxes_delta = coord_mask * (predict_boxes - boxes_tran)
coord_loss = tf.reduce_mean(
tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
name='coord_loss') * self.coord_scale
- 含有object的box的confidence预测loss
object_delta = object_mask * (predict_scales - iou_predict_truth)
object_loss = tf.reduce_mean(
tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
name='object_loss') * self.object_scale
- 不含object的box的confidence预测loss
#不含object的box的confidence预测
noobject_delta = noobject_mask * predict_scales
noobject_loss = tf.reduce_mean(
tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
name='noobject_loss') * self.noobject_scale
这里还有很多值得探究的问题,暂时先到这里吧。。。。。。
以后再到这里来吧。。。。。。。
test.py
进入main()
老规矩,直接上关键代码:
# test.py
def main():
.........
yolo = YOLONet(False)
weight_file = os.path.join(args.data_dir, args.weight_dir, args.weights)
detector = Detector(yolo, weight_file)
# detect from camera
# cap = cv2.VideoCapture(0)
# detector.camera_detector(cap)
# detect from image file
imname = 'test/person.jpg'
detector.image_detector(imname)
1. 这里调用yolo = YOLONet(False),以测试模式(is_training=False)构建网络。
2. 使用detector = Detector(yolo, weight_file)进行检测。
看看Detector()
Detector
def __init__(self, net, weight_file):
    """Store the network, read the test settings and restore the weights.

    Args:
        net: network object used for inference.
        weight_file: path of the checkpoint to restore.
    """
    self.net = net
    self.weights_file = weight_file
    print(self.weights_file)

    self.classes = cfg.CLASSES
    self.num_class = len(self.classes)
    self.image_size = cfg.IMAGE_SIZE
    self.cell_size = cfg.CELL_SIZE
    self.boxes_per_cell = cfg.BOXES_PER_CELL
    self.threshold = cfg.THRESHOLD
    self.iou_threshold = cfg.IOU_THRESHOLD
    # Split points of the flat network output:
    # [0, boundary1) class scores, [boundary1, boundary2) box confidences,
    # [boundary2, end) box coordinates.
    self.boundary1 = self.cell_size * self.cell_size * self.num_class
    self.boundary2 = self.boundary1 +\
        self.cell_size * self.cell_size * self.boxes_per_cell

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    print('Restoring weights from: ' + self.weights_file)
    self.saver = tf.train.Saver()
    self.saver.restore(self.sess, self.weights_file)
初始化函数没什么,先把yolo的参数初始化(从ckpt文件中restore)。
注意在main函数中使用了
imname = 'test/person.jpg'
detector.image_detector(imname)
那就看看image_detector()
- image_detector()
def image_detector(self, imname, wait=0):
    """Run detection on one image file, draw the result and show it.

    Args:
        imname: path of the image to read.
        wait: value passed to ``cv2.waitKey`` after the image is shown.
    """
    detect_timer = Timer()
    image = cv2.imread(imname)

    # Only the detection itself is timed
    detect_timer.tic()
    result = self.detect(image)
    detect_timer.toc()
    print('Average detecting time: {:.3f}s'.format(
        detect_timer.average_time))

    self.draw_result(image, result)
    cv2.imshow('Image', image)
    cv2.waitKey(wait)
这里使用了 self.detect(image)进行box和classes的预测。
并且使用self.draw_result(image, result)画image和box。
- draw_result()
def draw_result(self, img, result):
    """Draw every detection of ``result`` onto ``img`` in place.

    Args:
        img: image to draw on.
        result: list of detections, each
            ``[class_name, x_center, y_center, w, h, probability]``.
    """
    # Anti-aliased line type: the constant is cv2.LINE_AA where available
    # and cv2.CV_AA otherwise. Checking for the attribute avoids comparing
    # version strings lexicographically.
    lineType = cv2.LINE_AA if hasattr(cv2, 'LINE_AA') else cv2.CV_AA

    for det in result:
        x = int(det[1])      # box centre x
        y = int(det[2])      # box centre y
        w = int(det[3] / 2)  # half width
        h = int(det[4] / 2)  # half height
        # Bounding box, from its top-left to its bottom-right corner
        cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
        # Filled grey strip above the box that carries the caption
        cv2.rectangle(img, (x - w, y - h - 20),
                      (x + w, y - h), (125, 125, 125), -1)
        # Caption: class name and probability with two decimals
        cv2.putText(
            img, det[0] + ' : %.2f' % det[5],
            (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
            (0, 0, 0), 1, lineType)
这里的x,y是每个box的中心的坐标,在opencv上画出box需要左上和右下两个角点的坐标。
- self.detect(image)
def detect(self, img):
    """Detect objects in one BGR image and return boxes in its own pixel scale.

    Args:
        img: image array whose shape is (height, width, channels).

    Returns:
        List of ``[class_name, x_center, y_center, w, h, probability]``
        with the box rescaled from the network input size to ``img``.
    """
    img_h, img_w, _ = img.shape
    # Network input: resized, RGB, float32, values mapped from [0, 255]
    # to [-1, 1], with a leading batch axis of 1.
    inputs = cv2.resize(img, (self.image_size, self.image_size))
    inputs = cv2.cvtColor(inputs, cv2.COLOR_BGR2RGB).astype(np.float32)
    inputs = (inputs / 255.0) * 2.0 - 1.0
    inputs = np.reshape(inputs, (1, self.image_size, self.image_size, 3))

    result = self.detect_from_cvmat(inputs)[0]

    # Scale the boxes from the network input size back to the original image
    x_scale = 1.0 * img_w / self.image_size
    y_scale = 1.0 * img_h / self.image_size
    for det in result:
        det[1] *= x_scale
        det[2] *= y_scale
        det[3] *= x_scale
        det[4] *= y_scale

    return result
这里返回合适的box,并把box的坐标还原到当前的image上去。
来到detect_from_cvmat(inputs)
- self.detect_from_cvmat(inputs)
def detect_from_cvmat(self, inputs):
    """Run the network on a batch and decode every output row.

    Args:
        inputs: batch fed to the network's ``images`` placeholder.

    Returns:
        One ``interpret_output`` result per row of the network output.
    """
    net_output = self.sess.run(self.net.logits,
                               feed_dict={self.net.images: inputs})
    return [self.interpret_output(row) for row in net_output]
从这里可以看到,net_output就是yolo网络的输出shape=[None, 1470],拿到所有的预测值,现在需要删掉不合适的预测值,操作在interpret_output()中
- interpret_output()
def interpret_output(self, output):
    """Decode one flat network output vector into a list of detections.

    Args:
        output: 1-D array, sliced at ``boundary1`` / ``boundary2`` into
            class scores, box confidences and box coordinates. The box
            slice is decoded in place.

    Returns:
        List of ``[class_name, x_center, y_center, w, h, probability]``,
        sorted by decreasing probability, after thresholding and
        IoU-based suppression of overlapping boxes.
    """
    # Per-cell class scores, (S, S, C)
    class_probs = np.reshape(
        output[0:self.boundary1],
        (self.cell_size, self.cell_size, self.num_class))
    # Per-box confidence, (S, S, K)
    scales = np.reshape(
        output[self.boundary1:self.boundary2],
        (self.cell_size, self.cell_size, self.boxes_per_cell))
    # Per-box coordinates, (S, S, K, 4)
    boxes = np.reshape(
        output[self.boundary2:],
        (self.cell_size, self.cell_size, self.boxes_per_cell, 4))

    # offset[i, j, k] == j; built as (K, S, S) and transposed to (S, S, K)
    offset = np.array(
        [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
    offset = np.transpose(
        np.reshape(
            offset,
            [self.boxes_per_cell, self.cell_size, self.cell_size]),
        (1, 2, 0))

    # Undo the normalisation: add the cell index to x / y and divide by S,
    # square the last two components, then scale everything to image_size.
    boxes[:, :, :, 0] += offset
    boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
    boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
    boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])
    boxes *= self.image_size

    # Score of class j for box i of a cell = class score * box confidence
    probs = np.zeros((self.cell_size, self.cell_size,
                      self.boxes_per_cell, self.num_class))
    for i in range(self.boxes_per_cell):
        for j in range(self.num_class):
            probs[:, :, i, j] = np.multiply(
                class_probs[:, :, j], scales[:, :, i])

    # Keep every (cell row, cell col, box, class) entry reaching the threshold
    filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')
    filter_mat_boxes = np.nonzero(filter_mat_probs)
    boxes_filtered = boxes[filter_mat_boxes[0],
                           filter_mat_boxes[1], filter_mat_boxes[2]]
    probs_filtered = probs[filter_mat_probs]
    # The class of each kept entry is its index on the class axis. Taking
    # argmax over the boolean mask instead would return the first class
    # above the threshold and mislabel boxes where several classes pass.
    classes_num_filtered = filter_mat_boxes[3]

    # Sort by decreasing score
    argsort = np.array(np.argsort(probs_filtered))[::-1]
    boxes_filtered = boxes_filtered[argsort]
    probs_filtered = probs_filtered[argsort]
    classes_num_filtered = classes_num_filtered[argsort]

    # Suppression: a kept box zeroes the score of every lower-ranked box
    # that overlaps it by more than iou_threshold.
    for i in range(len(boxes_filtered)):
        if probs_filtered[i] == 0:
            continue
        for j in range(i + 1, len(boxes_filtered)):
            if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:
                probs_filtered[j] = 0.0

    filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
    boxes_filtered = boxes_filtered[filter_iou]
    probs_filtered = probs_filtered[filter_iou]
    classes_num_filtered = classes_num_filtered[filter_iou]

    result = []
    for i in range(len(boxes_filtered)):
        result.append(
            [self.classes[classes_num_filtered[i]],
             boxes_filtered[i][0],
             boxes_filtered[i][1],
             boxes_filtered[i][2],
             boxes_filtered[i][3],
             probs_filtered[i]])

    return result
这里选出合适的box的坐标。
在最后需要介绍一下pascal_voc.py文件
pascal_voc.py
在test.py文件里没有使用这个文件,只有train.py使用了
在train中使用了三次。
class Solver(object):
def __init__(self, net, data):
self.data = data
........
def train():
............
for step in range(1, self.max_iter + 1): #最大迭代:15000
load_timer.tic() #开始计时
images, labels = self.data.get()
............
log_str = "{} Epoch: {}, Step: {}, Learning rate: {},"\ #报文字符串内容
"Loss: {:5.3f}\nSpeed: {:.3f}s/iter,"\
"Load: {:.3f}s/iter, Remain: {}".format(
datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
self.data.epoch,
int(step),
round(self.learning_rate.eval(session=self.sess), 6),
loss,
train_timer.average_time,
load_timer.average_time,
train_timer.remain(step, self.max_iter))
就进入pascal_voc看看
class pascal_voc(object):
    """Data source built from the configured PASCAL VOC paths."""

    def __init__(self, phase, rebuild=False):
        """Read the data settings from ``cfg`` and prepare the labels.

        Args:
            phase: phase name, stored as ``self.phase`` (the training
                script passes ``'train'``).
            rebuild: stored as ``self.rebuild``.
        """
        # <PASCAL_PATH>/VOCdevkit and <PASCAL_PATH>/VOCdevkit/VOC2007
        self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        # Class name -> index 0, 1, 2, ... in the order of cfg.CLASSES
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        self.flipped = cfg.FLIPPED
        self.phase = phase
        self.rebuild = rebuild
        self.cursor = 0       # position of the next record in gt_labels
        self.epoch = 1
        self.gt_labels = None
        self.prepare()
这里的self.prepare()
self.prepare()
def prepare(self):
    """Load the ground-truth records, optionally add mirrored copies, shuffle.

    When ``self.flipped`` is true, a deep copy of every record is appended
    with its label mirrored horizontally. The resulting list is shuffled,
    stored in ``self.gt_labels`` and returned.
    """
    gt_labels = self.load_labels()
    if self.flipped:
        print('Appending horizontally-flipped training examples ...')
        gt_labels_cp = copy.deepcopy(gt_labels)
        for record in gt_labels_cp:
            record['flipped'] = True
            # Mirror the grid: reverse the second axis of the label
            mirrored = record['label'][:, ::-1, :]
            # In cells that hold an object (index 0 is 1), index 1 must be
            # mirrored too: v -> image_size - 1 - v
            has_object = mirrored[..., 0] == 1
            mirrored[has_object, 1] = \
                self.image_size - 1 - mirrored[has_object, 1]
            record['label'] = mirrored
        gt_labels += gt_labels_cp
    np.random.shuffle(gt_labels)
    self.gt_labels = gt_labels
    return gt_labels
直接获取labels。
同时假若需要翻转:
flipped是数据增强gt_labels_cp[idx]['label'][i, j, 1] = self.image_size - 1 -gt_labels_cp[idx]['label'][i, j, 1] 其实就是求水平反转后的坐标。
最后把做了数据增强的返回,这个地方只是做了水平反转。同时把最终的结果传给gt_labels,self.gt_labels = gt_labels。详细的解释
点这里
常用的get()
- get()
def get(self):
    """Return the next batch of images and labels.

    Records are read from ``self.gt_labels`` starting at ``self.cursor``.
    When the end of the list is reached the list is reshuffled, the cursor
    restarts at 0 and ``self.epoch`` is incremented.

    Returns:
        (images, labels): arrays of shape
        (batch_size, image_size, image_size, 3) and
        (batch_size, cell_size, cell_size, 25).
    """
    images = np.zeros(
        (self.batch_size, self.image_size, self.image_size, 3))
    labels = np.zeros(
        (self.batch_size, self.cell_size, self.cell_size, 25))
    for count in range(self.batch_size):
        record = self.gt_labels[self.cursor]
        images[count, :, :, :] = self.image_read(
            record['imname'], record['flipped'])
        labels[count, :, :, :] = record['label']
        self.cursor += 1
        # End of the list: one pass over the data is complete
        if self.cursor >= len(self.gt_labels):
            np.random.shuffle(self.gt_labels)
            self.cursor = 0
            self.epoch += 1
    return images, labels
参考:
YOLO源码解析
深度学习实战03】——YOLO tensorflow运行及源码解析