TensorFlow-YOLO_V1训练代码梳理,相信看过后对yolo-v1的整个训练流程都一目了然了。
硬件:NVIDIA-GTX1080
软件:Windows7、python3.6.5、tensorflow-gpu-1.4.0
一、基础知识
1、yolo-v1的loss目标分类不是21类,而是20+2类,其中20是目标分类,2是背景和目标的二分类
2、total_loss = 1.0*class_loss + 1.0*object_loss + 0.5*noobject_loss + 5.0*coord_loss(代码中通过tf.losses.get_total_loss()求和,因此还会额外加上卷积层和全连接层权重的L2正则项)
class_loss:20类分类误差(有目标情况下)
object_loss:有目标时的置信度误差(负责预测的边框,其置信度标签为在线计算的IOU)
noobject_loss:无目标时的置信度误差(置信度标签为0)
coord_loss:边框回归误差(有目标情况下)
二、数据和预训练模型下载
1、数据是voc2007-trainval,存放路径是data/pascal_voc/VOCdevkit/VOC2007
2、预训练模型:https://pan.baidu.com/s/1517r5zxY01vQgrEJUqGEfw (xh45) ,存放路径是data/weights
三、代码展示(做了详细注释)
1、train.py
import os
import argparse
import datetime
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc
slim = tf.contrib.slim
class Solver(object):
    """Drive YOLO training: optimiser setup, summaries and checkpoints.

    Args:
        net: network object exposing ``images``, ``labels`` and
            ``total_loss`` (a ``YOLONet`` built for training).
        data: batch provider exposing ``get()`` and ``epoch``
            (a ``pascal_voc`` instance).
    """

    def __init__(self, net, data):
        self.net = net
        self.data = data
        # Pre-trained checkpoint to restore; None skips the restore step.
        self.weights_file = cfg.WEIGHTS_FILE
        # Total number of training iterations.
        self.max_iter = cfg.MAX_ITER
        # Learning-rate schedule parameters (see exponential_decay below).
        self.initial_learning_rate = cfg.LEARNING_RATE
        self.decay_steps = cfg.DECAY_STEPS
        self.decay_rate = cfg.DECAY_RATE
        self.staircase = cfg.STAIRCASE
        # Write a summary every `summary_iter` steps.
        self.summary_iter = cfg.SUMMARY_ITER
        # Write a checkpoint every `save_iter` steps.
        self.save_iter = cfg.SAVE_ITER
        # Each run gets its own time-stamped output directory.
        self.output_dir = os.path.join(
            cfg.OUTPUT_DIR, datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.output_dir, exist_ok=True)
        # Record the configuration used for this run.
        self.save_cfg()

        # The saver is built from the variables that exist at this point,
        # i.e. before the global step and the train op are created below.
        # Consequently checkpoints restored/saved through it contain only
        # those variables (presumably so that the pre-trained network
        # checkpoint can be restored as-is -- confirm).
        self.variable_to_restore = tf.global_variables()
        self.saver = tf.train.Saver(self.variable_to_restore, max_to_keep=None)
        # Checkpoint path prefix.
        self.ckpt_file = os.path.join(self.output_dir, 'yolo')
        self.summary_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(self.output_dir, flush_secs=60)

        # decayed_learning_rate =
        #     learning_rate * decay_rate ^ (global_step / decay_steps)
        # with staircase=True the exponent is int(global_step / decay_steps).
        self.global_step = tf.train.create_global_step()
        self.learning_rate = tf.train.exponential_decay(
            self.initial_learning_rate, self.global_step, self.decay_steps,
            self.decay_rate, self.staircase, name='learning_rate')
        # Optimiser and train op.
        self.optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.learning_rate)
        self.train_op = slim.learning.create_train_op(
            self.net.total_loss, self.optimizer, global_step=self.global_step)

        # Session.
        gpu_options = tf.GPUOptions()
        config = tf.ConfigProto(gpu_options=gpu_options)
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())

        # Load the pre-trained weights on top of the fresh initialisation.
        if self.weights_file is not None:
            print('Restoring weights from: ' + self.weights_file)
            self.saver.restore(self.sess, self.weights_file)

        self.writer.add_graph(self.sess.graph)

    def train(self):
        """Run ``max_iter`` training steps.

        Every ``summary_iter`` steps a summary is written; every
        ``summary_iter * 10`` steps a progress line is also printed. Every
        ``save_iter`` steps a checkpoint is saved. Pending summaries are
        flushed to disk when the loop finishes.
        """
        train_timer = Timer()
        load_timer = Timer()
        print_iter = self.summary_iter * 10

        for step in range(1, self.max_iter + 1):
            load_timer.tic()
            images, labels = self.data.get()
            load_timer.toc()
            feed_dict = {self.net.images: images,
                         self.net.labels: labels}

            if step % self.summary_iter != 0:
                # Plain optimisation step: no summary, no console output.
                train_timer.tic()
                self.sess.run(self.train_op, feed_dict=feed_dict)
                train_timer.toc()
            else:
                if step % print_iter == 0:
                    # Summary step that also fetches the loss for logging.
                    train_timer.tic()
                    summary_str, loss, _ = self.sess.run(
                        [self.summary_op, self.net.total_loss, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                    log_str = (
                        '{} Epoch: {}, Step: {}, Learning rate: {},Loss: {:5.3f}\n'
                        'Speed: {:.3f}s/iter, Load: {:.3f}s/iter, Remain: {}'
                    ).format(
                        datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                        self.data.epoch,
                        int(step),
                        round(self.learning_rate.eval(session=self.sess), 6),
                        loss,
                        train_timer.average_time,
                        load_timer.average_time,
                        train_timer.remain(step, self.max_iter))
                    print(log_str)
                else:
                    # Summary-only step.
                    train_timer.tic()
                    summary_str, _ = self.sess.run(
                        [self.summary_op, self.train_op],
                        feed_dict=feed_dict)
                    train_timer.toc()

                self.writer.add_summary(summary_str, step)

            # Periodic checkpoint.
            if step % self.save_iter == 0:
                print('{} Saving checkpoint file to: {}'.format(
                    datetime.datetime.now().strftime('%m-%d %H:%M:%S'),
                    self.output_dir))
                self.saver.save(
                    self.sess, self.ckpt_file, global_step=self.global_step)

        # The writer only flushes every `flush_secs`; make sure the last
        # summaries reach disk before the caller exits.
        self.writer.flush()

    def save_cfg(self):
        """Dump every upper-case attribute of the config module to
        ``config.txt`` in the output directory, one ``KEY: value`` per line."""
        with open(os.path.join(self.output_dir, 'config.txt'), 'w') as f:
            cfg_dict = cfg.__dict__
            for key in sorted(cfg_dict):
                if key[0].isupper():
                    f.write('{}: {}\n'.format(key, cfg_dict[key]))
def update_config_paths(data_dir, weights_file):
    """Re-root every path stored in the config module under ``data_dir``.

    Args:
        data_dir: new data root directory.
        weights_file: checkpoint file name, resolved inside the weights
            directory.

    NOTE(review): the weights directory is placed under
    ``<data_dir>/pascal_voc/weights`` here, whereas the default
    ``WEIGHTS_FILE`` in the config module points at ``<DATA_PATH>/weights``.
    Confirm which layout is intended before relying on this function.
    """
    pascal_root = os.path.join(data_dir, 'pascal_voc')
    weights_root = os.path.join(pascal_root, 'weights')

    cfg.DATA_PATH = data_dir
    cfg.PASCAL_PATH = pascal_root
    cfg.CACHE_PATH = os.path.join(pascal_root, 'cache')
    cfg.OUTPUT_DIR = os.path.join(pascal_root, 'output')
    cfg.WEIGHTS_DIR = weights_root
    cfg.WEIGHTS_FILE = os.path.join(weights_root, weights_file)
def main():
    """Build the network and the VOC data provider, then run training.

    Command-line handling is disabled in this version: every setting is
    taken from the config module and command-line flags are ignored.
    """
    # Disabled command-line handling, kept for reference:
    #
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--weights', default="YOLO_small.ckpt", type=str)
    # parser.add_argument('--data_dir', default="data", type=str)
    # parser.add_argument('--threshold', default=0.2, type=float)
    # parser.add_argument('--iou_threshold', default=0.5, type=float)
    # parser.add_argument('--gpu', default='', type=str)
    # args = parser.parse_args()
    # if args.gpu is not None:
    #     cfg.GPU = args.gpu
    # if args.data_dir != cfg.DATA_PATH:
    #     update_config_paths(args.data_dir, args.weights)
    # os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPU

    network = YOLONet()
    dataset = pascal_voc('train')
    trainer = Solver(network, dataset)

    print('Start training ...')
    trainer.train()
    print('Done training.')


if __name__ == '__main__':
    # Original usage: python train.py --weights YOLO_small.ckpt --gpu 0
    # (the flags have no effect while argument parsing is disabled in main).
    main()
2、utils/pascal_voc.py
import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg
class pascal_voc(object):
    """PASCAL VOC 2007 batch provider that produces YOLO-v1 grid labels.

    Each label is a ``(cell_size, cell_size, 5 + num_classes)`` array:
    channel 0 is the object flag, channels 1:5 hold the box
    ``(x_center, y_center, w, h)`` in pixels of the resized image, and the
    remaining channels are the one-hot class.

    Args:
        phase: ``'train'`` reads ``trainval.txt``; anything else reads
            ``test.txt``.
        rebuild: when True the cached label file is ignored and rebuilt.
    """

    def __init__(self, phase, rebuild=False):
        self.devkil_path = os.path.join(cfg.PASCAL_PATH, 'VOCdevkit')
        # Dataset root.
        self.data_path = os.path.join(self.devkil_path, 'VOC2007')
        # Directory holding the pickled label cache.
        self.cache_path = cfg.CACHE_PATH
        self.batch_size = cfg.BATCH_SIZE
        # Side length of the (square) network input.
        self.image_size = cfg.IMAGE_SIZE
        # Number of grid cells per side.
        self.cell_size = cfg.CELL_SIZE
        self.classes = cfg.CLASSES
        # {'aeroplane': 0, 'bicycle': 1, ...}
        self.class_to_ind = dict(zip(self.classes, range(len(self.classes))))
        # Whether to append horizontally flipped copies (data augmentation).
        self.flipped = cfg.FLIPPED
        self.phase = phase
        self.rebuild = rebuild
        # Position of the next sample in gt_labels and the current epoch.
        self.cursor = 0
        self.epoch = 1
        self.gt_labels = None
        self.prepare()

    def get(self):
        """Return the next batch as ``(images, labels)`` numpy arrays.

        When the cursor passes the last sample the sample list is reshuffled,
        the cursor is reset and ``epoch`` is incremented, so a batch may span
        two epochs.
        """
        images = np.zeros(
            (self.batch_size, self.image_size, self.image_size, 3))
        labels = np.zeros(
            (self.batch_size, self.cell_size, self.cell_size,
             5 + len(self.classes)))
        for slot in range(self.batch_size):
            sample = self.gt_labels[self.cursor]
            images[slot, :, :, :] = self.image_read(
                sample['imname'], sample['flipped'])
            labels[slot, :, :, :] = sample['label']
            self.cursor += 1
            # One full pass done: reshuffle before starting the next epoch.
            if self.cursor >= len(self.gt_labels):
                np.random.shuffle(self.gt_labels)
                self.cursor = 0
                self.epoch += 1
        return images, labels

    def image_read(self, imname, flipped=False):
        """Load an image, resize it to the network input size, convert it to
        RGB float32 scaled to [-1, 1], and optionally mirror it horizontally.

        Raises:
            IOError: if the image file cannot be read.
        """
        image = cv2.imread(imname)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Failed to read image: ' + imname)
        image = cv2.resize(image, (self.image_size, self.image_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image = (image / 255.0) * 2.0 - 1.0
        if flipped:
            image = image[:, ::-1, :]
        return image

    def prepare(self):
        """Load the labels, optionally append flipped copies, shuffle, and
        store the result in ``self.gt_labels`` (which is also returned)."""
        gt_labels = self.load_labels()
        if self.flipped:
            print('Appending horizontally-flipped training examples ...')
            gt_labels_cp = copy.deepcopy(gt_labels)
            for entry in gt_labels_cp:
                entry['flipped'] = True
                # Reversing the column axis moves each object to its mirrored
                # cell, but the stored x coordinate is still the old one ...
                mirrored = entry['label'][:, ::-1, :]
                # ... so mirror x_center for every cell that holds an object.
                occupied = mirrored[..., 0] == 1
                mirrored[occupied, 1] = \
                    self.image_size - 1 - mirrored[occupied, 1]
                entry['label'] = mirrored
            gt_labels += gt_labels_cp
        np.random.shuffle(gt_labels)
        self.gt_labels = gt_labels
        return gt_labels

    def load_labels(self):
        """Return the list of ``{'imname', 'label', 'flipped'}`` records.

        The list is read from the pickle cache when it exists and ``rebuild``
        is False; otherwise it is built from the annotations and cached.
        Images without any annotated object are skipped.
        """
        cache_file = os.path.join(
            self.cache_path, 'pascal_' + self.phase + '_gt_labels.pkl')

        if os.path.isfile(cache_file) and not self.rebuild:
            print('Loading gt_labels from: ' + cache_file)
            # NOTE: pickle executes code on load; only use cache files that
            # were written by this class.
            with open(cache_file, 'rb') as f:
                gt_labels = pickle.load(f)
            return gt_labels

        print('Processing gt_labels from: ' + self.data_path)
        os.makedirs(self.cache_path, exist_ok=True)

        split_name = 'trainval.txt' if self.phase == 'train' else 'test.txt'
        txtname = os.path.join(
            self.data_path, 'ImageSets', 'Main', split_name)
        # image_index holds the base name of every image in the split;
        # blank lines are ignored.
        with open(txtname, 'r') as f:
            self.image_index = [line.strip() for line in f if line.strip()]

        gt_labels = []
        for index in self.image_index:
            label, num = self.load_pascal_annotation(index)
            if num == 0:
                continue
            imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
            gt_labels.append({'imname': imname,
                              'label': label,
                              'flipped': False})
        print('Saving gt_labels to: ' + cache_file)
        with open(cache_file, 'wb') as f:
            pickle.dump(gt_labels, f)
        return gt_labels

    def load_pascal_annotation(self, index):
        """Build the grid label of one image from its PASCAL VOC XML file.

        Args:
            index: image base name (without extension).

        Returns:
            ``(label, num_objects)`` where ``num_objects`` is the number of
            ``<object>`` entries in the XML, including any that were skipped
            because their cell was already taken.

        Raises:
            IOError: if the image file cannot be read.
        """
        imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
        im = cv2.imread(imname)
        if im is None:
            raise IOError('Failed to read image: ' + imname)
        # Scale factors from the original image to the network input size
        # (shape[0] is the height, shape[1] the width).
        h_ratio = 1.0 * self.image_size / im.shape[0]
        w_ratio = 1.0 * self.image_size / im.shape[1]

        # [0] object flag, [1:5] box, [5:] one-hot class; all zero initially.
        label = np.zeros(
            (self.cell_size, self.cell_size, 5 + len(self.classes)))
        filename = os.path.join(self.data_path, 'Annotations', index + '.xml')
        tree = ET.parse(filename)
        objs = tree.findall('object')
        max_coord = self.image_size - 1

        for obj in objs:
            bbox = obj.find('bndbox')
            # Convert the 1-based VOC corners to resized-image pixels and
            # clamp them to the image.
            x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, max_coord), 0)
            y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, max_coord), 0)
            x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, max_coord), 0)
            y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, max_coord), 0)
            cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
            # (x_center, y_center, w, h) in pixels; division by the image
            # size happens later, in the loss layer of the network.
            boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
            # Grid cell that contains the box centre.
            x_ind = int(boxes[0] * self.cell_size / self.image_size)
            y_ind = int(boxes[1] * self.cell_size / self.image_size)
            # A cell keeps only the first object assigned to it.
            if label[y_ind, x_ind, 0] == 1:
                continue
            # Row index (y) first, then column (x), as in image arrays.
            label[y_ind, x_ind, 0] = 1
            label[y_ind, x_ind, 1:5] = boxes
            label[y_ind, x_ind, 5 + cls_ind] = 1

        return label, len(objs)
3、utils/timer.py
import time
import datetime
class Timer(object):
    """A simple tic/toc timer that also tracks a running average and can
    estimate the remaining time of a fixed-length loop."""

    def __init__(self):
        # Creation time; the reference point for remain().
        self.init_time = time.time()
        # Sum of all measured intervals and how many there were.
        self.total_time = 0.
        self.calls = 0
        # Start of the interval currently being measured.
        self.start_time = 0.
        # Length of the most recent interval.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        """Start measuring an interval."""
        # time.time is used instead of time.clock because time.clock
        # does not normalize for multithreading.
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop measuring and update the statistics.

        Returns:
            The running average interval when ``average`` is True, otherwise
            the length of the interval that just ended.
        """
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

    def remain(self, iters, max_iters):
        """Estimate the time left after ``iters`` of ``max_iters`` iterations.

        The estimate extrapolates the time elapsed since the timer was
        created. It is clamped at zero, so passing ``iters`` greater than
        ``max_iters`` (or a non-positive ``iters``) yields ``'0:00:00'``
        rather than a negative duration.

        Returns:
            The estimate formatted by ``datetime.timedelta`` (``H:MM:SS``).
        """
        if iters <= 0:
            self.remain_time = 0
        else:
            elapsed = time.time() - self.init_time
            self.remain_time = max(
                elapsed * (max_iters - iters) / iters, 0)
        return str(datetime.timedelta(seconds=int(self.remain_time)))
4、yolo/yolo_net.py
import numpy as np
import tensorflow as tf
import yolo.config as cfg
slim = tf.contrib.slim
class YOLONet(object):
    """YOLO-v1 network graph and, when training, its loss.

    The flat network output is laid out as
    ``[class probabilities | box confidences | box coordinates]`` with sizes
    ``S*S*C``, ``S*S*B`` and ``S*S*B*4`` (S = cell_size, C = num_class,
    B = boxes_per_cell); ``boundary1`` and ``boundary2`` are the split points.

    Args:
        is_training: when True the label placeholder, the loss and the
            ``total_loss`` summary are added to the graph.
    """

    def __init__(self, is_training=True):
        self.classes = cfg.CLASSES
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE
        self.cell_size = cfg.CELL_SIZE
        self.boxes_per_cell = cfg.BOXES_PER_CELL
        self.output_size = (self.cell_size * self.cell_size) *\
            (self.num_class + self.boxes_per_cell * 5)
        self.scale = 1.0 * self.image_size / self.cell_size
        self.boundary1 = self.cell_size * self.cell_size * self.num_class
        self.boundary2 = self.boundary1 +\
            self.cell_size * self.cell_size * self.boxes_per_cell

        # Loss weights: confidence of responsible boxes, confidence of the
        # remaining boxes, class probabilities and box coordinates.
        self.object_scale = cfg.OBJECT_SCALE
        self.noobject_scale = cfg.NOOBJECT_SCALE
        self.class_scale = cfg.CLASS_SCALE
        self.coord_scale = cfg.COORD_SCALE

        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        # Negative slope of the leaky ReLU.
        self.alpha = cfg.ALPHA

        # offset[i, j, b] == j: the column index of every grid cell,
        # shaped (cell_size, cell_size, boxes_per_cell).
        self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))

        self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
        self.logits = self.build_network(
            self.images, num_outputs=self.output_size, alpha=self.alpha,
            is_training=is_training)

        if is_training:
            self.labels = tf.placeholder(
                tf.float32,
                [None, self.cell_size, self.cell_size, 5 + self.num_class])
            self.loss_layer(self.logits, self.labels)
            # Sums the losses registered in loss_layer; by default this also
            # includes the L2 regularisation losses of the layers.
            self.total_loss = tf.losses.get_total_loss()
            tf.summary.scalar('total_loss', self.total_loss)

    def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
        """Build the convolutional backbone and the fully connected head.

        Args:
            images: input image batch tensor.
            num_outputs: size of the final linear layer.
            alpha: leaky-ReLU slope used by every conv / fc layer except the
                last one.
            keep_prob: dropout keep probability.
            is_training: enables dropout when True.
            scope: variable scope wrapping all layers.

        Returns:
            The output of the final linear layer ``fc_36``
            (``num_outputs`` units, no activation).
        """
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')
                net = slim.conv2d(net, 192, 3, scope='conv_4')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_5')
                net = slim.conv2d(net, 128, 1, scope='conv_6')
                net = slim.conv2d(net, 256, 3, scope='conv_7')
                net = slim.conv2d(net, 256, 1, scope='conv_8')
                net = slim.conv2d(net, 512, 3, scope='conv_9')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_10')
                net = slim.conv2d(net, 256, 1, scope='conv_11')
                net = slim.conv2d(net, 512, 3, scope='conv_12')
                net = slim.conv2d(net, 256, 1, scope='conv_13')
                net = slim.conv2d(net, 512, 3, scope='conv_14')
                net = slim.conv2d(net, 256, 1, scope='conv_15')
                net = slim.conv2d(net, 512, 3, scope='conv_16')
                net = slim.conv2d(net, 256, 1, scope='conv_17')
                net = slim.conv2d(net, 512, 3, scope='conv_18')
                net = slim.conv2d(net, 512, 1, scope='conv_19')
                net = slim.conv2d(net, 1024, 3, scope='conv_20')
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_21')
                net = slim.conv2d(net, 512, 1, scope='conv_22')
                net = slim.conv2d(net, 1024, 3, scope='conv_23')
                net = slim.conv2d(net, 512, 1, scope='conv_24')
                net = slim.conv2d(net, 1024, 3, scope='conv_25')
                net = slim.conv2d(net, 1024, 3, scope='conv_26')
                net = tf.pad(
                    net, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                    name='pad_27')
                net = slim.conv2d(
                    net, 1024, 3, 2, padding='VALID', scope='conv_28')
                net = slim.conv2d(net, 1024, 3, scope='conv_29')
                net = slim.conv2d(net, 1024, 3, scope='conv_30')
                # Channels-first before flattening (presumably to match the
                # weight layout of the pre-trained checkpoint -- confirm).
                net = tf.transpose(net, [0, 3, 1, 2], name='trans_31')
                net = slim.flatten(net, scope='flat_32')
                net = slim.fully_connected(net, 512, scope='fc_33')
                net = slim.fully_connected(net, 4096, scope='fc_34')
                net = slim.dropout(
                    net, keep_prob=keep_prob, is_training=is_training,
                    scope='dropout_35')
                net = slim.fully_connected(
                    net, num_outputs, activation_fn=None, scope='fc_36')
        return net

    def calc_iou(self, boxes1, boxes2, scope='iou'):
        """Calculate the IoU between two sets of boxes.

        Args:
            boxes1: 5-D tensor
                [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]
                holding (x_center, y_center, w, h).
            boxes2: 5-D tensor of the same layout as ``boxes1``.
            scope: variable scope for the created ops.

        Returns:
            4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
            with values clipped to [0, 1].
        """
        with tf.variable_scope(scope):
            # Transform (x_center, y_center, w, h) to (x1, y1, x2, y2).
            boxes1_t = tf.stack([boxes1[..., 0] - boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] - boxes1[..., 3] / 2.0,
                                 boxes1[..., 0] + boxes1[..., 2] / 2.0,
                                 boxes1[..., 1] + boxes1[..., 3] / 2.0],
                                axis=-1)

            boxes2_t = tf.stack([boxes2[..., 0] - boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] - boxes2[..., 3] / 2.0,
                                 boxes2[..., 0] + boxes2[..., 2] / 2.0,
                                 boxes2[..., 1] + boxes2[..., 3] / 2.0],
                                axis=-1)

            # Upper-left and lower-right corners of the intersection.
            lu = tf.maximum(boxes1_t[..., :2], boxes2_t[..., :2])
            rd = tf.minimum(boxes1_t[..., 2:], boxes2_t[..., 2:])

            # Intersection area (zero when the boxes do not overlap).
            intersection = tf.maximum(0.0, rd - lu)
            inter_square = intersection[..., 0] * intersection[..., 1]

            # Areas of both boxes; the union is floored to avoid dividing
            # by zero.
            square1 = boxes1[..., 2] * boxes1[..., 3]
            square2 = boxes2[..., 2] * boxes2[..., 3]

            union_square = tf.maximum(square1 + square2 - inter_square, 1e-10)

        return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)

    def loss_layer(self, predicts, labels, scope='loss_layer'):
        """Register the four YOLO-v1 loss terms and their summaries.

        The batch dimension is inferred from the inputs, so any batch size
        accepted by the placeholders can be fed.

        Args:
            predicts: flat network output, [batch, output_size].
            labels: ground truth, [batch, S, S, 5 + num_class]; channel 0 is
                the object flag, 1:5 the box in input-image pixels, 5: the
                one-hot class.
            scope: variable scope for the created ops.
        """
        with tf.variable_scope(scope):
            # Predicted class probabilities (batch, S, S, C).
            predict_classes = tf.reshape(
                predicts[:, :self.boundary1],
                [-1, self.cell_size, self.cell_size, self.num_class])
            # Predicted box confidences (batch, S, S, B).
            predict_scales = tf.reshape(
                predicts[:, self.boundary1:self.boundary2],
                [-1, self.cell_size, self.cell_size, self.boxes_per_cell])
            # Predicted boxes (batch, S, S, B, 4).
            predict_boxes = tf.reshape(
                predicts[:, self.boundary2:],
                [-1, self.cell_size, self.cell_size, self.boxes_per_cell, 4])

            # Object flag of every cell (batch, S, S, 1).
            response = tf.reshape(
                labels[..., 0],
                [-1, self.cell_size, self.cell_size, 1])
            # Label boxes (batch, S, S, 1, 4) ...
            boxes = tf.reshape(
                labels[..., 1:5],
                [-1, self.cell_size, self.cell_size, 1, 4])
            # ... repeated for each predictor and normalised by the image
            # size. The coordinates are still relative to the whole image
            # here, not to the individual cell.
            boxes = tf.tile(
                boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
            # Label class probabilities (batch, S, S, C).
            classes = labels[..., 5:]

            # Column index (x offset) of every cell, (1, S, S, B); it
            # broadcasts over the batch dimension in the arithmetic below.
            offset = tf.reshape(
                tf.constant(self.offset, dtype=tf.float32),
                [1, self.cell_size, self.cell_size, self.boxes_per_cell])
            # Row index (y offset) of every cell.
            offset_tran = tf.transpose(offset, (0, 2, 1, 3))

            # Predictions converted to the label convention (image-relative
            # centre, un-squared size) so that the IoU can be computed.
            predict_boxes_tran = tf.stack(
                [(predict_boxes[..., 0] + offset) / self.cell_size,
                 (predict_boxes[..., 1] + offset_tran) / self.cell_size,
                 tf.square(predict_boxes[..., 2]),
                 tf.square(predict_boxes[..., 3])], axis=-1)

            # IoU of every predicted box with the label box of its cell. It
            # serves as the confidence target; `response` only gates which
            # cells contain an object.
            iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

            # Object mask (batch, S, S, B): within each cell that contains an
            # object, 1 for the predictor with the highest IoU, 0 elsewhere.
            # (`keep_dims` is the TF 1.4 spelling of this argument.)
            object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)
            object_mask = tf.cast(
                (iou_predict_truth >= object_mask), tf.float32) * response

            # Complement of the object mask: every non-responsible predictor.
            noobject_mask = tf.ones_like(
                object_mask, dtype=tf.float32) - object_mask

            # Labels converted to the prediction convention (cell-relative
            # centre, square-rooted size).
            boxes_tran = tf.stack(
                [boxes[..., 0] * self.cell_size - offset,
                 boxes[..., 1] * self.cell_size - offset_tran,
                 tf.sqrt(boxes[..., 2]),
                 tf.sqrt(boxes[..., 3])], axis=-1)

            # Class loss: only cells that contain an object contribute.
            # (batch, S, S, 1) * (batch, S, S, C)
            class_delta = response * (predict_classes - classes)
            class_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                name='class_loss') * self.class_scale

            # Object loss: confidence of the responsible predictors against
            # the IoU computed above.
            object_delta = object_mask * (predict_scales - iou_predict_truth)
            object_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                name='object_loss') * self.object_scale

            # No-object loss: confidence of all other predictors, whose
            # target is zero.
            noobject_delta = noobject_mask * predict_scales
            noobject_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                name='noobject_loss') * self.noobject_scale

            # Coordinate loss: only the responsible predictors contribute.
            coord_mask = tf.expand_dims(object_mask, 4)
            boxes_delta = coord_mask * (predict_boxes - boxes_tran)
            coord_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                name='coord_loss') * self.coord_scale

            tf.losses.add_loss(class_loss)
            tf.losses.add_loss(object_loss)
            tf.losses.add_loss(noobject_loss)
            tf.losses.add_loss(coord_loss)

            tf.summary.scalar('class_loss', class_loss)
            tf.summary.scalar('object_loss', object_loss)
            tf.summary.scalar('noobject_loss', noobject_loss)
            tf.summary.scalar('coord_loss', coord_loss)

            tf.summary.histogram('boxes_delta_x', boxes_delta[..., 0])
            tf.summary.histogram('boxes_delta_y', boxes_delta[..., 1])
            tf.summary.histogram('boxes_delta_w', boxes_delta[..., 2])
            tf.summary.histogram('boxes_delta_h', boxes_delta[..., 3])
            tf.summary.histogram('iou', iou_predict_truth)
def leaky_relu(alpha):
    """Return an activation function that applies a leaky ReLU with the
    given negative slope ``alpha``."""
    def _activation(inputs):
        activated = tf.nn.leaky_relu(inputs, alpha=alpha, name='leaky_relu')
        return activated

    return _activation
5、yolo/config.py
import os

#
# path and dataset parameter
#

# Root directory of all data, and the locations derived from it.
DATA_PATH = 'data'
PASCAL_PATH = os.path.join(DATA_PATH, 'pascal_voc')
CACHE_PATH = os.path.join(PASCAL_PATH, 'cache')
OUTPUT_DIR = os.path.join(PASCAL_PATH, 'output')
# NOTE(review): WEIGHTS_DIR lives under PASCAL_PATH while WEIGHTS_FILE below
# lives under DATA_PATH -- confirm which location is intended.
WEIGHTS_DIR = os.path.join(PASCAL_PATH, 'weights')

# Pre-trained checkpoint restored before training.
# Alternative: WEIGHTS_FILE = None
WEIGHTS_FILE = os.path.join(DATA_PATH, 'weights', 'YOLO_small.ckpt')

# Class names; the position in this list is the class index.
CLASSES = [
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
]

# Append horizontally flipped copies of the training samples.
FLIPPED = True

#
# model parameter
#

# Side length of the square network input, in pixels.
IMAGE_SIZE = 448

# Grid cells per side and boxes predicted per cell.
CELL_SIZE = 7
BOXES_PER_CELL = 2

# Negative slope of the leaky ReLU.
ALPHA = 0.1

DISP_CONSOLE = False

# Weights of the four loss terms.
OBJECT_SCALE = 1.0
NOOBJECT_SCALE = 0.5
CLASS_SCALE = 1.0
COORD_SCALE = 5.0

#
# solver parameter
#

GPU = ''

# Initial learning rate and its exponential-decay schedule.
# NOTE(review): DECAY_STEPS is larger than MAX_ITER, so with STAIRCASE = True
# the learning rate stays at its initial value for a whole run -- confirm
# that this is intended.
LEARNING_RATE = 0.0001
DECAY_STEPS = 30000
DECAY_RATE = 0.1
STAIRCASE = True

BATCH_SIZE = 45

# Number of training iterations.
MAX_ITER = 15000

# Write a summary / save a checkpoint every this many iterations.
SUMMARY_ITER = 10
SAVE_ITER = 1000

#
# test parameter
#

THRESHOLD = 0.2
IOU_THRESHOLD = 0.5
四、测试结果展示
1、测试程序参考https://blog.csdn.net/samylee/article/details/85263143
2、将生成的yolo-1000.data-00000-of-00001及yolo-1000.index文件拷贝到my_model下(参考)
3、修改代码
weight_file = "model/YOLO_small.ckpt"
修改为
weight_file = 'my_model/yolo-1000'
4、训练1000步的效果
五、参考
https://github.com/hizhangp/yolo_tensorflow
任何问题请加唯一QQ2258205918(名称samylee)!
或唯一VX:samylee_csdn