import argparse
import os
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import scipy.io
import scipy.misc
import numpy as np
import pandas as pd
import PIL
import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Lambda, Conv2D
from keras.models import load_model, Model
from yolo_utils import read_classes, read_anchors, generate_colors, preprocess_image, draw_boxes, scale_boxes
from yad2k.models.keras_yolo import yolo_head, yolo_boxes_to_corners, preprocess_true_boxes, yolo_loss, yolo_body
The input to the YOLO model is a batch of images of shape (m, 608, 608, 3), and the output is a tensor of labels of shape (m, 19, 19, 5, 85). For a single image the output is therefore (19, 19, 5, 85), where 5 is the number of anchor boxes and 85 holds each box's (pc, bx, by, bh, bw) plus the 80 class probabilities. This step computes, for every anchor box, the score of each of the 80 classes, picks the largest one, and assigns that class to the anchor box.
def yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold = .6):
# Step 1: Compute box scores
box_scores = box_confidence * box_class_probs
# Step 2: Find the box_classes thanks to the max box_scores, keep track of the corresponding score
box_classes = K.argmax(box_scores, axis=-1)
box_class_scores = K.max(box_scores, axis=-1, keepdims=False)
    # Step 3: Create a filtering mask based on "box_class_scores" by using "threshold". The mask should have the
    # same dimension as box_class_scores, and be True for the boxes you want to keep (with probability >= threshold)
filtering_mask = box_class_scores >= threshold
# Step 4: Apply the mask to scores, boxes and classes
scores = tf.compat.v1.boolean_mask(box_class_scores, filtering_mask)
boxes = tf.compat.v1.boolean_mask(boxes, filtering_mask)
classes = tf.compat.v1.boolean_mask(box_classes, filtering_mask)
return scores, boxes, classes
The inputs box_confidence, boxes, box_class_probs of this function are simply the 85 elements of the YOLO output (19, 19, 5, 85) split apart:
box_confidence: shape (19, 19, 5, 1), holding pc, the probability that the box contains an object.
boxes: shape (19, 19, 5, 4), holding (bx, by, bh, bw).
box_class_probs: shape (19, 19, 5, 80), holding the probabilities of the 80 classes.
First define a variable box_scores as the product of box_confidence and box_class_probs. Then take box_classes, the index of the class with the highest score, using keras.backend.argmax() (which returns the index of the maximum value along a given axis), and take box_class_scores, the score of that best class, using keras.backend.max(). Both calls use axis = -1 because the maximum is taken over the last axis, so both resulting tensors have shape (19, 19, 5). Next build filtering_mask, a boolean tensor of shape (19, 19, 5) that is True only where the best score box_class_scores is greater than or equal to the threshold. Finally apply the mask to scores, boxes and classes to drop the boxes whose best score is below the threshold. The function returns the higher-scoring (higher-probability) boxes together with their class indices and scores. Since it is not known in advance how many boxes get filtered out (it is certainly not all of (19, 19, 5); it depends on the threshold), the returned scores, boxes, classes have shapes (None,), (None, 4) and (None,).
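A quick way to sanity-check the shapes described above is to feed the function random tensors. This test cell is my own addition (the random inputs mirror nothing in the original notebook); under TF1-style graph mode the first dimension prints as unknown:
box_confidence = tf.random.normal([19, 19, 5, 1], mean=1, stddev=4, seed=1)
boxes = tf.random.normal([19, 19, 5, 4], mean=1, stddev=4, seed=1)
box_class_probs = tf.random.normal([19, 19, 5, 80], mean=1, stddev=4, seed=1)
scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, threshold=0.5)
print("scores.shape  =", scores.shape)   # (None,)   -- number of kept boxes only known at run time
print("boxes.shape   =", boxes.shape)    # (None, 4)
print("classes.shape =", classes.shape)  # (None,)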
def yolo_non_max_suppression(scores, boxes, classes, max_boxes = 10, iou_threshold = 0.5):
max_boxes_tensor = K.variable(max_boxes, dtype='int32') # tensor to be used in tf.image.non_max_suppression()
    K.get_session().run(tf.compat.v1.variables_initializer([max_boxes_tensor])) # initialize variable max_boxes_tensor
# Use tf.image.non_max_suppression() to get the list of indices corresponding to boxes you keep
    nms_indices = tf.compat.v1.image.non_max_suppression(boxes, scores, max_boxes_tensor, iou_threshold)
# Use K.gather() to select only nms_indices from scores, boxes and classes
scores = K.gather(scores,nms_indices)
boxes = K.gather(boxes,nms_indices)
classes = K.gather(classes,nms_indices)
return scores, boxes, classes
The input of this function is the output of the previous one. At this point some of the remaining boxes overlap heavily (high IoU) and describe the same object, so only the best one should be kept. First define a variable max_boxes_tensor from the integer max_boxes and initialize it with the session. Then call the built-in tf.compat.v1.image.non_max_suppression() to perform non-max suppression; its return value nms_indices is a 1-D integer tensor of M elements (M <= max_output_size) giving the indices of the boxes selected from the anchor boxes, and finally K.gather() picks out the surviving boxes together with their scores and classes.
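For reference, the IoU (intersection over union) that iou_threshold refers to can be computed by hand for two corner-format boxes. This small helper is my own illustration, not part of the assignment code:
def iou(box1, box2):
    # box1, box2 are (x1, y1, x2, y2) corner coordinates
    xi1, yi1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    xi2, yi2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter_area = max(xi2 - xi1, 0) * max(yi2 - yi1, 0)
    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter_area / (box1_area + box2_area - inter_area)
print(iou((2, 1, 4, 3), (1, 2, 3, 4)))  # overlap area 1 / union area 7 ≈ 0.143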
boxes = yolo_boxes_to_corners(box_xy, box_wh)
This function converts the bounding-box representation from (x, y, w, h) to the two-corner form (x1, y1, x2, y2), to match what the `yolo_filter_boxes` function expects.
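Conceptually the conversion is just center ± half the size; this is a hedged sketch of the arithmetic, not the yad2k source, and the exact coordinate ordering is whatever the downstream helpers expect:
box_mins = box_xy - box_wh / 2.0     # top-left corner
box_maxes = box_xy + box_wh / 2.0    # bottom-right corner
boxes = K.concatenate([box_mins, box_maxes], axis=-1)   # (..., 4) corner-format boxes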
boxes = scale_boxes(boxes, image_shape)
The YOLO network was trained to run on 608x608 images. If you test it on images of a different size (for example, the car-detection dataset has 720x1280 images), this function rescales the boxes so they can be drawn on top of the original 720x1280 image.
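My reading of the rescaling (a sketch under the assumption that the helper simply multiplies the corner coordinates by the image dimensions; not copied from yolo_utils):
height, width = image_shape
image_dims = K.reshape(K.stack([height, width, height, width]), [1, 4])
boxes = boxes * image_dims   # scale the corner coordinates up to the original image's height and width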
def yolo_eval(yolo_outputs, image_shape = (720., 1280.), max_boxes=10, score_threshold=.6, iou_threshold=.5):
# Retrieve outputs of the YOLO model
box_confidence, box_xy, box_wh, box_class_probs = yolo_outputs
# Convert boxes to be ready for filtering functions
boxes = yolo_boxes_to_corners(box_xy, box_wh)
    scores, boxes, classes = yolo_filter_boxes(box_confidence, boxes, box_class_probs, score_threshold)
# Scale boxes back to original image shape.
boxes = scale_boxes(boxes, image_shape)
# Use one of the functions you've implemented to perform Non-max suppression with a threshold of iou_threshold (≈1 line)
    scores, boxes, classes = yolo_non_max_suppression(scores, boxes, classes, max_boxes, iou_threshold)
return scores, boxes, classes
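Before wiring in the real model, yolo_eval can be smoke-tested with random tensors shaped like the single-image yolo_head outputs. This test cell is my own addition, not part of the original notebook:
yolo_outputs = (
    tf.random.normal([19, 19, 5, 1], mean=1, stddev=4, seed=1),    # box_confidence
    tf.random.normal([19, 19, 5, 2], mean=1, stddev=4, seed=1),    # box_xy
    tf.random.normal([19, 19, 5, 2], mean=1, stddev=4, seed=1),    # box_wh
    tf.random.normal([19, 19, 5, 80], mean=1, stddev=4, seed=1),   # box_class_probs
)
scores, boxes, classes = yolo_eval(yolo_outputs)
print(scores.shape, boxes.shape, classes.shape)   # at most max_boxes = 10 boxes survive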
Inside yolo_eval, box_confidence, box_xy, box_wh, box_class_probs are unpacked from the model output and filtered with yolo_filter_boxes(). That is the whole process of applying YOLO; to fully implement object recognition we still need to load the model, draw the bounding boxes, and save the image.
sess = K.get_session()
class_names = read_classes("model_data/coco_classes.txt")
anchors = read_anchors("model_data/yolo_anchors.txt")
image_shape = (183., 275.)
The two files "coco_classes.txt" and "yolo_anchors.txt" hold the information about the 80 classes and the 5 anchor boxes; the code above loads it for the model, and image_shape is the size of the original image. The class-loading step can be seen in the following code:
def read_classes(classes_path):
with open(classes_path) as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names
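read_anchors presumably works the same way; a plausible sketch (my own guess at the helper, not copied from yolo_utils) would parse one comma-separated line of 10 numbers into 5 (width, height) pairs:
def read_anchors_sketch(anchors_path):
    # hypothetical re-implementation for illustration only
    with open(anchors_path) as f:
        anchors = [float(x) for x in f.readline().split(',')]
    return np.array(anchors).reshape(-1, 2)   # 5 rows of (width, height)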
Training a YOLO model takes a very long time, so here we directly load an existing pre-trained Keras YOLO model stored in "yolo.h5". The following code loads the weights of the trained YOLO model:
yolo_model = load_model("model_data/yolo.h5")
This model converts a batch of preprocessed input images (shape (m, 608, 608, 3)) into a tensor of shape (m, 19, 19, 5, 85). (I don't fully understand this loading step yet; I'll work through it later.)
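One way to peek at what was loaded is with standard Keras calls (nothing specific to this assignment):
yolo_model.summary()            # lists the conv layers and parameter counts
print(yolo_model.input_shape)   # should show the (None, 608, 608, 3) input described above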
The output of yolo_model is that (m, 19, 19, 5, 85) tensor, and it still has to be processed and converted:
yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names))
I don't fully understand this processing step either, but looking at the yolo_head function shows that its return values are exactly the inputs expected by the yolo_filter_boxes function used earlier.
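For example, the shapes of the four tensors in yolo_outputs (with the batch dimension m in front) can be checked directly; this is just my own sanity check:
box_confidence, box_xy, box_wh, box_class_probs = yolo_outputs
print(box_confidence.shape)   # (m, 19, 19, 5, 1)  -- pc
print(box_xy.shape)           # (m, 19, 19, 5, 2)  -- box centers (bx, by)
print(box_wh.shape)           # (m, 19, 19, 5, 2)  -- box sizes   (bw, bh)
print(box_class_probs.shape)  # (m, 19, 19, 5, 80) -- class probabilities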
Use the yolo_eval function above with the tensors in yolo_outputs to filter the boxes:
scores, boxes, classes = yolo_eval(yolo_outputs, image_shape)
Parameter summary:
yolo_model.input: the input to yolo_model; the model produces yolo_model.output.
yolo_model.output: processed by yolo_head to obtain yolo_outputs.
yolo_outputs: passed through yolo_eval to produce the predictions scores, boxes, classes.
Function summary:
Before an image is fed to the model it has to be preprocessed so that its shape becomes (608, 608). The preprocessing returns image and image_data; image_data is the array that is actually fed to the model:
image, image_data = preprocess_image("images/" + image_file, model_image_size = (608, 608))
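My understanding of what the preprocessing does, as a sketch under the assumption that it follows the usual recipe (the real helper lives in yolo_utils; the name below is hypothetical):
from PIL import Image
def preprocess_image_sketch(img_path, model_image_size=(608, 608)):
    image = Image.open(img_path)                              # original PIL image, kept for drawing later
    resized = image.resize(model_image_size, Image.BICUBIC)   # resize to the 608x608 model input
    image_data = np.array(resized, dtype='float32') / 255.    # scale pixel values to [0, 1]
    image_data = np.expand_dims(image_data, 0)                # add the batch dimension -> (1, 608, 608, 3)
    return image, image_data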
Below is the final prediction function:
def predict(sess, image_file):
"""
    Runs the graph stored in "sess" to predict boxes for "image_file". Prints and plots the predictions.
Arguments:
sess -- your tensorflow/Keras session containing the YOLO graph
image_file -- name of an image stored in the "images" folder.
Returns:
out_scores -- tensor of shape (None, ), scores of the predicted boxes
out_boxes -- tensor of shape (None, 4), coordinates of the predicted boxes
out_classes -- tensor of shape (None, ), class index of the predicted boxes
Note: "None" actually represents the number of predicted boxes, it varies between 0 and max_boxes.
"""
# Preprocess your image
image, image_data = preprocess_image("images/" + image_file, model_image_size = (608, 608))
# Run the session with the correct tensors and choose the correct placeholders in the feed_dict.
# You'll need to use feed_dict={yolo_model.input: ... , K.learning_phase(): 0})
out_scores, out_boxes, out_classes = sess.run([scores, boxes, classes],feed_dict={yolo_model.input: image_data,K.learning_phase(): 0})
# Print predictions info
print('Found {} boxes for {}'.format(len(out_boxes), image_file))
# Generate colors for drawing bounding boxes.
colors = generate_colors(class_names)
# Draw bounding boxes on the image file
draw_boxes(image, out_scores, out_boxes, out_classes, class_names, colors)
# Save the predicted bounding box on the image
image.save(os.path.join("out", image_file), quality=90)
# Display the results in the notebook
output_image = scipy.misc.imread(os.path.join("out", image_file))
imshow(output_image)
return out_scores, out_boxes, out_classes
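A hedged usage example; the image file name here is just a placeholder for whatever sits in the "images" folder:
out_scores, out_boxes, out_classes = predict(sess, "test.jpg")   # hypothetical file name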
predict feeds image_data through the model above to compute the final predictions, then passes image, out_scores, out_boxes, out_classes, class_names, colors to draw_boxes to draw the bounding boxes on the image file. That is the complete implementation of object detection. I understand most of it now; the file-handling part at the end is still unclear to me, and I will keep digging into it as I continue studying.