Faster-RCNN 算法由于其较高的检测准确率,成为主流的目标检测算法之一。与 YOLO 系列算法相比,Faster-RCNN 在速度方面略显不足,但平均检测精度(mAP)很高。它将 region proposal 的提取和 Fast-RCNN 部分融合进了同一个网络模型(通过区域生成网络 RPN 层实现)。
(2)将产生的 feature map传入 RPN网络产生建议框,然后进行是否含有目标的二分类,同时 feature map 传入 ROIpooling 层进行池化操作,产生固定大小的候选区域特征图。
Faster-RCNN 相较以往算法有几点重大改进:其核心是提出了 RPN(Region Proposal Network)网络,它代替了传统的候选区域生成方法,实现了端到端的训练,并且将整个物体检测流程统一到同一个神经网络中,使得 RPN 和 Fast RCNN 共享卷积特征,减少了训练时间;采用 ROI Pooling,通过最大值池化将特征图上的 ROI 固定为特定大小的特征图;采用 NMS(非极大值抑制,Non-maximum suppression)技术,筛除重叠的冗余候选框,减少候选框数量。
# 验证集的划分在train.py代码里面进行
# test.txt和val.txt里面没有内容是正常的。训练不会使用到。
FileNotFoundError: [WinError 3] 系统找不到指定的路径。: './VOCdevkit/VOC2007/Annotations'
import os
import random
# Dataset split script: writes trainval.txt / test.txt / train.txt / val.txt
# under saveBasePath.
# To enlarge the test split, change trainval_percent.
# train_percent does not need to be changed.
# NOTE(review): this excerpt has lost its indentation and several statements.
# xmlfilepath, saveBasePath, list, tv, tr and train are read below but never
# assigned here; trainval_percent / train_percent are only mentioned in the
# comments above. Restore them from the original script before running.
temp_xml = os.listdir(xmlfilepath)
total_xml = []
# Scan the directory listing for ".xml" annotation files.
for xml in temp_xml:
if xml.endswith(".xml"):
# NOTE(review): the body of the `if` above is missing (presumably it appends
# `xml` to total_xml - confirm against the original).
# Randomly pick tv entries of `list` for the train+val split.
trainval= random.sample(list,tv)
print("train and val size",tv)
# NOTE(review): "traub suze" looks like a typo of "train size"; it is a runtime
# string, so it is left untouched here.
print("traub suze",tr)
# One output list file per split, all created inside saveBasePath.
ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath,'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w')
fval = open(os.path.join(saveBasePath,'val.txt'), 'w')
# NOTE(review): the bodies of the loop and of both `if` statements below are
# missing, as are the close() calls for ftrainval, ftrain and fval.
for i in list:
if i in trainval:
if i in train:
ftest .close()
# 运行前一定要修改classes
# 如果生成的2007_train.txt里面没有目标信息
# 那么就是因为classes没有设定正确
import xml.etree.ElementTree as ET
from os import getcwd
# (year, image_set) pairs processed by this script.
sets=[('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
# The order of the classes set here must match the txt file in model_data.
# classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
classes = ["cabbage", "carrot", "nori", "potato"]
# Example location of an image-set file:
# E:\python-run-env\Faster-RCNN\VOCdevkit\VOC2007\ImageSets\Main\train.txt
def convert_annotation(year, image_id, list_file):
    """Append the usable boxes of one VOC annotation file to ``list_file``.

    For every ``<object>`` in the image's XML annotation, one
    `` xmin,ymin,xmax,ymax,class_id`` field is written (note the leading
    space, which separates it from whatever was written before).

    Objects are skipped when their class name is not in the module-level
    ``classes`` list or when they are flagged ``difficult == 1``.

    Args:
        year: VOC year string used to build the annotation path.
        image_id: Image identifier (file name without extension).
        list_file: Writable text file object that receives the box fields.
    """
    xml_path = 'E:/python-run-env/Faster-RCNN/VOCdevkit/VOC%s/Annotations/%s.xml' % (year, image_id)
    # Parse inside a context manager so the XML file handle is always closed.
    with open(xml_path, encoding='utf-8') as in_file:
        tree = ET.parse(in_file)
    root = tree.getroot()

    for obj in root.iter('object'):
        difficult = 0
        difficult_node = obj.find('difficult')
        if difficult_node is not None:
            difficult = difficult_node.text
        cls = obj.find('name').text
        # Unknown classes would make classes.index() raise, and difficult
        # objects are not used, so both are skipped.
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        # Coordinates may be stored as floats in the XML; truncate to int.
        b = tuple(
            int(float(xmlbox.find(tag).text))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        list_file.write(" " + ",".join(str(a) for a in b) + ',' + str(cls_id))
# Absolute image paths in the generated list files are rooted at the current
# working directory.
wd = getcwd()

# Write one "<year>_<image_set>.txt" list file per entry in `sets`.
for year, image_set in sets:
    ids_path = 'E:/python-run-env/Faster-RCNN/VOCdevkit/VOC%s/ImageSets/Main/%s.txt' % (year, image_set)
    with open(ids_path, encoding='utf-8') as ids_file:
        image_ids = ids_file.read().strip().split()

    with open('%s_%s.txt' % (year, image_set), 'w', encoding='utf-8') as list_file:
        for image_id in image_ids:
            list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg' % (wd, year, image_id))
            convert_annotation(year, image_id, list_file)
            # One record per line: the training generator splits each
            # annotation line into an image path followed by its boxes.
            list_file.write('\n')
# 对视频中的predict.py进行了修改,
# 将单张图片预测、摄像头检测和FPS测试功能
# 整合到了一个py文件中,通过指定mode进行模式的修改。
import time
import cv2
import numpy as np
import tensorflow as tf
from PIL import Image
from frcnn import FRCNN
# Enable memory growth on every visible GPU.
# NOTE(review): this whole script excerpt has lost its indentation and a number
# of statements (see the notes below); it does not run as shown.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
if __name__ == "__main__":
frcnn = FRCNN()
# mode selects what the script does:
# 'predict' runs detection on single images
# 'video' runs detection on a video / camera stream
# 'fps' measures the FPS
mode = "predict"
# video_path is the path of the video; video_path=0 means "use the camera"
# video_save_path is where the result video is saved; "" means "do not save"
# video_fps is the fps of the saved video
# video_path, video_save_path and video_fps only matter when mode='video'
# When saving a video, exit with ctrl+c so that the save can be completed;
# do not kill the program directly.
# video_path = r"E:/python-run-env/Faster RCNN/img/12.mp4"
video_path = 0
video_save_path = r"C:/Users/asus/Desktop/Test/12.mp4"
video_fps = 25.0
if mode == "predict":
# NOTE(review): the next line is article text (the tail of a comment that lost
# its leading '#'); it is not valid Python.
比如判断if predicted_class == 'car': 即可判断当前目标是否为车,然后记录数量即可。利用draw.text即可写字。
while True:
img = input('Input image filename:')
# NOTE(review): the right-hand side of the assignment below is missing, and so
# are the try/except/else around it and the display of r_image.
image =
print('Open Error! Try again!')
r_image = frcnn.detect_image(image)
elif mode == "video":
# NOTE(review): `capture` is used below but its creation is missing.
if video_save_path!="":
fourcc = cv2.VideoWriter_fourcc(*'XVID')
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
fps = 0.0
# NOTE(review): the enclosing frame loop and the statement that reads `frame`
# are missing.
t1 = time.time()
# Read one frame
# Convert the colour order, BGR to RGB
frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
# Convert to a PIL Image
frame = Image.fromarray(np.uint8(frame))
# Run the detection
frame = np.array(frcnn.detect_image(frame))
# Convert RGB back to BGR, the order OpenCV expects for display
frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
# Average of the previous fps value and the rate of this frame.
fps = ( fps + (1./(time.time()-t1)) ) / 2
print("fps= %.2f"%(fps))
frame = cv2.putText(frame, "fps= %.2f"%(fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
c= cv2.waitKey(1) & 0xff
# NOTE(review): the bodies of the two `if` statements below (writing the frame,
# leaving the loop on ESC = key code 27) and the release calls are missing.
if video_save_path!="":
if c==27:
elif mode == "fps":
test_interval = 100
# NOTE(review): the call in front of 'img/street.jpg' is missing; the stray ')'
# makes this line a syntax error.
img ='img/street.jpg')
tact_time = frcnn.get_FPS(img, test_interval)
print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1')
# NOTE(review): the `else:` that should guard this raise is missing.
raise AssertionError("Please specify the correct mode: 'predict', 'video' or 'fps'.")
import colorsys
import copy
import os
import time
import numpy as np
import tensorflow as tf
from PIL import Image, ImageDraw, ImageFont
from tensorflow.keras.applications.imagenet_utils import preprocess_input
import nets.frcnn as frcnn
from nets.frcnn_training import get_new_img_size
from utils.anchors import get_anchors
from utils.config import Config
from utils.utils import BBoxUtility
# 使用自己训练好的模型预测需要修改2个参数
# model_path和classes_path都需要修改!
# 如果出现shape不匹配
# 一定要注意训练时的NUM_CLASSES、
# model_path和classes_path参数的修改
# E:\python-run-env\Faster-RCNN\model_data\voc_classes.txt
class FRCNN(object):
    """Inference wrapper around the Faster R-CNN RPN and classifier models.

    To predict with a self-trained model, both ``model_path`` and
    ``classes_path`` in ``_defaults`` must be changed. If a shape mismatch
    occurs, check NUM_CLASSES used for training as well as these two paths.
    """

    _defaults = {
        # E:\python-run-env\Faster-RCNN\model_data\voc_weight.h5
        "model_path"    : r'E:\python-run-env\Faster-RCNN\model_data\Epoch90-Total_Loss0.5199-Val_Loss0.5419.h5',
        "classes_path"  : r'E:\python-run-env\Faster-RCNN\model_data\classes.txt',
        "confidence"    : 0.5,
        "iou"           : 0.3
    }

    @classmethod
    def get_defaults(cls, n):
        """Return the default value named ``n``, or an error string if unknown."""
        if n in cls._defaults:
            return cls._defaults[n]
        return "Unrecognized attribute name '" + n + "'"

    def __init__(self, **kwargs):
        """Initialise Faster R-CNN: load class names, config and both models.

        Args:
            **kwargs: Optional overrides for the entries of ``_defaults``
                (``model_path``, ``classes_path``, ``confidence``, ``iou``).
        """
        # The methods below read self.model_path / self.classes_path /
        # self.confidence, so the defaults must become instance attributes.
        self.__dict__.update(self._defaults)
        self.__dict__.update(kwargs)
        self.class_names = self._get_class()
        self.config = Config()
        self.bbox_util = BBoxUtility()
        # generate() needs self.config and self.class_names to be set.
        self.generate()

    def _get_class(self):
        """Read all class names, one per line, from ``self.classes_path``."""
        classes_path = os.path.expanduser(self.classes_path)
        with open(classes_path) as f:
            return [line.strip() for line in f]

    def generate(self):
        """Build the prediction models, load the weights and pick box colours."""
        model_path = os.path.expanduser(self.model_path)
        assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.'

        # Total number of classes: the listed classes plus one extra.
        self.num_classes = len(self.class_names) + 1

        # Build the models and load the weights from the expanded path (the
        # path that was validated above), not the raw attribute.
        self.model_rpn, self.model_classifier = frcnn.get_predict_model(self.config, self.num_classes)
        self.model_rpn.load_weights(model_path, by_name=True)
        self.model_classifier.load_weights(model_path, by_name=True)
        print('{} model, anchors, and classes loaded.'.format(model_path))

        # One distinct colour per class: evenly spaced hues at full
        # saturation/value, converted to 0-255 RGB integer tuples.
        hsv_tuples = [(x / len(self.class_names), 1., 1.)
                      for x in range(len(self.class_names))]
        self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
        self.colors = list(
            map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
                self.colors))

    def get_img_output_length(self, width, height):
        """Return the (width, height) of the shared feature map.

        Applies four stride-2 layers with kernel sizes 7, 3, 1, 1 and
        paddings 3, 1, 0, 0 to each input dimension.
        """
        def get_output_length(input_length):
            filter_sizes = [7, 3, 1, 1]
            padding = [3, 1, 0, 0]
            stride = 2
            for pad, kernel in zip(padding, filter_sizes):
                input_length = (input_length + 2 * pad - kernel) // stride + 1
            return input_length
        return get_output_length(width), get_output_length(height)

    def model_rpn_get_pred(self, photo):
        """Call the RPN model on ``photo`` in inference mode."""
        preds = self.model_rpn(photo, training=False)
        return preds

    def model_classifier_get_pred(self, photo):
        """Call the classifier model on ``photo`` in inference mode."""
        preds = self.model_classifier(photo, training=False)
        return preds

    def detect_image(self, image):
        """Detect objects in a PIL image and return the image with boxes drawn.

        Args:
            image: PIL image; it is converted to RGB first, so grayscale
                input works too.

        Returns:
            A copy of the (RGB) input with one labelled rectangle per
            detection, or the unmodified copy when nothing is detected.
        """
        image = image.convert("RGB")

        image_shape = np.array(np.shape(image)[0:2])
        old_width, old_height = image_shape[1], image_shape[0]
        old_image = copy.deepcopy(image)

        # Resize the input; get_new_img_size decides the new size from the
        # original width and height.
        width, height = get_new_img_size(old_width, old_height)
        image = image.resize([width, height], Image.BICUBIC)
        photo = np.array(image, dtype=np.float64)

        # Pre-process and add the batch dimension.
        photo = preprocess_input(np.expand_dims(photo, 0))
        rpn_pred = self.model_rpn_get_pred(photo)
        rpn_pred = [x.numpy() for x in rpn_pred]

        # Decode the RPN predictions into proposal boxes.
        base_feature_width, base_feature_height = self.get_img_output_length(width, height)
        anchors = get_anchors([base_feature_width, base_feature_height], width, height)
        rpn_results = self.bbox_util.detection_out_rpn(rpn_pred, anchors)

        # Feed the shared feature layer and the proposals to the classifier.
        # Column 0 of the RPN result is the confidence; the classifier input
        # uses the coordinates with each x/y pair swapped.
        base_layer = rpn_pred[2]
        proposal_box = np.array(rpn_results)[:, :, 1:]
        temp_ROIs = np.zeros_like(proposal_box)
        temp_ROIs[:, :, [0, 1, 2, 3]] = proposal_box[:, :, [1, 0, 3, 2]]
        classifier_pred = self.model_classifier_get_pred([base_layer, temp_ROIs])
        classifier_pred = [x.numpy() for x in classifier_pred]

        # Decode the classifier predictions into final boxes.
        results = self.bbox_util.detection_out_classifier(classifier_pred, proposal_box, self.config, self.confidence)

        if len(results[0]) == 0:
            return old_image

        results = np.array(results[0])
        boxes = results[:, :4]
        top_conf = results[:, 4]
        top_label_indices = results[:, 5]
        # Boxes are normalised; scale them back to the original image size.
        boxes[:, [0, 2]] = boxes[:, [0, 2]] * old_width
        boxes[:, [1, 3]] = boxes[:, [1, 3]] * old_height

        font = ImageFont.truetype(font='model_data/simhei.ttf', size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))

        thickness = max((np.shape(old_image)[0] + np.shape(old_image)[1]) // old_width * 2, 1)

        image = old_image
        for i, c in enumerate(top_label_indices):
            predicted_class = self.class_names[int(c)]
            score = top_conf[i]

            # Grow each box by 5 px per side, then clamp to the image.
            left, top, right, bottom = boxes[i]
            top = top - 5
            left = left - 5
            bottom = bottom + 5
            right = right + 5

            top = max(0, np.floor(top + 0.5).astype('int32'))
            left = max(0, np.floor(left + 0.5).astype('int32'))
            bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32'))
            right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32'))

            # Draw the box and its label.
            label = '{} {:.2f}'.format(predicted_class, score)
            draw = ImageDraw.Draw(image)
            label_size = draw.textsize(label, font)
            label = label.encode('utf-8')
            print(label, top, left, bottom, right)

            # Put the label above the box when it fits, otherwise inside it.
            if top - label_size[1] >= 0:
                text_origin = np.array([left, top - label_size[1]])
            else:
                text_origin = np.array([left, top + 1])

            # A thick outline is drawn as `thickness` nested 1-px rectangles.
            for t in range(thickness):
                draw.rectangle(
                    [left + t, top + t, right - t, bottom - t],
                    outline=self.colors[int(c)])
            draw.rectangle(
                [tuple(text_origin), tuple(text_origin + label_size)],
                fill=self.colors[int(c)])
            draw.text(text_origin, str(label, 'UTF-8'), fill=(0, 0, 0), font=font)
            del draw
        return image

    def get_FPS(self, image, test_interval):
        """Return the average seconds per detection pass over ``test_interval`` runs.

        One untimed warm-up pass is executed first. Image resizing and
        pre-processing happen once, outside the timed loop; drawing is not
        part of the measurement.

        Args:
            image: PIL image used for the benchmark.
            test_interval: Number of timed passes.
        """
        image = image.convert("RGB")

        image_shape = np.array(np.shape(image)[0:2])
        old_width, old_height = image_shape[1], image_shape[0]

        width, height = get_new_img_size(old_width, old_height)
        image = image.resize([width, height], Image.BICUBIC)
        photo = np.array(image, dtype=np.float64)

        photo = preprocess_input(np.expand_dims(photo, 0))

        def run_once():
            # One full RPN -> decode -> classifier -> decode pass.
            rpn_pred = self.model_rpn.predict(photo)

            base_feature_width, base_feature_height = self.get_img_output_length(width, height)
            anchors = get_anchors([base_feature_width, base_feature_height], width, height)
            rpn_results = self.bbox_util.detection_out_rpn(rpn_pred, anchors)

            base_layer = rpn_pred[2]
            proposal_box = np.array(rpn_results)[:, :, 1:]
            temp_ROIs = np.zeros_like(proposal_box)
            temp_ROIs[:, :, [0, 1, 2, 3]] = proposal_box[:, :, [1, 0, 3, 2]]
            classifier_pred = self.model_classifier.predict([base_layer, temp_ROIs])

            results = self.bbox_util.detection_out_classifier(classifier_pred, proposal_box, self.config, self.confidence)

            if len(results[0]) > 0:
                results = np.array(results[0])
                boxes = results[:, :4]
                # Rescaling is kept so the timed work matches detect_image's
                # post-processing.
                boxes[:, [0, 2]] = boxes[:, [0, 2]] * old_width
                boxes[:, [1, 3]] = boxes[:, [1, 3]] * old_height

        # Warm-up pass, excluded from the timing.
        run_once()

        t1 = time.time()
        for _ in range(test_interval):
            run_once()
        t2 = time.time()
        tact_time = (t2 - t1) / test_interval
        return tact_time
import numpy as np
from tensorflow import keras
from config import Config
config = Config()
def generate_anchors(sizes=None, ratios=None):
    """Return the base anchors centred on the origin.

    Args:
        sizes: Anchor side lengths; defaults to ``config.anchor_box_scales``.
        ratios: ``[width_factor, height_factor]`` pairs; defaults to
            ``config.anchor_box_ratios``.

    Returns:
        Array of shape ``(len(sizes) * len(ratios), 4)`` holding
        ``(x1, y1, x2, y2)`` rows, grouped by ratio and ordered by size
        within each group.
    """
    if sizes is None:
        sizes = config.anchor_box_scales

    if ratios is None:
        ratios = config.anchor_box_ratios

    num_sizes = len(sizes)
    num_anchors = num_sizes * len(ratios)

    anchors = np.zeros((num_anchors, 4))

    # Columns 2 and 3 start out as the square (size, size) for every anchor.
    anchors[:, 2:] = np.tile(sizes, (2, len(ratios))).T

    # Scale width/height of each ratio group. The group length follows
    # len(sizes) instead of a hard-coded 3, so any number of sizes works.
    for i, (w_factor, h_factor) in enumerate(ratios):
        rows = slice(num_sizes * i, num_sizes * (i + 1))
        anchors[rows, 2] = anchors[rows, 2] * w_factor
        anchors[rows, 3] = anchors[rows, 3] * h_factor

    # Turn (0, 0, w, h) into (-w/2, -h/2, w/2, h/2).
    anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
    anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
    return anchors
def shift(shape, anchors, stride=config.rpn_stride):
    """Replicate the base anchors at every feature-map cell centre.

    Args:
        shape: ``(width, height)`` of the feature map in cells.
        anchors: Base anchors of shape ``(A, 4)`` as ``(x1, y1, x2, y2)``.
        stride: Input pixels per feature-map cell.

    Returns:
        Array of shape ``(shape[0] * shape[1] * A, 4)`` with all shifted
        anchors.
    """
    # Cell centres along each axis, in input-image pixels.
    shift_x = (np.arange(0, shape[0], dtype=keras.backend.floatx()) + 0.5) * stride
    shift_y = (np.arange(0, shape[1], dtype=keras.backend.floatx()) + 0.5) * stride

    shift_x, shift_y = np.meshgrid(shift_x, shift_y)

    shift_x = np.reshape(shift_x, [-1])
    shift_y = np.reshape(shift_y, [-1])

    # The same (x, y) offset is added to both corners of an anchor, so each
    # shift is (x, y, x, y); this is what the [k, 1, 4] reshape below needs.
    shifts = np.stack([
        shift_x,
        shift_y,
        shift_x,
        shift_y
    ], axis=0)

    shifts = np.transpose(shifts)
    number_of_anchors = np.shape(anchors)[0]

    k = np.shape(shifts)[0]

    # Broadcast [1, A, 4] + [k, 1, 4] -> [k, A, 4], then flatten.
    shifted_anchors = np.reshape(anchors, [1, number_of_anchors, 4]) + np.array(np.reshape(shifts, [k, 1, 4]), keras.backend.floatx())
    shifted_anchors = np.reshape(shifted_anchors, [k * number_of_anchors, 4])
    return shifted_anchors
def get_anchors(shape, width, height):
    """Return all anchors for a feature map, normalised to the [0, 1] range.

    Args:
        shape: ``(width, height)`` of the feature map in cells.
        width: Width of the network input image in pixels.
        height: Height of the network input image in pixels.

    Returns:
        Array of ``(x1, y1, x2, y2)`` rows divided by the image size and
        clipped to ``[0, 1]``.
    """
    anchors = generate_anchors()
    network_anchors = shift(shape, anchors)
    # x coordinates are divided by the width, y coordinates by the height.
    network_anchors[:, 0] = network_anchors[:, 0] / width
    network_anchors[:, 1] = network_anchors[:, 1] / height
    network_anchors[:, 2] = network_anchors[:, 2] / width
    network_anchors[:, 3] = network_anchors[:, 3] / height
    # Anchors that stick out of the image are cut at the border.
    network_anchors = np.clip(network_anchors, 0, 1)
    return network_anchors
class Config:
    """Hyper-parameters shared by anchor generation, training and prediction."""

    def __init__(self):
        self.anchor_box_scales = [128, 256, 512]
        self.anchor_box_ratios = [[1, 1], [1, 2], [2, 1]]
        self.rpn_stride = 16
        # The video tutorial used 32 rois; 128 works better.
        self.num_rois = 128
        # Number of proposals used for prediction and for training.
        self.num_RPN_predict_pre = 300
        self.num_RPN_train_pre = 600

        self.rpn_min_overlap = 0.3
        self.rpn_max_overlap = 0.7

        # Proposals whose IoU with a ground-truth box lies between
        # classifier_min_overlap and classifier_max_overlap are negatives;
        # an IoU above classifier_max_overlap makes a positive.
        # With multi-batch training, a classifier_min_overlap of 0.1 could
        # leave a batch without negatives, so it was lowered to 0.
        self.classifier_min_overlap = 0
        self.classifier_max_overlap = 0.5
        self.classifier_regr_std = [8.0, 8.0, 4.0, 4.0]

        self.pooling_regions = 14
import numpy as np
def bbox_iou(bbox_a, bbox_b):
    """Return the pairwise IoU between two sets of boxes.

    Args:
        bbox_a: Array of shape ``(N, 4)``; each row is two corner points,
            the smaller corner first.
        bbox_b: Array of shape ``(K, 4)`` in the same layout.

    Returns:
        Array of shape ``(N, K)``; entry ``[i, j]`` is the IoU of
        ``bbox_a[i]`` and ``bbox_b[j]``.

    Raises:
        IndexError: If either input does not have 4 columns.
    """
    if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
        raise IndexError(
            "bbox_iou expects (N, 4) and (K, 4) arrays, got shapes %s and %s"
            % (bbox_a.shape, bbox_b.shape)
        )
    # Top-left / bottom-right corners of every pairwise intersection, (N, K, 2).
    tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
    br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
    # The (tl < br) mask zeroes the area of pairs that do not overlap.
    area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)
    area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
    area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
    return area_i / (area_a[:, None] + area_b - area_i)
def bbox2loc(src_bbox, dst_bbox):
    """Encode target boxes as offsets relative to source boxes.

    Args:
        src_bbox: Array of shape ``(N, 4)`` with the reference boxes, two
            corner points per row.
        dst_bbox: Array of shape ``(N, 4)`` with the boxes to encode.

    Returns:
        Array of shape ``(N, 4)`` with ``(dx, dy, dw, dh)`` rows: the centre
        offsets divided by the source size, and the log of the size ratios.
    """
    src_bbox = np.asarray(src_bbox)
    dst_bbox = np.asarray(dst_bbox)
    # np.finfo below only accepts floating dtypes; promote integer boxes so
    # they are handled instead of raising.
    if not np.issubdtype(src_bbox.dtype, np.floating):
        src_bbox = src_bbox.astype(np.float64)

    width = src_bbox[:, 2] - src_bbox[:, 0]
    height = src_bbox[:, 3] - src_bbox[:, 1]
    ctr_x = src_bbox[:, 0] + 0.5 * width
    ctr_y = src_bbox[:, 1] + 0.5 * height

    base_width = dst_bbox[:, 2] - dst_bbox[:, 0]
    base_height = dst_bbox[:, 3] - dst_bbox[:, 1]
    base_ctr_x = dst_bbox[:, 0] + 0.5 * base_width
    base_ctr_y = dst_bbox[:, 1] + 0.5 * base_height

    # Guard the divisions against zero-sized source boxes.
    eps = np.finfo(height.dtype).eps
    width = np.maximum(width, eps)
    height = np.maximum(height, eps)

    dx = (base_ctr_x - ctr_x) / width
    dy = (base_ctr_y - ctr_y) / height
    dw = np.log(base_width / width)
    dh = np.log(base_height / height)

    loc = np.vstack((dx, dy, dw, dh)).transpose()
    return loc
# Sample config.num_rois proposals for classifier training and build the
# classifier inputs/targets (X, Y1, Y2) from them.
#   R          - proposal boxes, 4 columns
#   config     - object providing num_rois, classifier_min_overlap,
#                classifier_max_overlap and classifier_regr_std
#   all_boxes  - ground-truth rows: 4 box columns followed by the label
#   num_classes- number of classes; index num_classes - 1 is used for negatives
# NOTE(review): this excerpt has lost its indentation and several lines (the
# `else:` branches and the right-hand side after the line continuation near
# the end); see the notes below.
def calc_iou(R, config, all_boxes, num_classes):
bboxes = all_boxes[:, :4]
label = all_boxes[:, 4]
# The ground-truth boxes themselves are added to the proposals.
R = np.concatenate([R, bboxes], axis=0)
# ----------------------------------------------------- #
# Compute the overlap between proposals and ground-truth boxes
# ----------------------------------------------------- #
iou = bbox_iou(R, bboxes)
# Without ground truth every proposal gets assignment 0, IoU 0 and label 0.
if len(bboxes)==0:
gt_assignment = np.zeros(len(R), np.int32)
max_iou = np.zeros(len(R))
gt_roi_label = np.zeros(len(R))
# NOTE(review): an `else:` is presumably missing here; as written the three
# assignments below would overwrite the empty-ground-truth values.
# IoU of each proposal with its best-matching ground-truth box [num_roi, ]
max_iou = iou.max(axis=1)
# Index of the best-matching ground-truth box per proposal [num_roi, ]
gt_assignment = iou.argmax(axis=1)
# Label of that ground-truth box
gt_roi_label = label[gt_assignment]
# Proposals whose best IoU is >= config.classifier_max_overlap are positives
# (the original comment called them negatives, which contradicts the code).
# The number of positives is capped at config.num_rois // 2.
pos_index = np.where(max_iou >= config.classifier_max_overlap)[0]
pos_roi_per_this_image = int(min(config.num_rois//2, pos_index.size))
if pos_index.size > 0:
pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
# Proposals with classifier_min_overlap <= IoU < classifier_max_overlap are
# negatives; positives plus negatives always add up to config.num_rois.
neg_index = np.where((max_iou < config.classifier_max_overlap) & (max_iou >= config.classifier_min_overlap))[0]
neg_roi_per_this_image = config.num_rois - pos_roi_per_this_image
# Sample with replacement only when there are fewer negatives than needed.
if neg_roi_per_this_image > neg_index.size:
neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=True)
# NOTE(review): an `else:` is presumably missing before the next line.
neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
# sample_roi [n_sample, ]
# gt_roi_loc [n_sample, 4]
# gt_roi_label [n_sample, ]
keep_index = np.append(pos_index, neg_index)
sample_roi = R[keep_index]
if len(bboxes)!=0:
gt_roi_loc = bbox2loc(sample_roi, bboxes[gt_assignment[keep_index]])
gt_roi_loc = gt_roi_loc * np.array(config.classifier_regr_std)
# NOTE(review): an `else:` is presumably missing before the next line.
gt_roi_loc = np.zeros_like(sample_roi)
gt_roi_label = gt_roi_label[keep_index]
# Everything after the positives gets the last class index.
gt_roi_label[pos_roi_per_this_image:] = num_classes - 1
# X [n_sample, 4]
# Y1 [n_sample, num_classes]
# Y2 [n_sample, (num_classes-1)*8]
X = np.zeros_like(sample_roi)
# Swap each x/y pair of the sampled boxes.
X[:, [0, 1, 2, 3]] = sample_roi[:, [1, 0, 3, 2]]
# One-hot class targets.
Y1 = np.eye(num_classes)[np.array(gt_roi_label,np.int32)]
y_class_regr_label = np.zeros([np.shape(gt_roi_loc)[0], num_classes-1, 4])
y_class_regr_coords = np.zeros([np.shape(gt_roi_loc)[0], num_classes-1, 4])
# Mark the 4 regression slots of the labelled class for every positive sample.
y_class_regr_label[np.arange(np.shape(gt_roi_loc)[0])[:pos_roi_per_this_image], np.array(gt_roi_label[:pos_roi_per_this_image], np.int32)] = 1
# NOTE(review): the right-hand side that followed the line continuation below
# is missing (presumably the positives' rows of gt_roi_loc - confirm against
# the original); as written the continuation swallows the next statement.
y_class_regr_coords[np.arange(np.shape(gt_roi_loc)[0])[:pos_roi_per_this_image], np.array(gt_roi_label[:pos_roi_per_this_image], np.int32)] = \
y_class_regr_label = np.reshape(y_class_regr_label, [np.shape(gt_roi_loc)[0], -1])
y_class_regr_coords = np.reshape(y_class_regr_coords, [np.shape(gt_roi_loc)[0], -1])
# Y2 = [regression mask | regression targets] side by side.
Y2 = np.concatenate([np.array(y_class_regr_label), np.array(y_class_regr_coords)],axis=1)
return X, Y1, Y2
import math
import numpy as np
import tensorflow as tf
from PIL import Image
class BBoxUtility(object):
    """Box encoding for RPN training and box decoding / NMS for prediction."""

    def __init__(self, overlap_threshold=0.7, ignore_threshold=0.3, rpn_pre_boxes=12000, rpn_nms=0.7, classifier_nms=0.3, top_k=300):
        """Store the thresholds used by the encoding and decoding methods.

        Args:
            overlap_threshold: IoU above which a prior is assigned to a box.
            ignore_threshold: Lower IoU bound of the "ignore" band; priors
                between this and ``overlap_threshold`` are ignored.
            rpn_pre_boxes: Number of top-scoring RPN boxes kept before NMS.
            rpn_nms: IoU threshold of the RPN NMS.
            classifier_nms: IoU threshold of the per-class NMS.
            top_k: Maximum number of boxes kept by each NMS call.
        """
        self.overlap_threshold = overlap_threshold
        self.ignore_threshold = ignore_threshold
        self.rpn_pre_boxes = rpn_pre_boxes
        self.rpn_nms = rpn_nms
        self.classifier_nms = classifier_nms
        self.top_k = top_k

    def iou(self, box):
        """Return the IoU of one ground-truth ``box`` with every prior in ``self.priors``."""
        # Intersection rectangle of the box with each prior.
        inter_upleft = np.maximum(self.priors[:, :2], box[:2])
        inter_botright = np.minimum(self.priors[:, 2:4], box[2:])

        inter_wh = inter_botright - inter_upleft
        inter_wh = np.maximum(inter_wh, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]
        # Area of the ground-truth box.
        area_true = (box[2] - box[0]) * (box[3] - box[1])
        # Area of each prior.
        area_gt = (self.priors[:, 2] - self.priors[:, 0]) * (self.priors[:, 3] - self.priors[:, 1])
        union = area_true + area_gt - inter

        iou = inter / union
        return iou

    def encode_ignore_box(self, box, return_iou=True):
        """Encode one ground-truth box against all priors.

        Args:
            box: Ground-truth box ``(x1, y1, x2, y2)``.
            return_iou: Whether to append the IoU as a fifth column.

        Returns:
            Tuple ``(encoded, ignored)`` of flattened arrays: ``encoded``
            holds ``4 + return_iou`` values per prior (regression targets and
            IoU, non-zero only for assigned priors); ``ignored`` holds one
            value per prior, the IoU of priors inside the ignore band.
        """
        iou = self.iou(box)
        ignored_box = np.zeros((self.num_priors, 1))

        # Priors whose IoU falls inside the ignore band.
        assign_mask_ignore = (iou > self.ignore_threshold) & (iou < self.overlap_threshold)
        ignored_box[:, 0][assign_mask_ignore] = iou[assign_mask_ignore]

        encoded_box = np.zeros((self.num_priors, 4 + return_iou))

        # Priors that overlap the box strongly; if there is none, fall back
        # to the single best prior so every box is assigned at least once.
        assign_mask = iou > self.overlap_threshold
        if not assign_mask.any():
            assign_mask[iou.argmax()] = True
        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask]

        assigned_priors = self.priors[assign_mask]
        # Centre and size of the ground-truth box.
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # Centre and size of the assigned priors.
        assigned_priors_center = 0.5 * (assigned_priors[:, :2] +
                                        assigned_priors[:, 2:4])
        assigned_priors_wh = (assigned_priors[:, 2:4] -
                              assigned_priors[:, :2])

        # Inverse of decode_boxes: centre offsets relative to the prior size
        # and log size ratios. decode_boxes divides every term by 4, so the
        # targets are scaled by 4 here to keep encode/decode consistent.
        encoded_box[:, :2][assign_mask] = box_center - assigned_priors_center
        encoded_box[:, :2][assign_mask] /= assigned_priors_wh
        encoded_box[:, :2][assign_mask] *= 4

        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_priors_wh)
        encoded_box[:, 2:4][assign_mask] *= 4

        return encoded_box.ravel(), ignored_box.ravel()

    def assign_boxes(self, boxes, anchors):
        """Build the RPN training target for one image.

        Args:
            boxes: Array whose first four columns are ground-truth boxes.
            anchors: Array of priors, one ``(x1, y1, x2, y2)`` row each.

        Returns:
            Array of shape ``(len(anchors), 5)``: columns 0-3 are the
            regression targets, column 4 is 1 for priors assigned to an
            object, -1 for ignored priors and 0 (background) otherwise.
        """
        self.num_priors = len(anchors)
        self.priors = anchors

        assignment = np.zeros((self.num_priors, 4 + 1))
        assignment[:, 4] = 0.0
        if len(boxes) == 0:
            return assignment

        # Encode every ground-truth box. encode_ignore_box returns two
        # arrays of different lengths, which np.apply_along_axis cannot pack
        # into a regular array, so the boxes are iterated explicitly.
        encoded_rows = []
        ignored_rows = []
        for box in boxes[:, :4]:
            encoded, ignored = self.encode_ignore_box(box)
            encoded_rows.append(encoded)
            ignored_rows.append(ignored)

        # [num_true_box, num_priors, 1], the 1 being the IoU.
        ignored_boxes = np.array(ignored_rows).reshape(-1, self.num_priors, 1)
        ignore_iou = ignored_boxes[:, :, 0].max(axis=0)
        ignore_iou_mask = ignore_iou > 0
        assignment[:, 4][ignore_iou_mask] = -1

        # [num_true_box, num_priors, 4 + 1]: encoded offsets plus IoU.
        encoded_boxes = np.array(encoded_rows).reshape(-1, self.num_priors, 5)

        # For every prior, the ground-truth box with the largest IoU.
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]

        # Number of priors that were assigned to some box.
        assign_num = len(best_iou_idx)

        # Copy the encoded offsets of the winning box into the assignment.
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]
        # Assigned priors override the ignore flag set above.
        assignment[:, 4][best_iou_mask] = 1
        return assignment

    def decode_boxes(self, mbox_loc, mbox_priorbox):
        """Decode regression outputs into boxes clipped to ``[0, 1]``.

        Args:
            mbox_loc: Array of shape ``(N, 4)`` with the regression outputs.
            mbox_priorbox: Array of shape ``(N, 4)`` with the matching priors.

        Returns:
            Array of shape ``(N, 4)`` with ``(xmin, ymin, xmax, ymax)`` rows.
        """
        # Width, height and centre of the priors.
        prior_width = mbox_priorbox[:, 2] - mbox_priorbox[:, 0]
        prior_height = mbox_priorbox[:, 3] - mbox_priorbox[:, 1]
        prior_center_x = 0.5 * (mbox_priorbox[:, 2] + mbox_priorbox[:, 0])
        prior_center_y = 0.5 * (mbox_priorbox[:, 3] + mbox_priorbox[:, 1])

        # Centre of the decoded box: prior centre plus the scaled offset.
        decode_bbox_center_x = mbox_loc[:, 0] * prior_width / 4
        decode_bbox_center_x += prior_center_x
        decode_bbox_center_y = mbox_loc[:, 1] * prior_height / 4
        decode_bbox_center_y += prior_center_y

        # Width and height of the decoded box.
        decode_bbox_width = np.exp(mbox_loc[:, 2] / 4)
        decode_bbox_width *= prior_width
        decode_bbox_height = np.exp(mbox_loc[:, 3] / 4)
        decode_bbox_height *= prior_height

        # Corners of the decoded box.
        decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
        decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
        decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
        decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height

        decode_bbox = np.concatenate((decode_bbox_xmin[:, None],
                                      decode_bbox_ymin[:, None],
                                      decode_bbox_xmax[:, None],
                                      decode_bbox_ymax[:, None]), axis=-1)
        # Keep the coordinates inside [0, 1].
        decode_bbox = np.minimum(np.maximum(decode_bbox, 0.0), 1.0)
        return decode_bbox

    def detection_out_rpn(self, predictions, mbox_priorbox):
        """Turn RPN outputs into proposal boxes.

        Args:
            predictions: Sequence whose item 0 holds the objectness scores
                and item 1 the regression outputs, one entry per image.
            mbox_priorbox: Priors shared by all images.

        Returns:
            Array with one ``(n, 5)`` block per image; each row is
            ``(confidence, x1, y1, x2, y2)`` sorted by descending confidence.
        """
        mbox_conf = predictions[0]
        mbox_loc = predictions[1]

        results = []
        # One iteration per image; prediction feeds a single image.
        for i in range(len(mbox_loc)):
            # Decode the priors with the regression output.
            decode_bbox = self.decode_boxes(mbox_loc[i], mbox_priorbox)
            # Objectness score of every prior.
            c_confs = mbox_conf[i, :, 0]
            argsort_index = np.argsort(c_confs)[::-1]
            c_confs = c_confs[argsort_index[:self.rpn_pre_boxes]]
            decode_bbox = decode_bbox[argsort_index[:self.rpn_pre_boxes], :]

            # IoU-based non-maximum suppression.
            idx = tf.image.non_max_suppression(decode_bbox, c_confs, self.top_k, iou_threshold=self.rpn_nms).numpy()

            # Keep the boxes that survived the NMS.
            good_boxes = decode_bbox[idx]
            confs = c_confs[idx][:, None]

            c_pred = np.concatenate((confs, good_boxes), axis=1)
            argsort = np.argsort(c_pred[:, 0])[::-1]
            c_pred = c_pred[argsort]
            # Callers index the result as [image, box, column], so each
            # image's proposals must be collected here.
            results.append(c_pred)

        return np.array(results)

    def detection_out_classifier(self, predictions, proposal_box, config, confidence):
        """Turn classifier outputs into final detections.

        Args:
            predictions: Sequence whose item 0 holds the class scores (the
                last column is excluded when picking the label) and item 1
                the per-class regression outputs, one entry per image.
            proposal_box: Proposals ``(x1, y1, x2, y2)`` per image; the
                array is not modified.
            config: Object providing ``classifier_regr_std``.
            confidence: Minimum class score for a detection to be kept.

        Returns:
            List with one entry per image; each entry is a list of rows
            ``[x1, y1, x2, y2, score, label]`` that survived per-class NMS.
        """
        proposal_conf = predictions[0]
        proposal_loc = predictions[1]

        results = []
        # One iteration per image; prediction feeds a single image.
        for i in range(len(proposal_conf)):
            proposal_pred = []
            # Work on a copy so the caller's proposals are not converted to
            # (x, y, w, h) behind its back.
            boxes_xywh = proposal_box[i].copy()
            boxes_xywh[:, 2] = boxes_xywh[:, 2] - boxes_xywh[:, 0]
            boxes_xywh[:, 3] = boxes_xywh[:, 3] - boxes_xywh[:, 1]
            for j in range(proposal_conf[i].shape[0]):
                # Skip proposals whose best score is below the threshold.
                if np.max(proposal_conf[i][j, :-1]) < confidence:
                    continue
                label = np.argmax(proposal_conf[i][j, :-1])
                score = np.max(proposal_conf[i][j, :-1])

                (x, y, w, h) = boxes_xywh[j, :]

                # Regression output of the winning class, un-normalised.
                (tx, ty, tw, th) = proposal_loc[i][j, 4 * label: 4 * (label + 1)]
                tx /= config.classifier_regr_std[0]
                ty /= config.classifier_regr_std[1]
                tw /= config.classifier_regr_std[2]
                th /= config.classifier_regr_std[3]

                cx = x + w / 2.
                cy = y + h / 2.
                cx1 = tx * w + cx
                cy1 = ty * h + cy
                w1 = math.exp(tw) * w
                h1 = math.exp(th) * h

                x1 = cx1 - w1 / 2.
                y1 = cy1 - h1 / 2.
                x2 = cx1 + w1 / 2
                y2 = cy1 + h1 / 2

                # Row layout expected by the NMS below and by the callers:
                # box, score (column 4), label (last column).
                proposal_pred.append([x1, y1, x2, y2, score, label])

            num_classes = np.shape(proposal_conf)[-1]
            proposal_pred = np.array(proposal_pred)
            good_boxes = []
            if len(proposal_pred) != 0:
                for c in range(num_classes):
                    mask = proposal_pred[:, -1] == c
                    if len(proposal_pred[mask]) > 0:
                        boxes_to_process = proposal_pred[:, :4][mask]
                        confs_to_process = proposal_pred[:, 4][mask]
                        idx = tf.image.non_max_suppression(boxes_to_process, confs_to_process, self.top_k, iou_threshold=self.classifier_nms).numpy()
                        # Keep the rows that survived the NMS.
                        good_boxes.extend(proposal_pred[mask][idx])
            results.append(good_boxes)

        return results
import os
import random
from random import shuffle
import cv2
import numpy as np
import scipy.signal
import tensorflow as tf
from matplotlib import pyplot as plt
from PIL import Image
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.applications.imagenet_utils import preprocess_input
from anchors import get_anchors
def rand(a=0, b=1):
    """Return a random float drawn uniformly from ``[a, b)``."""
    return np.random.rand() * (b - a) + a
def cls_loss(ratio=3):
    """Build the RPN classification loss.

    Args:
        ratio: Accepted for interface compatibility; not used by the loss.

    Returns:
        A function ``(y_true, y_pred) -> loss`` computing the binary
        cross-entropy over all anchors that are not marked "ignore",
        divided by the number of such anchors.
    """
    def _cls_loss(y_true, y_pred):
        # y_true [batch_size, num_anchor, 1]
        # y_pred [batch_size, num_anchor, 1]
        # In y_true, -1 means ignore, 0 background, 1 object.
        labels = y_true
        anchor_state = y_true
        classification = y_pred

        # Select every anchor that must not be ignored.
        indices_for_no_ignore = tf.where(keras.backend.not_equal(anchor_state, -1))
        labels_for_no_ignore = tf.gather_nd(labels, indices_for_no_ignore)
        classification_for_no_ignore = tf.gather_nd(classification, indices_for_no_ignore)

        cls_loss_for_no_ignore = keras.backend.binary_crossentropy(labels_for_no_ignore, classification_for_no_ignore)
        cls_loss_for_no_ignore = keras.backend.sum(cls_loss_for_no_ignore)

        # Normalise by the number of selected anchors (at least 1, to avoid
        # dividing by zero). The index tensor computed above is reused.
        normalizer_no_ignore = keras.backend.cast(keras.backend.shape(indices_for_no_ignore)[0], keras.backend.floatx())
        normalizer_no_ignore = keras.backend.maximum(keras.backend.cast_to_floatx(1.0), normalizer_no_ignore)

        loss = cls_loss_for_no_ignore / normalizer_no_ignore
        return loss
    return _cls_loss
def smooth_l1(sigma=1.0):
    """Build the smooth-L1 regression loss for the RPN.

    Args:
        sigma: Controls the switch point between the quadratic and the
            linear part (``1 / sigma**2``).

    Returns:
        A function ``(y_true, y_pred) -> loss`` that sums the smooth-L1
        error over the positive anchors and divides by their number.
    """
    sigma_squared = sigma ** 2

    def _smooth_l1(y_true, y_pred):
        # y_true [batch_size, num_anchor, 4+1]
        # y_pred [batch_size, num_anchor, 4]
        regression = y_pred
        regression_target = y_true[:, :, :-1]
        anchor_state = y_true[:, :, -1]

        # Only positive anchors (state == 1) contribute.
        indices = tf.where(keras.backend.equal(anchor_state, 1))
        regression = tf.gather_nd(regression, indices)
        regression_target = tf.gather_nd(regression_target, indices)

        # Smooth L1: quadratic below 1 / sigma^2, linear above.
        regression_diff = regression - regression_target
        regression_diff = keras.backend.abs(regression_diff)
        regression_loss = tf.where(
            keras.backend.less(regression_diff, 1.0 / sigma_squared),
            0.5 * sigma_squared * keras.backend.pow(regression_diff, 2),
            regression_diff - 0.5 / sigma_squared
        )

        # Divide by the number of positive anchors (at least 1).
        normalizer = keras.backend.maximum(1, keras.backend.shape(indices)[0])
        normalizer = keras.backend.cast(normalizer, dtype=keras.backend.floatx())
        regression_loss = keras.backend.sum(regression_loss) / normalizer
        return regression_loss
    return _smooth_l1
def class_loss_regr(num_classes):
    """Build the regression loss of the classifier head.

    Args:
        num_classes: Number of classes with regression outputs; ``y_true``
            is split at column ``4 * num_classes``.

    Returns:
        A function ``(y_true, y_pred) -> loss``. The first
        ``4 * num_classes`` columns of ``y_true`` act as a mask, the
        remaining columns are the regression targets.
    """
    epsilon = 1e-4

    def class_loss_regr_fixed_num(y_true, y_pred):
        x = y_true[:, :, 4 * num_classes:] - y_pred
        x_abs = K.abs(x)
        # 1.0 where the quadratic branch applies, 0.0 for the linear one.
        x_bool = K.cast(K.less_equal(x_abs, 1.0), 'float32')
        # Masked smooth L1, normalised by the mask sum; epsilon keeps the
        # denominator non-zero when the mask is empty.
        loss = 4 * K.sum(y_true[:, :, :4 * num_classes] * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))) / K.sum(epsilon + y_true[:, :, :4 * num_classes])
        return loss
    return class_loss_regr_fixed_num
def class_loss_cls(y_true, y_pred):
    """Return the mean categorical cross-entropy between ``y_true`` and ``y_pred``."""
    loss = K.mean(K.categorical_crossentropy(y_true, y_pred))
    return loss
def get_new_img_size(width, height, img_min_side=600):
    """Return the size of an image rescaled so its shorter side is ``img_min_side``.

    Args:
        width: Original width in pixels.
        height: Original height in pixels.
        img_min_side: Target length of the shorter side.

    Returns:
        Tuple ``(resized_width, resized_height)``; the aspect ratio is kept
        and the longer side is truncated to an integer.
    """
    if width <= height:
        f = float(img_min_side) / width
        resized_height = int(f * height)
        resized_width = int(img_min_side)
    else:
        f = float(img_min_side) / height
        resized_width = int(f * width)
        resized_height = int(img_min_side)

    return resized_width, resized_height
def get_img_output_length(width, height):
    """Return the (width, height) of the shared feature map.

    Applies four stride-2 layers with kernel sizes 7, 3, 1, 1 and paddings
    3, 1, 0, 0 to each input dimension.
    """
    def get_output_length(input_length):
        filter_sizes = [7, 3, 1, 1]
        padding = [3, 1, 0, 0]
        stride = 2
        for pad, kernel in zip(padding, filter_sizes):
            input_length = (input_length + 2 * pad - kernel) // stride + 1
        return input_length
    return get_output_length(width), get_output_length(height)
class Generator(object):
def __init__(self, bbox_util, train_lines, num_classes, Batch_size, input_shape = [600,600], num_regions=256):
self.bbox_util = bbox_util
self.train_lines = train_lines
self.train_batches = len(train_lines)
self.num_classes = num_classes
self.Batch_size = Batch_size
self.input_shape = input_shape
self.num_regions = num_regions
def get_random_data(self, annotation_line, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
line = annotation_line.split()
# print(type(line[0]))
# print(line[0])
# print(line[0][3])
path_split = line[0].split('/')
# print(path_split)
# print(path_split[0])
# print(path_split[1])
# print(path_split)
link_to_img_path = "../input/images"
last_Path2 = "../input/myimge/" + path_split[4]
# print(last_Path2)
last_Path = os.path.join(link_to_img_path,path_split[4])
image =
iw, ih = image.size
w, h = self.input_shape
box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
if not random:
# resize image
scale = min(w/iw, h/ih)
nw = int(iw*scale)
nh = int(ih*scale)
dx = (w-nw)//2
dy = (h-nh)//2
image = image.resize((nw,nh), Image.BICUBIC)
new_image ='RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
# correct boxes
box_data = np.zeros((len(box),5))
if len(box)>0:
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
box_data = np.zeros((len(box),5))
box_data[:len(box)] = box
return image_data, box_data
# resize image
new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter)
scale = rand(.25, 2)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw,nh), Image.BICUBIC)
# place image
dx = int(rand(0, w-nw))
dy = int(rand(0, h-nh))
new_image ='RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image = new_image
# flip image or not
flip = rand()<.5
if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
# distort image
hue = rand(-hue, hue)
sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
val = rand(1, val) if rand()<.5 else 1/rand(1, val)
x = cv2.cvtColor(np.array(image,np.float32)/255, cv2.COLOR_RGB2HSV)
x[..., 0] += hue*360
x[..., 0][x[..., 0]>1] -= 1
x[..., 0][x[..., 0]<0] += 1
x[..., 1] *= sat
x[..., 2] *= val
x[x[:,:, 0]>360, 0] = 360
x[:, :, 1:][x[:, :, 1:]>1] = 1
x[x<0] = 0
image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB)*255
box_data = np.zeros((len(box),5))
if len(box)>0:
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
if flip: box[:, [0,2]] = w - box[:, [2,0]]
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
box_data = np.zeros((len(box),5))
box_data[:len(box)] = box
return image_data, box_data
def generate(self):
    """Yield RPN training batches forever.

    Each pass walks ``self.train_lines`` once. For every annotation line the
    image and boxes come from ``self.get_random_data``; the boxes are scaled
    to the 0-1 range and matched against the anchors through
    ``self.bbox_util.assign_boxes``.

    Yields:
        A 3-tuple ``(images, [cls_targets, reg_targets], boxes)`` once
        ``self.Batch_size`` samples have been collected. ``images`` has gone
        through ``preprocess_input``; ``boxes`` is a plain list holding one
        normalised box array per image.

    NOTE(review): the pasted source had lost the lines that fill the batch
    lists, so the generator could never yield. The four ``append`` calls
    below are a reconstruction — confirm against the upstream project.
    """
    while True:
        lines = self.train_lines
        inputs = []
        target0 = []
        target1 = []
        target2 = []
        for annotation_line in lines:
            img, y = self.get_random_data(annotation_line)
            height, width, _ = np.shape(img)

            if len(y) > 0:
                # Scale pixel coordinates to the 0-1 range.
                boxes = np.array(y[:, :4], dtype=np.float32)
                boxes[:, 0] = boxes[:, 0] / width
                boxes[:, 1] = boxes[:, 1] / height
                boxes[:, 2] = boxes[:, 2] / width
                boxes[:, 3] = boxes[:, 3] / height
                y[:, :4] = boxes[:, :4]

            anchors = get_anchors(get_img_output_length(width, height), width, height)

            # assignment has shape (:, 5):
            #   [:, :4] regression targets the network should predict
            #   [:, 4]  whether the anchor contains an object (background by default)
            assignment = self.bbox_util.assign_boxes(y, anchors)
            classification = assignment[:, 4]
            regression = assignment[:, :]

            # Sub-sample positives and negatives so that at most
            # self.num_regions anchors take part in the loss
            # (the original comment states a total of 256).
            mask_pos = classification[:] > 0
            num_pos = len(classification[mask_pos])
            if num_pos > self.num_regions / 2:
                # Too many positives: mark the surplus as ignored (-1).
                val_locs = random.sample(range(num_pos), int(num_pos - self.num_regions / 2))
                temp_classification = classification[mask_pos]
                temp_regression = regression[mask_pos]
                temp_classification[val_locs] = -1
                temp_regression[val_locs, -1] = -1
                classification[mask_pos] = temp_classification
                regression[mask_pos] = temp_regression

            mask_neg = classification[:] == 0
            num_neg = len(classification[mask_neg])
            mask_pos = classification[:] > 0
            num_pos = len(classification[mask_pos])
            if len(classification[mask_neg]) + num_pos > self.num_regions:
                # Too many negatives: mark the surplus as ignored (-1).
                val_locs = random.sample(range(num_neg), int(num_neg + num_pos - self.num_regions))
                temp_classification = classification[mask_neg]
                temp_classification[val_locs] = -1
                classification[mask_neg] = temp_classification

            # Collect this sample. Without these appends the batch lists stay
            # empty and the size check below is never satisfied.
            inputs.append(np.array(img))
            target0.append(np.reshape(classification, [-1, 1]))
            target1.append(np.reshape(regression, [-1, 5]))
            target2.append(y)

            if len(inputs) == self.Batch_size:
                tmp_inp = np.array(inputs)
                tmp_targets = [np.array(target0, np.float32), np.array(target1, np.float32)]
                tmp_y = target2
                yield preprocess_input(tmp_inp), tmp_targets, tmp_y
                inputs = []
                target0 = []
                target1 = []
                target2 = []
class LossHistory():
    """Record per-epoch train/validation losses as text files and a plot.

    Everything is written below ``<log_dir>/loss_<timestamp>/``, where the
    timestamp is taken once, when the object is constructed.

    NOTE(review): the pasted source had lost several lines (the timestamp
    expression, the bodies of both ``with`` blocks, the ``else`` of the
    window-size choice). They are reconstructed here — confirm against the
    upstream project.
    """

    def __init__(self, log_dir):
        """Create the timestamped output directory under *log_dir*."""
        import datetime
        curr_time = datetime.datetime.now()
        time_str = datetime.datetime.strftime(curr_time, '%Y_%m_%d_%H_%M_%S')
        self.log_dir = log_dir
        self.time_str = time_str
        self.save_path = os.path.join(self.log_dir, "loss_" + str(self.time_str))
        self.losses = []
        self.val_loss = []

        # The files written later live inside save_path, so it must exist.
        os.makedirs(self.save_path, exist_ok=True)

    def append_loss(self, loss, val_loss):
        """Store one epoch's losses, append them to the text logs and redraw the plot."""
        self.losses.append(loss)
        self.val_loss.append(val_loss)
        with open(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".txt"), 'a') as f:
            f.write(str(loss))
            f.write("\n")
        with open(os.path.join(self.save_path, "epoch_val_loss_" + str(self.time_str) + ".txt"), 'a') as f:
            f.write(str(val_loss))
            f.write("\n")
        self.loss_plot()

    def loss_plot(self):
        """Save a PNG with the raw and, once enough points exist, smoothed loss curves."""
        iters = range(len(self.losses))

        plt.figure()
        plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss')
        plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss')

        # Savitzky-Golay window: short for short histories, wider afterwards.
        if len(self.losses) < 25:
            num = 5
        else:
            num = 15
        # savgol_filter rejects a window longer than the data, so the
        # smoothed curves are only drawn once there are enough epochs.
        if len(self.losses) >= num:
            plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss')
            plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss')

        plt.legend(loc="upper right")
        plt.savefig(os.path.join(self.save_path, "epoch_loss_" + str(self.time_str) + ".png"))

        # Release the figure; otherwise one figure per epoch stays alive.
        plt.close("all")
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer
class RoiPoolingConv(Layer):
    """ROI pooling layer built on ``tf.image.crop_and_resize``.

    Called on a two-element list ``[feature_map, rois]``. Every ROI is
    cropped from the feature map of its own batch element and resized to
    ``pool_size x pool_size``.

    Args:
        pool_size: side length of the square output patch per ROI.
    """

    def __init__(self, pool_size, **kwargs):
        self.pool_size = pool_size
        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        # Channel count of the feature map (first input, last axis of a 4-D shape).
        self.nb_channels = input_shape[0][3]

    def compute_output_shape(self, input_shape):
        input_shape2 = input_shape[1]
        return None, input_shape2[1], self.pool_size, self.pool_size, self.nb_channels

    def call(self, x, mask=None):
        """Crop and resize every ROI.

        Args:
            x: ``[feature_map, rois]``. ``rois`` is indexed below as
                ``(batch, num_rois, 4)``.
                NOTE(review): ``tf.image.crop_and_resize`` documents boxes as
                normalised ``[y1, x1, y2, x2]`` — confirm the caller follows
                that ordering.
            mask: unused.

        Returns:
            Tensor reshaped to
            ``(batch, num_rois, pool_size, pool_size, nb_channels)``.

        Raises:
            ValueError: if *x* does not hold exactly two tensors.
        """
        # An explicit check survives ``python -O``, unlike ``assert``.
        if len(x) != 2:
            raise ValueError(
                "RoiPoolingConv expects [feature_map, rois], got %d inputs" % len(x))

        img = x[0]
        rois = x[1]

        num_rois = tf.shape(rois)[1]
        batch_size = tf.shape(rois)[0]

        # For every ROI, the index of the batch element it belongs to:
        # [0, 0, ..., 1, 1, ...] flattened to line up with the flattened ROIs.
        box_index = tf.expand_dims(tf.range(0, batch_size), 1)
        box_index = tf.tile(box_index, (1, num_rois))
        box_index = tf.reshape(box_index, [-1])

        rs = tf.image.crop_and_resize(img, tf.reshape(rois, [-1, 4]), box_index, (self.pool_size, self.pool_size))

        final_output = K.reshape(rs, (batch_size, num_rois, self.pool_size, self.pool_size, self.nb_channels))
        return final_output

    def get_config(self):
        """Return the layer config including ``pool_size`` so the layer can be re-created."""
        config = super(RoiPoolingConv, self).get_config()
        config.update({'pool_size': self.pool_size})
        return config
# ResNet50的网络部分
from __future__ import print_function
from tensorflow.keras import layers
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import (Activation, Add, AveragePooling2D, Conv2D, BatchNormalization,
MaxPooling2D, TimeDistributed,
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Bottleneck residual block whose shortcut is the unchanged input.

    Args:
        input_tensor: tensor entering the block; it is also added back at the end.
        kernel_size: kernel size of the middle convolution.
        filters: three filter counts, for the 1x1, middle and final 1x1 convolutions.
        stage: stage number, used only to build layer names.
        block: block label (e.g. ``'b'``), used only to build layer names.

    Returns:
        The ReLU-activated sum of the main branch and ``input_tensor``.
    """
    n_reduce, n_mid, n_expand = filters
    branch = str(stage) + block + '_branch'
    conv_prefix = 'res' + branch
    bn_prefix = 'bn' + branch

    def conv_bn(tensor, n_filters, size, tag, **conv_kwargs):
        # Convolution followed by a frozen (non-trainable) batch norm.
        tensor = Conv2D(n_filters, size, kernel_initializer=RandomNormal(stddev=0.02),
                        name=conv_prefix + tag, **conv_kwargs)(tensor)
        return BatchNormalization(trainable=False, name=bn_prefix + tag)(tensor)

    out = conv_bn(input_tensor, n_reduce, (1, 1), '2a')
    out = Activation('relu')(out)

    out = conv_bn(out, n_mid, kernel_size, '2b', padding='same')
    out = Activation('relu')(out)

    out = conv_bn(out, n_expand, (1, 1), '2c')

    out = layers.add([out, input_tensor])
    return Activation('relu')(out)
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """Bottleneck residual block with a convolutional (projection) shortcut.

    Args:
        input_tensor: tensor entering the block.
        kernel_size: kernel size of the middle convolution.
        filters: three filter counts, for the 1x1, middle and final 1x1 convolutions.
        stage: stage number, used only to build layer names.
        block: block label (e.g. ``'a'``), used only to build layer names.
        strides: strides of the first main-branch convolution and of the shortcut convolution.

    Returns:
        The ReLU-activated sum of the main branch and the projected shortcut.
    """
    n_reduce, n_mid, n_expand = filters
    branch = str(stage) + block + '_branch'
    conv_prefix = 'res' + branch
    bn_prefix = 'bn' + branch

    def conv_bn(tensor, n_filters, size, tag, **conv_kwargs):
        # Convolution followed by a frozen (non-trainable) batch norm.
        tensor = Conv2D(n_filters, size, kernel_initializer=RandomNormal(stddev=0.02),
                        name=conv_prefix + tag, **conv_kwargs)(tensor)
        return BatchNormalization(trainable=False, name=bn_prefix + tag)(tensor)

    # Main branch: 1x1 (strided) -> kxk -> 1x1.
    main = conv_bn(input_tensor, n_reduce, (1, 1), '2a', strides=strides)
    main = Activation('relu')(main)

    main = conv_bn(main, n_mid, kernel_size, '2b', padding='same')
    main = Activation('relu')(main)

    main = conv_bn(main, n_expand, (1, 1), '2c')

    # Shortcut: 1x1 convolution with the same strides and the final filter count.
    shortcut = conv_bn(input_tensor, n_expand, (1, 1), '1', strides=strides)

    merged = layers.add([main, shortcut])
    return Activation('relu')(merged)
def ResNet50(inputs):
    """Build the shared ResNet50 trunk (stem plus stages 2-4) on *inputs*.

    Shape walk-through from the original comments, for a 600x600x3 image:
    600,600,3 -> 300,300,64 (stem conv) -> 150,150,64 (max pool)
    -> 150,150,256 (stage 2) -> 75,75,512 (stage 3) -> 38,38,1024 (stage 4).

    Returns:
        The stage-4 output, used as the shared feature layer.
    """
    # Stem: 7x7 stride-2 convolution, frozen batch norm, ReLU, 3x3 stride-2 max pool.
    x = ZeroPadding2D((3, 3))(inputs)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
    x = BatchNormalization(trainable=False, name='bn_conv1')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)

    # (stage, filters, identity-block labels, strides of the leading conv_block)
    stage_specs = (
        (2, [64, 64, 256], 'bc', (1, 1)),
        (3, [128, 128, 512], 'bcd', (2, 2)),
        (4, [256, 256, 1024], 'bcdef', (2, 2)),
    )
    for stage, filters, identity_labels, strides in stage_specs:
        x = conv_block(x, 3, filters, stage=stage, block='a', strides=strides)
        for label in identity_labels:
            x = identity_block(x, 3, filters, stage=stage, block=label)

    return x
def identity_block_td(input_tensor, kernel_size, filters, stage, block):
    """Identity bottleneck block with every conv/BN wrapped in ``TimeDistributed``.

    Args:
        input_tensor: tensor entering the block; it is also added back at the end.
        kernel_size: side length of the square middle kernel.
        filters: three filter counts, for the 1x1, middle and final 1x1 convolutions.
        stage: stage number, used only to build layer names.
        block: block label, used only to build layer names.

    Returns:
        The ReLU-activated sum of the main branch and ``input_tensor``.
    """
    n_reduce, n_mid, n_expand = filters
    branch = str(stage) + block + '_branch'
    conv_prefix = 'res' + branch
    bn_prefix = 'bn' + branch

    def td_conv_bn(tensor, conv_layer, tag):
        # Apply the given convolution and a frozen batch norm, both time-distributed.
        tensor = TimeDistributed(conv_layer, name=conv_prefix + tag)(tensor)
        return TimeDistributed(BatchNormalization(trainable=False), name=bn_prefix + tag)(tensor)

    out = td_conv_bn(input_tensor,
                     Conv2D(n_reduce, (1, 1), kernel_initializer='normal'), '2a')
    out = Activation('relu')(out)

    out = td_conv_bn(out,
                     Conv2D(n_mid, (kernel_size, kernel_size), kernel_initializer='normal', padding='same'),
                     '2b')
    out = Activation('relu')(out)

    out = td_conv_bn(out,
                     Conv2D(n_expand, (1, 1), kernel_initializer='normal'), '2c')

    out = Add()([out, input_tensor])
    return Activation('relu')(out)
def conv_block_td(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """Projection-shortcut bottleneck block with conv/BN wrapped in ``TimeDistributed``.

    Args:
        input_tensor: tensor entering the block.
        kernel_size: side length of the square middle kernel.
        filters: three filter counts, for the 1x1, middle and final 1x1 convolutions.
        stage: stage number, used only to build layer names.
        block: block label, used only to build layer names.
        strides: strides of the first main-branch convolution and of the shortcut convolution.

    Returns:
        The ReLU-activated sum of the main branch and the projected shortcut.
    """
    n_reduce, n_mid, n_expand = filters
    branch = str(stage) + block + '_branch'
    conv_prefix = 'res' + branch
    bn_prefix = 'bn' + branch

    def td_conv_bn(tensor, conv_layer, tag):
        # Apply the given convolution and a frozen batch norm, both time-distributed.
        tensor = TimeDistributed(conv_layer, name=conv_prefix + tag)(tensor)
        return TimeDistributed(BatchNormalization(trainable=False), name=bn_prefix + tag)(tensor)

    main = td_conv_bn(input_tensor,
                      Conv2D(n_reduce, (1, 1), strides=strides, kernel_initializer='normal'), '2a')
    main = Activation('relu')(main)

    main = td_conv_bn(main,
                      Conv2D(n_mid, (kernel_size, kernel_size), padding='same', kernel_initializer='normal'),
                      '2b')
    main = Activation('relu')(main)

    main = td_conv_bn(main,
                      Conv2D(n_expand, (1, 1), kernel_initializer='normal'), '2c')

    shortcut = td_conv_bn(input_tensor,
                          Conv2D(n_expand, (1, 1), strides=strides, kernel_initializer='normal'), '1')

    merged = Add()([main, shortcut])
    return Activation('relu')(merged)
def classifier_layers(x):
    """Apply ResNet stage 5 and a 7x7 average pool to every ROI feature patch.

    Shapes from the original comments:
    num_rois,14,14,1024 -> num_rois,7,7,2048 (block a, stride 2)
    -> num_rois,7,7,2048 (blocks b and c) -> num_rois,1,1,2048 (average pool).
    """
    stage5_filters = [512, 512, 2048]

    x = conv_block_td(x, 3, stage5_filters, stage=5, block='a', strides=(2, 2))
    for label in ('b', 'c'):
        x = identity_block_td(x, 3, stage5_filters, stage=5, block=label)

    return TimeDistributed(AveragePooling2D((7, 7)), name='avg_pool')(x)
from tensorflow.keras.layers import (Conv2D, Dense, Flatten, Input, Reshape,
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import RandomNormal
from resnet import ResNet50, classifier_layers
from RoiPoolingConv import RoiPoolingConv
# 创建建议框网络
# 该网络结果会对先验框进行调整获得建议框
def get_rpn(base_layers, num_anchors):
    """Build the region proposal head on top of the shared feature layer.

    Args:
        base_layers: shared feature tensor produced by the backbone.
        num_anchors: number of anchors per feature-map location.

    Returns:
        ``[classification, regression]``: the objectness scores reshaped to
        ``(-1, 1)`` and the box offsets reshaped to ``(-1, 4)`` per sample.
    """
    def init():
        return RandomNormal(stddev=0.02)

    # A 512-channel 3x3 convolution integrates the features.
    shared = Conv2D(512, (3, 3), padding='same', activation='relu',
                    kernel_initializer=init(), name='rpn_conv1')(base_layers)

    # 1x1 convolutions adjust the channel count to produce the predictions.
    scores = Conv2D(num_anchors, (1, 1), activation='sigmoid',
                    kernel_initializer=init(), name='rpn_out_class')(shared)
    offsets = Conv2D(num_anchors * 4, (1, 1), activation='linear',
                     kernel_initializer=init(), name='rpn_out_regress')(shared)

    scores = Reshape((-1, 1), name="classification")(scores)
    offsets = Reshape((-1, 4), name="regression")(offsets)
    return [scores, offsets]
# 将共享特征层和建议框传入classifier网络
# 该网络结果会对建议框进行调整获得预测框
def get_classifier(base_layers, input_rois, nb_classes=21, pooling_regions = 14):
    """Build the detection head that refines proposals into final predictions.

    Args:
        base_layers: shared feature tensor.
        input_rois: proposal boxes handed to the ROI pooling layer.
        nb_classes: number of softmax outputs of the class head; the
            regression head emits ``4 * (nb_classes - 1)`` values.
        pooling_regions: side length of the pooled patch per ROI.

    Returns:
        ``[out_class, out_regr]`` with one row of class probabilities and one
        row of box offsets per ROI.
    """
    # Pool every ROI to a pooling_regions x pooling_regions patch.
    pooled = RoiPoolingConv(pooling_regions)([base_layers, input_rois])

    # Per the comments in classifier_layers: num_rois,14,14,1024 -> num_rois,1,1,2048.
    features = classifier_layers(pooled)

    # Flatten every ROI's feature map into a vector.
    features = TimeDistributed(Flatten())(features)

    class_head = Dense(nb_classes, activation='softmax',
                       kernel_initializer=RandomNormal(stddev=0.02))
    out_class = TimeDistributed(class_head, name='dense_class_{}'.format(nb_classes))(features)

    regr_head = Dense(4 * (nb_classes-1), activation='linear',
                      kernel_initializer=RandomNormal(stddev=0.02))
    out_regr = TimeDistributed(regr_head, name='dense_regress_{}'.format(nb_classes))(features)

    return [out_class, out_regr]
def get_model(config, num_classes):
    """Build the two training models, which share one backbone.

    Args:
        config: object providing ``anchor_box_scales``, ``anchor_box_ratios``
            and ``pooling_regions``.
        num_classes: class count forwarded to the detection head.

    Returns:
        ``(model_rpn, model_all)``: the RPN-only model and the model that
        outputs the RPN predictions followed by the detection-head predictions.
    """
    image_in = Input(shape=(None, None, 3))
    rois_in = Input(shape=(None, 4))

    # Shared feature layer (38,38,1024 for a 600,600,3 image per the original comments).
    shared_features = ResNet50(image_in)

    # One anchor per (scale, ratio) combination at every feature location.
    anchors_per_location = len(config.anchor_box_scales) * len(config.anchor_box_ratios)

    # Proposal network: adjusts the anchors into proposals.
    rpn_outputs = get_rpn(shared_features, anchors_per_location)
    model_rpn = Model(image_in, rpn_outputs)

    # Detection head: adjusts the proposals into final boxes.
    detector_outputs = get_classifier(shared_features, rois_in, num_classes, config.pooling_regions)
    model_all = Model([image_in, rois_in], rpn_outputs + detector_outputs)

    return model_rpn, model_all
def get_predict_model(config, num_classes):
    """Build the two inference models.

    Unlike ``get_model``, the RPN model also returns the shared feature map,
    and the detection head reads that feature map from its own input rather
    than from the backbone.

    Args:
        config: object providing ``anchor_box_scales``, ``anchor_box_ratios``
            and ``pooling_regions``.
        num_classes: class count forwarded to the detection head.

    Returns:
        ``(model_rpn, model_classifier_only)``.
    """
    inputs = Input(shape=(None, None, 3))
    roi_input = Input(shape=(None, 4))
    feature_map_input = Input(shape=(None, None, 1024))

    # Shared feature layer (38,38,1024 for a 600,600,3 image per the original comments).
    base_layers = ResNet50(inputs)

    # One anchor per (scale, ratio) combination at every feature location.
    num_anchors = len(config.anchor_box_scales) * len(config.anchor_box_ratios)

    # Proposal network; the feature map is exposed as a third output.
    rpn = get_rpn(base_layers, num_anchors)
    model_rpn = Model(inputs, rpn + [base_layers])

    # Detection head fed from an explicit feature-map input.
    classifier = get_classifier(feature_map_input, roi_input, num_classes, config.pooling_regions)
    model_classifier_only = Model([feature_map_input, roi_input], classifier)

    # The pasted source had stray article text fused onto this line, which
    # made it a syntax error.
    return model_rpn, model_classifier_only
# 训练文件
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tqdm import tqdm
import os
from frcnn import get_model
from frcnn_training3 import (Generator, LossHistory, class_loss_cls,
class_loss_regr, cls_loss,
get_img_output_length, smooth_l1)
from anchors import get_anchors
from config import Config
from roi_helpers import calc_iou
from utils import BBoxUtility
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)
def write_log(callback, names, logs, batch_no):
    """Write one scalar summary per ``(name, value)`` pair.

    Args:
        callback: summary writer exposing ``as_default()`` and ``flush()``.
        names: scalar tag names.
        logs: values, paired with *names* positionally.
        batch_no: step index recorded with every scalar.

    NOTE(review): the loop body was missing from the pasted source. The
    ``tf.summary.scalar`` call is a reconstruction — confirm against upstream.
    """
    with callback.as_default():
        for name, value in zip(names, logs):
            tf.summary.scalar(name, value, step=batch_no)
        callback.flush()
def _build_detector_batch(model_rpn, batch):
    """Turn one generator batch into inputs and targets for ``model_all``.

    Runs the RPN on the images, decodes its output into proposals and matches
    the proposals against the ground-truth boxes with ``calc_iou``.
    Reads the module-level ``bbox_util``, ``config`` and ``NUM_CLASSES``.
    """
    X, Y, boxes = batch[0], batch[1], batch[2]
    P_rpn = model_rpn.predict_on_batch(X)

    height, width, _ = np.shape(X[0])
    base_feature_width, base_feature_height = get_img_output_length(width, height)
    anchors = get_anchors([base_feature_width, base_feature_height], width, height)
    results = bbox_util.detection_out_rpn(P_rpn, anchors)

    roi_inputs = []
    out_classes = []
    out_regrs = []
    for i in range(len(X)):
        # Drop the first column of every decoded row; the rest is passed on as the boxes.
        R = results[i][:, 1:]
        X2, Y1, Y2 = calc_iou(R, config, boxes[i], NUM_CLASSES)
        roi_inputs.append(X2)
        out_classes.append(Y1)
        out_regrs.append(Y2)

    model_inputs = [X, np.array(roi_inputs)]
    model_targets = [Y[0], Y[1], np.array(out_classes), np.array(out_regrs)]
    return model_inputs, model_targets


def fit_one_epoch(model_rpn,model_all,epoch,epoch_size,epoch_size_val,gen,genval,Epoch,callback):
    """Train for one epoch, validate, log the losses and periodically save weights.

    Args:
        model_rpn: RPN-only model, used to generate proposals for each batch.
        model_all: full model trained and evaluated on every batch.
        epoch: zero-based index of the current epoch.
        epoch_size: number of training batches to consume.
        epoch_size_val: number of validation batches to consume.
        gen: training batch generator.
        genval: validation batch generator.
        Epoch: final epoch number, used for progress display only.
        callback: summary writer handed to ``write_log``.

    Reads the module-level ``loss_history`` (plus the globals used by
    ``_build_detector_batch``).

    NOTE(review): the pasted source had lost many lines (``break``, the
    list appends, ``pbar.set_postfix``/``pbar.update``, the body of the
    final ``if``). They are reconstructed here — confirm against upstream,
    in particular the checkpoint file name.
    """
    total_loss = 0
    rpn_loc_loss = 0
    rpn_cls_loss = 0
    roi_loc_loss = 0
    roi_cls_loss = 0
    val_toal_loss = 0

    train_steps = 0
    with tqdm(total=epoch_size,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen):
            # The generator is endless; stop after the requested batch count.
            if iteration >= epoch_size:
                break
            model_inputs, model_targets = _build_detector_batch(model_rpn, batch)
            loss_class = model_all.train_on_batch(model_inputs, model_targets)

            write_log(callback, ['total_loss','rpn_cls_loss', 'rpn_reg_loss', 'detection_cls_loss', 'detection_reg_loss'], loss_class, iteration)

            rpn_cls_loss += loss_class[1]
            rpn_loc_loss += loss_class[2]
            roi_cls_loss += loss_class[3]
            roi_loc_loss += loss_class[4]
            total_loss = rpn_loc_loss + rpn_cls_loss + roi_loc_loss + roi_cls_loss
            train_steps += 1

            pbar.set_postfix(**{
                'total'   : total_loss / (iteration + 1),
                'rpn_cls' : rpn_cls_loss / (iteration + 1),
                'rpn_loc' : rpn_loc_loss / (iteration + 1),
                'roi_cls' : roi_cls_loss / (iteration + 1),
                'roi_loc' : roi_loc_loss / (iteration + 1),
                'lr'      : K.get_value(model_rpn.optimizer.learning_rate),
            })
            pbar.update(1)

    print('Start Validation')
    val_steps = 0
    with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3) as pbar:
        for iteration, batch in enumerate(genval):
            if iteration >= epoch_size_val:
                break
            model_inputs, model_targets = _build_detector_batch(model_rpn, batch)
            loss_class = model_all.test_on_batch(model_inputs, model_targets)

            val_toal_loss += loss_class[0]
            val_steps += 1
            pbar.set_postfix(**{'total' : val_toal_loss / (iteration + 1)})
            pbar.update(1)

    # Average over the batches actually processed. The original divided by
    # (size + 1), which under-reports the mean by one batch.
    mean_train_loss = total_loss / max(train_steps, 1)
    mean_val_loss = val_toal_loss / max(val_steps, 1)

    loss_history.append_loss(mean_train_loss, mean_val_loss)
    print('Finish Validation')
    print('Epoch:'+ str(epoch+1) + '/' + str(Epoch))
    print('Total Loss: %.4f || Val Loss: %.4f ' % (mean_train_loss, mean_val_loss))
    print('Saving state, iter:', str(epoch+1))
    # Weights are only written every 10th epoch.
    if (epoch + 1) % 10 == 0:
        model_all.save_weights('logs/Epoch%d-Total_Loss%.4f-Val_Loss%.4f.h5' % (epoch + 1, mean_train_loss, mean_val_loss))
# 检测精度mAP和pr曲线计算参考视频
def _run_training_phase(model_rpn, model_all, bbox_util, train_lines, val_lines, num_classes,
                        input_shape, callback, lr, batch_size, start_epoch, end_epoch,
                        lr_decay=0.92):
    """Compile both models and train them from *start_epoch* up to *end_epoch*.

    After every epoch the learning rate is multiplied by *lr_decay* and
    pushed into both optimizers.

    Raises:
        ValueError: if either split is smaller than one batch.

    NOTE(review): the ``compile(`` calls and the ``K.set_value`` targets were
    missing from the pasted source and are reconstructed — confirm against
    upstream.
    """
    model_rpn.compile(
        loss={
            'classification': cls_loss(),
            'regression'    : smooth_l1(),
        },
        optimizer=keras.optimizers.Adam(learning_rate=lr),
    )
    model_all.compile(
        loss={
            'classification'                       : cls_loss(),
            'regression'                           : smooth_l1(),
            'dense_class_{}'.format(num_classes)   : class_loss_cls,
            'dense_regress_{}'.format(num_classes) : class_loss_regr(num_classes-1),
        },
        optimizer=keras.optimizers.Adam(learning_rate=lr),
    )

    image_size = [input_shape[0], input_shape[1]]
    gen = Generator(bbox_util, train_lines, num_classes, batch_size, input_shape=image_size).generate()
    gen_val = Generator(bbox_util, val_lines, num_classes, batch_size, input_shape=image_size).generate()

    epoch_size = len(train_lines) // batch_size
    epoch_size_val = len(val_lines) // batch_size
    if epoch_size == 0 or epoch_size_val == 0:
        raise ValueError("数据集过小,无法进行训练,请扩充数据集。")

    for epoch in range(start_epoch, end_epoch):
        fit_one_epoch(model_rpn, model_all, epoch, epoch_size, epoch_size_val, gen, gen_val, end_epoch, callback)
        lr = lr * lr_decay
        K.set_value(model_rpn.optimizer.learning_rate, lr)
        K.set_value(model_all.optimizer.learning_rate, lr)


if __name__ == "__main__":
    config = Config()

    # NUM_CLASSES must be changed before training: number of object classes + 1.
    # NOTE(review): the assignment was missing from the pasted source. 5 matches
    # the four classes listed in the annotation script plus one — confirm.
    NUM_CLASSES = 5

    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    # input_shape is the input image size. Larger inputs use more GPU memory.
    # The author reports 800,800,3 working better than 600,600,3 in repeated runs.
    input_shape = [800, 800, 3]

    model_rpn, model_all = get_model(config, NUM_CLASSES)

    # Pretrained weights are not loaded here. To use them, call
    # load_weights(path, by_name=True) on both models; a dimension-mismatch
    # notice is expected when training on a custom dataset.

    bbox_util = BBoxUtility(overlap_threshold=config.rpn_max_overlap,ignore_threshold=config.rpn_min_overlap,top_k=config.num_RPN_train_pre)

    # Training setup.
    callback = tf.summary.create_file_writer("logs")
    loss_history = LossHistory("logs/")

    annotation_path = '../input/2007-train/2007_train.txt'

    # The validation split is made here: the last 10% of the annotation
    # lines are used for validation (val:train = 1:9).
    val_split = 0.1
    with open(annotation_path) as f:
        lines = f.readlines()
    num_val = int(len(lines)*val_split)
    num_train = len(lines) - num_val

    # Init_Epoch is the starting epoch, Interval_Epoch the boundary between
    # the two phases, Epoch the final epoch. Reduce Batch_size on OOM.
    Init_Epoch = 0
    Interval_Epoch = 100
    Epoch = 200
    Batch_size = 5

    # Phase 1: learning rate starts at 1e-4.
    _run_training_phase(model_rpn, model_all, bbox_util, lines[:num_train], lines[num_train:],
                        NUM_CLASSES, input_shape, callback,
                        lr=1e-4, batch_size=Batch_size,
                        start_epoch=Init_Epoch, end_epoch=Interval_Epoch)

    # Phase 2: both models are recompiled and the learning rate restarts at 1e-5.
    _run_training_phase(model_rpn, model_all, bbox_util, lines[:num_train], lines[num_train:],
                        NUM_CLASSES, input_shape, callback,
                        lr=1e-5, batch_size=Batch_size,
                        start_epoch=Interval_Epoch, end_epoch=Epoch)