yolov5–v5.0版本(最新)代码解析导航
github ultralytics/yolov5
使用的yolov5为2021年6月23号的版本v5.0
此篇作为学习笔记,也花了比较大的功夫,尽可能对每一个要点进行了解释,不仅仅包括detect.py本身,还包含了模型的加载,推理的前向传播,NMS细节的展开。
如有一些问题或错误,欢迎大家一起交流。
# Script entry point: parse the command-line options, then hand them to main().
if __name__ == "__main__":
    opt = parse_opt()
    main(opt)
相关参数解释
def parse_opt():
    """Build the detect.py command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with one attribute per option registered below
        (dashes in option names become underscores).
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument  # short alias; every option is registered through it

    # --- model and input ---
    add('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)')  # weights to load
    add('--source', type=str, default='data/images', help='file/dir/URL/glob, 0 for webcam')  # data to run on
    add('--imgsz', '--img', '--img-size', type=int, default=640, help='inference size (pixels)')  # network input size

    # --- detection thresholds ---
    add('--conf-thres', type=float, default=0.25, help='confidence threshold')
    add('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    add('--max-det', type=int, default=1000, help='maximum detections per image')

    # --- device and output switches ---
    add('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    add('--view-img', action='store_true', help='show results')  # display annotated frames
    add('--save-txt', action='store_true', help='save results to *.txt')
    add('--save-conf', action='store_true', help='save confidences in --save-txt labels')  # append conf to each txt row
    add('--save-crop', action='store_true', help='save cropped prediction boxes')
    add('--nosave', action='store_true', help='do not save images/videos')

    # --- NMS / inference behaviour ---
    add('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')  # keep only these ids
    add('--agnostic-nms', action='store_true', help='class-agnostic NMS')  # off: NMS runs per class
    add('--augment', action='store_true', help='augmented inference')
    add('--update', action='store_true', help='update all models')  # run() calls strip_optimizer(weights) when set

    # --- where results are written ---
    add('--project', default='runs/detect', help='save results to project/name')
    add('--name', default='exp', help='save results to project/name')
    add('--exist-ok', action='store_true', help='existing project/name ok, do not increment')  # reuse the folder

    # --- drawing options ---
    add('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    add('--hide-labels', default=False, action='store_true', help='hide labels')
    add('--hide-conf', default=False, action='store_true', help='hide confidences')

    # --- precision ---
    add('--half', action='store_true', help='use FP16 half-precision inference')

    return parser.parse_args()
def main(opt):
    """Print the parsed options, check dependencies and start detection.

    Args:
        opt: namespace returned by parse_opt(); its attributes are forwarded
            to run() as keyword arguments.
    """
    options = vars(opt)
    # Echo every option as "key=value" so the run configuration is visible in the log.
    print(colorstr('detect: ') + ', '.join(f'{k}={v}' for k, v in options.items()))
    # Verify required packages (tensorboard and thop are excluded from the check).
    check_requirements(exclude=('tensorboard', 'thop'))
    run(**options)
这一部分主要设置参数的读入,目录创建,日志创建,模型加载,设备选择,读取数据集等预备工作。
@torch.no_grad()
def run(weights='yolov5s.pt',  # model.pt path(s)
        source='data/images',  # file/dir/URL/glob, 0 for webcam
        imgsz=640,  # inference size (pixels)
        conf_thres=0.25,  # confidence threshold
        iou_thres=0.45,  # NMS IOU threshold
        max_det=1000,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,  # show results
        save_txt=False,  # save results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_crop=False,  # save cropped prediction boxes
        nosave=False,  # do not save images/videos
        classes=None,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        augment=False,  # augmented inference
        update=False,  # update all models
        project='runs/detect',  # save results to project/name
        name='exp',  # save results to project/name
        exist_ok=False,  # existing project/name ok, do not increment
        line_thickness=3,  # bounding box thickness (pixels)
        hide_labels=False,  # hide labels
        hide_conf=False,  # hide confidences
        half=False,  # use FP16 half-precision inference
        ):
    """Run detection on images, videos or streams.

    This first part of the function prepares everything the inference loop
    needs: output directory, logging, device, model (optionally FP16),
    optional second-stage classifier, and the dataset iterator.
    """
    # save_img: whether annotated images/videos are written to disk
    save_img = not nosave and not source.endswith('.txt')  # save inference images
    # A numeric source, a .txt list or a network URL is treated as a stream
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://', 'https://'))

    # Directories: create the folder for this run
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Initialize
    set_logging()
    device = select_device(device)
    half &= device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(model.stride.max())  # largest model stride
    # check image size (presumably adjusts imgsz to a multiple of the stride - see check_img_size)
    imgsz = check_img_size(imgsz, s=stride)
    names = model.module.names if hasattr(model, 'module') else model.names  # get class names
    if half:
        model.half()  # to FP16

    # Second-stage classifier (detect boxes first, then classify them); disabled by default
    classify = False
    if classify:
        modelc = load_classifier(name='resnet50', n=2)  # initialize
        # load_state_dict() returns a report of missing/unexpected keys, not the module,
        # so the device move and eval() must be called on modelc itself.
        modelc.load_state_dict(torch.load('resnet50.pt', map_location=device)['model'])
        modelc.to(device).eval()

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        # read from video stream(s)
        dataset = LoadStreams(source, img_size=imgsz, stride=stride)
    else:
        # read from image/video files
        dataset = LoadImages(source, img_size=imgsz, stride=stride)
上面代码中 model = attempt_load(weights, map_location=device) 与 dataset = LoadImages(source, img_size=imgsz, stride=stride) 用到的 attempt_load、LoadImages 函数将在下文另外作解释。
其实这里主要是对多个模型进行集成和读取,如果是单个模型可以直接忽略;这一部分主要是为了在使用多个模型时进行一些操作,保证兼容。比如stride,要取步长最大的才行。
def attempt_load(weights, map_location=None, inplace=True):
    """Load a single model or an ensemble of models from checkpoint files.

    Args:
        weights: one weights path, or a list of paths (weights=[a,b,c], [a] or a).
        map_location: forwarded to torch.load().
        inplace: value written to the ``inplace`` attribute of the activation,
            Detect and Model modules for compatibility.

    Returns:
        The model itself when exactly one checkpoint was loaded; otherwise the
        Ensemble, with ``names`` copied from its last member and ``stride``
        taken from the member whose maximum stride is largest.
    """
    from models.yolo import Detect, Model

    weight_list = weights if isinstance(weights, list) else [weights]

    model = Ensemble()
    for w in weight_list:
        # NOTE: torch.load unpickles the file - only load checkpoints from trusted sources.
        ckpt = torch.load(attempt_download(w), map_location=map_location)  # load
        key = 'ema' if ckpt.get('ema') else 'model'  # prefer the EMA weights when present
        model.append(ckpt[key].float().fuse().eval())  # FP32 model

    # Compatibility updates
    inplace_types = [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6, nn.SiLU, Detect, Model]
    for m in model.modules():
        if type(m) in inplace_types:
            m.inplace = inplace  # pytorch 1.7.0 compatibility
        elif type(m) is Conv:
            m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility

    if len(model) == 1:
        return model[-1]  # return model

    print(f'Ensemble created with {weights}\n')
    for k in ['names']:
        setattr(model, k, getattr(model[-1], k))
    max_strides = torch.tensor([m.stride.max() for m in model])
    model.stride = model[torch.argmax(max_strides).int()].stride  # max stride
    return model  # return ensemble
class Ensemble
这是一个存放集成模型的类,会将多个模型预测结果的框cat在一起,最后我们作NMS操作就行。如果传参中 --weights 输入是列表,存着多个模型权重,那么使用模型集成,否则使用单个模型。
class Ensemble(nn.ModuleList):
    """Ensemble of models: runs every member on the same input and concatenates the results."""

    def __init__(self):
        super().__init__()

    def forward(self, x, augment=False):
        """Run all member models and merge their inference outputs.

        Each member is called as ``member(x, augment)`` and only element 0 of
        its result is kept. The kept tensors are concatenated along dim 1, so
        a later NMS pass can pick among the boxes of all models
        ("nms ensemble"; max- or mean-ensembling would stack the outputs instead).

        Returns:
            (concatenated inference output, None) - the second slot mirrors the
            train-output position of a single model.
        """
        per_model = [member(x, augment)[0] for member in self]
        merged = torch.cat(per_model, 1)  # nms ensemble
        return merged, None  # inference, train output
可以看到 Ensemble 继承了 nn.ModuleList 的方法,如定义在 nn.ModuleList 中的 append。于是 attempt_load() 中通过 model.append(...) 加入的这些模型,会在 Ensemble 的 forward 中通过 for module in self: 逐个取出来。
每个模型的输出形状为 $(batch, n_i, 5+nc)$,其中 $n_i$ 代表第 $i$ 个模型输出的 boxes 数量,5 代表 xywh 和置信度 c,nc 为类别数。在第 1 维度(dim=1)上 cat,得到 y 的形状为 $(batch, \sum_{i=1}^{m} n_i, 5+nc)$,其中 $m$ 为模型的个数。
LoadImages 定义了一个类(object),当作迭代器使用:__iter__ 将迭代次数 self.count 置为 0 并返回迭代器自身;每次调用 __next__ 返回下一张图片(或视频的下一帧)对应的数据,self.count 用来记录当前迭代到第几个文件。
class LoadImages:  # for inference
    """Iterator over the image and video files found at a path.

    Images are yielded one per file; videos are yielded frame by frame.
    Each item is ``(path, img, img0, cap)``: the file path, the letterboxed
    channel-first RGB array fed to the model, the original BGR frame, and the
    current cv2.VideoCapture (None when no video was found).
    """

    def __init__(self, path, img_size=640, stride=32):
        """Collect the supported files under ``path``.

        Args:
            path: a glob pattern (contains '*'), a directory, or a single file.
            img_size: target size passed to letterbox().
            stride: stride passed to letterbox().

        Raises:
            Exception: if ``path`` is not a glob, directory or existing file.
            AssertionError: if no file with a supported extension is found.
        """
        p = str(Path(path).absolute())  # os-agnostic absolute path
        # Gather candidate file paths
        if '*' in p:
            files = sorted(glob.glob(p, recursive=True))  # glob
        elif os.path.isdir(p):
            files = sorted(glob.glob(os.path.join(p, '*.*')))  # dir
        elif os.path.isfile(p):
            files = [p]  # files
        else:
            raise Exception(f'ERROR: {p} does not exist')

        # Keep only known image / video extensions (see img_formats and vid_formats)
        images = [x for x in files if x.split('.')[-1].lower() in img_formats]
        videos = [x for x in files if x.split('.')[-1].lower() in vid_formats]
        ni, nv = len(images), len(videos)

        self.img_size = img_size
        self.stride = stride
        self.files = images + videos  # images first, then videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv  # True where self.files[i] is a video
        self.mode = 'image'
        if any(videos):
            self.new_video(videos[0])  # new video
        else:
            self.cap = None
        assert self.nf > 0, f'No images or videos found in {p}. ' \
                            f'Supported formats are:\nimages: {img_formats}\nvideos: {vid_formats}'

    def __iter__(self):
        """Reset the file counter and return the iterator itself."""
        self.count = 0
        return self

    def __next__(self):
        """Return the next image or video frame as ``(path, img, img0, cap)``.

        Raises:
            StopIteration: once every file has been consumed.
        """
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:
            # Read video
            self.mode = 'video'
            ret_val, img0 = self.cap.read()  # ret_val is False when no frame could be read
            # A failed read means the current video is exhausted. Keep advancing
            # until a frame is obtained, so that an empty or unreadable next
            # video is skipped instead of passing img0=None to letterbox().
            while not ret_val:
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]
                self.new_video(path)
                ret_val, img0 = self.cap.read()

            self.frame += 1
            print(f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: ', end='')

        else:
            # Read image
            self.count += 1
            img0 = cv2.imread(path)  # BGR
            assert img0 is not None, 'Image Not Found ' + path
            print(f'image {self.count}/{self.nf} {path}: ', end='')

        # Padded resize to a shape compatible with the model stride
        img = letterbox(img0, self.img_size, stride=self.stride)[0]

        # Convert
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW
        img = np.ascontiguousarray(img)  # contiguous memory

        return path, img, img0, self.cap

    def new_video(self, path):
        """Open ``path`` with OpenCV and reset the frame counters."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __len__(self):
        """Return the number of files (not frames)."""
        return self.nf  # number of files
另外__next__
中还有letterbox()
的使用,主要用处就给图片加边,变成下采样倍率(默认为32)的整数倍,
比如大小为(90,128)的图片会加边成(96,128)大小的图片
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize and pad an image while meeting stride-multiple constraints.

    Args:
        img: image array whose first two dimensions are (height, width).
        new_shape: target (height, width), or a single int used for both.
        color: border colour passed to cv2.copyMakeBorder.
        auto: pad only up to the next multiple of ``stride`` (minimum rectangle).
        scaleFill: (only when ``auto`` is False) stretch to ``new_shape`` without padding.
        scaleup: when False the image is only ever scaled down, never up.
        stride: stride used by the ``auto`` padding.

    Returns:
        (padded image, (width ratio, height ratio), (dw, dh)) where dw/dh are
        the per-side paddings and may be fractional.
    """
    src_h, src_w = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Scale ratio (new / old). Taking the min of both ratios guarantees that the
    # scaled image fits inside new_shape, so only padding is needed afterwards
    # (with max, one side would exceed the target and could not be padded).
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        scale = min(scale, 1.0)

    # Compute padding
    ratio = scale, scale  # width, height ratios
    unpad_w, unpad_h = int(round(src_w * scale)), int(round(src_h * scale))  # size after scaling, before padding
    pad_w, pad_h = dst_w - unpad_w, dst_h - unpad_h  # total wh padding
    if auto:  # minimum rectangle
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)  # wh padding
    elif scaleFill:  # stretch: no padding, the resize distorts the aspect ratio
        pad_w, pad_h = 0.0, 0.0
        unpad_w, unpad_h = dst_w, dst_h
        ratio = dst_w / src_w, dst_h / src_h  # width, height ratios

    # Split the padding between the two sides
    pad_w /= 2
    pad_h /= 2

    # The steps above only computed scale and padding; apply the resize now
    if (src_w, src_h) != (unpad_w, unpad_h):
        img = cv2.resize(img, (unpad_w, unpad_h), interpolation=cv2.INTER_LINEAR)

    # A half-pixel padding cannot be applied to both sides: the -0.1/+0.1 bias
    # makes e.g. 0.5 round to 0 on one side and 1 on the other.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (pad_w, pad_h)
这一部分主要是读取图片,模型输出pred结果,经过NMS,画box和保存结果的过程。
而NMS主要在non_max_suppression()
中
# Run inference (second half of run(): uses model, dataset, device, names, save_dir,
# classify, vid_path and vid_writer prepared by the first half)
if device.type != 'cpu':
    # Warm-up: one forward pass with a dummy input that has the same dtype as the
    # model parameters; this also checks that the model runs at all.
    model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
t0 = time.time()
# LoadImages yields one image at a time, so img arrives as (C,H,W); it would only
# arrive as (N,C,H,W) if the loader produced batches.
for path, img, im0s, vid_cap in dataset:
    # Move to device, convert dtype and normalise
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)  # (C,H,W) -> (1,C,H,W)

    # Inference
    t1 = time_synchronized()
    # Index 0 is the inference output, index 1 the train output.
    # Example: for img (1,3,640,480) and strides [8,16,32] with 3 anchors per cell,
    # (640/32*480/32 + 640/16*480/16 + 640/8*480/8) * 3 = 18900 boxes, so pred is
    # torch.Size([1, 18900, 85]). Box xywh are in the coordinates of the padded
    # image that was fed to the model.
    pred = model(img, augment=augment)[0]

    # Apply NMS -> list with one (n,6) tensor per image
    pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
    t2 = time_synchronized()

    # Apply Classifier (second-stage classification of the detected boxes)
    if classify:
        pred = apply_classifier(pred, modelc, img, im0s)

    # Process detections
    for i, det in enumerate(pred):  # detections per image
        if webcam:  # batch_size >= 1
            p, s, im0, frame = path[i], f'{i}: ', im0s[i].copy(), dataset.count
        else:
            p, s, im0, frame = path, '', im0s.copy(), getattr(dataset, 'frame', 0)

        # Output locations for the annotated image and the label file
        p = Path(p)  # to Path
        save_path = str(save_dir / p.name)  # img.jpg
        txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # img.txt
        s += '%gx%g ' % img.shape[2:]  # print string
        gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        imc = im0.copy() if save_crop else im0  # for save_crop
        if len(det):
            # Rescale boxes from img_size to im0 size: the predictions are in
            # padded-image coordinates and must be mapped back to the original image.
            # det is (N,6): x1, y1, x2, y2, conf, cls
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
                s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

            # Write results
            for *xyxy, conf, cls in reversed(det):
                if save_txt:  # Write to file
                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                    line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                    with open(txt_path + '.txt', 'a') as f:
                        f.write(('%g ' * len(line)).rstrip() % line + '\n')

                if save_img or save_crop or view_img:  # Add bbox to image
                    c = int(cls)  # integer class
                    label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                    plot_one_box(xyxy, im0, label=label, color=colors(c, True), line_thickness=line_thickness)
                    if save_crop:
                        save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)

        # Print time (inference + NMS)
        print(f'{s}Done. ({t2 - t1:.3f}s)')

        # Stream results
        if view_img:
            cv2.imshow(str(p), im0)
            cv2.waitKey(1)  # 1 millisecond

        # Save results (image with detections): images as images, videos as videos
        if save_img:
            if dataset.mode == 'image':
                cv2.imwrite(save_path, im0)
            else:  # 'video' or 'stream'
                if vid_path != save_path:  # new video
                    vid_path = save_path
                    if isinstance(vid_writer, cv2.VideoWriter):
                        vid_writer.release()  # release previous video writer
                    if vid_cap:  # video
                        fps = vid_cap.get(cv2.CAP_PROP_FPS)
                        w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                        h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    else:  # stream
                        fps, w, h = 30, im0.shape[1], im0.shape[0]
                        save_path += '.mp4'
                    vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                vid_writer.write(im0)

# Release the last writer explicitly so the final output video is finalized
# before the results are reported.
if isinstance(vid_writer, cv2.VideoWriter):
    vid_writer.release()

if save_txt or save_img:
    s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
    print(f"Results saved to {save_dir}{s}")

if update:
    strip_optimizer(weights)  # update model (to fix SourceChangeWarning)

print(f'Done. ({time.time() - t0:.3f}s)')
class Detect(nn.Module):
    ...

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        ...

    def forward(self, x):
        """Apply the detection convolutions and, in eval mode, decode the boxes.

        Args:
            x: list with one feature map per detection layer; the list is
                updated in place with the reshaped raw outputs.

        Returns:
            Training mode: ``x``, each entry reshaped to (bs, na, ny, nx, no).
            Eval mode: ``(pred, x)`` where ``pred`` concatenates the decoded
            outputs of all layers along dim 1, shape (bs, total boxes, no).
        """
        # x = x.copy()  # for profiling
        decoded = []  # inference output
        for i in range(self.nl):
            feat = self.m[i](x[i])  # conv
            bs, _, ny, nx = feat.shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            feat = feat.view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x[i] = feat

            # self.training is True after model.train() and False after model.eval();
            # detection runs in eval mode, so the decoding below is executed.
            if self.training:
                continue

            # (Re)build the cell-index grid when the feature-map size changed
            if self.grid[i].shape[2:4] != feat.shape[2:4] or self.onnx_dynamic:
                self.grid[i] = self._make_grid(nx, ny).to(feat.device)

            y = feat.sigmoid()
            # Decoding to the scale of the image fed to the model:
            #   xy: sigmoid*2-0.5 lies in (-0.5, 1.5); adding the grid-cell index and
            #       multiplying by the layer stride gives the box centre in pixels.
            #   wh: (sigmoid*2)**2 lies in (0, 4) and is therefore always positive;
            #       multiplying by the anchor size gives the box size in pixels.
            if self.inplace:
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:
                xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                y = torch.cat((xy, wh, y[..., 4:]), -1)
            decoded.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(decoded, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        ...
所以,推理过程中,模型的输出(取索引0)形状为 $(batch, anchors, 5+nc)$,其中 anchors 为所有检测层预测框的总数,且xywh尺度为输入模型的图片的尺度。取索引1则为训练的输出.
训练过程中,模型每个检测层的输出形状为 $(batch, na, h, w, 5+nc)$,xywh为在特征图上的尺度.
明白了模型输出的形状和尺度,那么做后续的尺度、形状等变换的时候就不会再疑惑了。
非极大值抑制的核心部分
主要是做一些置信度筛选,先通过 $obj$ 置信度进行粗略的筛选,再通过 $obj \times cls$ 置信度进行细微的筛选(而不是直接用 $obj \times cls$ 筛选,是为了保证运行速度)
接着是对结果的框进行输出即可
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
                        labels=(), max_det=300):
    """Runs Non-Maximum Suppression (NMS) on inference results

    Args:
        prediction: tensor indexed as (image, box, 5 + nc); per box the columns
            are xywh, objectness confidence and nc class scores,
            e.g. torch.Size([1, 18900, 85]).
        conf_thres: confidence threshold in [0, 1], applied first to objectness
            and then to objectness * class score.
        iou_thres: IoU threshold in [0, 1] passed to torchvision.ops.nms.
        classes: optional iterable of class indices to keep.
        agnostic: if True all classes compete in one NMS; otherwise NMS is per class.
        multi_label: keep every (box, class) pair above conf_thres instead of only
            the best class per box (only effective when nc > 1).
        labels: optional per-image apriori labels with rows [cls, x, y, w, h] that
            are appended as confidence-1.0 candidates (autolabelling).
        max_det: maximum number of detections kept per image.

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """
    nc = prediction.shape[2] - 5  # number of classes
    # Coarse filter: candidates whose objectness exceeds the threshold (bool mask per box)
    xc = prediction[..., 4] > conf_thres  # candidates

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    max_wh = 4096  # (pixels) maximum box width and height, also used as the per-class offset
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    # One independent empty (0,6) tensor per image, so results never alias each other
    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(prediction.shape[0])]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Keep only the boxes that passed the objectness filter, e.g. (18900,85) -> (71,85)
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf: class scores become obj_conf * cls_conf, used by the second filter
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            # Second, finer filter on obj*cls confidence, e.g. (55,6) -> (49,6)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes: keep the max_nms most confident ones
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS
        # Unless agnostic, every box is shifted by class_index * max_wh so that boxes
        # of different classes can never overlap; a single nms() call then behaves
        # like an independent NMS per class. With agnostic=True the offset is 0 and
        # all boxes compete regardless of class.
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]

        # Merge-NMS (disabled unless merge is set to True above): each kept box is
        # replaced by the score-weighted mean of all boxes whose IoU with it exceeds
        # iou_thres. With redundant=True a kept box is additionally required to have
        # at least one other overlapping box.
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
merge NMS的过程:
比如框a为NMS保留下来的框,只有b,c框与a的iou大于iou_thres,a的置信度为0.95,b的为0.9,c的为0.8(这里的置信度即NMS使用的 $obj \times cls$),a、b、c左上角的x坐标分别为 $x_1$、$x_2$、$x_3$
如果不做merge,那么box左上角坐标就是 $x=x_1$
如果做merge,那么box左上角坐标就是
$$x=\frac{x_1\times 0.95+x_2\times 0.9+x_3\times 0.8}{0.95+0.9+0.8}$$