# 因为工作中经常要使用 YOLOv5,所以花了一些时间研究,收获颇丰,于是放上来:一是方便自己以后查阅,二是希望能帮到有需要的朋友。
# 我写的注释比较详细,主要面向刚学会 Python、对深度学习有所了解的新手;如有错误或不懂的地方可以联系我,大家相互交流学习。VX:Y1685637070
# 目前只注释了 detect.py 的代码,后续会慢慢更新,卷起来!
# YOLOv5 by Ultralytics, GPL-3.0 license
"""
Run inference on images, videos, directories, streams, etc.

Usage - sources:
    $ python path/to/detect.py --weights yolov5s.pt --source 0              # webcam
                                                             img.jpg        # image
                                                             vid.mp4        # video
                                                             path/          # directory
                                                             path/*.jpg     # glob
                                                             'https://youtu.be/Zgi9g1ksQHc'  # YouTube
                                                             'rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP stream

Usage - formats:
    $ python path/to/detect.py --weights yolov5s.pt                 # PyTorch
                                         yolov5s.torchscript        # TorchScript
                                         yolov5s.onnx               # ONNX Runtime or OpenCV DNN with --dnn
                                         yolov5s.xml                # OpenVINO
                                         yolov5s.engine             # TensorRT
                                         yolov5s.mlmodel            # CoreML (MacOS-only)
                                         yolov5s_saved_model        # TensorFlow SavedModel
                                         yolov5s.pb                 # TensorFlow GraphDef
                                         yolov5s.tflite             # TensorFlow Lite
                                         yolov5s_edgetpu.tflite     # TensorFlow Edge TPU
"""
import argparse
import os
import sys
from pathlib import Path
import glob
import numpy as np
import cv2
import torch
import torch.backends.cudnn as cudnn
import xml.etree.ElementTree as ET
# Absolute location of this script.
FILE = Path(__file__).resolve()
ROOT = FILE.parent  # YOLOv5 root directory (the folder that contains this file)
# Put the root on the module search path so the project-local imports below
# resolve regardless of where the script is launched from.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
# From here on ROOT is expressed relative to the current working directory.
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative
from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync
from utils.augmentations import letterbox
class LoadImages1:
    """YOLOv5 image/video dataloader, i.e. `python detect.py --source image.jpg/vid.mp4`.

    Compared with a loader that only lists the top level of a directory, this
    one walks the whole directory tree when ``path`` is a directory, so images
    in nested sub-folders are picked up as well. Videos are still taken from
    the top level of the directory only.

    Iterating yields ``(path, img, img0, cap, s)`` tuples:
        path: path of the current file.
        img:  letterboxed frame as a contiguous CHW, RGB array.
        img0: the frame as read by OpenCV (BGR).
        cap:  the current ``cv2.VideoCapture`` (``None`` if no video was opened).
        s:    a log prefix describing the current item.

    Args:
        path: a single file, a directory, or a glob pattern containing ``*``.
        img_size: target size forwarded to ``letterbox``.
        stride: model stride forwarded to ``letterbox``.
        auto: ``auto`` flag forwarded to ``letterbox``.

    Raises:
        FileNotFoundError: if ``path`` is neither a glob, a directory nor a file.
        AssertionError: if no supported image or video was found.
    """

    def __init__(self, path, img_size=640, stride=32, auto=True):
        p = str(Path(path).resolve())  # os-agnostic absolute path
        print("p:", p)
        walk_tree = False
        if '*' in p:
            files = sorted(glob.glob(p, recursive=True))  # glob pattern
        elif os.path.isdir(p):
            files = sorted(glob.glob(os.path.join(p, '*.*')))  # top-level entries of the directory
            walk_tree = True
        elif os.path.isfile(p):
            files = [p]  # single file
        else:
            raise FileNotFoundError(f'ERROR: {p} does not exist')

        if walk_tree:
            # Directory source: collect images from every nested folder. The result
            # is sorted because os.walk does not guarantee a stable order.
            images = sorted(os.path.join(root, name)
                            for root, _, names in os.walk(p)
                            for name in names
                            if self._extension(name) in IMG_FORMATS)
        else:
            # File or glob source: os.walk() yields nothing for these, so the
            # already-resolved file list has to be filtered instead.
            images = [x for x in files if self._extension(x) in IMG_FORMATS]
        videos = [x for x in files if self._extension(x) in VID_FORMATS]
        ni, nv = len(images), len(videos)

        self.img_size = img_size
        self.stride = stride
        self.files = images + videos  # images first, then videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv
        self.mode = 'image'
        self.auto = auto
        if videos:
            self.new_video(videos[0])  # open the first video up front
        else:
            self.cap = None
        assert self.nf > 0, f'No images or videos found in {p}. ' \
                            f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}'

    @staticmethod
    def _extension(filename):
        """Return the lower-cased text after the last dot of ``filename``."""
        return filename.split('.')[-1].lower()

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:
            # Read the next frame of the current video
            self.mode = 'video'
            ret_val, img0 = self.cap.read()
            while not ret_val:
                # Current video is exhausted: move on to the next file, if any
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                else:
                    path = self.files[self.count]
                    self.new_video(path)
                    ret_val, img0 = self.cap.read()

            self.frame += 1
            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '
        else:
            # Read image
            self.count += 1
            img0 = cv2.imread(path)  # BGR
            assert img0 is not None, f'Image Not Found {path}'
            s = f'image {self.count}/{self.nf} {path}: '

        # Padded resize to the requested inference size
        img = letterbox(img0, self.img_size, stride=self.stride, auto=self.auto)[0]

        # Convert
        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
        img = np.ascontiguousarray(img)  # the transposed/reversed view is not contiguous in memory

        return path, img, img0, self.cap, s

    def new_video(self, path):
        """Open ``path`` as the current video and reset the frame counter."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __len__(self):
        return self.nf  # number of files
@torch.no_grad()
def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
        source=ROOT / 'data/images',  # file/dir/URL/glob, 0 for webcam
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        imgsz=(640, 640),  # inference size (height, width)
        conf_thres=0.25,  # confidence threshold
        iou_thres=0.45,  # NMS IOU threshold
        max_det=1000,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,  # show results
        save_txt=False,  # save results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_crop=False,  # save cropped prediction boxes
        nosave=False,  # do not save images/videos
        classes=None,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        augment=False,  # augmented inference
        visualize=False,  # visualize features
        update=False,  # update all models
        project=ROOT / 'runs/detect',  # save results to project/name
        name='exp',  # save results to project/name
        exist_ok=False,  # existing project/name ok, do not increment
        line_thickness=3,  # bounding box thickness (pixels)
        hide_labels=False,  # hide labels
        hide_conf=False,  # hide confidences
        half=False,  # use FP16 half-precision inference
        dnn=False,  # use OpenCV DNN for ONNX inference
        ):
    """Run YOLOv5 inference on ``source`` and write the results under ``project/name``.

    Webcam/stream sources are read through ``LoadStreams``; files, directories and
    globs are read through ``LoadImages1``. Every image/frame goes through the model
    and non-max suppression; depending on the flags the detections are then drawn on
    the image, shown in a window, saved as an annotated image/video, written to
    per-image label files (``save_txt``) and/or saved as crops (``save_crop``).
    Per-stage timings are logged at the end. The function returns nothing.

    See the inline comment on each parameter for its meaning.

    NOTE(review): annotated images are written to ``save_dir / p.name``; since
    ``LoadImages1`` walks directory trees, images that share a file name in
    different sub-folders overwrite each other.
    NOTE(review): a video writer is only released when the next video starts; the
    last writer is never explicitly released inside this function.
    """
    source = str(source)
    save_img = not nosave and not source.endswith('.txt')  # save inference images
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)  # source has a known image/video extension
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))  # source is a URL
    webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
    # webcam is True for numeric sources, *.txt sources and URLs that are not plain media files
    if is_url and is_file:
        source = check_file(source)  # URL to a media file (presumably downloaded by check_file — confirm)
    # Directories
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir
    # Load model
    device = select_device(device)  # resolve the requested device
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)  # load the model
    stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
    # pt/jit/onnx/engine presumably flag which backend the weights were loaded with — confirm in models/common.py
    # NOTE(review): the original author's note here read "stride 64, names = code", presumably the values seen
    # with their own weights
    imgsz = check_img_size(imgsz, s=stride)  # check image size
    # Half
    half &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16 supported on limited backends with CUDA
    # FP16 is dropped on CPU and when none of the pt/jit/onnx/engine backends is in use
    if pt or jit:
        model.model.half() if half else model.model.float()  # FP16 if requested and supported, otherwise FP32
    # Dataloader
    if webcam:  # webcam / stream source
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
        bs = len(dataset)  # batch_size
    else:
        dataset = LoadImages1(source, img_size=imgsz, stride=stride, auto=pt)
        # LoadImages1 uses os.walk so that images in nested sub-folders of a directory source are read too
        bs = 1  # batch_size
    vid_path, vid_writer = [None] * bs, [None] * bs
    # Run inference
    model.warmup(imgsz=(1 if pt else bs, 3, *imgsz), half=half)  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0  # accumulated pre-process / inference / NMS time, number of images seen
    for path, im, im0s, vid_cap, s in dataset:  # im0s: original image(s), im: pre-processed image(s)
        t1 = time_sync()  # pre-processing start
        im = torch.from_numpy(im).to(device)
        im = im.half() if half else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:  # a single image without a batch dimension
            im = im[None]  # expand for batch dim
        t2 = time_sync()  # pre-processing end
        dt[0] += t2 - t1  # accumulate pre-processing time
        # Inference
        # When visualization is requested, visualize becomes a per-input output directory; otherwise False
        visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
        # Path.stem is the final path component without its suffix
        pred = model(im, augment=augment, visualize=visualize)  # raw predictions
        t3 = time_sync()  # inference end
        dt[1] += t3 - t2
        # NMS
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)  # NMS
        # pred now holds one detection tensor per image
        dt[2] += time_sync() - t3
        # Second-stage classifier (optional)
        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)
        # Process predictions
        for i, det in enumerate(pred):  # per image
            seen += 1  # count processed images
            if webcam:  # batch_size >= 1
                p, im0, frame = path[i], im0s[i].copy(), dataset.count
                s += f'{i}: '
            else:
                p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)
                # getattr(dataset, 'frame', 0) falls back to 0 when the dataset has no `frame` attribute
            p = Path(p)  # to Path
            save_path = str(save_dir / p.name)  # im.jpg
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
            s += '%gx%g ' % im.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            imc = im0.copy() if save_crop else im0  # for save_crop: keep an un-annotated copy to crop from
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))  # box/label drawer
            if len(det):  # each row of det is unpacked below as (x1, y1, x2, y2, conf, cls), e.g.
                # [[5, 5, 5, 5, 0.6, 0],
                #  [10, 10, 10, 10, 0.7, 5]]
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()
                # The first four columns are box coordinates in the resized image, so they are mapped back
                # to the original image size
                # Print results
                for c in det[:, -1].unique():  # every distinct class index among the detections
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string
                # Write results
                for *xyxy, conf, cls in reversed(det):  # iterate the detections in reverse order
                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                        with open(txt_path + '.txt', 'a') as f:
                            f.write(('%g ' * len(line)).rstrip() % line + '\n')
                    if save_img or save_crop or view_img:  # Add bbox to image
                        c = int(cls)  # integer class
                        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                        # hide_labels drops the label entirely; hide_conf keeps the class name but drops the score
                        annotator.box_label(xyxy, label, color=colors(c, True))  # draw the box on the image
                        if save_crop:  # save the cropped box
                            save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)
            # Stream results
            im0 = annotator.result()  # image with the boxes drawn
            if view_img:  # show the annotated image
                cv2.imshow(str(p), im0)
                cv2.waitKey(1)  # 1 millisecond
            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'image':
                    cv2.imwrite(save_path, im0)  # write the annotated image
                else:  # 'video' or 'stream'
                    if vid_path[i] != save_path:  # new video
                        vid_path[i] = save_path
                        if isinstance(vid_writer[i], cv2.VideoWriter):
                            vid_writer[i].release()  # release previous video writer
                        if vid_cap:  # video
                            fps = vid_cap.get(cv2.CAP_PROP_FPS)
                            w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                            h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        else:  # stream
                            fps, w, h = 30, im0.shape[1], im0.shape[0]
                        save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                        vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                    vid_writer[i].write(im0)
        # Print time (inference-only)
        LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')
    # Print results
    t = tuple(x / seen * 1E3 for x in dt)  # speeds per image
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    if save_txt or save_img:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    if update:
        strip_optimizer(weights)  # update model (to fix SourceChangeWarning)
def parse_opt():
    """Parse the command line of this script and return the options namespace.

    The attribute names of the returned namespace match the keyword arguments
    of ``run`` so that it can be forwarded with ``run(**vars(opt))``.

    ``--view-img`` and ``--dnn`` default to True in this file, which made the
    plain ``store_true`` flags impossible to switch off; ``--no-view-img`` and
    ``--no-dnn`` write to the same destinations so the defaults can be
    overridden without changing them.

    NOTE(review): the defaults for --weights, --source, --data and --conf-thres
    are site-specific (e.g. an absolute Windows path) and differ from the
    defaults of ``run``; adjust them for your environment.

    Returns:
        argparse.Namespace: the parsed options, with ``imgsz`` expanded to two
        values when a single size was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'pre_weights/yolov5m6.pt',
                        help='model path(s)')  # weights file(s)
    parser.add_argument('--source', type=str, default=r"D:\yolov5_61\data\images",
                        help='file/dir/URL/glob, 0 for webcam')  # input data location
    parser.add_argument('--data', type=str, default=ROOT / 'data/mydataset_ACT.yaml',
                        help='(optional) dataset.yaml path')  # dataset definition (class names)
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640, 640],
                        help='inference size h,w')  # inference image size
    parser.add_argument('--conf-thres', type=float, default=0.05,
                        help='confidence threshold')  # minimum score for a detection to be kept
    parser.add_argument('--iou-thres', type=float, default=0.45,
                        help='NMS IoU threshold')  # overlap threshold used by non-max suppression
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', default=True, action='store_true',
                        help='show results')  # on by default here; use --no-view-img to disable
    parser.add_argument('--no-view-img', dest='view_img', action='store_false',
                        help='do not show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true',
                        help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int,
                        help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true',
                        help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true',
                        help='update all models')  # run strip_optimizer on the weights after inference
    parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true',
                        help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', default=True,
                        help='use OpenCV DNN for ONNX inference')  # on by default here; use --no-dnn to disable
    parser.add_argument('--no-dnn', dest='dnn', action='store_false',
                        help='do not use OpenCV DNN for ONNX inference')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand a single size to (h, w)
    # FILE.stem is the script name without its suffix
    print_args(FILE.stem, opt)  # print all options
    return opt
def main(opt):
    """Check the package requirements, then run detection with the parsed options.

    Args:
        opt: options namespace whose attributes are forwarded to ``run`` as
            keyword arguments.
    """
    check_requirements(exclude=('tensorboard', 'thop'))
    run_kwargs = vars(opt)
    run(**run_kwargs)
if __name__ == "__main__":
    # Script entry point: parse the command line and hand the options to main().
    main(parse_opt())