yoloV5_6.1代码逐行解释(面向小白)

yoloV5_6.1代码逐行解释(面向小白)

因为自己工作中经常要使用yolov5,所以就用了一些时间研究了一下,也是收获颇丰,就放上来,一是方便自己以后查阅,二是希望能帮到需要的朋友.

我写的注释比较详细,主要面向刚学会 Python、对深度学习有初步了解的初学者。如有错误或不懂的地方可以联系我,大家相互交流学习。VX:Y1685637070

目前只注释了detect.py的代码,后续会慢慢更新,卷起来!

detect.py

# YOLOv5  by Ultralytics, GPL-3.0 license
"""
Run inference on images, videos, directories, streams, etc.

Usage - sources:
    $ python path/to/detect.py --weights yolov5s.pt --source 0              # webcam
                                                             img.jpg        # image
                                                             vid.mp4        # video
                                                             path/          # directory
                                                             path/*.jpg     # glob
                                                             'https://youtu.be/Zgi9g1ksQHc'  # YouTube
                                                             'rtsp://example.com/media.mp4'  # RTSP, RTMP, HTTP stream

Usage - formats:
    $ python path/to/detect.py --weights yolov5s.pt                 # PyTorch
                                         yolov5s.torchscript        # TorchScript
                                         yolov5s.onnx               # ONNX Runtime or OpenCV DNN with --dnn
                                         yolov5s.xml                # OpenVINO
                                         yolov5s.engine             # TensorRT
                                         yolov5s.mlmodel            # CoreML (MacOS-only)
                                         yolov5s_saved_model        # TensorFlow SavedModel
                                         yolov5s.pb                 # TensorFlow GraphDef
                                         yolov5s.tflite             # TensorFlow Lite
                                         yolov5s_edgetpu.tflite     # TensorFlow Edge TPU
"""

import argparse
import os
import sys
from pathlib import Path
import glob
import numpy as np
import cv2
import torch
import torch.backends.cudnn as cudnn
import xml.etree.ElementTree as ET

# Absolute, fully resolved path of this script
FILE = Path(__file__).resolve()
ROOT = FILE.parent  # YOLOv5 root directory (the folder that contains this file)
# Put ROOT on the module search path (once) so the project-local imports below resolve
if all(entry != str(ROOT) for entry in sys.path):
    sys.path.append(str(ROOT))
# From here on ROOT is expressed relative to the current working directory
ROOT = Path(os.path.relpath(ROOT, start=Path.cwd()))

from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync
from utils.augmentations import letterbox


class LoadImages1:
    """Image/video dataloader for file, directory and glob sources.

    Variant of the YOLOv5 image/video loader, i.e.
    `python detect.py --source image.jpg/vid.mp4`. When the source is a
    directory, image files are collected recursively from every
    sub-directory; video files are taken from the top level of the directory
    only. Iterating yields one ``(path, img, img0, cap, s)`` tuple per image
    or video frame, images first and videos afterwards.
    """

    def __init__(self, path, img_size=640, stride=32, auto=True):
        """Collect the image and video files referenced by ``path``.

        Args:
            path: A file, a directory, or a glob pattern (containing ``*``).
            img_size: Target size forwarded to ``letterbox``.
            stride: Stride forwarded to ``letterbox``.
            auto: ``auto`` flag forwarded to ``letterbox``.

        Raises:
            FileNotFoundError: ``path`` is not a glob pattern and does not exist.
            AssertionError: No file with a supported image/video suffix was found.
        """
        p = str(Path(path).resolve())  # os-agnostic absolute path
        if '*' in p:
            files = sorted(glob.glob(p, recursive=True))  # glob pattern
            images = [x for x in files if self._has_suffix(x, IMG_FORMATS)]
        elif os.path.isdir(p):
            files = sorted(glob.glob(os.path.join(p, '*.*')))  # top level of the directory
            # Images are searched recursively. os.walk() is only meaningful for a
            # directory, which is why file and glob sources filter `files` instead.
            images = []
            for root, dirs, filenames in os.walk(p):
                dirs.sort()  # in-place sort makes the traversal order deterministic
                images.extend(os.path.join(root, filename)
                              for filename in sorted(filenames)
                              if self._has_suffix(filename, IMG_FORMATS))
        elif os.path.isfile(p):
            files = [p]  # single file
            images = [x for x in files if self._has_suffix(x, IMG_FORMATS)]
        else:
            raise FileNotFoundError(f'ERROR: {p} does not exist')

        videos = [x for x in files if self._has_suffix(x, VID_FORMATS)]
        ni, nv = len(images), len(videos)

        self.img_size = img_size
        self.stride = stride
        self.files = images + videos
        self.nf = ni + nv  # number of files
        self.video_flag = [False] * ni + [True] * nv
        self.mode = 'image'
        self.auto = auto
        if videos:
            self.new_video(videos[0])  # open the first video
        else:
            self.cap = None
        assert self.nf > 0, f'No images or videos found in {p}. ' \
                            f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}'

    @staticmethod
    def _has_suffix(name, formats):
        """Return True if the text after the last '.' in ``name`` (lower-cased) is in ``formats``."""
        return name.split('.')[-1].lower() in formats

    def __iter__(self):
        """Reset the file counter and return the loader itself as the iterator."""
        self.count = 0
        return self

    def __next__(self):
        """Return ``(path, img, img0, cap, s)`` for the next image or video frame.

        ``img0`` is the frame as read by OpenCV (BGR), ``img`` is the letterboxed
        copy converted to a contiguous CHW/RGB array, ``cap`` is the current
        ``cv2.VideoCapture`` (``None`` when no video was found) and ``s`` is a
        progress prefix for logging.

        Raises:
            StopIteration: All files have been consumed.
            AssertionError: An image file could not be read.
        """
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]

        if self.video_flag[self.count]:
            # Read video
            self.mode = 'video'
            ret_val, img0 = self.cap.read()
            while not ret_val:
                # Current video is exhausted: move on to the next one, if any
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                else:
                    path = self.files[self.count]
                    self.new_video(path)
                    ret_val, img0 = self.cap.read()

            self.frame += 1
            s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '

        else:
            # Read image
            self.count += 1
            img0 = cv2.imread(path)  # BGR
            assert img0 is not None, f'Image Not Found {path}'
            s = f'image {self.count}/{self.nf} {path}: '

        # Padded resize
        img = letterbox(img0, self.img_size, stride=self.stride, auto=self.auto)[0]

        # Convert
        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
        img = np.ascontiguousarray(img)  # the transposed/reversed view is copied into contiguous memory

        return path, img, img0, self.cap, s

    def new_video(self, path):
        """Open ``path`` as the current video and reset the frame counter."""
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __len__(self):
        """Return the number of files (images plus videos)."""
        return self.nf  # number of files


@torch.no_grad()
def run(weights=ROOT / 'yolov5s.pt',  # model.pt path(s)
        source=ROOT / 'data/images',  # file/dir/URL/glob, 0 for webcam
        data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
        imgsz=(640, 640),  # inference size (height, width)
        conf_thres=0.25,  # confidence threshold
        iou_thres=0.45,  # NMS IOU threshold
        max_det=1000,  # maximum detections per image
        device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,  # show results
        save_txt=False,  # save results to *.txt
        save_conf=False,  # save confidences in --save-txt labels
        save_crop=False,  # save cropped prediction boxes
        nosave=False,  # do not save images/videos
        classes=None,  # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,  # class-agnostic NMS
        augment=False,  # augmented inference
        visualize=False,  # visualize features
        update=False,  # update all models
        project=ROOT / 'runs/detect',  # save results to project/name
        name='exp',  # save results to project/name
        exist_ok=False,  # existing project/name ok, do not increment
        line_thickness=3,  # bounding box thickness (pixels)
        hide_labels=False,  # hide labels
        hide_conf=False,  # hide confidences
        half=False,  # use FP16 half-precision inference
        dnn=False,  # use OpenCV DNN for ONNX inference
        ):
    """Run detection on ``source`` and optionally display and/or save the results.

    The model is loaded from ``weights``, every image/frame of ``source`` is
    pre-processed, passed through the model and through non-max suppression,
    and the remaining boxes are rescaled to the original image. Depending on
    the flags the boxes are drawn and shown, written as annotated
    images/videos, written as normalized ``xywh`` label files and/or saved as
    crops, all below the run directory derived from ``project``/``name``.
    The individual parameters are described next to their defaults above.

    Returns:
        None. Results are reported through ``LOGGER`` and files on disk.
    """
    source = str(source)
    save_img = not nosave and not source.endswith('.txt')  # save inference images
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)  # suffix is a known image/video format
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))  # source is a URL
    # Numeric sources, *.txt sources and URLs without a media suffix are handled as streams
    webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
    if is_url and is_file:
        # NOTE(review): check_file presumably fetches the remote file and returns a local path - confirm
        source = check_file(source)

    # Directories
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Load model
    device = select_device(device)
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)
    # stride and class names of the model, plus flags naming the backend in use
    # (presumably exactly one of pt/jit/onnx/engine is set for those formats - confirm in DetectMultiBackend)
    stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Half
    # Half precision is kept only for the listed backends and only off-CPU; otherwise it is switched off
    half &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16 supported on limited backends with CUDA
    if pt or jit:
        model.model.half() if half else model.model.float()  # FP16 if enabled, otherwise FP32

    # Dataloader
    if webcam:  # stream input
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
        bs = len(dataset)  # batch_size
    else:
        # LoadImages1 also picks up images in sub-directories of a directory source
        dataset = LoadImages1(source, img_size=imgsz, stride=stride, auto=pt)
        bs = 1  # batch_size
    vid_path, vid_writer = [None] * bs, [None] * bs

    # Run inference
    model.warmup(imgsz=(1 if pt else bs, 3, *imgsz), half=half)  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0  # accumulated pre-process/inference/NMS seconds, processed image count
    for path, im, im0s, vid_cap, s in dataset:
        t1 = time_sync()
        im = torch.from_numpy(im).to(device)  # im is the pre-processed input, im0s the original image(s)
        im = im.half() if half else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:  # a single image without batch dimension
            im = im[None]  # expand for batch dim
        t2 = time_sync()
        dt[0] += t2 - t1  # pre-processing time

        # Inference
        # When feature visualization is requested, a per-image output directory is created;
        # Path(path).stem is the file name without its last suffix
        visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
        pred = model(im, augment=augment, visualize=visualize)
        t3 = time_sync()
        dt[1] += t3 - t2  # inference time

        # NMS
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
        dt[2] += time_sync() - t3  # NMS time

        # Second-stage classifier (optional)
        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)

        # Process predictions
        for i, det in enumerate(pred):  # per image
            seen += 1
            if webcam:  # batch_size >= 1
                p, im0, frame = path[i], im0s[i].copy(), dataset.count
                s += f'{i}: '
            else:
                # frame falls back to 0 when the dataset has no `frame` attribute
                p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)

            p = Path(p)  # to Path
            save_path = str(save_dir / p.name)  # im.jpg
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
            s += '%gx%g ' % im.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            imc = im0.copy() if save_crop else im0  # for save_crop: crops are taken from an unannotated copy
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            if len(det):
                # Each row of det is unpacked below as (x1, y1, x2, y2, conf, cls)
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, -1].unique():  # every distinct predicted class
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

                # Write results
                for *xyxy, conf, cls in reversed(det):  # rows are visited last to first
                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                        with open(txt_path + '.txt', 'a') as f:
                            f.write(('%g ' * len(line)).rstrip() % line + '\n')

                    if save_img or save_crop or view_img:  # Add bbox to image
                        c = int(cls)  # integer class
                        # No label when hide_labels; class name only when hide_conf; otherwise name and confidence
                        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                        annotator.box_label(xyxy, label, color=colors(c, True))
                        if save_crop:  # save the cropped detection
                            save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)

            # Stream results
            im0 = annotator.result()  # image with the boxes drawn
            if view_img:
                cv2.imshow(str(p), im0)
                cv2.waitKey(1)  # 1 millisecond

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'image':
                    cv2.imwrite(save_path, im0)
                else:  # 'video' or 'stream'
                    if vid_path[i] != save_path:  # new video
                        vid_path[i] = save_path
                        if isinstance(vid_writer[i], cv2.VideoWriter):
                            vid_writer[i].release()  # release previous video writer
                        if vid_cap:  # video
                            fps = vid_cap.get(cv2.CAP_PROP_FPS)
                            w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                            h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        else:  # stream
                            fps, w, h = 30, im0.shape[1], im0.shape[0]
                        save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                        vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                    vid_writer[i].write(im0)

        # Print time (inference-only)
        LOGGER.info(f'{s}Done. ({t3 - t2:.3f}s)')

    # Release writers that are still open so the last output video is finalized on disk
    for writer in vid_writer:
        if isinstance(writer, cv2.VideoWriter):
            writer.release()

    # Print results
    # max(seen, 1) avoids a ZeroDivisionError when the dataset yielded no frame at all
    t = tuple(x / max(seen, 1) * 1E3 for x in dt)  # speeds per image
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    if save_txt or save_img:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    if update:
        strip_optimizer(weights)  # update model (to fix SourceChangeWarning)


def parse_opt():
    """Parse the command-line options of the detection script.

    The option names map one-to-one onto the keyword parameters of ``run``.
    Boolean options are plain ``store_true`` flags: absent means False,
    present means True.

    Returns:
        argparse.Namespace with the parsed options; ``imgsz`` always holds two
        values (a single size given on the command line is duplicated).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'pre_weights/yolov5m6.pt',
                        help='model path(s)')  # weight file(s)
    # Default is relative to ROOT rather than a machine-specific absolute path
    parser.add_argument('--source', type=str, default=ROOT / 'data/images',
                        help='file/dir/URL/glob, 0 for webcam')  # input data
    parser.add_argument('--data', type=str, default=ROOT / 'data/mydataset_ACT.yaml',
                        help='(optional) dataset.yaml path')  # dataset description
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640, 640],
                        help='inference size h,w')  # inference image size
    parser.add_argument('--conf-thres', type=float, default=0.05,
                        help='confidence threshold')  # passed to non-max suppression as confidence threshold
    parser.add_argument('--iou-thres', type=float, default=0.45,
                        help='NMS IoU threshold')  # passed to non-max suppression as IoU threshold
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    # store_true flags default to False; a True default would make the flag impossible to switch off
    parser.add_argument('--view-img', action='store_true', help='show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int,
                        help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true',
                        help='update all models')  # run strip_optimizer on the weights after inference
    parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand a single size to [size, size]
    print_args(FILE.stem, opt)  # FILE.stem is the script name without its suffix
    return opt


def main(opt):
    """Check the requirements, then call ``run`` with the attributes of ``opt`` as keyword arguments."""
    skipped = ('tensorboard', 'thop')
    check_requirements(exclude=skipped)
    options = vars(opt)
    run(**options)


# Script entry point: parse the command-line options, then run detection with them.
if __name__ == "__main__":
    opt = parse_opt()
    main(opt)

你可能感兴趣的:(python,深度学习,人工智能)