Understanding the YOLOv5 Source Code

detect.py

# Inference
import argparse
import csv
import os
import platform
import sys
from pathlib import Path

import torch


# __file__ -- path of the current file; same value as the line below, but a plain str rather than a Path
FILE = Path(__file__).resolve()  # /home/deep/lwj/yolov5-master/detect.py
# parents[0] -- the directory that contains this file; index 0 is the closest ancestor
ROOT = FILE.parents[0]  # /home/deep/lwj/yolov5-master

# str(ROOT) -- /home/deep/lwj/yolov5-master
# sys.path -- list of directories Python searches when importing modules
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH so the imports below can be resolved
# ROOT -- /home/deep/lwj/yolov5-master
# Path.cwd() -- /home/deep/lwj, the current *working* directory (note: not the directory of this file)
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # absolute path -> path relative to the cwd (see the short sketch below)
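
# A quick standalone sketch of what the calls above return (the paths here are hypothetical placeholders):
#   Path('/home/user/yolov5/detect.py').parents[0]      -> /home/user/yolov5   (nearest ancestor directory)
#   Path('/home/user/yolov5/detect.py').parents[1]      -> /home/user          (one level further up)
#   os.path.relpath('/home/user/yolov5', '/home/user')  -> 'yolov5'            (ROOT relative to the working dir)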

from ultralytics.utils.plotting import Annotator, colors, save_one_box

from models.common import DetectMultiBackend
from utils.dataloaders import IMG_FORMATS, VID_FORMATS, LoadImages, LoadScreenshots, LoadStreams
from utils.general import (LOGGER, Profile, check_file, check_img_size, check_imshow, check_requirements, colorstr, cv2,
                           increment_path, non_max_suppression, print_args, scale_boxes, strip_optimizer, xyxy2xywh)
from utils.torch_utils import select_device, smart_inference_mode


@smart_inference_mode()
def run(
        weights=ROOT / 'yolov5s.pt',                          # model path or triton URL
        source=ROOT / 'data/images',                          # file/dir/URL/glob/screen/0(webcam)
        data=ROOT / 'data/coco128.yaml',                      # dataset.yaml path   
        imgsz=(640, 640),                                     # inference size (height, width)
        conf_thres=0.25,                                      # confidence threshold
        iou_thres=0.45,                                       # NMS IOU threshold
        max_det=1000,                                         # maximum detections per image
        device='',                                            # cuda device, i.e. 0 or 0,1,2,3 or cpu
        view_img=False,                                       # show results
        save_txt=False,                                       # save results to *.txt
        save_csv=False,                                       # save results in CSV format
        save_conf=False,                                      # save confidences in --save-txt labels
        save_crop=False,                                      # save cropped prediction boxes
        nosave=False,                                         # do not save images/videos
        classes=None,                                         # filter by class: --class 0, or --class 0 2 3
        agnostic_nms=False,                                   # class-agnostic NMS
        augment=False,                                        # augmented inference
        visualize=False,                                      # visualize features
        update=False,                                         # update all models
        project=ROOT / 'runs/detect',                         # save results to project/name
        name='exp',                                           # save results to project/name
        exist_ok=False,                                       # existing project/name ok, do not increment
        line_thickness=3,                                     # bounding box thickness (pixels)
        hide_labels=False,                                    # hide labels
        hide_conf=False,                                      # hide confidences
        half=False,                                           # use FP16 half-precision inference
        dnn=False,                                            # use OpenCV DNN for ONNX inference
        vid_stride=1,                                         # video frame-rate stride
):
    source = str(source)  # python detect.py --source data/images/bus.jpg
    # save inference results unless --nosave is set or the source is a .txt file list
    save_img = not nosave and not source.endswith('.txt')
    # source -- data/images/bus.jpg
    # suffix -- .jpg
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
    # str.isnumeric() -- True if the string contains only numeric characters (e.g. a webcam index like '0'), else False
    webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
    screenshot = source.lower().startswith('screen')
    if is_url and is_file:
        source = check_file(source)  # download

    # save_dir -- output directory for results, e.g. runs/detect/exp3
    # increment_path -- finds the next unused expN suffix
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    # runs/detect/exp3/labels -- also create a labels subfolder when saving txt results
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Load model
    device = select_device(device)
    # DetectMultiBackend -- unified wrapper over multiple backends (PyTorch, ONNX, TensorRT, ...); mainly loads the weights
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
    # stride -- maximum downsampling factor of the feature maps relative to the input image
    stride, names, pt = model.stride, model.names, model.pt
    # check that the image size is a multiple of stride (32)
    imgsz = check_img_size(imgsz, s=stride)  # usually stays 640*640 after the check

    # Dataloader
    bs = 1  # batch_size
    if webcam:
        view_img = check_imshow(warn=True)
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
        bs = len(dataset)
    elif screenshot:
        dataset = LoadScreenshots(source, img_size=imgsz, stride=stride, auto=pt)
    else:
        dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
    vid_path, vid_writer = [None] * bs, [None] * bs

    # Run inference
    model.warmup(imgsz=(1 if pt or model.triton else bs, 3, *imgsz))  # warmup: run one forward pass on a blank image
    seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
    for path, im, im0s, vid_cap, s in dataset:
        # each loop iteration first runs the dataset's __iter__ / __next__ (e.g. LoadImages)
        with dt[0]:  # the dt profilers record the time spent in each stage
            # im comes from the dataset as a numpy array -> convert to a tensor
            im = torch.from_numpy(im).to(model.device)  # torch.Size([3, 640, 480])
            im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
            im /= 255  # 0 - 255 to 0.0 - 1.0, normalize
            if len(im.shape) == 3:  # torch.Size([3, 640, 480])
                im = im[None]  # add a batch dimension

        # Inference -- the core part of this file
        with dt[1]:
            # visualize -- optionally save intermediate feature maps during inference
            visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
            # the model outputs 18900 candidate boxes for this input
            # with COCO-pretrained weights each box carries 85 values: 4 box coordinates, 1 objectness confidence, 80 class probabilities
            pred = model(im, augment=augment, visualize=visualize)  # torch.Size([1, 18900, 85]), 1 -- batch

        # NMS
        with dt[2]:
            # conf_thres -- confidence threshold used to filter the raw predictions
            # max_det    -- maximum number of detections kept per image
            # torch.Size([1, 18900, 85]) -> list of per-image tensors, e.g. torch.Size([5, 6]);
            # each row is (x1, y1, x2, y2, conf, cls) -- see the short example after this listing
            pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)

        # Define the path for the CSV file
        csv_path = save_dir / 'predictions.csv'

        # Create or append to the CSV file
        def write_to_csv(image_name, prediction, confidence):
            data = {'Image Name': image_name, 'Prediction': prediction, 'Confidence': confidence}
            write_header = not csv_path.is_file()  # check before open(), since open() in append mode creates the file
            with open(csv_path, mode='a', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=data.keys())
                if write_header:
                    writer.writeheader()  # write the header only when the file is first created
                writer.writerow(data)

        # Process predictions
        # loop over the images in the batch; det holds the surviving boxes of one image (5 in this example)
        for i, det in enumerate(pred):  # per image, torch.Size([5, 6])
            seen += 1   # number of images processed so far
            if webcam:  # batch_size >= 1
                p, im0, frame = path[i], im0s[i].copy(), dataset.count
                s += f'{i}: '  # log string
            else:
                p, im0, frame = path, im0s.copy(), getattr(dataset, 'frame', 0)

            p = Path(p)  # to Path
            save_path = str(save_dir / p.name)  # "runs/detect/exp3","bus.jpg"
            # txt label path; for a single input image the frame suffix is not used
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
            s += '%gx%g ' % im.shape[2:]  # log the inference image size
            # original image width/height, used to normalize boxes when saving txt labels
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            # keep a clean copy of the image if cropped boxes are to be saved
            imc = im0.copy() if save_crop else im0      # for save_crop
            # drawing helper
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            if len(det):  # draw the boxes on the original image
                # Rescale boxes from the inference size (img_size) back to the im0 size
                # im -- 640*480, im0 -- 1080*810
                det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, 5].unique():
                    # unique() -- the distinct class ids present among the detections
                    n = (det[:, 5] == c).sum()  # number of detections for this class
                    # append the per-class count to the log string
                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

                # Write results
                for *xyxy, conf, cls in reversed(det):
                    c = int(cls)  # integer class
                    label = names[c] if hide_conf else f'{names[c]}'
                    confidence = float(conf)
                    confidence_str = f'{confidence:.2f}'

                    if save_csv:
                        write_to_csv(p.name, label, confidence_str)

                    if save_txt:  # Write to file
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                        with open(f'{txt_path}.txt', 'a') as f:
                            f.write(('%g ' * len(line)).rstrip() % line + '\n')

                    if save_img or save_crop or view_img:  # Add bbox to image
                        c = int(cls)  # integer class
                        # label drawn above the box, e.g. 'person 0.85'; hide_conf drops the confidence part
                        label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                        annotator.box_label(xyxy, label, color=colors(c, True))
                    if save_crop:
                        save_one_box(xyxy, imc, file=save_dir / 'crops' / names[c] / f'{p.stem}.jpg', BGR=True)

            # Stream results
            im0 = annotator.result()
            if view_img:  # show the result in a pop-up window
                if platform.system() == 'Linux' and p not in windows:
                    windows.append(p)
                    cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)  # allow window resize (Linux)
                    cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0])
                cv2.imshow(str(p), im0)
                cv2.waitKey(1)  # 1 millisecond

            # Save results (image with detections)
            if save_img:
                if dataset.mode == 'image':
                    cv2.imwrite(save_path, im0)
                else:  # 'video' or 'stream'
                    if vid_path[i] != save_path:  # new video
                        vid_path[i] = save_path
                        if isinstance(vid_writer[i], cv2.VideoWriter):
                            vid_writer[i].release()  # release previous video writer
                        if vid_cap:  # video
                            fps = vid_cap.get(cv2.CAP_PROP_FPS)
                            w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                            h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                        else:  # stream
                            fps, w, h = 30, im0.shape[1], im0.shape[0]
                        save_path = str(Path(save_path).with_suffix('.mp4'))  # force *.mp4 suffix on results videos
                        vid_writer[i] = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
                    vid_writer[i].write(im0)

        # Print time (inference-only)
        LOGGER.info(f"{s}{'' if len(det) else '(no detections), '}{dt[1].dt * 1E3:.1f}ms")

    # Print results
    # seen -- total number of images processed
    # dt   -- accumulated time of each stage
    t = tuple(x.t / seen * 1E3 for x in dt)  # speeds per image (average)
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)
    if save_txt or save_img:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}{s}")
    if update:
        strip_optimizer(weights[0])  # update model (to fix SourceChangeWarning)
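
run() can also be called directly from Python instead of through the CLI; a minimal sketch (the weight and source paths are placeholders):

from detect import run

run(weights='yolov5s.pt',          # any format DetectMultiBackend can load
    source='data/images/bus.jpg',  # single image, directory, URL or webcam index
    conf_thres=0.4,                # keep only fairly confident detections
    save_txt=True)                 # also write normalized class/xywh labels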


def parse_opt():
    # argparse -- parser for command-line options, arguments and sub-commands
    # ArgumentParser -- holds all the information needed to parse the command line into Python data types
    # add_argument -- registers a command-line argument
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s.pt', help='model path or triton URL')
    # a directory can also be given: every image/video inside it is detected and the results are saved
    parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)')
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='(optional) dataset.yaml path')
    # images are resized to this size before entering the network; the saved outputs keep the original resolution
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
    # confidence threshold: only detections above this value are kept
    parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
    # IoU threshold used by non-maximum suppression
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    # display results in real time
    parser.add_argument('--view-img', action='store_true', help='show results')
    # save results as txt files: class id plus normalized xywh (and optionally confidence)
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-csv', action='store_true', help='save results in CSV format')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    # do not save annotated images/videos
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    # only keep detections belonging to the given class ids
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3')
    # class-agnostic NMS: suppress overlapping boxes regardless of their predicted class
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    # test-time augmented inference
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true', help='update all models')
    # root directory for saving detection results
    parser.add_argument('--project', default=ROOT / 'runs/detect', help='save results to project/name')
    # run name inside the project directory
    parser.add_argument('--name', default='exp', help='save results to project/name')
    # reuse an existing project/name directory instead of incrementing expN
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    parser.add_argument('--vid-stride', type=int, default=1, help='video frame-rate stride')
    opt = parser.parse_args()  # parse the command-line arguments
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand a single value [640] to [640, 640] (illustrated below)
    print_args(vars(opt))
    return opt
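
The imgsz line above turns a single value into an (h, w) pair; a quick illustration:

imgsz = [640]
imgsz *= 2 if len(imgsz) == 1 else 1   # [640] -> [640, 640]

imgsz = [640, 480]
imgsz *= 2 if len(imgsz) == 1 else 1   # already h,w -> unchanged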


def main(opt):
    check_requirements(ROOT / 'requirements.txt', exclude=('tensorboard', 'thop'))
    run(**vars(opt))


if __name__ == '__main__':
    opt = parse_opt()  # parse command-line arguments
    main(opt)
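
For reference, each row of a det tensor after non_max_suppression is laid out as (x1, y1, x2, y2, conf, cls) in pixels. A small sketch with made-up values shows how the write loop above reads it back:

import torch

det = torch.tensor([[ 50., 100., 200., 400., 0.91, 0.],
                    [300., 150., 420., 380., 0.78, 5.]])  # two fabricated detections
for *xyxy, conf, cls in reversed(det):
    print(f'class {int(cls)}  conf {conf:.2f}  box {[int(v) for v in xyxy]}')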

augmentations.py

# YOLOv5  by Ultralytics, AGPL-3.0 license
"""
Image augmentation functions
"""

import math
import random

import cv2
import numpy as np
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF

from utils.general import LOGGER, check_version, colorstr, resample_segments, segment2box, xywhn2xyxy
from utils.metrics import bbox_ioa

IMAGENET_MEAN = 0.485, 0.456, 0.406  # RGB mean
IMAGENET_STD = 0.229, 0.224, 0.225  # RGB standard deviation


class Albumentations:
    # YOLOv5 Albumentations class (optional, only used if package is installed)
    def __init__(self, size=640):
        self.transform = None
        prefix = colorstr('albumentations: ')
        try:
            import albumentations as A
            check_version(A.__version__, '1.0.3', hard=True)  # version requirement

            T = [
                A.RandomResizedCrop(height=size, width=size, scale=(0.8, 1.0), ratio=(0.9, 1.11), p=0.0),
                A.Blur(p=0.01),
                A.MedianBlur(p=0.01),
                A.ToGray(p=0.01),
                A.CLAHE(p=0.01),
                A.RandomBrightnessContrast(p=0.0),
                A.RandomGamma(p=0.0),
                A.ImageCompression(quality_lower=75, p=0.0)]  # transforms
            self.transform = A.Compose(T, bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

            LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p))
        except ImportError:  # package not installed, skip
            pass
        except Exception as e:
            LOGGER.info(f'{prefix}{e}')

    def __call__(self, im, labels, p=1.0):
        if self.transform and random.random() < p:
            new = self.transform(image=im, bboxes=labels[:, 1:], class_labels=labels[:, 0])  # transformed
            im, labels = new['image'], np.array([[c, *b] for c, b in zip(new['class_labels'], new['bboxes'])])
        return im, labels
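
A hedged usage sketch of the class above; labels follow the YOLO convention (cls, x, y, w, h, all normalized) and the values are made up:

import numpy as np
from utils.augmentations import Albumentations

aug = Albumentations(size=640)
im = np.random.randint(0, 255, (640, 640, 3), dtype=np.uint8)   # dummy image
labels = np.array([[0, 0.5, 0.5, 0.2, 0.3]], dtype=np.float32)  # one fabricated box
im, labels = aug(im, labels, p=1.0)  # returns the inputs unchanged if albumentations is not installed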


def normalize(x, mean=IMAGENET_MEAN, std=IMAGENET_STD, inplace=False):
    # Normalize RGB images x per ImageNet stats in BCHW format, i.e. = (x - mean) / std
    return TF.normalize(x, mean, std, inplace=inplace)


def denormalize(x, mean=IMAGENET_MEAN, std=IMAGENET_STD):
    # Denormalize RGB images x per ImageNet stats in BCHW format, i.e. = x * std + mean
    for i in range(3):
        x[:, i] = x[:, i] * std[i] + mean[i]
    return x


def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5):
    # HSV color-space augmentation
    """
    用在 LoadImagesAndLabels 模块的 __getitem__ 函数
    hsv 色域增强:   处理图像 hsv,不对 label 进行任何处理
    :param img:     待处理图片  BGR [736, 736]
    :param hgain: h 通道色域参数 用于生成新的 h 通道
    :param sgain: h 通道色域参数 用于生成新的 s 通道
    :param vgain: h 通道色域参数 用于生成新的 v 通道
    :return:        返回 hsv 增强后的图片 img
    """
    if hgain or sgain or vgain:
        # 随机取 -1 到 1 三个实数,乘以 hyp 中的 hsv 三通道的系数,用于生成新的 hsv 通道
        r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1     # random gains
        hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV))  # 图像的通道拆分 h s v
        dtype = im.dtype  # uint8

        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)          # 生成新的 h 通道
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)   # 生成新的 s 通道
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)   # 生成新的 v 通道

        # 图像的通道合并 img_hsv = h + s + v 随机调整 hsv 之后重新组合 hsv 通道
        # cv2.LUT(hue, lut_hue) 通道色域变换,输入变换前通道 hue 和变换后通道 lut_hue
        im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        # no return needed  dst:输出图像
        cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im)  # no return needed
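
cv2.LUT, used above, simply maps every pixel value through a 256-entry lookup table; a tiny standalone illustration:

import cv2
import numpy as np

x = np.arange(0, 256, dtype=np.float64)
lut_val = np.clip(x * 1.2, 0, 255).astype(np.uint8)   # brighten the V channel by 20%
channel = np.full((2, 2), 100, dtype=np.uint8)        # fake single channel
print(cv2.LUT(channel, lut_val))                      # every 100 becomes 120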


def hist_equalize(im, clahe=True, bgr=False):
    # Equalize histogram on BGR image 'im' with im.shape(n,m,3) and range 0-255
    yuv = cv2.cvtColor(im, cv2.COLOR_BGR2YUV if bgr else cv2.COLOR_RGB2YUV)
    if clahe:
        c = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        yuv[:, :, 0] = c.apply(yuv[:, :, 0])
    else:
        yuv[:, :, 0] = cv2.equalizeHist(yuv[:, :, 0])  # equalize Y channel histogram
    return cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR if bgr else cv2.COLOR_YUV2RGB)  # convert YUV image to RGB


def replicate(im, labels):
    # Replicate labels
    h, w = im.shape[:2]
    boxes = labels[:, 1:].astype(int)
    x1, y1, x2, y2 = boxes.T
    s = ((x2 - x1) + (y2 - y1)) / 2  # side length (pixels)
    for i in s.argsort()[:round(s.size * 0.5)]:  # smallest indices
        x1b, y1b, x2b, y2b = boxes[i]
        bh, bw = y2b - y1b, x2b - x1b
        yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw))  # offset x, y
        x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh]
        im[y1a:y2a, x1a:x2a] = im[y1b:y2b, x1b:x2b]  # im4[ymin:ymax, xmin:xmax]
        labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0)

    return im, labels


# letterbox: keep the original aspect ratio by resizing and then padding the borders
# i.e. scale-and-pad
def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    # Resize and pad image while meeting stride-multiple constraints
    # new_shape may come from self.batch_shapes built in the dataloader's __init__
    # letterbox scales im to new_shape while preserving the aspect ratio and fills the remainder with a gray border
    shape = im.shape[:2]  # current shape [height, width] [1080, 810]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    # 1. compute the scale ratio from the longer side; the shorter side will be padded
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    # if scaleup is False, the image may only be scaled down, never up
    if not scaleup:  # only scale down, do not scale up (for better val mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    # 2. size after scaling, [width, height] -- (480, 640)
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # not necessarily a multiple of 32 yet

    # 3. compute the padding
    # new_unpad needs another dw/dh pixels to reach the target size (a multiple of 32)
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding 160, 0
    # auto -- pad only up to the nearest multiple of 32 rather than all the way to new_shape
    if auto:  # minimum rectangle: e.g. 480 is already a multiple of 32, so there is no need to pad it up to 640
        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
    # scaleFill: stretch directly to the target shape without any padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    # padding applied to each side
    dw /= 2  # divide padding into 2 sides
    dh /= 2

    # resize whenever the original size differs from the scaled size
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    # round(dh - 0.1) / round(dh + 0.1): if the total padding is odd, dw/dh end in .5 after the halving above,
    # so the two opposite sides receive padding that differs by one pixel
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    # add the gray border
    # cv2.BORDER_CONSTANT -- the added border pixels take a constant value (114, 114, 114)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border

    # im -- padded image; ratio -- (width, height) scale ratios; (dw, dh) -- padding added on each side
    return im, ratio, (dw, dh)
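
A quick sanity check of letterbox on a blank 1080x810 image, matching the example numbers in the comments above (run from the repository root so utils.augmentations is importable):

import numpy as np
from utils.augmentations import letterbox

img = np.zeros((1080, 810, 3), dtype=np.uint8)
out, ratio, (dw, dh) = letterbox(img, new_shape=(640, 640), auto=False)
print(out.shape, ratio, (dw, dh))  # (640, 640, 3), ratio ≈ (0.593, 0.593), padding (80.0, 0.0) -- 80 px of gray on each side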


# random perspective transform
def random_perspective(im, targets=(), segments=(), degrees=10, translate=.1,
                       scale=.1, shear=10, perspective=0.0, border=(0, 0)):
    # applies random rotation, scaling, translation, shear and perspective to the mosaic image, then crops/resizes it to img_size
    # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(0.1, 0.1), scale=(0.9, 1.1), shear=(-10, 10))
    # targets = [cls, xyxy]

    '''
    :params im:          the mosaic image img4 [2*img_size, 2*img_size]
                         if the mosaic image has no polygon labels, only targets is used and segments is empty;
                         if it has polygon labels, segments is used as well and targets is non-empty
    :params targets:     all regular labels of the mosaic image, labels4 [N, cls+xyxy]
                         (polygon labels are converted to boxes via segments2boxes)
    :params segments:    all polygon label information of the mosaic image [m, x1y1...]
    :params degrees:     rotation range
    :params translate:   translation range
    :params scale:       scaling range
    :params shear:       shear range
    :params perspective: perspective distortion range
    :params border:      determines the final output size; usually [-img_size//2, -img_size//2], so the output is [img_size, img_size]
    :return im:          the image after the perspective/affine transform [img_size, img_size]
    :return targets:     the corresponding labels after the transform and filtering [n, cls+x1y1x2y2]
    '''
    # output image height and width
    # with border = -s // 2 the mosaic image (2*img_size) is simply halved back to [img_size, img_size, 3]
    height = im.shape[0] + border[0] * 2  # shape(h,w,c)
    width = im.shape[1] + border[1] * 2

    # ============================ build the transforms =============================
    # OpenCV provides the warp itself; we first need to build the transform matrix M
    # Center: translation matrix that moves the image center to the origin
    C = np.eye(3)
    C[0, 2] = -im.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -im.shape[0] / 2  # y translation (pixels)

    # Perspective matrix
    P = np.eye(3)
    P[2, 0] = random.uniform(-perspective, perspective)  # x perspective (about y)
    P[2, 1] = random.uniform(-perspective, perspective)  # y perspective (about x)

    # Rotation and Scale matrix
    R = np.eye(3)  # initialise R to the 3x3 identity
    # a: random rotation angle in (-degrees, degrees)
    a = random.uniform(-degrees, degrees)
    # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
    # s: random scale factor in (1 - scale, 1 + scale)
    s = random.uniform(1 - scale, 1 + scale)
    # s = 2 ** random.uniform(-scale, scale)
    # cv2.getRotationMatrix2D: 2D rotation + scaling
    # angle: rotation angle, center: rotation center (here the origin), scale: scale factor after rotation
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)

    # Shear matrix
    S = np.eye(3)  # initialise S to the 3x3 identity
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Translation matrix
    T = np.eye(3)
    T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width  # x translation (pixels)
    T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height  # y translation (pixels)

    # Combined rotation matrix
    M = T @ S @ R @ P @ C  # order of operations (right to left) is IMPORTANT
    # apply the combined matrix M to the image
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        if perspective:
            # perspective warp: parallel lines need not stay parallel after the transform
            # arguments are similar to warpAffine below
            im = cv2.warpPerspective(im, M, dsize=(width, height), borderValue=(114, 114, 114))
        else:  # affine
            # affine warp: parallel lines remain parallel after the transform
            # image changed, e.g. [1472, 1472, 3] => [736, 736, 3]
            # cv2.warpAffine: OpenCV affine warp
            # arguments -- img: image to warp, M: 2x3 transform matrix, dsize: output size,
            #              flags: interpolation method, borderValue: fill value for the border (default 0, here gray 114)
            im = cv2.warpAffine(im, M[:2], dsize=(width, height), borderValue=(114, 114, 114))

    # Visualize
    # import matplotlib.pyplot as plt
    # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel()
    # ax[0].imshow(im[:, :, ::-1])  # base
    # ax[1].imshow(im2[:, :, ::-1])  # warped

    # Transform label coordinates
    # the label coordinates must be transformed in the same way
    n = len(targets)
    if n:
        # segment labels can be used only when segments is non-empty and its length matches the number of targets,
        # i.e. the dataset contains polygon ground truth; otherwise use_segments is False and plain boxes are used
        use_segments = any(x.any() for x in segments) and len(segments) == n
        new = np.zeros((n, 4))
        # polygon (segment) labels are available
        if use_segments:  # warp segments
            # first resample each segment:
            # e.g. a polygon with only 100 points is interpolated up to a fixed number of points (1000 by default)
            # [n, x1y1...] -> [n, num_points, 2]
            # rotation/perspective means every vertex of the polygon has to be transformed
            segments = resample_segments(segments)  # upsample
            for i, segment in enumerate(segments):
                xy = np.ones((len(segment), 3))
                xy[:, :2] = segment
                # apply the perspective/affine transform to every vertex of this polygon
                xy = xy @ M.T  # transform
                xy = xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]  # perspective rescale or affine

                # take the min/max of the transformed vertices to get the enclosing box, then clip to the image
                new[i] = segment2box(xy, width, height)
        # no segment labels: warp the rectangular targets directly
        else:  # warp boxes
            # transform the boxes directly
            # rotation/perspective means all four corners of each box must be transformed
            xy = np.ones((n * 4, 3))
            xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
            xy = xy @ M.T  # transform every corner
            xy = (xy[:, :2] / xy[:, 2:3] if perspective else xy[:, :2]).reshape(n, 8)  # perspective rescale or affine

            # create new boxes
            x = xy[:, [0, 2, 4, 6]]
            y = xy[:, [1, 3, 5, 7]]
            new = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

            # clip the boxes to the image bounds (boxes that are mostly outside the image get shrunk)
            new[:, [0, 2]] = new[:, [0, 2]].clip(0, width)
            new[:, [1, 3]] = new[:, [1, 3]].clip(0, height)

        # filter candidates
        # keep boxes whose width and height exceed wh_thr pixels, whose area is at least area_thr of the pre-augment area,
        # and whose aspect ratio lies within (1/ar_thr, ar_thr)
        # i is a boolean mask of length n: targets[i] keeps only the boxes that pass all checks
        i = box_candidates(box1=targets[:, 1:5].T * s, box2=new.T, area_thr=0.01 if use_segments else 0.10)
        # keep only the targets that survive the filter
        targets = targets[i]
        targets[:, 1:5] = new[i]

    return im, targets
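
A hedged sanity check of random_perspective: with all augmentation ranges set to zero the combined matrix reduces to the identity, so the dummy box below comes back unchanged (targets use cls, x1, y1, x2, y2 in pixels):

import numpy as np
from utils.augmentations import random_perspective

img = np.full((640, 640, 3), 114, dtype=np.uint8)
targets = np.array([[0., 100., 100., 300., 300.]])
img_out, targets_out = random_perspective(img, targets.copy(), degrees=0, translate=0, scale=0, shear=0)
print(img_out.shape, targets_out)  # (640, 640, 3) and the original [0, 100, 100, 300, 300] box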


def copy_paste(im, labels, segments, p=0.5):
    # Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy)
    n = len(segments)
    if p and n:
        h, w, c = im.shape  # height, width, channels
        im_new = np.zeros(im.shape, np.uint8)
        for j in random.sample(range(n), k=round(p * n)):
            l, s = labels[j], segments[j]
            box = w - l[3], l[2], w - l[1], l[4]
            ioa = bbox_ioa(box, labels[:, 1:5])  # intersection over area
            if (ioa < 0.30).all():  # allow 30% obscuration of existing labels
                labels = np.concatenate((labels, [[l[0], *box]]), 0)
                segments.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1))
                cv2.drawContours(im_new, [segments[j].astype(np.int32)], -1, (1, 1, 1), cv2.FILLED)

        result = cv2.flip(im, 1)  # augment segments (flip left-right)
        i = cv2.flip(im_new, 1).astype(bool)
        im[i] = result[i]  # cv2.imwrite('debug.jpg', im)  # debug

    return im, labels, segments


def cutout(im, labels, p=0.5):
    # Applies image cutout augmentation https://arxiv.org/abs/1708.04552
    if random.random() < p:
        h, w = im.shape[:2]
        scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16  # image size fraction
        for s in scales:
            mask_h = random.randint(1, int(h * s))  # create random masks
            mask_w = random.randint(1, int(w * s))

            # box
            xmin = max(0, random.randint(0, w) - mask_w // 2)
            ymin = max(0, random.randint(0, h) - mask_h // 2)
            xmax = min(w, xmin + mask_w)
            ymax = min(h, ymin + mask_h)

            # apply random color mask
            im[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)]

            # return unobscured labels
            if len(labels) and s > 0.03:
                box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32)
                ioa = bbox_ioa(box, xywhn2xyxy(labels[:, 1:5], w, h))  # intersection over area
                labels = labels[ioa < 0.60]  # remove >60% obscured labels

    return labels


def mixup(im, labels, im2, labels2):
    # Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf
    r = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
    im = (im * r + im2 * (1 - r)).astype(np.uint8)
    labels = np.concatenate((labels, labels2), 0)
    return im, labels
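
mixup blends two images with a weight drawn from Beta(32, 32), which concentrates near 0.5; a tiny hedged demo with fabricated inputs:

import numpy as np
from utils.augmentations import mixup

im1 = np.zeros((4, 4, 3), dtype=np.uint8)
im2 = np.full((4, 4, 3), 200, dtype=np.uint8)
labels1 = np.array([[0., 10., 10., 20., 20.]])
labels2 = np.array([[1., 30., 30., 40., 40.]])
im, labels = mixup(im1, labels1, im2, labels2)
print(im[0, 0], labels.shape)  # pixel values near 100 (r is close to 0.5), labels stacked to (2, 5)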


def box_candidates(box1, box2, wh_thr=2, ar_thr=100, area_thr=0.1, eps=1e-16):  # box1(4,n), box2(4,n)
    # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio
    w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
    w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
    ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
    return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + eps) > area_thr) & (ar < ar_thr)  # candidates
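
A small numeric check of box_candidates; boxes are column-stacked with shape (4, n):

import numpy as np
from utils.augmentations import box_candidates

box1 = np.array([[0.], [0.], [100.], [100.]])   # before augmentation: a 100x100 box
box2 = np.array([[0.], [0.], [10.], [2.]])      # after augmentation: 10x2 -- too small and too thin
print(box_candidates(box1, box2))               # [False]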


def classify_albumentations(
        augment=True,
        size=224,
        scale=(0.08, 1.0),
        ratio=(0.75, 1.0 / 0.75),  # 0.75, 1.33
        hflip=0.5,
        vflip=0.0,
        jitter=0.4,
        mean=IMAGENET_MEAN,
        std=IMAGENET_STD,
        auto_aug=False):
    # YOLOv5 classification Albumentations (optional, only used if package is installed)
    prefix = colorstr('albumentations: ')
    try:
        import albumentations as A
        from albumentations.pytorch import ToTensorV2
        check_version(A.__version__, '1.0.3', hard=True)  # version requirement
        if augment:  # Resize and crop
            T = [A.RandomResizedCrop(height=size, width=size, scale=scale, ratio=ratio)]
            if auto_aug:
                # TODO: implement AugMix, AutoAug & RandAug in albumentation
                LOGGER.info(f'{prefix}auto augmentations are currently not supported')
            else:
                if hflip > 0:
                    T += [A.HorizontalFlip(p=hflip)]
                if vflip > 0:
                    T += [A.VerticalFlip(p=vflip)]
                if jitter > 0:
                    color_jitter = (float(jitter), ) * 3  # repeat value for brightness, contrast, saturation, 0 hue
                    T += [A.ColorJitter(*color_jitter, 0)]
        else:  # Use fixed crop for eval set (reproducibility)
            T = [A.SmallestMaxSize(max_size=size), A.CenterCrop(height=size, width=size)]
        T += [A.Normalize(mean=mean, std=std), ToTensorV2()]  # Normalize and convert to Tensor
        LOGGER.info(prefix + ', '.join(f'{x}'.replace('always_apply=False, ', '') for x in T if x.p))
        return A.Compose(T)

    except ImportError:  # package not installed, skip
        LOGGER.warning(f'{prefix}⚠️ not found, install with `pip install albumentations` (recommended)')
    except Exception as e:
        LOGGER.info(f'{prefix}{e}')


def classify_transforms(size=224):
    # Transforms to apply if albumentations not installed
    assert isinstance(size, int), f'ERROR: classify_transforms size {size} must be integer, not (list, tuple)'
    # T.Compose([T.ToTensor(), T.Resize(size), T.CenterCrop(size), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)])
    return T.Compose([CenterCrop(size), ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)])


class LetterBox:
    # YOLOv5 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])
    def __init__(self, size=(640, 640), auto=False, stride=32):
        super().__init__()
        self.h, self.w = (size, size) if isinstance(size, int) else size
        self.auto = auto  # pass max size integer, automatically solve for short side using stride
        self.stride = stride  # used with auto

    def __call__(self, im):  # im = np.array HWC
        imh, imw = im.shape[:2]
        r = min(self.h / imh, self.w / imw)  # ratio of new/old
        h, w = round(imh * r), round(imw * r)  # resized image
        hs, ws = (math.ceil(x / self.stride) * self.stride for x in (h, w)) if self.auto else (self.h, self.w)  # parentheses so both branches unpack to an (h, w) pair
        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)
        im_out = np.full((self.h, self.w, 3), 114, dtype=im.dtype)
        im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
        return im_out


class CenterCrop:
    # YOLOv5 CenterCrop class for image preprocessing, i.e. T.Compose([CenterCrop(size), ToTensor()])
    def __init__(self, size=640):
        super().__init__()
        self.h, self.w = (size, size) if isinstance(size, int) else size

    def __call__(self, im):  # im = np.array HWC
        imh, imw = im.shape[:2]
        m = min(imh, imw)  # min dimension
        top, left = (imh - m) // 2, (imw - m) // 2
        return cv2.resize(im[top:top + m, left:left + m], (self.w, self.h), interpolation=cv2.INTER_LINEAR)


class ToTensor:
    # YOLOv5 ToTensor class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()])
    def __init__(self, half=False):
        super().__init__()
        self.half = half

    def __call__(self, im):  # im = np.array HWC in BGR order
        im = np.ascontiguousarray(im.transpose((2, 0, 1))[::-1])  # HWC to CHW -> BGR to RGB -> contiguous
        im = torch.from_numpy(im)  # to torch
        im = im.half() if self.half else im.float()  # uint8 to fp16/32
        im /= 255.0  # 0-255 to 0.0-1.0
        return im
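
The classification helpers above compose like ordinary torchvision transforms; a hedged end-to-end sketch:

import numpy as np
import torchvision.transforms as T
from utils.augmentations import CenterCrop, ToTensor

transform = T.Compose([CenterCrop(224), ToTensor()])
im = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # fake BGR frame
x = transform(im)
print(x.shape, x.dtype)  # torch.Size([3, 224, 224]) torch.float32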
