Original GitHub project; this post is only a brief walkthrough of it.
The original author's description of the repo:
Here are some details about our modification:
- we choose yolov5 as an object detector instead of Faster R-CNN, it is faster and more convenient
- we use a tracker(deepsort) to allocate action labels to all objects(with same ids) in different frames
- our processing speed reached 24.2 FPS at 30 inference batch size (on a single RTX 2080Ti GPU)
Starting the analysis from __main__:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # input: a folder of test images, a video file, or a camera
    parser.add_argument('--input', type=str, default="/home/wufan/images/video/vad.mp4",
                        help='test imgs folder or video or camera')
    # output path for the result video
    parser.add_argument('--output', type=str, default="output.mp4",
                        help='folder to save result imgs, can not use input folder')
    # object detection config
    parser.add_argument('--imsize', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf', type=float, default=0.4, help='object confidence threshold')
    parser.add_argument('--iou', type=float, default=0.4, help='IOU threshold for NMS')
    parser.add_argument('--device', default='cuda', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--classes', nargs='+', default=0, type=int,
                        help='filter by class: --classes 0, or --classes 0 2 3')
    config = parser.parse_args()
    print(config)
    main(config)
The __main__ block only sets up the input/output arguments plus a few object-detection parameters (inference size, confidence threshold, NMS IoU threshold, device, and class filter).
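For reference, a quick check of how the arguments parse (a sketch that reuses the parser defined above; the file paths are placeholders):

config = parser.parse_args(['--input', 'demo.mp4', '--output', 'demo_out.mp4', '--classes', '0'])
print(config.classes)                          # [0]  -> with nargs='+' the value is a list
print(config.conf, config.iou, config.imsize)  # 0.4 0.4 640

Note that the default for --classes is the plain int 0, which is falsy, so the later `if config.classes:` check only restricts the detector when --classes is passed explicitly.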
On to the main function:
main() consumes the parsed config. The detector is yolov5l6, loaded through torch.hub; its weights are downloaded to the local cache on first use.
# load the YOLOv5 model (yolov5l6) from torch hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5l6')
model.conf = config.conf
model.iou = config.iou
# maximum number of detections per image
model.max_det = 200
# class filter; with --classes 0 only persons are kept
if config.classes:
    model.classes = config.classes
device = config.device
imsize = config.imsize
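An optional sanity check (a small sketch reusing the model object above) shows why class 0 means person-only detection:

# COCO class 0 is "person", which is what --classes 0 keeps
print(model.names[0])                        # 'person'
print(model.conf, model.iou, model.max_det)  # 0.4 0.4 200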
Next the SlowFast and DeepSORT models are loaded. The SlowFast model used here was trained on AVA v2.2, and AvaLabeledVideoFramePaths.read_label_map() provides the mapping from action id to action name.
# load the SlowFast ResNet-50 detection model (pretrained on AVA)
video_model = slowfast_r50_detection(True).eval().to(device)
# load the pretrained DeepSORT re-ID checkpoint
deepsort_tracker = DeepSort("deep_sort/deep_sort/deep/checkpoint/ckpt.t7")
# load the id-to-label mapping for the AVA v2.2 dataset on which the Torch Hub models were fine-tuned
ava_labelnames, _ = AvaLabeledVideoFramePaths.read_label_map("selfutils/temp.pbtxt")
# one random color per COCO class, used later when drawing boxes
coco_color_map = [[random.randint(0, 255) for _ in range(3)] for _ in range(80)]
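Optionally, the returned mapping can be inspected directly; it is a dict from the integer AVA action id (1-based, which matters for the `avalabel + 1` lookup later) to the action name. A minimal check, assuming the pbtxt above is present:

print(len(ava_labelnames), 'action classes loaded')
print(ava_labelnames[1])   # the action name registered under id 1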
Then the output video writer is configured:
vide_save_path = config.output
# read the input video's width/height and reuse them for the output
video = cv2.VideoCapture(config.input)
width, height = int(video.get(3)), int(video.get(4))
video.release()
# MPEG-4 ('mp4v') encoding for the .mp4 output
# write at 25 FPS with the input's frame size
outputvideo = cv2.VideoWriter(vide_save_path, cv2.VideoWriter_fourcc(*'mp4v'), 25, (width, height))
The input video is then loaded again and processed in a for loop of length math.ceil(video.duration), one second at a time:
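Assuming the video is opened with pytorchvideo's EncodedVideo (which is what provides the get_clip() call used below; this is an assumption, the repo's own loading code is not shown in this post), the loop skeleton looks roughly like this:

import math
from pytorchvideo.data.encoded_video import EncodedVideo

video = EncodedVideo.from_path(config.input)
for i in range(0, math.ceil(video.duration), 1):
    # each iteration handles the one-second clip [i, i + 1)
    ...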
Each iteration starts with frame extraction: get_clip() samples the frames inside a one-second window, only the 'video' entry of the returned dict is kept (audio is discarded), and every frame tensor is converted to an HWC numpy array for YOLO.
# sample the frames inside the one-second window [i, i + 1 - 0.04]
video_clips = video.get_clip(i, i + 1 - 0.04)
# return {"video": clip_frames, "frame_indices": frame_indices, "audio": None}
video_clips = video_clips['video']
if video_clips is None:
    continue
img_num = video_clips.shape[1]
imgs = []
for j in range(img_num):
    imgs.append(tensor_to_numpy(video_clips[:, j, :, :]))
# "video": A tensor of the clip's RGB frames with shape: (channel, time, height, width).
# 将tensor转为numpy数组,BGR格式
def tensor_to_numpy(tensor):
img = tensor.cpu().numpy().transpose((1, 2, 0))
return img
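A quick shape check with a dummy frame (a standalone sketch, not from the repo) makes the conversion concrete:

import torch

dummy = torch.zeros(3, 360, 640, dtype=torch.uint8)   # same (C, H, W) layout as video_clips[:, j, :, :]
print(tensor_to_numpy(dummy).shape)                    # (360, 640, 3): HWC, as OpenCV and YOLOv5 expect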
YOLO inference on the sampled frames:
yolo_preds = model(imgs, size=imsize)
# name every frame by its global index: clip i covers frames i*25 .. i*25 + img_num - 1 (25 FPS video)
yolo_preds.files = [f"img_{i * 25 + k}.jpg" for k in range(img_num)]
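What comes back is a standard YOLOv5 Detections object with one tensor per frame; a quick look at the middle frame (a sketch reusing yolo_preds from above) shows the layout that DeepSORT consumes next:

# each entry of yolo_preds.pred is an (N, 6) tensor: [x1, y1, x2, y2, confidence, class]
# yolo_preds.xywh holds the same boxes as (xc, yc, w, h), the box format DeepSORT expects
mid = yolo_preds.pred[img_num // 2]
print(mid.shape)
print(yolo_preds.names[int(mid[0, 5])] if mid.shape[0] else 'no detections in this frame')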
The pretrained DeepSORT tracker is then run on the YOLO detections, and its output replaces yolo_preds.pred. This is the step that lets the same object id carry an action label across frames. Each tracker row has 8 columns, which is why an empty result falls back to np.ones((0, 8)); a small check of that layout follows deepsort_update below.
# DeepSORT tracking
deepsort_outputs = []
for j in range(len(yolo_preds.pred)):
    temp = deepsort_update(deepsort_tracker, yolo_preds.pred[j].cpu(), yolo_preds.xywh[j][:, 0:4].cpu(),
                           yolo_preds.imgs[j])
    if len(temp) == 0:
        temp = np.ones((0, 8))
    deepsort_outputs.append(temp.astype(np.float32))
# replace the YOLO predictions with the tracker output
# "we use a tracker(deepsort) to allocate action labels to all objects(with same ids) in different frames"
yolo_preds.pred = deepsort_outputs
def deepsort_update(Tracker, pred, xywh, np_img):
    # pred[:, 4:5] are the confidences and pred[:, 5] the class ids; the frame itself is needed for re-ID features
    outputs = Tracker.update(xywh, pred[:, 4:5], pred[:, 5].tolist(), cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB))
    return outputs
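The tracker's output layout can be read off the code itself: the empty fallback is np.ones((0, 8)), and save_yolopreds_tovideo (shown further down) unpacks each row as (*box, cls, trackid, vx, vy). A tiny check, reusing deepsort_outputs from above:

# each tracker row: [x1, y1, x2, y2, cls, track_id, vx, vy]
for row in deepsort_outputs[0]:
    *box, cls, track_id, vx, vy = row
    print(int(track_id), int(cls), [round(float(v)) for v in box])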
ava_inference_transform() preprocesses the clip and the middle frame's boxes for SlowFast, the SlowFast model is then run on them, and finally each track id is assigned an action class (a sketch of what this preprocessing typically looks like follows the code below):
id_to_ava_labels = {}
if yolo_preds.pred[img_num // 2].shape[0]:
    # preprocess the clip and the middle frame's boxes before feeding them to SlowFast
    inputs, inp_boxes, _ = ava_inference_transform(video_clips, yolo_preds.pred[img_num // 2][:, 0:4], crop_size=imsize)
    # prepend a zero column (the data sample index) to each bounding box
    inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0], 1), inp_boxes], dim=1)
    if isinstance(inputs, list):
        # SlowFast inputs come as a [slow_pathway, fast_pathway] list; add a batch dimension to each
        inputs = [inp.unsqueeze(0).to(device) for inp in inputs]
    else:
        inputs = inputs.unsqueeze(0).to(device)
    with torch.no_grad():
        slowfaster_preds = video_model(inputs, inp_boxes.to(device))
        slowfaster_preds = slowfaster_preds.cpu()
    # assign an action class to each track id; argmax is 0-based while the AVA label ids are 1-based, hence the +1
    for tid, avalabel in zip(yolo_preds.pred[img_num // 2][:, 5].tolist(), np.argmax(slowfaster_preds, axis=1).tolist()):
        id_to_ava_labels[tid] = ava_labelnames[avalabel + 1]
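The ava_inference_transform() helper lives in the repo's selfutils and is not shown in this post. For orientation, here is a rough sketch of what such preprocessing typically looks like, adapted from the PyTorchVideo detection tutorial; the repo's own implementation may differ, so treat the names and defaults below as assumptions:

import numpy as np
import torch
from pytorchvideo.transforms.functional import (
    clip_boxes_to_image,
    short_side_scale_with_boxes,
    uniform_temporal_subsample,
)

def ava_inference_transform_sketch(clip, boxes, num_frames=32, crop_size=640,
                                   data_mean=(0.45, 0.45, 0.45), data_std=(0.225, 0.225, 0.225),
                                   slow_fast_alpha=4):
    # clip: (C, T, H, W) uint8 tensor; boxes: (N, 4) boxes in pixel coordinates
    boxes = np.array(boxes)
    roi_boxes = boxes.copy()
    # temporally subsample the one-second clip to the frame count SlowFast expects
    clip = uniform_temporal_subsample(clip, num_frames)
    clip = clip.float() / 255.0
    height, width = clip.shape[2], clip.shape[3]
    boxes = clip_boxes_to_image(boxes, height, width)
    # rescale the short side to crop_size; the boxes are rescaled consistently
    clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)
    # per-channel normalization
    clip = (clip - torch.tensor(data_mean).view(3, 1, 1, 1)) / torch.tensor(data_std).view(3, 1, 1, 1)
    boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])
    # SlowFast takes a [slow_pathway, fast_pathway] pair; the slow pathway is temporally strided
    slow_pathway = torch.index_select(
        clip, 1, torch.linspace(0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long()
    )
    return [slow_pathway, clip], torch.from_numpy(boxes), roi_boxes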
Finally, the per-frame results are drawn and written into the output video:
def save_yolopreds_tovideo(yolo_preds, id_to_ava_labels, color_map, output_video):
    for i, (im, pred) in enumerate(zip(yolo_preds.imgs, yolo_preds.pred)):
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        if pred.shape[0]:
            for j, (*box, cls, trackid, vx, vy) in enumerate(pred):
                if int(cls) != 0:
                    ava_label = ''
                elif trackid in id_to_ava_labels.keys():
                    # keep only the first word of the AVA action name so the drawn label stays short
                    ava_label = id_to_ava_labels[trackid].split(' ')[0]
                else:
                    ava_label = 'Unknown'
                text = '{} {} {}'.format(int(trackid), yolo_preds.names[int(cls)], ava_label)
                color = color_map[int(cls)]
                im = plot_one_box(box, im, color, text)
        output_video.write(im.astype(np.uint8))
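One final note: after the per-clip loop, the cv2.VideoWriter has to be released so the .mp4 container is finalized, otherwise the output may be truncated or unplayable:

# flush and close the writer once all clips have been processed
outputvideo.release()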