Video Editing Evaluation Metrics

This post implements four evaluation metrics for video editing: clip_text_video (CLIP similarity between the editing prompt and each frame), clip_video_self (CLIP similarity between consecutive frames of one video, a temporal-consistency measure), clip_video_video (frame-wise CLIP similarity between two videos, e.g. source vs. edited), and pixel_video_self (pixel-level RMSE between consecutive frames).
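The three CLIP-based metrics all compute the same quantity: the cosine similarity of L2-normalized CLIP embeddings, averaged over frames. For a prompt $t$ and frames $f_1, \dots, f_N$:

$$
\text{clip\_text\_video}(t, v) = \frac{1}{N} \sum_{i=1}^{N} \frac{E_T(t) \cdot E_I(f_i)}{\lVert E_T(t) \rVert \, \lVert E_I(f_i) \rVert}
$$

where $E_T$ and $E_I$ are CLIP's text and image encoders. The two frame-frame metrics substitute $E_T(t)$ with the image embedding of the previous frame (clip_video_self) or of the corresponding frame in the other video (clip_video_video).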

The Video_Editing_Metrics class is implemented in video.py:

import torch
import clip
import numpy as np
from PIL import Image
from decord import VideoReader, cpu


class Video_Editing_Metrics:
    def __init__(self) -> None:
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.metrics = ["clip_text_video", "clip_video_self", "clip_video_video", "pixel_video_self"]

    def clip_text_video(self, text: str, video_path: str):
        """CLIP similarity between a text prompt and every frame of a video."""
        vr = VideoReader(video_path, ctx=cpu(0))
        frame_count = len(vr)
        scores = []

        # The text embedding is identical for every frame, so encode it once.
        text_input = clip.tokenize([text]).to(self.device)
        with torch.no_grad():
            text_features = self.model.encode_text(text_input)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        for i in range(frame_count):
            image_input = self.preprocess(Image.fromarray(vr[i].asnumpy())).unsqueeze(0).to(self.device)
            with torch.no_grad():
                image_features = self.model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            # Cosine similarity of the normalized embeddings.
            similarity = (text_features @ image_features.T).squeeze(0).cpu().item()
            scores.append(similarity)

        return scores, sum(scores) / frame_count


    def clip_video_self(self, video_path: str):
        """CLIP similarity between consecutive frames (temporal consistency)."""
        def clip_image_image(image1, image2):
            image1_input = self.preprocess(image1).unsqueeze(0).to(self.device)
            image2_input = self.preprocess(image2).unsqueeze(0).to(self.device)
            with torch.no_grad():
                image1_features = self.model.encode_image(image1_input)
                image2_features = self.model.encode_image(image2_input)
            image1_features /= image1_features.norm(dim=-1, keepdim=True)
            image2_features /= image2_features.norm(dim=-1, keepdim=True)
            return (image1_features @ image2_features.T).squeeze(0).cpu().item()

        vr = VideoReader(video_path, ctx=cpu(0))
        scores = []

        for i in range(1, len(vr)):
            image1 = Image.fromarray(vr[i - 1].asnumpy())
            image2 = Image.fromarray(vr[i].asnumpy())
            scores.append(clip_image_image(image1, image2))

        # N frames yield N-1 consecutive pairs; average over the number of pairs.
        avg_score = sum(scores) / len(scores)
        return scores, avg_score



    def clip_video_video(self, video_path1: str, video_path2: str):
        """Frame-wise CLIP similarity between two videos (e.g. source vs. edited)."""
        def clip_image_image(image1, image2):
            image1_input = self.preprocess(image1).unsqueeze(0).to(self.device)
            image2_input = self.preprocess(image2).unsqueeze(0).to(self.device)
            with torch.no_grad():
                image1_features = self.model.encode_image(image1_input)
                image2_features = self.model.encode_image(image2_input)
            image1_features /= image1_features.norm(dim=-1, keepdim=True)
            image2_features /= image2_features.norm(dim=-1, keepdim=True)
            return (image1_features @ image2_features.T).squeeze(0).cpu().item()

        vr1 = VideoReader(video_path1, ctx=cpu(0))
        vr2 = VideoReader(video_path2, ctx=cpu(0))
        # Compare only up to the shorter video's length.
        f = min(len(vr1), len(vr2))
        print('video frames:', f)

        scores = []
        for i in range(f):
            image1 = Image.fromarray(vr1[i].asnumpy())
            image2 = Image.fromarray(vr2[i].asnumpy())
            scores.append(clip_image_image(image1, image2))

        avg_score = sum(scores) / f
        return scores, avg_score


    def pixel_video_self(self, video_path: str):
        """Pixel-level RMSE between consecutive frames (lower = smoother video)."""
        def calculate_pixel_l2_distance(image1, image2):
            tensor1 = torch.from_numpy(np.array(image1, dtype=np.float32))
            tensor2 = torch.from_numpy(np.array(image2, dtype=np.float32))
            # Root-mean-square error over all pixels and channels.
            distance = torch.sqrt(torch.mean((tensor1 - tensor2) ** 2))
            return distance.item()

        vr = VideoReader(video_path, ctx=cpu(0))
        scores = []
        for i in range(1, len(vr)):
            image1 = Image.fromarray(vr[i - 1].asnumpy())
            image2 = Image.fromarray(vr[i].asnumpy())
            scores.append(calculate_pixel_l2_distance(image1, image2))

        # N frames yield N-1 consecutive pairs; average over the number of pairs.
        avg_score = sum(scores) / len(scores)
        return scores, avg_score
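
Encoding frames one at a time keeps memory flat but is slow on long videos. As a minimal sketch (not part of the original class), frames could be preprocessed in batches and sent through a single encode_image call; encode_video_frames and batch_size below are hypothetical names, with batch_size chosen to fit GPU memory:

import torch
import clip
from PIL import Image
from decord import VideoReader, cpu


def encode_video_frames(model, preprocess, video_path, device, batch_size=32):
    """Return L2-normalized CLIP embeddings for every frame, batched for speed."""
    vr = VideoReader(video_path, ctx=cpu(0))
    features = []
    for start in range(0, len(vr), batch_size):
        frames = [Image.fromarray(vr[i].asnumpy())
                  for i in range(start, min(start + batch_size, len(vr)))]
        # Stack preprocessed frames into one (B, 3, 224, 224) batch.
        batch = torch.stack([preprocess(f) for f in frames]).to(device)
        with torch.no_grad():
            feats = model.encode_image(batch)
        features.append(feats / feats.norm(dim=-1, keepdim=True))
    return torch.cat(features)  # shape: (num_frames, embed_dim)

With all frame embeddings in one tensor feats, clip_video_self reduces to a single batched dot product over consecutive rows: (feats[:-1] * feats[1:]).sum(dim=-1).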

Instantiate the Video_Editing_Metrics class and run the metrics:

from video import Video_Editing_Metrics
video_metrics = Video_Editing_Metrics()
print(video_metrics.metrics)
original_video_path = '/home/pgao/yue/metrics/input_fps30.mp4'
edited_video_path = '/home/pgao/yue/metrics/tokenflow_PnP_fps_30.mp4'
prompt = "a colorful cartoon wolf"
['clip_text_video', 'clip_video_self', 'clip_video_video', 'pixel_video_self']
score, avg_score = video_metrics.clip_text_video("a wolf", edited_video_path)
print(score)
print("Average clip_text_video score:", avg_score)
[0.295654296875, 0.301513671875, 0.28759765625, 0.290283203125, 0.292236328125, 0.287841796875, 0.288818359375, 0.291748046875, 0.291748046875, 0.296142578125, 0.296875, 0.298583984375, 0.29638671875, 0.31396484375, 0.31005859375, 0.29638671875, 0.31201171875, 0.29541015625, 0.296875, 0.306640625, 0.30908203125, 0.296875, 0.298583984375, 0.30029296875, 0.299560546875, 0.29541015625, 0.295654296875, 0.297607421875, 0.29443359375, 0.29345703125, 0.295654296875, 0.290283203125, 0.296142578125, 0.305419921875, 0.2978515625, 0.2939453125, 0.29443359375, 0.295166015625, 0.299560546875, 0.298095703125]
Average clip_text_video score: 0.297357177734375
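As a rough reference, CLIP ViT-B/32 cosine similarities between a prompt and a matching image typically land around 0.25-0.35, so an average near 0.30 suggests the edited frames align reasonably well with the text.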
score, avg_score = video_metrics.clip_video_self(edited_video_path)
print(score)
print("Average clip_video_self score:", avg_score)
[0.9892578125, 0.98876953125, 0.994140625, 0.98974609375, 0.99267578125, 0.9970703125, 0.9951171875, 0.994140625, 0.99365234375, 0.9951171875, 0.99462890625, 0.990234375, 0.9677734375, 0.9873046875, 0.9814453125, 0.966796875, 0.9658203125, 0.9931640625, 0.98974609375, 0.986328125, 0.97021484375, 0.98876953125, 0.99169921875, 0.99365234375, 0.98974609375, 0.99560546875, 0.99462890625, 0.99853515625, 0.99462890625, 0.9970703125, 0.9921875, 0.98583984375, 0.99072265625, 0.99169921875, 0.9970703125, 0.9970703125, 0.9951171875, 0.998046875, 0.9970703125]
Average clip_video_self score: 0.9653076171875
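Consecutive-frame similarities close to 1.0 mean adjacent frames are nearly identical in CLIP embedding space, i.e. the edit is temporally consistent; the occasional dips (e.g. the 0.965-0.97 values) mark frames where the content changes more abruptly.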
score, avg_score = video_metrics.clip_video_video(original_video_path, edited_video_path)
print(score)
print("Average clip_video_video score:", avg_score)
video frames: 40
[0.81005859375, 0.83642578125, 0.853515625, 0.84814453125, 0.85400390625, 0.86181640625, 0.8515625, 0.84814453125, 0.85498046875, 0.84130859375, 0.8544921875, 0.84423828125, 0.8681640625, 0.82177734375, 0.8349609375, 0.85205078125, 0.7822265625, 0.8388671875, 0.84716796875, 0.83203125, 0.78369140625, 0.857421875, 0.849609375, 0.8408203125, 0.8349609375, 0.857421875, 0.8642578125, 0.853515625, 0.83984375, 0.85205078125, 0.845703125, 0.8447265625, 0.8388671875, 0.8232421875, 0.83837890625, 0.84375, 0.85498046875, 0.83935546875, 0.8232421875, 0.8408203125]
Average clip_video_video score: 0.84156494140625
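Unlike the self-similarity above, this score compares the edited video frame-by-frame against the source, so it reflects how much of the original content and structure the edit preserves; an average around 0.84 indicates the edit departs from the source noticeably more than consecutive frames depart from each other.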
score, avg_score = video_metrics.pixel_video_self(edited_video_path)
print(score)
print("Average pixel_video_self score:", avg_score)
[21.371034622192383, 19.461402893066406, 16.73526382446289, 17.8797664642334, 17.596729278564453, 16.682973861694336, 16.84737205505371, 16.643695831298828, 17.080795288085938, 16.25534439086914, 17.287216186523438, 18.431577682495117, 21.61277198791504, 25.54276466369629, 26.09408950805664, 27.980266571044922, 27.825368881225586, 25.40290641784668, 27.089553833007812, 26.539926528930664, 26.01329231262207, 24.538654327392578, 21.16415786743164, 22.847314834594727, 19.37087059020996, 17.53938102722168, 14.976680755615234, 13.714262008666992, 15.014124870300293, 13.917937278747559, 13.885147094726562, 16.4520206451416, 12.729424476623535, 13.119254112243652, 12.160407066345215, 11.924071311950684, 11.72919750213623, 10.484443664550781, 10.358728408813477]
Average pixel_video_self score: 18.057504773139954
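
When comparing several editing methods it may be convenient to wrap the four calls in one function. evaluate_edit below is a hypothetical helper, not part of video.py, sketched on the assumption that each metric returns (per-frame scores, average) as implemented above:

from video import Video_Editing_Metrics


def evaluate_edit(prompt, source_path, edited_path):
    """Run all four metrics for one edit and collect the averages in a dict."""
    m = Video_Editing_Metrics()
    results = {}
    _, results['clip_text_video'] = m.clip_text_video(prompt, edited_path)
    _, results['clip_video_self'] = m.clip_video_self(edited_path)
    _, results['clip_video_video'] = m.clip_video_video(source_path, edited_path)
    _, results['pixel_video_self'] = m.pixel_video_self(edited_path)
    return results


print(evaluate_edit("a colorful cartoon wolf",
                    '/home/pgao/yue/metrics/input_fps30.mp4',
                    '/home/pgao/yue/metrics/tokenflow_PnP_fps_30.mp4'))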
