实现 `clip_text_video`、`clip_video_self`、`clip_video_video`、`pixel_video_self` 四类评价指标。
在 `video.py` 中实现 `Video_Editing_Metrics` 类:
import cv2
import torch
import clip
import os
import os.path as osp
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from decord import VideoReader, cpu
import numpy as np
class Video_Editing_Metrics:
    """CLIP- and pixel-based metrics for evaluating edited videos.

    Four metrics are exposed (their names are listed in ``self.metrics``):

    * ``clip_text_video``  -- CLIP similarity between a text prompt and each frame.
    * ``clip_video_self``  -- CLIP similarity between consecutive frames of one video.
    * ``clip_video_video`` -- CLIP similarity between same-index frames of two videos.
    * ``pixel_video_self`` -- pixel RMSE between consecutive frames of one video.

    Every metric returns ``(scores, avg_score)`` where ``avg_score`` is the
    mean of ``scores``, or ``0.0`` when no score could be computed.
    """

    def __init__(self) -> None:
        """Load the CLIP ViT-B/32 model on CUDA when available, otherwise on CPU."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.metrics = ["clip_text_video", "clip_video_self", "clip_video_video", "pixel_video_self"]

    @staticmethod
    def _mean(scores) -> float:
        """Return the arithmetic mean of *scores*, or ``0.0`` if it is empty."""
        # Divide by the number of scores actually collected: pairwise metrics
        # yield one score fewer than there are frames.
        return sum(scores) / len(scores) if scores else 0.0

    @staticmethod
    def _pixel_rmse(frame1, frame2) -> float:
        """Return the root-mean-square pixel difference of two equally shaped frames."""
        tensor1 = torch.from_numpy(np.array(frame1, dtype=np.float32))
        tensor2 = torch.from_numpy(np.array(frame2, dtype=np.float32))
        return torch.sqrt(torch.mean((tensor1 - tensor2) ** 2)).item()

    @staticmethod
    def _similarity(features1, features2) -> float:
        """Return the dot product of two ``(1, dim)`` feature tensors as a float."""
        return (features1 @ features2.T).squeeze(0).cpu().item()

    def _encode_frame(self, frame):
        """Return the L2-normalised CLIP image embedding of one decoded frame.

        *frame* is an item read from a ``VideoReader``; it is converted with
        ``asnumpy()`` (assumed to yield an HxWxC uint8 array -- TODO confirm).
        """
        image = Image.fromarray(frame.asnumpy())
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            features = self.model.encode_image(image_input)
            features = features / features.norm(dim=-1, keepdim=True)
        return features

    def clip_text_video(self, text: str, video_path: str, ctx='cpu') -> tuple[list[float], float]:
        """Score how well every frame of a video matches a text prompt.

        Args:
            text: Prompt to compare against.
            video_path: Path of the video to read.
            ctx: Unused; kept only so existing callers keep working.

        Returns:
            ``(scores, avg_score)`` with one CLIP similarity per frame.
        """
        vr = VideoReader(video_path, ctx=cpu(0))
        # The prompt embedding does not depend on the frame, so compute it once.
        text_input = clip.tokenize([text]).to(self.device)
        with torch.no_grad():
            text_features = self.model.encode_text(text_input)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        scores = [
            self._similarity(text_features, self._encode_frame(vr[i]))
            for i in range(len(vr))
        ]
        return scores, self._mean(scores)

    def clip_video_self(self, video_path: str) -> tuple[list[float], float]:
        """Score the CLIP similarity between consecutive frames of one video.

        Args:
            video_path: Path of the video to read.

        Returns:
            ``(scores, avg_score)`` with one score per adjacent frame pair
            (``len(video) - 1`` scores); ``avg_score`` is their mean.
        """
        vr = VideoReader(video_path, ctx=cpu(0))
        scores = []
        previous = None
        for i in range(len(vr)):
            # Encode each frame once and reuse it as the "previous" frame of
            # the next pair.
            current = self._encode_frame(vr[i])
            if previous is not None:
                scores.append(self._similarity(previous, current))
            previous = current
        return scores, self._mean(scores)

    def clip_video_video(self, video_path1: str, video_path2: str) -> tuple[list[float], float]:
        """Score the CLIP similarity between same-index frames of two videos.

        Only the first ``min(len(video1), len(video2))`` frames are compared;
        that count is printed.

        Args:
            video_path1: Path of the first video.
            video_path2: Path of the second video.

        Returns:
            ``(scores, avg_score)`` with one score per compared frame index.
        """
        vr1 = VideoReader(video_path1, ctx=cpu(0))
        vr2 = VideoReader(video_path2, ctx=cpu(0))
        f = min(len(vr1), len(vr2))
        print('video frames:', f)
        scores = [
            self._similarity(self._encode_frame(vr1[i]), self._encode_frame(vr2[i]))
            for i in range(f)
        ]
        return scores, self._mean(scores)

    def pixel_video_self(self, video_path: str) -> tuple[list[float], float]:
        """Measure the pixel RMSE between consecutive frames of one video.

        Args:
            video_path: Path of the video to read.

        Returns:
            ``(scores, avg_score)`` with one RMSE per adjacent frame pair
            (``len(video) - 1`` values); ``avg_score`` is their mean.
        """
        vr = VideoReader(video_path, ctx=cpu(0))
        scores = []
        previous = None
        for i in range(len(vr)):
            # Decode each frame once; it serves as the "previous" frame of the
            # next pair.
            current = vr[i].asnumpy()
            if previous is not None:
                scores.append(self._pixel_rmse(previous, current))
            previous = current
        return scores, self._mean(scores)
实例化 `Video_Editing_Metrics` 类:
# Build the metrics object (its constructor loads the CLIP model) and print
# the names of the available metrics.
from video import Video_Editing_Metrics
video_metrics = Video_Editing_Metrics()
print(video_metrics.metrics)
# Paths of the source video and of its edited counterpart.
original_video_path = '/home/pgao/yue/metrics/input_fps30.mp4'
edited_video_path = '/home/pgao/yue/metrics/tokenflow_PnP_fps_30.mp4'
# NOTE(review): `prompt` is not referenced by any call visible in this file.
prompt = "a colorful cartoon wolf"
['clip_text_video', 'clip_video_self', 'clip_video_video', 'pixel_video_self']
# CLIP similarity between the text "a wolf" and every frame of the edited video.
# NOTE(review): the literal "a wolf" is passed rather than `prompt` -- confirm this is intended.
score, avg_score = video_metrics.clip_text_video("a wolf", edited_video_path)
print(score)
print("Average clip_text_video score:", avg_score)
[0.295654296875, 0.301513671875, 0.28759765625, 0.290283203125, 0.292236328125, 0.287841796875, 0.288818359375, 0.291748046875, 0.291748046875, 0.296142578125, 0.296875, 0.298583984375, 0.29638671875, 0.31396484375, 0.31005859375, 0.29638671875, 0.31201171875, 0.29541015625, 0.296875, 0.306640625, 0.30908203125, 0.296875, 0.298583984375, 0.30029296875, 0.299560546875, 0.29541015625, 0.295654296875, 0.297607421875, 0.29443359375, 0.29345703125, 0.295654296875, 0.290283203125, 0.296142578125, 0.305419921875, 0.2978515625, 0.2939453125, 0.29443359375, 0.295166015625, 0.299560546875, 0.298095703125]
Average clip_text_video score: 0.297357177734375
# CLIP similarity between consecutive frames of the edited video.
score, avg_score = video_metrics.clip_video_self(edited_video_path)
print(score)
print("Average clip_video_self score:", avg_score)
[0.9892578125, 0.98876953125, 0.994140625, 0.98974609375, 0.99267578125, 0.9970703125, 0.9951171875, 0.994140625, 0.99365234375, 0.9951171875, 0.99462890625, 0.990234375, 0.9677734375, 0.9873046875, 0.9814453125, 0.966796875, 0.9658203125, 0.9931640625, 0.98974609375, 0.986328125, 0.97021484375, 0.98876953125, 0.99169921875, 0.99365234375, 0.98974609375, 0.99560546875, 0.99462890625, 0.99853515625, 0.99462890625, 0.9970703125, 0.9921875, 0.98583984375, 0.99072265625, 0.99169921875, 0.9970703125, 0.9970703125, 0.9951171875, 0.998046875, 0.9970703125]
Average clip_video_self score: 0.9653076171875
# CLIP similarity between same-index frames of the original and edited videos.
score, avg_score = video_metrics.clip_video_video(original_video_path, edited_video_path)
print(score)
print("Average clip_video_video score:", avg_score)
/data1/zwb/envs/diffusion/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
['clip_text_video', 'clip_video_self', 'clip_video_video', 'pixel_video_self']
[0.295654296875, 0.301513671875, 0.28759765625, 0.290283203125, 0.292236328125, 0.287841796875, 0.288818359375, 0.291748046875, 0.291748046875, 0.296142578125, 0.296875, 0.298583984375, 0.29638671875, 0.31396484375, 0.31005859375, 0.29638671875, 0.31201171875, 0.29541015625, 0.296875, 0.306640625, 0.30908203125, 0.296875, 0.298583984375, 0.30029296875, 0.299560546875, 0.29541015625, 0.295654296875, 0.297607421875, 0.29443359375, 0.29345703125, 0.295654296875, 0.290283203125, 0.296142578125, 0.305419921875, 0.2978515625, 0.2939453125, 0.29443359375, 0.295166015625, 0.299560546875, 0.298095703125]
Average clip_text_video score: 0.297357177734375
[0.9892578125, 0.98876953125, 0.994140625, 0.98974609375, 0.99267578125, 0.9970703125, 0.9951171875, 0.994140625, 0.99365234375, 0.9951171875, 0.99462890625, 0.990234375, 0.9677734375, 0.9873046875, 0.9814453125, 0.966796875, 0.9658203125, 0.9931640625, 0.98974609375, 0.986328125, 0.97021484375, 0.98876953125, 0.99169921875, 0.99365234375, 0.98974609375, 0.99560546875, 0.99462890625, 0.99853515625, 0.99462890625, 0.9970703125, 0.9921875, 0.98583984375, 0.99072265625, 0.99169921875, 0.9970703125, 0.9970703125, 0.9951171875, 0.998046875, 0.9970703125]
Average clip_video_self score: 0.9653076171875
Average Optical Flow MSE: 17.320485770702362
video frames: 40
[0.81005859375, 0.83642578125, 0.853515625, 0.84814453125, 0.85400390625, 0.86181640625, 0.8515625, 0.84814453125, 0.85498046875, 0.84130859375, 0.8544921875, 0.84423828125, 0.8681640625, 0.82177734375, 0.8349609375, 0.85205078125, 0.7822265625, 0.8388671875, 0.84716796875, 0.83203125, 0.78369140625, 0.857421875, 0.849609375, 0.8408203125, 0.8349609375, 0.857421875, 0.8642578125, 0.853515625, 0.83984375, 0.85205078125, 0.845703125, 0.8447265625, 0.8388671875, 0.8232421875, 0.83837890625, 0.84375, 0.85498046875, 0.83935546875, 0.8232421875, 0.8408203125]
Average clip_video_video score: 0.84156494140625
# Pixel-level RMSE between consecutive frames of the edited video.
score, avg_score = video_metrics.pixel_video_self(edited_video_path)
print(score)
print("Average pixel_video_self score:", avg_score)
[21.371034622192383, 19.461402893066406, 16.73526382446289, 17.8797664642334, 17.596729278564453, 16.682973861694336, 16.84737205505371, 16.643695831298828, 17.080795288085938, 16.25534439086914, 17.287216186523438, 18.431577682495117, 21.61277198791504, 25.54276466369629, 26.09408950805664, 27.980266571044922, 27.825368881225586, 25.40290641784668, 27.089553833007812, 26.539926528930664, 26.01329231262207, 24.538654327392578, 21.16415786743164, 22.847314834594727, 19.37087059020996, 17.53938102722168, 14.976680755615234, 13.714262008666992, 15.014124870300293, 13.917937278747559, 13.885147094726562, 16.4520206451416, 12.729424476623535, 13.119254112243652, 12.160407066345215, 11.924071311950684, 11.72919750213623, 10.484443664550781, 10.358728408813477]
Average pixel_video_self score: 18.057504773139954