[A Collection of High-Quality GitHub Projects] Video and Text Feature Extraction

For my recent research work I used many open-source projects on GitHub, most of them related to extracting video and text features.

Table of Contents

    • torch_videovision
    • EKT-NLVL_vidcaps
    • ig65m-pytorch
    • Lessons learned from video feature extraction
    • pytorch-pos-tagging
    • skip-thoughts

torch_videovision

Purpose: processes video data by providing video versions of the common PyTorch image transforms, such as Resize, Crop, and ToTensor.

Installation: can be installed as a package

pip install git+https://github.com/hassony2/torch_videovision

Usage

from torchvideotransforms import video_transforms, volume_transforms

video_transform_list = [video_transforms.RandomRotation(30),
			video_transforms.RandomCrop((200, 200)),
			volume_transforms.ClipToTensor()]
transforms = video_transforms.Compose(video_transform_list)
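
As a quick sanity check, the composed transform can be applied directly to a clip represented as a list of frames. The sketch below assumes the clip is a list of PIL images (H x W x C numpy arrays also work); the exact output layout of ClipToTensor may vary slightly between versions:

import numpy as np
from PIL import Image

# A dummy clip of 16 random RGB frames (sizes here are arbitrary, for illustration only)
clip = [Image.fromarray(np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8))
        for _ in range(16)]

clip_tensor = transforms(clip)  # rotation and crop applied to every frame, then conversion to a tensor
print(clip_tensor.shape)        # roughly (C, T, H, W), e.g. (3, 16, 200, 200)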

EKT-NLVL_vidcaps

Purpose: extract 2D frame-level features for video datasets such as Charades and ActivityNet with a pretrained 2D CNN (e.g., ResNet; the modified script below uses InceptionResNetV2).

Installation

git clone https://github.com/carpedkm/EKT-NLVL_vidcaps

Usage (Charades as an example)

Target file: ./EKT-NLVL_vidcaps/misc/extract_feats_2D_charades.py

In the original version the input is the raw video, so the script also has to convert the video into frames and sample them. Since my input is clips that have already been converted into frames, I heavily modified the code:

import cv2
import imageio
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
device = 'cuda:0'
import pretrainedmodels
model_name = 'inceptionresnetv2'
import torchvision.transforms as trn
import torch
import argparse
import process_anno
from tqdm import tqdm
import json

from PIL import Image

class Identity(torch.nn.Module):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        return x

def extract_feats(file_path, filenames, frame_num, batch_size, save_path, anno):
    """Extract 2D features (saved as .npy) for the sampled frames of each clip."""
    net = pretrainedmodels.__dict__[model_name](num_classes=1001, pretrained='imagenet+background')
    # Drop the classification head so the network outputs pooled features
    net.last_linear = Identity()

    net.eval()
    net.cuda(device)
    transform = trn.Compose([trn.ToPILImage(),
        trn.Resize((299, 299)),  # 299 for InceptionResNetV2, 224 for ResNet
        trn.ToTensor(),
        trn.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])  # alternatively: trn.Normalize(net.mean, net.std)

    print("inceptionresnetv2 network loaded")

    # Read frames and extract features in batches
    cnt = 0
    feat = {}
    for fname in tqdm(filenames):
        imgs_dir = os.path.join(file_path, fname)
        bd_info = anno[fname]
        feat[fname] = {}
        for bd_ in bd_info:
            # Each annotation entry is (start frame, end frame, number of frames to sample)
            st_fr = int(bd_[0])
            end_fr = int(bd_[1])
            window = int(bd_[2])

            # Sample `window` frame indices evenly spaced between start and end
            idx = np.linspace(st_fr, end_fr, window)
            idx = np.round(idx).astype(int)
            opened_imgs = []
            for i in idx:
                img = transform(np.array(Image.open(os.path.join(imgs_dir, 'img_' + str(i).zfill(6) + '.jpg'))))
                opened_imgs.append(img.unsqueeze(0))
            curr_frames = torch.cat(opened_imgs, dim=0)
            curr_feats = []
            # Iterate over the sampled frames (there are `window` of them, not end_fr - st_fr)
            for i in range(0, curr_frames.size(0), batch_size):
                curr_batch = curr_frames[i:i + batch_size, :, :, :].cuda(device)
                out = net(curr_batch)
                curr_feats.append(out.detach().cpu())
            curr_feats = torch.cat(curr_feats, 0)
            del out
            feat[fname][str(st_fr) + '_' + str(end_fr)] = curr_feats.numpy()
            del curr_feats
            cnt += 1

    np.save(save_path, feat)
    print(cnt)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--file_path', type=str, default='/data2/wangyan/data/')  # /saat/Charades_for_SAAT
    parser.add_argument('--dataset_name', type=str, default='Charades')
    parser.add_argument('--frame_per_video', type=int, default=28)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--expnum', type=str, default='4')
    opt = parser.parse_args()
    with open('/data2/wangyan/tools/EKT-NLVL_vidcaps/charades_clip_info.json', 'r') as f:
        anno_info = json.load(f)
    anno, train_keys = anno_info, list(anno_info.keys())

    save_path = os.path.join(opt.file_path, 'charades_feature_2d.npy')

    read_in_path = '/data/wangyan/Charades-STA/Charades_v1_rgb/'
    extract_feats(read_in_path, train_keys, opt.frame_per_video, opt.batch_size, save_path, anno)
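
Since np.save here stores a Python dict, loading the result later needs allow_pickle=True plus .item(). A minimal sketch using the paths above (the video id and boundary key are hypothetical examples):

import numpy as np

feats = np.load('/data2/wangyan/data/charades_feature_2d.npy', allow_pickle=True).item()
# feats: {video_id: {"startframe_endframe": array of shape (window, feature_dim)}}
clip_feat = feats['SOME_VIDEO_ID']['12_140']  # hypothetical keys
print(clip_feat.shape)  # feature_dim should be 1536 for InceptionResNetV2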

ig65m-pytorch

Purpose: extract motion features (in units of 32-frame clips) for video datasets such as Charades with an R(2+1)D model pretrained on IG-65M.

Installation

git clone https://github.com/moabitcoin/ig65m-pytorch

Usage

Target file: ig65m-pytorch/ig65m/extract.py

In the original version the input is the raw video, so the script also has to convert the video into frames and sample them. Since my input is clips that have already been converted into frames, I heavily modified the code:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import sys, argparse, json

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from torchvision.transforms import Compose

import numpy as np
from einops.layers.torch import Rearrange, Reduce
from tqdm import tqdm
sys.path.append('/data2/wangyan/tools/ig65m-pytorch/')
from ig65m.models import r2plus1d_34_32_ig65m
from ig65m.datasets import VideoDataset
from ig65m.transforms import ToTensor, Resize, Normalize

from pathlib import Path
from PIL import Image


class VideoModel(nn.Module):
    def __init__(self, pool_spatial="mean", pool_temporal="mean"):
        super().__init__()

        self.model = r2plus1d_34_32_ig65m(num_classes=359, pretrained=True, progress=True)

        self.pool_spatial = Reduce("n c t h w -> n c t", reduction=pool_spatial)
        self.pool_temporal = Reduce("n c t -> n c", reduction=pool_temporal)

    def forward(self, x):
        x = self.model.stem(x)
        x = self.model.layer1(x)
        x = self.model.layer2(x)
        x = self.model.layer3(x)
        x = self.model.layer4(x)

        x = self.pool_spatial(x)
        x = self.pool_temporal(x)

        return x


def main(args):
    if torch.cuda.is_available():
        print(" Running on GPU(s)", file=sys.stderr)
        device = torch.device("cuda:0")
        torch.backends.cudnn.benchmark = True
    else:
        print(" Running on CPU(s)", file=sys.stderr)
        device = torch.device("cpu")

    model = VideoModel(pool_spatial=args.pool_spatial,
                       pool_temporal=args.pool_temporal)

    model.eval()

    for params in model.parameters():
        params.requires_grad = False

    model = model.to(device)
    model = nn.DataParallel(model)

    transform = Compose([
        ToTensor(),
        Rearrange("t h w c -> c t h w"),
        Resize(args.frame_size),
        Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),
    ])

    with open(args.anno, 'r') as f:
        anno_info = json.load(f)
    anno, train_keys = anno_info, list(anno_info.keys())
    read_in_path = args.video
    feat = {}
    for fname in tqdm(train_keys):
        imgs_dir = os.path.join(read_in_path, fname)
        bd_info = anno[fname]
        feat[fname] = {}
        for bd_ in bd_info:
            st_fr = int(bd_[0])
            end_fr = int(bd_[1])
            window = int(bd_[2])
            idx = np.linspace(st_fr, end_fr, window)
            idx = np.round(idx).astype(int)
            opened_imgs = []
            for i in idx:
                img = np.array(Image.open(os.path.join(imgs_dir, 'img_' + str(i).zfill(6) + '.jpg')))
                opened_imgs.append(np.expand_dims(img, 0))
            curr_frames = np.concatenate(opened_imgs, axis=0)
            curr_frames = transform(curr_frames)
            curr_feats = []
            # Slice the clip along the time axis (curr_frames is laid out as c t h w);
            # iterate over the `window` sampled frames, not end_fr - st_fr
            for i in range(0, curr_frames.size(1), args.batch_size):
                curr_batch = curr_frames[:, i:i + args.batch_size, :, :].to(device)
                out = model(curr_batch.unsqueeze(0))
                curr_feats.append(out.detach().cpu())
            curr_feats = torch.cat(curr_feats, 0)
            del out
            feat[fname][str(st_fr) + '_' + str(end_fr)] = curr_feats.numpy()
            del curr_feats

    np.save(args.features, feat)
    print(" Done", file=sys.stderr)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--anno', default='/data2/wangyan/tools/EKT-NLVL_vidcaps/charades_clip_info.json', help="json file with clip boundary annotations")
    parser.add_argument('--video', default='/data/wangyan/Charades-STA/Charades_v1_rgb/', help="directory containing the extracted video frames")
    parser.add_argument('--features', default='/data2/wangyan/data/charades_feature_motion.npy', help="output .npy file for the extracted features")
    parser.add_argument('--pool_spatial', type=str, default="mean")
    parser.add_argument('--pool_temporal', type=str, default="mean")
    parser.add_argument('--frame_size', default=(112, 112))
    parser.add_argument('--batch_size', type=int, default=32)
    opt = parser.parse_args()
    main(args=opt)
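
As a shape sanity check (my own sketch, not part of the repository): the IG-65M R(2+1)D-34 weights were trained on 32-frame clips at 112x112, and the VideoModel wrapper above pools the backbone output down to one vector per clip:

import torch

model = VideoModel()  # spatial and temporal mean pooling
model.eval()
dummy_clip = torch.randn(1, 3, 32, 112, 112)  # (batch, channels, frames, height, width)
with torch.no_grad():
    feat = model(dummy_clip)
print(feat.shape)  # expected to be (1, 512) for R(2+1)D-34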

Lessons learned from video feature extraction

经过以上两个项目的学习,一个很大的收获是:可以无需构造datasetdataloader。相对图像而言,视频占用显存过大,这往往会给贫穷的深度学习学生带来困扰。我们可以选择,以某个帧数为单位片段,每次放入一个单位片段进行处理,这样就不用构造datasetdataloader,只需要将单位片段构造为tensor即可
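
A minimal sketch of this pattern, assuming frames are stored as img_000001.jpg, img_000002.jpg, ... in one directory per video, and that transform and model are defined as in the scripts above (all names here are illustrative):

import os
import torch
from PIL import Image

def iter_clip_tensors(frame_dir, transform, clip_len=32):
    """Yield one (C, T, H, W) tensor per fixed-length clip, with no Dataset/DataLoader."""
    frame_files = sorted(os.listdir(frame_dir))
    for start in range(0, len(frame_files), clip_len):
        frames = [transform(Image.open(os.path.join(frame_dir, f)))
                  for f in frame_files[start:start + clip_len]]
        yield torch.stack(frames, dim=1)  # stack per-frame (C, H, W) tensors along time

# usage (hypothetical paths and model):
# for clip in iter_clip_tensors('/path/to/frames', transform):
#     with torch.no_grad():
#         feat = model(clip.unsqueeze(0).cuda())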

pytorch-pos-tagging

I studied this project in detail before and summarized it in my blog post "PoS Tagging代码学习与应用" (PoS tagging code study and application).

Purpose: use a PoS-tagging model to tag the text of vision-language datasets such as Charades-STA, then use pretrained GloVe vectors to extract word embeddings for the target parts of speech (e.g., nouns and verbs).

Installation

git clone https://github.com/bentrevett/pytorch-pos-tagging

Usage

Add a file pos_vmr.py with the following content:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import numpy as np
from pos_train import BiLSTMPOSTagger
import torch
from torchtext.legacy import data
import spacy
from torch.utils.data import Dataset
from torchtext.legacy import datasets
nlp = spacy.load('en_core_web_sm')
from torch.utils.data import DataLoader
import itertools
glove = np.load('/data/wangyan/dataset/data/6B.300d.npy', allow_pickle=True).tolist()
from tqdm import tqdm

class Charades_Text(Dataset):
    def __init__(self, annotation_file, text_field):
        self.info = [x.strip().split('##')[0] for x in open(annotation_file)]
        self.data = [x.strip().split('##')[-1] for x in open(annotation_file)]
        self.text_field = text_field
        # self.data = []
        # for x in open(annotation_file):
        #     sentence = x.strip().split('##')[-1]
        #     self.data.append(token.text for token in nlp(sentence))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        self.tokens = [token.text for token in nlp(self.data[idx])]
        self.tokens = [t.lower() for t in self.tokens]
        # word2id: map each token to its vocabulary index
        numericalized_tokens = [self.text_field.vocab.stoi[t] for t in self.tokens]
        # index of the unknown token
        unk_idx = self.text_field.vocab.stoi[self.text_field.unk_token]
        # convert the ids to a tensor
        # token_tensor: (sentence_len)
        token_tensor = torch.LongTensor(numericalized_tokens)
        # (sentence_len, 1), moved to GPU if needed
        # token_tensor = token_tensor.unsqueeze(-1).to(device)
        data_torch = {'info': self.info[idx], 'data': token_tensor, 'token': self.tokens}
        return data_torch

def text_process(batch):
    re = {}
    re['info'] = []
    re['data'] = []
    re['token'] = []
    for item in batch:
        re['info'].append(item['info'])
        re['data'].append(item['data'])
        re['token'].append(item['token'])
    re['data'] = torch.nn.utils.rnn.pad_sequence(re['data'], batch_first=False, padding_value=1)
    return re

if __name__=='__main__':
    TEXT = data.Field(lower=True)
    UD_TAGS = data.Field(unk_token=None)
    PTB_TAGS = data.Field(unk_token=None)
    # fields passes field information to the dataset
    fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))
    # Load the train / validation / test splits from datasets.UDPOS
    # Each UDPOS line looks like "years  NOUN  NNS": a word plus its two PoS tags
    # Split sizes: train 12543, validation 2002, test 2077
    train_data, valid_data, test_data = datasets.UDPOS.splits(fields)
    # Only words appearing at least twice go into the vocab (the rest are treated as unknown)
    MIN_FREQ = 2
    TEXT.build_vocab(train_data,
                     min_freq=MIN_FREQ,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)
    UD_TAGS.build_vocab(train_data)

    charades_data = Charades_Text('./charades/charades_sta_train.txt', TEXT)
    charades_dataloader = DataLoader(charades_data, batch_size=128, collate_fn=text_process)

    device = 'cuda:0'
    INPUT_DIM = len(TEXT.vocab)  # 8866
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 128
    OUTPUT_DIM = len(UD_TAGS.vocab)  # 18
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.25
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # 1

    model = BiLSTMPOSTagger(INPUT_DIM,
                            EMBEDDING_DIM,
                            HIDDEN_DIM,
                            OUTPUT_DIM,
                            N_LAYERS,
                            BIDIRECTIONAL,
                            DROPOUT,
                            PAD_IDX)
    model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
    model.cuda(device=device)
    model.eval()

    save_path = '/data2/wangyan/data/charades_sent_ov_feat.npy'
    save_dict = {}
    for batch in tqdm(charades_dataloader, desc='Pos Extracting', total=len(charades_dataloader)):
        batch['data'] = batch['data'].cuda(device=device)
        predictions = model(batch['data'])
        # get the predicted tag indices
        top_predictions = predictions.argmax(-1).transpose(0, 1)
        # convert the indices back to readable tags
        predicted_tags = []
        for idx in range(0, len(top_predictions)):
            predicted_tags.append([UD_TAGS.vocab.itos[t.item()] for t in top_predictions[idx]])

        for a, item in enumerate(predicted_tags):
            info = batch['info'][a]
            save_dict[info] = []
            o_f = torch.zeros(300)
            v_f = torch.zeros(300)
            o_cnt = 0
            v_cnt = 0
            for b, label in enumerate(item):
                # stop at punctuation, or when the padded predictions run past the real tokens
                if label == 'PUNCT' or b >= len(batch['token'][a]):
                    break
                elif label =='NOUN':
                    try:
                        o_f += glove[batch['token'][a][b]]
                    except KeyError:
                        o_f += np.random.randn(300, )
                    o_cnt += 1
                elif label == 'VERB':
                    try:
                        v_f += glove[batch['token'][a][b]]
                    except KeyError:
                        v_f += np.random.randn(300, )
                    v_cnt += 1
            # average the embeddings; guard against sentences with no nouns or no verbs
            o_f = o_f / max(o_cnt, 1)
            v_f = v_f / max(v_cnt, 1)
            save_dict[info].append(o_f)
            save_dict[info].append(v_f)

    np.save(save_path, save_dict)

skip-thoughts

Purpose: use the skip-thoughts model to extract sentence-level features from the text of vision-language datasets such as Charades-STA.

Because I also bundle in other information needed for my training samples, the code below has to be adapted to whatever your own task requires.

Installation

git clone https://github.com/ryankiros/skip-thoughts

Note that this project has to run under Python 2.7 and Theano 0.7.

Download the following model files:

wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl
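
If I remember the repository correctly, skipthoughts.py has two path variables near the top that must point to the directory containing the files above (the paths below are placeholders from my own setup):

# in skipthoughts.py, adjust to wherever you downloaded the files:
path_to_models = '/data2/wangyan/tools/skip-thoughts/models/'
path_to_tables = '/data2/wangyan/tools/skip-thoughts/models/'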

Usage

Add a file encoder_test.py with the following content:

import pickle

import numpy as np
from tqdm import tqdm

import skipthoughts
import os
import numpy

'''
calculate temporal intersection over union
'''
# i0/i1: a tuple of (start, end)
def calculate_IoU(i0, i1):
    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
    iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou

'''
calculate the non-intersection part over length ratio; make sure the input IoU is larger than 0
'''
def calculate_nIoL(base, sliding_clip):
    inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1]))
    inter_l = inter[1]-inter[0]
    length = sliding_clip[1]-sliding_clip[0]
    nIoL = 1.0*(length-inter_l)/length
    return nIoL

if __name__=='__main__':
    model = skipthoughts.load_model()
    encoder = skipthoughts.Encoder(model)
    # read sliding windows, and match them with the groundtruths to make training samples
    label_path = "/data2/wangyan/baseline/TALL_pytorch/charades_sta_test.txt"
    sliding_clip_path = "/data2/wangyan/data/charades_c3dfeature_separate_py2/"
    data_path = '/data/wangyan/Charades-STA/Charades_v1_rgb/'
    clip_sentence_pairs_iou = []
    save_path = '/data2/wangyan/data/charades_testing_samples.npy'
    with open(label_path) as file:
        label = file.readlines()
        for line in tqdm(label, desc='Testing Samples Collecting: ', total=len(label)):
            info, sent = line.split('##')
            # x = ['Hello world']
            sent = [sent.split('\n')[0]]
            s_feat = encoder.encode(sent)
            vid, ss, es = info.split(" ")
            sliding_file = os.path.join(sliding_clip_path, vid+'.npy')
            sliding_clip = numpy.load(sliding_file, allow_pickle=True)
            movie_path = os.path.join(data_path, vid)
            img_len = len(os.listdir(movie_path))
            fps = img_len/float(sliding_clip[0]['video_length'])
            for clip in sliding_clip:
                sf, ef = clip['clip_info'].split("-")
                sf = int(sf)
                ef = int(ef)
                # ground-truth start/end converted from seconds to frame indices
                g_sf = float(ss) * fps
                g_ef = float(es) * fps
                iou = calculate_IoU((sf, ef), (g_sf, g_ef))
                if iou > 0.5:
                    nIoL = calculate_nIoL((g_sf, g_ef), (sf, ef))
                    if nIoL < 0.15:
                        start_offset = g_sf - sf
                        end_offset = g_ef - ef
                        pairs = {}
                        pairs['v_feat'] = clip['v_glob_feature']
                        pairs['s_feat'] = s_feat
                        pairs['fps'] = fps
                        pairs['vid'] = vid
                        pairs['s_off'] = start_offset
                        pairs['e_off'] = end_offset
                        pairs['clip_name'] = str(clip['clip_info'])
                        clip_sentence_pairs_iou.append(pairs)
    num_samples_iou = len(clip_sentence_pairs_iou)
    np.save(save_path, clip_sentence_pairs_iou)
    print num_samples_iou
