My recent research work has relied on a number of open-source GitHub projects, most of them related to extracting video features and text features.
Purpose: processes video data by providing video versions of the common PyTorch image transforms, such as Resize, Crop, and ToTensor.
Installation: it can be installed directly as a package:
pip install git+https://github.com/hassony2/torch_videovision
Usage:
from torchvideotransforms import video_transforms, volume_transforms
video_transform_list = [video_transforms.RandomRotation(30),
                        video_transforms.RandomCrop((200, 200)),
volume_transforms.ClipToTensor()]
transforms = video_transforms.Compose(video_transform_list)
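The composed transform is then called on a whole clip at once. A minimal usage sketch, assuming the clip is given as a list of PIL images (ClipToTensor should produce a (C, T, H, W) float tensor):

from PIL import Image

# a fake 16-frame clip; in practice this is the list of decoded frames
clip = [Image.new('RGB', (320, 240)) for _ in range(16)]

clip_tensor = transforms(clip)   # rotation + crop applied to every frame, then ClipToTensor
print(clip_tensor.shape)         # expected: torch.Size([3, 16, 200, 200])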
Purpose: extract frame-level 2D appearance features for video datasets such as Charades and ActivityNet. The modified script below uses InceptionResNetV2 from pretrainedmodels (hence the 299×299 resize); a ResNet can be swapped in by changing model_name and using a 224×224 resize, as noted in the code comments.
Installation:
git clone https://github.com/carpedkm/EKT-NLVL_vidcaps
Usage (taking Charades as an example):
Target file: ./EKT-NLVL_vidcaps/misc/extract_feats_2D_charades.py
In the original version the input is the raw video, so it still has to be decoded into frames and sampled. I instead extract features directly from clips that have already been converted to frames, so I modified the code as follows:
import cv2
import imageio
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
device = 'cuda:0'
import pretrainedmodels
model_name = 'inceptionresnetv2'
import torchvision.transforms as trn
import torch
import argparse
import process_anno
from tqdm import tqdm
import json
from PIL import Image
class Identity(torch.nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
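# Replacing net.last_linear with Identity() below makes the network return the pooled
# backbone feature (1536-d for InceptionResNetV2) instead of the 1001-way class logits.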
def extract_feats(file_path, filenames, frame_num, batch_size, save_path, anno):
"""Extract 2D features (saved in .npy) for frames in a video."""
net = pretrainedmodels.__dict__[model_name](num_classes=1001, pretrained='imagenet+background')
net.last_linear = Identity()
net.eval()
net.cuda(device)
transform = trn.Compose([trn.ToPILImage(),
trn.Resize((299, 299)), # 299 for IRV2 # 224 for ResNet
trn.ToTensor(),
trn.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])])#trn.Normalize(net.mean, net.std)])
print("inceptionresnetv2 Network loaded")
#Read videos and extract features in batches
cnt = 0
feat = {}
for fname in tqdm(filenames):
imgs_dir = os.path.join(file_path, fname)
bd_info = anno[fname]
feat[fname] = {}
for bd_ in bd_info:
st_fr = int(bd_[0])
end_fr = int(bd_[1])
window = int(bd_[2])
idx = np.linspace(st_fr, end_fr, window)
idx = np.round(idx).astype(int)
opened_imgs = []
for i in idx:
img = transform(np.array(Image.open(os.path.join(imgs_dir, 'img_' + str(i).zfill(6) + '.jpg'))))
opened_imgs.append(img.unsqueeze(0))
curr_frames = torch.cat(opened_imgs, dim=0)
curr_feats = []
            for i in range(0, curr_frames.shape[0], batch_size):  # iterate over the sampled frames
curr_batch = curr_frames[i:i+batch_size,:,:,:].cuda(device)
out = net(curr_batch)
curr_feats.append(out.detach().cpu())
curr_feats = torch.cat(curr_feats, 0)
del out
feat[fname][str(st_fr)+'_'+str(end_fr)] = curr_feats.numpy()
del curr_feats
cnt += 1
np.save(save_path, feat)
print(cnt)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--file_path', type=str, default='/data2/wangyan/data/') # /saat/Charades_for_SAAT
parser.add_argument('--dataset_name', type=str, default='Charades') # Charades
parser.add_argument('--frame_per_video', type=int, default=28)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--expnum', type=str, default='4')
opt = parser.parse_args()
with open('/data2/wangyan/tools/EKT-NLVL_vidcaps/charades_clip_info.json', 'r') as f:
anno_info = json.load(f)
anno, train_keys = anno_info, list(anno_info.keys())
save_path = os.path.join(opt.file_path, 'charades_feature_2d.npy')
read_in_path = '/data/wangyan/Charades-STA/Charades_v1_rgb/'
extract_feats(read_in_path, train_keys, opt.frame_per_video, opt.batch_size, save_path, anno)
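The script stores a nested dict in a single .npy file, so loading it back requires allow_pickle and .item(). A small sketch, reusing the save path above:

import numpy as np

feat = np.load('/data2/wangyan/data/charades_feature_2d.npy', allow_pickle=True).item()
# feat[video_id]['<start>_<end>'] should be an (n_sampled_frames, 1536) array of InceptionResNetV2 features
print(len(feat))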
Purpose: extract motion features (in units of 32-frame clips) for video datasets such as Charades, using an R(2+1)D model pre-trained on IG-65M.
Installation:
git clone https://github.com/moabitcoin/ig65m-pytorch
Usage:
Target file: ig65m-pytorch/ig65m/extract.py
In the original version the input is the raw video, so it still has to be decoded into frames and sampled. I instead extract features directly from clips that have already been converted to frames, so I modified the code as follows:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import sys, argparse, json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.transforms import Compose
import numpy as np
from einops.layers.torch import Rearrange, Reduce
from tqdm import tqdm
sys.path.append('/data2/wangyan/tools/ig65m-pytorch/')
from ig65m.models import r2plus1d_34_32_ig65m
from ig65m.datasets import VideoDataset
from ig65m.transforms import ToTensor, Resize, Normalize
from pathlib import Path
from PIL import Image
class VideoModel(nn.Module):
def __init__(self, pool_spatial="mean", pool_temporal="mean"):
super().__init__()
self.model = r2plus1d_34_32_ig65m(num_classes=359, pretrained=True, progress=True)
self.pool_spatial = Reduce("n c t h w -> n c t", reduction=pool_spatial)
self.pool_temporal = Reduce("n c t -> n c", reduction=pool_temporal)
def forward(self, x):
x = self.model.stem(x)
x = self.model.layer1(x)
x = self.model.layer2(x)
x = self.model.layer3(x)
x = self.model.layer4(x)
x = self.pool_spatial(x)
x = self.pool_temporal(x)
return x
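# With mean pooling over space and time, forward() returns a (batch, 512) clip-level feature for R(2+1)D-34.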
def main(args):
if torch.cuda.is_available():
print(" Running on GPU(s)", file=sys.stderr)
device = torch.device("cuda:0")
torch.backends.cudnn.benchmark = True
else:
print(" Running on CPU(s)", file=sys.stderr)
device = torch.device("cpu")
model = VideoModel(pool_spatial=args.pool_spatial,
pool_temporal=args.pool_temporal)
model.eval()
for params in model.parameters():
params.requires_grad = False
model = model.to(device)
model = nn.DataParallel(model)
transform = Compose([
ToTensor(),
Rearrange("t h w c -> c t h w"),
Resize(args.frame_size),
Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]),
])
with open(args.anno, 'r') as f:
anno_info = json.load(f)
anno, train_keys = anno_info, list(anno_info.keys())
read_in_path = args.video
feat = {}
for fname in tqdm(train_keys):
imgs_dir = os.path.join(read_in_path, fname)
bd_info = anno[fname]
feat[fname] = {}
for bd_ in bd_info:
st_fr = int(bd_[0])
end_fr = int(bd_[1])
window = int(bd_[2])
idx = np.linspace(st_fr, end_fr, window)
idx = np.round(idx).astype(int)
opened_imgs = []
for i in idx:
img = np.array(Image.open(os.path.join(imgs_dir, 'img_' + str(i).zfill(6) + '.jpg')))
opened_imgs.append(np.expand_dims(img, 0))
curr_frames = np.concatenate(opened_imgs, axis=0)
curr_frames = transform(curr_frames)
curr_feats = []
            for i in range(0, curr_frames.shape[1], args.batch_size):  # curr_frames is (C, T, H, W) after Rearrange
curr_batch = curr_frames[:, i:i + args.batch_size, :, :].cuda(device)
out = model(curr_batch.unsqueeze(0))
curr_feats.append(out.detach().cpu())
curr_feats = torch.cat(curr_feats, 0)
del out
feat[fname][str(st_fr) + '_' + str(end_fr)] = curr_feats.numpy()
del curr_feats
np.save(args.features, feat)
print(" Done", file=sys.stderr)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
    parser.add_argument('--anno', default='/data2/wangyan/tools/EKT-NLVL_vidcaps/charades_clip_info.json', help="json with per-video clip boundaries to extract features for")
parser.add_argument('--video', default='/data/wangyan/Charades-STA/Charades_v1_rgb/')
parser.add_argument('--features', default='/data2/wangyan/data/charades_feature_motion.npy')
parser.add_argument('--pool_spatial', type=str, default="mean")
parser.add_argument('--pool_temporal', type=str, default="mean")
parser.add_argument('--frame_size', default=(112, 112))
parser.add_argument('--batch_size', type=int, default=32)
opt = parser.parse_args()
main(args=opt)
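As a quick sanity check of the feature dimension, a dummy 32-frame clip can be pushed through the VideoModel defined above; with mean pooling I would expect a 512-d output (a sketch only; the pretrained weights are downloaded on first use):

import torch

model = VideoModel().eval()
dummy_clip = torch.rand(1, 3, 32, 112, 112)   # (batch, channels, frames, height, width)
with torch.no_grad():
    feat = model(dummy_clip)
print(feat.shape)                             # expected: torch.Size([1, 512])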
A big takeaway from working through these two projects: there is no need to build a dataset or a dataloader. Compared with images, video takes up far more GPU memory, which often troubles deep-learning students on a budget. Instead, we can treat a fixed number of frames as a unit clip and feed one clip through the network at a time; then no dataset or dataloader is required, and each unit clip only has to be assembled into a tensor, as sketched below.
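In code the pattern is just a loop over clips: stack the frames of one clip into a tensor, push it through the frozen network under torch.no_grad(), and move the result back to the CPU. A minimal sketch with a placeholder backbone, transform, and clips dict (the paths and names here are made up; they would come from your own pipeline):

import torch
import torchvision.transforms as T
from torchvision.models import resnet18
from PIL import Image

# placeholder per-frame transform and backbone; replace with your own
transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])
net = resnet18(pretrained=True)
net.fc = torch.nn.Identity()          # return features instead of logits
net.eval().cuda()

# clips: {clip_id: [frame paths]}, built from the annotation file (paths here are hypothetical)
clips = {'clip_0': ['frames/img_000001.jpg', 'frames/img_000002.jpg']}

features = {}
with torch.no_grad():
    for clip_id, frame_paths in clips.items():                      # one unit clip at a time
        frames = [transform(Image.open(p).convert('RGB')) for p in frame_paths]
        clip_tensor = torch.stack(frames).cuda()                    # (T, C, H, W) stays small
        features[clip_id] = net(clip_tensor).cpu()                  # (T, 512) frame features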
I studied this project in detail before and summarized it in my blog post 《PoS Tagging代码学习与应用》.
Purpose: run a POS-tagging model over the text of vision-language datasets such as Charades-STA, and use pretrained GloVe vectors to extract the word embeddings of the target parts of speech (e.g., nouns and verbs).
Installation:
git clone https://github.com/bentrevett/pytorch-pos-tagging
Usage:
Add a file pos_vmr.py and write:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import numpy as np
from pos_train import BiLSTMPOSTagger
import torch
from torchtext.legacy import data
import spacy
from torch.utils.data import Dataset
from torchtext.legacy import datasets
nlp = spacy.load('en_core_web_sm')
from torch.utils.data import DataLoader
import itertools
glove = np.load('/data/wangyan/dataset/data/6B.300d.npy', allow_pickle=True).tolist()
from tqdm import tqdm
class Charades_Text(Dataset):
def __init__(self, annotation_file, text_field):
self.info = [x.strip().split('##')[0] for x in open(annotation_file)]
self.data = [x.strip().split('##')[-1] for x in open(annotation_file)]
self.text_field = text_field
# self.data = []
# for x in open(annotation_file):
# sentence = x.strip().split('##')[-1]
# self.data.append(token.text for token in nlp(sentence))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
self.tokens = [token.text for token in nlp(self.data[idx])]
self.tokens = [t.lower() for t in self.tokens]
        # 3) map the tokens to vocabulary ids (word2id)
numericalized_tokens = [self.text_field.vocab.stoi[t] for t in self.tokens]
        # index used for unknown words
unk_idx = self.text_field.vocab.stoi[self.text_field.unk_token]
        # convert the ids to a tensor (moved to the GPU later, in the main loop)
        # token_tensor: (sent_len)
token_tensor = torch.LongTensor(numericalized_tokens)
        # (sent_len, 1)
# token_tensor = token_tensor.unsqueeze(-1).to(device)
data_torch = {'info': self.info[idx], 'data': token_tensor, 'token': self.tokens}
return data_torch
def text_process(batch):
re = {}
re['info'] = []
re['data'] = []
re['token'] = []
for item in batch:
re['info'].append(item['info'])
re['data'].append(item['data'])
re['token'].append(item['token'])
re['data'] = torch.nn.utils.rnn.pad_sequence(re['data'], batch_first=False, padding_value=1)
return re
if __name__=='__main__':
TEXT = data.Field(lower=True)
UD_TAGS = data.Field(unk_token=None)
PTB_TAGS = data.Field(unk_token=None)
    # fields tell the dataset how to handle each column
fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))
    # load the train / valid / test splits from datasets.UDPOS
    # each line of UDPOS has the form `years NOUN NNS`: a word plus its two POS annotations
    # split sizes: 12543 train / 2002 valid / 2077 test sentences
train_data, valid_data, test_data = datasets.UDPOS.splits(fields)
    # a word must appear at least twice to enter the vocab (otherwise it is treated as unknown)
MIN_FREQ = 2
TEXT.build_vocab(train_data,
min_freq=MIN_FREQ,
vectors="glove.6B.100d",
unk_init=torch.Tensor.normal_)
UD_TAGS.build_vocab(train_data)
charades_data = Charades_Text('./charades/charades_sta_train.txt', TEXT)
charades_dataloader = DataLoader(charades_data, batch_size=128, collate_fn=text_process)
device = 'cuda:0'
INPUT_DIM = len(TEXT.vocab) # 8866
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = len(UD_TAGS.vocab) # 18
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # 1
model = BiLSTMPOSTagger(INPUT_DIM,
EMBEDDING_DIM,
HIDDEN_DIM,
OUTPUT_DIM,
N_LAYERS,
BIDIRECTIONAL,
DROPOUT,
PAD_IDX)
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))
model.cuda(device=device)
model.eval()
save_path = '/data2/wangyan/data/charades_sent_ov_feat.npy'
save_dict = {}
for batch in tqdm(charades_dataloader, desc='Pos Extracting', total=len(charades_dataloader)):
batch['data'] = batch['data'].cuda(device=device)
predictions = model(batch['data'])
        # take the most likely tag id for every token
top_predictions = predictions.argmax(-1).transpose(0, 1)
        # convert the tag ids back to readable tag strings
predicted_tags = []
for idx in range(0, len(top_predictions)):
predicted_tags.append([UD_TAGS.vocab.itos[t.item()] for t in top_predictions[idx]])
for a, item in enumerate(predicted_tags):
info = batch['info'][a]
save_dict[info] = []
o_f = torch.zeros(300)
v_f = torch.zeros(300)
o_cnt = 0
v_cnt = 0
for b, label in enumerate(item):
                if label == 'PUNCT' or b >= len(batch['token'][a]):
break
elif label =='NOUN':
try:
o_f += glove[batch['token'][a][b]]
except KeyError:
o_f += np.random.randn(300, )
o_cnt += 1
elif label == 'VERB':
try:
v_f += glove[batch['token'][a][b]]
except KeyError:
v_f += np.random.randn(300, )
v_cnt += 1
o_f = o_f / o_cnt
v_f = v_f / v_cnt
save_dict[info].append(o_f)
save_dict[info].append(v_f)
np.save(save_path, save_dict)
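What gets saved is a dict mapping each annotation line's info string (video id plus timestamps) to [noun_feature, verb_feature], two averaged 300-d GloVe vectors stored as torch tensors. Loading it back follows the same allow_pickle pattern, for example:

import numpy as np

sent_feat = np.load('/data2/wangyan/data/charades_sent_ov_feat.npy', allow_pickle=True).item()
info = list(sent_feat.keys())[0]
noun_feat, verb_feat = sent_feat[info]    # each should be a 300-d averaged GloVe vector
print(info, noun_feat.shape)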
Purpose: use the skip-thoughts model to extract sentence-level text features for vision-language datasets such as Charades-STA.
Because I also bundle in the other information my training samples need, the code below has to be adapted to whatever your own task requires.
Installation:
git clone https://github.com/ryankiros/skip-thoughts
Note that this project has to run under python 2.7 and theano 0.7.
Download the following model files:
wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl
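Before encoding anything, skipthoughts.py has to know where these files were put. If I remember the repo's README correctly, this is done by editing two path variables near the top of skipthoughts.py (the directories below are placeholders for wherever the wget commands were run):

# in skip-thoughts/skipthoughts.py
path_to_models = '/path/to/downloaded/models/'   # placeholder: directory with the .npz / .npz.pkl files
path_to_tables = '/path/to/downloaded/models/'   # placeholder: directory with dictionary.txt, utable.npy, btable.npy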
Usage:
Add a file encoder_test.py and write:
import pickle
import numpy as np
from tqdm import tqdm
import skipthoughts
import os
import numpy
'''
calculate temporal intersection over union
'''
# i0/i1: a tuple of (start, end)
def calculate_IoU(i0, i1):
union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
return iou
'''
calculate the non-Intersection-over-Length (nIoL) ratio; make sure the input IoU is larger than 0
'''
def calculate_nIoL(base, sliding_clip):
inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1]))
inter_l = inter[1]-inter[0]
length = sliding_clip[1]-sliding_clip[0]
nIoL = 1.0*(length-inter_l)/length
return nIoL
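# Worked example (my own numbers): ground truth (4, 10), sliding clip (4, 11):
#   calculate_IoU  -> inter (4, 10), union (4, 11), IoU = 6/7 ~ 0.86 (> 0.5)
#   calculate_nIoL -> inter_l = 6, clip length = 7, nIoL = 1/7 ~ 0.14 (< 0.15)
# so a pair like this would pass both thresholds used below and be kept as a sample.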
if __name__=='__main__':
model = skipthoughts.load_model()
encoder = skipthoughts.Encoder(model)
# read sliding windows, and match them with the groundtruths to make training samples
label_path = "/data2/wangyan/baseline/TALL_pytorch/charades_sta_test.txt"
sliding_clip_path = "/data2/wangyan/data/charades_c3dfeature_separate_py2/"
data_path = '/data/wangyan/Charades-STA/Charades_v1_rgb/'
clip_sentence_pairs_iou = []
save_path = '/data2/wangyan/data/charades_testing_samples.npy'
with open(label_path) as file:
label = file.readlines()
for line in tqdm(label, desc='Testing Samples Collecting: ', total=len(label)):
info, sent = line.split('##')
# x = ['Hello world']
sent = [sent.split('\n')[0]]
s_feat = encoder.encode(sent)
vid, ss, es = info.split(" ")
sliding_file = os.path.join(sliding_clip_path, vid+'.npy')
sliding_clip = numpy.load(sliding_file, allow_pickle=True)
movie_path = os.path.join(data_path, vid)
img_len = len(os.listdir(movie_path))
fps = img_len/float(sliding_clip[0]['video_length'])
for clip in sliding_clip:
sf, ef = clip['clip_info'].split("-")
sf = int(sf)
ef = int(ef)
            # g_sf / g_ef: ground-truth boundaries converted from seconds to frame indices
g_sf = float(ss) * fps
g_ef = float(es) * fps
iou = calculate_IoU((sf, ef), (g_sf, g_ef))
if iou > 0.5:
nIoL = calculate_nIoL((g_sf, g_ef), (sf, ef))
if nIoL < 0.15:
start_offset = g_sf - sf
end_offset = g_ef - ef
pairs = {}
pairs['v_feat'] = clip['v_glob_feature']
pairs['s_feat'] = s_feat
pairs['fps'] = fps
pairs['vid'] = vid
pairs['s_off'] = start_offset
pairs['e_off'] = end_offset
pairs['clip_name'] = str(clip['clip_info'])
clip_sentence_pairs_iou.append(pairs)
num_samples_iou = len(clip_sentence_pairs_iou)
with open(save_path, 'a'):
np.save(save_path, clip_sentence_pairs_iou)
print num_samples_iou
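Because the samples are pickled by Python 2's numpy, reading them back from a Python 3 training script may need encoding='latin1', roughly:

import numpy as np

samples = np.load('/data2/wangyan/data/charades_testing_samples.npy',
                  allow_pickle=True, encoding='latin1')
# each entry is a dict with v_feat, s_feat (skip-thoughts sentence vector), offsets, etc.
print(len(samples), samples[0]['clip_name'])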