网络层 | 卷积核 | padding(conv) / stride(pool) | 输入 | 输出 | 激活函数 |
---|---|---|---|---|---|
conv1 | $64*3*3*3$ | $1*1*1$ | $3,16,112,112$ | $64,16,112,112$ | ReLU |
pool1 | $1*2*2$ | $1*2*2$ | $64,16,112,112$ | $64,16,56,56$ | |
conv2 | $128*3*3*3$ | $1*1*1$ | $64,16,56,56$ | $128,16,56,56$ | ReLU |
pool2 | $2*2*2$ | $2*2*2$ | $128,16,56,56$ | $128,8,28,28$ | |
conv3a | $256*3*3*3$ | $1*1*1$ | $128,8,28,28$ | $256,8,28,28$ | ReLU |
conv3b | $256*3*3*3$ | $1*1*1$ | $256,8,28,28$ | $256,8,28,28$ | ReLU |
pool3 | $2*2*2$ | $2*2*2$ | $256,8,28,28$ | $256,4,14,14$ | |
conv4a | $512*3*3*3$ | $1*1*1$ | $256,4,14,14$ | $512,4,14,14$ | ReLU |
conv4b | $512*3*3*3$ | $1*1*1$ | $512,4,14,14$ | $512,4,14,14$ | ReLU |
pool4 | $2*2*2$ | $2*2*2$ | $512,4,14,14$ | $512,2,7,7$ | |
conv5a | $512*3*3*3$ | $1*1*1$ | $512,2,7,7$ | $512,2,7,7$ | ReLU |
conv5b | $512*3*3*3$ | $1*1*1$ | $512,2,7,7$ | $512,2,7,7$ | ReLU |
pool5 | $2*2*2$ | $2*2*2$,$0*1*1$(padding) | $512,2,7,7$ | $512,1,4,4$ | |
view | | | $512,1,4,4$ | $1,8192$ | |
fc6 | | | $1,8192$ | $1,4096$ | ReLU + dropout $0.5$ |
fc7 | | | $1,4096$ | $1,4096$ | ReLU + dropout $0.5$ |
fc8 | | | $1,4096$ | $1,487$ | softmax |
predict.py 代码:
""" How to use C3D network. """
import numpy as np
import torch
from torch.autograd import Variable
from os.path import join
# python标准库中的一个重要模块,主要用来查找符合特定规则(通配符)的目录和文件
from glob import glob
# 用于导入和处理视频的帧,因此项目的输入是视频提好的帧
import skimage.io as io
from skimage.transform import resize
from C3D_model import C3D
def get_sport_clip(clip_name, verbose=True):
    """
    Loads a clip to be fed to C3D for classification.

    Every '*.png' frame under 'data/<clip_name>' is read in sorted filename
    order, resized to 112 x 200, cropped centrally to 112 x 112 and stacked
    into a single batch.

    TODO: should I remove mean here?

    Parameters
    ----------
    clip_name: str
        the name of the clip (subfolder in 'data'), e.g. 'roger'.
    verbose: bool
        if True, shows the unrolled clip (default is True).

    Returns
    -------
    Tensor
        a float32 pytorch batch (n, ch, fr, h, w) with n == 1,
        i.e. (batch_size, channels, frames, height, width).

    Raises
    ------
    FileNotFoundError
        if no '*.png' frame is found under 'data/<clip_name>'.
    """
    clip_dir = join('data', clip_name)
    # glob returns every path matching the pattern as a list: data/<clip_name>/*.png
    frame_paths = sorted(glob(join(clip_dir, '*.png')))
    if not frame_paths:
        # Without this guard the crop below fails with an obscure IndexError.
        raise FileNotFoundError(
            "no '*.png' frames found in {}".format(clip_dir))

    # Each frame becomes a (112, 200, 3) array, giving (frames, 112, 200, 3).
    # The demo feeds 16 frames per clip (X: 1, 3, 16, 112, 112), so for
    # sliding-window feature extraction each window's frames should be
    # prepared in their own folder (like 'roger' here).
    clip = np.array([
        resize(io.imread(path), output_shape=(112, 200), preserve_range=True)
        for path in frame_paths
    ])

    # Keep the central 112 of the 200 columns.
    # NOTE(review): the original note linked this to the paper's jittering,
    # but this crop is fixed and deterministic -- confirm.
    clip = clip[:, :, 44:44 + 112, :]  # crop centrally

    # Shows all frames side by side as one strip; disable it when processing
    # many videos or when no visualisation is needed.
    if verbose:
        # Use the real frame count instead of a hard-coded 16 so that clips
        # of any length can be previewed.
        num_frames = clip.shape[0]
        clip_img = np.reshape(clip.transpose(1, 0, 2, 3),
                              (112, num_frames * 112, 3))
        io.imshow(clip_img.astype(np.uint8))
        io.show()

    # Reorder to channels-first, add the batch axis and convert to a tensor.
    clip = clip.transpose(3, 0, 1, 2)  # ch, fr, h, w
    clip = np.expand_dims(clip, axis=0)  # batch axis
    clip = np.float32(clip)
    return torch.from_numpy(clip)
def read_labels_from_file(filepath):
    """
    Reads Sport1M labels from file

    Parameters
    ----------
    filepath: str
        path of a text file holding one label per line.

    Returns
    -------
    list
        list of sport names in file order, each with surrounding whitespace
        stripped. Blank lines are kept as '' so that list positions keep
        matching line numbers.
    """
    with open(filepath, 'r') as f:
        # Iterate the handle directly instead of materialising readlines().
        return [line.strip() for line in f]
def main():
    """
    Main function.

    Loads the 'roger' demo clip, runs it through C3D with the weights from
    'c3d.pickle' and prints the five highest-scoring labels from
    'labels.txt'.
    """
    # Run on the GPU when one is visible, otherwise fall back to the CPU
    # instead of crashing inside .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # load a clip to be predicted
    # X: batch_size, channels, frames, height, width
    # X: 1, 3, 16, 112, 112
    X = get_sport_clip('roger').to(device)

    # get network pretrained model
    # NOTE: torch.load unpickles the file -- only load weights you trust.
    net = C3D()
    net.load_state_dict(torch.load('c3d.pickle', map_location=device))
    net.to(device)
    net.eval()

    # perform prediction
    # prediction: 1, 487 (one score per class, 487 classes)
    # For feature extraction the steps after the forward pass are not
    # needed; taking the fc6 output is enough.
    with torch.no_grad():  # inference only: do not record the autograd graph
        prediction = net(X)
    prediction = prediction.cpu().numpy()

    # read labels
    labels = read_labels_from_file('labels.txt')

    # print top predictions
    top_inds = prediction[0].argsort()[::-1][:5]  # reverse sort and take five largest items
    print('\nTop 5:')
    for i in top_inds:
        print('{:.5f} {}'.format(prediction[0][i], labels[i]))


# entry point
if __name__ == '__main__':
    main()
根据视频名称、窗口值 $[64,128,256,512]$ 和滑动值 $[13,26,51,102]$,准备好视频的输入帧
修改网络输出,只要fc6的输出来表示视频特征
输出为 .npy 格式的文件,存入特征及其他属性
最终完成代码如下:
""" How to use C3D network. """
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import torch
from torch.autograd import Variable
from os.path import join
from glob import glob
import skimage.io as io
from skimage.transform import resize
from C3D_model import C3D
import csv
from tqdm import tqdm
def get_sport_clip(clip_name, verbose=True):
    """
    Loads a clip to be fed to C3D for classification.

    Every '*.png' frame under 'data/<clip_name>' is read in sorted filename
    order, resized to 112 x 200, cropped centrally to 112 x 112 and stacked
    into a single batch.

    TODO: should I remove mean here?

    Parameters
    ----------
    clip_name: str
        the name of the clip (subfolder in 'data').
    verbose: bool
        if True, shows the unrolled clip (default is True).

    Returns
    -------
    Tensor
        a float32 pytorch batch (n, ch, fr, h, w) with n == 1.

    Raises
    ------
    FileNotFoundError
        if no '*.png' frame is found under 'data/<clip_name>'.
    """
    clip_dir = join('data', clip_name)
    frame_paths = sorted(glob(join(clip_dir, '*.png')))
    if not frame_paths:
        # Without this guard the crop below fails with an obscure IndexError.
        raise FileNotFoundError(
            "no '*.png' frames found in {}".format(clip_dir))

    # (frames, 112, 200, 3) after resizing every frame
    clip = np.array([
        resize(io.imread(path), output_shape=(112, 200), preserve_range=True)
        for path in frame_paths
    ])
    clip = clip[:, :, 44:44 + 112, :]  # crop centrally

    if verbose:
        # Use the real frame count instead of a hard-coded 16 so that clips
        # of any length can be previewed as one horizontal strip.
        num_frames = clip.shape[0]
        clip_img = np.reshape(clip.transpose(1, 0, 2, 3),
                              (112, num_frames * 112, 3))
        io.imshow(clip_img.astype(np.uint8))
        io.show()

    clip = clip.transpose(3, 0, 1, 2)  # ch, fr, h, w
    clip = np.expand_dims(clip, axis=0)  # batch axis
    clip = np.float32(clip)
    return torch.from_numpy(clip)
def get_candidate_clip(file_path, start_frame, end_frame, window_size, verbose=True):
    """
    Loads a range of frames of one video to be fed to C3D.

    Frames are read from '<file_path>/img_<NNNNNN>.jpg' for every index in
    range(start_frame, end_frame), resized to 112 x 200, cropped centrally
    to 112 x 112 and stacked into a single batch.

    TODO: should I remove mean here?

    Parameters
    ----------
    file_path: str
        directory holding the extracted frames of the video.
    start_frame: int
        index of the first frame to load (inclusive).
    end_frame: int
        index one past the last frame to load (exclusive).
    window_size: int
        nominal length of the candidate window. Kept for backward
        compatibility; the preview width is derived from the number of
        frames actually loaded, which may be smaller than the window.
    verbose: bool
        if True, shows the unrolled clip (default is True).

    Returns
    -------
    Tensor
        a float32 pytorch batch (n, ch, fr, h, w) with n == 1.

    Raises
    ------
    ValueError
        if the frame range is empty (end_frame <= start_frame).
    """
    if end_frame <= start_frame:
        # Without this guard the crop below fails with an obscure IndexError.
        raise ValueError(
            'empty frame range: start_frame={}, end_frame={}'.format(
                start_frame, end_frame))

    frame_paths = [
        os.path.join(file_path, 'img_{:06d}.jpg'.format(index))
        for index in range(start_frame, end_frame)
    ]
    # (frames, 112, 200, 3) after resizing every frame
    clip = np.array([
        resize(io.imread(path), output_shape=(112, 200), preserve_range=True)
        for path in frame_paths
    ])
    clip = clip[:, :, 44:44 + 112, :]  # crop centrally

    if verbose:
        # Size the strip from the frames actually loaded rather than from
        # window_size, so a truncated window can still be previewed.
        num_frames = clip.shape[0]
        clip_img = np.reshape(clip.transpose(1, 0, 2, 3),
                              (112, num_frames * 112, 3))
        io.imshow(clip_img.astype(np.uint8))
        io.show()

    clip = clip.transpose(3, 0, 1, 2)  # ch, fr, h, w
    clip = np.expand_dims(clip, axis=0)  # batch axis
    clip = np.float32(clip)
    return torch.from_numpy(clip)
def read_labels_from_file(filepath):
    """
    Reads Sport1M labels from file

    Parameters
    ----------
    filepath: str
        path of a text file holding one label per line.

    Returns
    -------
    list
        list of sport names in file order, each with surrounding whitespace
        stripped. Blank lines are kept as '' so that list positions keep
        matching line numbers.
    """
    with open(filepath, 'r') as f:
        # Iterate the handle directly instead of materialising readlines().
        return [line.strip() for line in f]
def main():
    """
    Main function.

    For every video listed in the annotation CSV whose frame directory
    exists, slides windows of several lengths over the frames, runs each
    window through C3D and stores the resulting feature together with the
    video name, the frame range and the annotated video length. All
    candidates are written to 'candidate.npy' at the end.
    """
    anno_path = '../Charades_v1_train.csv'
    frame_path = '/data/wangyan/Charades-STA/Charades_v1_rgb/'
    # window_size[k] frames are extracted every sliding_size[k] frames
    window_size = [64, 128, 256, 512]
    sliding_size = [13, 26, 51, 102]
    save_path = 'candidate.npy'
    save_list = []

    # Run on the GPU when one is visible, otherwise fall back to the CPU
    # instead of crashing inside .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # get network pretrained model
    # NOTE: torch.load unpickles the file -- only load weights you trust.
    net = C3D()
    net.load_state_dict(torch.load('c3d.pickle', map_location=device))
    net.to(device)
    net.eval()

    # newline='' is what the csv module expects for files passed to reader()
    with open(anno_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue  # blank line in the CSV
            video_name = row[0]
            file_path = os.path.join(frame_path, video_name)
            if not os.path.exists(file_path):
                # Also skips the header row, unless a directory named after
                # its first field happens to exist.
                continue
            video_length = row[-1]
            # assumes the directory holds only frames named img_000001.jpg
            # .. img_<img_len>.jpg (1-indexed) -- TODO confirm
            img_len = len(os.listdir(file_path))

            for window, stride in zip(window_size, sliding_size):
                # "+ 2" so that the last full window, which starts at frame
                # img_len - window + 1 and ends on frame img_len, is included.
                for point in range(1, img_len - window + 2, stride):
                    end_frame = min(img_len + 1, point + window)  # exclusive
                    X = get_candidate_clip(file_path, point, end_frame, window, False)
                    X = X.to(device)

                    # perform prediction
                    # The network is expected to return (prediction, feature);
                    # presumably feature is the fc6 activation -- confirm
                    # against C3D_model.
                    with torch.no_grad():  # inference only
                        _, feature = net(X)
                    feature = feature.cpu().numpy()
                    feature = np.mean(feature, axis=0)  # average over the batch axis

                    save_list.append({
                        'video_name': video_name,
                        'clip_info': str(point) + '-' + str(end_frame),
                        'v_glob_feature': feature,
                        'video_length': video_length,
                    })

    # A list of dicts is stored as a pickled object array; reading it back
    # needs np.load(save_path, allow_pickle=True).
    np.save(save_path, save_list)


# entry point
if __name__ == '__main__':
    main()