Let's start with the prototxt network configuration, taking the optical-flow stream as an example. The core parts of the prototxt are excerpted below: only the data-input layer and the loss layers are shown, while the middle layers form the BN-Inception structure (omitted).
name: "BN-Inception"
"""
一个批次读入数据的大小为: batch_size, num_segment * new_length * modilty, image_w, image_h
其他的参量下面都有解释,这里说一下modilty,对于光流图像来说,它是灰度图像,但是它存在两个
方向的光流x, y;对于RGB来说,存在三个通道。
"""
layer {
name: "data"
type: "VideoData"
top: "data"
top: "label"
video_data_param {
#Text file; each line describes one video: path to the video's frames, number of frames, class label
source: "data/ucf101_flow_train_split_1.txt"
batch_size: 32
new_length: 5
#The video is evenly divided into num_segments segments; new_length consecutive frames are sampled from each segment
num_segments: 3
#Modality: RGB or optical flow
modality: FLOW
shuffle: true
#Frame naming pattern: %c is filled with the flow direction ('x' or 'y'), %05d with the zero-padded frame index
name_pattern: "flow_%c_%05d.jpg"
}
transform_param{
crop_size: 224
mirror: true
fix_crop: true
more_fix_crop: true
multi_scale: true
max_distort: 1
scale_ratios: [1,.875,.75]
mean_value: 128
is_flow: true
}
include: { phase: TRAIN }
}
layer {
name: "data"
type: "VideoData"
top: "data"
top: "label"
video_data_param {
source: "data/ucf101_flow_val_split_1.txt"
batch_size: 1
new_length: 5
num_segments: 3
modality: FLOW
name_pattern: "flow_%c_%05d.jpg"
}
transform_param{
crop_size: 224
mirror: false
mean_value: 128
is_flow: true
}
include: { phase: TEST }
}
#With the parameter settings above, the data layer outputs a blob of shape (32, 30, 224, 224)
#A Reshape then turns it into (96, 10, 224, 224)
#The goal is to make the second dimension (the channel count) equal to the channels produced
#by one sample from one segment, i.e. new_length * modality_channels = 5 * 2 = 10, which is
#convenient for later testing (the channel dimension determines the weight shapes)
layer { name: "data_reshape" type: "Reshape" bottom: "data" top: "data_reshape" reshape_param { shape { dim: [-1, 10, 224, 224] } }}
"""
BN-Inception Network
"""
####################################### global pool #######################################
# Global average pooling reduces each feature map to 1 x 1
layer { name: "global_pool" top: "global_pool" bottom: "inception_5b/output" type: "Pooling"
pooling_param { pool: AVE kernel_size: 7 stride: 1 } }
layer { name: "dropout" top: "global_pool" bottom: "global_pool" type: "Dropout"
dropout_param { dropout_ratio: 0.7 } }
####################################### loss accuracy #######################################
# The fully connected layer maps the features to 101 classes; the output here is 96 x 101
layer { name: "fc-action" type: "InnerProduct" bottom: "global_pool" top: "fc"
param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 }
inner_product_param { num_output: 101
weight_filler { type: "gaussian" std: 0.001 }
bias_filler { type: "constant" value: 0 } } }
#Reshape back: the actual batch_size of the network is 32, so this layer outputs (32, 1, 3, 101)
#The third dimension, 3, is num_segments, the number of segments each video is divided into
#That is, the frames sampled from each segment score the video's classes, and the scores
#are then fused (mean, max, etc.), exactly as described in the paper
layer { name: "reshape_fc" bottom: "fc" top: "reshape_fc" type: "Reshape"
reshape_param { shape { dim: [-1, 1, 3, 101 ] } }}
layer { name: "pool_fusion" bottom: "reshape_fc" top: "pool_fc" type: "Pooling"
pooling_param { pool: AVE kernel_h: 3 kernel_w: 1} }
layer { name: "loss" type: "SoftmaxWithLoss" bottom: "pool_fc" bottom: "label" top: "loss" softmax_param {axis: 3} }
layer { name: "accuracy_top1" type: "Accuracy" bottom: "pool_fc" bottom: "label" top: "accuracy" accuracy_param {axis: 3}
include { phase: TEST } }
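The reshape_fc + pool_fusion pair implements the segmental consensus of the TSN paper. As a sanity check of the shapes, here is a small numpy sketch (illustrative, not part of the repository) of what these two layers compute:

import numpy as np

fc = np.random.randn(96, 101)           # fc-action output: 96 rows = 32 videos * 3 segments
reshape_fc = fc.reshape(32, 1, 3, 101)  # one row of 3 segment score vectors per video
pool_fc = reshape_fc.mean(axis=2, keepdims=True)  # AVE pooling with kernel_h=3 -> (32, 1, 1, 101)
print(pool_fc.shape)  # SoftmaxWithLoss with axis: 3 then normalizes the 101 scores per video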
In TSN, every frame of each video and the corresponding optical-flow images are extracted in advance and stored in per-video folders, and text files such as ucf101_flow_train_split_1.txt are generated at the same time. Each line of such a file has the format: path to the video's frames, number of frames in the video, class label of the video.
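A few lines of such a split file might look like the following (the video names follow the UCF101 convention; the exact frame counts and labels here are illustrative):

ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01 121 0
ApplyLipstick/v_ApplyLipstick_g09_c02 98 1
Archery/v_Archery_g10_c03 145 2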
Now let's look at the source code of video_data_layer:
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "caffe/data_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"
#ifdef USE_MPI
#include "mpi.h"
#include <boost/filesystem.hpp>
using namespace boost::filesystem;
#endif
namespace caffe{
template <typename Dtype>
VideoDataLayer<Dtype>::~VideoDataLayer(){
this->JoinPrefetchThread();
}
template <typename Dtype>
void VideoDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
//Read the parameters from the prototxt configuration
const int new_height = this->layer_param_.video_data_param().new_height();
const int new_width = this->layer_param_.video_data_param().new_width();
const int new_length = this->layer_param_.video_data_param().new_length();
const int num_segments = this->layer_param_.video_data_param().num_segments();
const string& source = this->layer_param_.video_data_param().source();
//The text file that lists the dataset's videos
LOG(INFO) << "Opening file: " << source;
//Open the text file
std:: ifstream infile(source.c_str());
string filename;
int label;
int length;
//Each line streams into: frame path, frame count, class label
while (infile >> filename >> length >> label){
//Push into the lines_ and lines_duration_ containers respectively
lines_.push_back(std::make_pair(filename,label));
lines_duration_.push_back(length);
}
//Shuffle the videos
if (this->layer_param_.video_data_param().shuffle()){
const unsigned int prefectch_rng_seed = caffe_rng_rand();
prefetch_rng_1_.reset(new Caffe::RNG(prefectch_rng_seed));
prefetch_rng_2_.reset(new Caffe::RNG(prefectch_rng_seed));
ShuffleVideos();
}
//The size of lines_ is the number of videos in the training (test) set
LOG(INFO) << "A total of " << lines_.size() << " videos.";
lines_id_ = 0;
//check name pattern
//Naming format of the frame images
if (this->layer_param_.video_data_param().name_pattern() == ""){
if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_RGB){
name_pattern_ = "image_%04d.jpg";
}else if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
name_pattern_ = "flow_%c_%04d.jpg";
}
}else{
name_pattern_ = this->layer_param_.video_data_param().name_pattern();
}
//Do a test read of one video's frames
Datum datum;
bool is_color = !this->layer_param_.video_data_param().grayscale();
const unsigned int frame_prefectch_rng_seed = caffe_rng_rand();
frame_prefetch_rng_.reset(new Caffe::RNG(frame_prefectch_rng_seed));
//Average segment length when the video is split into num_segments segments
int average_duration = (int) lines_duration_[lines_id_]/num_segments;
vector<int> offsets;//index offsets of the sampled frames
for (int i = 0; i < num_segments; ++i){
caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
int offset = (*frame_rng)() % (average_duration - new_length + 1);
offsets.push_back(offset+i*average_duration);//offset relative to the first frame (an index offset)
}
//The reading routine differs between modalities
if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW)
//test read for optical flow
CHECK(ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str()));
else
//test read for RGB
CHECK(ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str()));
//Data preprocessing
//prefetch_data_ is a Blob object. In Caffe, an internal thread reads data into a
//waiting queue, and batches are then consumed from that queue; in this respect it
//resembles TensorFlow, except that TensorFlow lets the user set the queue size
//while Caffe fixes it at a default of 3
//From the code below you can see that top_data points into prefetch_data_
const int crop_size = this->layer_param_.transform_param().crop_size();
const int batch_size = this->layer_param_.video_data_param().batch_size();
if (crop_size > 0){//crop
top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
} else {
top[0]->Reshape(batch_size, datum.channels(), datum.height(), datum.width());
this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(), datum.width());
}
LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width();
//Allocate memory for the labels
top[1]->Reshape(batch_size, 1, 1, 1);
this->prefetch_label_.Reshape(batch_size, 1, 1, 1);
//Allocate memory for the transformed (preprocessed) data
vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
this->transformed_data_.Reshape(top_shape);
}
//Internal thread that reads the data
template <typename Dtype>
void VideoDataLayer<Dtype>::InternalThreadEntry(){
Datum datum;
CHECK(this->prefetch_data_.count());
//Get mutable pointers to top_data and top_label
Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
Dtype* top_label = this->prefetch_label_.mutable_cpu_data();
VideoDataParameter video_data_param = this->layer_param_.video_data_param();
const int batch_size = video_data_param.batch_size();
const int new_height = video_data_param.new_height();
const int new_width = video_data_param.new_width();
const int new_length = video_data_param.new_length();
const int num_segments = video_data_param.num_segments();
//Number of videos in the training (test) set
const int lines_size = lines_.size();
bool is_color = !this->layer_param_.video_data_param().grayscale();
for (int item_id = 0; item_id < batch_size; ++item_id){//outer loop over the batch
CHECK_GT(lines_size, lines_id_);//the current video index must stay below the number of videos
//The following block is similar to the test-read code in DataLayerSetUp
vector<int> offsets;
int average_duration = (int) lines_duration_[lines_id_] / num_segments;
for (int i = 0; i < num_segments; ++i){
if (this->phase_==TRAIN){
if (average_duration >= new_length){
caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
int offset = (*frame_rng)() % (average_duration - new_length + 1);
offsets.push_back(offset+i*average_duration);
} else {
offsets.push_back(0);
}
} else{
if (average_duration >= new_length)
offsets.push_back(int((average_duration-new_length+1)/2 + i*average_duration));
else
offsets.push_back(0);
}
}
if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
//Read the data and store it into datum
if(!ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str())) {
continue;
}
} else{
if(!ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str())) {
continue;
}
}
//As soon as one sample has been read in, preprocessing starts
int offset1 = this->prefetch_data_.offset(item_id);//pointer offset of this sample
this->transformed_data_.set_cpu_data(top_data + offset1);//the transformed data shares memory with the prefetch buffer
this->data_transformer_->Transform(datum, &(this->transformed_data_));//preprocess
top_label[item_id] = lines_[lines_id_].second;//store the label
//next iteration
lines_id_++;//move to the next video
if (lines_id_ >= lines_size) {
DLOG(INFO) << "Restarting data prefetching from start.";
lines_id_ = 0;
if(this->layer_param_.video_data_param().shuffle()){
ShuffleVideos();
}
}
}
}
INSTANTIATE_CLASS(VideoDataLayer);
REGISTER_LAYER_CLASS(VideoData);
}
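The offset computation above is the heart of segment sampling: at training time each segment contributes a random starting frame, while at test time the start is the segment's center. A small Python sketch of the same logic (illustrative, mirroring the C++ above):

import random

def sample_offsets(num_frames, num_segments, new_length, train=True):
    """Return the starting index of each segment's sampled clip (0-based)."""
    average_duration = num_frames // num_segments
    offsets = []
    for i in range(num_segments):
        if average_duration >= new_length:
            if train:  # random start inside the segment
                offset = random.randrange(average_duration - new_length + 1)
            else:      # deterministic, centered start
                offset = (average_duration - new_length + 1) // 2
            offsets.append(offset + i * average_duration)
        else:
            offsets.append(0)
    return offsets

print(sample_offsets(121, num_segments=3, new_length=5, train=False))  # [18, 58, 98]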
In video_data_layer, ReadSegmentFlowToDatum and ReadSegmentRGBToDatum are used to read optical-flow and RGB frames, respectively, and convert them into the Datum data structure. Before looking at those functions, let's see how caffe.proto describes Datum:
message Datum {
//Shape of the Datum's data block: channels, height, width
optional int32 channels = 1;
optional int32 height = 2;
optional int32 width = 3;
// the actual image data, in bytes
//the data is stored as bytes
optional bytes data = 4;
//the label, an int32
optional int32 label = 5;
// Optionally, the datum could also hold float data.
repeated float float_data = 6;
// If true data contains an encoded image that need to be decoded
optional bool encoded = 7 [default = false];
}
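Since Datum is a protobuf message, it can also be inspected from Python through caffe's generated bindings, which is handy for sanity-checking the layout. A minimal sketch (assuming pycaffe is on the path; the numbers match the flow settings above):

from caffe.proto import caffe_pb2

datum = caffe_pb2.Datum()
datum.channels = 30   # 2 flow directions * new_length 5 * num_segments 3
datum.height = 256
datum.width = 340
datum.label = 7
datum.data = b"\x00" * (datum.channels * datum.height * datum.width)

# Serialize/parse round trip, as Caffe does when moving Datum through storage.
blob = datum.SerializeToString()
parsed = caffe_pb2.Datum.FromString(blob)
print(parsed.channels, parsed.label)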
As the definition shows, a Datum stores a single example: its data block (channels, height, width) and its label. The function below stores one example's image data (possibly several images, as determined by the channels value) into a Datum.
//filename: path to the directory holding this video's frames
//label: the video's class label
//offsets: sampled-frame offsets relative to the first frame
//height, width: target image size
//length: number of consecutive frames sampled from each offset (for optical flow,
//        one frame consists of an x image and a y image)
//datum: pointer to the output Datum
//name_pattern: frame naming pattern
bool ReadSegmentFlowToDatum(const string& filename, const int label,
const vector<int> offsets, const int height, const int width, const int length, Datum* datum,
const char* name_pattern ){
//Read the image data with OpenCV
cv::Mat cv_img_x, cv_img_y;
string* datum_string;
//buffer for the frame file name
char tmp[30];
for (int i = 0; i < offsets.size(); ++i){//outer loop over the segments' sampled frames
int offset = offsets[i];//starting offset of this segment
for (int file_id = 1; file_id < length+1; ++file_id){//inner loop: length consecutive frames
sprintf(tmp,name_pattern, 'x', int(file_id+offset));//file name of the x-flow image
string filename_x = filename + "/" + tmp;//directory + file name
//read the image as grayscale
cv::Mat cv_img_origin_x = cv::imread(filename_x, CV_LOAD_IMAGE_GRAYSCALE);
sprintf(tmp, name_pattern, 'y', int(file_id+offset));//file name of the y-flow image
string filename_y = filename + "/" + tmp;//directory + file name
cv::Mat cv_img_origin_y = cv::imread(filename_y, CV_LOAD_IMAGE_GRAYSCALE);
if (!cv_img_origin_x.data || !cv_img_origin_y.data){
LOG(ERROR) << "Could not load file " << filename_x << " or " << filename_y;
return false;
}
//Resize the images
if (height > 0 && width > 0){
cv::resize(cv_img_origin_x, cv_img_x, cv::Size(width, height));
cv::resize(cv_img_origin_y, cv_img_y, cv::Size(width, height));
}else{
cv_img_x = cv_img_origin_x;
cv_img_y = cv_img_origin_y;
}
//On the very first image (first frame of the first segment), fill in the Datum's metadata; datum_string is a mutable pointer to the Datum's data block
if (file_id==1 && i==0){
int num_channels = 2;
//channels = 2 (x and y) * length * number of segments
datum->set_channels(num_channels*length*offsets.size());
datum->set_height(cv_img_x.rows);
datum->set_width(cv_img_x.cols);
datum->set_label(label);
datum->clear_data();
datum->clear_float_data();
datum_string = datum->mutable_data();
}
//Traverse the images and push the pixels into datum_string, so that datum holds this example's data
for (int h = 0; h < cv_img_x.rows; ++h){
for (int w = 0; w < cv_img_x.cols; ++w){
datum_string->push_back(static_cast<char>(cv_img_x.at<uchar>(h,w)));
}
}
for (int h = 0; h < cv_img_y.rows; ++h){
for (int w = 0; w < cv_img_y.cols; ++w){
datum_string->push_back(static_cast<char>(cv_img_y.at<uchar>(h,w)));
}
}
}
}
return true;
}
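The resulting byte layout is worth spelling out: pixels are appended segment by segment, frame by frame, x image before y image, which is exactly the (num_segments * new_length * 2)-channel order that the data layer and the Reshape above rely on. A short illustrative Python rendering of that ordering:

num_segments, new_length = 3, 5

channel_order = []
for seg in range(num_segments):              # outer loop in ReadSegmentFlowToDatum
    for frame in range(1, new_length + 1):   # inner loop over consecutive frames
        channel_order.append(('x', seg, frame))  # x image appended first
        channel_order.append(('y', seg, frame))  # then the y image

print(len(channel_order))   # 30 channels, matching datum.channels
print(channel_order[:4])    # [('x', 0, 1), ('y', 0, 1), ('x', 0, 2), ('y', 0, 2)]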
Next, let's look at the test script, eval_net.py:
import argparse
import os
import sys
import math
import cv2
import numpy as np
import multiprocessing
from sklearn.metrics import confusion_matrix
sys.path.append('.')
from pyActionRecog import parse_directory
from pyActionRecog import parse_split_file
from pyActionRecog.utils.video_funcs import default_aggregation_func
parser = argparse.ArgumentParser()
# dataset: ucf101 or hmdb51
parser.add_argument('dataset', type=str, choices=['ucf101', 'hmdb51'])
# the three official train/test splits of the dataset
parser.add_argument('split', type=int, choices=[1, 2, 3],
help='on which split to test the network')
# modality: RGB or optical flow
parser.add_argument('modality', type=str, choices=['rgb', 'flow'])
# root directory of the video frames; its structure is frame_path/video_name/jpegs (flow or RGB)
parser.add_argument('frame_path', type=str, help="root directory holding the frames")
# deploy.prototxt of the test network
parser.add_argument('net_proto', type=str)
# caffemodel file holding the network weights
parser.add_argument('net_weights', type=str)
# naming prefixes of the frame images
parser.add_argument('--rgb_prefix', type=str, help="prefix of RGB frames", default='img_')
parser.add_argument('--flow_x_prefix', type=str, help="prefix of x direction flow images", default='flow_x_')
parser.add_argument('--flow_y_prefix', type=str, help="prefix of y direction flow images", default='flow_y_')
# At test time each video is divided into num_frame_per_video positions; one frame (or flow
# stack) is sampled at each position and scored for every class, and the scores are
# fused into the final prediction
parser.add_argument('--num_frame_per_video', type=int, default=25,
                    help="number of frames to sample from each video")
# Save every test video's per-class scores, so the RGB and flow results can be fused later
parser.add_argument('--save_scores', type=str, default=None, help='the filename to save the scores in')
# number of worker processes (multiprocessing is supported)
parser.add_argument('--num_worker', type=int, default=1)
# root directory of the caffe toolbox
parser.add_argument("--caffe_path", type=str, default='./lib/caffe-action/', help='path to the caffe toolbox')
# list of GPUs to use
parser.add_argument("--gpus", type=int, nargs='+', default=None, help='specify list of gpu to use')
args = parser.parse_args()
print args
sys.path.append(os.path.join(args.caffe_path, 'python'))
from pyActionRecog.action_caffe import CaffeNet
# build necessary information
print args.dataset
# parse_split_file and parse_directory parse the dataset information:
# each video's frame count, class label, frame path, dataset size, and so on
split_tp = parse_split_file(args.dataset)
f_info = parse_directory(args.frame_path,
args.rgb_prefix, args.flow_x_prefix, args.flow_y_prefix)
gpu_list = args.gpus
# eval_video_list: the list of (video name, label) pairs for this split
eval_video_list = split_tp[args.split - 1][1]
score_name = 'fc-action'
def build_net():
global net
my_id = multiprocessing.current_process()._identity[0] \
if args.num_worker > 1 else 1
if gpu_list is None:
net = CaffeNet(args.net_proto, args.net_weights, my_id-1)
else:
net = CaffeNet(args.net_proto, args.net_weights, gpu_list[my_id - 1])
# Core of the evaluation; the input is one video: its name and its class label
def eval_video(video):
global net
label = video[1]
vid = video[0]
# look up this video's frame directory in f_info
video_frame_path = f_info[0][vid]
# for RGB there is one image per frame; for optical flow each frame has an x image and a y image, so f_info keeps separate frame counts
if args.modality == 'rgb':
cnt_indexer = 1
elif args.modality == 'flow':
cnt_indexer = 2
else:
raise ValueError(args.modality)
frame_cnt = f_info[cnt_indexer][vid]
stack_depth = 0
# for RGB a single frame is sampled each time; for flow, 5 consecutive frames are stacked
if args.modality == 'rgb':
stack_depth = 1
elif args.modality == 'flow':
stack_depth = 5
# at test time the video is divided into num_frame_per_video positions, sampling once at each
step = (frame_cnt - stack_depth) / (args.num_frame_per_video-1)
if step > 0:
frame_ticks = range(1, min((2 + step * (args.num_frame_per_video-1)), frame_cnt+1), step)
else:
frame_ticks = [1] * args.num_frame_per_video
assert(len(frame_ticks) == args.num_frame_per_video)
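# Worked example (illustrative numbers): for a flow video with frame_cnt = 201,
# stack_depth = 5 and num_frame_per_video = 25, step = (201 - 5) / 24 = 8
# (integer division in Python 2), so frame_ticks = range(1, min(2 + 8*24, 202), 8)
# = [1, 9, ..., 193]: exactly 25 evenly spaced starting frames.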
frame_scores = []
# loop over the sampling positions
for tick in frame_ticks:
# RGB and flow are sampled differently
if args.modality == 'rgb':
name = '{}{:05d}.jpg'.format(args.rgb_prefix, tick)
frame = cv2.imread(os.path.join(video_frame_path, name), cv2.IMREAD_COLOR)
# net.predict_single_frame runs the network; it implements multi-scale (oversampled)
# classification, and its output has shape (num_scale, 101)
scores = net.predict_single_frame([frame,], score_name, frame_size=(340, 256))
frame_scores.append(scores)
# the flow branch works the same way as the RGB branch
if args.modality == 'flow':
frame_idx = [min(frame_cnt, tick+offset) for offset in xrange(stack_depth)]
flow_stack = []
for idx in frame_idx:
x_name = '{}{:05d}.jpg'.format(args.flow_x_prefix, idx)
y_name = '{}{:05d}.jpg'.format(args.flow_y_prefix, idx)
flow_stack.append(cv2.imread(os.path.join(video_frame_path, x_name), cv2.IMREAD_GRAYSCALE))
flow_stack.append(cv2.imread(os.path.join(video_frame_path, y_name), cv2.IMREAD_GRAYSCALE))
scores = net.predict_single_flow_stack(flow_stack, score_name, frame_size=(340, 256))
frame_scores.append(scores)
print 'video {} done'.format(vid)
sys.stdout.flush()
return np.array(frame_scores), label
# multiple worker processes
if args.num_worker > 1:
pool = multiprocessing.Pool(args.num_worker, initializer=build_net)
video_scores = pool.map(eval_video, eval_video_list) # map eval_video over the video list
else:
build_net()
# after this mapping, video_scores is a list with one (scores, label) entry per video, where scores has shape (frame_ticks, num_scale, 101)
video_scores = map(eval_video, eval_video_list)
# fuse each video's (frame_ticks, num_scale, 101) scores (mean or max) into a single prediction
video_pred = [np.argmax(default_aggregation_func(x[0])) for x in video_scores]
video_labels = [x[1] for x in video_scores]
cf = confusion_matrix(video_labels, video_pred).astype(float)
cls_cnt = cf.sum(axis=1)
cls_hit = np.diag(cf)
cls_acc = cls_hit/cls_cnt
print cls_acc
print 'Accuracy {:.02f}%'.format(np.mean(cls_acc)*100)
if args.save_scores is not None:
np.savez(args.save_scores, scores=video_scores, labels=video_labels)
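Putting the arguments together, a typical invocation looks like the following; the positional arguments correspond to the argparse definitions above, while the concrete paths are illustrative:

python eval_net.py ucf101 1 flow /data/ucf101/frames \
    models/tsn_flow_deploy.prototxt models/tsn_flow.caffemodel \
    --num_worker 2 --gpus 0 1 --save_scores flow_scores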
eval_net.py relies on two functions, predict_single_flow_stack and predict_single_frame, to perform the multi-scale prediction. Take predict_single_flow_stack as an example: it lives in pyActionRecog/action_caffe.py, where the author defines a CaffeNet class, and predict_single_flow_stack is a method of that class:
def predict_single_flow_stack(self, frame, score_name, over_sample=True, frame_size=None):
if frame_size is not None:
frame = fast_list2arr([cv2.resize(x, frame_size) for x in frame])
else:
frame = fast_list2arr(frame)
#multi-scale oversampling (multiple crops of the flow stack)
if over_sample:
os_frame = flow_stack_oversample(frame, (self._sample_shape[2], self._sample_shape[3]))
else:
os_frame = fast_list2arr([frame])
data = os_frame - np.float32(128.0)  # subtract the flow mean value (128, as in the prototxt)
#reshape the network input blob to match the data
self._net.blobs['data'].reshape(*data.shape)
self._net.reshape()
#forward pass to obtain the per-class scores
out = self._net.forward(blobs=[score_name,], data=data)
#the output has shape (scale_num, 101)
return out[score_name].copy()
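For reference, flow_stack_oversample follows the classic 10-crop scheme: four corner crops plus the center crop, each also mirrored. Below is a minimal illustrative sketch of that idea, not the repository's exact code, with one detail specific to flow: when a crop is mirrored horizontally, the x-flow values must also be inverted (255 - v), because flipping the image reverses the direction of horizontal motion.

import numpy as np

def flow_oversample_sketch(stack, crop_h, crop_w):
    """stack: (channels, H, W) flow stack, x/y channels interleaved (x first)."""
    _, h, w = stack.shape
    # four corner crops plus the center crop
    starts = [(0, 0), (0, w - crop_w), (h - crop_h, 0),
              (h - crop_h, w - crop_w), ((h - crop_h) // 2, (w - crop_w) // 2)]
    crops = []
    for y, x in starts:
        crop = stack[:, y:y + crop_h, x:x + crop_w]
        crops.append(crop)
        flipped = crop[:, :, ::-1].copy()
        # mirroring reverses horizontal motion, so invert the x-flow channels
        flipped[0::2] = 255 - flipped[0::2]
        crops.append(flipped)
    return np.stack(crops)  # (10, channels, crop_h, crop_w)

out = flow_oversample_sketch(np.zeros((10, 256, 340), dtype=np.uint8), 224, 224)
print(out.shape)  # (10, 10, 224, 224)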