Temporal Segment Networks: Core Code

Tags (space-separated): source code


  • Temporal Segment Networks core code
    • Network structure (prototxt)
    • Video frame loading: video_data_layer
    • Network testing

Network structure (prototxt)

Let's start with the prototxt network configuration, taking the optical-flow stream as an example. The core parts are shown below; only the data input layer and the loss layers are excerpted, and the middle of the network is the BN-Inception architecture (omitted).
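Before reading the config, it helps to pin down the channel arithmetic that the comments below refer to: each sample carries num_segments * new_length * (channels per frame) channels, where a flow frame contributes 2 channels (x and y) and an RGB frame 3. A tiny sanity-check sketch (plain Python; the helper is mine, not TSN code):

# Channel arithmetic for one TSN input sample (illustrative helper)
def input_channels(num_segments, new_length, modality):
    per_frame = {"FLOW": 2, "RGB": 3}[modality]  # flow: x/y planes; RGB: 3 channels
    return num_segments * new_length * per_frame

print(input_channels(3, 5, "FLOW"))  # 30 -> with batch_size 32, the data blob is (32, 30, 224, 224)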

name: "BN-Inception"
"""
一个批次读入数据的大小为: batch_size, num_segment * new_length * modilty, image_w, image_h
其他的参量下面都有解释,这里说一下modilty,对于光流图像来说,它是灰度图像,但是它存在两个
方向的光流x, y;对于RGB来说,存在三个通道。
"""

layer {
  name: "data"
  type: "VideoData"
  top: "data"
  top: "label"
  video_data_param {
  # Text file; each line describes one video: <frame directory> <frame count> <class label>
    source: "data/ucf101_flow_train_split_1.txt"
    batch_size: 32
    new_length: 5
  # The video is split evenly into num_segments segments; new_length consecutive frames are sampled from each segment
    num_segments: 3
  # Modality: RGB or optical flow
    modality: FLOW
    shuffle: true
  # Frame-file naming pattern: %c takes the flow direction (x or y), %05d the frame index
    name_pattern: "flow_%c_%05d.jpg"
  }
  transform_param{
    crop_size: 224
    mirror: true
    fix_crop: true
    more_fix_crop: true
    multi_scale: true
    max_distort: 1
    scale_ratios: [1,.875,.75]
    mean_value: 128
    is_flow: true
  }
  include: { phase: TRAIN }
}
layer {
  name: "data"
  type: "VideoData"
  top: "data"
  top: "label"
  video_data_param {
    source: "data/ucf101_flow_val_split_1.txt"
    batch_size: 1
    new_length: 5
    num_segments: 3
    modality: FLOW
    name_pattern: "flow_%c_%05d.jpg"
  }
  transform_param{
    crop_size: 224
    mirror: false
    mean_value: 128
    is_flow: true
  }
  include: { phase: TEST }
}
# With the parameter settings above, the data layer outputs a blob of shape (32, 30, 224, 224)

# Reshape the input to (96, 10, 224, 224).
# The goal is to make the second dimension (the channel count) equal to the number of channels
# obtained by sampling once from a single segment, i.e. new_length * modality_channels.
# This is convenient for testing later, since the second dimension determines the weight shapes.
layer { name: "data_reshape" type: "Reshape" bottom: "data" top: "data_reshape" reshape_param { shape { dim: [-1, 10, 224, 224] } }}

"""
BN-Inception Network
"""

####################################### global pool #######################################
# Global average pooling: reduces each feature map to 1 x 1
layer { name: "global_pool" top: "global_pool" bottom: "inception_5b/output" type: "Pooling"
  pooling_param { pool: AVE kernel_size: 7 stride: 1 } }
layer { name: "dropout" top: "global_pool" bottom: "global_pool" type: "Dropout"
dropout_param { dropout_ratio: 0.7 } }

####################################### loss accuracy #######################################
# Map the channels to the 101 classes for classification; the output here is 96 x 101
layer { name: "fc-action" type: "InnerProduct" bottom: "global_pool" top: "fc"
  param { lr_mult: 1 decay_mult: 1 } param { lr_mult: 2 decay_mult: 0 }
  inner_product_param { num_output: 101
    weight_filler { type: "gaussian" std: 0.001 }
    bias_filler { type: "constant" value: 0 } } }
# Undo the earlier fold: the network's actual batch_size is 32, so this layer's output is (32, 1, 3, 101).
# The third dimension (3) is num_segments, the number of segments the video was split into.
# In other words, each segment's sampled snippet scores the video's classes on its own, and the
# scores are then fused (mean, max, etc.), matching the segmental consensus described in the paper.
layer { name: "reshape_fc" bottom: "fc" top: "reshape_fc" type: "Reshape"
  reshape_param { shape { dim: [-1, 1, 3, 101 ] } }}
layer { name: "pool_fusion" bottom: "reshape_fc" top: "pool_fc" type: "Pooling"
  pooling_param { pool: AVE kernel_h: 3 kernel_w: 1} }
layer { name: "loss" type: "SoftmaxWithLoss" bottom: "pool_fc" bottom: "label" top: "loss" softmax_param {axis: 3} }
layer { name: "accuracy_top1" type: "Accuracy" bottom: "pool_fc" bottom: "label" top: "accuracy" accuracy_param {axis: 3}
  include { phase: TEST } }
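The last three layers implement the segmental consensus: fold the fc scores back to (videos, 1, segments, classes), average over the segment axis, then apply the softmax along the class axis. A numpy sketch of the same computation (the softmax helper is mine; shapes follow the prototxt above):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

fc = np.random.randn(96, 101).astype(np.float32)  # fc-action output: 32 videos * 3 segments
reshape_fc = fc.reshape(-1, 1, 3, 101)            # (32, 1, 3, 101)
pool_fc = reshape_fc.mean(axis=2, keepdims=True)  # AVE pooling with kernel_h=3: consensus over segments
probs = softmax(pool_fc, axis=3)                  # SoftmaxWithLoss computes the softmax along axis 3
print(probs.shape)                                # (32, 1, 1, 101): one prediction per video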

Video frame loading: video_data_layer

In TSN, every frame of a video and its corresponding optical-flow images are extracted offline and stored in per-video directories, together with a text file such as ucf101_flow_train_split_1.txt. Each line of that file has the format: <frame directory> <frame count> <class label>.
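For example, a few lines of such a split file might look like the following (paths and labels are made up for illustration), and parsing it amounts to a loop like the Python one below:

# Illustrative split-file contents:
#   /data/ucf101/flow/v_ApplyEyeMakeup_g08_c01 121 0
#   /data/ucf101/flow/v_Archery_g01_c06 180 2
lines, lines_duration = [], []
with open("data/ucf101_flow_train_split_1.txt") as infile:
    for row in infile:
        path, length, label = row.split()
        lines.append((path, int(label)))    # mirrors the lines_ container below
        lines_duration.append(int(length))  # mirrors lines_duration_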
Let's look at the video_data_layer source:

#include <fstream>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "caffe/data_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"

#ifdef USE_MPI
#include "mpi.h"
#include <boost/filesystem.hpp>
using namespace boost::filesystem;
#endif

namespace caffe{
template <typename Dtype>
VideoDataLayer<Dtype>::~VideoDataLayer(){
    this->JoinPrefetchThread();
}

template <typename Dtype>
void VideoDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
// Read the parameters from the prototxt configuration
    const int new_height  = this->layer_param_.video_data_param().new_height();
    const int new_width  = this->layer_param_.video_data_param().new_width();
    const int new_length  = this->layer_param_.video_data_param().new_length();
    const int num_segments = this->layer_param_.video_data_param().num_segments();
    const string& source = this->layer_param_.video_data_param().source();
// Open the dataset list file
    LOG(INFO) << "Opening file: " << source;
    // Open the text file
    std::ifstream infile(source.c_str());
    string filename;
    int label;
    int length;
    // Each line streams in: frame path, video length (frame count), class label
    while (infile >> filename >> length >> label){
    // Push into the lines_ and lines_duration_ containers respectively
        lines_.push_back(std::make_pair(filename,label));
        lines_duration_.push_back(length);
    }
    // Shuffle the videos
    if (this->layer_param_.video_data_param().shuffle()){
        const unsigned int prefectch_rng_seed = caffe_rng_rand();
        prefetch_rng_1_.reset(new Caffe::RNG(prefectch_rng_seed));
        prefetch_rng_2_.reset(new Caffe::RNG(prefectch_rng_seed));
        ShuffleVideos();
    }
// The size of the lines_ container is the number of videos in the training (or test) set
    LOG(INFO) << "A total of " << lines_.size() << " videos.";
    lines_id_ = 0;

    // Check the name pattern
    // (the frame-file naming format)
    if (this->layer_param_.video_data_param().name_pattern() == ""){
        if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_RGB){
            name_pattern_ = "image_%04d.jpg";
        }else if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
            name_pattern_ = "flow_%c_%04d.jpg";
        }
    }else{
        name_pattern_ = this->layer_param_.video_data_param().name_pattern();
    }
    // Do a trial read of one video to determine the blob shapes
    Datum datum;
    bool is_color = !this->layer_param_.video_data_param().grayscale();
    const unsigned int frame_prefectch_rng_seed = caffe_rng_rand();
    frame_prefetch_rng_.reset(new Caffe::RNG(frame_prefectch_rng_seed));
    // Average duration of each of the num_segments segments
    int average_duration = (int) lines_duration_[lines_id_]/num_segments;
    vector<int> offsets; // sampled-frame index offsets
    for (int i = 0; i < num_segments; ++i){
        caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
        // Note: unlike InternalThreadEntry below, there is no guard here for
        // average_duration < new_length
        int offset = (*frame_rng)() % (average_duration - new_length + 1);
        offsets.push_back(offset + i*average_duration); // offset relative to the video's first frame (a frame-index offset)
    }
    // The reading routine differs by modality
    if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW)
    // Trial read for optical flow
        CHECK(ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                     offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str()));
    else
    // Trial read for RGB
        CHECK(ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                    offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str()));
    // Data preprocessing.
    // prefetch_data_ is a Blob object. In Caffe, an internal thread reads data into a
    // waiting queue and batches are then consumed from that queue, which is somewhat
    // similar to TensorFlow; the difference is that TensorFlow lets the user set the
    // queue size, while in Caffe it defaults to 3.
    // In the code below you can see that top_data aliases prefetch_data_.
    const int crop_size = this->layer_param_.transform_param().crop_size();
    const int batch_size = this->layer_param_.video_data_param().batch_size();
    if (crop_size > 0){//crop
        top[0]->Reshape(batch_size, datum.channels(), crop_size, crop_size);
        this->prefetch_data_.Reshape(batch_size, datum.channels(), crop_size, crop_size);
    } else {
        top[0]->Reshape(batch_size, datum.channels(), datum.height(), datum.width());
        this->prefetch_data_.Reshape(batch_size, datum.channels(), datum.height(), datum.width());
    }
    LOG(INFO) << "output data size: " << top[0]->num() << "," << top[0]->channels() << "," << top[0]->height() << "," << top[0]->width();
    // Allocate memory for the labels
    top[1]->Reshape(batch_size, 1, 1, 1);
    this->prefetch_label_.Reshape(batch_size, 1, 1, 1);
    // Allocate memory for the transformed (preprocessed) data
    vector<int> top_shape = this->data_transformer_->InferBlobShape(datum);
    this->transformed_data_.Reshape(top_shape);
}
// Internal prefetch thread: reads the data
template <typename Dtype>
void VideoDataLayer<Dtype>::InternalThreadEntry(){

    Datum datum;
    CHECK(this->prefetch_data_.count());
    // Get mutable pointers to top_data and top_label
    Dtype* top_data = this->prefetch_data_.mutable_cpu_data();
    Dtype* top_label = this->prefetch_label_.mutable_cpu_data();

    VideoDataParameter video_data_param = this->layer_param_.video_data_param();
    const int batch_size = video_data_param.batch_size();
    const int new_height = video_data_param.new_height();
    const int new_width = video_data_param.new_width();
    const int new_length = video_data_param.new_length();
    const int num_segments = video_data_param.num_segments();
    // Number of videos in the training (or test) set
    const int lines_size = lines_.size();

    bool is_color = !this->layer_param_.video_data_param().grayscale();
    for (int item_id = 0; item_id < batch_size; ++item_id){ // outer loop over the batch
        CHECK_GT(lines_size, lines_id_); // the current video index must stay below the number of videos
        // The following mirrors the trial-read code in DataLayerSetUp
        vector<int> offsets;
        int average_duration = (int) lines_duration_[lines_id_] / num_segments;
        for (int i = 0; i < num_segments; ++i){
            if (this->phase_==TRAIN){
                if (average_duration >= new_length){
                    caffe::rng_t* frame_rng = static_cast<caffe::rng_t*>(frame_prefetch_rng_->generator());
                    int offset = (*frame_rng)() % (average_duration - new_length + 1);
                    offsets.push_back(offset+i*average_duration);
                } else {
                    offsets.push_back(0);
                }
            } else{
                if (average_duration >= new_length)
                offsets.push_back(int((average_duration-new_length+1)/2 + i*average_duration));
                else
                offsets.push_back(0);
            }
        }
        if (this->layer_param_.video_data_param().modality() == VideoDataParameter_Modality_FLOW){
        // Read the data into datum
            if(!ReadSegmentFlowToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                       offsets, new_height, new_width, new_length, &datum, name_pattern_.c_str())) {
                continue;
            }
        } else{
            if(!ReadSegmentRGBToDatum(lines_[lines_id_].first, lines_[lines_id_].second,
                                      offsets, new_height, new_width, new_length, &datum, is_color, name_pattern_.c_str())) {
                continue;
            }
        }
        // As soon as one sample has been read, preprocess it
        int offset1 = this->prefetch_data_.offset(item_id); // pointer offset for this item
        this->transformed_data_.set_cpu_data(top_data + offset1); // the transformed data shares memory with the prefetch buffer
        this->data_transformer_->Transform(datum, &(this->transformed_data_)); // apply the transformations
        top_label[item_id] = lines_[lines_id_].second; // store the label

        // next iteration
        lines_id_++; // advance to the next video
        if (lines_id_ >= lines_size) {
            DLOG(INFO) << "Restarting data prefetching from start.";
            lines_id_ = 0;
            if(this->layer_param_.video_data_param().shuffle()){
                ShuffleVideos();
            }
        }
    }
}

INSTANTIATE_CLASS(VideoDataLayer);
REGISTER_LAYER_CLASS(VideoData);
}
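The offset logic above is the heart of TSN's sparse sampling: at TRAIN time a random new_length-frame snippet is drawn from each segment, while at TEST time the snippet comes from the center of each segment. A small Python port of that logic (random.Random stands in for Caffe's RNG; the function is illustrative):

import random

def sample_offsets(duration, num_segments, new_length, train, rng=None):
    # Python port of the offset computation in VideoDataLayer
    rng = rng or random.Random(0)
    average_duration = duration // num_segments
    offsets = []
    for i in range(num_segments):
        if average_duration >= new_length:
            if train:  # random snippet inside segment i
                offset = rng.randint(0, average_duration - new_length)
            else:      # center snippet of segment i
                offset = (average_duration - new_length + 1) // 2
            offsets.append(offset + i * average_duration)
        else:          # video too short: fall back to the first frame
            offsets.append(0)
    return offsets

print(sample_offsets(121, 3, 5, train=True))   # e.g. [27, 64, 93]
print(sample_offsets(121, 3, 5, train=False))  # [18, 58, 98]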

video_data_layer uses ReadSegmentFlowToDatum and ReadSegmentRGBToDatum to read optical-flow and RGB frames, respectively, and convert them into the Datum data structure. Before looking at those, let's see how Datum is described in caffe.proto:

message Datum {
// Shape information of the Datum data block: channels, height, width
  optional int32 channels = 1;
  optional int32 height = 2;
  optional int32 width = 3;
  // the actual image data, stored as bytes
  optional bytes data = 4;
  // the label, an int32 value
  optional int32 label = 5;
  // Optionally, the datum could also hold float data.
  repeated float float_data = 6;
  // If true data contains an encoded image that need to be decoded
  optional bool encoded = 7 [default = false];
}

As the definition shows, a Datum stores one instance: the data block (channels, height, width) together with its label. Next, the images of one instance (possibly several frames, as determined by the channel count) are packed into a Datum:

// filename: directory holding the video's frames
// label: the video's class label
// offsets: sampled-frame offsets relative to the video's first frame
// height, width: target image size
// length: number of consecutive frames sampled from each offset
//         (for optical flow, each frame consists of an x image and a y image)
// datum: pointer to the output Datum
// name_pattern: frame-file naming pattern
bool ReadSegmentFlowToDatum(const string& filename, const int label,
    const vector<int> offsets, const int height, const int width, const int length, Datum* datum,
    const char* name_pattern ){
    // Read the images with OpenCV
    cv::Mat cv_img_x, cv_img_y;
    string* datum_string;
    // buffer for building frame file names
    char tmp[30];
    for (int i = 0; i < offsets.size(); ++i){ // outer loop: one iteration per segment
        int offset = offsets[i]; // offset of this segment's snippet
        for (int file_id = 1; file_id < length+1; ++file_id){ // read length consecutive frames
            sprintf(tmp, name_pattern, 'x', int(file_id+offset)); // x-flow file name
            string filename_x = filename + "/" + tmp; // directory + file name
            // Read the image as grayscale
            cv::Mat cv_img_origin_x = cv::imread(filename_x, CV_LOAD_IMAGE_GRAYSCALE);
            sprintf(tmp, name_pattern, 'y', int(file_id+offset)); // y-flow file name
            string filename_y = filename + "/" + tmp; // directory + file name
            cv::Mat cv_img_origin_y = cv::imread(filename_y, CV_LOAD_IMAGE_GRAYSCALE);
            if (!cv_img_origin_x.data || !cv_img_origin_y.data){
                LOG(ERROR) << "Could not load file " << filename_x << " or " << filename_y;
                return false;
            }
            // Resize the images if a target size was requested
            if (height > 0 && width > 0){
                cv::resize(cv_img_origin_x, cv_img_x, cv::Size(width, height));
                cv::resize(cv_img_origin_y, cv_img_y, cv::Size(width, height));
            }else{
                cv_img_x = cv_img_origin_x;
                cv_img_y = cv_img_origin_y;
            }
            // When reading starts (very first frame), fill in the Datum metadata and
            // grab datum_string, a mutable pointer to the Datum's data block
            if (file_id==1 && i==0){
                int num_channels = 2;
                // channels = 2 (x and y) * length * number of segments
                datum->set_channels(num_channels*length*offsets.size());
                datum->set_height(cv_img_x.rows);
                datum->set_width(cv_img_x.cols);
                datum->set_label(label);
                datum->clear_data();
                datum->clear_float_data();
                datum_string = datum->mutable_data();
            }
            // Walk over the whole image and append its pixels to datum_string,
            // so datum accumulates the current instance's data
            for (int h = 0; h < cv_img_x.rows; ++h){
                for (int w = 0; w < cv_img_x.cols; ++w){
                    datum_string->push_back(static_cast<char>(cv_img_x.at<uchar>(h,w)));
                }
            }
            for (int h = 0; h < cv_img_y.rows; ++h){
                for (int w = 0; w < cv_img_y.cols; ++w){
                    datum_string->push_back(static_cast<char>(cv_img_y.at<uchar>(h,w)));
                }
            }
        }
    }
    return true;
}
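The resulting channel layout is worth spelling out: planes are appended segment by segment, frame by frame, x before y. A numpy sketch of the equivalent packing (array names and 0-based indexing are mine; the C++ file names are 1-based):

import numpy as np

def pack_flow_datum(flow_x, flow_y, offsets, length):
    # Mimics ReadSegmentFlowToDatum's channel order: for each segment offset,
    # for each of `length` frames, append the x plane, then the y plane
    planes = []
    for off in offsets:
        for f in range(off, off + length):
            planes.append(flow_x[f])
            planes.append(flow_y[f])
    return np.stack(planes)  # (2 * length * num_segments, H, W)

x = np.zeros((121, 224, 224), dtype=np.uint8)  # stand-in x-flow frames
y = np.zeros((121, 224, 224), dtype=np.uint8)  # stand-in y-flow frames
print(pack_flow_datum(x, y, [18, 58, 98], 5).shape)  # (30, 224, 224)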

Network testing

eval_net.py

import argparse
import os
import sys
import math
import cv2
import numpy as np
import multiprocessing
from sklearn.metrics import confusion_matrix

sys.path.append('.')
from pyActionRecog import parse_directory
from pyActionRecog import parse_split_file

from pyActionRecog.utils.video_funcs import default_aggregation_func

parser = argparse.ArgumentParser()
# dataset: ucf101 or hmdb51
parser.add_argument('dataset', type=str, choices=['ucf101', 'hmdb51'])
# which of the three official train/test splits to use
parser.add_argument('split', type=int, choices=[1, 2, 3],
                    help='on which split to test the network')
# modality: optical flow or RGB
parser.add_argument('modality', type=str, choices=['rgb', 'flow'])
# root directory of the extracted frames; layout: frame_path/video_name/*.jpg (flow or RGB)
parser.add_argument('frame_path', type=str, help="root directory holding the frames")
# deploy prototxt of the test network
parser.add_argument('net_proto', type=str)
# caffemodel file holding the network weights
parser.add_argument('net_weights', type=str)
# frame-file naming prefixes
parser.add_argument('--rgb_prefix', type=str, help="prefix of RGB frames", default='img_')
parser.add_argument('--flow_x_prefix', type=str, help="prefix of x direction flow images", default='flow_x_')
parser.add_argument('--flow_y_prefix', type=str, help="prefix of y direction flow images", default='flow_y_')
# At test time, the video is divided into num_frame_per_video chunks; one frame (or flow
# stack) is sampled from each chunk and scored per class, and the per-frame scores are
# then fused into the final prediction
parser.add_argument('--num_frame_per_video', type=int, default=25,
                    help="number of frames sampled per video")
# save the per-class scores for the whole test set, for later fusion of the RGB and flow results
parser.add_argument('--save_scores', type=str, default=None, help='the filename to save the scores in')
# number of worker processes (multiprocessing is supported)
parser.add_argument('--num_worker', type=int, default=1)
# root directory of the Caffe toolbox
parser.add_argument("--caffe_path", type=str, default='./lib/caffe-action/', help='path to the caffe toolbox')
# list of GPUs to use
parser.add_argument("--gpus", type=int, nargs='+', default=None, help='specify list of gpu to use')
args = parser.parse_args()

print args

sys.path.append(os.path.join(args.caffe_path, 'python'))
from pyActionRecog.action_caffe import CaffeNet

# build necessary information
print args.dataset
# parse_split_file and parse_directory collect the dataset information:
# each video's frame count, class label, frame path, the dataset size, and so on
split_tp = parse_split_file(args.dataset)
f_info = parse_directory(args.frame_path,
                         args.rgb_prefix, args.flow_x_prefix, args.flow_y_prefix)

gpu_list = args.gpus

# eval_video_list: list of (video name, label) pairs in this split's test set
eval_video_list = split_tp[args.split - 1][1]

score_name = 'fc-action'


def build_net():
    global net
    my_id = multiprocessing.current_process()._identity[0] \
        if args.num_worker > 1 else 1
    if gpu_list is None:
        net = CaffeNet(args.net_proto, args.net_weights, my_id-1)
    else:
        net = CaffeNet(args.net_proto, args.net_weights, gpu_list[my_id - 1])

# Core of the evaluation; the input is one video as a (name, label) pair
def eval_video(video):
    global net

    label = video[1]
    vid = video[0]
    # look up this video's frame directory in f_info
    video_frame_path = f_info[0][vid]
    # An RGB frame is read once; optical flow has both x and y images per frame, hence a separate frame counter
    if args.modality == 'rgb':
        cnt_indexer = 1
    elif args.modality == 'flow':
        cnt_indexer = 2
    else:
        raise ValueError(args.modality)
    frame_cnt = f_info[cnt_indexer][vid]

    stack_depth = 0
    # for RGB, sample 1 frame at a time; for flow, a stack of 5 consecutive frames
    if args.modality == 'rgb':
        stack_depth = 1
    elif args.modality == 'flow':
        stack_depth = 5
    # at test time, split the video into chunks and sample once from each chunk
    step = (frame_cnt - stack_depth) / (args.num_frame_per_video-1)
    if step > 0:
        frame_ticks = range(1, min((2 + step * (args.num_frame_per_video-1)), frame_cnt+1), step)
    else:
        frame_ticks = [1] * args.num_frame_per_video

    assert(len(frame_ticks) == args.num_frame_per_video)

    frame_scores = []
    # loop over the sampled positions
    for tick in frame_ticks:
        # RGB and flow are sampled differently
        if args.modality == 'rgb':
            name = '{}{:05d}.jpg'.format(args.rgb_prefix, tick)
            frame = cv2.imread(os.path.join(video_frame_path, name), cv2.IMREAD_COLOR)
            # net.predict_single_frame runs the classification over multiple
            # (oversampled) crops; its output has shape (num_crops, 101)

            scores = net.predict_single_frame([frame,], score_name, frame_size=(340, 256))
            frame_scores.append(scores)
            # the flow branch proceeds the same way as RGB
        if args.modality == 'flow':
            frame_idx = [min(frame_cnt, tick+offset) for offset in xrange(stack_depth)]
            flow_stack = []
            for idx in frame_idx:
                x_name = '{}{:05d}.jpg'.format(args.flow_x_prefix, idx)
                y_name = '{}{:05d}.jpg'.format(args.flow_y_prefix, idx)
                flow_stack.append(cv2.imread(os.path.join(video_frame_path, x_name), cv2.IMREAD_GRAYSCALE))
                flow_stack.append(cv2.imread(os.path.join(video_frame_path, y_name), cv2.IMREAD_GRAYSCALE))
            scores = net.predict_single_flow_stack(flow_stack, score_name, frame_size=(340, 256))
            frame_scores.append(scores)

    print 'video {} done'.format(vid)
    sys.stdout.flush()
    return np.array(frame_scores), label
# multiprocessing
if args.num_worker > 1:
    pool = multiprocessing.Pool(args.num_worker, initializer=build_net)
    video_scores = pool.map(eval_video, eval_video_list)  # map eval_video over the video list
else:
    build_net()
    # after this map, video_scores has shape (num_videos, num_frame_per_video, num_crops, 101)
    video_scores = map(eval_video, eval_video_list)
# Fuse each video's (num_frame_per_video, num_crops, 101) scores (mean, max, etc.) into one prediction
video_pred = [np.argmax(default_aggregation_func(x[0])) for x in video_scores]
video_labels = [x[1] for x in video_scores]

cf = confusion_matrix(video_labels, video_pred).astype(float)

cls_cnt = cf.sum(axis=1)
cls_hit = np.diag(cf)

cls_acc = cls_hit/cls_cnt

print cls_acc

print 'Accuracy {:.02f}%'.format(np.mean(cls_acc)*100)

if args.save_scores is not None:
    np.savez(args.save_scores, scores=video_scores, labels=video_labels)
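The aggregation step deserves a closer look: for each video, the (num_frame_per_video, num_crops, 101) score array must be collapsed to a single 101-dimensional vector before the argmax. The real default_aggregation_func lives in pyActionRecog/utils/video_funcs.py; the sketch below is an assumption about its typical mean-then-softmax behavior, not the verbatim implementation:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def aggregate(score_arr):
    # score_arr: (num_frame_per_video, num_crops, 101) raw scores for one video
    # assumed fusion: average over crops, then over sampled frames, then softmax
    return softmax(score_arr.mean(axis=1).mean(axis=0))

scores = np.random.randn(25, 10, 101)
print(np.argmax(aggregate(scores)))  # predicted class index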

In eval_net.py, two member functions, predict_single_frame and predict_single_flow_stack, perform the multi-scale prediction. Taking predict_single_flow_stack as an example: it is defined in pyActionRecog/action_caffe.py, where the author defines a CaffeNet class; predict_single_flow_stack is a method of that class:

def predict_single_flow_stack(self, frame, score_name, over_sample=True, frame_size=None):

    if frame_size is not None:
        frame = fast_list2arr([cv2.resize(x, frame_size) for x in frame])
    else:
        frame = fast_list2arr(frame)
    # multi-scale: oversample crops of the flow stack
    if over_sample:
        os_frame = flow_stack_oversample(frame, (self._sample_shape[2], self._sample_shape[3]))
    else:
        os_frame = fast_list2arr([frame])

    # subtract the flow mean value (128)
    data = os_frame - np.float32(128.0)
    # reshape the network's input blob to match the oversampled batch
    self._net.blobs['data'].reshape(*data.shape)
    self._net.reshape()
    # forward pass to get the per-class scores
    out = self._net.forward(blobs=[score_name,], data=data)
    # the output has shape (num_crops, 101)
    return out[score_name].copy()
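flow_stack_oversample is the flow counterpart of Caffe's classic 10-crop oversampling: four corner crops plus the center crop, each also mirrored. A hedged numpy sketch of the idea (the corner/center layout follows standard oversampling; the flow-specific detail is that a horizontal flip must also invert the x-flow planes, stored around the 128 midpoint, and the even-index-for-x assumption is mine):

import numpy as np

def flow_stack_oversample_sketch(flow_stack, crop_dims):
    # flow_stack: (C, H, W), x planes assumed at even channel indices
    c, h, w = flow_stack.shape
    ch, cw = crop_dims
    starts = [(0, 0), (0, w - cw), (h - ch, 0), (h - ch, w - cw),
              ((h - ch) // 2, (w - cw) // 2)]
    crops = [flow_stack[:, y:y + ch, x:x + cw] for (y, x) in starts]
    mirrored = []
    for crop in crops:
        m = crop[:, :, ::-1].copy()  # horizontal flip
        m[0::2] = 255 - m[0::2]      # flipping reverses the sign of the x flow
        mirrored.append(m)
    return np.stack(crops + mirrored)  # (10, C, ch, cw)

stack = np.random.randint(0, 256, (10, 256, 340)).astype(np.float32)
print(flow_stack_oversample_sketch(stack, (224, 224)).shape)  # (10, 10, 224, 224)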
