- 在得到相对纯净代码之后,借助VS2013,首先对预测部分代码进行解析。如有分析不当的地方,欢迎指正。
- 本文分析的代码素材可在https://github.com/SimpleDoger/Darknet_CPU_Only找到,欢迎Watch、Star、Fork
- 限于篇幅,本文贴出的代码会略有删减
- 文中出现的流程图为yolov3的运行时函数调用流程图,yolov3没有使用到的相关函数并没有出现在流程图里面。
- 本文最后有简单的运行时性能分析
1. 总体
网络模型预测部分是一个完整的从输入前向传播到输出的过程。作为具体实现,应当包含这么几个过程:
- 加载标签等相关资源
- 加载网络模型
- 加载待检测图片
- 预测
- 输出结果
这里从执行命令:
./darknet detect cfg/yolov3.cfg yolov3.weights data/dog.jpg
开始分析得到darknet框架的具体实现及相关函数,总体如下图所示。
整个预测部分的代码总体逻辑放在函数test_detector里面(相关代码段意义已备注出)
/*
 * Run detection on one image, or on paths typed on stdin when filename is
 * NULL, draw the detections onto the image and save it.
 *
 * datacfg      data config file; its "names" option selects the label list
 *              (default "data/names.list")
 * cfgfile      network configuration file handed to load_network
 * weightfile   weights file handed to load_network
 * filename     image path; when NULL the user is prompted repeatedly until
 *              stdin reaches end-of-file
 * thresh       threshold passed to get_network_boxes and draw_detections
 * hier_thresh  hierarchical threshold passed to get_network_boxes
 * outfile      name passed to save_image; "predictions" is used when NULL
 * fullscreen   not used by this function
 */
void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen)
{
    /* Load the label-related resources. */
    list *options = read_data_cfg(datacfg);
    char *name_list = option_find_str(options, "names", "data/names.list");
    char **names = get_labels(name_list);
    image **alphabet = load_alphabet();

    /* Load the network model and force a batch size of 1. */
    network *net = load_network(cfgfile, weightfile, 0);
    set_batch_network(net, 1);

    double time;
    char buff[256];
    char *input = buff;
    float nms = .45;

    while (1) {
        if (filename) {
            /*
             * strncpy does not write a terminator when the source is as long
             * as the limit, so copy one byte less and terminate explicitly.
             */
            strncpy(input, filename, sizeof(buff) - 1);
            input[sizeof(buff) - 1] = '\0';
        } else {
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, sizeof(buff), stdin);
            if (!input) return;
            /*
             * Cut at the first newline. Unlike strtok(), this also handles a
             * line that consists of the newline only.
             */
            input[strcspn(input, "\n")] = '\0';
        }

        /* Load the image to run detection on. */
        image im = load_image_color(input, 0, 0);
        /* Letterbox the image to the network's input width and height. */
        image sized = letterbox_image(im, net->w, net->h);
        /* The last layer supplies the class count used below. */
        layer l = net->layers[net->n - 1];
        float *X = sized.data;

        /* Predict, timing the forward pass. */
        time = what_time_is_it_now();
        network_predict(net, X);
        printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now() - time);

        /* Collect the predicted boxes, then run do_nms_sort on them. */
        int nboxes = 0;
        detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);

        /* Draw the boxes onto the original (unresized) image. */
        draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
        free_detections(dets, nboxes);

        /* Write the annotated image to a file. */
        if (outfile) {
            save_image(im, outfile);
        } else {
            save_image(im, "predictions");
        }

        free_image(im);
        free_image(sized);

        /* A path given by the caller is processed exactly once. */
        if (filename) break;
    }
}
2. 文件加载、输入输出
这部分代码属于服务型代码,同时也占据了大部分代码篇幅。看起来很繁杂,其实相对较为简单。主要分为:
- 配置文件的读取
- 图片文件的读写
- 网络模型文件的加载
从coco.names里加载标签字符串由get_labels函数完成,网络模型的加载总体由load_network函数完成,parse_network_cfg函数读取yolov3.cfg文件并根据文件内的规则完成网络的内存分配。然后load_weights函数加载yolov3.weights权重文件。由于是验证,不牵涉到权重文件的更新。
用到的整个网络模型对象由make_network生成
/*
 * Allocate a zero-initialised network with room for n layers, plus the
 * separately allocated seen, t and cost scalars.
 *
 * Returns the new network. Allocation failure is reported on stderr and
 * terminates the process, instead of dereferencing a NULL pointer.
 */
network *make_network(int n)
{
    network *net = calloc(1, sizeof(network));
    if (!net) goto oom;

    net->n = n;
    net->layers = calloc(net->n, sizeof(layer));
    net->seen = calloc(1, sizeof(size_t));
    net->t = calloc(1, sizeof(int));
    net->cost = calloc(1, sizeof(float));

    /* calloc(0, ...) may legitimately return NULL, so only treat a NULL
     * layer array as a failure when layers were actually requested. */
    if ((!net->layers && n != 0) || !net->seen || !net->t || !net->cost) goto oom;

    return net;

oom:
    fprintf(stderr, "make_network: out of memory\n");
    exit(EXIT_FAILURE);
}
parse_network_cfg函数中会根据LAYER_TYPE的判断来实例化各个层,具体做法是调用对应的“parse_***(options, params);”函数,然后将得到的层加入到“net”中。
// Build a network from the configuration file `filename`.
// The listing is abridged: every "... ..." line marks code removed by the
// article's author, including the declarations of n, s, options, params,
// count and net, and whatever advances n inside the loop.
network *parse_network_cfg(char *filename)
{
// Read the configuration file into a list of sections.
list *sections = read_cfg(filename);
... ...
// Header line for the per-layer summary written to stderr.
fprintf(stderr, "layer filters size input output\n");
// Visit the sections one by one; each iteration creates one layer.
while(n){
params.index = count;
fprintf(stderr, "%5d ", count);
s = (section *)n->val;
options = s->options;
layer l = {0};
// Map the section's type string to a LAYER_TYPE and call the matching parser.
LAYER_TYPE lt = string_to_layer_type(s->type);
if(lt == CONVOLUTIONAL){
l = parse_convolutional(options, params);
}else if(lt == DECONVOLUTIONAL){
l = parse_deconvolutional(options, params);
}else if(lt == LOCAL){
l = parse_local(options, params);
... ...
}else if(lt == DROPOUT){
l = parse_dropout(options, params);
// A dropout layer reuses the previous layer's output and delta pointers.
// NOTE(review): this reads layers[count-1]; presumably a dropout section is never first — confirm.
l.output = net->layers[count-1].output;
l.delta = net->layers[count-1].delta;
}else{
// An unknown type is only reported; l stays zero-initialised in this branch.
fprintf(stderr, "Type not recognized: %s\n", s->type);
}
... ...
}
... ...
return net;
}
例如parse_convolutional函数:
/*
 * Create a convolutional layer from the options of one [convolutional]
 * section.
 *
 * options  key/value list of the section
 * params   size information for this layer (h, w, c, batch) and the network
 *          being built
 *
 * Returns the layer produced by make_convolutional_layer, with its flipped
 * and dot fields filled in from the options.
 */
convolutional_layer parse_convolutional(list *options, size_params params)
{
    /* The lookups stay in their original order; the option readers may log. */
    int filters = option_find_int(options, "filters",1);
    int ksize = option_find_int(options, "size",1);
    int step = option_find_int(options, "stride",1);
    int pad_flag = option_find_int_quiet(options, "pad",0);
    int pad_amount = option_find_int_quiet(options, "padding",0);
    int group_count = option_find_int_quiet(options, "groups", 1);

    /* A non-zero "pad" overrides "padding" with half the kernel size. */
    if (pad_flag) {
        pad_amount = ksize/2;
    }

    char *act_name = option_find_str(options, "activation", "logistic");
    ACTIVATION act = get_activation(act_name);

    int in_h = params.h;
    int in_w = params.w;
    int in_c = params.c;
    int batch_size = params.batch;

    /* All three input dimensions must be non-zero. */
    if (in_h == 0 || in_w == 0 || in_c == 0) {
        error("Layer before convolutional layer must output image.");
    }

    int use_bn = option_find_int_quiet(options, "batch_normalize", 0);
    int use_binary = option_find_int_quiet(options, "binary", 0);
    int use_xnor = option_find_int_quiet(options, "xnor", 0);

    convolutional_layer conv = make_convolutional_layer(batch_size, in_h, in_w, in_c,
                                                        filters, group_count, ksize, step,
                                                        pad_amount, act, use_bn,
                                                        use_binary, use_xnor,
                                                        params.net->adam);

    conv.flipped = option_find_int_quiet(options, "flipped", 0);
    conv.dot = option_find_float_quiet(options, "dot", 0);
    return conv;
}
里面前面部分是对[convolutional]字段相关参数的读取。
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky
layer的具体对象由make_convolutional_layer生成。
// Construct a convolutional layer and allocate its buffers.
// The listing is abridged: every "... ..." line marks code removed by the
// article's author, including the assignments of the size fields
// (l.nweights, l.inputs, l.outputs, l.out_w, ...) that the code below reads.
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
{
int i;
convolutional_layer l = {0};
l.type = CONVOLUTIONAL;
... ...
l.update = update_convolutional_layer;
// Extra buffers for binary weights.
if(binary){
l.binary_weights = calloc(l.nweights, sizeof(float));
l.cweights = calloc(l.nweights, sizeof(char));
l.scales = calloc(n, sizeof(float));
}
// Extra buffers for xnor mode.
// NOTE(review): binary_weights is also allocated in the binary branch above; with both flags set the first pointer looks overwritten — confirm.
if(xnor){
l.binary_weights = calloc(l.nweights, sizeof(float));
l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
}
// Batch-normalisation buffers; every scale starts at 1.
// NOTE(review): l.scales is allocated here and in the binary branch — same overwrite concern.
if(batch_normalize){
l.scales = calloc(n, sizeof(float));
l.scale_updates = calloc(n, sizeof(float));
for(i = 0; i < n; ++i){
l.scales[i] = 1;
}
l.mean = calloc(n, sizeof(float));
... ...
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
}
// Additional buffers allocated only when adam is set.
if(adam){
l.m = calloc(l.nweights, sizeof(float));
... ...
l.scale_v = calloc(n, sizeof(float));
}
... ...
// One summary line per layer on stderr; the last field is
// 2 * n * size * size * c/groups * out_h * out_w, divided by 1e9.
fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);
return l;
}
大部分都是内存的分配以及相关参数的初始化。
其它层同理。
从源码可以看出,Darknet支持的网络层格式有:
CONVOLUTIONAL, DECONVOLUTIONAL, CONNECTED, MAXPOOL, SOFTMAX, DETECTION,
DROPOUT, CROP, ROUTE, COST, NORMALIZATION, AVGPOOL, LOCAL, SHORTCUT,
ACTIVE, RNN, GRU, LSTM, CRNN, BATCHNORM, NETWORK, XNOR, REGION, YOLO,
ISEG, REORG, UPSAMPLE, LOGXENT, L2NORM, BLANK
当net对象在内存中实例化完成后,读取yolov3.weights文件,完成“net”网络里参数的加载。主要实现函数是load_weights_upto,通过对网络层类型(type)的判断,决定是load_convolutional_weights、load_batchnorm_weights还是load_connected_weights。
// Load weights from `filename` into layers [start, min(net->n, cutoff)) of net.
// The listing is abridged: every "... ..." line marks code removed by the
// article's author, including the declarations of major, minor and revision.
void load_weights_upto(network *net, char *filename, int start, int cutoff)
{
fprintf(stderr, "Loading weights from %s...", filename);
// NOTE(review): the message above goes to stderr but stdout is flushed here — confirm intent.
fflush(stdout);
FILE *fp = fopen(filename, "rb");
if(!fp) file_error(filename);
... ...
// File header: three ints (major, minor, revision) followed by the "seen" counter.
// NOTE(review): none of the fread return values in this function are checked.
fread(&major, sizeof(int), 1, fp);
fread(&minor, sizeof(int), 1, fp);
fread(&revision, sizeof(int), 1, fp);
// The counter is stored as a size_t when major*10 + minor >= 2 (and both are
// below 1000), otherwise as an int.
if ((major*10 + minor) >= 2 && major < 1000 && minor < 1000){
fread(net->seen, sizeof(size_t), 1, fp);
} else {
int iseen = 0;
fread(&iseen, sizeof(int), 1, fp);
*net->seen = iseen;
}
// Passed to load_connected_weights; set when either version field exceeds 1000.
int transpose = (major > 1000) || (minor > 1000);
int i;
for(i = start; i < net->n && i < cutoff; ++i){
// l is a by-value copy of the layer struct.
layer l = net->layers[i];
// Layers flagged dontload are skipped entirely.
if (l.dontload) continue;
// Pick the loader(s) that match the layer type.
if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
load_convolutional_weights(l, fp);
}
if(l.type == CONNECTED){
load_connected_weights(l, fp, transpose);
}
if(l.type == BATCHNORM){
load_batchnorm_weights(l, fp);
}
// Composite layers load the weights of their sub-layers.
if(l.type == CRNN){
load_convolutional_weights(*(l.input_layer), fp);
... ...
}
if(l.type == RNN){
load_connected_weights(*(l.input_layer), fp, transpose);
... ...
}
if (l.type == LSTM) {
load_connected_weights(*(l.wi), fp, transpose);
... ...
}
if (l.type == GRU) {
// The condition is the constant 1, so the else branch never runs.
if(1){
load_connected_weights(*(l.wz), fp, transpose);
... ...
}else{
load_connected_weights(*(l.reset_layer), fp, transpose);
... ...
}
}
if(l.type == LOCAL){
int locations = l.out_w*l.out_h;
... ...
}
}
fprintf(stderr, "Done!\n");
fclose(fp);
}
以load_convolutional_weights为例,其实实现的也就是将数据赋值到对应的层。
/*
 * Read `count` floats from fp into dst.
 * Returns 1 on a complete read; otherwise prints a warning naming `what`
 * on stderr and returns 0.
 */
static int read_floats_checked(float *dst, size_t count, FILE *fp, const char *what)
{
    size_t got = fread(dst, sizeof(float), count, fp);
    if (got != count) {
        fprintf(stderr, "load_convolutional_weights: short read of %s (%lu of %lu floats)\n",
                what, (unsigned long)got, (unsigned long)count);
        return 0;
    }
    return 1;
}

/*
 * Fill a convolutional layer's buffers from the weights file, in file order:
 * biases, then (for batch-normalised layers unless dontloadscales is set)
 * scales, rolling mean and rolling variance, then the kernel weights.
 *
 * l   layer to fill; passed by value, so the numload override of l.n stays
 *     local while the data is written through the layer's buffer pointers
 * fp  weights file, positioned at this layer's data
 *
 * A short read is reported on stderr and stops loading this layer, instead
 * of silently leaving it half-initialised.
 */
void load_convolutional_weights(layer l, FILE *fp)
{
    if (l.numload) l.n = l.numload;
    int num = l.c/l.groups*l.n*l.size*l.size;

    if (!read_floats_checked(l.biases, l.n, fp, "biases")) return;

    if (l.batch_normalize && (!l.dontloadscales)) {
        if (!read_floats_checked(l.scales, l.n, fp, "scales")) return;
        if (!read_floats_checked(l.rolling_mean, l.n, fp, "rolling_mean")) return;
        if (!read_floats_checked(l.rolling_variance, l.n, fp, "rolling_variance")) return;
    }

    if (!read_floats_checked(l.weights, num, fp, "weights")) return;

    /* Weights of layers marked flipped are transposed after loading. */
    if (l.flipped) {
        transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
    }
}
接下来就是图片的读取与保存,通过load_image_color函数读取图片到image结构体。save_image保存到jpg图片。
/* Image with float pixel data. */
typedef struct {
int w; /* width in pixels */
int h; /* height in pixels */
int c; /* number of channels */
/* Pixel values. load_image_stb stores them one channel after another, at index x + w*y + w*h*channel, scaled by 1/255. */
float *data;
} image;
从图片文件中加载到一个image结构体的具体实现代码是load_image_stb
/*
 * Load an image file through stb_image and convert it to an `image`.
 *
 * filename  path of the image file
 * channels  number of channels to request from stbi_load; 0 keeps the
 *           channel count reported for the file
 *
 * Returns an image whose data is stored one channel after another, with
 * every byte value divided by 255. When the file cannot be loaded, the
 * reason is printed on stderr and the process exits with a failure status
 * (the earlier exit(0) reported success to the caller's shell or script).
 */
image load_image_stb(char *filename, int channels)
{
    int w, h, c;
    unsigned char *data = stbi_load(filename, &w, &h, &c, channels);
    if (!data) {
        fprintf(stderr, "Cannot load image \"%s\"\nSTB Reason: %s\n", filename, stbi_failure_reason());
        exit(EXIT_FAILURE);
    }
    /* When a channel count was requested, use it in place of the reported one. */
    if (channels) c = channels;

    int i, j, k;
    image im = make_image(w, h, c);
    for (k = 0; k < c; ++k) {
        for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
                /* Destination: one full plane per channel. */
                int dst_index = i + w*j + w*h*k;
                /* Source: channel values interleaved per pixel. */
                int src_index = k + c*i + c*w*j;
                im.data[dst_index] = (float)data[src_index]/255.;
            }
        }
    }
    free(data);
    return im;
}
比较有意思的是,这样的话,image结构体的“data”里面的数据将按通道连续存放,即“RR...R GG...G BB...B”,而不是stbi_load返回的“RGB RGB ... RGB”交错排列。
整个流程图如下:
3. 网络预测
前面做了那么多的铺垫,整个预测框架的核心函数其实也就两个:
forward_network (network *net)
get_network_boxes (network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
其中forward_network函数写得很巧妙。个人认为“l.forward(l, net);”这句是整个代码中最有灵魂的一句。将函数指针作为成员放到结构体里面,颠覆了我结构体里面只能放数据这一认识。这一句将根据网络层的不同,选择对应的前向传播函数。
/*
 * Run every layer of the network once, in order, then compute the network
 * cost.
 *
 * netp  network to run. The loop works on a by-value copy of *netp, so the
 *       index/input/truth updates below change that copy only; the original
 *       pointer is handed to calc_network_cost at the end.
 */
void forward_network(network *netp)
{
    network state = *netp;
    int idx;

    for (idx = 0; idx < state.n; ++idx) {
        state.index = idx;
        layer cur = state.layers[idx];

        /* Zero the layer's delta buffer when it has one. */
        if (cur.delta) {
            fill_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
        }

        /* Dispatch through the layer's own forward function pointer. */
        cur.forward(cur, state);

        /* This layer's output is the next layer's input. */
        state.input = cur.output;
        if (cur.truth) {
            state.truth = cur.output;
        }
    }

    calc_network_cost(netp);
}
这种做法有点像C++里面的虚函数,在实例化后才知道具体的实现函数。作为forward(struct layer, struct network);这个“虚函数”,它的响应函数有如下26种:
l.forward = forward_activation_layer;
l.forward = forward_avgpool_layer;
l.forward = forward_batchnorm_layer;
l.forward = forward_connected_layer;
l.forward = forward_convolutional_layer;
l.forward = forward_cost_layer;
l.forward = forward_crnn_layer;
l.forward = forward_crop_layer;
l.forward = forward_deconvolutional_layer;
l.forward = forward_detection_layer;
l.forward = forward_dropout_layer;
l.forward = forward_gru_layer;
l.forward = forward_iseg_layer;
l.forward = forward_l2norm_layer;
l.forward = forward_local_layer;
l.forward = forward_logistic_layer;
l.forward = forward_lstm_layer;
l.forward = forward_maxpool_layer;
l.forward = forward_region_layer;
l.forward = forward_reorg_layer;
l.forward = forward_rnn_layer;
l.forward = forward_route_layer;
l.forward = forward_shortcut_layer;
l.forward = forward_softmax_layer;
l.forward = forward_upsample_layer;
l.forward = forward_yolo_layer;
同样,以forward_convolutional_layer为例:
/*
 * Forward pass of a convolutional layer: for every batch item and group,
 * expand the input with im2col_cpu (skipped for 1x1 kernels) and hand it to
 * gemm together with the group's weights, then apply batch normalisation
 * or the bias, and finally the activation.
 *
 * l    layer to run (passed by value)
 * net  network state; supplies the input and the im2col workspace
 */
void forward_convolutional_layer(convolutional_layer l, network net)
{
    int b, g;

    /* Start from a zeroed output buffer. */
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /* xnor mode: binarize weights and input, and read from the binarized input. */
    if (l.xnor) {
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    /* Dimensions handed to gemm. */
    int rows = l.n/l.groups;
    int inner = l.size*l.size*l.c/l.groups;
    int cols = l.out_w*l.out_h;

    for (b = 0; b < l.batch; ++b) {
        for (g = 0; g < l.groups; ++g) {
            float *group_weights = l.weights + g*l.nweights/l.groups;
            float *dst = l.output + (b*l.groups + g)*cols*rows;
            float *src = net.input + (b*l.groups + g)*l.c/l.groups*l.h*l.w;

            /* A 1x1 kernel uses the input directly; otherwise expand it into the workspace. */
            float *expanded = src;
            if (l.size != 1) {
                expanded = net.workspace;
                im2col_cpu(src, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, expanded);
            }

            gemm(0,0,rows,cols,inner,1,group_weights,inner,expanded,cols,1,dst,cols);
        }
    }

    if (!l.batch_normalize) {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    } else {
        forward_batchnorm_layer(l, net);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);

    /* NOTE(review): the swap above runs only for xnor, this one also for binary — confirm that asymmetry is intended. */
    if (l.binary || l.xnor) swap_binary(&l);
}
其中im2col_cpu是将图拉成一个矩阵,可将卷积操作转成两个矩阵相乘。gemm函数就是两个矩阵相乘的实现函数。当启用批归一化(batch_normalize)时,交由forward_batchnorm_layer进行归一化;否则使用add_bias添加偏置项。最终,由activate_array采用对应的激活函数进行激活。整体流程图如下:
get_network_boxes函数实现的是预测框的获取,先交由make_network_boxes,根据阈值,得到预测框的个数,并实例化detection *dets,然后传入fill_network_boxes得到每个预测框的相关信息。完成后,通过do_nms_sort做非极大值抑制(NMS)。最终,draw_detections画出结果。整体流程如下:
4. 性能分析
整个性能测试在i7-7700K(单核4.2GHz)下完成,用CPU的占用比来评价相对时间占用可能并不严谨。但是从中依然可以得到的结论是:
在预测阶段,绝大部分耗费的时间都是在大矩阵乘法上
因此,在性能优化上来讲,主要目标也就是对矩阵乘法函数gemm函数的优化。这也从侧面证明了并行计算架构的GPU优越性所在。
顺带一提的是,在Linux系统下,预测时间180秒左右,而在Windows下要193秒左右。
从上图分析报告中可以看出,在yolov3的预测中,gemm函数占了90%以上,其次是im2col函数,然后是load_network。
最后,知道该优化哪里了吧。。。再多的奇技淫巧用在其它地方,可能也收效甚微。
----- 逃 -----