最近项目中会频繁用到yolov3这个目标检测算法框架。由于其在速度和精度、尤其是小物体检测的能力上都比较突出,所以目前应用面很广泛。在应用yolov3的过程中经常会遇到一些算法上的疑点,而之前又没有好好学习过darknet这个轻量级DL算法框架,所以决定从yolov3入手,理清darknet以及yolov3的一些概念,查漏补缺并纠正之前可能错误的理解。
git clone https://github.com/pjreddie/darknet
cd darknet && make # build darknet; for GPU / OpenCV support build with GPU=1 CUDNN=1 OPENCV=1 (e.g. `make GPU=1 CUDNN=1 OPENCV=1`)
mkdir -p model # folder that holds darknet model files; stay in the darknet root so ./darknet and cfg/ remain reachable
wget -P model https://pjreddie.com/media/files/yolov3.weights # download the yolov3 weights trained on COCO into model/
./darknet detector test cfg/coco.data cfg/yolov3.cfg model/yolov3.weights data/dog.jpg # load the yolov3 cfg and weights and run detection on one image
# yolov3 log 从36层截取:0-74层共75层,其中52个conv layer(即Darknet-53去掉最后的全连接层),其余23层都是res layer即shortcut操作;75-106层为yolov3的特征交互层,分为三种尺度
layer filters size input output
36 res 33 52 x 52 x 256 -> 52 x 52 x 256
37 conv 512 3 x 3 / 2 52 x 52 x 256 -> 26 x 26 x 512 1.595 BFLOPs
38 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
39 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
40 res 37 26 x 26 x 512 -> 26 x 26 x 512
41 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
42 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
43 res 40 26 x 26 x 512 -> 26 x 26 x 512
44 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
45 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
46 res 43 26 x 26 x 512 -> 26 x 26 x 512
47 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
48 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
49 res 46 26 x 26 x 512 -> 26 x 26 x 512
50 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
51 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
52 res 49 26 x 26 x 512 -> 26 x 26 x 512
53 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
54 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
55 res 52 26 x 26 x 512 -> 26 x 26 x 512
56 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
57 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
58 res 55 26 x 26 x 512 -> 26 x 26 x 512
59 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
60 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
61 res 58 26 x 26 x 512 -> 26 x 26 x 512
62 conv 1024 3 x 3 / 2 26 x 26 x 512 -> 13 x 13 x1024 1.595 BFLOPs
63 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
64 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
65 res 62 13 x 13 x1024 -> 13 x 13 x1024
66 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
67 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
68 res 65 13 x 13 x1024 -> 13 x 13 x1024
69 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
70 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
71 res 68 13 x 13 x1024 -> 13 x 13 x1024
72 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
73 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
74 res 71 13 x 13 x1024 -> 13 x 13 x1024
75 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
76 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
77 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
78 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
79 conv 512 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 512 0.177 BFLOPs
80 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024 1.595 BFLOPs
81 conv 255 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 255 0.088 BFLOPs
82 yolo # small尺寸的特征图 13*13*(3*(5+80))
83 route 79
84 conv 256 1 x 1 / 1 13 x 13 x 512 -> 13 x 13 x 256 0.044 BFLOPs
85 upsample 2x 13 x 13 x 256 -> 26 x 26 x 256 # 对当前特征层进行上采样
86 route 85 61 # concat 85和61层 起到特征合并的作用 类似FPN的思想
87 conv 256 1 x 1 / 1 26 x 26 x 768 -> 26 x 26 x 256 0.266 BFLOPs
88 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
89 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
90 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
91 conv 256 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 256 0.177 BFLOPs
92 conv 512 3 x 3 / 1 26 x 26 x 256 -> 26 x 26 x 512 1.595 BFLOPs
93 conv 255 1 x 1 / 1 26 x 26 x 512 -> 26 x 26 x 255 0.177 BFLOPs
94 yolo # middle尺寸的特征图 26*26*(3*(5+80))
95 route 91
96 conv 128 1 x 1 / 1 26 x 26 x 256 -> 26 x 26 x 128 0.044 BFLOPs
97 upsample 2x 26 x 26 x 128 -> 52 x 52 x 128 # 上采样
98 route 97 36 # concat 97和36层
99 conv 128 1 x 1 / 1 52 x 52 x 384 -> 52 x 52 x 128 0.266 BFLOPs
100 conv 256 3 x 3 / 1 52 x 52 x 128 -> 52 x 52 x 256 1.595 BFLOPs
101 conv 128 1 x 1 / 1 52 x 52 x 256 -> 52 x 52 x 128 0.177 BFLOPs
102 conv 256 3 x 3 / 1 52 x 52 x 128 -> 52 x 52 x 256 1.595 BFLOPs
103 conv 128 1 x 1 / 1 52 x 52 x 256 -> 52 x 52 x 128 0.177 BFLOPs
104 conv 256 3 x 3 / 1 52 x 52 x 128 -> 52 x 52 x 256 1.595 BFLOPs
105 conv 255 1 x 1 / 1 52 x 52 x 256 -> 52 x 52 x 255 0.353 BFLOPs
106 yolo # large尺寸的特征图 52*52*(3*(5+80))
Loading weights from model/yolov3.weights...Done!
data/dog.jpg: Predicted in 0.024054 seconds. # GTX 1080Ti 上的推理时间
# 图像中类别和置信度
dog: 99%
truck: 92%
bicycle: 99%
//example/darknet.c main函数
} else if (0 == strcmp(argv[1], "detector")){
run_detector(argc, argv);
//example/detector.c run_detector函数
if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile, fullscreen); // 根据系统参数配置网络输入文件信息thresh=0.5, hier_thresh=0.5(看代码不知道这个参数是否用到,后面再分析吧),outfile=null fullscreen=0
//example/detector.c test_detector函数
/*
 * Run detection on a single image (when `filename` is non-NULL) or on image
 * paths typed interactively on stdin (when `filename` is NULL).
 *
 * datacfg     - data config file; its "names" option points at the label list
 * cfgfile     - network cfg file
 * weightfile  - weights file
 * filename    - image path, or NULL to prompt for paths until stdin ends
 * thresh      - threshold forwarded to get_network_boxes and draw_detections
 * hier_thresh - hierarchy threshold forwarded to get_network_boxes
 * outfile     - name used by save_image; "predictions" is used when NULL
 * fullscreen  - not referenced in this function
 */
void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen)
{
    /* Load the class labels named by the "names" option of the data cfg. */
    list *options = read_data_cfg(datacfg);
    char *name_list = option_find_str(options, "names", "data/names.list");
    char **names = get_labels(name_list);

    /* Per the original notes: labels are rendered from pre-made glyph images
     * rather than drawn as text, hence the alphabet. */
    image **alphabet = load_alphabet();

    /* Build the network from the cfg and load its weights. */
    network *net = load_network(cfgfile, weightfile, 0);
    set_batch_network(net, 1); /* inference uses batch size 1 */
    srand(2222222);
    double time;
    char buff[256];
    char *input = buff;
    float nms=.45;
    while(1){
        if(filename){
            /* snprintf always NUL-terminates; strncpy would leave the buffer
             * unterminated for paths of 256 bytes or more. */
            snprintf(input, sizeof(buff), "%s", filename);
        } else {
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;
            strtok(input, "\n"); /* drop the trailing newline kept by fgets */
        }

        /* Per the original notes the preprocessing is: scale to [0,1],
         * aspect-preserving resize centred on a padded canvas (letterbox),
         * BGR->RGB and NHWC->NCHW -- details live in the helpers called here. */
        image im = load_image_color(input,0,0);
        image sized = letterbox_image(im, net->w, net->h);

        /* The last layer is only used for its `classes` field below. */
        layer l = net->layers[net->n-1];

        float *X = sized.data;
        time=what_time_is_it_now();
        network_predict(net, X);
        printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now()-time);

        /* Collect candidate boxes, run NMS, then draw the survivors on the
         * original (un-letterboxed) image. */
        int nboxes = 0;
        detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
        draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
        free_detections(dets, nboxes);

        if(outfile){
            save_image(im, outfile);
        }
        else{
            save_image(im, "predictions");
#ifdef OPENCV
            make_window("predictions", 512, 512, 0);
            show_image(im, "predictions", 0);
#endif
        }

        free_image(im);
        free_image(sized);
        if (filename) break; /* single-image mode: one pass only */
    }
}
//src/network.c load_network函数
/*
 * Build a network from a cfg file and optionally load weights into it.
 *
 * cfg     - path of the network cfg file
 * weights - path of the weights file; skipped when NULL or an empty string
 * clear   - when non-zero, reset the network's `seen` counter to 0
 *           (per the original notes, `seen` counts images processed so far)
 *
 * Returns the network produced by parse_network_cfg.
 */
network *load_network(char *cfg, char *weights, int clear)
{
    network *net = parse_network_cfg(cfg);
    int have_weights = weights && weights[0] != 0;

    if (have_weights) {
        load_weights(net, weights);
    }
    if (clear) {
        *net->seen = 0;
    }
    return net;
}
//src/parser.c parse_network_cfg函数
/*
 * Parse a network cfg file into a network: the first section supplies the
 * global options, every following section becomes one layer. After the
 * layers are built, the network input/truth buffers and the shared
 * workspace are allocated. Returns the constructed network.
 */
network *parse_network_cfg(char *filename)
{
/* read_cfg (original author's understanding): each cfg section becomes a node
of a linked list, with the node's val pointing at a section.
Data structures involved:
typedef struct{
char *type; // layer name
list *options; // presumably the layer's option fields -- TODO confirm
}section;
typedef struct node{
void *val; // the section held by this node
struct node *next;
struct node *prev;
} node;
typedef struct list{
int size; // number of nodes in the list
node *front;
node *back;
} list;
*/
list *sections = read_cfg(filename);
node *n = sections->front;
if(!n) error("Config file has no sections");
network *net = make_network(sections->size - 1); // one layer per section except the leading [net] section
net->gpu_index = gpu_index;
size_params params;
section *s = (section *)n->val;
list *options = s->options;
if(!is_network(s)) error("First section must be [net] or [network]");
parse_net_options(options, net); // fill in the network-wide options
// Seed the per-layer parameters with the network input geometry.
params.h = net->h;
params.w = net->w;
params.c = net->c;
params.inputs = net->inputs;
params.batch = net->batch;
params.time_steps = net->time_steps;
params.net = net;
size_t workspace_size = 0;
n = n->next;
int count = 0;
free_section(s);
fprintf(stderr, "layer filters size input output\n");
while(n){ // build one layer per remaining section, dispatching on the section type
params.index = count;
fprintf(stderr, "%5d ", count);
s = (section *)n->val;
options = s->options;
layer l = {0};
LAYER_TYPE lt = string_to_layer_type(s->type);
if(lt == CONVOLUTIONAL){
l = parse_convolutional(options, params);
}else if(lt == DECONVOLUTIONAL){
l = parse_deconvolutional(options, params);
}else if(lt == LOCAL){
l = parse_local(options, params);
}else if(lt == ACTIVE){
l = parse_activation(options, params);
}else if(lt == LOGXENT){
l = parse_logistic(options, params);
}else if(lt == L2NORM){
l = parse_l2norm(options, params);
}else if(lt == RNN){
l = parse_rnn(options, params);
}else if(lt == GRU){
l = parse_gru(options, params);
}else if (lt == LSTM) {
l = parse_lstm(options, params);
}else if(lt == CRNN){
l = parse_crnn(options, params);
}else if(lt == CONNECTED){
l = parse_connected(options, params);
}else if(lt == CROP){
l = parse_crop(options, params);
}else if(lt == COST){
l = parse_cost(options, params);
}else if(lt == REGION){
l = parse_region(options, params);
}else if(lt == YOLO){ // the yolo layer used by yolov3
l = parse_yolo(options, params);
}else if(lt == ISEG){
l = parse_iseg(options, params);
}else if(lt == DETECTION){
l = parse_detection(options, params);
}else if(lt == SOFTMAX){
l = parse_softmax(options, params);
net->hierarchy = l.softmax_tree;
}else if(lt == NORMALIZATION){
l = parse_normalization(options, params);
}else if(lt == BATCHNORM){
l = parse_batchnorm(options, params);
}else if(lt == MAXPOOL){
l = parse_maxpool(options, params);
}else if(lt == REORG){
l = parse_reorg(options, params);
}else if(lt == AVGPOOL){
l = parse_avgpool(options, params);
}else if(lt == ROUTE){
l = parse_route(options, params, net);
}else if(lt == UPSAMPLE){
l = parse_upsample(options, params, net);
}else if(lt == SHORTCUT){
l = parse_shortcut(options, params, net);
}else if(lt == DROPOUT){
l = parse_dropout(options, params);
// Dropout shares the previous layer's output/delta buffers.
// NOTE(review): with count == 0 this reads net->layers[-1] -- confirm a cfg can never start with dropout.
l.output = net->layers[count-1].output;
l.delta = net->layers[count-1].delta;
#ifdef GPU
l.output_gpu = net->layers[count-1].output_gpu;
l.delta_gpu = net->layers[count-1].delta_gpu;
#endif
}else{
// NOTE(review): parsing continues after this message, so the zero-initialised layer is still stored below.
fprintf(stderr, "Type not recognized: %s\n", s->type);
}
// Options read for every layer regardless of its type.
l.clip = net->clip;
l.truth = option_find_int_quiet(options, "truth", 0);
l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
l.dontsave = option_find_int_quiet(options, "dontsave", 0);
l.dontload = option_find_int_quiet(options, "dontload", 0);
l.numload = option_find_int_quiet(options, "numload", 0);
l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1);
l.smooth = option_find_float_quiet(options, "smooth", 0);
option_unused(options);
net->layers[count] = l;
if (l.workspace_size > workspace_size) workspace_size = l.workspace_size; // track the largest workspace any layer asks for
free_section(s);
n = n->next;
++count;
if(n){ // the next layer's input shape is this layer's output shape
params.h = l.out_h;
params.w = l.out_w;
params.c = l.out_c;
params.inputs = l.outputs;
}
}
free_list(sections);
layer out = get_network_output_layer(net); // the layer treated as the network output
net->outputs = out.outputs;
net->truths = out.outputs;
if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths; // the last layer's own truth count wins when non-zero
net->output = out.output;
net->input = calloc(net->inputs*net->batch, sizeof(float)); // buffers for the network input and the ground truth
net->truth = calloc(net->truths*net->batch, sizeof(float));
#ifdef GPU
net->output_gpu = out.output_gpu;
net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
#endif
if(workspace_size){
//printf("%ld\n", workspace_size);
#ifdef GPU
if(gpu_index >= 0){
net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1); // byte size rounded up to a whole number of floats
}else {
net->workspace = calloc(1, workspace_size);
}
#else
net->workspace = calloc(1, workspace_size);
#endif
}
return net;
}
//src/parser.c load_weights和load_weights_upto函数
/*
 * Load layer weights from a darknet weights file into `net`.
 *
 * net      - network whose layers receive the weights
 * filename - path of the weights file
 * start    - index of the first layer to load
 * cutoff   - loading stops before this layer index (and before net->n)
 *
 * The file starts with three ints (major, minor, revision) followed by the
 * `seen` counter; weights then follow in layer order. Layers flagged
 * `dontload`, and layer types not listed below, consume nothing from the file.
 * If the header cannot be read, an error is reported and nothing is loaded.
 */
void load_weights_upto(network *net, char *filename, int start, int cutoff)
{
#ifdef GPU
    if(net->gpu_index >= 0){
        cuda_set_device(net->gpu_index);
    }
#endif
    fprintf(stderr, "Loading weights from %s...", filename);
    fflush(stdout);
    FILE *fp = fopen(filename, "rb");
    if(!fp) file_error(filename);

    /* Check the header reads: an unchecked short read would leave these
     * variables uninitialised and make the version tests below undefined. */
    int major = 0;
    int minor = 0;
    int revision = 0;
    int header_ok = fread(&major, sizeof(int), 1, fp) == 1
                 && fread(&minor, sizeof(int), 1, fp) == 1
                 && fread(&revision, sizeof(int), 1, fp) == 1;
    if (header_ok) {
        /* The version decides whether `seen` is stored as size_t or as int. */
        if ((major*10 + minor) >= 2 && major < 1000 && minor < 1000){
            header_ok = fread(net->seen, sizeof(size_t), 1, fp) == 1;
        } else {
            int iseen = 0;
            header_ok = fread(&iseen, sizeof(int), 1, fp) == 1;
            if (header_ok) *net->seen = iseen;
        }
    }
    if (!header_ok) {
        fprintf(stderr, "\nCouldn't read weights header from %s\n", filename);
        fclose(fp);
        return;
    }

    /* Forwarded to load_connected_weights for connected-type layers. */
    int transpose = (major > 1000) || (minor > 1000);

    int i;
    for(i = start; i < net->n && i < cutoff; ++i){
        layer l = net->layers[i];
        if (l.dontload) continue;
        if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
            load_convolutional_weights(l, fp);
        }
        if(l.type == CONNECTED){
            load_connected_weights(l, fp, transpose);
        }
        if(l.type == BATCHNORM){
            load_batchnorm_weights(l, fp);
        }
        if(l.type == CRNN){
            load_convolutional_weights(*(l.input_layer), fp);
            load_convolutional_weights(*(l.self_layer), fp);
            load_convolutional_weights(*(l.output_layer), fp);
        }
        if(l.type == RNN){
            load_connected_weights(*(l.input_layer), fp, transpose);
            load_connected_weights(*(l.self_layer), fp, transpose);
            load_connected_weights(*(l.output_layer), fp, transpose);
        }
        if (l.type == LSTM) {
            load_connected_weights(*(l.wi), fp, transpose);
            load_connected_weights(*(l.wf), fp, transpose);
            load_connected_weights(*(l.wo), fp, transpose);
            load_connected_weights(*(l.wg), fp, transpose);
            load_connected_weights(*(l.ui), fp, transpose);
            load_connected_weights(*(l.uf), fp, transpose);
            load_connected_weights(*(l.uo), fp, transpose);
            load_connected_weights(*(l.ug), fp, transpose);
        }
        if (l.type == GRU) {
            /* The former `if(1) ... else` alternative (reset/update/state
             * layers) was unreachable and has been dropped. */
            load_connected_weights(*(l.wz), fp, transpose);
            load_connected_weights(*(l.wr), fp, transpose);
            load_connected_weights(*(l.wh), fp, transpose);
            load_connected_weights(*(l.uz), fp, transpose);
            load_connected_weights(*(l.ur), fp, transpose);
            load_connected_weights(*(l.uh), fp, transpose);
        }
        if(l.type == LOCAL){
            /* Local layers keep a separate filter set per output location. */
            int locations = l.out_w*l.out_h;
            int size = l.size*l.size*l.c*l.n*locations;
            fread(l.biases, sizeof(float), l.outputs, fp);
            fread(l.weights, sizeof(float), size, fp);
#ifdef GPU
            if(gpu_index >= 0){
                push_local_layer(l);
            }
#endif
        }
    }
    fprintf(stderr, "Done!\n");
    fclose(fp);
}
//src/network.c get_network_boxes函数
/*
在解析yolov3数据这个部分涉及的数据结构:
typedef struct{
float x, y, w, h; // 存放bbox的中心点坐标和w、h
} box;
typedef struct detection{
box bbox; // bbox的坐标信息
int classes; // 类别个数
float *prob; // 类别置信度数组
float *mask;
float objectness; // 目标置信度(单个标量,不是数组)
int sort_class; // nms排序时当前用于比较的类别下标(见do_nms_sort和nms_comparator)
} detection;
*/
/*
 * Allocate the detection array for the current network outputs and fill it.
 *
 * net      - network whose outputs are decoded
 * w, h     - size of the original image
 * thresh   - threshold used when counting and filling boxes
 * hier     - hierarchy threshold, forwarded to fill_network_boxes
 * map      - forwarded to fill_network_boxes (0 in the test_detector call)
 * relative - forwarded to fill_network_boxes (1 in the test_detector call)
 * num      - out: number of detections allocated by make_network_boxes
 *
 * Returns the filled detection array.
 */
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
{
    detection *boxes = make_network_boxes(net, thresh, num);

    fill_network_boxes(net, w, h, thresh, hier, map, relative, boxes);
    return boxes;
}
//src/network.c make_network_boxes函数
/*
 * Allocate one zeroed detection per box reported by num_detections.
 *
 * Each detection gets a `prob` array sized by the last layer's `classes`
 * and, when that layer's `coords` exceeds 4, a `mask` array of coords-4 floats.
 *
 * net    - network to inspect
 * thresh - threshold forwarded to num_detections
 * num    - optional out parameter receiving the number of detections
 *
 * Returns the newly allocated array.
 */
detection *make_network_boxes(network *net, float thresh, int *num)
{
    layer last = net->layers[net->n - 1];
    int total = num_detections(net, thresh);
    int idx = 0;

    if (num) {
        *num = total;
    }

    detection *dets = calloc(total, sizeof(detection));
    while (idx < total) {
        dets[idx].prob = calloc(last.classes, sizeof(float));
        if (last.coords > 4) {
            dets[idx].mask = calloc(last.coords-4, sizeof(float));
        }
        ++idx;
    }
    return dets;
}
->//src/network.c num_detections函数
/*
 * Count the detections the network will emit, summed over all layers.
 *
 * YOLO layers contribute only the boxes that pass `thresh` (via
 * yolo_num_detections); DETECTION and REGION layers contribute every
 * w*h*n box unconditionally.
 */
int num_detections(network *net, float thresh)
{
    int total = 0;
    int idx;

    for (idx = 0; idx < net->n; ++idx) {
        layer cur = net->layers[idx];
        switch (cur.type) {
            case YOLO:
                total += yolo_num_detections(cur, thresh);
                break;
            case DETECTION:
            case REGION:
                total += cur.w*cur.h*cur.n;
                break;
            default:
                break;
        }
    }
    return total;
}
-->//src/yolo_layer.c yolo_num_detections函数
/*
 * Count the boxes of one yolo layer whose objectness exceeds `thresh`.
 *
 * Layout as described in the original notes, using the 13x13 COCO head as
 * an example: 255*13*13 = 3 anchors * (4 coords + 1 objectness + 80 class
 * scores) * 13*13; entry 4 of each anchor is the objectness score.
 */
int yolo_num_detections(layer l, float thresh)
{
    const int cells = l.w*l.h;   /* feature-map positions */
    int kept = 0;
    int cell, anchor;

    for (cell = 0; cell < cells; ++cell) {
        for (anchor = 0; anchor < l.n; ++anchor) {
            int obj_index = entry_index(l, 0, anchor*l.w*l.h + cell, 4);
            kept += (l.output[obj_index] > thresh);
        }
    }
    return kept;
}
//src/network.c fill_network_boxes函数
/*
 * Fill `dets` with the decoded boxes of every detection-producing layer.
 *
 * `dets` is advanced after each layer so that consecutive layers write to
 * consecutive slices: by the returned count for YOLO layers, by w*h*n for
 * REGION and DETECTION layers. `hier` is only passed to the REGION decoder.
 *
 * w, h are the original image size; net->w, net->h the network input size.
 */
void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)
{
    int idx;

    for (idx = 0; idx < net->n; ++idx) {
        layer cur = net->layers[idx];
        switch (cur.type) {
            case YOLO:
                /* per the original notes: the yolov3 head */
                dets += get_yolo_detections(cur, w, h, net->w, net->h, thresh, map, relative, dets);
                break;
            case REGION:
                /* per the original notes: the yolov2 head */
                get_region_detections(cur, w, h, net->w, net->h, thresh, map, hier, relative, dets);
                dets += cur.w*cur.h*cur.n;
                break;
            case DETECTION:
                /* per the original notes: the yolov1 head */
                get_detection_detections(cur, w, h, thresh, dets);
                dets += cur.w*cur.h*cur.n;
                break;
            default:
                break;
        }
    }
}
->//src/yolo_layer.c 函数get_yolo_detections
/*
 * Decode one yolo layer's output into `dets` and return how many were written.
 *
 * l            - the yolo layer; l.output holds its output for this forward pass
 * w, h         - original image size
 * netw, neth   - network input size
 * thresh       - objectness threshold; also applied to objectness*class score
 * map          - not used here
 * relative     - forwarded to correct_yolo_boxes
 * dets         - destination; must have room for every box that passes `thresh`
 *
 * Per the original notes, forward_yolo_layer copies the previous layer's
 * output (e.g. 255*13*13) into l.output and applies the logistic function to
 * the x, y entries and to the objectness and class entries of each anchor.
 */
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
{
    int i,j,n;
    float *predictions = l.output;

    /* Per the original notes: with batch == 2 the second output is treated as
     * a horizontally flipped copy and averaged into the first. */
    if (l.batch == 2) avg_flipped_yolo(l);

    int count = 0;
    for (i = 0; i < l.w*l.h; ++i){
        int row = i / l.w;
        int col = i % l.w;
        for(n = 0; n < l.n; ++n){
            int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4);
            float objectness = predictions[obj_index];

            /* Must be the exact negation of the `> thresh` test used by
             * yolo_num_detections when sizing `dets`. The previous
             * `<= thresh` form let a NaN score through here although it was
             * never counted, writing past the end of the array. */
            if(!(objectness > thresh)) continue;

            int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
            dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h);
            dets[count].objectness = objectness;
            dets[count].classes = l.classes;
            for(j = 0; j < l.classes; ++j){
                int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j);
                /* The stored score is objectness * class score; anything not
                 * above thresh is zeroed. */
                float prob = objectness*predictions[class_index];
                dets[count].prob[j] = (prob > thresh) ? prob : 0;
            }
            ++count;
        }
    }

    /* Map the boxes from network-input coordinates back to the original image. */
    correct_yolo_boxes(dets, count, w, h, netw, neth, relative);
    return count;
}
-->/src/yolo_layer.c 函数get_yolo_box
/*
 * Decode one predicted box.
 *
 * x       - prediction data
 * biases  - anchor sizes, stored as (w, h) pairs
 * n       - anchor index into `biases` (callers pass the layer's mask entry so
 *           each yolo layer picks its own anchors)
 * index   - offset in `x` of this box's first value
 * i, j    - column and row of the cell on the feature map
 * lw, lh  - feature map width and height
 * w, h    - network input width and height
 * stride  - distance in `x` between consecutive values of the same box
 *           (callers pass lw*lh)
 *
 * The four values read are tx, ty, tw, th. The centre is the cell position
 * plus the predicted offset, divided by the feature-map size; the size is
 * exp(t) times the anchor size, divided by the network input size. All four
 * results are therefore fractions of the network input.
 */
box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
{
    const float tx = x[index + 0*stride];
    const float ty = x[index + 1*stride];
    const float tw = x[index + 2*stride];
    const float th = x[index + 3*stride];
    box out;

    out.x = (i + tx) / lw;
    out.y = (j + ty) / lh;
    out.w = exp(tw) * biases[2*n] / w;
    out.h = exp(th) * biases[2*n+1] / h;
    return out;
}
--->/src/yolo_layer.c correct_yolo_boxes函数
/*
 * Convert boxes from network-input-relative coordinates to
 * original-image-relative coordinates (undoing the letterbox).
 *
 * dets        - detections to update in place
 * n           - number of detections
 * w, h        - original image size
 * netw, neth  - network input size
 * relative    - when zero, the results are additionally scaled to pixels
 *
 * The image occupies a new_w x new_h region centred in the network input.
 * For each axis the padding fraction (net - new)/2/net is subtracted from the
 * centre and the result is divided by the occupied fraction new/net; widths
 * and heights are scaled by net/new.
 */
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
    int new_w = 0;
    int new_h = 0;
    int idx = 0;

    /* Pick the axis that limits the aspect-preserving resize. */
    int width_limited = ((float)netw/w) < ((float)neth/h);
    if (width_limited) {
        new_w = netw;
        new_h = (h * netw)/w;
    } else {
        new_h = neth;
        new_w = (w * neth)/h;
    }

    while (idx < n) {
        box cur = dets[idx].bbox;

        cur.x = (cur.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
        cur.y = (cur.y - (neth - new_h)/2./neth) / ((float)new_h/neth);
        cur.w *= (float)netw/new_w;
        cur.h *= (float)neth/new_h;

        if (!relative) {
            /* absolute pixel coordinates requested */
            cur.x *= w;
            cur.w *= w;
            cur.y *= h;
            cur.h *= h;
        }

        dets[idx].bbox = cur;
        ++idx;
    }
}
//src/box.c do_nms_sort函数
/*
 * Per-class non-maximum suppression, in place.
 *
 * dets    - detections; reordered, and suppressed class scores are set to 0
 * total   - number of detections
 * classes - number of classes per detection
 * thresh  - IoU above which a lower-ranked box is suppressed
 *
 * Step 1 moves detections with objectness == 0 to the tail and excludes them.
 * Step 2 handles each class in turn: sort with nms_comparator (per the
 * listing quoted in the original notes, descending by prob[sort_class]),
 * then let every box with a non-zero score zero the score of each
 * later box that overlaps it by more than `thresh`.
 */
void do_nms_sort(detection *dets, int total, int classes, float thresh)
{
    int last = total - 1;
    int pos = 0;
    int cls;

    /* Partition: swap zero-objectness entries to the end. */
    while (pos <= last) {
        if (dets[pos].objectness == 0) {
            detection tmp = dets[pos];
            dets[pos] = dets[last];
            dets[last] = tmp;
            --last;
        } else {
            ++pos;
        }
    }
    total = last + 1;

    for (cls = 0; cls < classes; ++cls) {
        int hi, lo;

        /* Tell the comparator which class score to sort by. */
        for (hi = 0; hi < total; ++hi) {
            dets[hi].sort_class = cls;
        }
        qsort(dets, total, sizeof(detection), nms_comparator);

        for (hi = 0; hi < total; ++hi) {
            if (dets[hi].prob[cls] != 0) {
                box keep = dets[hi].bbox;
                for (lo = hi + 1; lo < total; ++lo) {
                    box other = dets[lo].bbox;
                    if (box_iou(keep, other) > thresh) {
                        dets[lo].prob[cls] = 0;
                    }
                }
            }
        }
    }
}
// src//image.c
/* Append `src` to the NUL-terminated string in `dst` (capacity `cap` bytes),
 * truncating instead of writing past the end of the buffer. */
static void append_label(char *dst, size_t cap, const char *src)
{
    size_t used = strlen(dst);
    if (used + 1 < cap) {
        strncat(dst, src, cap - used - 1);
    }
}

/*
 * Draw every detection that has at least one class score above `thresh`.
 *
 * im       - image to draw on (modified in place)
 * dets     - detections; bbox values are fractions of the image size
 * num      - number of detections
 * thresh   - minimum class score for a class to be reported
 * names    - class names, indexed by class
 * alphabet - glyph images for label rendering; labels are skipped when NULL
 * classes  - number of classes
 *
 * A detection may exceed `thresh` for several classes: all of their names are
 * joined into one label and each is printed to stdout, but the box colour is
 * chosen from the first such class only.
 */
void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes)
{
    int i,j;

    for(i = 0; i < num; ++i){
        char labelstr[4096] = {0};
        int class_id = -1;
        for(j = 0; j < classes; ++j){
            if (dets[i].prob[j] > thresh){
                /* Bounded appends: with many classes above thresh the joined
                 * names could otherwise overflow labelstr. */
                if (class_id < 0) {
                    append_label(labelstr, sizeof(labelstr), names[j]);
                    class_id = j;
                } else {
                    append_label(labelstr, sizeof(labelstr), ", ");
                    append_label(labelstr, sizeof(labelstr), names[j]);
                }
                printf("%s: %.0f%%\n", names[j], dets[i].prob[j]*100);
            }
        }
        if(class_id >= 0){
            int width = im.h * .006;   /* line width scales with image height */

            /* Per-class colour derived from the first matching class. */
            int offset = class_id*123457 % classes;
            float red = get_color(2,offset,classes);
            float green = get_color(1,offset,classes);
            float blue = get_color(0,offset,classes);
            float rgb[3];

            rgb[0] = red;
            rgb[1] = green;
            rgb[2] = blue;
            box b = dets[i].bbox;

            /* bbox holds centre and size as fractions of the image; convert
             * to pixel edges here. */
            int left  = (b.x-b.w/2.)*im.w;
            int right = (b.x+b.w/2.)*im.w;
            int top   = (b.y-b.h/2.)*im.h;
            int bot   = (b.y+b.h/2.)*im.h;

            /* Clamp the box to the image. */
            if(left < 0) left = 0;
            if(right > im.w-1) right = im.w-1;
            if(top < 0) top = 0;
            if(bot > im.h-1) bot = im.h-1;

            draw_box_width(im, left, top, right, bot, width, red, green, blue);
            if (alphabet) {
                image label = get_label(alphabet, labelstr, (im.h*.03));
                draw_label(im, top + width, left, label, rgb);
                free_image(label);
            }
            if (dets[i].mask){
                /* The mask is read as a 14x14 single-channel image, resized to
                 * the box, thresholded at .5 and embedded at the box corner. */
                image mask = float_to_image(14, 14, 1, dets[i].mask);
                image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
                image tmask = threshold_image(resized_mask, .5);
                embed_image(tmask, im, left, top);
                free_image(mask);
                free_image(resized_mask);
                free_image(tmask);
            }
        }
    }
}
// yolov3 anchor值计算
/*
yolov3和yolov2 anchor的求法相同,在COCO和VOC数据上随意选择了9个聚类簇和三种尺度,然后把聚类簇均匀分布在各个尺度上;
但是yolov3和yolov2的anchor大小差异明显引用作者的原话:
* In YOLOv2 I made some design choice errors, I made the anchor box size be
relative to the feature size in the last layer. Since the network was down-
sampling by 32. This means it was relative to 32 pixels so an anchor of 9x9 was actually 288px x 288px.
* In YOLOv3 anchor sizes are actual pixel values. this simplifies a lot of
stuff and was only a little bit harder to implement;
yolov2用最后一层feature map的size来定义anchor的size,yolov3是相对于network输入size来定义anchor size;
* yolov3 cfg中的anchor size是相对于416*416求得的,首先聚类出9组anchor此时求出的数值
w h是相对于图像size的比例,然后乘上416,需要注意的是yolov3训练过程中cfg random=1表示用到
yolov2中的Multi-Scale Training思想,就是用一种输入size算出的anchor去训练多尺度图像
[320,608]都是32的倍数;这样在做推理的时候输入size也是可以变化的;
*/