背景:之前的代码中无用代码过多,而我们只需要神经网络前馈运算部分的代码。卷积不采用gemm的方式,而是改用更简单的直接卷积来运算。
目的:读懂简化版的yolo2_light代码,并将其中卷积改为9个并行的基本单元。
代码地址: https://github.com/AlexeyAB/yolo2_light
相关文章:
YOLOv3:Darknet代码解析(一)安装Darknet
YOLOv3:Darknet代码解析(二)代码初步
YOLOv3:Darknet代码解析(三)卷积操作
YOLOv3:Darknet代码解析(四)结构更改与训练
YOLOv3:Darknet代码解析(五)权重与特征存储
YOLOv3:Darknet代码解析(六)简化的程序与卷积拆分
目录
1.函数的功能与位置
2.检验
2.1 程序输出的检验
2.2 卷积的检验
3.卷积运算
3.1 zynqNet中卷积的运算
3.2 原始的卷积语句
3.3 改为定值卷积
3.4 3*3的9并行
~/datasets/xxr/yolo2_light/bin$ ./darknet detector test obj.data obj.cfg obj_6000.weights 17.jpg
layer filters size input output
0 conv 16 3 x 3 / 1 416 x 416 x 3 -> 416 x 416 x 16
1 max 2 x 2 / 2 416 x 416 x 16 -> 208 x 208 x 16
2 conv 32 3 x 3 / 1 208 x 208 x 16 -> 208 x 208 x 32
3 max 2 x 2 / 2 208 x 208 x 32 -> 104 x 104 x 32
4 conv 64 3 x 3 / 1 104 x 104 x 32 -> 104 x 104 x 64
5 max 2 x 2 / 2 104 x 104 x 64 -> 52 x 52 x 64
6 conv 128 3 x 3 / 1 52 x 52 x 64 -> 52 x 52 x 128
7 max 2 x 2 / 2 52 x 52 x 128 -> 26 x 26 x 128
8 conv 256 3 x 3 / 1 26 x 26 x 128 -> 26 x 26 x 256
9 max 2 x 2 / 2 26 x 26 x 256 -> 13 x 13 x 256
10 conv 512 3 x 3 / 1 13 x 13 x 256 -> 13 x 13 x 512
11 max 2 x 2 / 1 13 x 13 x 512 -> 13 x 13 x 512
12 conv 1024 3 x 3 / 1 13 x 13 x 512 -> 13 x 13 x1024
13 conv 1024 3 x 3 / 1 13 x 13 x1024 -> 13 x 13 x1024
14 conv 30 1 x 1 / 1 13 x 13 x1024 -> 13 x 13 x 30
15 detection
Loading weights from obj_6000.weights...Done!
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Skip layer: 3
Fuse Convolutional layer l->size = 3
Fuse Convolutional layer l->size = 3
Fuse Convolutional layer l->size = 1
Skip layer: 21
running yolov2_forward_netwrok_cpu
layer num: 0 CONVOLUTIONAL
FilterNum:16,Input Channels:3,Input Height:416,input width:416,FilterSize:3
layer num: 1 MAXPOOL
layer num: 2 CONVOLUTIONAL
FilterNum:32,Input Channels:16,Input Height:208,input width:208,FilterSize:3
layer num: 3 MAXPOOL
layer num: 4 CONVOLUTIONAL
FilterNum:64,Input Channels:32,Input Height:104,input width:104,FilterSize:3
layer num: 5 MAXPOOL
layer num: 6 CONVOLUTIONAL
FilterNum:128,Input Channels:64,Input Height:52,input width:52,FilterSize:3
layer num: 7 MAXPOOL
layer num: 8 CONVOLUTIONAL
FilterNum:256,Input Channels:128,Input Height:26,input width:26,FilterSize:3
layer num: 9 MAXPOOL
layer num: 10 CONVOLUTIONAL
FilterNum:512,Input Channels:256,Input Height:13,input width:13,FilterSize:3
layer num: 11 MAXPOOL
layer num: 12 CONVOLUTIONAL
FilterNum:1024,Input Channels:512,Input Height:13,input width:13,FilterSize:3
layer num: 13 CONVOLUTIONAL
FilterNum:1024,Input Channels:1024,Input Height:13,input width:13,FilterSize:3
layer num: 14 CONVOLUTIONAL
FilterNum:30,Input Channels:1024,Input Height:13,input width:13,FilterSize:1
layer num: 15 REGION
Check done!
17.jpg: Predicted in 1.542322 seconds.
classes= 1 : 80% (left_x: 124 top_y: 82 width: 45 height: 53)
classes= 1 : 70% (left_x: 187 top_y: 17 width: 28 height: 40)
classes= 1 : 83% (left_x: 195 top_y: 157 width: 50 height: 64)
Not compiled with OpenCV, saving to predictions.png instead
只有前馈之后,网络的代码简短了不少。
主函数:main.c中的main函数,运用network_predict_cpu这个函数来获得图片的输出。
前馈计算:yolov2_forward_network.c中定义了network_predict_cpu函数和yolov2_forward_network_cpu函数,后者的作用就是一层一层地进行前馈计算。
卷积运算:forward_convolutional_layer_cpu 函数也在这个文件中。
~/datasets/xxr/yolo2_light/bin$ ./darknet detector test obj.data obj.cfg obj_6000.weights 17.jpg
predictions.png
将每层卷积的结果写入check.txt文件:
// yolov2_forward_network.c
// Runs the feed-forward pass layer by layer and dumps the output of every
// convolutional layer to "check.txt" as raw floats, so that a later run can
// compare a modified convolution against this reference data.
//   net   - network whose layers are evaluated in order (net.n layers)
//   state - forward state; state.input is advanced to each layer's output
void yolov2_forward_network_cpu(network net, network_state state)
{
    // Binary mode: the file holds raw float bytes, not text. A reader of this
    // file should open it with "rb" for the same reason.
    FILE* check = fopen("check.txt", "wb");
    if (!check) {
        // Keep running the network; only the dump is skipped.
        perror("check.txt");
    }
    printf("\n running yolov2_forward_netwrok_cpu\n ");
    state.workspace = net.workspace;
    int i;
    for (i = 0; i < net.n; ++i) {
        printf("layer num: %d \t", i);
        state.index = i;
        layer l = net.layers[i];
        if (l.type == CONVOLUTIONAL) {
            printf(" CONVOLUTIONAL \n");
            forward_convolutional_layer_cpu(l, state);
            // fwrite(ptr, element size, element count, stream): with the
            // arguments in this order the return value is the number of floats
            // written and can be checked against l.outputs.
            if (check && fwrite(l.output, sizeof(float), l.outputs, check) != (size_t)l.outputs) {
                fprintf(stderr, "check.txt: short write at layer %d\n", i);
            }
        }
        else if (l.type == MAXPOOL) {
            printf(" MAXPOOL \n");
            forward_maxpool_layer_cpu(l, state);
        }
        // The output of this layer is the input of the next one.
        state.input = l.output;
    }
    if (check) {
        fclose(check);
    }
}
写入文件后,将程序更改为下面这样,对每层卷积的l.output进行检验。
void yolov2_forward_network_cpu(network net, network_state state)
{
FILE* check=fopen("check.txt","r");
printf("\n running yolov2_forward_netwrok_cpu\n ");
state.workspace = net.workspace;
int i;
for (i = 0; i < net.n; ++i) {
printf("layer num: %d \t",i);
state.index = i;
layer l = net.layers[i];
if (l.type == CONVOLUTIONAL) {
printf(" CONVOLUTIONAL \n");
forward_convolutional_layer_cpu(l, state);
float *buffer=(float*)malloc(l.outputs*sizeof(float));
fread(buffer,l.outputs,sizeof(float),check);
for(int i=0;i
//input
~/datasets/xxr/yolo2_light/bin$ ./darknet detector test voc.data head-hw-v2.cfg obj_9000.weights dog.jpg
//output
......
FilterNum:1024,Input Channels:1024,Input Height:13,input width:13,FilterSize:3
layer num: 14 CONVOLUTIONAL
FilterNum:30,Input Channels:1024,Input Height:13,input width:13,FilterSize:1
layer num: 15 REGION
Check done!
dog.jpg: Predicted in 1.412451 seconds.
若输出"Check done!",则表示验证正确。
oid ProcessingElement::macc2d(const data_t pixels[9], const data_t weights[9],
data_t& result) {
#pragma HLS inline
data_t accumulator = 0.0f;
data_t multresult[9];
#pragma HLS ARRAY_PARTITION variable = multresult complete dim = 0
L_MACC_multiply:
for (int i = 0; i < 9; i++) {
#pragma HLS UNROLL
multresult[i] = pixels[i] * weights[i];
}
L_MACC_accumulate:
for (int i = 0; i < 9; i++) {
#pragma HLS UNROLL
accumulator = accumulator + multresult[i];
}
LOG("PE: macc2D -> %.2f \n", accumulator);
result = accumulator;
}
运用ARRAY_PARTITION指令(配合UNROLL)可以让9个乘法一起并行运算。先乘再加。
yolov2_forward_network.c文件中,forward_convolutional_layer_cpu 函数
// Direct (non-GEMM) convolution: for every filter, input channel and pixel
// position, accumulate the products of the filter taps with the input window
// into l.output. Fields of the layer used below:
// l.n - number of filters on this layer
// l.c - channels of input-array
// l.h - height of input-array
// l.w - width of input-array
// l.size - width and height of filters (the same size for all filters)
// l.pad - offset subtracted from the tap position to get the input coordinate
printf("FilterNum:%d,Input Channels:%d,Input Height:%d,input width:%d,FilterSize:%d\n",
l.n,l.c,l.h,l.w,l.size);
// 1. Convolution !!!
#ifndef GEMMCONV
int fil;
// filter index
#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP
for (fil = 0; fil < l.n; ++fil) {
// Declared inside the parallel loop body, so each iteration has its own copies.
int chan, y, x, f_y, f_x;
// channel index
for (chan = 0; chan < l.c; ++chan)
// input - y
for (y = 0; y < l.h; ++y)
// input - x
for (x = 0; x < l.w; ++x)
{
// The output is indexed with the input width/height: one output value per
// (filter, y, x) position.
int const output_index = fil*l.w*l.h + y*l.w + x;
// Start of the size*size taps belonging to (filter, channel).
int const weights_pre_index = fil*l.c*l.size*l.size + chan*l.size*l.size;
// Start of the current input channel plane.
int const input_pre_index = chan*l.w*l.h;
float sum = 0;
// filter - y
for (f_y = 0; f_y < l.size; ++f_y)
{
int input_y = y + f_y - l.pad;
// filter - x
for (f_x = 0; f_x < l.size; ++f_x)
{
int input_x = x + f_x - l.pad;
// Taps that fall outside the input are skipped, i.e. they contribute zero.
if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;
int input_index = input_pre_index + input_y*l.w + input_x;
int weights_index = weights_pre_index + f_y*l.size + f_x;
sum += state.input[input_index] * l.weights[weights_index];
}
}
// Flat layout as indexed above (row-major, y before x):
// l.output[filter][y][x] +=
// state.input[channel][y][x] *
// l.weights[filter][channel][f_y][f_x];
// Accumulates across channels with "+=".
// NOTE(review): presumably l.output is zeroed before this loop - confirm.
l.output[output_index] += sum;
}
}
除第14层以外,各卷积层的卷积核大小均为3*3,第14层为1*1卷积核。因此把卷积核循环的上限改为定值:3*3的层固定为3,1*1的层单独处理。
if(state.index!=14){
for (f_y = 0; f_y < 3; ++f_y)
{
int input_y = y + f_y - l.pad;
// filter - x
for (f_x = 0; f_x < 3; ++f_x)
{
int input_x = x + f_x - l.pad;
if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;
int input_index = input_pre_index + input_y*l.w + input_x;
int weights_index = weights_pre_index + f_y*l.size + f_x;
sum += state.input[input_index] * l.weights[weights_index];
}
}
else{
// filter - y
f_y = 0;
int input_y = y + f_y - l.pad;
// filter - x
f_x = 0;
int input_x = x + f_x - l.pad;
if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) continue;
int input_index = input_pre_index + input_y*l.w + input_x;
int weights_index = weights_pre_index + f_y*l.size + f_x;
sum += state.input[input_index] * l.weights[weights_index];
}
改为9的并行,将3*3的卷积作为基本的卷积单元。
// 3x3 convolution as one basic unit: gather the 9 inputs and 9 weights of the
// window into small buffers, multiply them element-wise, then add the products.
float input_buffer[9], weight_buffer[9], product_buffer[9];
// Stage 0: gather. Every slot is written on every pass; taps that fall outside
// the input are stored as 0 so the multiply stage never reads an unset slot.
for (f_y = 0; f_y < 3; ++f_y) {
    int input_y = y + f_y - l.pad;
    // filter - x
    for (f_x = 0; f_x < 3; ++f_x)
    {
        int input_x = x + f_x - l.pad;
        int buffer_idx = 3*f_y + f_x;
        if (input_y < 0 || input_x < 0 || input_y >= l.h || input_x >= l.w) {
            input_buffer[buffer_idx] = 0.0f;
            weight_buffer[buffer_idx] = 0.0f;
            continue;
        }
        int input_index = input_pre_index + input_y*l.w + input_x;
        int weights_index = weights_pre_index + f_y*l.size + f_x;
        input_buffer[buffer_idx] = state.input[input_index];
        weight_buffer[buffer_idx] = l.weights[weights_index];
    }
}
// Stage 1: 9 independent products. The index is declared in the loop so it is
// local to this block (and therefore private to each thread if the enclosing
// filter loop is parallelised).
for (int k = 0; k < 9; ++k) {
    product_buffer[k] = input_buffer[k] * weight_buffer[k];
}
// Stage 2: accumulate in index order, the same order as the sequential version.
for (int k = 0; k < 9; ++k) {
    sum += product_buffer[k];
}
至此,我们将卷积改为了最基本的9个并行的运算,并用2.2中的方法验证通过。
相关文章:
YOLOv3:Darknet代码解析(一)安装Darknet
YOLOv3:Darknet代码解析(二)代码初步
YOLOv3:Darknet代码解析(三)卷积操作
YOLOv3:Darknet代码解析(四)结构更改与训练
YOLOv3:Darknet代码解析(五)权重与特征存储
YOLOv3:Darknet代码解析(六)简化的程序与卷积拆分