源码是官方的2014.4 TRD工程里的,整个工程是基于zc702板子的,但手里只有块小zybo >_< 里面的硬件设计很有参考价值,最近想用FPGA加速surf算法,先在这分析下TRD工程里sobel edge detection的例程。
wiki
与xapp1167中直接调用HLS视频库(hls::命名空间)函数的做法不同,这里的sobel边缘提取算法是重新实现的,更便于了解HLS算法实现的特点。
/**
 * Top-level HLS function of the Sobel edge-detection core.
 *
 * Converts the incoming AXI4-Stream video into an image stream (img_0),
 * runs sobel_filter_core() on it to produce img_1, and converts img_1 back
 * to an AXI4-Stream. The three stages run under "#pragma HLS dataflow".
 *
 * @param video_in       input video stream (axis interface, bundle INPUT_STREAM)
 * @param video_out      output video stream (axis interface, bundle OUTPUT_STREAM)
 * @param rows           image height; s_axilite register at offset 0x14
 * @param cols           image width; s_axilite register at offset 0x1C
 * @param C_XR0C0..C_XR2C2  nine X-direction kernel coefficients, forwarded
 *                       unchanged to sobel_filter_core()
 * @param C_YR0C0..C_YR2C2  nine Y-direction kernel coefficients, forwarded
 *                       unchanged to sobel_filter_core()
 * @param c_high_thresh  high threshold, forwarded to sobel_filter_core()
 * @param c_low_thresh   low threshold, forwarded to sobel_filter_core()
 * @param c_invert       invert flag, forwarded to sobel_filter_core()
 */
void image_filter(AXI_STREAM& video_in, AXI_STREAM& video_out, int rows, int cols,
int C_XR0C0, int C_XR0C1, int C_XR0C2, int C_XR1C0, int C_XR1C1, int C_XR1C2, int C_XR2C0, int C_XR2C1, int C_XR2C2,
int C_YR0C0, int C_YR0C1, int C_YR0C2, int C_YR1C0, int C_YR1C1, int C_YR1C2, int C_YR2C0, int C_YR2C1, int C_YR2C2,
int c_high_thresh, int c_low_thresh, int c_invert)
{
//Create AXI streaming interfaces for the core
//The video input and output are AXI4-Stream ports used to stream the image data
#pragma HLS INTERFACE axis port=video_in bundle=INPUT_STREAM
#pragma HLS INTERFACE axis port=video_out bundle=OUTPUT_STREAM
//rows and cols are registers on the AXI4-Lite control bus so the processed image size
//can be changed at run time (original author's note: maximum image size is 1920x1080)
#pragma HLS INTERFACE s_axilite port=rows bundle=CONTROL_BUS offset=0x14
#pragma HLS INTERFACE s_axilite port=cols bundle=CONTROL_BUS offset=0x1C
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
//#pragma HLS INTERFACE ap_stable port=rows
//#pragma HLS INTERFACE ap_stable port=cols
//The X and Y filter kernels are also control-bus registers, so the processor side can
//replace the Sobel kernel at run time (for example with a Prewitt kernel)
#pragma HLS INTERFACE s_axilite port= C_XR0C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR0C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR0C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C2 bundle=CONTROL_BUS
//High/low thresholds and the invert flag are control-bus registers as well
#pragma HLS INTERFACE s_axilite port= c_high_thresh bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= c_low_thresh bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= c_invert bundle=CONTROL_BUS
//Intermediate images between the three stages, both sized rows x cols
YUV_IMAGE img_0(rows, cols);
YUV_IMAGE img_1(rows, cols);
#pragma HLS dataflow
//Convert the AXI4-Stream input into an hls::Mat (an hls::Mat is itself a data stream)
hls::AXIvideo2Mat(video_in, img_0);
//sobel edge detection implement
sobel_filter_core(img_0, img_1, rows, cols,
C_XR0C0, C_XR0C1, C_XR0C2, C_XR1C0, C_XR1C1, C_XR1C2, C_XR2C0, C_XR2C1, C_XR2C2,
C_YR0C0, C_YR0C1, C_YR0C2, C_YR1C0, C_YR1C1, C_YR1C2, C_YR2C0, C_YR2C1, C_YR2C2,
c_high_thresh, c_low_thresh, c_invert);
//Convert the filtered hls::Mat back into the AXI4-Stream output
hls::Mat2AXIvideo(img_1, video_out);
}
top function 是一个标准的hls 图像处理结构,具体内容请参看xapp1167文档
/**
 * Streaming 3x3 filter stage: reads pixels from src, builds a 3x3 window of
 * their first channel, and writes one edge pixel to dst per input pixel.
 *
 * The loops run (rows + 1) x (cols + 1) iterations. A pixel is read from src
 * only while row < rows and col < cols, and a pixel is written to dst only
 * while row > 0 and col > 0, so the output lags the input by one row and one
 * column and exactly rows x cols pixels are consumed and produced.
 *
 * Positions with row <= 1, col <= 1, row > rows - 1 or col > cols - 1 are
 * emitted as the fixed border pixel {0, 128}; every other position is the
 * result of sobel_operator() on the current window.
 *
 * @param src            input image stream
 * @param dst            output image stream
 * @param rows           image height
 * @param cols           image width
 * @param C_XR0C0..C_XR2C2  X-direction kernel coefficients (row r, column c)
 * @param C_YR0C0..C_YR2C2  Y-direction kernel coefficients (row r, column c)
 * @param c_high_thresh  high threshold passed to sobel_operator()
 * @param c_low_thresh   low threshold passed to sobel_operator()
 * @param c_invert       invert flag passed to sobel_operator()
 */
void sobel_filter_core(YUV_IMAGE& src, YUV_IMAGE& dst, int rows, int cols,
                       int C_XR0C0, int C_XR0C1, int C_XR0C2, int C_XR1C0, int C_XR1C1, int C_XR1C2, int C_XR2C0, int C_XR2C1, int C_XR2C2,
                       int C_YR0C0, int C_YR0C1, int C_YR0C2, int C_YR1C0, int C_YR1C1, int C_YR1C2, int C_YR2C0, int C_YR2C1, int C_YR2C2,
                       int c_high_thresh, int c_low_thresh, int c_invert)
{
    // HLS-specific memory structures. Their definitions, as quoted by the
    // original author:
    //   typedef hls::Window<3, 3, unsigned char> Y_WINDOW;
    //   typedef hls::LineBuffer<3, MAX_WIDTH, unsigned char> Y_BUFFER;
    Y_BUFFER buff_A;  // three-line buffer of first-channel values
    Y_WINDOW buff_C;  // 3x3 processing window fed from buff_A

    for (int row = 0; row < rows + 1; row++) {
        for (int col = 0; col < cols + 1; col++) {
// loop_flatten: allows nested loops to be collapsed into a single loop with
// improved latency; it is explicitly switched off here.
#pragma HLS loop_flatten off
// dependence: provides additional information that can overcome loop-carry
// dependencies and allow loops to be pipelined (or pipelined with lower
// intervals); buff_A is declared free of such dependencies.
#pragma HLS dependence variable=&buff_A false
// PIPELINE: reduces the initiation interval by allowing concurrent execution
// of operations within the loop; the requested initiation interval is 1.
#pragma HLS PIPELINE II = 1

            // Temp values are used to reduce the number of memory reads
            unsigned char temp;
            YUV_PIXEL tempx;

            // Line buffer fill: shift column `col` and remember its row-0 value.
            // NOTE(review): the original author observed that after the shift
            // rows 0 and 1 of this column hold the same value - confirm against
            // the hls::LineBuffer implementation in use.
            if (col < cols) {
                buff_A.shift_down(col);
                temp = buff_A.getval(0, col);
            }

            // There is an offset to accommodate the active pixel region
            // There are only MAX_WIDTH and MAX_HEIGHT valid pixels in the image
            if (col < cols && row < rows) {
                YUV_PIXEL new_pix;
                src >> new_pix;
                tempx = new_pix;
                // Store the new pixel's first channel in the line buffer.
                buff_A.insert_bottom(tempx.val[0], col);
            }

            // Shift the processing window to make room for the new column
            buff_C.shift_right();

            // The Sobel processing window only needs to store luminance values.
            // Copy the current column into window column 0. Window row 1 comes
            // from `temp` rather than a second line-buffer read.
            // NOTE(review): the original author was unsure why `temp` is used
            // here and guessed it helps parallelisation during synthesis.
            if (col < cols) {
                buff_C.insert(buff_A.getval(2, col), 2, 0);
                buff_C.insert(temp, 1, 0);
                buff_C.insert(tempx.val[0], 0, 0);
            }

            YUV_PIXEL edge;

            // The sobel operator only works on the inner part of the image
            // This design assumes there are no edges on the boundary of the image
            if (row <= 1 || col <= 1 || row > (rows - 1) || col > (cols - 1)) {
                edge.val[0] = 0;
                edge.val[1] = 128;
            } else {
                // Sobel operation on the inner portion of the image.
                // The last Y coefficient must be C_YR2C2; the original passed
                // C_XR2C2 here, so the Y kernel's bottom-right tap was wrong.
                edge = sobel_operator(&buff_C,
                                      C_XR0C0, C_XR0C1, C_XR0C2, C_XR1C0, C_XR1C1, C_XR1C2, C_XR2C0, C_XR2C1, C_XR2C2,
                                      C_YR0C0, C_YR0C1, C_YR0C2, C_YR1C0, C_YR1C1, C_YR1C2, C_YR2C0, C_YR2C1, C_YR2C2,
                                      c_high_thresh, c_low_thresh, c_invert);
            }

            // The output image is offset from the input to account for the line buffer
            if (row > 0 && col > 0) {
                dst << edge;
            }
        }
    }
}
参照ug902文档
The main features of the LineBuffer class are:
• Support for all data types through parameterization
• User-defined number of rows and columns
• Automatic banking of rows into separate memory banks for increased memory
bandwidth
• Provides all the methods for using and debugging line buffers in an algorithmic design
The main features of the Window class are:
• Support for all data types through parametrization
• User-defined number of rows and columns
• Automatic partitioning into individual registers for maximum bandwidth
• Provides all the methods to use and debug memory windows in the context of an
algorithm
由于采用的是stream流的形式处理图像数据,所以并不能访问图像上任意一点的值。为了方便滤波模板的操作,HLS提供了LineBuffer和Window这两种数据结构。这里的sobel算子要求有一个3*3大小的窗与模板相乘,要产生这样的窗则需要一个三行数据的行存:在两层的row、col循环中,每次都对行存中col列的数据做shift_down操作,然后将新得到的数据插入到底部。注意这里和ug902的文档有出入,查看源码后发现ug902文档上的描述是错的,源码中定义的top和bottom方向如下:
/* Member functions of LineBuffer class */
/* +---+---+-... ...-+---+---+
* R-1 | | | | | |
* +---+---+-... ...-+---+---+
* R-2 | | | | | |
* +---+---+-... ...-+---+---+
* ... ... ... ...
* +---+---+-... ...-+---+---+
* 1 | | | | | |
* +---+---+-... ...-+---+---+
* 0 | | | | | |
* +---+---+-... ...-+---+---+
* 0 1 ... ... C-2 C-1 (origin is at bottom-left point)
*/
可能是官方的文档(v2014.1)没有更新
x、y方向模板与窗相乘 没啥可说的了
/**
 * Applies the X and Y 3x3 kernels to a window and returns the edge pixel.
 *
 * Each kernel is multiplied element-wise with the window and summed; the
 * edge weight is |x_sum| + |y_sum|. The edge value is 255 - weight when the
 * weight is below 255 and 0 otherwise. It is then forced to 255 if above
 * high_thesh, or to 0 if below low_thresh, and finally replaced by
 * 255 - value when invert == 1.
 *
 * @param window         3x3 window of values the kernels are applied to
 * @param XR0C0..XR2C2   X-direction kernel coefficients (row r, column c);
 *                       each is narrowed to char before use
 * @param YR0C0..YR2C2   Y-direction kernel coefficients (row r, column c);
 *                       each is narrowed to char before use
 * @param high_thesh     edge values above this become 255
 * @param low_thresh     edge values below this (and not above high_thesh) become 0
 * @param invert         when equal to 1 the final edge value is inverted
 * @return pixel with val[0] = edge value and val[1] = 128
 */
YUV_PIXEL sobel_operator(Y_WINDOW *window,
                         int XR0C0, int XR0C1, int XR0C2, int XR1C0, int XR1C1, int XR1C2, int XR2C0, int XR2C1, int XR2C2,
                         int YR0C0, int YR0C1, int YR0C2, int YR1C0, int YR1C1, int YR1C2, int YR2C0, int YR2C1, int YR2C2,
                         int high_thesh, int low_thresh, int invert)
{
    // NOTE(review): the accumulators are 16-bit; with coefficients much larger
    // than the standard Sobel taps the sums could exceed the short range.
    short x_weight = 0;
    short y_weight = 0;
    short edge_weight;
    unsigned char edge_val;
    YUV_PIXEL pixel;
    char i;
    char j;

    // The int -> char narrowing is made explicit: implicit narrowing of a
    // non-constant inside a braced initializer is ill-formed since C++11.
    const char x_op[3][3] = {
        {static_cast<char>(XR0C0), static_cast<char>(XR0C1), static_cast<char>(XR0C2)},
        {static_cast<char>(XR1C0), static_cast<char>(XR1C1), static_cast<char>(XR1C2)},
        {static_cast<char>(XR2C0), static_cast<char>(XR2C1), static_cast<char>(XR2C2)}};
    const char y_op[3][3] = {
        {static_cast<char>(YR0C0), static_cast<char>(YR0C1), static_cast<char>(YR0C2)},
        {static_cast<char>(YR1C0), static_cast<char>(YR1C1), static_cast<char>(YR1C2)},
        {static_cast<char>(YR2C0), static_cast<char>(YR2C1), static_cast<char>(YR2C2)}};

    // Compute approximation of the gradients in the X-Y direction
    for (i = 0; i < 3; i++) {
        for (j = 0; j < 3; j++) {
            // X direction gradient
            x_weight = x_weight + (window->getval(i, j) * x_op[i][j]);
            // Y direction gradient
            y_weight = y_weight + (window->getval(i, j) * y_op[i][j]);
        }
    }

    edge_weight = ABS(x_weight) + ABS(y_weight);

    // Map the weight so that a larger weight gives a smaller value,
    // saturating at 0 once the weight reaches 255.
    if (edge_weight < 255)
        edge_val = (255 - static_cast<unsigned char>(edge_weight));
    else
        edge_val = 0;

    // Edge thresholding
    if (edge_val > high_thesh)
        edge_val = 255;
    else if (edge_val < low_thresh)
        edge_val = 0;

    // Invert
    if (invert == 1)
        edge_val = 255 - edge_val;

    pixel.val[0] = edge_val;
    pixel.val[1] = 128;
    return pixel;
}