/**
 * Direct 2-D convolution (correlation form, no kernel flip) of an int8 image
 * with a k*k filter.
 *
 * @param in      Input image, row-major. Its row pitch is taken to be
 *                ncols*stride + k - 1 elements.
 * @param out     Output image, row-major, nrows*ncols elements. The result of
 *                each window is ADDED to the value already stored there, so
 *                the caller must clear the buffer beforehand to obtain a plain
 *                convolution.
 * @param filter  Filter coefficients, k*k elements, row-major.
 * @param nrows   Number of rows of the output image.
 * @param ncols   Number of columns of the output image.
 * @param k       Filter size (the filter is k*k).
 * @param stride  Step of the filter window, in input pixels.
 *
 * Sums are narrowed to int8_t when stored; no saturation is applied.
 */
void _2DConv(int8_t *in,int8_t *out,int8_t *filter,uint32_t nrows,uint32_t ncols,uint8_t k,uint8_t stride)
{
    // Row pitch (number of columns) of the input image.
    const uint32_t ncols_i = ncols * stride + k - 1;

    for (uint32_t r = 0; r < nrows; r++)
    {
        // Strictly '<': the output has exactly ncols columns. Iterating up to
        // and including ncols would read past the end of the input row and
        // corrupt the first pixel of the next output row.
        for (uint32_t c = 0; c < ncols; c++)
        {
            // Accumulate the whole window locally so that the output pixel is
            // read and written once instead of k*k times.
            int32_t acc = 0;
            for (uint32_t i = 0; i < k; i++)
            {
                for (uint32_t j = 0; j < k; j++)
                {
                    const uint32_t offset = (r * stride + i) * ncols_i + (c * stride + j);
                    const int32_t in_tmp = in[offset];
                    const int32_t filter_tmp = filter[i * k + j];
                    acc += in_tmp * filter_tmp;
                }
            }
            const uint32_t out_idx = r * ncols + c;
            out[out_idx] = static_cast<int8_t>(out[out_idx] + acc);
        }
    }
}
void HW_2DConv_Mmap_1(int8_t *pixel_in,int8_t *pixel_out,int32_t addr_reserved)
{
    /*
     * Top-level function of the convolution accelerator.
     *
     * AXI4 data buses (m_axi):
     *   pixel_in  - the accelerator reads the input image and the filter
     *               coefficients from DDR through this port.
     *   pixel_out - the accelerator writes the output image to DDR through
     *               this port.
     * AXI4-Lite control bus (s_axilite):
     *   carries the base addresses of the input/output buffers, the
     *   addr_reserved register and the block-level control (port=return).
     */
#pragma HLS INTERFACE m_axi depth =482*272+3*3 port=pixel_in offset=slave bundle=user_axi_in register
#pragma HLS INTERFACE m_axi depth =482*270 port=pixel_out offset=slave bundle=user_axi_out register
#pragma HLS INTERFACE s_axilite port=pixel_in bundle=user_axi4lite_config register
#pragma HLS INTERFACE s_axilite port=pixel_out bundle=user_axi4lite_config register
#pragma HLS INTERFACE s_axilite port=addr_reserved offset=0xFFF0 bundle=user_axi4lite_config register
#pragma HLS INTERFACE s_axilite port=return bundle=user_axi4lite_config register
    // The filter coefficients are located right behind the ROWS_I*COLS_I
    // input pixels inside the pixel_in buffer.
    int8_t *kernel_start = pixel_in + ROWS_I*COLS_I;

    _2DConv(pixel_in,
            pixel_out,
            kernel_start,
            ROWS_O,
            COLS_O,
            FILTER_SIZE,
            STRIDE);
}
实现结果的时间和资源
里面的两层迭代需要展开
外面的两层迭代需要流水
/**
 * Pipelined variant of the direct 2-D convolution (correlation form, no
 * kernel flip) of an int8 image with a k*k filter.
 *
 * @param in      Input image, row-major. Its row pitch is taken to be
 *                ncols*stride + k - 1 elements.
 * @param out     Output image, row-major, nrows*ncols elements. The result of
 *                each window is ADDED to the value already stored there, so
 *                the caller must clear the buffer beforehand to obtain a plain
 *                convolution.
 * @param filter  Filter coefficients, k*k elements, row-major.
 * @param nrows   Number of rows of the output image.
 * @param ncols   Number of columns of the output image.
 * @param k       Filter size (the filter is k*k).
 * @param stride  Step of the filter window, in input pixels.
 *
 * Sums are narrowed to int8_t when stored; no saturation is applied.
 */
void _2DConv(int8_t *in,int8_t *out,int8_t *filter,uint32_t nrows,uint32_t ncols,uint8_t k,uint8_t stride)
{
    // Row pitch (number of columns) of the input image.
    const uint32_t ncols_i = ncols * stride + k - 1;

    for (uint32_t r = 0; r < nrows; r++)
    {
        // Strictly '<': the output has exactly ncols columns. Iterating up to
        // and including ncols would read past the end of the input row and
        // corrupt the first pixel of the next output row.
        for (uint32_t c = 0; c < ncols; c++)
        {
// Request a pipelined column loop with an initiation interval of 1.
#pragma HLS PIPELINE II=1
            // Accumulate the whole window locally so that the output pixel is
            // read and written once instead of k*k times.
            int32_t acc = 0;
            for (uint32_t i = 0; i < k; i++)
            {
                for (uint32_t j = 0; j < k; j++)
                {
                    const uint32_t offset = (r * stride + i) * ncols_i + (c * stride + j);
                    const int32_t in_tmp = in[offset];
                    const int32_t filter_tmp = filter[i * k + j];
                    acc += in_tmp * filter_tmp;
                }
            }
            const uint32_t out_idx = r * ncols + c;
            out[out_idx] = static_cast<int8_t>(out[out_idx] + acc);
        }
    }
}
实现结果的时间和资源
void _2DConv——Op(int8_t *in,int8_t *out,int8_t *filter,uint32_t nrows,uint32_t ncols,uint8_t k,uint8_t stride)
{
uint16_t r,c,i,j;
uint32_t nrows_i=nrows*stride+k-1;//输入图像的行数
uint32_t ncols_i=ncols*stride+k-1;//输入图像的列数
int8_t *filter_based=in+ROWS_I*COLS_I;
int8_t filter_buffer[FILTER_SIZE][FILTER_SIZE]
//*pragma HLS ARRAY_PARTITION variable=filter_buffer complete dim=0
memcpy (&filter_buffer[0][0],filter_base,FILTER_SIZE*FILTER_SIZE*sizeof(int8_t));//提前将卷积核放到片上
for(r=0;r<nrows;r++)
{
for(c=0;c<=ncols;c++)
{
#pragma HLS PIPELINE II=1
int8_t store_data=0;
for(i=0;i<k;i++)
{
for(j=0;j<k;j++)
{
uint32_t offset=(r*stride+i)*ncols_i+(c*stride+j);
int8_t val=*(in+offset);
int8_t res=0;
if(filter_buffer[i][j]==0)//针对二值卷积核进行优化
res=0;
else if(filter_buffer[i][j]==1)
res=val;
else if(filter_buffer[i][j]==-1)
res=-val;
store_data+=res
*(out+r*ncols+c)+=store_data;
}
}
}
}
}
卷积核缓存到片上ram
实现结果的时间和资源
2D卷积操作必须对k*k大小的窗口内的所有像素点进行计算
直接从DDR中取像素进行计算的话,访存延迟大,访存周期不确定,无法实现高效的流水处理
行缓冲(line buffer)
窗口缓冲(window buffer)
/**
 * Line buffer holding LROW rows of LCOL elements each.
 *
 * Row 0 holds the most recently inserted value of a column and row LROW-1 the
 * oldest one: shift_up() moves every value of a column one row towards the
 * higher index, insert_bottom() then stores the new value in row 0.
 */
template<typename T,int LROW,int LCOL>
class ap_linebuffer{
public:
    T M[LROW][LCOL];

    ap_linebuffer(){
// Request complete partitioning of M along its first (row) dimension.
#pragma HLS ARRAY_PARTITION variable=M dim=1 complete
    }

    /** Moves the values of column `col` one row up (row i-1 -> row i); the value in row LROW-1 is dropped. */
    void shift_up(int col){
        for(int row=LROW-1;row>0;row--){
            M[row][col]=M[row-1][col];
        }
    }

    /** Stores `value` in row 0 of column `col`. */
    void insert_bottom(T value,int col){
        M[0][col]=value;
    }

    /** Returns the element at (row, col). */
    T getval(int row,int col) const{
        return M[row][col];
    }
};
/**
 * Sliding window of LROW x LCOL elements.
 *
 * New values enter at column LCOL-1: shift_right() moves the content one
 * column towards the lower index, after which insert() can fill the freed
 * last column.
 */
template<typename T,int LROW,int LCOL>
class ap_window{
public:
    T M[LROW][LCOL];

    ap_window(){
// Request complete partitioning of M along its first (row) dimension.
#pragma HLS ARRAY_PARTITION variable=M dim=1 complete
    }

    /** Moves every row one column towards index 0 (col j+1 -> col j); column 0 is dropped. */
    void shift_right(){
        for(int row=0;row<LROW;row++){
            for(int col=0;col<LCOL-1;col++){
                M[row][col]=M[row][col+1];
            }
        }
    }

    /** Stores `value` at (row, col). */
    void insert(T value,int row,int col){
        M[row][col]=value;
    }

    /** Returns the element at (row, col). */
    T getval(int row,int col) const{
        return M[row][col];
    }
};
完整代码
void _2DConv_Op(int8_t *in,int8_t *out,int8_t *filter,uint32_t nrows,uint32_t ncols,uint8_t k,uint8_t stride)
{
uint16_t r,c,i,j;
uint32_t nrows_i=nrows*stride+k-1;//输入图像的行数
uint32_t ncols_i=ncols*stride+k-1;//输入图像的列数
int8_t *filter_based=in+ROWS_I*COLS_I;
int8_t filter_buffer[FILTER_SIZE][FILTER_SIZE]
//*pragma HLS ARRAY_PARTITION variable=filter_buffer complete dim=0
memcpy (&filter_buffer[0][0],filter_base,FILTER_SIZE*FILTER_SIZE*sizeof(int8_t));//提前将FILTER_SIZE*FILTER_SIZE个像素放到片上
ap_linebuffer<int8_t,FILTER_SIZE,COLS_I> line_buffer;
ROW_LOOP:for(r=0;r<ROWS_I;r++)
{
ap_window<int8_t,FILTER_SIZE,FILTER_SIZE> window_buffer;
//fill the line buffer
COL_LOOP:for(c=0;c<=COLS_I;c++)
{
#pragma HLS PIPELINE II=1
int8_t store_data=0;
int8_t load_data=*(in_base+r*COLS_I+c);
line_buffer.shift_up(c);
line_buffer.insert_bottom(load_data,c);//行缓冲,保证数据从DDR里面流水的取出
if(r>=2)
{
window_buffer.shift_right();
window_buffer,insert(line_buffer.getval(2,c),0,2);
window_buffer,insert(line_buffer.getval(1,c),1,2);
window_buffer,insert(line_buffer.getval(0,c),2,2);
}//从行缓冲中获取数据,构建卷积计算的窗口缓冲
if(r>=2&&c>=2)
{
for(i=0;i<FILTER_SIZE;i++)
{
for(j=0;j<FILTER_SIZE;j++)
{
int8_t res=0;
int8_t val=window_buffer.getval(i,j);//从窗口中获取像素值,进行卷积计算
if(filter_buffer[i][j]==0)//针对二值卷积核进行优化
res=0;
else if(filter_buffer[i][j]==1)
res=val;
else if(filter_buffer[i][j]==-1)
res=-val;
store_data+=res
}
}
*(out_base+(r-2)*COLS_O+c(c-2))+=store_data;
}
}
}
}
实现结果的时间和资源