FPGA图像处理_中值滤波实现(含源码)

非线性滤波器在通常情况下没有特定的转移函数。一类比较重要的非线性滤波就是统计排序滤波器,统计排序滤波器即对窗口内的像素值进行排序并通过多路选择器选择使用排序后的值,例如中值滤波、最大/最小值滤波等。排序滤波器或者其组合,可以在很多图像处理的场合得到应用。用接近中间位置的排序值作为输出,进行图像的平滑滤波,能得到很好的噪声平滑性质,中值滤波对去除椒盐噪声十分有用,而形态学滤波中主要用到的算子就是最大/最小值滤波。

统计排序滤波的数学定义如下:设 r r r为处理窗口的半径,设 I ( x , y ) I(x,y) I(x,y)为输入像素值, g ( x , y ) g(x,y) g(x,y)为输出像素值,则有如下定义:
g ( x , y ) = S o r t ( I ( x + i , y + j ) , n ) , − r ≤ i ≤ r , − r ≤ j ≤ r , 0 ≤ n < ( 2 r + 1 ) 2 g(x,y)=Sort(I(x+i,y+j),n),-r\leq i \leq r,-r\leq j \leq r,0\leq n <(2r+1)^2 g(x,y)=Sort(I(x+i,y+j),n),rir,rjr,0n<(2r+1)2
上式中, S o r t Sort Sort算子代表对 i i i j j j的有效区域进行排序运算,同时输出排序结果的n个值。由数学定义不难看出,排序滤波器主要完成对图像当前窗口内的所有像素进行排序,同时按照指定输出排序结果。若令 n = ( 2 r + 1 ) 2 / 2 n = (2r+1)^2/2 n=(2r+1)2/2,则上式则变成中值滤波器,若排序结果按升序排列, n = 0 n=0 n=0,则为最小值滤波器。同样,若 n = ( 2 r + 1 ) 2 − 1 n = (2r+1)^2-1 n=(2r+1)21,则为最大值滤波器。

基于以上介绍,关于中值滤波如何在FPGA中实现,相信已经有了大致的思绪,关键点在于:

  1. 窗口如何生成
  2. 排序算法如何设计
    常见的窗口大小有 3 × 3 3\times3 3×3 5 × 5 5\times5 5×5 7 × 7 7\times7 7×7,针对 3 × 3 3\times3 3×3窗口大小的排序算法比较简单,故不多做介绍,读者可自行推导,本文以 5 × 5 5\times5 5×5窗口为例,想要实现 5 × 5 5\times5 5×5窗口,就必须对图像的前4行数据进行缓存,方法有多种,例如RAM、FIFO、移位寄存器等,本文通过调用VIVADO自带的移位寄存器IP核进行操作,IP核可以在IP Catalog中找到,如图所示:
    FPGA图像处理_中值滤波实现(含源码)_第1张图片
    FPGA图像处理_中值滤波实现(含源码)_第2张图片
    数据深度(Depth)即为图像一行的像素个数,此处是以960*480图像大小为例,若缓存前4行数据,只需串联4个移位寄存器IP核,当第5行数据到来时,移位寄存器组同步输出前4行数据,我在网上找了一个简单的示意图如下,便于理解:
    FPGA图像处理_中值滤波实现(含源码)_第3张图片
    代码如下:
`timescale 1ns / 1ps

module VIP_Matrix_Generate_5X5_8bit(
//global clock
	input				clk,  				//cmos video pixel clock
	input				rst_n,				//global reset

	//Image data prepred to be processd
	input				per_frame_vsync,	//Prepared Image data vsync valid signal
	input				per_frame_href,		//Prepared Image data href vaild  signal
	input				per_frame_clken,	//Prepared Image data output/capture enable clock
	input		[7:0]	per_img_Y,			//Prepared Image brightness input

	//Image data has been processd
	output				matrix_frame_vsync,	//Prepared Image data vsync valid signal
	output				matrix_frame_href,	//Prepared Image data href vaild  signal
	output				matrix_frame_clken,	//Prepared Image data output/capture enable clock	
	output	reg	[7:0]	matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15,	//5X5 Matrix output
	output	reg	[7:0]	matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25,
	output	reg	[7:0]	matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35,
    output	reg	[7:0]	matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45,
    output	reg	[7:0]	matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55
    );
    

//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
//sync row3_data with per_frame_clken & row1_data & raw2_data
wire	[7:0]	row1_data;	//frame data of the 1th row
wire	[7:0]	row2_data;	//frame data of the 2th row
wire	[7:0]	row3_data;	//frame data of the 3th row
wire	[7:0]	row4_data;	//frame data of the 4th row
reg		[7:0]	row5_data;	//frame data of the 5th row

//***********************************************************************此处移位寄存器落后一个时钟周期
always@(posedge clk or negedge rst_n)
begin
	if(!rst_n)
		row5_data <= 0;
	else 
		begin
		if(per_frame_clken)   //将输入的信号用像素使能时钟同步一拍,以保证数据与Shift_RAM输出的数据保持同步
			row5_data <= per_img_Y;
		else
			row5_data <= row5_data;
		end	
end

//借助4个移位寄存器进行串联
c_shift_ram_0 c_shift_ram_row4 (
  .D(row5_data),      // input wire [7 : 0] D
  .CLK(clk),  // input wire CLK
  .CE(per_frame_clken),    // input wire CE
  .Q(row4_data)      // output wire [7 : 0] Q
);

c_shift_ram_0 c_shift_ram_row3 (
  .D(row4_data),      // input wire [7: 0] D
  .CLK(clk),  // input wire CLK
  .CE(per_frame_clken),    // input wire CE
  .Q(row3_data)      // output wire [7 : 0] Q
);

c_shift_ram_0 c_shift_ram_row2 (
  .D(row3_data),      // input wire [7 : 0] D
  .CLK(clk),  // input wire CLK
  .CE(per_frame_clken),    // input wire CE
  .Q(row2_data)      // output wire [7 : 0] Q
);

c_shift_ram_0 c_shift_ram_row1 (
  .D(row2_data),      // input wire [7 : 0] D
  .CLK(clk),  // input wire CLK
  .CE(per_frame_clken),    // input wire CE
  .Q(row1_data)      // output wire [7 : 0] Q
);

//------------------------------------------
//lag 2 clocks signal sync  
reg	[1:0]	per_frame_vsync_r;
reg	[1:0]	per_frame_href_r;	
reg	[1:0]	per_frame_clken_r;
always@(posedge clk or negedge rst_n)
begin
	if(!rst_n)
		begin
		per_frame_vsync_r <= 0;
		per_frame_href_r <= 0;
		per_frame_clken_r <= 0;
		end
	else
		begin
		per_frame_vsync_r 	<= 	{per_frame_vsync_r[0], 	per_frame_vsync};
		per_frame_href_r 	<= 	{per_frame_href_r[0], 	per_frame_href};
		per_frame_clken_r 	<= 	{per_frame_clken_r[0], 	per_frame_clken};
		end
end
//为了简化运算,放弃掉第1、2行的边缘数据
//为了简化运算,放弃掉每一行的前两个像素点
//为什么要将移位寄存器的读使能延迟一个时钟周期呢?要搞清楚我要传入矩阵的数据在何时为有效数据,我需要的是row5_data,而他只落后于per_img_Y一个时钟周期
wire	read_frame_href		=	per_frame_href_r[0];	//RAM read href sync signal
wire	read_frame_clken	=	per_frame_clken_r[0];	//RAM read enable
//为什么要将矩阵的读使能延迟两个时钟周期呢?要搞清楚我的矩阵在何时为有效数据,只要matrix_p55传入第一个数据,便为有效数据,而他落后于row5_data一个时钟周期
assign	matrix_frame_vsync 	= 	per_frame_vsync_r[1];
assign	matrix_frame_href 	= 	per_frame_href_r[1];
assign	matrix_frame_clken 	= 	per_frame_clken_r[1];


//---------------------------------------------------------------------------
//---------------------------------------------------
/***********************************************该处得到5x5矩阵又落后了一个时钟周期
	(1)	Read data from Shift_RAM
	(2) Caculate the Sobel
	(3) Steady data after Sobel generate
************************************************/
always@(posedge clk or negedge rst_n)
begin
	if(!rst_n) begin
	   {matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15} <= 40'd0;
       {matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25} <= 40'd0;
       {matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35} <= 40'd0;
       {matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45} <= 40'd0;
       {matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55} <= 40'd0;
    end
	else if(read_frame_href) begin
	   if(read_frame_clken) begin//Shift_RAM data read clock enable
	       {matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15} <= {matrix_p12, matrix_p13,matrix_p14, matrix_p15,row1_data};	//1th shift input
		   {matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25} <= {matrix_p22, matrix_p23,matrix_p24, matrix_p25,row2_data};	//2th shift input
		   {matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35} <= {matrix_p32, matrix_p33,matrix_p34, matrix_p35,row3_data};	//3th shift input
		   {matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45} <= {matrix_p42, matrix_p43,matrix_p44, matrix_p45,row4_data};    //4th shift input
		   {matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55} <= {matrix_p52, matrix_p53,matrix_p54, matrix_p55,row5_data};    //5th shift input
	   end		
	   else begin
	       {matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15} <=  {matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15};   
           {matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25} <=  {matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25};
           {matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35} <=  {matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35};
           {matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45} <=  {matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45};
           {matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55} <=  {matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55};
	   end	
	end
	else begin
	   {matrix_p11, matrix_p12, matrix_p13,matrix_p14, matrix_p15} <= 40'd0;   
       {matrix_p21, matrix_p22, matrix_p23,matrix_p24, matrix_p25} <= 40'd0;
       {matrix_p31, matrix_p32, matrix_p33,matrix_p34, matrix_p35} <= 40'd0;
       {matrix_p41, matrix_p42, matrix_p43,matrix_p44, matrix_p45} <= 40'd0;
       {matrix_p51, matrix_p52, matrix_p53,matrix_p54, matrix_p55} <= 40'd0;
    end
end
endmodule

目前我们得到了一个 5 × 5 5\times5 5×5大小的窗口,接下来就是排序算法的设计,如何获取窗口的中间值呢?
由于在FPGA的图像处理领域,中值滤波的处理窗口不会太大,消耗的资源也不会太大,因此,在选择排序方法时优先考虑时间开销比较小的算法,故采用并行全比较排序算法,其基本原理就是在同一时刻完成所有数据与其他数据的比较结果,因此其时间复杂度最小,仅需一个时钟即可完成排序工作,很适合FPGA流水线处理。现假定要对 n n n个数据 d 0 , d 1 , d 2 , . . . , d n d0,d1,d2,...,dn d0,d1,d2,...,dn进行排序,那么进行并行排序的步骤如下。
(1)同时得到这 n n n个数,即对这 n n n个数进行对齐。
(2)同时将这 n n n个数分别与其他书做比较,并记录比较结果。不妨规定:若当前数大于其他数,则将结果记为1;若当前数小于等于其他数,则将结果记为0.
(3)计算每个数第(2)步的所有结果之和:由于一共是 n n n个数目,因此,必然有 n − 1 n-1 n1个比较结果。
(4)第(3)步结果的值即为排序结果
但是这样做,会有一个缺点就是假设对5个数进行排序,而这5个数完全一致,那么只能得到最大值,而没法得到中值或者其他序列的输出,因此对于相同的数值,必须找出其“异性点”进行区别对待。一个明显的“异性”便是各个数值的输入次序。现做如下规定:
(1)当前数目大于本数据之前输入数据时,结果为1,小于或等于时记为0。
(2)当前数目大于等于本数据之后输入数据时,结果记为1,小于时记为0。
如此一来,便不会出现上述情况。下面就是对窗口内一行5个数据进行排序的代码展示:

`timescale 1ns / 1ps

//该模块实现对5X5模块中的一行数据,也就是5个数据进行排序,输出最大值、次最大值,中间值,次最小值,最小值
//该模块实现了消耗6个CLK,得到5个数据的中位数
module VIP_Sort_1d(
    input           clk,
    input           rst_n,
    input           din_valid,
    input   [7:0]   din_1,din_2,din_3,din_4,din_5,
    
    output          dout_valid, 
    output  [7:0]   dout_max,dout_max_sec,dout_mid,dout_min_sec,dout_min
    );
    
parameter KSZ = 5  ; //需要排列的数据个数    
parameter MAX_NUM = 8;
parameter OUT_ID = (KSZ >> 1);  // 待输出的排序ID
reg [MAX_NUM-1:0] cmp_sum[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r2[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r3[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r4[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r5[0:KSZ-1];
reg [7:0] din[0:KSZ-1]; //定义一个存储空间,将输入数据缓存起来
reg [7:0] dout_temp_mid;
reg [7:0] dout_temp_max;
reg [7:0] dout_temp_max_sec;
reg [7:0] dout_temp_min;
reg [7:0] dout_temp_min_sec;

always @ (posedge clk or negedge rst_n) begin   //消耗了1个CLK
    if(!rst_n) begin
        din[0]  <=  8'd0;
        din[1]  <=  8'd0;
        din[2]  <=  8'd0;
        din[3]  <=  8'd0;
        din[4]  <=  8'd0;
    end
    else if(din_valid == 1'b1)begin
        din[0]  <=  din_1;
        din[1]  <=  din_2;
        din[2]  <=  din_3;
        din[3]  <=  din_4;
        din[4]  <=  din_5;
    end
end

reg [39:0] din_1_r,din_2_r,din_3_r,din_4_r,din_5_r;
always @ (posedge clk or negedge rst_n) begin
    if(!rst_n) begin
        din_1_r <=  40'd0;
        din_2_r <=  40'd0;
        din_3_r <=  40'd0;
        din_4_r <=  40'd0;
        din_5_r <=  40'd0;
    end
    else begin
        din_1_r <=  {din_1_r[31:0],din_1};
        din_2_r <=  {din_2_r[31:0],din_2};
        din_3_r <=  {din_3_r[31:0],din_3};
        din_4_r <=  {din_4_r[31:0],din_4};
        din_5_r <=  {din_5_r[31:0],din_5};
    end
end



//关键比较代码如下:
//将第一个数据与其他数据做比较,结果存放在cmp_result[0]中
reg    cmp_result[0:KSZ-1][0:KSZ-1]; //比较中间结果信号

generate
    begin:xhdl1
        genvar i;
        for(i = 1;i <= KSZ - 1;i = i + 1)
        begin:CMP_1st
            always @ (posedge clk or negedge rst_n) begin
                if(!rst_n) begin
                    cmp_result[0][i] <= 1'b0;
                    cmp_result[0][0] <= 1'b0;
                end
                else
                    cmp_result[0][i] <= (din[0] >= din[i])?1'b1:1'b0;
            end
        end
    end
endgenerate

//其他数据的比较电路
generate
    begin:xhdl4
        genvar i;
        for(i = 2;i <= KSZ;i = i + 1) //除了第一个数据的总共KSZ-1个数据
        begin:CMP_Others
            begin: xhdl2
                genvar j;
                for(j = 1;j <= i - 1;j = j + 1)
                begin:CMP_Previous    //与本数据之前的数据作比较
                    always @ (posedge clk or negedge rst_n) begin
                        if(!rst_n) begin
                            cmp_result[i-1][j-1]    <= 1'b0;
                            cmp_result[i-1][i-1]    <= 1'b0;
                        end
                        else
                            cmp_result[i-1][j-1]    <= (din[i - 1] > din[j - 1])?1'b1:1'b0;
                    end
                end
            end
            begin:xhdl3
                genvar j;
                for(j = i + 1;j <= KSZ;j = j + 1)
                begin:CMP_After    //与本数据之后的数据作比较
                    always @ (posedge clk or negedge rst_n) begin
                        if(!rst_n)
                            cmp_result[i-1][j-1] <=  1'b0;
                        else
                            cmp_result[i-1][j-1] <=  (din[i - 1] >= din[j - 1])?1'b1:1'b0;
                    end
                end
            end
        end
    end
endgenerate

//将数据有效信号延迟6个CLK
reg [5:0] din_valid_r;
always @ (posedge clk or negedge rst_n) begin
    if(!rst_n)
        din_valid_r <= 6'd0;
    else
        din_valid_r <= {din_valid_r[4:0],din_valid};
end

//将比较结果进行相加
generate
    begin:xhdl5
        genvar i;
        for(i = 0;i <= KSZ - 1;i = i + 1)
        begin:CMP_r_sum
            if(KSZ == 5)
                begin:sum_ksz_5
                    always @(posedge clk or negedge rst_n) begin
                        if(!rst_n) begin
                            cmp_sum[i]     <= {MAX_NUM{1'b0}};
                            cmp_sum_r[i]   <= {MAX_NUM{1'b0}};
                            cmp_sum_r2[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r3[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r4[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r5[i]  <= {MAX_NUM{1'b0}};
                        end
                        else begin
                            if(din_valid_r[1] == 1'b1) begin
                                cmp_sum_r[i]    <= (cmp_result[i][0])+(cmp_result[i][4]);
                                cmp_sum_r2[i]   <= (cmp_result[i][1])+(cmp_result[i][3]);
                                cmp_sum_r3[i]   <= (cmp_result[i][2]);
                            end
                            if(din_valid_r[2] == 1'b1) begin
                                cmp_sum_r4[i]   <= cmp_sum_r[i] + cmp_sum_r2[i];
                                cmp_sum_r5[i]   <= cmp_sum_r3[i] ;
                            end
                            if(din_valid_r[3] == 1'b1)
                                cmp_sum[i]     <= cmp_sum_r4[i] + cmp_sum_r5[i];
                        end                  
                    end
             end
          end
    end

    if(KSZ == 5)
    begin:dout_ksz_5
        always @ (posedge clk or negedge rst_n) begin
            if(!rst_n) begin
                dout_temp_mid   <= 8'd0;
                dout_temp_min_sec<= 8'd0;
                dout_temp_max   <= 8'd0;
                dout_temp_max_sec<= 8'd0;
                dout_temp_min   <= 8'd0;
            end
            else begin
                if(din_valid_r[4] == 1'b1) begin
                    if(cmp_sum[0] == OUT_ID + 2)            //求解最大值
                        dout_temp_max   <= din_1_r[39:32];   
                    if(cmp_sum[1] == OUT_ID + 2)   
                        dout_temp_max   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID + 2)   
                        dout_temp_max   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID + 2)   
                        dout_temp_max   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID + 2)  
                        dout_temp_max   <= din_5_r[39:32];
                        
                    if(cmp_sum[0] == OUT_ID + 1)            //求解次最大值
                        dout_temp_max_sec   <= din_1_r[39:32];   
                    if(cmp_sum[1] == OUT_ID + 1)   
                        dout_temp_max_sec   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID + 1)   
                        dout_temp_max_sec   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID + 1)   
                        dout_temp_max_sec   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID + 1)  
                        dout_temp_max_sec   <= din_5_r[39:32];
                        
                    if(cmp_sum[0] == OUT_ID)                //求解中间值
                        dout_temp_mid   <= din_1_r[39:32];
                    if(cmp_sum[1] == OUT_ID)   
                        dout_temp_mid   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID)   
                        dout_temp_mid   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID)   
                        dout_temp_mid   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID)  
                        dout_temp_mid   <= din_5_r[39:32]; 
                           
                    if(cmp_sum[0] == OUT_ID - 1)            //求解次最小值
                        dout_temp_min_sec   <= din_1_r[39:32];
                    if(cmp_sum[1] == OUT_ID - 1)   
                        dout_temp_min_sec   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID - 1)   
                        dout_temp_min_sec   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID - 1)   
                        dout_temp_min_sec   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID - 1)  
                        dout_temp_min_sec   <= din_5_r[39:32]; 
                        
                    if(cmp_sum[0] == OUT_ID - 2)            //求解最小值
                        dout_temp_min   <= din_1_r[39:32];
                    if(cmp_sum[1] == OUT_ID - 2)   
                        dout_temp_min   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID - 2)   
                        dout_temp_min   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID - 2)   
                        dout_temp_min   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID - 2)  
                        dout_temp_min   <= din_5_r[39:32]; 
                end
            end  
        end  
     end 
endgenerate

assign dout_max = dout_temp_max;
assign dout_max_sec = dout_temp_max_sec;
assign dout_mid = dout_temp_mid;
assign dout_min_sec = dout_temp_min_sec;
assign dout_min = dout_temp_min;

assign dout_valid = din_valid_r[5];
endmodule

对窗口内每行数据都调用此模块,我们可以得到每行数据的中值,但是仅仅是这5个中值不能够代表整个窗口数据的中值,我们还需要找到每行数据的次最大值中的最小值,以及次最小值中的最大值与这5个中值一起组成7个数据,找出这7个数据的中值即为整个窗口的中值。我们仅需要对上述模块进行改写,便能实现对7个数据进行比较,代码改写如下:

`timescale 1ns / 1ps

//该模块实现了消耗6个CLK,得到7个数据的中位数
module VIP_Sort_2d(
    input           clk,
    input           rst_n,
    input           din_valid,
    input   [7:0]   din_1,din_2,din_3,din_4,din_5,din_6,din_7,
    
    output          dout_valid, 
    output  [7:0]   dout_mid
    );
    
parameter KSZ = 7  ; //需要排列的数据个数    
parameter MAX_NUM = 8;
parameter OUT_ID = (KSZ >> 1);  // 待输出的排序ID
reg [MAX_NUM-1:0] cmp_sum[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r2[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r3[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r4[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r5[0:KSZ-1];
reg [MAX_NUM-1:0] cmp_sum_r6[0:KSZ-1];

reg [7:0] din[0:KSZ-1]; //定义一个存储空间,将输入数据缓存起来
reg [7:0] dout_temp_mid;

always @ (posedge clk or negedge rst_n) begin   //消耗了1个CLK
    if(!rst_n) begin
        din[0]  <=  8'd0;
        din[1]  <=  8'd0;
        din[2]  <=  8'd0;
        din[3]  <=  8'd0;
        din[4]  <=  8'd0;
        din[5]  <=  8'd0;
        din[6]  <=  8'd0;
    end
    else if(din_valid == 1'b1)begin
        din[0]  <=  din_1;
        din[1]  <=  din_2;
        din[2]  <=  din_3;
        din[3]  <=  din_4;
        din[4]  <=  din_5;
        din[5]  <=  din_6;
        din[6]  <=  din_7;
    end
end

reg [39:0] din_1_r,din_2_r,din_3_r,din_4_r,din_5_r,din_6_r,din_7_r;
always @ (posedge clk or negedge rst_n) begin
    if(!rst_n) begin
        din_1_r <=  40'd0;
        din_2_r <=  40'd0;
        din_3_r <=  40'd0;
        din_4_r <=  40'd0;
        din_5_r <=  40'd0;
        din_6_r <=  40'd0;
        din_7_r <=  40'd0;
    end
    else begin
        din_1_r <=  {din_1_r[31:0],din_1};
        din_2_r <=  {din_2_r[31:0],din_2};
        din_3_r <=  {din_3_r[31:0],din_3};
        din_4_r <=  {din_4_r[31:0],din_4};
        din_5_r <=  {din_5_r[31:0],din_5};
        din_6_r <=  {din_6_r[31:0],din_6};
        din_7_r <=  {din_7_r[31:0],din_7};
    end
end



//关键比较代码如下:
//将第一个数据与其他数据做比较,结果存放在cmp_result[0]中
reg    cmp_result[0:KSZ-1][0:KSZ-1]; //比较中间结果信号

generate
    begin:xhdl1
        genvar i;
        for(i = 1;i <= KSZ - 1;i = i + 1)
        begin:CMP_1st
            always @ (posedge clk or negedge rst_n) begin
                if(!rst_n) begin
                    cmp_result[0][i] <= 1'b0;
                    cmp_result[0][0] <= 1'b0;
                end
                else
                    cmp_result[0][i] <= (din[0] >= din[i])?1'b1:1'b0;
            end
        end
    end
endgenerate

//其他数据的比较电路
generate
    begin:xhdl4
        genvar i;
        for(i = 2;i <= KSZ;i = i + 1) //除了第一个数据的总共KSZ-1个数据
        begin:CMP_Others
            begin: xhdl2
                genvar j;
                for(j = 1;j <= i - 1;j = j + 1)
                begin:CMP_Previous    //与本数据之前的数据作比较
                    always @ (posedge clk or negedge rst_n) begin
                        if(!rst_n) begin
                            cmp_result[i-1][j-1]    <= 1'b0;
                            cmp_result[i-1][i-1]    <= 1'b0;
                        end
                        else
                            cmp_result[i-1][j-1]    <= (din[i - 1] > din[j - 1])?1'b1:1'b0;
                    end
                end
            end
            begin:xhdl3
                genvar j;
                for(j = i + 1;j <= KSZ;j = j + 1)
                begin:CMP_After    //与本数据之后的数据作比较
                    always @ (posedge clk or negedge rst_n) begin
                        if(!rst_n)
                            cmp_result[i-1][j-1] <=  1'b0;
                        else
                            cmp_result[i-1][j-1] <=  (din[i - 1] >= din[j - 1])?1'b1:1'b0;
                    end
                end
            end
        end
    end
endgenerate

//将数据有效信号延迟6个CLK
reg [5:0] din_valid_r;
always @ (posedge clk or negedge rst_n) begin
    if(!rst_n)
        din_valid_r <= 6'd0;
    else
        din_valid_r <= {din_valid_r[4:0],din_valid};
end

//将比较结果进行相加
generate
    begin:xhdl5
        genvar i;
        for(i = 0;i <= KSZ - 1;i = i + 1)
        begin:CMP_r_sum
            if(KSZ == 7)
                begin:sum_ksz_5
                    always @(posedge clk or negedge rst_n) begin
                        if(!rst_n) begin
                            cmp_sum[i]     <= {MAX_NUM{1'b0}};
                            cmp_sum_r[i]   <= {MAX_NUM{1'b0}};
                            cmp_sum_r2[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r3[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r4[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r5[i]  <= {MAX_NUM{1'b0}};
                            cmp_sum_r6[i]  <= {MAX_NUM{1'b0}};   
                        end
                        else begin
                            if(din_valid_r[1] == 1'b1) begin
                                cmp_sum_r[i]    <= (cmp_result[i][0])+(cmp_result[i][6]);
                                cmp_sum_r2[i]   <= (cmp_result[i][1])+(cmp_result[i][5]);
                                cmp_sum_r3[i]   <= (cmp_result[i][2])+(cmp_result[i][4]);
                                cmp_sum_r4[i]   <= (cmp_result[i][3]);
                            end
                            if(din_valid_r[2] == 1'b1) begin
                                cmp_sum_r5[i]   <= cmp_sum_r[i] + cmp_sum_r2[i];
                                cmp_sum_r6[i]   <= cmp_sum_r3[i]+ cmp_sum_r4[i];
                            end
                            if(din_valid_r[3] == 1'b1)
                                cmp_sum[i]     <= cmp_sum_r5[i] + cmp_sum_r6[i];
                        end                  
                    end
             end
          end
    end

    if(KSZ == 7)
    begin:dout_ksz_5
        always @ (posedge clk or negedge rst_n) begin
            if(!rst_n)
                dout_temp_mid   <= 8'd0;
            else begin
                if(din_valid_r[4] == 1'b1) begin
                    if(cmp_sum[0] == OUT_ID)
                        dout_temp_mid   <= din_1_r[39:32];
                    if(cmp_sum[1] == OUT_ID)   
                        dout_temp_mid   <= din_2_r[39:32];
                    if(cmp_sum[2] == OUT_ID)   
                        dout_temp_mid   <= din_3_r[39:32];
                    if(cmp_sum[3] == OUT_ID)   
                        dout_temp_mid   <= din_4_r[39:32];  
                    if(cmp_sum[4] == OUT_ID)  
                        dout_temp_mid   <= din_5_r[39:32];   
                    if(cmp_sum[5] == OUT_ID)  
                        dout_temp_mid   <= din_6_r[39:32];
                    if(cmp_sum[6] == OUT_ID)  
                        dout_temp_mid   <= din_7_r[39:32];            
                end
            end  
        end  
     end 
endgenerate
assign dout_mid = dout_temp_mid;
assign dout_valid = din_valid_r[5];
endmodule

对以上代码进行拼接,我们便能实现基于FPGA图像处理的中值滤波算法,中值滤波模块代码如下:

`timescale 1ns / 1ps
//5X5的中值滤波
module VIP_8Bit_Medfilt(
    input               clk   ,
    input               rst_n ,
    input				per_frame_vsync,	//Prepared Image data vsync valid signal
	input				per_frame_href,		//Prepared Image data href vaild  signal
	input				per_frame_clken,	//Prepared Image data output/capture enable clock
	input	[7:0]	    per_img_8Bit,		//Prepared Image Bit flag outout(1: Value, 0:inValid)
    
    //Image data has been processd
	output				post_frame_vsync,	//Processed Image data vsync valid signal
	output				post_frame_href,	//Processed Image data href vaild  signal
	output				post_frame_clken,	//Processed Image data output/capture enable clock
	output	[7:0]	    post_img_8Bit		//Processed Image Bit flag outout(1: Value, 0:inValid)
    );
    
//调用5X5的卷积核,消耗2个CLK
wire			matrix_frame_vsync;	//Prepared Image data vsync valid signal
wire			matrix_frame_href;	//Prepared Image data href vaild  signal
wire			matrix_frame_clken;	//Prepared Image data output/capture enable clock	
wire	[7:0]	matrix_p11, matrix_p12, matrix_p13, matrix_p14, matrix_p15;	//5X5 Matrix output
wire	[7:0]	matrix_p21, matrix_p22, matrix_p23, matrix_p24, matrix_p25;	
wire	[7:0]	matrix_p31, matrix_p32, matrix_p33, matrix_p34, matrix_p35;	
wire	[7:0]	matrix_p41, matrix_p42, matrix_p43, matrix_p44, matrix_p45;	
wire	[7:0]	matrix_p51, matrix_p52, matrix_p53, matrix_p54, matrix_p55;	

VIP_Matrix_Generate_5X5_8bit VIP_Matrix_Generate_5X5_8bit_inst(
//global clock
	.clk  (clk),  				//cmos video pixel clock
	.rst_n(rst_n),				//global reset

	//Image data prepred to be processd
	.per_frame_vsync(per_frame_vsync),	//Prepared Image data vsync valid signal
	.per_frame_href (per_frame_href),		//Prepared Image data href vaild  signal
	.per_frame_clken(per_frame_clken),	//Prepared Image data output/capture enable clock
	.per_img_Y(per_img_8Bit),			//Prepared Image brightness input

	//Image data has been processd
	.matrix_frame_vsync(matrix_frame_vsync),	//Prepared Image data vsync valid signal
	.matrix_frame_href (matrix_frame_href),	//Prepared Image data href vaild  signal
	.matrix_frame_clken(matrix_frame_clken),	//Prepared Image data output/capture enable clock	
	.matrix_p11(matrix_p11),.matrix_p12(matrix_p12),.matrix_p13(matrix_p13),.matrix_p14(matrix_p14),.matrix_p15(matrix_p15),	//3X3 Matrix output
	.matrix_p21(matrix_p21),.matrix_p22(matrix_p22),.matrix_p23(matrix_p23),.matrix_p24(matrix_p24),.matrix_p25(matrix_p25),
	.matrix_p31(matrix_p31),.matrix_p32(matrix_p32),.matrix_p33(matrix_p33),.matrix_p34(matrix_p34),.matrix_p35(matrix_p35),
    .matrix_p41(matrix_p41),.matrix_p42(matrix_p42),.matrix_p43(matrix_p43),.matrix_p44(matrix_p44),.matrix_p45(matrix_p45),
    .matrix_p51(matrix_p51),.matrix_p52(matrix_p52),.matrix_p53(matrix_p53),.matrix_p54(matrix_p54),.matrix_p55(matrix_p55)
    );
    
wire dout_valid;
wire [7:0] row1_max,row1_mid,row1_min;
wire [7:0] row2_max,row2_mid,row2_min;
wire [7:0] row3_max,row3_mid,row3_min;
wire [7:0] row4_max,row4_mid,row4_min;
wire [7:0] row5_max,row5_mid,row5_min;

//获取5X5模板中每一行的次最大值、中间值、和次最小值,消耗6个CLK
VIP_Sort_1d VIP_Sort_1d_row1(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(matrix_frame_clken),
    .din_1(matrix_p11),.din_2(matrix_p12),.din_3(matrix_p13),.din_4(matrix_p14),.din_5(matrix_p15),
    
    .dout_valid(dout_valid), 
    .dout_max(),.dout_max_sec(row1_max),.dout_mid(row1_mid),.dout_min_sec(row1_min),.dout_min()
    );
    
VIP_Sort_1d VIP_Sort_1d_row2(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(matrix_frame_clken),
    .din_1(matrix_p21),.din_2(matrix_p22),.din_3(matrix_p23),.din_4(matrix_p24),.din_5(matrix_p25),
    
    .dout_valid(), 
    .dout_max(),.dout_max_sec(row2_max),.dout_mid(row2_mid),.dout_min_sec(row2_min),.dout_min()
    );
    
VIP_Sort_1d VIP_Sort_1d_row3(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(matrix_frame_clken),
    .din_1(matrix_p31),.din_2(matrix_p32),.din_3(matrix_p33),.din_4(matrix_p34),.din_5(matrix_p35),
    
    .dout_valid(), 
    .dout_max(),.dout_max_sec(row3_max),.dout_mid(row3_mid),.dout_min_sec(row3_min),.dout_min()
    );
    
VIP_Sort_1d VIP_Sort_1d_row4(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(matrix_frame_clken),
    .din_1(matrix_p41),.din_2(matrix_p42),.din_3(matrix_p43),.din_4(matrix_p44),.din_5(matrix_p45),
    
    .dout_valid(), 
    .dout_max(),.dout_max_sec(row4_max),.dout_mid(row4_mid),.dout_min_sec(row4_min),.dout_min()
    );
    
VIP_Sort_1d VIP_Sort_1d_row5(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(matrix_frame_clken),
    .din_1(matrix_p51),.din_2(matrix_p52),.din_3(matrix_p53),.din_4(matrix_p54),.din_5(matrix_p55),
    
    .dout_valid(), 
    .dout_max(),.dout_max_sec(row5_max),.dout_mid(row5_mid),.dout_min_sec(row5_min),.dout_min()
    );

//寻找每一行次最大值中的最小值,次最小值中的最大值,消耗6个CLK
wire dout_valid_col;
wire [7:0] dout_max_min;
VIP_Sort_1d VIP_Sort_1d_col1(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(dout_valid),
    .din_1(row1_max),.din_2(row2_max),.din_3(row3_max),.din_4(row4_max),.din_5(row5_max),
    
    .dout_valid(dout_valid_col), 
    .dout_max(),.dout_max_sec(),.dout_mid(),.dout_min_sec(),.dout_min(dout_max_min)
    );
    
wire [7:0] dout_min_max;
VIP_Sort_1d VIP_Sort_1d_col2(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(dout_valid),
    .din_1(row1_min),.din_2(row2_min),.din_3(row3_min),.din_4(row4_min),.din_5(row5_min),
    
    .dout_valid(), 
    .dout_max(dout_min_max),.dout_max_sec(),.dout_mid(),.dout_min_sec(),.dout_min()
    );
    
//此时row1_mid、row2_mid、row3_mid、row4_mid、row5_mid落后于dout_max_min、dout_min_max6个CLK
//为了保持同步,将中间值也延迟6个CLK
reg [47:0] row1_mid_r,row2_mid_r,row3_mid_r,row4_mid_r,row5_mid_r;
always @ (posedge clk or negedge rst_n) begin
    if(!rst_n) begin
        row1_mid_r  <=  48'd0;
        row2_mid_r  <=  48'd0;
        row3_mid_r  <=  48'd0;
        row4_mid_r  <=  48'd0;
        row5_mid_r  <=  48'd0;
    end
    else begin
        row1_mid_r  <=  {row1_mid_r[39:0],row1_mid};
        row2_mid_r  <=  {row2_mid_r[39:0],row2_mid};
        row3_mid_r  <=  {row3_mid_r[39:0],row3_mid};
        row4_mid_r  <=  {row4_mid_r[39:0],row4_mid};
        row5_mid_r  <=  {row5_mid_r[39:0],row5_mid};
    end
end

wire [7:0] row1_mid_r2,row2_mid_r2,row3_mid_r2,row4_mid_r2,row5_mid_r2;
assign row1_mid_r2 = row1_mid_r[47:40];
assign row2_mid_r2 = row2_mid_r[47:40];
assign row3_mid_r2 = row3_mid_r[47:40];
assign row4_mid_r2 = row4_mid_r[47:40];
assign row5_mid_r2 = row5_mid_r[47:40];

//计算采集到的7个数据的中位数,消耗6个CLK
wire [7:0] dout;
VIP_Sort_2d VIP_Sort_2d_inst(
    .clk  (clk),
    .rst_n(rst_n),
    .din_valid(dout_valid_col),
    .din_1(dout_max_min),.din_2(dout_min_max),.din_3(row1_mid_r2),.din_4(row2_mid_r2),.din_5(row3_mid_r2),.din_6(row4_mid_r2),.din_7(row5_mid_r2),
    
    .dout_valid(dout_valid_1), 
    .dout_mid(dout)
    );    
    
//为了保持同步,将输入控制信号也延时2+6+6+6 = 20个CLK
reg [19:0]  post_frame_vsync_r;
reg [19:0]  post_frame_href_r;
reg [19:0]  post_frame_clken_r;

always @ (posedge clk or negedge rst_n) begin
    if(!rst_n) begin
        post_frame_vsync_r  <=  20'd0;
        post_frame_href_r   <=  20'd0;
        post_frame_clken_r  <=  20'd0;
    end
    else begin
        post_frame_vsync_r  <=  {post_frame_vsync_r[18:0],per_frame_vsync};
        post_frame_href_r   <=  {post_frame_href_r[18:0] ,per_frame_href };
        post_frame_clken_r  <=  {post_frame_clken_r[18:0],per_frame_clken};
    end
end

assign post_frame_vsync = post_frame_vsync_r[19];
assign post_frame_href  = post_frame_href_r [19];
assign post_frame_clken = post_frame_clken_r[19];
assign post_img_8Bit    = dout;
endmodule

现在,我们来仿真观察效果如何
原图(加入噪声密度:0.3的椒盐噪声):

处理之后的图:

可以看到效果还是非常明显的,但是仔细观察经过中值滤波后的图像就会发现,在整幅图像的左边缘和下边缘会存在“黑边”,这是由于代码设计的缺陷所导致的,为了兼顾代码的简单可操作,同时保证效果,舍弃了边缘数据,感兴趣的同学可以继续研究,如何让算法更完善,效果更好。

参考文献:牟新刚,周晓,郑晓亮.基于FPGA的数字图像处理原理及应用;
阮秋琦.数字图像处理(MATLAB版)(第二版)》(本科教学版);
CrazyBingo.基于VIP_Board Big的FPGA入门进阶及图像处理算法开发教程;
韩玉鑫,王晓凯,陆金旺.实时彩色图像自适应中值滤波算法的FPGA实现[J].计算测量与控制.2022.30(7):173-180;
穆向阳,雷妍.一种中值滤波快速系统的FPGA实现[J].西安石油大学学报.2022.6(24):1-8;

你可能感兴趣的:(#,基于FPGA的图像处理,fpga开发,图像处理,人工智能)