h.265/HEVC解码器verilog实现

分享自己几年之前写的h265解码器的代码,代码全部分享于github上,点击链接进去,https://github.com/tishi43/h265_decoder

c参考代码在  https://github.com/tishi43/h265_c_reference

同时分享另一个年代比较久远的h264解码器,大概有10年了,起初是作为我表弟课题来做的,绝大部分由我的表弟完成。在Altera还统治FPGA的时代,在CycloneIII上验证,这个版本是比较老的版本,后面有在Xilinx ZYNQ7020上验证和功能的完善。

https://github.com/tishi43/h264_decoder

以下大致介绍一下h.265解码器,在zynq7035平台上验证。

顶层文件是decode_stream.sv,

主要包括stream接口用于向解码器喂码流,axi3接口用于写入解码后yuv数据,以及参考帧的读入。

// Top-level port list of the decoder (decode_stream.sv).
// Groups: global control, byte-wide bitstream input, decoded-picture status,
// an AXI read channel (reference-frame fetch) and an AXI write channel
// (decoded YUV output).
module decode_stream (

 //global signals

 input  wire                              clk,

 input  wire                              rst,

 input  wire                              en,

 //interface to bitstream memory or fifo

 input  wire [ 7:0]                       stream_mem_data_in,

 input  wire                              stream_mem_valid,

 output wire [31:0]                       stream_mem_addr_out,

 output wire                              stream_mem_rd, // request stream read by read_nalu

 input  wire                              stream_mem_end, // end of stream reached

 output wire [12:0]                       pic_width_in_luma_samples, //max 4096

 output wire [11:0]                       pic_height_in_luma_samples, //max 2160

 output wire [63:0]                       pic_num,

 output wire [ 3:0]                       cur_pic_dpb_slot,

 output wire                              write_yuv,

 input  wire                              ext_mem_init_done,

 // NOTE(review): the fd_* inputs look like simulation file handles for
 // per-stage log/dump files - confirm against the testbench.
 input  wire [31:0]                       fd_log,

 input  wire [31:0]                       fd_pred,

 input  wire [31:0]                       fd_intra_pred_chroma,

 input  wire [31:0]                       fd_tq_luma,

 input  wire [31:0]                       fd_tq_cb,

 input  wire [31:0]                       fd_tq_cr,

 input  wire [31:0]                       fd_filter,

 input  wire [31:0]                       fd_deblock,

 //axi bus read if

 input  wire                              m_axi_arready,

 output wire                              m_axi_arvalid,

 output wire [ 3:0]                       m_axi_arlen,

 output wire [31:0]                       m_axi_araddr,

 output wire [ 5:0]                       m_axi_arid,

 output wire [ 2:0]                       m_axi_arsize,

 output wire [ 1:0]                       m_axi_arburst,

 output wire [ 2:0]                       m_axi_arprot,

 output wire [ 3:0]                       m_axi_arcache,

 output wire [ 1:0]                       m_axi_arlock,

 output wire [ 3:0]                       m_axi_arqos,

 output wire                              m_axi_rready,

 input  wire [63:0]                       m_axi_rdata,

 input  wire                              m_axi_rvalid,

 input  wire                              m_axi_rlast,

 //axi bus write if

 input  wire                              m_axi_awready, // Indicates slave is ready to accept a write address

 output wire [ 5:0]                       m_axi_awid,    // Write ID

 output wire [31:0]                       m_axi_awaddr,  // Write address

 output wire [ 3:0]                       m_axi_awlen,   // Write Burst Length

 output wire [ 2:0]                       m_axi_awsize,  // Write Burst size

 output wire [ 1:0]                       m_axi_awburst, // Write Burst type

 output wire [ 1:0]                       m_axi_awlock,  // Write lock type

 output wire [ 3:0]                       m_axi_awcache, // Write Cache type

 output wire [ 2:0]                       m_axi_awprot,  // Write Protection type

 output wire                              m_axi_awvalid, // Write address valid

 input  wire                              m_axi_wready,  // Write data ready

 output wire [ 5:0]                       m_axi_wid,     // Write ID tag

 output wire [63:0]                       m_axi_wdata,    // Write data

 output wire [ 7:0]                       m_axi_wstrb,    // Write strobes

 output wire                              m_axi_wlast,    // Last write transaction

 output wire                              m_axi_wvalid,   // Write valid

 input  wire [ 5:0]                       m_axi_bid,     // Response ID

 input  wire [ 1:0]                       m_axi_bresp,   // Write response

 input  wire                              m_axi_bvalid,  // Write response valid

 output wire                              m_axi_bready,  // Response ready

 // NOTE(review): m_axi_rrid is declared as an output although it sits next
 // to the read-response input m_axi_rresp; an AXI master normally receives
 // the read ID as an input - confirm direction and name against the RTL.
 output wire [ 5:0]                       m_axi_rrid,

 input  wire [ 1:0]                       m_axi_rresp

);

整体结构如下图

h.265/HEVC解码器verilog实现_第1张图片

大致包含如下部分

read_nalu.v,rbsp_buffer.v,rbsp_buffer_simple.v,

码流进来读nalu,buffer管理。

bitstream_controller.sv,

pps.v,

sps.v,

vps.v,

slice_header.

slice_data.sv,

cu.sv,

tu.sv

这些文件用于码流解析,分别解vps,sps,pps,slice_header,slice_data,cu,tu.

cabac.v,cabac_bypass_decode_bin.v,cabac_decode_bin.v,cabac_terminate_decode_bin.v,

dec_bin_cu.v,dec_bin_gt1_etc.v,dec_bin_sd.v,dec_bin_sig.v,dec_bin_xy_pref.v,

cabac解码,包含3种bin(常规、旁路、终止)的解码;为了缩短关键路径,bin的解码逻辑被拆分成上面dec_bin_*的5个模块。

trans_quant_32.sv,trans_quant_16.sv

反量化,IDCT

intra_pred_32.sv, intra_pred_16.sv

luma和chroma的帧内预测,

inter_pred_luma.sv, inter_pred_chroma.sv

luma和chroma的帧间预测,

mv.sv

求mv,参考帧,derive_motion_vector_prediction, derive_temporal_motion_vector,两种mv的计算,这块是h.265相比h.264复杂很多的地方。

filter_64.sv,filter_32.sv,

luma和chroma的滤波,包含去方块滤波和sao滤波。

下面是zynq7035平台上的验证,block_design如下,

h.265/HEVC解码器verilog实现_第2张图片

添加ZYNQ7000的处理器模块,暴露AXI HP slave interface,和一个AXI GP interface,

一个AXI-Stream FIFO,AXI Interconnect,AXI Interconnect转换32位宽度到8位,来喂给decode_stream,

AXI HP slave interface, 用于PL访问PS侧的ddr,

AXI GP interface,用于通过读写AXI-stream FIFO的寄存器来把码流喂向PL端。

ARM端SDK的代码,如下,从sd卡读取码流文件,写AXI-stream fifo寄存器,把码流喂向PL,解码完成保存yuv文件到sd卡。

#include <stdio.h>

#include "platform.h"

#include "xil_printf.h"

#include "xdevcfg.h"

#include "xparameters.h"

#include "ff.h"



/*
 * Forward declarations of the SD-card helpers. Their definitions are not
 * part of this listing; main() calls SD_Init() once at start-up before any
 * file access.
 */
int SD_Init(void);

int Sd_Test_Write(void);

int Sd_Test_Read(void);



/*
 * Memory-mapped registers of the AXI-Stream FIFO that main() writes to push
 * the bitstream towards the PL. Each macro is an absolute address
 * (BASE_ADDR plus a register offset).
 * NOTE(review): names and offsets appear to follow the Xilinx AXI4-Stream
 * FIFO register map (ISR/IER interrupt status/enable, TDFR reset, TDFV
 * vacancy, TDFD data port, TLR length, TDR destination) - confirm against
 * the IP documentation and the address assigned in the block design.
 */
#define BASE_ADDR 0x43c00000

#define ISR BASE_ADDR

#define IER (BASE_ADDR+0x4)

#define TDFR (BASE_ADDR+0x8)

/* Polled by main() after each packet; it waits until the value is >= 400. */
#define TDFV (BASE_ADDR+0xC)

/* Data port: main() writes eight 32-bit words here per packet. */
#define TDFD (BASE_ADDR+0x10)

/* Written with 0x20 after the eight data words of a packet. */
#define TLR (BASE_ADDR+0x14)

#define TDR (BASE_ADDR+0x2c)



/* FatFs filesystem object; presumably mounted by SD_Init() - confirm. */
static FATFS fatfs;

/*
 * Picture dimensions in pixels (3840x2160).
 * NOTE(review): presumably used to size the YUV frames written to out.yuv;
 * the code that uses them is outside this listing - confirm.
 */
#define WIDTH 3840

#define HEIGHT 2160



int main()

{

    init_platform();

    print("zynq_sd_card_fatfs-test a\r\n");

    SD_Init();



    xil_printf("PLL status %x\r\n",*(unsigned int *)0xf800010c);





    FIL fil;

    FRESULT rc;

    UINT br;

    unsigned int *bit_addr=(unsigned int *)0x1000000;

    rc = f_open(&fil,"in.265",FA_READ);

    if(rc)

    {

        xil_printf("ERROR : f_open returned %d\r\n",rc);

        return XST_FAILURE;

    }

    rc = f_lseek(&fil, 0);

    rc = f_read(&fil,bit_addr,4096000,&br);

    xil_printf("br=%d\r\n",br);

    rc = f_close(&fil);







    unsigned int i;

    unsigned int j,k;

    unsigned int counter;

    unsigned int tdfv=0;

    unsigned int isr;

    unsigned int decode_times=0;



    i=0;

    while(i<((br+3)/4))

    {

            *(unsigned int *)ISR = 0xffffffff;



            *(unsigned int *)IER = 0x0C000000;

            *(unsigned int *)TDR = 0x2;

            *(unsigned int *)TDFD = bit_addr[i];

            *(unsigned int *)TDFD = bit_addr[i+1];

            *(unsigned int *)TDFD = bit_addr[i+2];

            *(unsigned int *)TDFD = bit_addr[i+3];

            *(unsigned int *)TDFD = bit_addr[i+4];

            *(unsigned int *)TDFD = bit_addr[i+5];

            *(unsigned int *)TDFD = bit_addr[i+6];

            *(unsigned int *)TDFD = bit_addr[i+7];



            *(unsigned int *)TLR = 0x20;



            do{

                tdfv=*(volatile unsigned int *)TDFV;

            }while(tdfv<400);





        i+=8;

    }



        counter=200*1024*1024; //wait enough time for decode finish

        while(counter>0){

            counter--;

        }



         rc = f_open(&fil,"out.yuv",FA_WRITE|FA_CREATE_NEW);

    if(rc)

    {

        xil_printf("ERROR : f_open returned %d\r\n",rc);

        return XST_FAILURE;

    }

    for (j=0;j<5;j++){

        for(k=0;k

你可能感兴趣的:(h.264,verilog)