neon 64bits hevc优化例子

由于之前ffmpeg中的hevc decoder还没有neon 64bits(AArch64)的优化,所以我参与了这部分工作。

大部分指令从这里查:

http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0802a/USHLL_advsimd_vector.html


另外还有两个文档:

ARMv8-A programming guide.pdf

ARMv8-A_Architecture_Reference_Manual_(Issue_A.a).pdf



 - Data type (arrangement specifier): 8B/16B/4H/8H/2S/4S/1D/2D. The number is the lane count and the letter is the element size: B represents a byte (8-bit), H a half-word (16-bit), S a word (32-bit), and D a double-word (64-bit).

For example:
UADDLP V0.8H, V0.16B
FADD V0.4S, V0.4S, V0.4S


这里是一个mc(运动补偿)部分的代码,原始c函数是:put_hevc_pel_bi_pixels。汇编的做法是一次处理8个bytes的数据,一直处理到输入的最后一行,然后回到第一行,水平偏移加8,处理下一列的8 bytes数据。最后一列有可能只剩4 bytes的数据,需要单独处理。




对应优化汇编以及注释解释(右边是类似C的解释):

x0 uint8_t *_dst,                      dst,        
x1 ptrdiff_t _dststride,               dststride,  
x2 uint8_t *_src,                      src1,       
x3 ptrdiff_t _srcstride,               src1stride, 
x4 int16_t *src2,                      lc->tmp,    
x5 int height,                         block_h,    
x6 intptr_t mx,                        mx1,        
x7 intptr_t my,                        my1,        
[sp] int width                         block_w     (第9个参数,通过栈传递)

/* Immediate operands for the routine below. The leading '#' is part of the
 * macro text so each name can be written directly as an instruction operand. */
/* NOTE(review): presumably the maximum prediction-block width in samples;
 * it is not referenced by the routine shown here -- confirm against callers. */
#define MAX_PB_SIZE #64
/* Byte distance between consecutive rows of the int16_t src2 buffer: it is
 * loaded into x7 and used as the post-index increment of the src2 loads. */
#define MAX_PB_DOUBLESIZE #128

/*
 * void ff_hevc_put_qpel_bi_neon_8(uint8_t *_dst, ptrdiff_t _dststride,
 *                                 uint8_t *_src, ptrdiff_t _srcstride,
 *                                 int16_t *src2, int height,
 *                                 intptr_t mx, intptr_t my, int width)
 *
 * Target: AArch64, GNU assembler syntax, AAPCS64.
 *
 * For every pixel of a width x height block, using 16-bit lanes:
 *     dst[x] = sat_u8(((src[x] << 6) + src2[x] + 64) >> 7)
 * The +64 is the rounding performed by SRSHR #7; SQXTUN does the saturation
 * to the unsigned 8-bit range.
 *
 * The block is walked in vertical strips: an 8-pixel-wide strip is processed
 * for all rows, then the pointers are rewound to row 0 and moved 8 pixels to
 * the right.  A final (or only) 4-pixel-wide strip is handled two rows per
 * iteration.
 *
 * In:    x0 = _dst          x1 = _dststride (bytes)
 *        x2 = _src          x3 = _srcstride (bytes)
 *        x4 = src2          w5 = height
 *        x6 = mx (unused)   x7 = my (unused)
 *        [sp] = width
 * Out:   none
 * Clobb: x0, x2, x4-x7, x10-x13, v0-v3, flags
 *
 * Requirements visible from the loop structure:
 *   - width is 4, or a multiple of 8, or a multiple of 8 plus 4; any other
 *     value never hits the exit conditions.
 *   - height > 0, and height is even whenever a 4-pixel strip is processed
 *     (that loop counts down by 2 and only stops on exactly 0).
 *
 * NOTE(review): function/endfunc are assumed to be the FFmpeg
 * libavutil/aarch64/asm.S macros -- confirm the include in the real file.
 */
function ff_hevc_put_qpel_bi_neon_8, export=1
        ldr     w6, [sp]                        // w6  = width (9th argument, on the stack)
        mov     x7, MAX_PB_DOUBLESIZE           // x7  = src2 row stride in bytes (my is not needed)
        mov     x10, x4                         // x10 = src2 at row 0 of the current strip
        mov     x11, x0                         // x11 = _dst at row 0 of the current strip
        mov     w12, w5                         // w12 = height; 32-bit move, the upper half of
                                                //       x5 is unspecified for an int argument
        mov     x13, x2                         // x13 = _src at row 0 of the current strip
        cmp     w6, #4
        b.eq    4f                              // block is exactly 4 pixels wide

        // ---- 8-pixel-wide strip, one row per iteration ----
8:      subs    w5, w5, #1                      // rows left; flags are consumed by b.ne below
                                                // (none of the NEON ops in between touch NZCV)
        ld1     {v0.1d}, [x2], x3               // 8 source pixels;        _src += _srcstride
        ld1     {v1.2d}, [x4], x7               // 8 int16_t from src2;    src2 += 128 bytes
        ushll   v0.8h, v0.8b, #6                // widen to 16 bit and scale by 64
        add     v0.8h, v0.8h, v1.8h             // add the intermediate prediction
        srshr   v0.8h, v0.8h, #7                // rounding shift right by 7
        sqxtun  v0.8b, v0.8h                    // saturate signed 16 bit -> unsigned 8 bit
        st1     {v0.1d}, [x0], x1               // write 8 pixels;         _dst += _dststride
        b.ne    8b                              // until all rows of this strip are done

        subs    w6, w6, #8                      // 8 columns consumed
        b.eq    99f                             // nothing left -> return
        add     x11, x11, #8                    // next strip: 8 bytes further in _dst,
        add     x10, x10, #16                   //   8 int16_t (16 bytes) further in src2,
        add     x13, x13, #8                    //   8 bytes further in _src
        mov     x4, x10                         // rewind the working pointers to row 0
        mov     x0, x11
        mov     w5, w12                         // and restore the row counter
        mov     x2, x13
        cmp     w6, #4
        b.ne    8b                              // more than 4 columns left -> another 8-wide strip

        // ---- 4-pixel-wide strip, two rows per iteration ----
4:      subs    w5, w5, #2                      // two rows per pass; flags consumed by b.ne below
        ld1     {v0.s}[0], [x2], x3             // row A: 4 source pixels into lane 0
        ld1     {v2.s}[0], [x2], x3             // row B: 4 source pixels into lane 0
        ld1     {v1.1d}, [x4], x7               // row A: 4 int16_t from src2
        ld1     {v3.1d}, [x4], x7               // row B: 4 int16_t from src2
        ushll   v0.8h, v0.8b, #6                // only the low 4 lanes carry valid data from here on
        ushll   v2.8h, v2.8b, #6
        add     v0.8h, v0.8h, v1.8h
        add     v2.8h, v2.8h, v3.8h
        srshr   v0.8h, v0.8h, #7
        srshr   v2.8h, v2.8h, #7
        sqxtun  v0.8b, v0.8h
        sqxtun  v2.8b, v2.8h
        st1     {v0.s}[0], [x0], x1             // store exactly 4 pixels: bytes 4..7 are derived
        st1     {v2.s}[0], [x0], x1             // from stale register contents and must not reach _dst
        b.ne    4b                              // until the row counter reaches 0
99:     ret
endfunc


fu



更多hevc在neon上解码的代码:
https://github.com/Kagami/ffmpeg-hevc-accel

你可能感兴趣的:(codec)