由于之前ffmpeg中hevc decoder不支持neon 64bits的优化,所以参与这部分工作。
大部分指令从这里查:
http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0802a/USHLL_advsimd_vector.html
另外还有两个文档:
ARMv8-A programming guide.pdf
ARMv8-A_Architecture_Reference_Manual_(Issue_A.a).pdf
- data type, 8B/16B/4H/8H/2S/4S/2D. B represents byte (8-bit). H represents half-word (16-bit). S represents word (32-bit). D represents a double-word (64-bit).
For example:
UADDLP V0.8H, V0.16B
FADD V0.4S, V0.4S, V0.4S
这里是一个mc(运动补偿)部分的代码,原始C函数是:put_hevc_pel_bi_pixels。汇编的做法是一次处理8个bytes宽的一列数据,从第一行一直处理到最后一行,然后回到第一行,水平偏移加8,处理下一列的8 bytes数据。最后一列有可能只剩4 bytes的数据,需要单独处理。
对应优化汇编以及注释解释(右边是类似C的解释):
x0 uint8_t *_dst, dst,
x1 ptrdiff_t _dststride, dststride,
x2 uint8_t *_src, src1,
x3 ptrdiff_t _srcstride, src1stride,
x4 int16_t *src2, lc->tmp,
x5 int height, block_h,
x6 intptr_t mx, mx1,
x7 intptr_t my, my1,
[sp] int width, block_w (第9个参数,寄存器x0-x7已用完,通过栈传递,用 ldr w6, [sp] 读取)
#define MAX_PB_SIZE #64
#define MAX_PB_DOUBLESIZE #128
// ---------------------------------------------------------------------------
// ff_hevc_put_qpel_bi_neon_8
//
// NEON version of the C function put_hevc_pel_bi_pixels (8-bit samples).
// For every pixel:
//     dst[x] = clip_to_u8(((src[x] << 6) + src2[x] + 64) >> 7)
//
// The block is walked column by column: an 8-pixel-wide column is processed
// from the first row to the last, then the pointers are rewound to the first
// row and moved 8 pixels to the right. A trailing 4-pixel-wide column is
// handled by a separate loop that processes two rows per iteration.
//
// In:   x0   = _dst        (uint8_t *)
//       x1   = _dststride  (ptrdiff_t, bytes)
//       x2   = _src        (uint8_t *)
//       x3   = _srcstride  (ptrdiff_t, bytes)
//       x4   = src2        (int16_t *, rows are MAX_PB_DOUBLESIZE bytes apart)
//       w5   = height      (int)
//       x6   = mx          (not read; the register is reused for width)
//       x7   = my          (not read; the register is reused for the src2 stride)
//       [sp] = width       (int)
// Clobb: x0, x2, x4-x7, x10-x13, v0-v3, NZCV
//
// Preconditions (not checked here):
//   - height > 0; height must be even whenever the 4-wide loop is reached
//   - width is 4, or reduces to 0 or 4 after repeatedly subtracting 8
//
// NOTE(review): the original listing showed only the bare symbol name on the
// first line; `function ..., export=1` is assumed from the matching `endfunc`
// (FFmpeg asm.S macros) - confirm against the real source file.
// ---------------------------------------------------------------------------
function ff_hevc_put_qpel_bi_neon_8, export=1
        ldr     w6, [sp]                        // w6 = width
        mov     x7, MAX_PB_DOUBLESIZE           // x7 = src2 row stride in bytes (128)
        mov     x10, x4                         // x10 = src2 at the top of the current column
        mov     x11, x0                         // x11 = dst  at the top of the current column
        mov     w12, w5                         // w12 = height, reloaded for every column
                                                // (32-bit: upper half of x5 is unspecified for an int)
        mov     x13, x2                         // x13 = src  at the top of the current column
        cmp     w6, #4
        b.eq    4f                              // width == 4: only the 4-wide column exists

        // ---- 8-pixel-wide column, one row per iteration --------------------
8:      subs    w5, w5, #1                      // one row fewer; flags are consumed by b.ne below
        ld1     {v0.1D}, [x2], x3               // 8 source pixels;        src  += srcstride
        ld1     {v1.2D}, [x4], x7               // 8 int16 from src2;      src2 += 128 bytes
        ushll   v0.8H, v0.8B, #6                // widen u8 -> 16 bit and scale: src << 6
        add     v0.8H, v0.8H, v1.8H             // + src2
        srshr   v0.8H, v0.8H, #7                // rounding shift: (x + 64) >> 7
        sqxtun  v0.8B, v0.8H                    // saturate signed 16 bit to unsigned 8 bit
        st1     {v0.1D}, [x0], x1               // store 8 pixels;         dst  += dststride
        b.ne    8b                              // loop while rows remain (NZCV still from subs)

        // ---- column finished: step 8 pixels to the right -------------------
        subs    w6, w6, #8                      // width -= 8
        b.eq    99f                             // nothing left
        add     x11, x11, #8                    // dst  += 8 pixels
        add     x10, x10, #16                   // src2 += 8 int16 (16 bytes)
        add     x13, x13, #8                    // src  += 8 pixels
        mov     x4, x10                         // rewind the working pointers to the first row
        mov     x0, x11
        mov     w5, w12                         // restore the row counter
        mov     x2, x13
        cmp     w6, #4
        b.ne    8b                              // remaining width != 4: another 8-wide column
                                                // remaining width == 4: fall through

        // ---- 4-pixel-wide column, two rows per iteration -------------------
4:      subs    w5, w5, #2                      // two rows fewer; flags are consumed by b.ne below
        ld1     {v0.S}[0], [x2], x3             // row 0: 4 source pixels into lane 0; src += srcstride
        ld1     {v2.S}[0], [x2], x3             // row 1: 4 source pixels into lane 0; src += srcstride
        ld1     {v1.1D}, [x4], x7               // row 0: 4 int16 from src2;           src2 += 128 bytes
        ld1     {v3.1D}, [x4], x7               // row 1: 4 int16 from src2;           src2 += 128 bytes
        ushll   v0.8H, v0.8B, #6                // src << 6 (only lanes 0-3 carry valid data)
        ushll   v2.8H, v2.8B, #6
        add     v0.8H, v0.8H, v1.8H             // + src2
        add     v2.8H, v2.8H, v3.8H
        srshr   v0.8H, v0.8H, #7                // rounding shift: (x + 64) >> 7
        srshr   v2.8H, v2.8H, #7
        sqxtun  v0.8B, v0.8H                    // saturate signed 16 bit to unsigned 8 bit
        sqxtun  v2.8B, v2.8H
        // Store exactly 4 bytes per row. A 64-bit store here would write the
        // 4 undefined upper result bytes past the right edge of the block.
        st1     {v0.S}[0], [x0], x1             // row 0: 4 pixels; dst += dststride
        st1     {v2.S}[0], [x0], x1             // row 1: 4 pixels; dst += dststride
        b.ne    4b                              // loop while rows remain (NZCV still from subs)

99:     ret
endfunc
fu
更多hevc在neon上解码的代码:
https://github.com/Kagami/ffmpeg-hevc-accel