
[FFmpeg-devel,v2,1/2] lavc/aarch64: add hevc qpel assembly

Message ID 20220203135151.90166-1-jdek@itanimul.li
State New
Series [FFmpeg-devel,v2,1/2] lavc/aarch64: add hevc qpel assembly

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

J. Dekker Feb. 3, 2022, 1:51 p.m. UTC
Thanks: Rafal Dabrowa <fatwildcat@gmail.com>
---
 libavcodec/aarch64/Makefile               |    1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   67 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2799 +++++++++++++++++++++
 3 files changed, 2867 insertions(+)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

 Had trouble testing on a Linux machine as well, but I have a workflow set
 up for that now, so it should be easier in the future. Passes FATE on
 both macOS and Linux.

Comments

Martin Storsjö Feb. 7, 2022, 10:11 p.m. UTC | #1
On Thu, 3 Feb 2022, J. Dekker wrote:

> Thanks: Rafal Dabrowa <fatwildcat@gmail.com>
> ---
> libavcodec/aarch64/Makefile               |    1 +
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   67 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2799 +++++++++++++++++++++
> 3 files changed, 2867 insertions(+)
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
> Had trouble testing on a Linux machine as well, but have a workflow
> setup for that now so should be easier in the future. Passes FATE on
> both macOS and Linux.

> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
> +        uint8_t *src, ptrdiff_t srcstride,
> +        int height, intptr_t mx, intptr_t my, int width));

Passing a whole parenthesized expression like this, via one macro 
parameter, feels quite unorthodox to me, but it does seem to work now with 
all compilers I have to test with, so I guess it's tolerable that way.

> +
> +#include "libavutil/aarch64/asm.S"
> +#define MAX_PB_SIZE 64
> +
> +.Lqpel_filters:
> +        .byte  0,  0,  0,  0,  0,  0, 0,  0

This assembles incorrectly with gas-preprocessor targeting MSVC armasm64.

Normally we enclose all such constants in const/endconst, which sets up 
the appropriate section and all that. But if put into the const data 
section, it's probably too far away for an 'adr' instruction, so then 
you'd need to use the movrel macro (expanding to 'adrp' + 'add').

A less elegant workaround for armasm/gas-preprocessor is to just add a 
'.text' above this.
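
E.g. something along these lines (an untested sketch, just to illustrate the
const/movrel pattern from asm.S):

const qpel_filters, align=3
        .byte  0,  0,  0,  0,  0,  0, 0,  0
        ...
endconst

and then in load_qpel_filterb/load_qpel_filterh:

        movrel          \xreg, qpel_filters
        add             \xreg, \xreg, \freg, lsl #3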

> +        .byte -1,  4,-10, 58, 17, -5, 1,  0
> +        .byte -1,  4,-11, 40, 40,-11, 4, -1
> +        .byte  0,  1, -5, 17, 58,-10, 4, -1
> +
> +.macro load_qpel_filterb freg, xreg
> +        adr             \xreg, .Lqpel_filters
> +        add             \xreg, \xreg, \freg, lsl #3
> +        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
> +        ld4r           {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]

Please follow the normal coding style (align the starting '{' just like 
other characters at the start of the operand column, don't leave it 
outside). This goes for the whole file.
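
I.e.:

        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
        ld4r            {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]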

> +        neg             v0.16b, v0.16b
> +        neg             v2.16b, v2.16b
> +        neg             v5.16b, v5.16b
> +        neg             v7.16b, v7.16b

Why these negations? Can't you just change the corresponding umlsl/umlal 
instructions to match?

Also, can't those umlsl/umlal use the elementwise form, e.g. v0.b[0], so 
you wouldn't need to waste 8 full registers on the coefficients? (If 
you've got enough registers so you don't need to clobber v8-v15, there's 
probably no benefit in squeezing things tighter though. But if there's 
code that could be made more efficient if you'd have more spare registers, 
that could help.)

> +.endm
> +
> +.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        umlsl           \dst\().8h, \src0\().8b, v0.8b

Could this first one be plain 'umull' (if you wouldn't negate the 
coefficient), avoiding an extra 'movi v28.8h, #0'?

> +        umlal           \dst\().8h, \src1\().8b, v1.8b
> +        umlsl           \dst\().8h, \src2\().8b, v2.8b
> +        umlal           \dst\().8h, \src3\().8b, v3.8b
> +        umlal           \dst\().8h, \src4\().8b, v4.8b
> +        umlsl           \dst\().8h, \src5\().8b, v5.8b
> +        umlal           \dst\().8h, \src6\().8b, v6.8b
> +        umlsl           \dst\().8h, \src7\().8b, v7.8b
> +.endm
> +
> +.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
> +        umlsl2          \dst\().8h, \src0\().16b, v0.16b
> +        umlal2          \dst\().8h, \src1\().16b, v1.16b
> +        umlsl2          \dst\().8h, \src2\().16b, v2.16b
> +        umlal2          \dst\().8h, \src3\().16b, v3.16b
> +        umlal2          \dst\().8h, \src4\().16b, v4.16b
> +        umlsl2          \dst\().8h, \src5\().16b, v5.16b
> +        umlal2          \dst\().8h, \src6\().16b, v6.16b
> +        umlsl2          \dst\().8h, \src7\().16b, v7.16b
> +.endm
> +
> +.macro load_qpel_filterh freg, xreg
> +        adr             \xreg, .Lqpel_filters
> +        add             \xreg, \xreg, \freg, lsl #3
> +        ld1            {v0.8b}, [\xreg]
> +        sxtl            v0.8h, v0.8b
> +.endm
> +
> +.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
> +        smlal           \dst\().4s, \src4\().4h, v0.h[4]
> +        smlal           \dst\().4s, \src5\().4h, v0.h[5]
> +        smlal           \dst\().4s, \src6\().4h, v0.h[6]
> +        smlal           \dst\().4s, \src7\().4h, v0.h[7]
> +.ifc \op, sshr
> +        sshr            \dst\().4s, \dst\().4s, \shift
> +.else
> +        \op             \dst\().4h, \dst\().4s, \shift
> +.endif
> +.endm
> +
> +.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
> +        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
> +        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
> +        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
> +        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
> +        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
> +        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
> +        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
> +        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
> +.ifc \op, sshr
> +        sshr            \dst\().4s, \dstt\().4s, \shift
> +.else
> +        \op             \dst\().8h, \dstt\().4s, \shift
> +.endif
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #8
> +        mov             x14, #(MAX_PB_SIZE * 2)
> +1:      ld1            {v16.8b}, [x1], #8
> +        ld1            {v17.s}[0], [x1], x2
> +.macro calc src0, src1, idx
> +        ushr            \src0\().2d, \src1\().2d, #8
> +        mov             \src0\().b[7], v17.b[\idx]
> +.endm
> +        calc            v18, v16, 0
> +        calc            v19, v18, 1
> +        calc            v20, v19, 2

This operation looks weird. Isn't this equivalent to "ext v18.8b, v16.8b, 
v17.8b, #1; ext v19.8b, v16.8b, v17.8b, #2" etc?

> +        ushr            v21.2d, v20.2d, #8
> +        ushr            v22.2d, v21.2d, #8
> +        ushr            v23.2d, v22.2d, #8
> +        ushr            v24.2d, v23.2d, #8

... and here, more 'ext'. Also, this whole sequence of 10 instructions 
above is completely serial, where every single instruction depends on the 
result of the previous one. That's pretty bad for pipelining.
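
I.e. something like (untested):

        ext             v18.8b, v16.8b, v17.8b, #1
        ext             v19.8b, v16.8b, v17.8b, #2
        ext             v20.8b, v16.8b, v17.8b, #3
        ...

where each ext only depends on the two registers that were loaded, so the
instructions can execute independently of each other.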

> +        movi            v28.8h, #0

This instruction could be avoided if the first instruction in the macro 
were a plain 'umull'.
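
I.e. roughly (an untested sketch; this assumes the coefficients end up
arranged so the first tap can be a plain multiply):

        umull           v28.8h, v16.8b, v0.8b   // replaces the movi + first mla
        umlal           v28.8h, v18.8b, v1.8b
        ...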

> +        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
> +        subs            w3, w3, #1
> +        st1            {v28.4h}, [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        mov             x14, #(MAX_PB_SIZE * 2 - 8)
> +1:      ld1            {v16.8b, v17.8b}, [x1], x2
> +        // same macro
> +        calc            v18, v16, 0
> +        calc            v19, v18, 1
> +        calc            v20, v19, 2
> +        calc            v21, v20, 3
> +        calc            v22, v21, 4
> +        ushr            v23.2d, v22.2d, #8
> +        ushr            v24.2d, v23.2d, #8
> +        movi            v28.8h, #0
> +        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
> +        st1            {v28.4h}, [x0], #8
> +        subs            w3, w3, #1
> +        st1            {v28.s}[2], [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
> +        sxtw x4, w4
> +        sxtw x7, w7

Indentation is off

> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        mov             x14, #(MAX_PB_SIZE * 2)
> +1:      ld1            {v16.8b, v17.8b}, [x1], x2
> +        // same macro
> +        calc            v18, v16, 0
> +        calc            v19, v18, 1
> +        calc            v20, v19, 2
> +        calc            v21, v20, 3
> +        calc            v22, v21, 4
> +        calc            v23, v22, 5
> +        calc            v24, v23, 6
> +.purgem calc
> +        movi            v28.8h, #0
> +        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
> +        subs            w3, w3, #1
> +        st1            {v28.8h}, [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #16
> +        mov             x14, #(MAX_PB_SIZE * 2 - 16)
> +1:      ld2            {v16.8b, v17.8b}, [x1], #16
> +        ld1            {v27.s}[0], [x1], x2
> +        ushr            v18.2d, v16.2d, #8
> +        ushr            v19.2d, v17.2d, #8
> +        mov             v18.b[7], v27.b[0]
> +        mov             v19.b[7], v27.b[1]

Please look into using 'ext' here too


> +        ushr            v20.2d, v18.2d, #8
> +        ushr            v21.2d, v19.2d, #8
> +        mov             v20.b[7], v27.b[2]
> +        mov             v21.b[7], v27.b[3]
> +        ushr            v22.2d, v20.2d, #8
> +        ushr            v23.2d, v21.2d, #8
> +        ushr            v24.2d, v22.2d, #8
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        zip1            v16.8h, v28.8h, v29.8h
> +        zip2            v17.8h, v28.8h, v29.8h

I'm not sure why this function deinterleaves things and reinterleaves 
them afterwards, but maybe it's necessary for what it does.

> +        st1            {v16.8h}, [x0], #16
> +        subs            w3, w3, #1
> +        st1            {v17.4h}, [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #16
> +        mov             x14, #(MAX_PB_SIZE * 2)
> +1:      ld2            {v16.8b, v17.8b}, [x1], #16
> +        ld1            {v27.8b}, [x1], x2
> +        ushr            v18.2d, v16.2d, #8
> +        ushr            v19.2d, v17.2d, #8
> +        mov             v18.b[7], v27.b[0]
> +        mov             v19.b[7], v27.b[1]
> +        ushr            v20.2d, v18.2d, #8
> +        ushr            v21.2d, v19.2d, #8
> +        mov             v20.b[7], v27.b[2]
> +        mov             v21.b[7], v27.b[3]
> +        ushr            v22.2d, v20.2d, #8
> +        ushr            v23.2d, v21.2d, #8
> +        mov             v22.b[7], v27.b[4]
> +        mov             v23.b[7], v27.b[5]
> +        ushr            v24.2d, v22.2d, #8
> +        mov             v24.b[7], v27.b[6]

Same thing about 'ext' for shifting (and in all other functions below)

> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        subs            w3, w3, #1
> +        st2            {v28.8h, v29.8h}, [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #24
> +        mov             x14, #(MAX_PB_SIZE * 2)
> +1:      ld3            {v16.8b, v17.8b, v18.8b}, [x1], #24
> +        ld1            {v27.8b}, [x1], x2
> +        ushr            v19.2d, v16.2d, #8
> +        ushr            v20.2d, v17.2d, #8
> +        ushr            v21.2d, v18.2d, #8
> +        mov             v19.b[7], v27.b[0]
> +        mov             v20.b[7], v27.b[1]
> +        mov             v21.b[7], v27.b[2]
> +        ushr            v22.2d, v19.2d, #8
> +        ushr            v23.2d, v20.2d, #8
> +        ushr            v24.2d, v21.2d, #8
> +        mov             v22.b[7], v27.b[3]
> +        mov             v23.b[7], v27.b[4]
> +        mov             v24.b[7], v27.b[5]
> +        ushr            v25.2d, v22.2d, #8
> +        mov             v25.b[7], v27.b[6]
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        movi            v30.8h, #0
> +        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
> +        subs            w3, w3, #1
> +        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #32
> +        mov             x14, #(MAX_PB_SIZE * 2)
> +1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        ld1            {v27.8b}, [x1], x2
> +        movi            v30.8h, #0
> +        movi            v31.8h, #0
> +        ushr            v20.2d, v16.2d, #8
> +        ushr            v21.2d, v17.2d, #8
> +        ushr            v22.2d, v18.2d, #8
> +        ushr            v23.2d, v19.2d, #8
> +        mov             v20.b[7], v27.b[0]
> +        mov             v21.b[7], v27.b[1]
> +        mov             v22.b[7], v27.b[2]
> +        mov             v23.b[7], v27.b[3]
> +        ushr            v24.2d, v20.2d, #8
> +        ushr            v25.2d, v21.2d, #8
> +        ushr            v26.2d, v22.2d, #8
> +        mov             v24.b[7], v27.b[4]
> +        mov             v25.b[7], v27.b[5]
> +        mov             v26.b[7], v27.b[6]
> +        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
> +        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
> +        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #48
> +        mov             x7, #24
> +        mov             x14, #80
> +1:      ld3            {v16.16b, v17.16b, v18.16b}, [x1], x7
> +        movi            v28.8h, #0
> +        ld1            {v26.8b}, [x1], x7
> +        movi            v29.8h, #0
> +        ld1            {v27.8b}, [x1], x2
> +        movi            v30.8h, #0
> +        ushr            v19.2d, v16.2d, #8
> +        ushr            v20.2d, v17.2d, #8
> +        ushr            v21.2d, v18.2d, #8
> +        mov             v19.b[7], v26.b[0]
> +        mov             v19.b[15], v27.b[0]
> +        mov             v20.b[7], v26.b[1]
> +        mov             v20.b[15], v27.b[1]
> +        mov             v21.b[7], v26.b[2]
> +        mov             v21.b[15], v27.b[2]
> +        ushr            v22.2d, v19.2d, #8
> +        ushr            v23.2d, v20.2d, #8
> +        ushr            v24.2d, v21.2d, #8
> +        mov             v22.b[7], v26.b[3]
> +        mov             v22.b[15], v27.b[3]
> +        mov             v23.b[7], v26.b[4]
> +        mov             v23.b[15], v27.b[4]
> +        mov             v24.b[7], v26.b[5]
> +        mov             v24.b[15], v27.b[5]
> +        ushr            v25.2d, v22.2d, #8
> +        mov             v25.b[7], v26.b[6]
> +        mov             v25.b[15], v27.b[6]
> +        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
> +        st3            {v28.8h, v29.8h, v30.8h}, [x0], #48
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        movi            v30.8h, #0
> +        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
> +        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
> +        load_qpel_filterb x4, x5
> +        sub             x1, x1, #3
> +        sub             x2, x2, #64
> +        mov             x7, #32
> +1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
> +        ld1            {v27.8b}, [x1], x7
> +        ld1            {v28.8b}, [x1], x2
> +        ushr            v20.2d, v16.2d, #8
> +        ushr            v21.2d, v17.2d, #8
> +        ushr            v22.2d, v18.2d, #8
> +        ushr            v23.2d, v19.2d, #8
> +        mov             v20.b[7], v27.b[0]
> +        mov             v21.b[7], v27.b[1]
> +        mov             v22.b[7], v27.b[2]
> +        mov             v23.b[7], v27.b[3]
> +        mov             v20.b[15], v28.b[0]
> +        mov             v21.b[15], v28.b[1]
> +        mov             v22.b[15], v28.b[2]
> +        mov             v23.b[15], v28.b[3]
> +        ushr            v24.2d, v20.2d, #8
> +        ushr            v25.2d, v21.2d, #8
> +        ushr            v26.2d, v22.2d, #8
> +        mov             v24.b[7], v27.b[4]
> +        mov             v25.b[7], v27.b[5]
> +        mov             v26.b[7], v27.b[6]
> +        mov             v24.b[15], v28.b[4]
> +        mov             v25.b[15], v28.b[5]
> +        mov             v26.b[15], v28.b[6]
> +.macro calc fn
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        movi            v30.8h, #0
> +        movi            v31.8h, #0
> +        \fn             v28, v16, v17, v18, v19, v20, v21, v22, v23
> +        \fn             v29, v17, v18, v19, v20, v21, v22, v23, v24
> +        \fn             v30, v18, v19, v20, v21, v22, v23, v24, v25
> +        \fn             v31, v19, v20, v21, v22, v23, v24, v25, v26
> +        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
> +.endm
> +        calc            calc_qpelb
> +        calc            calc_qpelb2
> +.purgem calc
> +        subs            w3, w3, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +.macro calc_all
> +        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
> +        b.eq            2f
> +        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
> +        b.eq            2f
> +        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
> +        b.eq            2f
> +        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
> +        b.eq            2f
> +        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
> +        b.eq            2f
> +        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
> +        b.eq            2f
> +        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
> +        b.eq            2f
> +        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
> +        b.hi            1b
> +.endm
> +
> +.macro calc_all2
> +        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
> +        b.eq            2f
> +        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
> +        b.eq            2f
> +        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
> +        b.eq            2f
> +        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
> +        b.eq            2f
> +        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
> +        b.eq            2f
> +        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
> +        b.eq            2f
> +        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
> +        b.eq            2f
> +        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
> +        b.hi            1b
> +.endm
> +
> +function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        sub             x1, x1, x2
> +        ld1            {v16.s}[0], [x1], x2
> +        ld1            {v17.s}[0], [x1], x2
> +        ld1            {v18.s}[0], [x1], x2
> +        ld1            {v19.s}[0], [x1], x2
> +        ld1            {v20.s}[0], [x1], x2
> +        ld1            {v21.s}[0], [x1], x2
> +        ld1            {v22.s}[0], [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().s}[0], [x1], x2
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        st1            {v24.4h}, [x0], x9
> +        subs            w3, w3, #1
> +        b.eq            2f
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc

The calc_all macro expands to _a lot_ of code. It would be good if you'd 
just expand it once (or at most twice) and have all other cases jump into 
that instantiation of it. To be clear, this object file right now adds 41 
KB of executable code. (The corresponding vp9mc_neon.o contains 10 KB of 
executable code, and vp9mc_16bpp_neon.o another 9 KB.)

But it seems like each of them expands into different code. In that case, 
I'd suggest unrolling less. Instead of unrolling, just do a series of "mov 
v16, v17; mov v17, v18" etc, to shift the registers and skip the unrolling. 
It costs a little extra, but avoids wasting instruction cache.
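
For the v4 case, that could look roughly like this (untested sketch):

1:      ld1             {v23.s}[0], [x1], x2
        movi            v24.8h, #0
        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
        st1             {v24.4h}, [x0], x9
        mov             v16.8b, v17.8b
        mov             v17.8b, v18.8b
        mov             v18.8b, v19.8b
        mov             v19.8b, v20.8b
        mov             v20.8b, v21.8b
        mov             v21.8b, v22.8b
        mov             v22.8b, v23.8b
        subs            w3, w3, #1
        b.ne            1b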


> +
> +function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2 - 8)
> +        sub             x1, x1, x2
> +        ld1            {v16.8b}, [x1], x2
> +        ld1            {v17.8b}, [x1], x2
> +        ld1            {v18.8b}, [x1], x2
> +        ld1            {v19.8b}, [x1], x2
> +        ld1            {v20.8b}, [x1], x2
> +        ld1            {v21.8b}, [x1], x2
> +        ld1            {v22.8b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8b}, [x1], x2
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        st1            {v24.4h}, [x0], #8
> +        st1            {v24.s}[2], [x0], x9
> +        subs            w3, w3, #1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        sub             x1, x1, x2
> +        ld1            {v16.8b}, [x1], x2
> +        ld1            {v17.8b}, [x1], x2
> +        ld1            {v18.8b}, [x1], x2
> +        ld1            {v19.8b}, [x1], x2
> +        ld1            {v20.8b}, [x1], x2
> +        ld1            {v21.8b}, [x1], x2
> +        ld1            {v22.8b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8b}, [x1], x2
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        st1            {v24.8h}, [x0], x9
> +        subs            w3, w3, #1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2 - 16)
> +        sub             x1, x1, x2
> +        ld1            {v16.16b}, [x1], x2
> +        ld1            {v17.16b}, [x1], x2
> +        ld1            {v18.16b}, [x1], x2
> +        ld1            {v19.16b}, [x1], x2
> +        ld1            {v20.16b}, [x1], x2
> +        ld1            {v21.16b}, [x1], x2
> +        ld1            {v22.16b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().16b}, [x1], x2
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        st1            {v24.8h}, [x0], #16
> +        subs            w3, w3, #1
> +        st1            {v25.4h}, [x0], x9
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        sub             x1, x1, x2
> +        ld1            {v16.16b}, [x1], x2
> +        ld1            {v17.16b}, [x1], x2
> +        ld1            {v18.16b}, [x1], x2
> +        ld1            {v19.16b}, [x1], x2
> +        ld1            {v20.16b}, [x1], x2
> +        ld1            {v21.16b}, [x1], x2
> +        ld1            {v22.16b}, [x1], x2
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().16b}, [x1], x2
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        subs            w3, w3, #1
> +        st1            {v24.8h, v25.8h}, [x0], x9
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +// todo: reads #32 bytes
> +function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
> +        sub             sp, sp, #48
> +        st1            {v8.16b, v9.16b, v10.16b}, [sp]
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        sub             x1, x1, x2
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        ld1            {v16.16b, v17.16b}, [x1], x2
> +        ld1            {v18.16b, v19.16b}, [x1], x2
> +        ld1            {v20.16b, v21.16b}, [x1], x2
> +        ld1            {v22.16b, v23.16b}, [x1], x2
> +        ld1            {v24.16b, v25.16b}, [x1], x2
> +        ld1            {v26.16b, v27.16b}, [x1], x2
> +        ld1            {v28.16b, v29.16b}, [x1], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().16b, \tmp1\().16b}, [x1], x2
> +        movi            v8.8h, #0
> +        movi            v9.8h, #0
> +        movi            v10.8h, #0
> +        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> +        subs            w3, w3, #1
> +        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      ld1            {v8.16b, v9.16b, v10.16b}, [sp], #48
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
> +        sub             sp, sp, #64
> +        st1            {v8.16b-v11.16b}, [sp]
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        sub             x1, x1, x2
> +        ld1            {v16.16b, v17.16b}, [x1], x2
> +        ld1            {v18.16b, v19.16b}, [x1], x2
> +        ld1            {v20.16b, v21.16b}, [x1], x2
> +        ld1            {v22.16b, v23.16b}, [x1], x2
> +        ld1            {v24.16b, v25.16b}, [x1], x2
> +        ld1            {v26.16b, v27.16b}, [x1], x2
> +        ld1            {v28.16b, v29.16b}, [x1], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().16b, \tmp1\().16b}, [x1], x2
> +        movi            v8.8h, #0
> +        movi            v9.8h, #0
> +        movi            v10.8h, #0
> +        movi            v11.8h, #0
> +        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> +        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> +        subs            w3, w3, #1
> +        st1            {v8.8h-v11.8h}, [x0], x9
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      ld1            {v8.16b-v11.16b}, [sp], #64
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
> +        stp             x5, x30, [sp, #-16]!
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x2, x3, [sp, #-16]!
> +        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
> +        ldp             x2, x3, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        ldr             x5, [sp]
> +        add             x0, x0, #48
> +        add             x1, x1, #24
> +        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
> +        ldp             xzr, x30, [sp], #16
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
> +        sub             sp, sp, #64
> +        st1            {v8.16b-v11.16b}, [sp]
> +        load_qpel_filterb x5, x4
> +        sub             x1, x1, x2, lsl #1
> +        sub             x1, x1, x2
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +0:      mov             x8, x1          // src
> +        ld1            {v16.16b, v17.16b}, [x8], x2
> +        mov             w11, w3         // height
> +        ld1            {v18.16b, v19.16b}, [x8], x2
> +        mov             x10, x0         // dst
> +        ld1            {v20.16b, v21.16b}, [x8], x2
> +        ld1            {v22.16b, v23.16b}, [x8], x2
> +        ld1            {v24.16b, v25.16b}, [x8], x2
> +        ld1            {v26.16b, v27.16b}, [x8], x2
> +        ld1            {v28.16b, v29.16b}, [x8], x2
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().16b, \tmp1\().16b}, [x8], x2
> +        movi            v8.8h, #0
> +        movi            v9.8h, #0
> +        movi            v10.8h, #0
> +        movi            v11.8h, #0
> +        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
> +        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> +        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
> +        subs            x11, x11, #1
> +        st1            {v8.8h-v11.8h}, [x10], x9
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      add             x0, x0, #64
> +        add             x1, x1, #32
> +        subs            w6, w6, #32
> +        b.hi            0b
> +        ld1            {v8.16b-v11.16b}, [sp], #64
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
> +        add             w10, w3, #7
> +        mov             x7, #128
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        sub             x1, x1, x2, lsl #1
> +        add             x3, x3, #7
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        ld1            {v16.4h}, [sp], x7
> +        ld1            {v17.4h}, [sp], x7
> +        ld1            {v18.4h}, [sp], x7
> +        ld1            {v19.4h}, [sp], x7
> +        ld1            {v20.4h}, [sp], x7
> +        ld1            {v21.4h}, [sp], x7
> +        ld1            {v22.4h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().4h}, [sp], x7
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> +        subs            w3, w3, #1
> +        st1            {v1.4h}, [x0], x7
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
> +        add             w10, w3, #7
> +        mov             x7, #128
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        sub             x1, x1, x2, lsl #1
> +        add             x3, x3, #7
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x8, #120
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        ld1            {v16.8h}, [sp], x7
> +        ld1            {v17.8h}, [sp], x7
> +        ld1            {v18.8h}, [sp], x7
> +        ld1            {v19.8h}, [sp], x7
> +        ld1            {v20.8h}, [sp], x7
> +        ld1            {v21.8h}, [sp], x7
> +        ld1            {v22.8h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8h}, [sp], x7
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> +        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> +        st1            {v1.4h}, [x0], #8
> +        subs            w3, w3, #1
> +        st1            {v1.s}[2], [x0], x8
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
> +        add             w10, w3, #7
> +        lsl             x10, x10, #7
> +        sub             x1, x1, x2, lsl #1
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        add             x3, x3, #7
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x7, #128
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        ld1            {v16.8h}, [sp], x7
> +        ld1            {v17.8h}, [sp], x7
> +        ld1            {v18.8h}, [sp], x7
> +        ld1            {v19.8h}, [sp], x7
> +        ld1            {v20.8h}, [sp], x7
> +        ld1            {v21.8h}, [sp], x7
> +        ld1            {v22.8h}, [sp], x7
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8h}, [sp], x7
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
> +        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h}, [x0], x7
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
> +        add             w10, w3, #7
> +        lsl             x10, x10, #7
> +        sub             x1, x1, x2, lsl #1
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        add             x3, x3, #7
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x7, #128
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        mov             x8, #112
> +        ld1            {v16.8h, v17.8h}, [sp], x7
> +        ld1            {v18.8h, v19.8h}, [sp], x7
> +        ld1            {v20.8h, v21.8h}, [sp], x7
> +        ld1            {v22.8h, v23.8h}, [sp], x7
> +        ld1            {v24.8h, v25.8h}, [sp], x7
> +        ld1            {v26.8h, v27.8h}, [sp], x7
> +        ld1            {v28.8h, v29.8h}, [sp], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x7
> +        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
> +        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
> +        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> +        st1            {v1.8h}, [x0], #16
> +        subs            w3, w3, #1
> +        st1            {v2.4h}, [x0], x8
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
> +        add             w10, w3, #7
> +        lsl             x10, x10, #7
> +        sub             x1, x1, x2, lsl #1
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x3, x3, #7
> +        add             x0, sp, #32
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x7, #128
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        ld1            {v16.8h, v17.8h}, [sp], x7
> +        ld1            {v18.8h, v19.8h}, [sp], x7
> +        ld1            {v20.8h, v21.8h}, [sp], x7
> +        ld1            {v22.8h, v23.8h}, [sp], x7
> +        ld1            {v24.8h, v25.8h}, [sp], x7
> +        ld1            {v26.8h, v27.8h}, [sp], x7
> +        ld1            {v28.8h, v29.8h}, [sp], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x7
> +        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
> +        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
> +        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> +        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h, v2.8h}, [x0], x7
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
> +        sub             sp, sp, #64
> +        st1            {v8.16b-v11.16b}, [sp]
> +        sub             x1, x1, x2, lsl #1
> +        sub             sp, sp, #64
> +        add             w10, w3, #7
> +        st1            {v12.16b-v15.16b}, [sp]
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        add             x3, x3, #7
> +        sub             x1, x1, x2
> +        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x7, #128
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +        ld1            {v8.8h-v10.8h}, [sp], x7
> +        ld1            {v11.8h-v13.8h}, [sp], x7
> +        ld1            {v14.8h-v16.8h}, [sp], x7
> +        ld1            {v17.8h-v19.8h}, [sp], x7
> +        ld1            {v20.8h-v22.8h}, [sp], x7
> +        ld1            {v23.8h-v25.8h}, [sp], x7
> +        ld1            {v26.8h-v28.8h}, [sp], x7
> +1:      ld1            {v29.8h-v31.8h}, [sp], x7
> +        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
> +        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
> +        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
> +        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
> +        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
> +        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2

For code like this, please try to align register columns vertically, if 
possible...

Also, please unroll less here.

> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v8.8h-v10.8h}, [sp], x7
> +        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
> +        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
> +        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
> +        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
> +        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
> +        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v11.8h-v13.8h}, [sp], x7
> +        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
> +        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
> +        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
> +        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
> +        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
> +        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v14.8h-v16.8h}, [sp], x7
> +        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
> +        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
> +        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
> +        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
> +        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
> +        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v17.8h-v19.8h}, [sp], x7
> +        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
> +        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
> +        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
> +        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
> +        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
> +        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v20.8h-v22.8h}, [sp], x7
> +        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
> +        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
> +        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
> +        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
> +        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
> +        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v23.8h-v25.8h}, [sp], x7
> +        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
> +        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
> +        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
> +        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
> +        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
> +        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.eq            2f
> +
> +        ld1            {v26.8h-v28.8h}, [sp], x7
> +        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
> +        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
> +        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
> +        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
> +        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
> +        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
> +        subs            w3, w3, #1
> +        st1            {v1.8h-v3.8h}, [x0], x7
> +        b.hi            1b
> +2:      ld1            {v12.16b-v15.16b}, [sp], #64
> +        ld1            {v8.16b-v11.16b}, [sp], #64
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
> +        add             w10, w3, #7
> +        sub             x1, x1, x2, lsl #1
> +        lsl             x10, x10, #7
> +        sub             x1, x1, x2
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x3, [sp, #-16]!
> +        add             x3, x3, #7
> +        stp             x5, x30, [sp, #-16]!
> +        add             x0, sp, #32
> +        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
> +        ldp             x5, x30, [sp], #16
> +        mov             x7, #128
> +        ldp             x0, x3, [sp], #16
> +        load_qpel_filterh x5, x4
> +0:      mov             x8, sp          // src
> +        ld1            {v16.8h, v17.8h}, [x8], x7
> +        mov             w9, w3          // height
> +        ld1            {v18.8h, v19.8h}, [x8], x7
> +        mov             x5, x0          // dst
> +        ld1            {v20.8h, v21.8h}, [x8], x7
> +        ld1            {v22.8h, v23.8h}, [x8], x7
> +        ld1            {v24.8h, v25.8h}, [x8], x7
> +        ld1            {v26.8h, v27.8h}, [x8], x7
> +        ld1            {v28.8h, v29.8h}, [x8], x7
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x7
> +        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
> +        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
> +        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
> +        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
> +        subs            x9, x9, #1
> +        st1            {v1.8h, v2.8h}, [x5], x7
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      add             x0, x0, #32
> +        add             sp, sp, #32
> +        subs            w6, w6, #16
> +        b.hi            0b
> +        add             w10, w3, #6
> +        add             sp, sp, #64          // discard rest of first line
> +        lsl             x10, x10, #7
> +        add             sp, sp, x10         // tmp_array without first line
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
> +        stp             xzr, x30, [sp, #-16]!
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x2, x3, [sp, #-16]!
> +        stp             x4, x5, [sp, #-16]!

Storing more than one register like this is usually done with a series of 
instructions like this:

     stp x0, x1, [sp, #-64]!
     stp x2, x3, [sp, #16]
     stp x4, x5, [sp, #32]
     ...

That way, sp only has to be updated once.

Also you don't need to store xzr. If you don't have a pair of registers 
to store, back it up with 'str' instead of 'stp'. (But the stack must be 
kept 16 byte aligned.)

Same thing in reverse when restoring registers:

     ldp x4, x5, [sp, #32]
     ldp x2, x3, [sp, #16]
     ldp x0, x1, [sp], #64
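
E.g. for a single register, keeping the 16 byte stack alignment:

     str x30, [sp, #-16]!
     ...
     ldr x30, [sp], #16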

> +        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
> +        ldp             x4, x5, [sp], #16
> +        ldp             x2, x3, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        add             x1, x1, #24
> +        add             x0, x0, #48
> +        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
> +        ldp             xzr, x30, [sp], #16
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
> +        stp             xzr, x30, [sp, #-16]!
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x2, x3, [sp, #-16]!
> +        stp             x4, x5, [sp, #-16]!
> +        mov             x6, #32
> +        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
> +        ldp             x4, x5, [sp], #16
> +        ldp             x2, x3, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        add             x1, x1, #32
> +        add             x0, x0, #64
> +        mov             x6, #32
> +        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
> +        ldp             xzr, x30, [sp], #16
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +.macro calc op, src
> +        \op             v20.8h, v16.8b, v\src\().8b
> +        ushr            v16.2d, v16.2d, #8
> +        mov             v16.b[7], v17.b[\src]
> +.endm
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6

Align the macro parameters as instruction operands, like other instances 
of 'calc' above.
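
I.e.:

        calc            umlsl, 0
        calc            umlal, 1
        ...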

> +// no purgem
> +        umlsl           v20.8h, v16.8b, v7.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        subs            w4, w4, #1
> +        st1            {v20.s}[0], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +        sub             x1, x1, #4
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +// same macro as above
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6
> +        umlsl           v20.8h, v16.8b, v7.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        st1            {v20.s}[0], [x0], #4
> +        subs            w4, w4, #1
> +        st1            {v20.h}[2], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +// same macro as above
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6
> +.purgem calc
> +        umlsl           v20.8h, v16.8b, v7.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        subs            w4, w4, #1
> +        st1            {v20.8b}, [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +        sub             x1, x1, #8
> +1:      ld2            {v16.8b, v17.8b}, [x2]
> +        movi            v20.8h, #0
> +        ldr             w12, [x2, #16]
> +        movi            v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> +        \op1            \r0\().8h, \r1\().8b, \src0\().8b
> +        \op2            \r0\().8h, \r2\().8b, \src1\().8b
> +.if \tail-1
> +        ushr            \r1\().2d, \r1\().2d, #8
> +.endif
> +.endm
> +        calc umlsl, umlal, v20, v16, v17, v0, v1
> +        mov             v16.b[7], w12
> +        lsr             x12, x12, #8
> +        calc umlsl, umlal, v21, v17, v16, v0, v1
> +        mov             v17.b[7], w12
> +        lsr             x12, x12, #8
> +        calc umlsl, umlal, v20, v16, v17, v2, v3

Same here about aligning macro parameters. Also, if we didn't switch the 
sign of some of the coefficients, this could all use umlal consistently?

> +        mov             v16.b[7], w12
> +        calc umlsl, umlal, v21, v17, v16, v2, v3
> +        calc umlal, umlsl, v20, v16, v17, v4, v5
> +        calc umlal, umlsl, v21, v17, v16, v4, v5
> +        calc umlal, umlsl, v20, v16, v17, v6, v7
> +        calc umlal, umlsl, v21, v17, v16, v6, v7, 1
> +.purgem calc
> +        zip1            v16.8h, v20.8h, v21.8h
> +        zip2            v17.8h, v20.8h, v21.8h
> +        sqrshrun        v20.8b, v16.8h, #6
> +        sqrshrun2       v20.16b, v17.8h, #6
> +        st1            {v20.8b}, [x0], #8
> +        add             x2, x2, x3
> +        st1            {v20.s}[2], [x0], x1
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld2            {v16.8b, v17.8b}, [x2]
> +        ldr             x12, [x2, #16]
> +        movi            v20.8h, #0
> +        movi            v21.8h, #0
> +.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
> +        \op1            \dst\().8h, \r0\().8b, \src0\().8b
> +        \op2            \dst\().8h, \r1\().8b, \src1\().8b
> +        ushr            \r0\().2d, \r0\().2d, #8
> +        mov             \r0\().b[7], w12
> +.if \tail-1
> +        lsr             x12, x12, #8
> +.endif
> +.endm
> +        calc umlsl, umlal, v20, v16, v17, v0, v1
> +        calc umlsl, umlal, v21, v17, v16, v0, v1
> +        calc umlsl, umlal, v20, v16, v17, v2, v3
> +        calc umlsl, umlal, v21, v17, v16, v2, v3
> +        calc umlal, umlsl, v20, v16, v17, v4, v5
> +        calc umlal, umlsl, v21, v17, v16, v4, v5
> +        calc umlal, umlsl, v20, v16, v17, v6, v7, 1
> +.purgem calc
> +        umlal           v21.8h, v17.8b, v6.8b
> +        umlsl           v21.8h, v16.8b, v7.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        sqrshrun        v21.8b, v21.8h, #6
> +        st2            {v20.8b, v21.8b}, [x0], x1
> +        add             x2, x2, x3
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld3            {v16.8b-v18.8b}, [x2]
> +        ldr             x12, [x2, #24]
> +        movi            v20.8h, #0
> +        movi            v21.8h, #0
> +        movi            v22.8h, #0
> +.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
> +        \op1            \dst\().8h, \r0\().8b, \src0\().8b
> +        \op2            \dst\().8h, \r1\().8b, \src1\().8b
> +        umlsl           \dst\().8h, \r2\().8b, \src2\().8b
> +        ushr            \r0\().2d, \r0\().2d, #8
> +        mov             \r0\().b[7], w12
> +        lsr             x12, x12, #8
> +.endm
> +        calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
> +        calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
> +        calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
> +        calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
> +        calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
> +        calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
> +.purgem calc
> +        umlal           v20.8h, v16.8b, v6.8b
> +        umlsl           v20.8h, v17.8b, v7.8b
> +        ushr            v16.2d, v16.2d, #8
> +        mov             v16.b[7], w12
> +        umlal           v21.8h, v17.8b, v6.8b
> +        umlsl           v21.8h, v18.8b, v7.8b
> +        umlal           v22.8h, v18.8b, v6.8b
> +        umlsl           v22.8h, v16.8b, v7.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        sqrshrun        v22.8b, v22.8h, #6
> +        sqrshrun        v21.8b, v21.8h, #6
> +        st3            {v20.8b-v22.8b}, [x0], x1
> +        add             x2, x2, x3
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld4            {v16.8b-v19.8b}, [x2]
> +        ldr             x12, [x2, #32]
> +        movi            v20.8h, #0
> +        movi            v21.8h, #0
> +        movi            v22.8h, #0
> +        movi            v23.8h, #0
> +.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
> +        \op1            \dst\().8h, \r0\().8b, \src0\().8b
> +        \op2            \dst\().8h, \r1\().8b, \src1\().8b
> +        \op1            \dst\().8h, \r2\().8b, \src2\().8b
> +        \op2            \dst\().8h, \r3\().8b, \src3\().8b
> +        ushr            \r0\().2d, \r0\().2d, #8
> +        mov             \r0\().b[7], w12
> +.if \tail-1
> +        lsr             x12, x12, #8
> +.endif
> +.endm
> +        calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
> +        calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
> +        calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
> +        calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
> +        calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
> +        calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
> +        calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
> +.purgem calc
> +        umlal           v23.8h, v19.8b, v4.8b
> +        sqrshrun        v20.8b, v20.8h, #6
> +        umlsl           v23.8h, v16.8b, v5.8b
> +        sqrshrun        v21.8b, v21.8h, #6
> +        umlal           v23.8h, v17.8b, v6.8b
> +        sqrshrun        v22.8b, v22.8h, #6
> +        umlsl           v23.8h, v18.8b, v7.8b
> +        sqrshrun        v23.8b, v23.8h, #6
> +        st4            {v20.8b-v23.8b}, [x0], x1
> +        add             x2, x2, x3
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld3            {v16.16b-v18.16b}, [x2]
> +        movi            v20.8h, #0
> +        movi            v21.8h, #0
> +        movi            v22.8h, #0
> +        ldr             x12, [x2, #24]
> +        movi            v23.8h, #0
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        ldr             x13, [x2, #48]
> +.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
> +        \op1            \dst0\().8h, \r0\().8b,  \src0\().8b
> +        \op2            \dst0\().8h, \r1\().8b,  \src1\().8b
> +        umlsl           \dst0\().8h, \r2\().8b,  \src2\().8b
> +        \op1\()2        \dst1\().8h, \r0\().16b, \src0\().16b
> +        \op2\()2        \dst1\().8h, \r1\().16b, \src1\().16b
> +        umlsl2          \dst1\().8h, \r2\().16b, \src2\().16b
> +        ushr            \r0\().2d, \r0\().2d, #8
> +        mov             \r0\().b[7], w12
> +        mov             \r0\().b[15], w13
> +        lsr             x12, x12, #8
> +        lsr             x13, x13, #8
> +.endm
> +        calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
> +        calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
> +        calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
> +        calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
> +        calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
> +        calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
> +.purgem calc
> +.macro calc r0, r1, r2, r3
> +        umlal           \r0\().8h, \r2\().8b, v6.8b
> +        umlsl           \r0\().8h, \r3\().8b, v7.8b
> +        umlal2          \r1\().8h, \r2\().16b, v6.16b
> +        umlsl2          \r1\().8h, \r3\().16b, v7.16b
> +.endm
> +        calc            v20, v23, v16, v17
> +        ushr            v16.2d, v16.2d, #8
> +        mov             v16.b[7], w12
> +        mov             v16.b[15], w13
> +        calc            v21, v24, v17, v18
> +        calc            v22, v25, v18, v16
> +.purgem calc
> +        sqrshrun        v20.8b, v20.8h, #6
> +        sqrshrun        v21.8b, v21.8h, #6
> +        sqrshrun        v22.8b, v22.8h, #6
> +        sqrshrun2       v20.16b, v23.8h, #6
> +        sqrshrun2       v21.16b, v24.8h, #6
> +        sqrshrun2       v22.16b, v25.8h, #6
> +        st3            {v20.16b-v22.16b}, [x0], x1
> +        add             x2, x2, x3
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
> +        load_qpel_filterb x5, x6
> +        sub             x2, x2, #3
> +1:      ld4            {v16.16b-v19.16b}, [x2]
> +        ldr             x12, [x2, #32]
> +        ldr             x13, [x2, #64]
> +        movi            v20.8h, #0
> +        movi            v21.8h, #0
> +        movi            v22.8h, #0
> +        movi            v23.8h, #0
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        movi            v26.8h, #0
> +        movi            v27.8h, #0
> +.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
> +        \op1            \dst0\().8h, \r0\().8b,  \src0\().8b
> +        \op2            \dst0\().8h, \r1\().8b,  \src1\().8b
> +        \op1            \dst0\().8h, \r2\().8b,  \src2\().8b
> +        \op2            \dst0\().8h, \r3\().8b,  \src3\().8b
> +        \op1\()2        \dst1\().8h, \r0\().16b,  \src0\().16b
> +        \op2\()2        \dst1\().8h, \r1\().16b,  \src1\().16b
> +        \op1\()2        \dst1\().8h, \r2\().16b,  \src2\().16b
> +        \op2\()2        \dst1\().8h, \r3\().16b,  \src3\().16b
> +.if \tail-1
> +        ushr            \r0\().2d, \r0\().2d, #8
> +        mov             \r0\().b[7], w12
> +        mov             \r0\().b[15], w13
> +        lsr             x12, x12, #8
> +        lsr             x13, x13, #8
> +.endif
> +.endm
> +        calc            umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
> +        calc            umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
> +        calc            umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
> +        calc            umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
> +        calc            umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
> +        calc            umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
> +        calc            umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
> +        calc            umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
> +.purgem calc
> +        sqrshrun        v20.8b, v20.8h, #6
> +        sqrshrun        v21.8b, v21.8h, #6
> +        sqrshrun        v22.8b, v22.8h, #6
> +        sqrshrun        v23.8b, v23.8h, #6
> +        sqrshrun2       v20.16b, v24.8h, #6
> +        sqrshrun2       v21.16b, v25.8h, #6
> +        sqrshrun2       v22.16b, v26.8h, #6
> +        sqrshrun2       v23.16b, v27.8h, #6
> +        st4            {v20.16b-v23.16b}, [x0], x1
> +        add             x2, x2, x3
> +        subs            w4, w4, #1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
> +        load_qpel_filterb x6, x5
> +        sub             x2, x2, x3, lsl #1
> +        sub             x2, x2, x3
> +        ld1            {v16.s}[0], [x2], x3
> +        ld1            {v17.s}[0], [x2], x3
> +        ld1            {v18.s}[0], [x2], x3
> +        ld1            {v19.s}[0], [x2], x3
> +        ld1            {v20.s}[0], [x2], x3
> +        ld1            {v21.s}[0], [x2], x3
> +        ld1            {v22.s}[0], [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().s}[0], [x2], x3
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        sqrshrun        v24.8b, v24.8h, #6
> +        subs            w4, w4, #1
> +        st1            {v24.s}[0], [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
> +        load_qpel_filterb x6, x5
> +        sub             x2, x2, x3, lsl #1
> +        sub             x1, x1, #4
> +        sub             x2, x2, x3
> +        ld1            {v16.8b}, [x2], x3
> +        ld1            {v17.8b}, [x2], x3
> +        ld1            {v18.8b}, [x2], x3
> +        ld1            {v19.8b}, [x2], x3
> +        ld1            {v20.8b}, [x2], x3
> +        ld1            {v21.8b}, [x2], x3
> +        ld1            {v22.8b}, [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8b}, [x2], x3
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        sqrshrun        v24.8b, v24.8h, #6
> +        st1            {v24.s}[0], [x0], #4
> +        subs            w4, w4, #1
> +        st1            {v24.h}[2], [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
> +        load_qpel_filterb x6, x5
> +        sub             x2, x2, x3, lsl #1
> +        sub             x2, x2, x3
> +        ld1            {v16.8b}, [x2], x3
> +        ld1            {v17.8b}, [x2], x3
> +        ld1            {v18.8b}, [x2], x3
> +        ld1            {v19.8b}, [x2], x3
> +        ld1            {v20.8b}, [x2], x3
> +        ld1            {v21.8b}, [x2], x3
> +        ld1            {v22.8b}, [x2], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8b}, [x2], x3
> +        movi            v24.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        sqrshrun        v24.8b, v24.8h, #6
> +        subs            w4, w4, #1
> +        st1            {v24.8b}, [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2: ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
> +        load_qpel_filterb x6, x5
> +        sub             x2, x2, x3, lsl #1
> +        sub             x1, x1, #8
> +        sub             x2, x2, x3
> +0:      mov             x8, x2          // src
> +        ld1            {v16.16b}, [x8], x3
> +        mov             w11, w4         // height
> +        ld1            {v17.16b}, [x8], x3
> +        mov             x10, x0         // dst
> +        ld1            {v18.16b}, [x8], x3
> +        ld1            {v19.16b}, [x8], x3
> +        ld1            {v20.16b}, [x8], x3
> +        ld1            {v21.16b}, [x8], x3
> +        ld1            {v22.16b}, [x8], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().16b}, [x8], x3
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        sqrshrun        v24.8b, v24.8h, #6
> +        sqrshrun2       v24.16b, v25.8h, #6
> +        st1            {v24.8b}, [x10], #8
> +        subs            x11, x11, #1
> +        st1            {v24.s}[2], [x10], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      add             x0, x0, #12
> +        add             x2, x2, #12
> +        subs            w7, w7, #12
> +        b.ne            0b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
> +        load_qpel_filterb x6, x5
> +        sub             x2, x2, x3, lsl #1
> +        sub             x2, x2, x3
> +0:      mov             x8, x2          // src
> +        ld1            {v16.16b}, [x8], x3
> +        mov             w11, w4         // height
> +        ld1            {v17.16b}, [x8], x3
> +        mov             x10, x0         // dst
> +        ld1            {v18.16b}, [x8], x3
> +        ld1            {v19.16b}, [x8], x3
> +        ld1            {v20.16b}, [x8], x3
> +        ld1            {v21.16b}, [x8], x3
> +        ld1            {v22.16b}, [x8], x3
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().16b}, [x8], x3
> +        movi            v24.8h, #0
> +        movi            v25.8h, #0
> +        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
> +        sqrshrun        v24.8b, v24.8h, #6
> +        sqrshrun2       v24.16b, v25.8h, #6
> +        subs            x11, x11, #1
> +        st1            {v24.16b}, [x10], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      add             x0, x0, #16
> +        add             x2, x2, #16
> +        subs            w7, w7, #16
> +        b.ne            0b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
> +        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)

Align the operands at the right column

> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
> +        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
> +        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
> +        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        stp             x30, xzr, [sp, #-16]!
> +        sub             x1, x1, x3
> +        add             x0, sp, #48
> +        mov             x2, x3
> +        add             x3, x4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
> +        ldp             x30, xzr, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        ldp             x0, x1, [sp], #16
> +        load_qpel_filterh x6, x5
> +        ld1            {v16.4h}, [sp], x9
> +        ld1            {v17.4h}, [sp], x9
> +        ld1            {v18.4h}, [sp], x9
> +        ld1            {v19.4h}, [sp], x9
> +        ld1            {v20.4h}, [sp], x9
> +        ld1            {v21.4h}, [sp], x9
> +        ld1            {v22.4h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().4h}, [sp], x9
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> +        sqxtun          v1.8b, v1.8h
> +        subs            w4, w4, #1
> +        st1            {v1.s}[0], [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        stp             x30, xzr, [sp, #-16]!
> +        sub             x1, x1, x3
> +        add             x0, sp, #48
> +        mov             x2, x3
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
> +        ldp             x30, xzr, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        ldp             x0, x1, [sp], #16
> +        load_qpel_filterh x6, x5
> +        sub             x1, x1, #4
> +        ld1            {v16.8h}, [sp], x9
> +        ld1            {v17.8h}, [sp], x9
> +        ld1            {v18.8h}, [sp], x9
> +        ld1            {v19.8h}, [sp], x9
> +        ld1            {v20.8h}, [sp], x9
> +        ld1            {v21.8h}, [sp], x9
> +        ld1            {v22.8h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8h}, [sp], x9
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> +        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> +        sqxtun          v1.8b, v1.8h
> +        st1            {v1.s}[0], [x0], #4
> +        subs            w4, w4, #1
> +        st1            {v1.h}[2], [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        stp             x30, xzr, [sp, #-16]!
> +        sub             x1, x1, x3
> +        add             x0, sp, #48
> +        mov             x2, x3
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
> +        ldp             x30, xzr, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        ldp             x0, x1, [sp], #16
> +        load_qpel_filterh x6, x5
> +        ld1            {v16.8h}, [sp], x9
> +        ld1            {v17.8h}, [sp], x9
> +        ld1            {v18.8h}, [sp], x9
> +        ld1            {v19.8h}, [sp], x9
> +        ld1            {v20.8h}, [sp], x9
> +        ld1            {v21.8h}, [sp], x9
> +        ld1            {v22.8h}, [sp], x9
> +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1            {\tmp\().8h}, [sp], x9
> +        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
> +        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
> +        sqxtun          v1.8b, v1.8h
> +        subs            w4, w4, #1
> +        st1            {v1.8b}, [x0], x1
> +.endm
> +1:      calc_all
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        stp             x7, x30, [sp, #-16]!
> +        sub             x1, x1, x3
> +        mov             x2, x3
> +        add             x0, sp, #48
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
> +        ldp             x7, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        ldp             x0, x1, [sp], #16
> +        load_qpel_filterh x6, x5
> +        sub             x1, x1, #8
> +        ld1            {v16.8h, v17.8h}, [sp], x9
> +        ld1            {v18.8h, v19.8h}, [sp], x9
> +        ld1            {v20.8h, v21.8h}, [sp], x9
> +        ld1            {v22.8h, v23.8h}, [sp], x9
> +        ld1            {v24.8h, v25.8h}, [sp], x9
> +        ld1            {v26.8h, v27.8h}, [sp], x9
> +        ld1            {v28.8h, v29.8h}, [sp], x9
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x9
> +        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn, #12
> +        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn2, #12
> +        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
> +        sqxtun          v1.8b, v1.8h
> +        sqxtun2         v1.16b, v2.8h
> +        st1            {v1.8b}, [x0], #8
> +        subs            w4, w4, #1
> +        st1            {v1.s}[2], [x0], x1
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        stp             x7, x30, [sp, #-16]!
> +        add             x0, sp, #48
> +        sub             x1, x2, x3, lsl #1
> +        sub             x1, x1, x3
> +        mov             x2, x3
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
> +        ldp             x7, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +.Lqpel_uni_hv16_loop:
> +        mov             x9, #(MAX_PB_SIZE * 2)
> +        load_qpel_filterh x6, x5
> +        sub             w12, w9, w7, lsl #1
> +0:      mov             x8, sp          // src
> +        ld1            {v16.8h, v17.8h}, [x8], x9
> +        mov             w11, w4         // height
> +        ld1            {v18.8h, v19.8h}, [x8], x9
> +        mov             x10, x0         // dst
> +        ld1            {v20.8h, v21.8h}, [x8], x9
> +        ld1            {v22.8h, v23.8h}, [x8], x9
> +        ld1            {v24.8h, v25.8h}, [x8], x9
> +        ld1            {v26.8h, v27.8h}, [x8], x9
> +        ld1            {v28.8h, v29.8h}, [x8], x9
> +.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
> +        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x9
> +        calc_qpelh      v1,     \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn,  #12
> +        calc_qpelh2     v1, v2, \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn2, #12
> +        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn,  #12
> +        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
> +        sqxtun          v1.8b, v1.8h
> +        subs            x11, x11, #1
> +        sqxtun2         v1.16b, v2.8h
> +        st1            {v1.16b}, [x10], x1
> +.endm
> +1:      calc_all2
> +.purgem calc
> +2:      add             x0, x0, #16
> +        add             sp, sp, #32
> +        subs            w7, w7, #16
> +        b.ne            0b
> +        add             w10, w4, #6
> +        add             sp, sp, x12         // discard rest of first line
> +        lsl             x10, x10, #7
> +        add             sp, sp, x10         // tmp_array without first line
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
> +        stp             x6, x30, [sp, #-16]!
> +        mov             x7, #16
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x2, x3, [sp, #-16]!
> +        stp             x4, x5, [sp, #-16]!
> +        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
> +        ldp             x4, x5, [sp], #16
> +        ldp             x2, x3, [sp], #16
> +        add             x2, x2, #16
> +        ldp             x0, x1, [sp], #16
> +        mov             x7, #8
> +        add             x0, x0, #16
> +        ldr             x6, [sp]
> +        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
> +        ldp             xzr, x30, [sp], #16
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        stp             x7, x30, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        add             x0, sp, #48
> +        sub             x1, x1, x3
> +        mov             x2, x3
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
> +        ldp             x7, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        b .Lqpel_uni_hv16_loop

Align the operand

> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        sub             x1, x2, x3, lsl #1
> +        stp             x7, x30, [sp, #-16]!
> +        sub             x1, x1, x3
> +        mov             x2, x3
> +        add             x0, sp, #48
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
> +        ldp             x7, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        b .Lqpel_uni_hv16_loop
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
> +        add             w10, w4, #7
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10         // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        stp             x7, x30, [sp, #-16]!
> +        add             x0, sp, #48
> +        sub             x1, x2, x3, lsl #1
> +        mov             x2, x3
> +        sub             x1, x1, x3
> +        add             w3, w4, #7
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
> +        ldp             x7, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        b .Lqpel_uni_hv16_loop
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
> +        load_qpel_filterb x6, x7
> +        sub             x2, x2, #3
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +.macro calc op, idx
> +        \op             v20.8h, v16.8b, v\idx\().8b
> +        ushr            v16.2d, v16.2d, #8
> +        mov             v16.b[7], v17.b[\idx]
> +.endm
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6
> +        umlsl           v20.8h, v16.8b, v7.8b
> +        ld1            {v24.8h}, [x4], x10
> +        sqadd           v16.8h, v20.8h, v24.8h
> +        sqrshrun        v16.8b, v16.8h, #7
> +        subs            w5, w5, #1
> +        st1            {v16.s}[0], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
> +        load_qpel_filterb x6, x7
> +        sub             x2, x2, #3
> +        sub             x1, x1, #4
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +        // same macro
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6
> +        umlsl           v20.8h, v16.8b, v7.8b
> +        ld1            {v24.8h}, [x4], x10
> +        sqadd           v16.8h, v20.8h, v24.8h
> +        sqrshrun        v16.8b, v16.8h, #7
> +        st1            {v16.s}[0], [x0], #4
> +        subs            w5, w5, #1
> +        st1            {v16.h}[2], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
> +        load_qpel_filterb x6, x7
> +        sub             x2, x2, #3
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld1            {v16.8b, v17.8b}, [x2], x3
> +        movi            v20.8h, #0
> +        // same macro
> +        calc umlsl, 0
> +        calc umlal, 1
> +        calc umlsl, 2
> +        calc umlal, 3
> +        calc umlal, 4
> +        calc umlsl, 5
> +        calc umlal, 6
> +        umlsl           v20.8h, v16.8b, v7.8b
> +.purgem calc
> +        ld1            {v24.8h}, [x4], x10
> +        sqadd           v16.8h, v20.8h, v24.8h
> +        sqrshrun        v16.8b, v16.8h, #7
> +        subs            w5, w5, #1
> +        st1            {v16.8b}, [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
> +        load_qpel_filterb x6, x7
> +        sub             x2, x2, #3
> +        sub             x1, x1, #8
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld2            {v16.8b, v17.8b}, [x2]
> +        movi            v20.8h, #0
> +        ldr             w12, [x2, #16]
> +        movi            v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> +        \op1            \r0\().8h, \r1\().8b, \src0\().8b
> +        \op2            \r0\().8h, \r2\().8b, \src1\().8b
> +.if \tail-1

Wouldn't '.if \tail == 0' be less obscure?
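
I.e. (assuming \tail is only ever 0 or 1, as in the current callers,
this is equivalent but reads much more clearly):

.if \tail == 0
        ushr            \r1\().2d, \r1\().2d, #8
.endif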

> +        ushr            \r1\().2d, \r1\().2d, #8
> +.endif
> +.endm
> +        calc            umlsl, umlal, v20, v16, v17, v0, v1
> +        mov             v16.b[7], w12

Moving data via GPRs like this is quite inelegant and slow. Can't this be 
done with proper vector instructions (ext)?
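
For instance, if the next source bytes were kept in a spare vector
register (v28 below is purely illustrative) instead of w12, each
ushr + 'mov .b[7]' pair collapses into a single ext:

        ext             v16.8b, v16.8b, v28.8b, #1   // shift in one byte from v28

(Untested; with the ld2-deinterleaved layout here, the tail bytes would
of course need to be split between the even/odd halves accordingly.)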

> +        lsr             x12, x12, #8
> +        calc            umlsl, umlal, v21, v17, v16, v0, v1
> +        mov             v17.b[7], w12
> +        lsr             x12, x12, #8
> +        calc            umlsl, umlal, v20, v16, v17, v2, v3
> +        mov             v16.b[7], w12
> +        calc            umlsl, umlal, v21, v17, v16, v2, v3
> +        calc            umlal, umlsl, v20, v16, v17, v4, v5
> +        calc            umlal, umlsl, v21, v17, v16, v4, v5
> +        calc            umlal, umlsl, v20, v16, v17, v6, v7
> +        calc            umlal, umlsl, v21, v17, v16, v6, v7, 1
> +.purgem calc
> +        ld2            {v24.8h, v25.8h}, [x4], x10
> +        sqadd           v16.8h, v20.8h, v24.8h
> +        sqadd           v17.8h, v21.8h, v25.8h
> +        sqrshrun        v16.8b, v16.8h, #7
> +        sqrshrun        v17.8b, v17.8h, #7
> +        zip1            v16.16b, v16.16b, v17.16b
> +        st1            {v16.8b}, [x0], #8
> +        subs            w5, w5, #1
> +        st1            {v16.s}[2], [x0], x1
> +        add             x2, x2, x3
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
> +        load_qpel_filterb x6, x7
> +        sub             x2, x2, #3
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +1:      ld2            {v16.8b, v17.8b}, [x2]
> +        movi            v20.8h, #0
> +        ldr             x12, [x2, #16]
> +        movi            v21.8h, #0
> +.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
> +        \op1            \r0\().8h, \r1\().8b, \src0\().8b
> +        \op2            \r0\().8h, \r2\().8b, \src1\().8b
> +        ushr            \r1\().2d, \r1\().2d, #8
> +        mov             \r1\().b[7], w12

Same, please use proper vector shifting via ext, don't mix data in GPRs 
when doing SIMD.

The same comments apply to most of the remaining code too.

And overall, please unroll less. In particular, try to get rid of every 
case of calc_all. At most, unroll the calculation once or twice, but then 
shift registers in between.
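
Something along these lines (untested sketch), e.g. for the uni_v8 case:
keep one copy of the loop body, load a single new row per iteration and
shift the window of source registers manually at the end, instead of
expanding the body eight times via calc_all:

1:      ld1             {v23.8b}, [x2], x3
        movi            v24.8h, #0
        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
        sqrshrun        v24.8b, v24.8h, #6
        subs            w4, w4, #1
        st1             {v24.8b}, [x0], x1
        // shift the source row window down by one
        mov             v16.8b, v17.8b
        mov             v17.8b, v18.8b
        mov             v18.8b, v19.8b
        mov             v19.8b, v20.8b
        mov             v20.8b, v21.8b
        mov             v21.8b, v22.8b
        mov             v22.8b, v23.8b
        b.ne            1b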

// Martin

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..8592692479 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -63,4 +63,5 @@  NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..3e5d85247e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,63 @@  void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
                                   int16_t *sao_offset_val, int sao_left_class,
                                   int width, int height);
 
+#define NEON8_FNPROTO(fn, args) \
+    void ff_hevc_put_hevc_##fn##4_8_neon args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon args; \
 
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst,  ptrdiff_t dststride,
+            uint8_t *src, ptrdiff_t srcstride,
+            int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst,  ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+#define NEON8_FNASSIGN(member, v, h, fn) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +136,17 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         // for the current size, but if enabled for bigger sizes, the cases
         // of non-multiple of 8 seem to arise.
 //        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
+
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..e8cc6f5f25
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,2799 @@ 
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+.Lqpel_filters:
+        .byte  0,  0,  0,  0,  0,  0, 0,  0
+        .byte -1,  4,-10, 58, 17, -5, 1,  0
+        .byte -1,  4,-11, 40, 40,-11, 4, -1
+        .byte  0,  1, -5, 17, 58,-10, 4, -1
+
+.macro load_qpel_filterb freg, xreg
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3
+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+        ld4r           {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+        neg             v0.16b, v0.16b
+        neg             v2.16b, v2.16b
+        neg             v5.16b, v5.16b
+        neg             v7.16b, v7.16b
+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlal           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro load_qpel_filterh freg, xreg
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3
+        ld1            {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sshr
+        sshr            \dst\().4s, \dst\().4s, \shift
+.else
+        \op             \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sshr
+        sshr            \dst\().4s, \dstt\().4s, \shift
+.else
+        \op             \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #8
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b}, [x1], #8
+        ld1            {v17.s}[0], [x1], x2
+.macro calc src0, src1, idx
+        ushr            \src0\().2d, \src1\().2d, #8
+        mov             \src0\().b[7], v17.b[\idx]
+.endm
+        calc            v18, v16, 0
+        calc            v19, v18, 1
+        calc            v20, v19, 2
+        ushr            v21.2d, v20.2d, #8
+        ushr            v22.2d, v21.2d, #8
+        ushr            v23.2d, v22.2d, #8
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        subs            w3, w3, #1
+        st1            {v28.4h}, [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        mov             x14, #(MAX_PB_SIZE * 2 - 8)
+1:      ld1            {v16.8b, v17.8b}, [x1], x2
+        // same macro
+        calc            v18, v16, 0
+        calc            v19, v18, 1
+        calc            v20, v19, 2
+        calc            v21, v20, 3
+        calc            v22, v21, 4
+        ushr            v23.2d, v22.2d, #8
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        st1            {v28.4h}, [x0], #8
+        subs            w3, w3, #1
+        st1            {v28.s}[2], [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+        sxtw x4, w4
+        sxtw x7, w7
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x1], x2
+        // same macro
+        calc            v18, v16, 0
+        calc            v19, v18, 1
+        calc            v20, v19, 2
+        calc            v21, v20, 3
+        calc            v22, v21, 4
+        calc            v23, v22, 5
+        calc            v24, v23, 6
+.purgem calc
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        subs            w3, w3, #1
+        st1            {v28.8h}, [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #16
+        mov             x14, #(MAX_PB_SIZE * 2 - 16)
+1:      ld2            {v16.8b, v17.8b}, [x1], #16
+        ld1            {v27.s}[0], [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        ushr            v24.2d, v22.2d, #8
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        zip1            v16.8h, v28.8h, v29.8h
+        zip2            v17.8h, v28.8h, v29.8h
+        st1            {v16.8h}, [x0], #16
+        subs            w3, w3, #1
+        st1            {v17.4h}, [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #16
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x1], #16
+        ld1            {v27.8b}, [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[4]
+        mov             v23.b[7], v27.b[5]
+        ushr            v24.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        subs            w3, w3, #1
+        st2            {v28.8h, v29.8h}, [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #24
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x1], #24
+        ld1            {v27.8b}, [x1], x2
+        ushr            v19.2d, v16.2d, #8
+        ushr            v20.2d, v17.2d, #8
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v27.b[0]
+        mov             v20.b[7], v27.b[1]
+        mov             v21.b[7], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[3]
+        mov             v23.b[7], v27.b[4]
+        mov             v24.b[7], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        subs            w3, w3, #1
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #32
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        ld1            {v27.8b}, [x1], x2
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #48
+        mov             x7, #24
+        mov             x14, #80
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x1], x7
+        movi            v28.8h, #0
+        ld1            {v26.8b}, [x1], x7
+        movi            v29.8h, #0
+        ld1            {v27.8b}, [x1], x2
+        movi            v30.8h, #0
+        ushr            v19.2d, v16.2d, #8
+        ushr            v20.2d, v17.2d, #8
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v26.b[0]
+        mov             v19.b[15], v27.b[0]
+        mov             v20.b[7], v26.b[1]
+        mov             v20.b[15], v27.b[1]
+        mov             v21.b[7], v26.b[2]
+        mov             v21.b[15], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v26.b[3]
+        mov             v22.b[15], v27.b[3]
+        mov             v23.b[7], v26.b[4]
+        mov             v23.b[15], v27.b[4]
+        mov             v24.b[7], v26.b[5]
+        mov             v24.b[15], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v26.b[6]
+        mov             v25.b[15], v27.b[6]
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], #48
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #64
+        mov             x7, #32
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+        ld1            {v27.8b}, [x1], x7
+        ld1            {v28.8b}, [x1], x2
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        mov             v20.b[15], v28.b[0]
+        mov             v21.b[15], v28.b[1]
+        mov             v22.b[15], v28.b[2]
+        mov             v23.b[15], v28.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        mov             v24.b[15], v28.b[4]
+        mov             v25.b[15], v28.b[5]
+        mov             v26.b[15], v28.b[6]
+.macro calc fn
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        \fn             v28, v16, v17, v18, v19, v20, v21, v22, v23
+        \fn             v29, v17, v18, v19, v20, v21, v22, v23, v24
+        \fn             v30, v18, v19, v20, v21, v22, v23, v24, v25
+        \fn             v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+.endm
+        calc            calc_qpelb
+        calc            calc_qpelb2
+.purgem calc
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro calc_all
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b
+.endm
+
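+// calc_all2: same rotation scheme, but for rows that occupy a pair of
+// registers (low/high 16 bytes), using the full v16-v31 window.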
+.macro calc_all2
+        calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
+        b.eq            2f
+        calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
+        b.eq            2f
+        calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
+        b.eq            2f
+        calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
+        b.eq            2f
+        calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
+        b.eq            2f
+        calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
+        b.eq            2f
+        calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
+        b.eq            2f
+        calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
+        b.hi            1b
+.endm
+
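+// put_hevc_qpel_v*: vertical 8-tap filter only. The source pointer is
+// rewound by 3 rows, 7 rows are preloaded, and each calc step emits one
+// row of 16-bit intermediates with a row stride of 2 * MAX_PB_SIZE bytes.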
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x1, x1, x2
+        ld1            {v16.s}[0], [x1], x2
+        ld1            {v17.s}[0], [x1], x2
+        ld1            {v18.s}[0], [x1], x2
+        ld1            {v19.s}[0], [x1], x2
+        ld1            {v20.s}[0], [x1], x2
+        ld1            {v21.s}[0], [x1], x2
+        ld1            {v22.s}[0], [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.4h}, [x0], x9
+        subs            w3, w3, #1
+        b.eq            2f
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 8)
+        sub             x1, x1, x2
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            w3, w3, #1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x1, x1, x2
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.8h}, [x0], x9
+        subs            w3, w3, #1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2 - 16)
+        sub             x1, x1, x2
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        st1            {v24.8h}, [x0], #16
+        subs            w3, w3, #1
+        st1            {v25.4h}, [x0], x9
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x1, x1, x2
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        subs            w3, w3, #1
+        st1            {v24.8h, v25.8h}, [x0], x9
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+// TODO: reads 32 bytes per row, 8 more than the 24 that are needed.
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+        sub             sp, sp, #48
+        st1            {v8.16b, v9.16b, v10.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1            {v8.16b, v9.16b, v10.16b}, [sp], #48
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b-v11.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x1, x1, x2
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().16b, \tmp1\().16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            w3, w3, #1
+        st1            {v8.8h-v11.8h}, [x0], x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      ld1            {v8.16b-v11.16b}, [sp], #64
+        ret
+endfunc
+
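+// Width 48 is processed as two 24-wide halves by calling the v24 version twice.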
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+        stp             x5, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x5, [sp]
+        add             x0, x0, #48
+        add             x1, x1, #24
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
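+// Width 64 loops over 32-wide column strips; w6 holds the remaining width.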
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b-v11.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+0:      mov             x8, x1          // src
+        ld1            {v16.16b, v17.16b}, [x8], x2
+        mov             w11, w3         // height
+        ld1            {v18.16b, v19.16b}, [x8], x2
+        mov             x10, x0         // dst
+        ld1            {v20.16b, v21.16b}, [x8], x2
+        ld1            {v22.16b, v23.16b}, [x8], x2
+        ld1            {v24.16b, v25.16b}, [x8], x2
+        ld1            {v26.16b, v27.16b}, [x8], x2
+        ld1            {v28.16b, v29.16b}, [x8], x2
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().16b, \tmp1\().16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        subs            x11, x11, #1
+        st1            {v8.8h-v11.8h}, [x10], x9
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #64
+        add             x1, x1, #32
+        subs            w6, w6, #32
+        b.hi            0b
+        ld1            {v8.16b-v11.16b}, [sp], #64
+        ret
+endfunc
+
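+// put_hevc_qpel_hv*: the horizontal pass is run first, writing (height + 7)
+// rows of 16-bit samples into a temporary array on the stack; the vertical
+// pass then filters that array with calc_qpelh/calc_qpelh2.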
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+        add             w10, w3, #7
+        mov             x7, #128
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        ld1            {v16.4h}, [sp], x7
+        ld1            {v17.4h}, [sp], x7
+        ld1            {v18.4h}, [sp], x7
+        ld1            {v19.4h}, [sp], x7
+        ld1            {v20.4h}, [sp], x7
+        ld1            {v21.4h}, [sp], x7
+        ld1            {v22.4h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().4h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        subs            w3, w3, #1
+        st1            {v1.4h}, [x0], x7
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+        add             w10, w3, #7
+        mov             x7, #128
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x8, #120
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        subs            w3, w3, #1
+        st1            {v1.s}[2], [x0], x8
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x7, #128
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x7
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h}, [x0], x7
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x7, #128
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x8, #112
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        st1            {v1.8h}, [x0], #16
+        subs            w3, w3, #1
+        st1            {v2.4h}, [x0], x8
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+        add             w10, w3, #7
+        lsl             x10, x10, #7
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x3, x3, #7
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x7, #128
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h, v2.8h}, [x0], x7
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
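+// hv24 keeps three 8h registers per row, so the sliding window spans v8-v31
+// (v8-v15 are saved first) and the eight rotation steps are written out in
+// full instead of going through calc_all2.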
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b-v11.16b}, [sp]
+        sub             x1, x1, x2, lsl #1
+        sub             sp, sp, #64
+        add             w10, w3, #7
+        st1            {v12.16b-v15.16b}, [sp]
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        add             x3, x3, #7
+        sub             x1, x1, x2
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x7, #128
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        ld1            {v8.8h-v10.8h}, [sp], x7
+        ld1            {v11.8h-v13.8h}, [sp], x7
+        ld1            {v14.8h-v16.8h}, [sp], x7
+        ld1            {v17.8h-v19.8h}, [sp], x7
+        ld1            {v20.8h-v22.8h}, [sp], x7
+        ld1            {v23.8h-v25.8h}, [sp], x7
+        ld1            {v26.8h-v28.8h}, [sp], x7
+1:      ld1            {v29.8h-v31.8h}, [sp], x7
+        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v8.8h-v10.8h}, [sp], x7
+        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v11.8h-v13.8h}, [sp], x7
+        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v14.8h-v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v17.8h-v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v20.8h-v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v23.8h-v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.eq            2f
+
+        ld1            {v26.8h-v28.8h}, [sp], x7
+        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+        subs            w3, w3, #1
+        st1            {v1.8h-v3.8h}, [x0], x7
+        b.hi            1b
+2:      ld1            {v12.16b-v15.16b}, [sp], #64
+        ld1            {v8.16b-v11.16b}, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+        add             w10, w3, #7
+        sub             x1, x1, x2, lsl #1
+        lsl             x10, x10, #7
+        sub             x1, x1, x2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        add             x3, x3, #7
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x5, x30, [sp], #16
+        mov             x7, #128
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+0:      mov             x8, sp          // src
+        ld1            {v16.8h, v17.8h}, [x8], x7
+        mov             w9, w3          // height
+        ld1            {v18.8h, v19.8h}, [x8], x7
+        mov             x5, x0          // dst
+        ld1            {v20.8h, v21.8h}, [x8], x7
+        ld1            {v22.8h, v23.8h}, [x8], x7
+        ld1            {v24.8h, v25.8h}, [x8], x7
+        ld1            {v26.8h, v27.8h}, [x8], x7
+        ld1            {v28.8h, v29.8h}, [x8], x7
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x7
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqshrn2
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqshrn2
+        subs            x9, x9, #1
+        st1            {v1.8h, v2.8h}, [x5], x7
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #32
+        add             sp, sp, #32
+        subs            w6, w6, #16
+        b.hi            0b
+        add             w10, w3, #6
+        add             sp, sp, #64          // discard rest of first line
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
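+// hv48 runs hv24 on each 24-wide half; hv64 runs hv32 (which itself loops
+// over 16-wide strips via w6) on each 32-wide half.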
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x1, x1, #24
+        add             x0, x0, #48
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x1, x1, #32
+        add             x0, x0, #64
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
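+// put_hevc_qpel_uni_h*: horizontal filter with direct 8-bit output; the
+// 16-bit accumulators are narrowed with sqrshrun #6 and stored using the
+// dststride in x1.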
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+.macro calc op, src
+        \op             v20.8h, v16.8b, v\src\().8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[\src]
+.endm
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+// calc is not purged here so the uni_h6 and uni_h8 versions below can reuse it.
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        subs            w4, w4, #1
+        st1            {v20.s}[0], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #4
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1            {v20.h}[2], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+// reuses the calc macro defined in qpel_uni_h4 above
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+.purgem calc
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        subs            w4, w4, #1
+        st1            {v20.8b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #8
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        movi            v20.8h, #0
+        ldr             w12, [x2, #16]
+        movi            v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+        \op1            \r0\().8h, \r1\().8b, \src0\().8b
+        \op2            \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+        ushr            \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+        calc umlsl, umlal, v20, v16, v17, v0, v1
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        calc umlsl, umlal, v21, v17, v16, v0, v1
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        calc umlsl, umlal, v20, v16, v17, v2, v3
+        mov             v16.b[7], w12
+        calc umlsl, umlal, v21, v17, v16, v2, v3
+        calc umlal, umlsl, v20, v16, v17, v4, v5
+        calc umlal, umlsl, v21, v17, v16, v4, v5
+        calc umlal, umlsl, v20, v16, v17, v6, v7
+        calc umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+        zip1            v16.8h, v20.8h, v21.8h
+        zip2            v17.8h, v20.8h, v21.8h
+        sqrshrun        v20.8b, v16.8h, #6
+        sqrshrun2       v20.16b, v17.8h, #6
+        st1            {v20.8b}, [x0], #8
+        add             x2, x2, x3
+        st1            {v20.s}[2], [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             x12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+.macro calc op1, op2, dst, r0, r1, src0, src1, tail=0
+        \op1            \dst\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst\().8h, \r1\().8b, \src1\().8b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+.if \tail-1
+        lsr             x12, x12, #8
+.endif
+.endm
+        calc umlsl, umlal, v20, v16, v17, v0, v1
+        calc umlsl, umlal, v21, v17, v16, v0, v1
+        calc umlsl, umlal, v20, v16, v17, v2, v3
+        calc umlsl, umlal, v21, v17, v16, v2, v3
+        calc umlal, umlsl, v20, v16, v17, v4, v5
+        calc umlal, umlsl, v21, v17, v16, v4, v5
+        calc umlal, umlsl, v20, v16, v17, v6, v7, 1
+.purgem calc
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        st2            {v20.8b, v21.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.8b-v18.8b}, [x2]
+        ldr             x12, [x2, #24]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, src0, src1, src2
+        \op1            \dst\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst\().8h, \r1\().8b, \src1\().8b
+        umlsl           \dst\().8h, \r2\().8b, \src2\().8b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+        lsr             x12, x12, #8
+.endm
+        calc umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+        calc umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+        calc umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+        calc umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+        calc umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+        calc umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        st3            {v20.8b-v22.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.8b-v19.8b}, [x2]
+        ldr             x12, [x2, #32]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+        \op1            \dst\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst\().8h, \r1\().8b, \src1\().8b
+        \op1            \dst\().8h, \r2\().8b, \src2\().8b
+        \op2            \dst\().8h, \r3\().8b, \src3\().8b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+.if \tail-1
+        lsr             x12, x12, #8
+.endif
+.endm
+        calc umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+        calc umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+        calc umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+        calc umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+        calc umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+        calc umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+        calc umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+        umlal           v23.8h, v19.8b, v4.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        umlsl           v23.8h, v16.8b, v5.8b
+        sqrshrun        v21.8b, v21.8h, #6
+        umlal           v23.8h, v17.8b, v6.8b
+        sqrshrun        v22.8b, v22.8h, #6
+        umlsl           v23.8h, v18.8b, v7.8b
+        sqrshrun        v23.8b, v23.8h, #6
+        st4            {v20.8b-v23.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.16b-v18.16b}, [x2]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        ldr             x12, [x2, #24]
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        ldr             x13, [x2, #48]
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2
+        \op1            \dst0\().8h, \r0\().8b,  \src0\().8b
+        \op2            \dst0\().8h, \r1\().8b,  \src1\().8b
+        umlsl           \dst0\().8h, \r2\().8b,  \src2\().8b
+        \op1\()2        \dst1\().8h, \r0\().16b, \src0\().16b
+        \op2\()2        \dst1\().8h, \r1\().16b, \src1\().16b
+        umlsl2          \dst1\().8h, \r2\().16b, \src2\().16b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+        mov             \r0\().b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+.endm
+        calc umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+        calc umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+        calc umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+        calc umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+        calc umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+        calc umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+.macro calc r0, r1, r2, r3
+        umlal           \r0\().8h, \r2\().8b, v6.8b
+        umlsl           \r0\().8h, \r3\().8b, v7.8b
+        umlal2          \r1\().8h, \r2\().16b, v6.16b
+        umlsl2          \r1\().8h, \r3\().16b, v7.16b
+.endm
+        calc            v20, v23, v16, v17
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        calc            v21, v24, v17, v18
+        calc            v22, v25, v18, v16
+.purgem calc
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v20.16b, v23.8h, #6
+        sqrshrun2       v21.16b, v24.8h, #6
+        sqrshrun2       v22.16b, v25.8h, #6
+        st3            {v20.16b-v22.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.16b-v19.16b}, [x2]
+        ldr             x12, [x2, #32]
+        ldr             x13, [x2, #64]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+        \op1            \dst0\().8h, \r0\().8b,  \src0\().8b
+        \op2            \dst0\().8h, \r1\().8b,  \src1\().8b
+        \op1            \dst0\().8h, \r2\().8b,  \src2\().8b
+        \op2            \dst0\().8h, \r3\().8b,  \src3\().8b
+        \op1\()2        \dst1\().8h, \r0\().16b,  \src0\().16b
+        \op2\()2        \dst1\().8h, \r1\().16b,  \src1\().16b
+        \op1\()2        \dst1\().8h, \r2\().16b,  \src2\().16b
+        \op2\()2        \dst1\().8h, \r3\().16b,  \src3\().16b
+.if \tail-1
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+        mov             \r0\().b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+.endif
+.endm
+        calc            umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+        calc            umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+        calc            umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+        calc            umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+        calc            umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+        calc            umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+        calc            umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+        calc            umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun2       v20.16b, v24.8h, #6
+        sqrshrun2       v21.16b, v25.8h, #6
+        sqrshrun2       v22.16b, v26.8h, #6
+        sqrshrun2       v23.16b, v27.8h, #6
+        st4            {v20.16b-v23.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
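+// put_hevc_qpel_uni_v*: vertical-only filter with direct 8-bit output,
+// using the same register rotation as the v functions plus sqrshrun #6
+// before the store.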
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1            {v24.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #4
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1            {v24.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1            {v24.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #8
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        ld1            {v16.16b}, [x8], x3
+        mov             w11, w4         // height
+        ld1            {v17.16b}, [x8], x3
+        mov             x10, x0         // dst
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        subs            x11, x11, #1
+        st1            {v24.s}[2], [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #12
+        add             x2, x2, #12
+        subs            w7, w7, #12
+        b.ne            0b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        ld1            {v16.16b}, [x8], x3
+        mov             w11, w4         // height
+        ld1            {v17.16b}, [x8], x3
+        mov             x10, x0         // dst
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        subs            x11, x11, #1
+        st1            {v24.16b}, [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #16
+        add             x2, x2, #16
+        subs            w7, w7, #16
+        b.ne            0b
+        ret
+endfunc
+
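+// The wider uni_v sizes reuse the width loop (w7) of the 12- and 16-wide
+// versions above.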
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+        b               X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+        b               X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
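+// put_hevc_qpel_uni_hv*: as in the hv functions, the horizontal pass goes to
+// a stack temporary; the vertical pass then narrows with sqrshrn/sqrshrn2 #12
+// and sqxtun to produce the 8-bit output.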
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x30, xzr, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        ld1            {v16.4h}, [sp], x9
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().4h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        subs            w4, w4, #1
+        st1            {v1.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x30, xzr, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #4
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1            {v1.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x30, xzr, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        subs            w4, w4, #1
+        st1            {v1.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x0, sp, #48
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #8
+        ld1            {v16.8h, v17.8h}, [sp], x9
+        ld1            {v18.8h, v19.8h}, [sp], x9
+        ld1            {v20.8h, v21.8h}, [sp], x9
+        ld1            {v22.8h, v23.8h}, [sp], x9
+        ld1            {v24.8h, v25.8h}, [sp], x9
+        ld1            {v26.8h, v27.8h}, [sp], x9
+        ld1            {v28.8h, v29.8h}, [sp], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [sp], x9
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn, #12
+        calc_qpelh2     v1, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sqrshrn2, #12
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        subs            w4, w4, #1
+        st1            {v1.s}[2], [x0], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+.Lqpel_uni_hv16_loop:
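+        // shared vertical pass for widths that are multiples of 16 (also
+        // entered from the 32/48/64 wrappers): w4 = height, w7 = width,
+        // tmp rows are MAX_PB_SIZE * 2 bytes apart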
+        mov             x9, #(MAX_PB_SIZE * 2)
+        load_qpel_filterh x6, x5
+        sub             w12, w9, w7, lsl #1 // bytes left in the first tmp row after the width loop
+0:      mov             x8, sp          // src
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        mov             w11, w4         // height
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        mov             x10, x0         // dst
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+        calc_qpelh      v1,     \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn,  #12
+        calc_qpelh2     v1, v2, \src0, \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7,  sqrshrn2, #12
+        calc_qpelh      v2,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn,  #12
+        calc_qpelh2     v2, v3, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        subs            x11, x11, #1
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #16
+        add             sp, sp, #32
+        subs            w7, w7, #16
+        b.ne            0b
+        add             w10, w4, #6
+        add             sp, sp, x12         // discard rest of first line
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
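+        // 24 = 16 + 8: filter the left 16 columns, then the right 8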
+        stp             x6, x30, [sp, #-16]!
+        mov             x7, #16
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        add             x2, x2, #16
+        ldp             x0, x1, [sp], #16
+        mov             x7, #8
+        add             x0, x0, #16
+        ldr             x6, [sp]
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        add             x0, sp, #48
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x0, sp, #48
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+        add             w10, w4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        mov             x2, x3
+        sub             x1, x1, x3
+        add             w3, w4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b               .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
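+        // horizontal 8-tap filter: each calc step accumulates one tap and
+        // shifts the window right by one byte, pulling the next byte in from v17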
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+.macro calc op, idx
+        \op             v20.8h, v16.8b, v\idx\().8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[\idx]
+.endm
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+        umlsl           v20.8h, v16.8b, v7.8b
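+        // add the bi prediction intermediates (src2), then round and narrow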
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        subs            w5, w5, #1
+        st1            {v16.s}[0], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        // reuses the calc macro defined in qpel_bi_h4 above
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.s}[0], [x0], #4
+        subs            w5, w5, #1
+        st1            {v16.h}[2], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        // reuses the calc macro defined in qpel_bi_h4 above
+        calc umlsl, 0
+        calc umlal, 1
+        calc umlsl, 2
+        calc umlal, 3
+        calc umlal, 4
+        calc umlsl, 5
+        calc umlal, 6
+        umlsl           v20.8h, v16.8b, v7.8b
+.purgem calc
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        subs            w5, w5, #1
+        st1            {v16.8b}, [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
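+        // even and odd output columns are filtered separately from the
+        // ld2-deinterleaved source bytes; w12 feeds the next source bytes
+        // into lane 7 as the windows shift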
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        movi            v20.8h, #0
+        ldr             w12, [x2, #16]
+        movi            v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+        \op1            \r0\().8h, \r1\().8b, \src0\().8b
+        \op2            \r0\().8h, \r2\().8b, \src1\().8b
+.if \tail-1
+        ushr            \r1\().2d, \r1\().2d, #8
+.endif
+.endm
+        calc            umlsl, umlal, v20, v16, v17, v0, v1
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        calc            umlsl, umlal, v21, v17, v16, v0, v1
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        calc            umlsl, umlal, v20, v16, v17, v2, v3
+        mov             v16.b[7], w12
+        calc            umlsl, umlal, v21, v17, v16, v2, v3
+        calc            umlal, umlsl, v20, v16, v17, v4, v5
+        calc            umlal, umlsl, v21, v17, v16, v4, v5
+        calc            umlal, umlsl, v20, v16, v17, v6, v7
+        calc            umlal, umlsl, v21, v17, v16, v6, v7, 1
+.purgem calc
+        ld2            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        zip1            v16.16b, v16.16b, v17.16b
+        st1            {v16.8b}, [x0], #8
+        subs            w5, w5, #1
+        st1            {v16.s}[2], [x0], x1
+        add             x2, x2, x3
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        movi            v20.8h, #0
+        ldr             x12, [x2, #16]
+        movi            v21.8h, #0
+.macro calc op1, op2, r0, r1, r2, src0, src1, tail=0
+        \op1            \r0\().8h, \r1\().8b, \src0\().8b
+        \op2            \r0\().8h, \r2\().8b, \src1\().8b
+        ushr            \r1\().2d, \r1\().2d, #8
+        mov             \r1\().b[7], w12
+.if \tail-1
+        lsr             x12, x12, #8
+.endif
+.endm
+        calc            umlsl, umlal, v20, v16, v17, v0, v1
+        calc            umlsl, umlal, v21, v17, v16, v0, v1
+        calc            umlsl, umlal, v20, v16, v17, v2, v3
+        calc            umlsl, umlal, v21, v17, v16, v2, v3
+        calc            umlal, umlsl, v20, v16, v17, v4, v5
+        calc            umlal, umlsl, v21, v17, v16, v4, v5
+        calc            umlal, umlsl, v20, v16, v17, v6, v7, 1
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+.purgem calc
+        ld2            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        subs            w5, w5, #1
+        st2            {v16.8b, v17.8b}, [x0], x1
+        add             x2, x2, x3
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
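+        // 24 wide: three interleaved 8-byte lanes via ld3/st3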
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v16.8b-v18.8b}, [x2]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        ldr             x12, [x2, #24]
+        movi            v22.8h, #0
+.macro calc op1, op2, r0, r1, r2, r3, src0, src1, src2, tail=0
+        \op1            \r0\().8h, \r1\().8b, \src0\().8b
+        \op2            \r0\().8h, \r2\().8b, \src1\().8b
+        umlsl           \r0\().8h, \r3\().8b, \src2\().8b
+        ushr            \r1\().2d, \r1\().2d, #8
+        mov             \r1\().b[7], w12
+        lsr             x12, x12, #8
+.endm
+        calc            umlsl, umlal, v20, v16, v17, v18, v0, v1, v2
+        calc            umlsl, umlal, v21, v17, v18, v16, v0, v1, v2
+        calc            umlsl, umlal, v22, v18, v16, v17, v0, v1, v2
+        calc            umlal, umlal, v20, v16, v17, v18, v3, v4, v5
+        calc            umlal, umlal, v21, v17, v18, v16, v3, v4, v5
+        calc            umlal, umlal, v22, v18, v16, v17, v3, v4, v5
+.purgem calc
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        ld3            {v23.8h, v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v23.8h
+        sqadd           v17.8h, v21.8h, v24.8h
+        sqadd           v18.8h, v22.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        subs            w5, w5, #1
+        st3            {v16.8b, v17.8b, v18.8b}, [x0], x1
+        add             x2, x2, x3
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v16.8b-v19.8b}, [x2]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        ldr             x12, [x2, #32]
+        movi            v23.8h, #0
+.macro calc op1, op2, dst, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+        \op1            \dst\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst\().8h, \r1\().8b, \src1\().8b
+        \op1            \dst\().8h, \r2\().8b, \src2\().8b
+        \op2            \dst\().8h, \r3\().8b, \src3\().8b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+.if \tail-1
+        lsr             x12, x12, #8
+.endif
+.endm
+        calc            umlsl, umlal, v20, v16, v17, v18, v19, v0, v1, v2, v3
+        calc            umlsl, umlal, v21, v17, v18, v19, v16, v0, v1, v2, v3
+        calc            umlsl, umlal, v22, v18, v19, v16, v17, v0, v1, v2, v3
+        calc            umlsl, umlal, v23, v19, v16, v17, v18, v0, v1, v2, v3
+        calc            umlal, umlsl, v20, v16, v17, v18, v19, v4, v5, v6, v7
+        calc            umlal, umlsl, v21, v17, v18, v19, v16, v4, v5, v6, v7
+        calc            umlal, umlsl, v22, v18, v19, v16, v17, v4, v5, v6, v7, 1
+.purgem calc
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqadd           v18.8h, v22.8h, v26.8h
+        sqadd           v19.8h, v23.8h, v27.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun        v19.8b, v19.8h, #7
+        st4            {v16.8b-v19.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            w5, w5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
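+        // 48 wide: ld3 of 16-byte lanes; the low halves accumulate into
+        // v20-v22 and the high halves into v23-v25, with w12/w13 feeding
+        // the two shift-in bytes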
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #80            // src2 stride remainder: MAX_PB_SIZE * 2 - 48
+1:      ld3            {v16.16b-v18.16b}, [x2]
+        ldr             x12, [x2, #24]
+        ldr             x13, [x2, #48]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, src0, src1, src2, tail=0
+        \op1            \dst0\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst0\().8h, \r1\().8b, \src1\().8b
+        umlsl           \dst0\().8h, \r2\().8b, \src2\().8b
+        \op1\()2        \dst1\().8h, \r0\().16b, \src0\().16b
+        \op2\()2        \dst1\().8h, \r1\().16b, \src1\().16b
+        umlsl2          \dst1\().8h, \r2\().16b, \src2\().16b
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+        mov             \r0\().b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+.endm
+        calc            umlsl, umlal, v20, v23, v16, v17, v18, v0, v1, v2
+        calc            umlsl, umlal, v21, v24, v17, v18, v16, v0, v1, v2
+        calc            umlsl, umlal, v22, v25, v18, v16, v17, v0, v1, v2
+        calc            umlal, umlal, v20, v23, v16, v17, v18, v3, v4, v5
+        calc            umlal, umlal, v21, v24, v17, v18, v16, v3, v4, v5
+        calc            umlal, umlal, v22, v25, v18, v16, v17, v3, v4, v5
+.purgem calc
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        umlal2          v23.8h, v16.16b, v6.16b
+        umlsl2          v23.8h, v17.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal2          v24.8h, v17.16b, v6.16b
+        umlsl2          v24.8h, v18.16b, v7.16b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v18.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], #48
+        sqadd           v16.8h, v20.8h, v26.8h
+        sqadd           v17.8h, v21.8h, v27.8h
+        sqadd           v18.8h, v22.8h, v28.8h
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], x10
+        sqadd           v19.8h, v23.8h, v26.8h
+        sqadd           v20.8h, v24.8h, v27.8h
+        sqadd           v21.8h, v25.8h, v28.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun2       v16.16b, v19.8h, #7
+        sqrshrun2       v17.16b, v20.8h, #7
+        sqrshrun2       v18.16b, v21.8h, #7
+        subs            w5, w5, #1
+        st3            {v16.16b-v18.16b}, [x0], x1
+        add             x2, x2, x3
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
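+        // 64 wide: ld4 of 16-byte lanes with eight 8h accumulators (v20-v27)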
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+1:      ld4            {v16.16b-v19.16b}, [x2]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        ldr             x12, [x2, #32]
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        ldr             x13, [x2, #64]
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+.macro calc op1, op2, dst0, dst1, r0, r1, r2, r3, src0, src1, src2, src3, tail=0
+        \op1            \dst0\().8h, \r0\().8b, \src0\().8b
+        \op2            \dst0\().8h, \r1\().8b, \src1\().8b
+        \op1            \dst0\().8h, \r2\().8b, \src2\().8b
+        \op2            \dst0\().8h, \r3\().8b, \src3\().8b
+        \op1\()2        \dst1\().8h, \r0\().16b, \src0\().16b
+        \op2\()2        \dst1\().8h, \r1\().16b, \src1\().16b
+        \op1\()2        \dst1\().8h, \r2\().16b, \src2\().16b
+        \op2\()2        \dst1\().8h, \r3\().16b, \src3\().16b
+.if \tail-1
+        ushr            \r0\().2d, \r0\().2d, #8
+        mov             \r0\().b[7], w12
+        lsr             x12, x12, #8
+        mov             \r0\().b[15], w13
+        lsr             x13, x13, #8
+.endif
+.endm
+        calc            umlsl, umlal, v20, v24, v16, v17, v18, v19, v0, v1, v2, v3
+        calc            umlsl, umlal, v21, v25, v17, v18, v19, v16, v0, v1, v2, v3
+        calc            umlsl, umlal, v22, v26, v18, v19, v16, v17, v0, v1, v2, v3
+        calc            umlsl, umlal, v23, v27, v19, v16, v17, v18, v0, v1, v2, v3
+        calc            umlal, umlsl, v20, v24, v16, v17, v18, v19, v4, v5, v6, v7
+        calc            umlal, umlsl, v21, v25, v17, v18, v19, v16, v4, v5, v6, v7
+        calc            umlal, umlsl, v22, v26, v18, v19, v16, v17, v4, v5, v6, v7
+        calc            umlal, umlsl, v23, v27, v19, v16, v17, v18, v4, v5, v6, v7, 1
+.purgem calc
+        ld4            {v28.8h-v31.8h}, [x4], #64
+        sqadd           v20.8h, v20.8h, v28.8h
+        sqadd           v21.8h, v21.8h, v29.8h
+        sqadd           v22.8h, v22.8h, v30.8h
+        sqadd           v23.8h, v23.8h, v31.8h
+        ld4            {v28.8h-v31.8h}, [x4], #64
+        sqadd           v24.8h, v24.8h, v28.8h
+        sqadd           v25.8h, v25.8h, v29.8h
+        sqadd           v26.8h, v26.8h, v30.8h
+        sqadd           v27.8h, v27.8h, v31.8h
+        sqrshrun        v16.8b, v20.8h, #7
+        sqrshrun        v17.8b, v21.8h, #7
+        sqrshrun        v18.8b, v22.8h, #7
+        sqrshrun        v19.8b, v23.8h, #7
+        sqrshrun2       v16.16b, v24.8h, #7
+        sqrshrun2       v17.16b, v25.8h, #7
+        sqrshrun2       v18.16b, v26.8h, #7
+        sqrshrun2       v19.16b, v27.8h, #7
+        subs            w5, w5, #1
+        st4            {v16.16b-v19.16b}, [x0], x1
+        add             x2, x2, x3
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
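+        // vertical 8-tap filter: prime v16-v22 with the first seven rows,
+        // then each calc step loads one new row and emits one output row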
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        ld1            {v25.4h}, [x4], x12 // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        subs            w5, w5, #1
+        st1            {v25.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3
+        sub             x1, x1, #4
+        ld1            {v17.8b}, [x2], x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        ld1            {v25.8h}, [x4], x12 // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        subs            w5, w5, #1
+        st1            {v25.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        subs            w5, w5, #1
+        st1            {v25.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+        ld1            {v16.16b}, [x2], x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        subs            w5, w5, #1
+        st1            {v26.s}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        subs            w5, w5, #1
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x7, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        add             x4, x4, #32
+        bl              X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
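+        // loops over the width (read from the stack) in 32-column strips;
+        // v8-v15 are callee-saved, so spill them first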
+        sub             sp, sp, #64
+        st1            {v12.16b-v15.16b}, [sp]
+        sub             x2, x2, x3, lsl #1
+        sub             sp, sp, #64
+        st1            {v8.16b-v11.16b}, [sp]
+        sub             x2, x2, x3
+        load_qpel_filterb x7, x6
+        ldr             w6, [sp, #128]      // width (stack argument)
+        mov             x12, #(MAX_PB_SIZE * 2)
+0:      mov             x8, x2          // src
+        ld1            {v16.16b, v17.16b}, [x8], x3
+        mov             w11, w5         // height
+        ld1            {v18.16b, v19.16b}, [x8], x3
+        mov             x10, x0         // dst
+        ld1            {v20.16b, v21.16b}, [x8], x3
+        mov             x9, x4          // src2
+        ld1            {v22.16b, v23.16b}, [x8], x3
+        ld1            {v24.16b, v25.16b}, [x8], x3
+        ld1            {v26.16b, v27.16b}, [x8], x3
+        ld1            {v28.16b, v29.16b}, [x8], x3
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb2     v9,  \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7
+        calc_qpelb      v10, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        calc_qpelb2     v11, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        subs            x11, x11, #1
+        st1            {v12.16b, v13.16b}, [x10], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #32 // dst
+        add             x2, x2, #32 // src
+        add             x4, x4, #64 // src2
+        subs            w6, w6, #32
+        b.ne            0b
+        ld1            {v8.16b-v11.16b}, [sp], #64
+        ld1            {v12.16b-v15.16b}, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x8, #32
+        stp             x8, x8, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+        ldp             x8, xzr, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x7, [sp]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        add             x4, x4, #64
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
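+        // the v32 code loops over the stack-passed width, so it covers 64 too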
+        b               X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
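+        // bi hv: h filter into a stack tmp array, then v filter, add src2
+        // and round by 7 before narrowing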
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        ld1            {v16.4h}, [sp], x9
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().4h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        subs            w5, w5, #1
+        st1            {v1.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        sub             x1, x1, #4
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        subs            w5, w5, #1
+        st1            {v1.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        sub             x1, x2, x3, lsl #1
+        stp             x7, x30, [sp, #-16]!
+        sub             x1, x1, x3
+        add             x0, sp, #48
+        mov             x2, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1            {\tmp\().8h}, [sp], x9
+        calc_qpelh      v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+        calc_qpelh2     v2, v2, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        subs            w5, w5, #1
+        st1            {v1.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        add             x4, x4, #16
+        ldp             x0, x1, [sp], #16
+        add             x2, x2, #8
+        add             x0, x0, #8
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #16          // width
+.Lqpel_bi_hv16_loop:
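+        // shared vertical pass for bi hv widths that are multiples of 16
+        // (also entered from the 32/48/64 wrappers): x6 = width, x4 = src2,
+        // w5 = height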
+        load_qpel_filterh x7, x8
+        mov             x9, #(MAX_PB_SIZE * 2)
+        mov             x10, x6
+
+0:      mov             x8, sp          // src
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        mov             w11, w5         // height
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        mov             x12, x4         // src2
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        mov             x7, x0          // dst
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+.macro calc tmp0, tmp1, src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1            {\tmp0\().8h, \tmp1\().8h}, [x8], x9
+        calc_qpelh      v1,     \src0,  \src1, \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sshr
+        calc_qpelh2     v2, v2, \src0, \src1,  \src2,  \src3,  \src4,  \src5,  \src6,  \src7, sshr
+        calc_qpelh      v3,     \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+        calc_qpelh2     v4, v4, \src8, \src9, \src10, \src11, \src12, \src13, \src14, \src15, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        subs            x11, x11, #1
+        st1            {v1.16b}, [x7], x1
+.endm
+1:      calc_all2
+.purgem calc
+2:      add             x0, x0, #16
+        add             sp, sp, #32
+        subs            x10, x10, #16
+        add             x4, x4, #32
+        b.ne            0b
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             x10, x10, x6, lsl #1 // part of first line
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
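+        // 24 = 16 + 8, as in the uni case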
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x4, x4, #32
+        add             x2, x2, #16
+        add             x0, x0, #16
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        mov             x2, x3
+        sub             x1, x1, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #32 // width
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        mov             x2, x3
+        sub             x1, x1, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #48 // width
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+        add             w10, w5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10 // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        mov             x2, x3
+        sub             x1, x1, x3
+        add             w3, w5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #64          // width
+        b               .Lqpel_bi_hv16_loop
+endfunc