[FFmpeg-devel,5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv

Message ID 20230604041756.5196-5-Logan.Lyu@myais.com.cn
State New
Series [FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels

Checks

Context                          Check    Description
andriy/configure_x86             warning  Failed to apply patch
yinshiyou/configure_loongarch64  warning  Failed to apply patch

Commit Message

Logan.Lyu June 4, 2023, 4:17 a.m. UTC
From: Logan Lyu <Logan.Lyu@myais.com.cn>

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 703 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +
 2 files changed, 710 insertions(+)

Comments

Martin Storsjö June 12, 2023, 8:19 a.m. UTC | #1
On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:

> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
> libavcodec/aarch64/hevcdsp_epel_neon.S    | 703 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +
> 2 files changed, 710 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 32f052a7b1..24a74d2c7d 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>         ret
> endfunc
>
> +.macro epel_uni_w_hv_start
> +        mov             x15, x5         //denom
> +        mov             x16, x6         //wx
> +        mov             x17, x7         //ox
> +        add             w15, w15, #6    //shift = denom+6
> +
> +
> +        ldp             x5, x6, [sp]
> +        ldp             x7, xzr, [sp, #16]

Why ldp into xzr, that seems pointless?
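
For illustration, a single ldr loads the remaining stack argument without 
a dummy destination register; the v2 further down ends up doing exactly this:

        ldp             x5, x6, [sp]    // mx, my
        ldr             x7, [sp, #16]   // width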

> +
> +        sub             sp, sp, #128
> +        stp             q12, q13, [sp]

This could be "stp q12, q13, [sp, #-128]!"
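
I.e. merge the decrement into the first store, something like:

        stp             q12, q13, [sp, #-128]!
        stp             q14, q15, [sp, #32]
        stp             q8,  q9,  [sp, #64]
        stp             q10, q11, [sp, #96]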

> +        stp             q14, q15, [sp, #32]
> +        stp             q8, q9,   [sp, #64]
> +        stp             q10, q11, [sp, #96]
> +
> +        dup             v13.8h, w16     //wx
> +        dup             v14.4s, w17     //ox
> +
> +        mov             w17, #1
> +        lsl             w17, w17, w15
> +        lsr             w17, w17, #1
> +        dup             v15.4s, w17
> +
> +        neg             w15, w15        // -shift
> +        dup             v12.4s, w15     //shift
> +.endm
> +
> +.macro epel_uni_w_hv_end
> +        smull           v28.4s, v4.4h, v13.4h
> +        smull2          v29.4s, v4.8h, v13.8h
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        sqxtn           v4.4h, v28.4s
> +        sqxtn2          v4.8h, v29.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end2
> +        smull           v28.4s, v4.4h, v13.4h
> +        smull2          v29.4s, v4.8h, v13.8h
> +        smull           v30.4s, v5.4h, v13.4h
> +        smull2          v31.4s, v5.8h, v13.8h
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        add             v30.4s, v30.4s, v15.4s
> +        add             v31.4s, v31.4s, v15.4s
> +
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        sshl            v30.4s, v30.4s, v12.4s
> +        sshl            v31.4s, v31.4s, v12.4s
> +
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        add             v30.4s, v30.4s, v14.4s
> +        add             v31.4s, v31.4s, v14.4s
> +
> +        sqxtn           v4.4h, v28.4s
> +        sqxtn2          v4.8h, v29.4s
> +        sqxtn           v5.4h, v30.4s
> +        sqxtn2          v5.8h, v31.4s
> +.endm
> +
> +.macro epel_uni_w_hv_end3
> +        smull           v1.4s,  v4.4h, v13.4h
> +        smull2          v2.4s,  v4.8h, v13.8h
> +        smull           v28.4s, v5.4h, v13.4h
> +        smull2          v29.4s, v5.8h, v13.8h
> +        smull           v30.4s, v6.4h, v13.4h
> +        smull2          v31.4s, v6.8h, v13.8h
> +        add             v1.4s, v1.4s, v15.4s
> +        add             v2.4s, v2.4s, v15.4s
> +        add             v28.4s, v28.4s, v15.4s
> +        add             v29.4s, v29.4s, v15.4s
> +        add             v30.4s, v30.4s, v15.4s
> +        add             v31.4s, v31.4s, v15.4s
> +
> +        sshl            v1.4s, v1.4s, v12.4s
> +        sshl            v2.4s, v2.4s, v12.4s
> +        sshl            v28.4s, v28.4s, v12.4s
> +        sshl            v29.4s, v29.4s, v12.4s
> +        sshl            v30.4s, v30.4s, v12.4s
> +        sshl            v31.4s, v31.4s, v12.4s
> +        add             v1.4s, v1.4s, v14.4s
> +        add             v2.4s, v2.4s, v14.4s
> +        add             v28.4s, v28.4s, v14.4s
> +        add             v29.4s, v29.4s, v14.4s
> +        add             v30.4s, v30.4s, v14.4s
> +        add             v31.4s, v31.4s, v14.4s
> +
> +        sqxtn           v4.4h, v1.4s
> +        sqxtn2          v4.8h, v2.4s
> +        sqxtn           v5.4h, v28.4s
> +        sqxtn2          v5.8h, v29.4s
> +        sqxtn           v6.4h, v30.4s
> +        sqxtn2          v6.8h, v31.4s
> +.endm
> +
> +.macro calc_epelh dst, src0, src1, src2, src3
> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
> +        sqshrn          \dst\().4h, \dst\().4s, #6
> +.endm
> +
> +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
> +        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
> +        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
> +        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
> +        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
> +        sqshrn2         \dst\().8h, \tmp\().4s, #6
> +.endm
> +
> +.macro load_epel_filterh freg, xreg
> +        movrel          \xreg, epel_filters
> +        add             \xreg, \xreg, \freg, lsl #2
> +        ld1             {v0.8b}, [\xreg]
> +        sxtl            v0.8h, v0.8b
> +.endm
> +
> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
> +        epel_uni_w_hv_start
> +        and             x4, x4, 0xffffffff

What does this "and" do here? Is it a case where the argument is "int", 
while the upper bits of the register are undefined? In those cases, you're 
best off by just using "w4", possibly "w4, uxtw" (or sxtw) instead of 
manually doing such an "and" here.
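
For illustration: in a register-register operation the extension can be 
folded into the operand, while the immediate form needs an explicit extend 
first (x8 is a hypothetical base register here):

        add             x10, x8, w4, sxtw   // extension folded into the add
        // vs.
        sxtw            x4, w4              // explicit extend, then...
        add             x10, x4, #3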

> +
> +        add             x10, x4, #3
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10     // tmp_array
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x4, x6, [sp, #-16]!
> +        stp             xzr, x30, [sp, #-16]!

Don't do consecutive decrements like this, but do one "stp ..., [sp, 
#-48]!" followed by "stp ..., [sp, #16]" etc.

> +        add             x0, sp, #48
> +        sub             x1, x2, x3
> +        mov             x2, x3
> +        add             x3, x4, #3
> +        mov             x4, x5
> +        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
> +        ldp             xzr, x30, [sp], #16
> +        ldp             x4, x6, [sp], #16
> +        ldp             x0, x1, [sp], #16
> +        load_epel_filterh x6, x5
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ld1             {v16.4h}, [sp], x10
> +        ld1             {v17.4h}, [sp], x10
> +        ld1             {v18.4h}, [sp], x10
> +1:      ld1             {v19.4h}, [sp], x10
> +        calc_epelh      v4, v16, v17, v18, v19
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v16.4h}, [sp], x10
> +        calc_epelh      v4, v17, v18, v19, v16
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v17.4h}, [sp], x10
> +        calc_epelh      v4, v18, v19, v16, v17
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.eq            2f
> +
> +        ld1             {v18.4h}, [sp], x10
> +        calc_epelh      v4, v19, v16, v17, v18
> +        epel_uni_w_hv_end
> +        sqxtun          v4.8b, v4.8h
> +        str             s4, [x0]
> +        add             x0, x0, x1
> +        subs            x4, x4, #1
> +        b.ne            1b
> +2:
> +        ldp             q12, q13, [sp]
> +        ldp             q14, q15, [sp, #32]
> +        ldp             q8, q9,   [sp, #64]
> +        ldp             q10, q11, [sp, #96]
> +        add             sp, sp, #128

Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".

The same thing applies to all other functions in this patch too.
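
I.e. the restore sequence becomes something like:

        ldp             q14, q15, [sp, #32]
        ldp             q8,  q9,  [sp, #64]
        ldp             q10, q11, [sp, #96]
        ldp             q12, q13, [sp], #128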

> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 348497bbbe..fbbc4e6071 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
>         int height, int denom, int wx, int ox,
>         intptr_t mx, intptr_t my, int width), _i8mm);
>
> +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
> +        const uint8_t *_src, ptrdiff_t _srcstride,
> +        int height, int denom, int wx, int ox,
> +        intptr_t mx, intptr_t my, int width), _i8mm);
> +
> NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
>         const uint8_t *_src, ptrdiff_t _srcstride,
>         int height, int denom, int wx, int ox,
> @@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>
> +
>         if (have_i8mm(cpu_flags)) {

Stray whitespace change.

// Martin
Logan.Lyu June 18, 2023, 8:25 a.m. UTC | #2
Hi, Martin,

I modified it according to your comments. Please review again.

And here are the checkasm benchmark results of the related functions:

put_hevc_epel_uni_w_hv4_8_c: 254.6
put_hevc_epel_uni_w_hv4_8_i8mm: 102.9
put_hevc_epel_uni_w_hv6_8_c: 411.6
put_hevc_epel_uni_w_hv6_8_i8mm: 221.6
put_hevc_epel_uni_w_hv8_8_c: 669.4
put_hevc_epel_uni_w_hv8_8_i8mm: 214.9
put_hevc_epel_uni_w_hv12_8_c: 1412.6
put_hevc_epel_uni_w_hv12_8_i8mm: 481.4
put_hevc_epel_uni_w_hv16_8_c: 2425.4
put_hevc_epel_uni_w_hv16_8_i8mm: 647.4
put_hevc_epel_uni_w_hv24_8_c: 5384.1
put_hevc_epel_uni_w_hv24_8_i8mm: 1450.6
put_hevc_epel_uni_w_hv32_8_c: 9470.9
put_hevc_epel_uni_w_hv32_8_i8mm: 2497.1
put_hevc_epel_uni_w_hv48_8_c: 20930.1
put_hevc_epel_uni_w_hv48_8_i8mm: 5635.9
put_hevc_epel_uni_w_hv64_8_c: 36682.9
put_hevc_epel_uni_w_hv64_8_i8mm: 9712.6



On 2023/6/12 16:19, Martin Storsjö wrote:
> On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote:
>
>> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>>
>> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
>> ---
>> libavcodec/aarch64/hevcdsp_epel_neon.S    | 703 ++++++++++++++++++++++
>> libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +
>> 2 files changed, 710 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
>> b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> index 32f052a7b1..24a74d2c7d 100644
>> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> @@ -718,6 +718,709 @@ function 
>> ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>>         ret
>> endfunc
>>
>> +.macro epel_uni_w_hv_start
>> +        mov             x15, x5         //denom
>> +        mov             x16, x6         //wx
>> +        mov             x17, x7         //ox
>> +        add             w15, w15, #6    //shift = denom+6
>> +
>> +
>> +        ldp             x5, x6, [sp]
>> +        ldp             x7, xzr, [sp, #16]
>
> Why ldp into xzr, that seems pointless?
>
>> +
>> +        sub             sp, sp, #128
>> +        stp             q12, q13, [sp]
>
> This could be "stp q12, q13, [sp, #-128]!"
>
>> +        stp             q14, q15, [sp, #32]
>> +        stp             q8, q9,   [sp, #64]
>> +        stp             q10, q11, [sp, #96]
>> +
>> +        dup             v13.8h, w16     //wx
>> +        dup             v14.4s, w17     //ox
>> +
>> +        mov             w17, #1
>> +        lsl             w17, w17, w15
>> +        lsr             w17, w17, #1
>> +        dup             v15.4s, w17
>> +
>> +        neg             w15, w15        // -shift
>> +        dup             v12.4s, w15     //shift
>> +.endm
>> +
>> +.macro epel_uni_w_hv_end
>> +        smull           v28.4s, v4.4h, v13.4h
>> +        smull2          v29.4s, v4.8h, v13.8h
>> +        add             v28.4s, v28.4s, v15.4s
>> +        add             v29.4s, v29.4s, v15.4s
>> +        sshl            v28.4s, v28.4s, v12.4s
>> +        sshl            v29.4s, v29.4s, v12.4s
>> +        add             v28.4s, v28.4s, v14.4s
>> +        add             v29.4s, v29.4s, v14.4s
>> +        sqxtn           v4.4h, v28.4s
>> +        sqxtn2          v4.8h, v29.4s
>> +.endm
>> +
>> +.macro epel_uni_w_hv_end2
>> +        smull           v28.4s, v4.4h, v13.4h
>> +        smull2          v29.4s, v4.8h, v13.8h
>> +        smull           v30.4s, v5.4h, v13.4h
>> +        smull2          v31.4s, v5.8h, v13.8h
>> +        add             v28.4s, v28.4s, v15.4s
>> +        add             v29.4s, v29.4s, v15.4s
>> +        add             v30.4s, v30.4s, v15.4s
>> +        add             v31.4s, v31.4s, v15.4s
>> +
>> +        sshl            v28.4s, v28.4s, v12.4s
>> +        sshl            v29.4s, v29.4s, v12.4s
>> +        sshl            v30.4s, v30.4s, v12.4s
>> +        sshl            v31.4s, v31.4s, v12.4s
>> +
>> +        add             v28.4s, v28.4s, v14.4s
>> +        add             v29.4s, v29.4s, v14.4s
>> +        add             v30.4s, v30.4s, v14.4s
>> +        add             v31.4s, v31.4s, v14.4s
>> +
>> +        sqxtn           v4.4h, v28.4s
>> +        sqxtn2          v4.8h, v29.4s
>> +        sqxtn           v5.4h, v30.4s
>> +        sqxtn2          v5.8h, v31.4s
>> +.endm
>> +
>> +.macro epel_uni_w_hv_end3
>> +        smull           v1.4s,  v4.4h, v13.4h
>> +        smull2          v2.4s,  v4.8h, v13.8h
>> +        smull           v28.4s, v5.4h, v13.4h
>> +        smull2          v29.4s, v5.8h, v13.8h
>> +        smull           v30.4s, v6.4h, v13.4h
>> +        smull2          v31.4s, v6.8h, v13.8h
>> +        add             v1.4s, v1.4s, v15.4s
>> +        add             v2.4s, v2.4s, v15.4s
>> +        add             v28.4s, v28.4s, v15.4s
>> +        add             v29.4s, v29.4s, v15.4s
>> +        add             v30.4s, v30.4s, v15.4s
>> +        add             v31.4s, v31.4s, v15.4s
>> +
>> +        sshl            v1.4s, v1.4s, v12.4s
>> +        sshl            v2.4s, v2.4s, v12.4s
>> +        sshl            v28.4s, v28.4s, v12.4s
>> +        sshl            v29.4s, v29.4s, v12.4s
>> +        sshl            v30.4s, v30.4s, v12.4s
>> +        sshl            v31.4s, v31.4s, v12.4s
>> +        add             v1.4s, v1.4s, v14.4s
>> +        add             v2.4s, v2.4s, v14.4s
>> +        add             v28.4s, v28.4s, v14.4s
>> +        add             v29.4s, v29.4s, v14.4s
>> +        add             v30.4s, v30.4s, v14.4s
>> +        add             v31.4s, v31.4s, v14.4s
>> +
>> +        sqxtn           v4.4h, v1.4s
>> +        sqxtn2          v4.8h, v2.4s
>> +        sqxtn           v5.4h, v28.4s
>> +        sqxtn2          v5.8h, v29.4s
>> +        sqxtn           v6.4h, v30.4s
>> +        sqxtn2          v6.8h, v31.4s
>> +.endm
>> +
>> +.macro calc_epelh dst, src0, src1, src2, src3
>> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
>> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
>> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
>> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
>> +        sqshrn          \dst\().4h, \dst\().4s, #6
>> +.endm
>> +
>> +.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
>> +        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
>> +        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
>> +        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
>> +        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
>> +        sqshrn2         \dst\().8h, \tmp\().4s, #6
>> +.endm
>> +
>> +.macro load_epel_filterh freg, xreg
>> +        movrel          \xreg, epel_filters
>> +        add             \xreg, \xreg, \freg, lsl #2
>> +        ld1             {v0.8b}, [\xreg]
>> +        sxtl            v0.8h, v0.8b
>> +.endm
>> +
>> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
>> +        epel_uni_w_hv_start
>> +        and             x4, x4, 0xffffffff
>
> What does this "and" do here? Is it a case where the argument is 
> "int", while the upper bits of the register is undefined? In those 
> cases, you're best off by just using "w4", possibly "w4, uxtw" (or 
> sxtw) instead of manually doing such an "and" here.
>
>> +
>> +        add             x10, x4, #3
>> +        lsl             x10, x10, #7
>> +        sub             sp, sp, x10     // tmp_array
>> +        stp             x0, x1, [sp, #-16]!
>> +        stp             x4, x6, [sp, #-16]!
>> +        stp             xzr, x30, [sp, #-16]!
>
> Don't do consecutive decrements like this, but do one "stp ..., [sp, 
> #-48]!" followed by "stp ..., [sp, #16]" etc.
>
>> +        add             x0, sp, #48
>> +        sub             x1, x2, x3
>> +        mov             x2, x3
>> +        add             x3, x4, #3
>> +        mov             x4, x5
>> +        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
>> +        ldp             xzr, x30, [sp], #16
>> +        ldp             x4, x6, [sp], #16
>> +        ldp             x0, x1, [sp], #16
>> +        load_epel_filterh x6, x5
>> +        mov             x10, #(MAX_PB_SIZE * 2)
>> +        ld1             {v16.4h}, [sp], x10
>> +        ld1             {v17.4h}, [sp], x10
>> +        ld1             {v18.4h}, [sp], x10
>> +1:      ld1             {v19.4h}, [sp], x10
>> +        calc_epelh      v4, v16, v17, v18, v19
>> +        epel_uni_w_hv_end
>> +        sqxtun          v4.8b, v4.8h
>> +        str             s4, [x0]
>> +        add             x0, x0, x1
>> +        subs            x4, x4, #1
>> +        b.eq            2f
>> +
>> +        ld1             {v16.4h}, [sp], x10
>> +        calc_epelh      v4, v17, v18, v19, v16
>> +        epel_uni_w_hv_end
>> +        sqxtun          v4.8b, v4.8h
>> +        str             s4, [x0]
>> +        add             x0, x0, x1
>> +        subs            x4, x4, #1
>> +        b.eq            2f
>> +
>> +        ld1             {v17.4h}, [sp], x10
>> +        calc_epelh      v4, v18, v19, v16, v17
>> +        epel_uni_w_hv_end
>> +        sqxtun          v4.8b, v4.8h
>> +        str             s4, [x0]
>> +        add             x0, x0, x1
>> +        subs            x4, x4, #1
>> +        b.eq            2f
>> +
>> +        ld1             {v18.4h}, [sp], x10
>> +        calc_epelh      v4, v19, v16, v17, v18
>> +        epel_uni_w_hv_end
>> +        sqxtun          v4.8b, v4.8h
>> +        str             s4, [x0]
>> +        add             x0, x0, x1
>> +        subs            x4, x4, #1
>> +        b.ne            1b
>> +2:
>> +        ldp             q12, q13, [sp]
>> +        ldp             q14, q15, [sp, #32]
>> +        ldp             q8, q9,   [sp, #64]
>> +        ldp             q10, q11, [sp, #96]
>> +        add             sp, sp, #128
>
> Fold the stack increment into ldp, like "ldp q12, q13, [sp], #128".
>
> The same thing applies to all other functions in this patch too.
>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index 348497bbbe..fbbc4e6071 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  
>> ptrdiff_t _dststride,
>>         int height, int denom, int wx, int ox,
>>         intptr_t mx, intptr_t my, int width), _i8mm);
>>
>> +NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width), _i8mm);
>> +
>> NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t 
>> _dststride,
>>         const uint8_t *_src, ptrdiff_t _srcstride,
>>         int height, int denom, int wx, int ox,
>> @@ -286,11 +291,13 @@ av_cold void 
>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
>>         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, 
>> qpel_uni_w_v,);
>>
>> +
>>         if (have_i8mm(cpu_flags)) {
>
> Stray whitespace change.
>
> // Martin
>
From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_hv

Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 700 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8b6f396a0b..355679af29 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldr             x7, [sp, #16]
+
+        stp             q12, q13, [sp, #-128]!
+        stp             q14, q15, [sp, #32]
+        stp             q8, q9,   [sp, #64]
+        stp             q10, q11, [sp, #96]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             xzr, x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldp             xzr, x30, [sp], #48
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.ne            1b
+2:
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        ldp             q12, q13, [sp], #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #24
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x17, #24
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b448d755b9..e125b0cfb2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
Martin Storsjö July 1, 2023, 9:28 p.m. UTC | #3
On Sun, 18 Jun 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> I modified it according to your comments. Please review again.

> From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
> From: Logan Lyu <Logan.Lyu@myais.com.cn>
> Date: Sun, 28 May 2023 10:35:43 +0800
> Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
>  hevc_epel_uni_w_hv
> 
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
>  libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
>  2 files changed, 700 insertions(+)
> 
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index 8b6f396a0b..355679af29 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -717,6 +717,700 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>          ret
>  endfunc
> 
> +.macro epel_uni_w_hv_start
> +        mov             x15, x5         //denom
> +        mov             x16, x6         //wx
> +        mov             x17, x7         //ox
> +        add             w15, w15, #6    //shift = denom+6
> +
> +
> +        ldp             x5, x6, [sp]
> +        ldr             x7, [sp, #16]
> +
> +        stp             q12, q13, [sp, #-128]!
> +        stp             q14, q15, [sp, #32]
> +        stp             q8, q9,   [sp, #64]
> +        stp             q10, q11, [sp, #96]

Only need to back up 64 bytes, by backing up d8-d15. Also, the order
is quite weird here, why not keep them in e.g. linear order?
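
A minimal sketch of that (AAPCS64 only requires preserving the low halves, 
d8-d15, of v8-v15):

        stp             d8,  d9,  [sp, #-64]!
        stp             d10, d11, [sp, #16]
        stp             d12, d13, [sp, #32]
        stp             d14, d15, [sp, #48]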

> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
> +        epel_uni_w_hv_start
> +        sxtw            x4, w4
> +
> +        add             x10, x4, #3
> +        lsl             x10, x10, #7
> +        sub             sp, sp, x10     // tmp_array
> +        stp             xzr, x30, [sp, #-48]!

As mentioned already in the previous review - why do you back up and
restore xzr here? That's not necessary. Yes, you should keep the stack
16 byte aligned, but you can just leave an empty slot, and just do
"str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp
when restoring.

The same goes in all functions here.
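
I.e. something like:

        str             x30, [sp, #-48]!
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        ...
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48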

> +2:
> +        ldp             q14, q15, [sp, #32]
> +        ldp             q8, q9,   [sp, #64]
> +        ldp             q10, q11, [sp, #96]
> +        ldp             q12, q13, [sp], #128

Only need d8-d15, and weird register order here, and elsewhere.

> +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
> +        epel_uni_w_hv_start
> +        sxtw            x4, w4

FWIW, it's unusual to need an explicit sxtw instruction, but I guess
if you use it in the form "add x10, x4, #3" it might be needed.
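
One way to avoid it here would be to do the size calculation on the 32-bit 
registers, which implicitly zeroes the upper half (fine as long as the 
height argument is non-negative), e.g.:

        add             w10, w4, #3
        lsl             w10, w10, #7    // writes to w10 zero-extend into x10
        sub             sp, sp, x10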

> +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
> +        ldp             x15, x16, [sp]
> +        stp             x0, x30, [sp, #-16]!
> +        stp             x1, x2, [sp, #-16]!
> +        stp             x3, x4, [sp, #-16]!
> +        stp             x5, x6, [sp, #-16]!

Don't do consecutive stack pointer updates like this, but merge it
into one large stack decrement followed by positive offsets, like in
all the other cases of stp/ldp.
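
I.e. something along the lines of:

        stp             x0, x30, [sp, #-64]!
        stp             x1, x2,  [sp, #16]
        stp             x3, x4,  [sp, #32]
        stp             x5, x6,  [sp, #48]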

> +        mov             x17, #16
> +        stp             x17, x7, [sp, #-16]!
> +        stp             x15, x16, [sp, #-16]!
> +        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
> +        ldp             x15, x16, [sp], #16
> +        ldp             x17, x7, [sp], #16
> +        ldp             x5, x6, [sp], #16
> +        ldp             x3, x4, [sp], #16
> +        ldp             x1, x2, [sp], #16
> +        ldr             x0, [sp]
> +        add             x0, x0, #16
> +        add             x2, x2, #16
> +        mov             x17, #16
> +        stp             x17, xzr, [sp, #-16]!
> +        stp             x15, x16, [sp, #-16]!

Don't do multiple stack decrements, don't needlessly store xzr here.

The same goes for all the other functions in this patch.

// Martin
Logan.Lyu July 13, 2023, 2:54 p.m. UTC | #4
Hi, Martin,

Thanks for your comments.

I have now fixed the problematic ldp/stp usages that I could find, and 
updated patch 3 and patch 5. (I have attached all 5 patches anyway.)
In addition, I previously thought the calling convention required saving 
all of q8-q15, but I have since confirmed that only the lower 64 bits 
(d8-d15) need to be preserved. Thank you for the reminder.

Please take a look. If there are small mistakes, please correct them 
directly; if there are still larger problems, please let me know. 
Thank you!


On 2023/7/2 5:28, Martin Storsjö wrote:
> On Sun, 18 Jun 2023, Logan.Lyu wrote:
>
>> Hi, Martin,
>>
>> I modified it according to your comments. Please review again.
>
>> From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
>> From: Logan Lyu <Logan.Lyu@myais.com.cn>
>> Date: Sun, 28 May 2023 10:35:43 +0800
>> Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
>>  hevc_epel_uni_w_hv
>>
>> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
>> ---
>>  libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
>>  libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
>>  2 files changed, 700 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
>> b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> index 8b6f396a0b..355679af29 100644
>> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> @@ -717,6 +717,700 @@ function 
>> ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>>          ret
>>  endfunc
>>
>> +.macro epel_uni_w_hv_start
>> +        mov             x15, x5         //denom
>> +        mov             x16, x6         //wx
>> +        mov             x17, x7         //ox
>> +        add             w15, w15, #6    //shift = denom+6
>> +
>> +
>> +        ldp             x5, x6, [sp]
>> +        ldr             x7, [sp, #16]
>> +
>> +        stp             q12, q13, [sp, #-128]!
>> +        stp             q14, q15, [sp, #32]
>> +        stp             q8, q9,   [sp, #64]
>> +        stp             q10, q11, [sp, #96]
>
> Only need to back up 64 bytes, by backing up d8-d15. Also, the order
> is quite weird here, why not keep them in e.g. linear order?
>
>> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
>> +        epel_uni_w_hv_start
>> +        sxtw            x4, w4
>> +
>> +        add             x10, x4, #3
>> +        lsl             x10, x10, #7
>> +        sub             sp, sp, x10     // tmp_array
>> +        stp             xzr, x30, [sp, #-48]!
>
> As mentioned already in the previous review - why do you back up and
> restore xzr here? That's not necessary. Yes, you should keep the stack
> 16 byte aligned, but you can just leave an empty slot, and just do
> "str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp
> when restoring.
>
> The same goes in all functions here.
>
>> +2:
>> +        ldp             q14, q15, [sp, #32]
>> +        ldp             q8, q9,   [sp, #64]
>> +        ldp             q10, q11, [sp, #96]
>> +        ldp             q12, q13, [sp], #128
>
> Only need d8-d15, and weird register order here, and elsewhere.
>
>> +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
>> +        epel_uni_w_hv_start
>> +        sxtw            x4, w4
>
> FWIW, it's unusual to need an explicit sxtw instruction, but I guess
> if you use it in the form "add x10, x4, #3" it might be needed.
>
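
(An alternative that avoids the explicit sxtw is doing the add as a
32-bit operation, since writes to a w register zero the upper 32 bits,
e.g.:

        add             w10, w4, #3
        lsl             x10, x10, #7

height is non-negative here, so zero- and sign-extension agree.)
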
>> +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
>> +        ldp             x15, x16, [sp]
>> +        stp             x0, x30, [sp, #-16]!
>> +        stp             x1, x2, [sp, #-16]!
>> +        stp             x3, x4, [sp, #-16]!
>> +        stp             x5, x6, [sp, #-16]!
>
> Don't do consecutive stack pointer updates like this, but merge it
> into one large stack decrement followed by positive offsets, like in
> all the other cases of stp/ldp.
>
>> +        mov             x17, #16
>> +        stp             x17, x7, [sp, #-16]!
>> +        stp             x15, x16, [sp, #-16]!
>> +        bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
>> +        ldp             x15, x16, [sp], #16
>> +        ldp             x17, x7, [sp], #16
>> +        ldp             x5, x6, [sp], #16
>> +        ldp             x3, x4, [sp], #16
>> +        ldp             x1, x2, [sp], #16
>> +        ldr             x0, [sp]
>> +        add             x0, x0, #16
>> +        add             x2, x2, #16
>> +        mov             x17, #16
>> +        stp             x17, xzr, [sp, #-16]!
>> +        stp             x15, x16, [sp, #-16]!
>
> Don't do multiple stack decrements, don't needlessly store xzr here.
>
> The same goes for all the other functions in this patch.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
From c7959c64da41d2e6a14cbd3afa019fa1792d9767 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Sat, 27 May 2023 09:42:07 +0800
Subject: [PATCH v1 3/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 503 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 509 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0411de9864..0e3bf74953 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -375,3 +375,506 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
 endfunc
 
 #endif
+
+
+.macro EPEL_UNI_W_V_HEADER
+        ldr             x12, [sp, #8]           // my
+        movrel          x9, epel_filters
+        add             x9, x9, x12, lsl #2     // x9 = &epel_filters[my]
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [x9] // filter
+        neg             v0.16b, v0.16b          // taps 0 and 3 are non-positive:
+        neg             v3.16b, v3.16b          // negated for the umlsl/umlal below
+        mov             w10, #-6
+        sub             w10, w10, w5            // w10 = -shift = -(denom + 6)
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // -shift, for sqrshl
+        dup             v29.4s, w7              // ox
+        sub             x2, x2, x3              // src -= srcstride (row -1)
+.endm
+
+.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
+        movi            \d0\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b     // 4-tap vertical filter;
+        umlal           \d0\().8h, \s1\().8b, v1.8b     // taps 0/3 subtract since
+        umlal           \d0\().8h, \s2\().8b, v2.8b     // they were negated in the
+        umlsl           \d0\().8h, \s3\().8b, v3.8b     // header
+        smull           \d0\().4s, \d0\().4h, v30.4h    // * wx
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // rounding >> shift
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox
+        sqxtn           \d0\().4h, \d0\().4s            // narrow to s16
+        sqxtun          \d0\().8b, \d0\().8h            // narrow to u8, saturating
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             s4, [x2]
+        ldr             s5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s6, [x2]
+1:
+        ldr             s7, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v16, v4, v5, v6, v7
+        str             s16, [x0]
+        b.eq            2f
+        add             x0, x0, x1
+        ldr             s4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v17, v5, v6, v7, v4
+        str             s17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v18, v6, v7, v4, v5
+        str             s18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v19, v7, v4, v5, v6
+        str             s19, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
+        movi            \d0\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        smull           \t0\().4s, \d0\().4h, v30.4h
+        smull2          \t1\().4s, \d0\().8h, v30.8h
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtun          \d0\().8b, \d0\().8h
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v6_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        sub             x1, x1, #4
+        ldr             d4, [x2]
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d6, [x2]
+1:
+        ldr             d7, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             s16, [x0], #4
+        st1             {v16.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             s17, [x0], #4
+        st1             {v17.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             s18, [x0], #4
+        st1             {v18.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             s19, [x0], #4
+        st1             {v19.h}[2], [x0], x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             d4, [x2]
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d6, [x2]
+1:
+        ldr             d7, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             d16, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             d17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             d18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             d19, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h
+        smull2          \t1\().4s, \d0\().8h, v30.8h
+        smull           \t2\().4s, \d1\().4h, v30.4h
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtun          \d0\().8b,  \d0\().8h
+        sqxtun2         \d0\().16b, \d1\().8h
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             q4, [x2]
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q6, [x2]
+        sub             x1, x1, #8
+1:
+        ldr             q7, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27
+        str             d16, [x0], #8
+        st1             {v16.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             d18, [x0], #8
+        st1             {v18.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             d20, [x0], #8
+        st1             {v20.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             d22, [x0], #8
+        st1             {v22.s}[2], [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h
+        smull2          \t1\().4s, \d0\().8h, v30.8h
+        smull           \t2\().4s, \d1\().4h, v30.4h
+        smull2          \t3\().4s, \d1\().8h, v30.8h
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqrshl          \t3\().4s, \t3\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+        sqadd           \t3\().4s, \t3\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtn2          \d1\().8h, \t3\().4s
+        sqxtun          \d0\().8b,  \d0\().8h
+        sqxtun2         \d0\().16b, \d1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_epel_uni_w_v16_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             q4, [x2]
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q6, [x2]
+1:
+        ldr             q7, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27
+        str             q16, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             q18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             q20, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             q22, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_epel_uni_w_v24_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldp             q16, q17, [x2]
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:
+        ldp             q22, q23, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v17, v19, v21, v23, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q16, q17, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v19, v21, v23, v17, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v21, v23, v17, v19, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v23, v17, v19, v21, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v32_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldp             q16, q17, [x2]
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:
+        ldp             q22, q23, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v17, v19, v21, v23, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q16, q17, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v19, v21, v23, v17, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v23, v17, v19, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v23, v17, v19, v21, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v48_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+        stp             d8, d9, [sp, #-32]!
+        stp             d10, d11, [sp, #16]
+
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+1:
+        ld1             {v25.16b, v26.16b, v27.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v19, v22, v25, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v20, v23, v26, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v21, v24, v27, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v19, v22, v25, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v20, v23, v26, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v24, v27, v18, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v22, v25, v16, v19, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v23, v26, v17, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v24, v27, v18, v21, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v25, v16, v19, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v26, v17, v20, v23, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v27, v18, v21, v24, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.hi            1b
+2:
+        ldp             d10, d11, [sp, #16]
+        ldp             d8, d9, [sp], #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_v64_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+        stp             d8, d9, [sp, #-64]!
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+1:
+        ld1             {v12.16b, v13.16b, v14.16b, v15.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v20, v24, v12, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v21, v25, v13, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v22, v26, v14, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7, v28, v19, v23, v27, v15, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v20, v24, v12, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v21, v25, v13, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v22, v26, v14, v18, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7, v28, v23, v27, v15, v19, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v24, v12, v16, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v25, v13, v17, v21, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v26, v14, v18, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7, v28, v27, v15, v19, v23, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v12, v16, v20, v24, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v13, v17, v21, v25, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v14, v18, v22, v26, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7, v28, v15, v19, v23, v27, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.hi            1b
+2:
+        ldp             d10, d11, [sp, #16]
+        ldp             d12, d13, [sp, #32]
+        ldp             d14, d15, [sp, #48]
+        ldp             d8, d9, [sp], #64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 8af0a2b4b9..4a260e1d9a 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,11 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -274,6 +279,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
Martin Storsjö July 14, 2023, 9:28 a.m. UTC | #5
On Thu, 13 Jul 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> Thanks for your comments.
>
> I have now fixed the problematic ldp/stp usage that I could find, and I
> updated patch 3 and patch 5. (Although I have attached all 5 patches.)
> In addition, I previously thought that the calling convention required
> saving the full q8-q15 registers, but I have since confirmed that only
> their lower 64 bits (d8-d15) are callee-saved. Thank you for the
> reminder.
>
> Please take a look. If there are small mistakes, please correct them
> directly. If there are still bigger problems, please let me know again.
> Thank you!

Thanks, this looks mostly good to me!

In patch 3, there was still one case of a missing comma between macro
arguments, which I fixed. I also included the checkasm benchmark numbers
in the commit messages - please remember to add them to future patches.
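
(For reference, the numbers come from FFmpeg's checkasm tool, e.g.
running "tests/checkasm/checkasm --bench" from the build tree and
picking out the relevant hevc_epel entries; the exact bench option
spelling is from memory, so check the tool's usage text.)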

I'll push these patches later after a bit more testing, if that testing 
doesn't show any further issues. Thanks!

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 32f052a7b1..24a74d2c7d 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -718,6 +718,709 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]
+        ldp             x7, xzr, [sp, #16]
+
+        sub             sp, sp, #128
+        stp             q12, q13, [sp]
+        stp             q14, q15, [sp, #32]
+        stp             q8, q9,   [sp, #64]
+        stp             q10, q11, [sp, #96]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1    // w17 = 1 << (shift - 1): rounding offset
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h   // * wx
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s  // + (1 << (shift - 1))
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s  // >> shift
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s  // + ox
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s           // narrow to s16, saturating
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10
+        ld1             {v17.4h}, [sp], x10
+        ld1             {v18.4h}, [sp], x10
+1:      ld1             {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        and             x4, x4, 0xffffffff
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:
+        ldp             q12, q13, [sp]
+        ldp             q14, q15, [sp, #32]
+        ldp             q8, q9,   [sp, #64]
+        ldp             q10, q11, [sp, #96]
+        add             sp, sp, #128
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #24
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x17, #24
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x17, #16
+        stp             x17, x7, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, x7, [sp], #16
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x17, #16
+        stp             x17, xzr, [sp, #-16]!
+        stp             x15, x16, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x15, x16, [sp], #16
+        ldp             x17, xzr, [sp], #16
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 348497bbbe..fbbc4e6071 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -286,11 +291,13 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+
         if (have_i8mm(cpu_flags)) {
             NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
     }