diff mbox series

[FFmpeg-devel,1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_v

Message ID 33af9c88-c31d-e11e-58a3-7f9a05718c8f@myais.com.cn
State New
Series [FFmpeg-devel,1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_v

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/configure_x86 warning Failed to apply patch

Commit Message

Logan.Lyu Aug. 26, 2023, 8:49 a.m. UTC
checkasm bench:
put_hevc_epel_uni_hv64_8_i8mm: 6568.7
put_hevc_epel_uni_v4_8_c: 88.7
put_hevc_epel_uni_v4_8_neon: 32.7
put_hevc_epel_uni_v6_8_c: 185.4
put_hevc_epel_uni_v6_8_neon: 44.9
put_hevc_epel_uni_v8_8_c: 333.9
put_hevc_epel_uni_v8_8_neon: 44.4
put_hevc_epel_uni_v12_8_c: 728.7
put_hevc_epel_uni_v12_8_neon: 119.7
put_hevc_epel_uni_v16_8_c: 1224.2
put_hevc_epel_uni_v16_8_neon: 139.7
put_hevc_epel_uni_v24_8_c: 2531.2
put_hevc_epel_uni_v24_8_neon: 329.9
put_hevc_epel_uni_v32_8_c: 4739.9
put_hevc_epel_uni_v32_8_neon: 562.7
put_hevc_epel_uni_v48_8_c: 10618.7
put_hevc_epel_uni_v48_8_neon: 1256.2
put_hevc_epel_uni_v64_8_c: 19169.9
put_hevc_epel_uni_v64_8_neon: 2179.2

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_epel_neon.S    | 320 ++++++++++++++++++++++
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  2 files changed, 325 insertions(+)

Comments

Martin Storsjö Sept. 12, 2023, 11:48 a.m. UTC | #1
Hi,

Sorry for not tending to your patches sooner.

Unfortunately, this patchset is impossible to apply - there seems to be 
garbled whitespace in the patch which would require me to manually apply 
all the changes.

Can you try sending the patches again in a way that doesn't corrupt 
whitespace? If not, can you push the branch somewhere where I can fetch 
it?

// Martin
Logan.Lyu Sept. 14, 2023, 3:55 a.m. UTC | #2
Hi Martin,

You can try the attached patchset. If that doesn't work, my code branch 
address is https://github.com/myais2023/FFmpeg/tree/hevc-aarch64

Please try it again.

Thanks


On 2023/9/12 19:48, Martin Storsjö wrote:
> Hi,
>
> Sorry for not tending to your patches sooner.
>
> Unfortunately, this patchset is impossible to apply - there seems to 
> be garbled whitespace in the patch which would require me to manually 
> apply all the changes.
>
> Can you try sending the patches again in a way that doesn't corrupt 
> whitespace? If not, can you push the branch somewhere where I can 
> fetch it?
>
> // Martin
>
From 022535be4fc50e807870e5e8d1f6449f466d061d Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Tue, 15 Aug 2023 15:24:32 +0800
Subject: [PATCH 1/9] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_v

checkasm bench:
put_hevc_epel_uni_hv64_8_i8mm: 6568.7
put_hevc_epel_uni_v4_8_c: 88.7
put_hevc_epel_uni_v4_8_neon: 32.7
put_hevc_epel_uni_v6_8_c: 185.4
put_hevc_epel_uni_v6_8_neon: 44.9
put_hevc_epel_uni_v8_8_c: 333.9
put_hevc_epel_uni_v8_8_neon: 44.4
put_hevc_epel_uni_v12_8_c: 728.7
put_hevc_epel_uni_v12_8_neon: 119.7
put_hevc_epel_uni_v16_8_c: 1224.2
put_hevc_epel_uni_v16_8_neon: 139.7
put_hevc_epel_uni_v24_8_c: 2531.2
put_hevc_epel_uni_v24_8_neon: 329.9
put_hevc_epel_uni_v32_8_c: 4739.9
put_hevc_epel_uni_v32_8_neon: 562.7
put_hevc_epel_uni_v48_8_c: 10618.7
put_hevc_epel_uni_v48_8_neon: 1256.2
put_hevc_epel_uni_v64_8_c: 19169.9
put_hevc_epel_uni_v64_8_neon: 2179.2

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 320 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 325 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index a8d694639b..7ce7eec829 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -32,6 +32,326 @@ const epel_filters, align=4
         .byte -2, 10, 58, -2
 endconst
 
+.macro load_epel_filterb freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
+        neg             v0.16b, v0.16b
+        neg             v3.16b, v3.16b
+.endm
+
+.macro calc_epelb dst, src0, src1, src2, src3
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlal           \dst\().8h, \src1\().8b, v1.8b
+        umlal           \dst\().8h, \src2\().8b, v2.8b
+        umlsl           \dst\().8h, \src3\().8b, v3.8b
+.endm
+
+.macro calc_epelb2 dst, src0, src1, src2, src3
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src1\().16b, v1.16b
+        umlal2          \dst\().8h, \src2\().16b, v2.16b
+        umlsl2          \dst\().8h, \src3\().16b, v3.16b
+.endm
+
+.macro calc_all4
+        calc            v16, v17, v18, v19
+        b.eq            2f
+        calc            v17, v18, v19, v16
+        b.eq            2f
+        calc            v18, v19, v16, v17
+        b.eq            2f
+        calc            v19, v16, v17, v18
+        b.ne            1b
+.endm
+
+.macro calc_all8
+        calc            v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21
+        b.ne            1b
+.endm
+
+.macro calc_all12
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v24, v25, v26, v27, v16, v17, v18
+        b.eq            2f
+        calc            v22, v23, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v25, v26, v27, v16, v17, v18, v19, v20, v21, v22, v23, v24
+        b.ne            1b
+.endm
+
+.macro calc_all16
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        b.eq            2f
+        calc            v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19
+        b.eq            2f
+        calc            v24, v25, v26, v27, v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v28, v29, v30, v31, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27
+        b.ne            1b
+.endm
+
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.s}[0], [x2], x3
+        ld1             {v17.s}[0], [x2], x3
+        ld1             {v18.s}[0], [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.s}[0], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        ld1             {v16.8b}, [x2], x3
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b, v4.8h, #6
+        st1             {v4.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v4.h}[2], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.8b}, [x2], x3
+        ld1             {v17.8b}, [x2], x3
+        ld1             {v18.8b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+        ld1             {v16.16b}, [x2], x3
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.16b}, [x2], x3
+        ld1             {v17.16b}, [x2], x3
+        ld1             {v18.16b}, [x2], x3
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b}, [x0], x1
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.8b, v17.8b, v18.8b}, [x2], x3
+        ld1             {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1             {v22.8b, v23.8b, v24.8b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9
+        calc_epelb      v5, \src1, \src4, \src7, \src10
+        calc_epelb      v6, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.8b-v6.8b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ld1             {v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6
+        calc_epelb2     v5, \src0, \src2, \src4, \src6
+        calc_epelb      v6, \src1, \src3, \src5, \src7
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b}, [x0], x1
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+        load_epel_filterb x6, x5
+        sxtw            x3, w3
+        sxtw            x1, w1
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  \src0, \src3, \src6, \src9
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9
+        calc_epelb      v6,  \src1, \src4, \src7, \src10
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b, v29.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             sp, sp, #32
+        sxtw            x3, w3
+        sxtw            x1, w1
+        st1             {v8.8b-v11.8b}, [sp]
+        sub             x2, x2, x3
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b, \src13\().16b, \src14\().16b, \src15\().16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, \src3, \src7, \src11, \src15
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
+        calc_epelb      v8,  \src2, \src6, \src10, \src14
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b, v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b, v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b, v11.8h, #6
+        subs            w4, w4, #1
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+.endm
+1:      calc_all16
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp]
+        add             sp, sp, #32
+        ret
+endfunc
+
 #if HAVE_I8MM
 
 .macro EPEL_H_HEADER
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index e125b0cfb2..f1e167c50b 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,10 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -285,6 +289,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
Martin Storsjö Sept. 16, 2023, 9:46 p.m. UTC | #3
On Thu, 14 Sep 2023, Logan.Lyu wrote:

> Hi Martin,
>
> You can try the attached patchset. If that doesn't work, my code branch 
> address is https://github.com/myais2023/FFmpeg/tree/hevc-aarch64

Thanks for the patches. Functionally, they seem to work, and the issues I 
saw in the code are relatively minor. Unfortunately, some of the issues 
are issues that we've been through in many earlier patches, so I would 
hope that you would pay attention to them in the future before posting 
more patches.


In patch 1, you've got a bunch of sxtw instructions for src/dst stride 
parameters that have the type ptrdiff_t - that shouldn't be necessary?
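
If I'm reading the calling convention right, the stride arguments already 
arrive as full 64-bit values in x1 and x3, so e.g. in 
ff_hevc_put_hevc_epel_uni_v4_8_neon these two instructions should simply be 
droppable (sketch):

        sxtw            x3, w3   // redundant: x3 already holds the 64-bit srcstride
        sxtw            x1, w1   // redundant: x1 already holds the 64-bit dststride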

In patch 2, you're moving the macros calc_epelh, calc_epelh2, 
load_epel_filterh - can you split out the move into a separate commit? 
(This isn't strictly necessary but would make things even clearer.)

In patch 2, you're storing below the stack, then decrementing it 
afterwards - e.g. like this:

> +        stp             x0, x30, [sp, #-16]
> +        stp             x1, x2, [sp, #-32]
> +        stp             x3, x4, [sp, #-48]
> +        stp             x5, x6, [sp, #-64]!

Please change that so that you're first predecrementing the whole area, 
then storing the other elements above that stack pointer, e.g. like this:

stp x0, x30, [sp, #-64]!
stp x1, x2, [sp, #16]
stp x3, x4, [sp, #32]

etc.

The same issue also appears in various places within functions like this:

> +        stp             x0, x1, [sp, #-16]
> +        stp             x4, x6, [sp, #-32]
> +        stp             xzr, x30, [sp, #-48]!

Please fix all of these cases - you can search through your patches for 
anything related to storing on the stack. Also, storing xzr here seems 
superfluous - if you've got an odd number of registers to store, just make 
one instruction str instead of stp (but keep the stack aligned).
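
For the quoted block, that could look something like this (a sketch combining 
both fixes - one predecrement, and x30 stored with a plain str):

        str             x30, [sp, #-48]!        // one predecrement for the whole area
        stp             x0, x1, [sp, #16]
        stp             x4, x6, [sp, #32]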

Then in patch 4, you've got yet another pattern for doing these stores, 
where you have superfluous consecutive stack decrements like this:

> +        stp             x6, x30, [sp, #-16]!
> +        mov             x7, #16
> +        stp             x0, x1, [sp, #-16]!
> +        stp             x2, x3, [sp, #-16]!
> +        stp             x4, x5, [sp, #-16]!

Please just do one stack decrement covering all the stack space you need.
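
E.g. something like this (sketch):

        stp             x6, x30, [sp, #-64]!
        mov             x7, #16
        stp             x0, x1, [sp, #16]
        stp             x2, x3, [sp, #32]
        stp             x4, x5, [sp, #48]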

I believe these issues have been raised in earlier reviews as well.

// Martin
Logan.Lyu Sept. 23, 2023, 4:34 a.m. UTC | #4
Hi, Martin,

Thanks for your review.

> Thanks for the patches. Functionally, they seem to work, and the 
> issues I saw in the code are relatively minor. Unfortunately, some of 
> the issues are issues that we've been through in many earlier patches, 
> so I would hope that you would pay attention to them in the future 
> before posting more patches.
Okay, I have noted the previously raised issues and made the corresponding 
modifications, and I have completed the changes based on your comments.

If there are any remaining issues that I have not corrected, please let 
me know.



On 2023/9/17 5:46, Martin Storsjö wrote:
> On Thu, 14 Sep 2023, Logan.Lyu wrote:
>
>> Hi Martin,
>>
>> You can try the attached patchset. If that doesn't work, my code 
>> branch address is https://github.com/myais2023/FFmpeg/tree/hevc-aarch64
>
> Thanks for the patches. Functionally, they seem to work, and the 
> issues I saw in the code are relatively minor. Unfortunately, some of 
> the issues are issues that we've been through in many earlier patches, 
> so I would hope that you would pay attention to them in the future 
> before posting more patches.
>
>
> In patch 1, you've got a bunch of sxtw instructions for src/dst stride 
> parameters that have the type ptrdiff_t - that shouldn't be necessary?
>
> In patch 2, you're moving the macros calc_epelh, calc_epelh2, 
> load_epel_filterh - can you split out the move into a separate commit? 
> (This isn't strictly necessary but would make things even clearer.)
>
> In patch 2, you're storing below the stack, then decrementing it 
> afterwards - e.g. like this:
>
>> +        stp             x0, x30, [sp, #-16]
>> +        stp             x1, x2, [sp, #-32]
>> +        stp             x3, x4, [sp, #-48]
>> +        stp             x5, x6, [sp, #-64]!
>
> Please change that so that you're first predecrementing the whole 
> area, then storing the other elements above that stack pointer, e.g. 
> like this:
>
> stp x0, x30, [sp, #-64]!
> stp x1, x2, [sp, #16]
> stp x3, x4, [sp, #32]
>
> etc.
>
> The same issue also appears in various places within functions like 
> this:
>
>> +        stp             x0, x1, [sp, #-16]
>> +        stp             x4, x6, [sp, #-32]
>> +        stp             xzr, x30, [sp, #-48]!
>
> Please fix all of these cases - you can search through your patches 
> for anything related to storing on the stack. Also, storing xzr here 
> seems superfluous - if you've got an odd number of registers to store, 
> just make one instruction str instead of stp (but keep the stack 
> aligned).
>
> Then in patch 4, you've got yet another pattern for doing these 
> stores, where you have superfluous consecutive stack decrements like 
> this:
>
>> +        stp             x6, x30, [sp, #-16]!
>> +        mov             x7, #16
>> +        stp             x0, x1, [sp, #-16]!
>> +        stp             x2, x3, [sp, #-16]!
>> +        stp             x4, x5, [sp, #-16]!
>
> Please just do one stack decrement covering all the stack space you need.
>
> I believe these issues have been raised in earlier reviews as well.
>
> // Martin
>
From 62a59aa1fb7bc684ca0c216fd039dd0f231ad0c0 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Tue, 15 Aug 2023 16:42:25 +0800
Subject: [PATCH 04/10] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_v

checkasm bench:
put_hevc_qpel_uni_v4_8_c: 146.2
put_hevc_qpel_uni_v4_8_neon: 43.2
put_hevc_qpel_uni_v6_8_c: 303.9
put_hevc_qpel_uni_v6_8_neon: 69.7
put_hevc_qpel_uni_v8_8_c: 495.2
put_hevc_qpel_uni_v8_8_neon: 74.7
put_hevc_qpel_uni_v12_8_c: 1100.9
put_hevc_qpel_uni_v12_8_neon: 222.4
put_hevc_qpel_uni_v16_8_c: 1955.2
put_hevc_qpel_uni_v16_8_neon: 269.2
put_hevc_qpel_uni_v24_8_c: 4571.9
put_hevc_qpel_uni_v24_8_neon: 832.4
put_hevc_qpel_uni_v32_8_c: 8226.4
put_hevc_qpel_uni_v32_8_neon: 1035.7
put_hevc_qpel_uni_v48_8_c: 18324.2
put_hevc_qpel_uni_v48_8_neon: 2321.2
put_hevc_qpel_uni_v64_8_c: 37659.4
put_hevc_qpel_uni_v64_8_neon: 4122.2

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 221 ++++++++++++++++++++++
 2 files changed, 226 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index d78954f440..51d212ff72 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -192,6 +192,10 @@ NEON8_FNPROTO(qpel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst,  ptrdiff_t dststride,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -295,6 +299,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index e38dff9645..2107e31a3c 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -44,6 +44,35 @@ endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro load_qpel_filterb freg, xreg
+        movrel          \xreg, qpel_filters_abs
+        add             \xreg, \xreg, \freg, lsl #3
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+        ld4r            {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+.endm
+
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
 .macro put_hevc type
 .ifc \type, qpel
         // void put_hevc_qpel_h(int16_t *dst,
@@ -595,6 +624,198 @@ function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
         ret
 endfunc
 
+.macro calc_all
+        calc            v23, v16, v17, v18, v19, v20, v21, v22, v23
+        b.eq            2f
+        calc            v16, v17, v18, v19, v20, v21, v22, v23, v16
+        b.eq            2f
+        calc            v17, v18, v19, v20, v21, v22, v23, v16, v17
+        b.eq            2f
+        calc            v18, v19, v20, v21, v22, v23, v16, v17, v18
+        b.eq            2f
+        calc            v19, v20, v21, v22, v23, v16, v17, v18, v19
+        b.eq            2f
+        calc            v20, v21, v22, v23, v16, v17, v18, v19, v20
+        b.eq            2f
+        calc            v21, v22, v23, v16, v17, v18, v19, v20, v21
+        b.eq            2f
+        calc            v22, v23, v16, v17, v18, v19, v20, v21, v22
+        b.hi            1b
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().s}[0], [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1             {v24.s}[0], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #4
+        sub             x2, x2, x3
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        st1             {v24.s}[0], [x0], #4
+        subs            w4, w4, #1
+        st1             {v24.h}[2], [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+        add             x2, x2, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x2], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        subs            w4, w4, #1
+        st1             {v24.8b}, [x0], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x1, x1, #8
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        mov             w11, w4         // height
+        mov             x10, x0         // dst
+        ldr             q16, [x8]
+        ldr             q17, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q18, [x8]
+        ldr             q19, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q20, [x8]
+        ldr             q21, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q22, [x8]
+        add             x8, x8, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x8], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1             {v24.8b}, [x10], #8
+        subs            x11, x11, #1
+        st1             {v24.s}[2], [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #12
+        add             x2, x2, #12
+        subs            w7, w7, #12
+        b.ne            0b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+0:      mov             x8, x2          // src
+        mov             w11, w4         // height
+        mov             x10, x0         // dst
+        ldr             q16, [x8]
+        ldr             q17, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q18, [x8]
+        ldr             q19, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q20, [x8]
+        ldr             q21, [x8, x3]
+        add             x8, x8, x3, lsl #1
+        ldr             q22, [x8]
+        add             x8, x8, x3
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().16b}, [x8], x3
+        calc_qpelb      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        calc_qpelb2     v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        subs            x11, x11, #1
+        st1             {v24.16b}, [x10], x1
+.endm
+1:      calc_all
+.purgem calc
+2:      add             x0, x0, #16
+        add             x2, x2, #16
+        subs            w7, w7, #16
+        b.ne            0b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
         sub             w10, w10, w5
Martin Storsjö Sept. 26, 2023, 12:53 p.m. UTC | #5
Hi,

Thanks, this looks mostly ok now.

There were a few minor issues left that I can fix up before pushing. There 
were a number of cases of restoring registers like this:

         ldr             x30, [sp]
         ldp             x4, x6, [sp, #16]
         ldp             x0, x1, [sp, #32]
         add             sp, sp, #48

Here we should fold the sp update into the load as well, like this:

         ldp             x4, x6, [sp, #16]
         ldp             x0, x1, [sp, #32]
         ldr             x30, [sp], #48

In a few cases, this wasn't possible, due to the location of the register 
that is being restored, like this:

         ldr             x30, [sp, #56]
         add             sp, sp, #64

For the most idiomatic aarch64 assembly, I think it would be good to 
restructure it to keep x30 at the bottom of this area, to allow using the 
same pattern for restores here too.
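
Roughly like this (a sketch matching the registers in the example above):

        str             x30, [sp, #-48]!        // x30 at the bottom of the save area
        stp             x4, x6, [sp, #16]
        stp             x0, x1, [sp, #32]
        ...
        ldp             x4, x6, [sp, #16]
        ldp             x0, x1, [sp, #32]
        ldr             x30, [sp], #48          // the restore folds the sp update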

I pushed it with these changes. A later patch to restructure the register 
saves to avoid the separate "add sp" would be appreciated, though. It quite 
certainly doesn't matter from a real-world performance perspective, but it 
would make the code more idiomatic.

// Martin




On Sat, 23 Sep 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> Thanks for your review.
>
>> Thanks for the patches. Functionally, they seem to work, and the issues I 
>> saw in the code are relatively minor. Unfortunately, some of the issues are 
>> issues that we've been through in many earlier patches, so I would hope 
>> that you would pay attention to them in the future before posting more 
>> patches.
> Okay, I have noted the previously raised issues and made the corresponding 
> modifications, and I have completed the changes based on your comments.
>
> If there are any remaining issues that I have not corrected, please let me 
> know.
>
>
>
On 2023/9/17 5:46, Martin Storsjö wrote:
>> On Thu, 14 Sep 2023, Logan.Lyu wrote:
>> 
>>> Hi Martin,
>>> 
>>> You can try the attached patchset. If that doesn't work, my code branch 
>>> address is https://github.com/myais2023/FFmpeg/tree/hevc-aarch64
>> 
>> Thanks for the patches. Functionally, they seem to work, and the issues I 
>> saw in the code are relatively minor. Unfortunately, some of the issues are 
>> issues that we've been through in many earlier patches, so I would hope 
>> that you would pay attention to them in the future before posting more 
>> patches.
>> 
>> 
>> In patch 1, you've got a bunch of sxtw instructions for src/dst stride 
>> parameters that have the type ptrdiff_t - that shouldn't be necessary?
>> 
>> In patch 2, you're moving the macros calc_epelh, calc_epelh2, 
>> load_epel_filterh - can you split out the move into a separate commit? 
>> (This isn't strictly necessary but would make things even clearer.)
>> 
>> In patch 2, you're storing below the stack, then decrementing it afterwards 
>> - e.g. like this:
>> 
>>> +        stp             x0, x30, [sp, #-16]
>>> +        stp             x1, x2, [sp, #-32]
>>> +        stp             x3, x4, [sp, #-48]
>>> +        stp             x5, x6, [sp, #-64]!
>> 
>> Please change that so that you're first predecrementing the whole area, 
>> then storing the other elements above that stack pointer, e.g. like this:
>> 
>> stp x0, x30, [sp, #-64]!
>> stp x1, x2, [sp, #16]
>> stp x3, x4, [sp, #32]
>> 
>> etc.
>> 
>> The same issue also appears in various places within functions like this:
>> 
>>> +        stp             x0, x1, [sp, #-16]
>>> +        stp             x4, x6, [sp, #-32]
>>> +        stp             xzr, x30, [sp, #-48]!
>> 
>> Please fix all of these cases - you can search through your patches for 
>> anything related to storing on the stack. Also, storing xzr here seems 
>> superfluous - if you've got an odd number of registers to store, just make 
>> one instruction str instead of stp (but keep the stack aligned).
>> 
>> Then in patch 4, you've got yet another pattern for doing these stores, 
>> where you have superfluous consecutive stack decrements like this:
>> 
>>> +        stp             x6, x30, [sp, #-16]!
>>> +        mov             x7, #16
>>> +        stp             x0, x1, [sp, #-16]!
>>> +        stp             x2, x3, [sp, #-16]!
>>> +        stp             x4, x5, [sp, #-16]!
>> 
>> Please just do one stack decrement covering all the stack space you need.
>> 
>> I believe these issues have been raised in earlier reviews as well.
>> 
>> // Martin
>