diff mbox series

[FFmpeg-devel,1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

Message ID 646d7216-e68d-4a49-821b-f358337797ef@myais.com.cn
State New
Headers show
Series [FFmpeg-devel,1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch

Commit Message

Logan.Lyu Oct. 14, 2023, 8:45 a.m. UTC
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
  libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
  2 files changed, 228 insertions(+)

*c, const int bit_depth)
          c->put_hevc_qpel_bi[9][0][1]   = 
ff_hevc_put_hevc_qpel_bi_h16_8_neon;
           NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
          NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
          NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);

Comments

Michael Niedermayer Oct. 14, 2023, 5:08 p.m. UTC | #1
On Sat, Oct 14, 2023 at 04:45:39PM +0800, Logan.Lyu wrote:
[...]
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1
>          ret
>  endfunc
>  +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> +        load_epel_filterb x5, x4

This is not a valid diff, some whitespaces and newlines here are not as
they should be

thx


[...]
Logan.Lyu Oct. 22, 2023, 1:29 p.m. UTC | #2
Hi, Martin,

Could you please review these patches and let me know if there are any 
changes needed.

Thanks.


Logan Lyu

在 2023/10/14 16:45, Logan.Lyu 写道:
> checkasm bench:
> put_hevc_epel_v4_8_c: 79.9
> put_hevc_epel_v4_8_neon: 25.7
> put_hevc_epel_v6_8_c: 151.4
> put_hevc_epel_v6_8_neon: 46.4
> put_hevc_epel_v8_8_c: 250.9
> put_hevc_epel_v8_8_neon: 41.7
> put_hevc_epel_v12_8_c: 542.7
> put_hevc_epel_v12_8_neon: 108.7
> put_hevc_epel_v16_8_c: 939.4
> put_hevc_epel_v16_8_neon: 169.2
> put_hevc_epel_v24_8_c: 2104.9
> put_hevc_epel_v24_8_neon: 307.9
> put_hevc_epel_v32_8_c: 3713.9
> put_hevc_epel_v32_8_neon: 524.2
> put_hevc_epel_v48_8_c: 8175.2
> put_hevc_epel_v48_8_neon: 1197.2
> put_hevc_epel_v64_8_c: 16049.4
> put_hevc_epel_v64_8_neon: 2094.9
>
> Co-Authored-By: J. Dekker <jdek@itanimul.li>
> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
> ---
>  libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
>  libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
>  2 files changed, 228 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, 
> export=1
>          ret
>  endfunc
>  +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ldr             s16, [x1]
> +        ldr             s17, [x1 ,x2]
> +        add             x1, x1, x2, lsl #1
> +        ld1             {v18.s}[0], [x1], x2
> +.macro calc src0, src1, src2, src3
> +        ld1             {\src3\().s}[0], [x1], x2
> +        movi            v4.8h, #0
> +        calc_epelb      v4, \src0, \src1, \src2, \src3
> +        subs            w3, w3, #1
> +        st1             {v4.4h}, [x0], x10
> +.endm
> +1:      calc_all4
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v6_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2 - 8)
> +        ldr             d16, [x1]
> +        ldr             d17, [x1, x2]
> +        add             x1, x1, x2, lsl #1
> +        ld1             {v18.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> +        ld1             {\src3\().8b}, [x1], x2
> +        movi            v4.8h, #0
> +        calc_epelb      v4, \src0, \src1, \src2, \src3
> +        st1             {v4.d}[0], [x0], #8
> +        subs            w3, w3, #1
> +        st1             {v4.s}[2], [x0], x10
> +.endm
> +1:      calc_all4
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v8_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ldr             d16, [x1]
> +        ldr             d17, [x1, x2]
> +        add             x1, x1, x2, lsl #1
> +        ld1             {v18.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> +        ld1             {\src3\().8b}, [x1], x2
> +        movi            v4.8h, #0
> +        calc_epelb      v4, \src0, \src1, \src2, \src3
> +        subs            w3, w3, #1
> +        st1             {v4.8h}, [x0], x10
> +.endm
> +1:      calc_all4
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v12_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ldr             q16, [x1]
> +        ldr             q17, [x1, x2]
> +        add             x1, x1, x2, lsl #1
> +        ld1             {v18.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> +        ld1             {\src3\().16b}, [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        calc_epelb      v4, \src0, \src1, \src2, \src3
> +        calc_epelb2     v5, \src0, \src1, \src2, \src3
> +        str             q4, [x0]
> +        subs            w3, w3, #1
> +        str             d5, [x0, #16]
> +        add             x0, x0, x10
> +.endm
> +1:      calc_all4
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v16_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ldr             q16, [x1]
> +        ldr             q17, [x1, x2]
> +        add             x1, x1, x2, lsl #1
> +        ld1             {v18.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3
> +        ld1            {\src3\().16b}, [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        calc_epelb      v4, \src0, \src1, \src2, \src3
> +        calc_epelb2     v5, \src0, \src1, \src2, \src3
> +        subs            w3, w3, #1
> +        st1             {v4.8h, v5.8h}, [x0], x10
> +.endm
> +1:      calc_all4
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v24_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2
> +        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2
> +        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, 
> src9, src10, src11
> +        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, 
> [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        movi            v6.8h, #0
> +        calc_epelb      v4, \src0, \src3, \src6, \src9
> +        calc_epelb      v5, \src1, \src4, \src7, \src10
> +        calc_epelb      v6, \src2, \src5, \src8, \src11
> +        subs            w3, w3, #1
> +        st1             {v4.8h-v6.8h}, [x0], x10
> +.endm
> +1:      calc_all12
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v32_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #(MAX_PB_SIZE * 2)
> +        ld1             {v16.16b, v17.16b}, [x1], x2
> +        ld1             {v18.16b, v19.16b}, [x1], x2
> +        ld1             {v20.16b, v21.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7
> +        ld1             {\src6\().16b, \src7\().16b}, [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        movi            v6.8h, #0
> +        movi            v7.8h, #0
> +        calc_epelb      v4, \src0, \src2, \src4, \src6
> +        calc_epelb2     v5, \src0, \src2, \src4, \src6
> +        calc_epelb      v6, \src1, \src3, \src5, \src7
> +        calc_epelb2     v7, \src1, \src3, \src5, \src7
> +        subs            w3, w3, #1
> +        st1             {v4.8h-v7.8h}, [x0], x10
> +.endm
> +1:      calc_all8
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v48_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             x1, x1, x2
> +        mov             x10, #64
> +        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
> +        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2
> +        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, 
> src9, src10, src11
> +        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, 
> [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        movi            v6.8h, #0
> +        movi            v7.8h, #0
> +        movi            v28.8h, #0
> +        movi            v29.8h, #0
> +        calc_epelb      v4,  \src0, \src3, \src6, \src9
> +        calc_epelb2     v5,  \src0, \src3, \src6, \src9
> +        calc_epelb      v6,  \src1, \src4, \src7, \src10
> +        calc_epelb2     v7,  \src1, \src4, \src7, \src10
> +        calc_epelb      v28, \src2, \src5, \src8, \src11
> +        calc_epelb2     v29, \src2, \src5, \src8, \src11
> +        st1             {v4.8h-v7.8h}, [x0], #64
> +        subs            w3, w3, #1
> +        st1             {v28.8h-v29.8h}, [x0], x10
> +.endm
> +1:      calc_all12
> +.purgem calc
> +2:      ret
> +endfunc
> +
> +function ff_hevc_put_hevc_epel_v64_8_neon, export=1
> +        load_epel_filterb x5, x4
> +        sub             sp, sp, #32
> +        st1             {v8.8b-v11.8b}, [sp]
> +        sub             x1, x1, x2
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
> +        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
> +        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
> +.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, 
> src9, src10, src11, src12, src13, src14, src15
> +        ld1             {\src12\().16b-\src15\().16b}, [x1], x2
> +        movi            v4.8h, #0
> +        movi            v5.8h, #0
> +        movi            v6.8h, #0
> +        movi            v7.8h, #0
> +        movi            v8.8h, #0
> +        movi            v9.8h, #0
> +        movi            v10.8h, #0
> +        movi            v11.8h, #0
> +        calc_epelb      v4,  \src0, \src4, \src8,  \src12
> +        calc_epelb2     v5,  \src0, \src4, \src8,  \src12
> +        calc_epelb      v6,  \src1, \src5, \src9,  \src13
> +        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
> +        calc_epelb      v8,  \src2, \src6, \src10, \src14
> +        calc_epelb2     v9,  \src2, \src6, \src10, \src14
> +        calc_epelb      v10, \src3, \src7, \src11, \src15
> +        calc_epelb2     v11, \src3, \src7, \src11, \src15
> +        st1             {v4.8h-v7.8h}, [x0], #64
> +        subs            w3, w3, #1
> +        st1             {v8.8h-v11.8h}, [x0], #64
> +.endm
> +1:      calc_all16
> +.purgem calc
> +2:         ld1             {v8.8b-v11.8b}, [sp]
> +        add             sp, sp, #32
> +        ret
> +endfunc
> +
>  function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
>          load_epel_filterb x6, x5
>          sub             x2, x2, x3
> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c 
> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> index 4c377a7940..82e1623a67 100644
> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
> @@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
>          const uint8_t *src, ptrdiff_t srcstride,
>          int height, intptr_t mx, intptr_t my, int width),);
>  +NEON8_FNPROTO(epel_v, (int16_t *dst,
> +        const uint8_t *src, ptrdiff_t srcstride,
> +        int height, intptr_t mx, intptr_t my, int width),);
> +
>  NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>          const uint8_t *_src, ptrdiff_t _srcstride,
>          int height, intptr_t mx, intptr_t my, int width),);
> @@ -305,6 +309,7 @@ av_cold void 
> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>          c->put_hevc_qpel_bi[9][0][1]   = 
> ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>           NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
> +        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
>          NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
>          NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
>          NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
Martin Storsjö Oct. 22, 2023, 5:18 p.m. UTC | #3
On Sun, 22 Oct 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> Could you please review these patches and let me know if there are any 
> changes needed.

Did you see the message from Michael on Oct 14th? Your patches have 
corrupted whitespace and can't be applied. Earlier you've submitted some 
patches as attached files, and those have been possible to apply.

Secondly; I just pushed some indentation cleanup for aarch64 assembly 
yesterday. In case there are conflicts with your patches, please rebase 
your patches before attempting to resubmit them, so they apply cleanly.

// Martin
Logan.Lyu Oct. 26, 2023, 8:30 a.m. UTC | #4
Hi,

I'm sorry that I missed the message from Michael on Oct 14th due to my 
negligence.

And I missed submitting a commit that was earlier than these four 
commits, which caused the corrupted whitespace problem. Now I have 
recreated these patches.

In addition, I rebased it to ensure that these patches can be 
successfully applied on the latest master branch.

Please check again, thank you.


在 2023/10/23 1:18, Martin Storsjö 写道:
> On Sun, 22 Oct 2023, Logan.Lyu wrote:
>
>> Hi, Martin,
>>
>> Could you please review these patches and let me know if there are 
>> any changes needed.
>
> Did you see the message from Michael on Oct 14th? Your patches have 
> corrupted whitespace and can't be applied. Earlier you've submitted 
> some patches as attached files, and those have been possible to apply.
>
> Secondly; I just pushed some indentation cleanup for aarch64 assembly 
> yesterday. In case there are conflicts with your patches, please 
> rebase your patches before attempting to resubmit them, so they apply 
> cleanly.
>
> // Martin
>
From 443447657b8ea8684ab2687789b7f77845c83f3f Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:15:24 +0800
Subject: [PATCH 2/5] lavc/aarch64: new optimization for 8-bit hevc_epel_v

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0705213eed..363750ee7f 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
         ret
 endfunc
 
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1      // 4-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = post-increment applied to dst after each stored row
+        ldr             s16, [x1]                       // v16 = row 0 (4 bytes)
+        ldr             s17, [x1 ,x2]                   // v17 = row 1
+        add             x1, x1, x2, lsl #1              // src += 2 * srcstride
+        ld1             {v18.s}[0], [x1], x2            // v18 lane 0 = row 2, then src += srcstride
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2       // fetch the newest row into lane 0 of the 4th operand, src += srcstride
+        movi            v4.8h, #0                       // clear v4 first (presumably calc_epelb accumulates into its first operand -- confirm)
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // filter across the four rows (macro defined elsewhere)
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all4
+        st1             {v4.4h}, [x0], x10              // store 4 x int16, then dst += x10
+.endm
+1:      calc_all4                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1      // 6-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)     // second store of each row advances dst by this; with the #8 of the first store the row step is MAX_PB_SIZE * 2
+        ldr             d16, [x1]                       // v16 = row 0 (8 bytes loaded)
+        ldr             d17, [x1, x2]                   // v17 = row 1
+        add             x1, x1, x2, lsl #1              // src += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2              // v18 = row 2, then src += srcstride
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2         // fetch the newest row into the 4th operand, src += srcstride
+        movi            v4.8h, #0                       // clear v4 first (presumably calc_epelb accumulates into its first operand -- confirm)
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // filter across the four rows (macro defined elsewhere)
+        st1             {v4.d}[0], [x0], #8             // store the first 4 x int16, dst += 8
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all4
+        st1             {v4.s}[2], [x0], x10            // store int16 results 4 and 5, then dst += x10
+.endm
+1:      calc_all4                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1      // 8-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = post-increment applied to dst after each stored row
+        ldr             d16, [x1]                       // v16 = row 0 (8 bytes)
+        ldr             d17, [x1, x2]                   // v17 = row 1
+        add             x1, x1, x2, lsl #1              // src += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2              // v18 = row 2, then src += srcstride
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2         // fetch the newest row into the 4th operand, src += srcstride
+        movi            v4.8h, #0                       // clear v4 first (presumably calc_epelb accumulates into its first operand -- confirm)
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // filter across the four rows (macro defined elsewhere)
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all4
+        st1             {v4.8h}, [x0], x10              // store 8 x int16, then dst += x10
+.endm
+1:      calc_all4                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1     // 12-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = step added to dst after each row
+        ldr             q16, [x1]                       // v16 = row 0; a full 16 bytes are read although only 12 results are stored per row
+        ldr             q17, [x1, x2]                   // v17 = row 1
+        add             x1, x1, x2, lsl #1              // src += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2             // v18 = row 2, then src += srcstride
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2        // fetch the newest row into the 4th operand, src += srcstride
+        movi            v4.8h, #0                       // clear both result registers first
+        movi            v5.8h, #0                       // (presumably the calc_epelb macros accumulate into their first operand -- confirm)
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // results for the first 8 columns (macro defined elsewhere)
+        calc_epelb2     v5, \src0, \src1, \src2, \src3  // presumably the same filter on the upper 8 source bytes -- confirm
+        str             q4, [x0]                        // store int16 results 0-7 (16 bytes)
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all4
+        str             d5, [x0, #16]                   // store int16 results 8-11 (low 8 bytes of v5)
+        add             x0, x0, x10                     // dst += x10; add does not alter the flags set by subs
+.endm
+1:      calc_all4                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1     // 16-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = post-increment applied to dst after each stored row
+        ldr             q16, [x1]                       // v16 = row 0 (16 bytes)
+        ldr             q17, [x1, x2]                   // v17 = row 1
+        add             x1, x1, x2, lsl #1              // src += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2             // v18 = row 2, then src += srcstride
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2        // fetch the newest row into the 4th operand, src += srcstride
+        movi            v4.8h, #0                       // clear both result registers first
+        movi            v5.8h, #0                       // (presumably the calc_epelb macros accumulate into their first operand -- confirm)
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // results for the first 8 columns (macro defined elsewhere)
+        calc_epelb2     v5, \src0, \src1, \src2, \src3  // presumably the same filter on the upper 8 source bytes -- confirm
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all4
+        st1             {v4.8h, v5.8h}, [x0], x10       // store 16 x int16, then dst += x10
+.endm
+1:      calc_all4                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1     // 24-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = post-increment applied to dst after each stored row
+        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2      // row 0 as three 8-byte column groups, src += srcstride
+        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2      // row 1
+        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2      // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2     // fetch the newest row into the last three operands
+        movi            v4.8h, #0                       // clear the three result registers first
+        movi            v5.8h, #0                       // (presumably calc_epelb accumulates into its first operand -- confirm)
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9  // columns 0-7: every third operand belongs to the same column group
+        calc_epelb      v5, \src1, \src4, \src7, \src10 // columns 8-15
+        calc_epelb      v6, \src2, \src5, \src8, \src11 // columns 16-23
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all12
+        st1             {v4.8h-v6.8h}, [x0], x10        // store 24 x int16, then dst += x10
+.endm
+1:      calc_all12                                      // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1     // 32-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = post-increment applied to dst after each stored row
+        ld1             {v16.16b, v17.16b}, [x1], x2    // row 0 as two 16-byte halves, src += srcstride
+        ld1             {v18.16b, v19.16b}, [x1], x2    // row 1
+        ld1             {v20.16b, v21.16b}, [x1], x2    // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x1], x2  // fetch the newest row into the last two operands
+        movi            v4.8h, #0                       // clear the four result registers first
+        movi            v5.8h, #0                       // (presumably the calc_epelb macros accumulate into their first operand -- confirm)
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6  // first 16-byte half of each row: even-numbered operands
+        calc_epelb2     v5, \src0, \src2, \src4, \src6  // presumably the upper 8 bytes of the same registers -- confirm
+        calc_epelb      v6, \src1, \src3, \src5, \src7  // second 16-byte half of each row: odd-numbered operands
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all8
+        st1             {v4.8h-v7.8h}, [x0], x10        // store 32 x int16 (64 bytes), then dst += x10
+.endm
+1:      calc_all8                                       // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1     // 48-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        mov             x10, #(MAX_PB_SIZE * 2 - 64)    // rest of the MAX_PB_SIZE * 2 row step after the first store has already advanced dst by 64
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2   // row 0 as three 16-byte column groups, src += srcstride
+        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2   // row 1
+        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2   // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2  // fetch the newest row into the last three operands
+        movi            v4.8h, #0                       // clear the six result registers first
+        movi            v5.8h, #0                       // (presumably the calc_epelb macros accumulate into their first operand -- confirm)
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  \src0, \src3, \src6, \src9         // first 16-byte column group: every third operand
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9         // presumably the upper 8 bytes of the same registers -- confirm
+        calc_epelb      v6,  \src1, \src4, \src7, \src10        // second 16-byte column group
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11        // third 16-byte column group
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        st1             {v4.8h-v7.8h}, [x0], #64        // store int16 results 0-31, dst += 64
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all12
+        st1             {v28.8h-v29.8h}, [x0], x10      // store int16 results 32-47, then dst += x10 to finish the row step
+.endm
+1:      calc_all12                                      // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1     // 64-pixel-wide vertical EPEL put; per the epel_v prototype in this patch: x0=dst x1=src x2=srcstride w3=height x4=mx x5=my
+        load_epel_filterb x5, x4                        // macro defined elsewhere; presumably loads the taps selected by my (x5) with x4 as scratch -- confirm
+        // v8-v11 are used as result registers below; AAPCS64 makes their low 64 bits callee-saved, so spill d8-d11.
+        sub             sp, sp, #32                     // reserve 32 bytes; sp stays 16-byte aligned
+        st1             {v8.8b-v11.8b}, [sp]            // save the low 64 bits of v8-v11
+        sub             x1, x1, x2                      // src -= srcstride: the first row read is one above the current row
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2  // row 0 as four 16-byte column groups, src += srcstride
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2  // row 1
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2  // row 2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b-\src15\().16b}, [x1], x2         // fetch the newest row into the last four operands
+        movi            v4.8h, #0                       // clear the eight result registers first
+        movi            v5.8h, #0                       // (presumably the calc_epelb macros accumulate into their first operand -- confirm)
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12       // first 16-byte column group: every fourth operand
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12       // presumably the upper 8 bytes of the same registers -- confirm
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13       // second 16-byte column group
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
+        calc_epelb      v8,  \src2, \src6, \src10, \src14       // third 16-byte column group
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14
+        calc_epelb      v10, \src3, \src7, \src11, \src15       // fourth 16-byte column group
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        st1             {v4.8h-v7.8h}, [x0], #64        // store int16 results 0-31, dst += 64
+        subs            w3, w3, #1                      // height--; sets flags, presumably tested inside calc_all16
+        st1             {v8.8h-v11.8h}, [x0], #64       // store int16 results 32-63, dst += 64 (128 bytes per row in total)
+.endm
+1:      calc_all16                                      // macro defined elsewhere; presumably expands calc with rotated row registers, looping to 1b and exiting to 2f -- confirm
+.purgem calc                                            // remove calc so the next function can define its own
+2:      ld1             {v8.8b-v11.8b}, [sp], #32       // restore d8-d11 and release the save area with one post-indexed load
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
         load_epel_filterb x6, x5
         sub             x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c203d65d34..42aa76ddde 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
Martin Storsjö Oct. 31, 2023, 12:17 p.m. UTC | #5
On Thu, 26 Oct 2023, Logan.Lyu wrote:

> And I missed submitting a commit that was earlier than these four commits, 
> which caused the corrupted whitespace problem. Now I have recreated these 
> patches.
>
> In addition, I rebased it to ensure that these patches can be successfully 
> applied on the latest master branch.
>
> Please check again, thank you.

Thanks, now these were possible to apply, and they looked mostly ok, so I 
touched up the last details I noticed and pushed them.

Things I noticed and fixed before pushing:

A bunch of minor cosmetics, you had minor misindentations in a few places 
(that were copypasted around in lots of places), that I fixed like this:

          ld1             {v18.16b}, [x1], x2
  .macro calc src0, src1, src2, src3
-        ld1            {\src3\().16b}, [x1], x2
+        ld1             {\src3\().16b}, [x1], x2
          movi            v4.8h, #0
          movi            v5.8h, #0
          calc_epelb      v4, \src0, \src1, \src2, \src3
@@ -461,7 +461,7 @@ function ff_hevc_put_hevc_epel_v64_8_neon, export=1
  .endm
  1:      calc_all16
  .purgem calc
-2:             ld1             {v8.8b-v11.8b}, [sp]
+2:      ld1             {v8.8b-v11.8b}, [sp]
          add             sp, sp, #32
          ret

The first patch, with mostly small trivial functions, can probably be 
scheduled better for in-order cores. I'll send a patch if I can make them 
measurably faster.

In almost every patch, you have loads/stores to the stack; you use the 
fused stack decrement nicely everywhere possible, but for the loading, 
you're almost always lacking the fused stack increment. I've fixed it now 
for this patchset, but please do keep this in mind and fix it up before 
submitting any further patches. I've fixed that up like this:

          bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
-        ldp             x5, x30, [sp]
          ldp             x0, x3, [sp, #16]
-        add             sp, sp, #32
+        ldp             x5, x30, [sp], #32
          load_epel_filterh x5, x4

(In many places.)

In one place, you wrote below the stack pointer before decrementing it. 
That's ok on OSes with a defined red zone, but we shouldn't need to assume 
that; I've fixed that like this:

  function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
-        stp             x5, x30, [sp, #-16]
-        stp             x0, x1, [sp, #-32]
          stp             x2, x3, [sp, #-48]!
+        stp             x0, x1, [sp, #16]
+        stp             x5, x30, [sp, #32]

I'll push the patchset with these changes soon.


// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@  function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
          ret
  endfunc
  +
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1     // vertical EPEL filter, 4 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ldr             s16, [x1]                       // row 0: 4 source bytes
+        ldr             s17, [x1, x2]                   // row 1
+        add             x1, x1, x2, lsl #1              // x1 += 2 * srcstride
+        ld1             {v18.s}[0], [x1], x2            // row 2 into lane 0, then step x1 to row 3
+.macro calc src0, src1, src2, src3                      // emits one output row from four source-row registers
+        ld1             {\src3\().s}[0], [x1], x2       // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the result register before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // defined elsewhere; presumably accumulates the 4-tap filter into v4 - confirm
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all4
+        st1             {v4.4h}, [x0], x10              // store 4 x 16-bit results, advance dst by x10
+.endm
+1:      calc_all4                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1     // vertical EPEL filter, 6 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)     // dst row step minus the 8 bytes already added by the first store of each row
+        ldr             d16, [x1]                       // row 0: reads 8 source bytes (only 6 results are stored per row)
+        ldr             d17, [x1, x2]                   // row 1
+        add             x1, x1, x2, lsl #1              // x1 += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2              // row 2, then step x1 to row 3
+.macro calc src0, src1, src2, src3                      // emits one output row from four source-row registers
+        ld1             {\src3\().8b}, [x1], x2         // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the result register before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // defined elsewhere; presumably accumulates the 4-tap filter into v4 - confirm
+        st1             {v4.d}[0], [x0], #8             // store results 0..3 (4 x 16-bit), dst += 8
+        subs            w3, w3, #1                      // height--; placed between the two stores, flags presumably consumed by calc_all4
+        st1             {v4.s}[2], [x0], x10            // store results 4..5 (2 x 16-bit), dst += x10 to reach the next row
+.endm
+1:      calc_all4                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1     // vertical EPEL filter, 8 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ldr             d16, [x1]                       // row 0: 8 source bytes
+        ldr             d17, [x1, x2]                   // row 1
+        add             x1, x1, x2, lsl #1              // x1 += 2 * srcstride
+        ld1             {v18.8b}, [x1], x2              // row 2, then step x1 to row 3
+.macro calc src0, src1, src2, src3                      // emits one output row from four source-row registers
+        ld1             {\src3\().8b}, [x1], x2         // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the result register before calc_epelb
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // defined elsewhere; presumably accumulates the 4-tap filter into v4 - confirm
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all4
+        st1             {v4.8h}, [x0], x10              // store 8 x 16-bit results, advance dst by x10
+.endm
+1:      calc_all4                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1    // vertical EPEL filter, 12 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ldr             q16, [x1]                       // row 0: reads 16 source bytes (only 12 results are stored per row)
+        ldr             q17, [x1, x2]                   // row 1
+        add             x1, x1, x2, lsl #1              // x1 += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2             // row 2, then step x1 to row 3
+.macro calc src0, src1, src2, src3                      // emits one output row from four source-row registers
+        ld1             {\src3\().16b}, [x1], x2        // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear both result registers
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // defined elsewhere; presumably filters the low 8 bytes of each row into v4 - confirm
+        calc_epelb2     v5, \src0, \src1, \src2, \src3  // defined elsewhere; presumably filters the high 8 bytes of each row into v5 - confirm
+        str             q4, [x0]                        // store the 8 x 16-bit values held in v4
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all4
+        str             d5, [x0, #16]                   // store the low 4 x 16-bit values of v5 right after them
+        add             x0, x0, x10                     // dst += x10 to reach the next row
+.endm
+1:      calc_all4                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1    // vertical EPEL filter, 16 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ldr             q16, [x1]                       // row 0: 16 source bytes
+        ldr             q17, [x1, x2]                   // row 1
+        add             x1, x1, x2, lsl #1              // x1 += 2 * srcstride
+        ld1             {v18.16b}, [x1], x2             // row 2, then step x1 to row 3
+.macro calc src0, src1, src2, src3                      // emits one output row from four source-row registers
+        ld1             {\src3\().16b}, [x1], x2        // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear both result registers
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3  // defined elsewhere; presumably filters the low 8 bytes of each row into v4 - confirm
+        calc_epelb2     v5, \src0, \src1, \src2, \src3  // defined elsewhere; presumably filters the high 8 bytes of each row into v5 - confirm
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all4
+        st1             {v4.8h, v5.8h}, [x0], x10       // store 16 x 16-bit results, advance dst by x10
+.endm
+1:      calc_all4                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1    // vertical EPEL filter, 24 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2      // row 0: three 8-byte column groups
+        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2      // row 1
+        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2      // row 2; x1 now points at row 3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2     // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the three result registers
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9          // column group 0 across the four rows (calc_epelb defined elsewhere)
+        calc_epelb      v5, \src1, \src4, \src7, \src10         // column group 1
+        calc_epelb      v6, \src2, \src5, \src8, \src11         // column group 2
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all12
+        st1             {v4.8h-v6.8h}, [x0], x10        // store 24 x 16-bit results, advance dst by x10
+.endm
+1:      calc_all12                                      // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1    // vertical EPEL filter, 32 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2)         // x10 = byte step applied to dst after each stored row
+        ld1             {v16.16b, v17.16b}, [x1], x2    // row 0: two 16-byte column groups
+        ld1             {v18.16b, v19.16b}, [x1], x2    // row 1
+        ld1             {v20.16b, v21.16b}, [x1], x2    // row 2; x1 now points at row 3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x1], x2  // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the four result registers
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6  // first 16-byte group; calc_epelb/calc_epelb2 presumably take its low/high 8 bytes - confirm
+        calc_epelb2     v5, \src0, \src2, \src4, \src6
+        calc_epelb      v6, \src1, \src3, \src5, \src7  // second 16-byte group
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        subs            w3, w3, #1                      // height--; flags presumably consumed by calc_all8
+        st1             {v4.8h-v7.8h}, [x0], x10        // store 32 x 16-bit results, advance dst by x10
+.endm
+1:      calc_all8                                       // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1    // vertical EPEL filter, 48 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        mov             x10, #(MAX_PB_SIZE * 2 - 64)    // dst row step minus the 64 bytes already added by the first store of each row
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2   // row 0: three 16-byte column groups
+        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2   // row 1
+        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2   // row 2; x1 now points at row 3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2  // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the six result registers
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  \src0, \src3, \src6, \src9         // column group 0; calc_epelb/calc_epelb2 presumably take its low/high 8 bytes - confirm
+        calc_epelb2     v5,  \src0, \src3, \src6, \src9
+        calc_epelb      v6,  \src1, \src4, \src7, \src10        // column group 1
+        calc_epelb2     v7,  \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11        // column group 2
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        st1             {v4.8h-v7.8h}, [x0], #64        // store results 0..31, dst += 64
+        subs            w3, w3, #1                      // height--; placed between the two stores, flags presumably consumed by calc_all12
+        st1             {v28.8h-v29.8h}, [x0], x10      // store results 32..47, dst += x10 to reach the next row
+.endm
+1:      calc_all12                                      // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1    // vertical EPEL filter, 64 columns; args per the epel_v prototype: x0=dst, x1=src, x2=srcstride, w3=height, x5=my
+        load_epel_filterb x5, x4                        // filter setup macro (defined elsewhere); presumably selects taps by x5 and uses x4 as scratch - confirm
+        sub             sp, sp, #32                     // reserve 32 bytes of stack
+        st1             {v8.8b-v11.8b}, [sp]            // save the low 64 bits of v8-v11 (callee-saved under AAPCS64); they are used as result registers below
+        sub             x1, x1, x2                      // x1 = src - srcstride: start one row above the block
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2  // row 0: four 16-byte column groups
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2  // row 1
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2  // row 2; x1 now points at row 3
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b-\src15\().16b}, [x1], x2 // fetch the newest row and advance src by one row
+        movi            v4.8h, #0                       // clear the eight result registers
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  \src0, \src4, \src8,  \src12       // column group 0; calc_epelb/calc_epelb2 presumably take its low/high 8 bytes - confirm
+        calc_epelb2     v5,  \src0, \src4, \src8,  \src12
+        calc_epelb      v6,  \src1, \src5, \src9,  \src13       // column group 1
+        calc_epelb2     v7,  \src1, \src5, \src9,  \src13
+        calc_epelb      v8,  \src2, \src6, \src10, \src14       // column group 2
+        calc_epelb2     v9,  \src2, \src6, \src10, \src14
+        calc_epelb      v10, \src3, \src7, \src11, \src15       // column group 3
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        st1             {v4.8h-v7.8h}, [x0], #64        // store results 0..31, dst += 64
+        subs            w3, w3, #1                      // height--; placed between the two stores, flags presumably consumed by calc_all16
+        st1             {v8.8h-v11.8h}, [x0], #64       // store results 32..63, dst += 64 (128 bytes written per row in total)
+.endm
+1:      calc_all16                                      // defined elsewhere; presumably repeats calc with rotated row registers and exits to 2f - confirm
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp]            // restore the saved low halves of v8-v11
+        add             sp, sp, #32                     // release the 32-byte save area
+        ret
+endfunc
+
  function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
          load_epel_filterb x6, x5
          sub             x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@  NEON8_FNPROTO(pel_pixels, (int16_t *dst,
          const uint8_t *src, ptrdiff_t srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
  +NEON8_FNPROTO(epel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),); /* NOTE(review): presumably declares the ff_hevc_put_hevc_epel_v*_8_neon prototypes with this argument list - confirm against the NEON8_FNPROTO definition */
+
  NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
          const uint8_t *_src, ptrdiff_t _srcstride,
          int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext