Message ID | 646d7216-e68d-4a49-821b-f358337797ef@myais.com.cn
---|---
State | New
Series | [FFmpeg-devel,1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

Context | Check | Description
---|---|---
andriy/configure_x86 | warning | Failed to apply patch
On Sat, Oct 14, 2023 at 04:45:39PM +0800, Logan.Lyu wrote:
[...]
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1
>          ret
>  endfunc
> +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> +        load_epel_filterb x5, x4

This is not a valid diff, some whitespaces and newlines here are not as they should be

thx

[...]
Hi, Martin,

Could you please review these patches and let me know if there are any changes needed.

Thanks.

Logan Lyu

On 2023/10/14 16:45, Logan.Lyu wrote:
[...]
On Sun, 22 Oct 2023, Logan.Lyu wrote:

> Hi, Martin,
>
> Could you please review these patches and let me know if there are any
> changes needed.

Did you see the message from Michael on Oct 14th? Your patches have corrupted whitespace and can't be applied. Earlier you've submitted some patches as attached files, and those have been possible to apply.

Secondly; I just pushed some indentation cleanup for aarch64 assembly yesterday. In case there are conflicts with your patches, please rebase your patches before attempting to resubmit them, so they apply cleanly.

// Martin
Hi,

I'm sorry that I missed the message from Michael on Oct 14th due to my negligence. And I missed submitting a commit that was earlier than these four commits, which caused the corrupted whitespace problem. Now I have recreated these patches.

In addition, I rebased it to ensure that these patches can be successfully applied on the latest master branch.

Please check again, thank you.

On 2023/10/23 1:18, Martin Storsjö wrote:
> On Sun, 22 Oct 2023, Logan.Lyu wrote:
>
>> Hi, Martin,
>>
>> Could you please review these patches and let me know if there are
>> any changes needed.
>
> Did you see the message from Michael on Oct 14th? Your patches have
> corrupted whitespace and can't be applied. Earlier you've submitted
> some patches as attached files, and those have been possible to apply.
>
> Secondly; I just pushed some indentation cleanup for aarch64 assembly
> yesterday. In case there are conflicts with your patches, please
> rebase your patches before attempting to resubmit them, so they apply
> cleanly.
>
> // Martin

From 443447657b8ea8684ab2687789b7f77845c83f3f Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu@myais.com.cn>
Date: Thu, 26 Oct 2023 09:15:24 +0800
Subject: [PATCH 2/5] lavc/aarch64: new optimization for 8-bit hevc_epel_v

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0705213eed..363750ee7f 100644
[...]

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c203d65d34..42aa76ddde 100644
[...]
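For readers not familiar with the HEVC DSP code: the put_hevc_epel_v functions above apply the HEVC 4-tap chroma ("epel") interpolation filter in the vertical direction and store the unrounded 16-bit intermediate results into a MAX_PB_SIZE-strided buffer; for 8-bit input the template's final ">> (BIT_DEPTH - 8)" is a no-op. The following is only a rough scalar sketch of that behaviour under those assumptions, not the actual FFmpeg template code; the filter table and the simplified signature (mx dropped, since the path is vertical-only) are illustrative.

#include <stdint.h>
#include <stddef.h>

#define MAX_PB_SIZE 64

/* HEVC 4-tap chroma (epel) filters, indexed by the fractional offset my = 1..7. */
static const int8_t epel_filters[7][4] = {
    { -2, 58, 10, -2 },
    { -4, 54, 16, -2 },
    { -6, 46, 28, -4 },
    { -4, 36, 36, -4 },
    { -4, 28, 46, -6 },
    { -2, 16, 54, -4 },
    { -2, 10, 58, -2 },
};

/* Scalar model of the 8-bit put_hevc_epel_v path: a 4-tap vertical filter
 * whose 16-bit intermediates are written to a MAX_PB_SIZE-strided buffer. */
static void epel_v_8_ref(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride,
                         int height, intptr_t my, int width)
{
    const int8_t *f = epel_filters[my - 1];

    src -= srcstride;                 /* start one row above, as "sub x1, x1, x2" does */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = f[0] * src[x] +
                     f[1] * src[x +     srcstride] +
                     f[2] * src[x + 2 * srcstride] +
                     f[3] * src[x + 3 * srcstride];
        src += srcstride;             /* slide the 4-row window down by one row */
        dst += MAX_PB_SIZE;           /* destination stride is MAX_PB_SIZE int16_t */
    }
}

The calc_all4/calc_all8/... macros in the assembly implement the same sliding window by rotating which NEON registers play the role of the four source rows, so only one new row is loaded per output row.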
On Thu, 26 Oct 2023, Logan.Lyu wrote:

> And I missed submitting a commit that was earlier than these four commits,
> which caused the corrupted whitespace problem. Now I have recreated these
> patches.
>
> In addition, I rebased it to ensure that these patches can be successfully
> applied on the latest master branch.
>
> Please check again, thank you.

Thanks, now these were possible to apply, and they looked mostly ok, so I touched up the last details I noticed and pushed them.

Things I noticed and fixed before pushing:

A bunch of minor cosmetics: you had minor misindentations in a few places (that were copypasted around in lots of places), which I fixed like this:

         ld1             {v18.16b}, [x1], x2
 .macro calc src0, src1, src2, src3
-        ld1            {\src3\().16b}, [x1], x2
+        ld1             {\src3\().16b}, [x1], x2
         movi            v4.8h, #0
         movi            v5.8h, #0
         calc_epelb      v4, \src0, \src1, \src2, \src3
@@ -461,7 +461,7 @@ function ff_hevc_put_hevc_epel_v64_8_neon, export=1
 .endm
 1:      calc_all16
 .purgem calc
-2:      ld1            {v8.8b-v11.8b}, [sp]
+2:      ld1             {v8.8b-v11.8b}, [sp]
         add             sp, sp, #32
         ret

The first patch, with mostly small trivial functions, can probably be scheduled better for in-order cores. I'll send a patch if I can make them measurably faster.

In almost every patch, you have loads/stores to the stack; you use the fused stack decrement nicely everywhere possible, but for the loading, you're almost always lacking the fused stack increment. I've fixed it now for this patchset, but please do keep this in mind and fix it up before submitting any further patches. I've fixed that up like this:

         bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
-        ldp             x5, x30, [sp]
         ldp             x0, x3, [sp, #16]
-        add             sp, sp, #32
+        ldp             x5, x30, [sp], #32
         load_epel_filterh x5, x4

(In many places.)

In one place, you wrote below the stack pointer before decrementing it. That's ok on OSes with a defined red zone, but we shouldn't need to assume that; I've fixed that like this:

 function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
-        stp             x5, x30, [sp, #-16]
-        stp             x0, x1, [sp, #-32]
         stp             x2, x3, [sp, #-48]!
+        stp             x0, x1, [sp, #16]
+        stp             x5, x30, [sp, #32]

I'll push the patchset with these changes soon.

// Martin
checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker <jdek@itanimul.li>
Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
         ret
 endfunc
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             s16, [x1]
+        ldr             s17, [x1 ,x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1
+        st1             {v4.4h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        st1             {v4.d}[0], [x0], #8
+        subs            w3, w3, #1
+        st1             {v4.s}[2], [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             d16, [x1]
+        ldr             d17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1
+        st1             {v4.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        str             q4, [x0]
+        subs            w3, w3, #1
+        str             d5, [x0, #16]
+        add             x0, x0, x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ldr             q16, [x1]
+        ldr             q17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ld1             {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+        ld1             {\src3\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, \src0, \src1, \src2, \src3
+        calc_epelb2     v5, \src0, \src1, \src2, \src3
+        subs            w3, w3, #1
+        st1             {v4.8h, v5.8h}, [x0], x10
+.endm
+1:      calc_all4
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8b, v17.8b, v18.8b}, [x1], x2
+        ld1             {v19.8b, v20.8b, v21.8b}, [x1], x2
+        ld1             {v22.8b, v23.8b, v24.8b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().8b, \src10\().8b, \src11\().8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9
+        calc_epelb      v5, \src1, \src4, \src7, \src10
+        calc_epelb      v6, \src2, \src5, \src8, \src11
+        subs            w3, w3, #1
+        st1             {v4.8h-v6.8h}, [x0], x10
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ld1             {v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\src6\().16b, \src7\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, \src0, \src2, \src4, \src6
+        calc_epelb2     v5, \src0, \src2, \src4, \src6
+        calc_epelb      v6, \src1, \src3, \src5, \src7
+        calc_epelb2     v7, \src1, \src3, \src5, \src7
+        subs            w3, w3, #1
+        st1             {v4.8h-v7.8h}, [x0], x10
+.endm
+1:      calc_all8
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #64
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
+        ld1             {v19.16b, v20.16b, v21.16b}, [x1], x2
+        ld1             {v22.16b, v23.16b, v24.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11
+        ld1             {\src9\().16b, \src10\().16b, \src11\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4, \src0, \src3, \src6, \src9
+        calc_epelb2     v5, \src0, \src3, \src6, \src9
+        calc_epelb      v6, \src1, \src4, \src7, \src10
+        calc_epelb2     v7, \src1, \src4, \src7, \src10
+        calc_epelb      v28, \src2, \src5, \src8, \src11
+        calc_epelb2     v29, \src2, \src5, \src8, \src11
+        st1             {v4.8h-v7.8h}, [x0], #64
+        subs            w3, w3, #1
+        st1             {v28.8h-v29.8h}, [x0], x10
+.endm
+1:      calc_all12
+.purgem calc
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             sp, sp, #32
+        st1             {v8.8b-v11.8b}, [sp]
+        sub             x1, x1, x2
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+.macro calc src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15
+        ld1             {\src12\().16b-\src15\().16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4, \src0, \src4, \src8, \src12
+        calc_epelb2     v5, \src0, \src4, \src8, \src12
+        calc_epelb      v6, \src1, \src5, \src9, \src13
+        calc_epelb2     v7, \src1, \src5, \src9, \src13
+        calc_epelb      v8, \src2, \src6, \src10, \src14
+        calc_epelb2     v9, \src2, \src6, \src10, \src14
+        calc_epelb      v10, \src3, \src7, \src11, \src15
+        calc_epelb2     v11, \src3, \src7, \src11, \src15
+        st1             {v4.8h-v7.8h}, [x0], #64
+        subs            w3, w3, #1
+        st1             {v8.8h-v11.8h}, [x0], #64
+.endm
+1:      calc_all16
+.purgem calc
+2:      ld1             {v8.8b-v11.8b}, [sp]
+        add             sp, sp, #32
+        ret
+endfunc
+
 function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
         load_epel_filterb x6, x5
         sub             x2, x2, x3
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c377a7940..82e1623a67 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -156,6 +156,10 @@ NEON8_FNPROTO(pel_pixels, (int16_t *dst,
         const uint8_t *src, ptrdiff_t srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, intptr_t mx, intptr_t my, int width),);
@@ -305,6 +309,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
         NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,);
         NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,);
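For context on the single NEON8_FNASSIGN line added to the init table: it registers one function pointer per supported block width in the vertical-only epel slot. Spelled out by hand, its effect is roughly the following; the width-index mapping and the hand-written assignments are an assumption for illustration, not the actual macro from hevcdsp_init_aarch64.c.

/* Illustrative hand-expansion of NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v,).
 * The first index is assumed to be the block-width index for widths
 * 4, 6, 8, 12, 16, 24, 32, 48, 64; [1][0] selects the vertical-only
 * (my != 0, mx == 0) slot, which is why only the epel_v functions are
 * registered by this line. */
c->put_hevc_epel[1][1][0] = ff_hevc_put_hevc_epel_v4_8_neon;
c->put_hevc_epel[2][1][0] = ff_hevc_put_hevc_epel_v6_8_neon;
c->put_hevc_epel[3][1][0] = ff_hevc_put_hevc_epel_v8_8_neon;
c->put_hevc_epel[4][1][0] = ff_hevc_put_hevc_epel_v12_8_neon;
c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_8_neon;
c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_8_neon;
c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_neon;
c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_neon;
c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_neon;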