Message ID | 20230604041756.5196-1-Logan.Lyu@myais.com.cn |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels | expand |
Context | Check | Description |
---|---|---|
andriy/configure_x86 | warning | Failed to apply patch |
yinshiyou/configure_loongarch64 | warning | Failed to apply patch |
On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote: > From: Logan Lyu <Logan.Lyu@myais.com.cn> > > Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> > --- > libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 ++ > libavcodec/aarch64/hevcdsp_qpel_neon.S | 104 ++++++++++++++++++++++ > 2 files changed, 109 insertions(+) > > diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c > index 483a9d5253..5a1d520eec 100644 > --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c > +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c > @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co > void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \ > void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \ > > +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride, > + const uint8_t *_src, ptrdiff_t _srcstride, > + int height, intptr_t mx, intptr_t my, int width),); > > NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, > const uint8_t *_src, ptrdiff_t _srcstride, > @@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) > c->put_hevc_qpel_bi[8][0][1] = > c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon; > > + NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); > + NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); > NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); > NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); > NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); > diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S > index ed659cfe9b..6ca05b7201 100644 > --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S > +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S > @@ -490,6 +490,110 @@ put_hevc qpel > put_hevc qpel_uni > put_hevc qpel_bi > > +function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 > +1: > + ldr s0, [x2] > + ldr 
s1, [x2, x3] > + add x2, x2, x3, lsl #1 > + str s0, [x0] > + str s1, [x0, x1] > + add x0, x0, x1, lsl #1 > + subs w4, w4, #2 > + b.hi 1b > + ret > +endfunc In a loop like this, I would recommend moving the "subs" instruction further away from the branch that depends on it. For cores with in-order execution, it does matter a fair bit, while it probably doesn't for cores with out-of-order execution. Here, the ideal location probably is after the two loads at the start. The same thing goes for all the other functions in this patch. Other than that, this looks ok. // Martin
Hi, Martin, I modified it according to your comments. Please review again. And here are the checkasm benchmark results of the related functions: The platform I tested is the g8y instance of Alibaba Cloud, with a chip based on armv9. put_hevc_pel_uni_pixels4_8_c: 35.9 put_hevc_pel_uni_pixels4_8_neon: 7.6 put_hevc_pel_uni_pixels6_8_c: 46.1 put_hevc_pel_uni_pixels6_8_neon: 20.6 put_hevc_pel_uni_pixels8_8_c: 53.4 put_hevc_pel_uni_pixels8_8_neon: 11.6 put_hevc_pel_uni_pixels12_8_c: 89.1 put_hevc_pel_uni_pixels12_8_neon: 25.9 put_hevc_pel_uni_pixels16_8_c: 106.4 put_hevc_pel_uni_pixels16_8_neon: 20.4 put_hevc_pel_uni_pixels24_8_c: 137.6 put_hevc_pel_uni_pixels24_8_neon: 47.1 put_hevc_pel_uni_pixels32_8_c: 173.6 put_hevc_pel_uni_pixels32_8_neon: 54.1 put_hevc_pel_uni_pixels48_8_c: 268.1 put_hevc_pel_uni_pixels48_8_neon: 117.1 put_hevc_pel_uni_pixels64_8_c: 346.1 put_hevc_pel_uni_pixels64_8_neon: 205.9 在 2023/6/12 15:47, Martin Storsjö 写道: > On Sun, 4 Jun 2023, Logan.Lyu@myais.com.cn wrote: > >> From: Logan Lyu <Logan.Lyu@myais.com.cn> >> >> Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> >> --- >> libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 ++ >> libavcodec/aarch64/hevcdsp_qpel_neon.S | 104 ++++++++++++++++++++++ >> 2 files changed, 109 insertions(+) >> >> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c >> b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> index 483a9d5253..5a1d520eec 100644 >> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c >> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c >> @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t >> *_dst, ptrdiff_t _dststride, co >> void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \ >> void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \ >> >> +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride, >> + const uint8_t *_src, ptrdiff_t _srcstride, >> + int height, intptr_t mx, intptr_t my, int width),); >> >> NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, >> const 
uint8_t *_src, ptrdiff_t _srcstride, >> @@ -263,6 +266,8 @@ av_cold void >> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) >> c->put_hevc_qpel_bi[8][0][1] = >> c->put_hevc_qpel_bi[9][0][1] = >> ff_hevc_put_hevc_qpel_bi_h16_8_neon; >> >> + NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); >> + NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); >> NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); >> NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); >> NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, >> qpel_uni_w_v,); >> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S >> b/libavcodec/aarch64/hevcdsp_qpel_neon.S >> index ed659cfe9b..6ca05b7201 100644 >> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S >> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S >> @@ -490,6 +490,110 @@ put_hevc qpel >> put_hevc qpel_uni >> put_hevc qpel_bi >> >> +function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 >> +1: >> + ldr s0, [x2] >> + ldr s1, [x2, x3] >> + add x2, x2, x3, lsl #1 >> + str s0, [x0] >> + str s1, [x0, x1] >> + add x0, x0, x1, lsl #1 >> + subs w4, w4, #2 >> + b.hi 1b >> + ret >> +endfunc > > In a loop like this, I would recommend moving the "subs" instruction > further away from the branch that depends on it. For cores with > in-order execution, it does matter a fair bit, while it probably > doesn't for cores with out-of-order execution. Here, the ideal > location probably is after the two loads at the start. The same thing > goes for all the other functions in this patch. > > Other than that, this looks ok. > > // Martin > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". 
From a654b41fd8b100f631db49bd419ef65594ef32b3 Mon Sep 17 00:00:00 2001 From: Logan Lyu <Logan.Lyu@myais.com.cn> Date: Sun, 7 May 2023 16:58:30 +0800 Subject: [PATCH 1/5] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_pixels Signed-off-by: Logan Lyu <Logan.Lyu@myais.com.cn> --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 ++ libavcodec/aarch64/hevcdsp_qpel_neon.S | 104 ++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 483a9d5253..5a1d520eec 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \ void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \ +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width),); NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, @@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[8][0][1] = c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon; + NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); + NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S index ed659cfe9b..ed5b5027db 100644 --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S @@ -490,6 +490,110 @@ put_hevc qpel put_hevc qpel_uni put_hevc qpel_bi +function 
ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 +1: + ldr s0, [x2] + ldr s1, [x2, x3] + subs w4, w4, #2 + add x2, x2, x3, lsl #1 + str s0, [x0] + str s1, [x0, x1] + add x0, x0, x1, lsl #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 + sub x1, x1, #4 +1: + ldr d0, [x2] + ldr d1, [x2, x3] + subs w4, w4, #2 + add x2, x2, x3, lsl #1 + str s0, [x0], #4 + st1 {v0.h}[2], [x0], x1 + str s1, [x0], #4 + st1 {v1.h}[2], [x0], x1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 +1: + ldr d0, [x2] + ldr d1, [x2, x3] + subs w4, w4, #2 + add x2, x2, x3, lsl #1 + str d0, [x0] + str d1, [x0, x1] + add x0, x0, x1, lsl #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 + sub x1, x1, #8 +1: + ldr q0, [x2] + ldr q1, [x2, x3] + subs w4, w4, #2 + add x2, x2, x3, lsl #1 + str d0, [x0], #8 + st1 {v0.s}[2], [x0], x1 + str d1, [x0], #8 + st1 {v1.s}[2], [x0], x1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 +1: + ldr q0, [x2] + ldr q1, [x2, x3] + subs w4, w4, #2 + add x2, x2, x3, lsl #1 + str q0, [x0] + str q1, [x0, x1] + add x0, x0, x1, lsl #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 +1: + ld1 {v0.8b, v1.8b, v2.8b}, [x2], x3 + subs w4, w4, #1 + st1 {v0.8b, v1.8b, v2.8b}, [x0], x1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b}, [x2], x3 + subs w4, w4, #1 + st1 {v0.16b, v1.16b}, [x0], x1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3 + subs w4, w4, #1 + st1 {v0.16b, v1.16b, v2.16b}, [x0], x1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 + subs w4, w4, #1 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + b.hi 1b + ret +endfunc function 
ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1 mov w10, #-6
On Sun, 18 Jun 2023, Logan.Lyu wrote: > Hi, Martin, > > I modified it according to your comments. Please review again. > > And here are the checkasm benchmark results of the related functions: > > The platform I tested is the g8y instance of Alibaba Cloud, with a chip based > on armv9. Thanks for clarifying that. When updating patches, please include those benchmark numbers in the commit message, and mention the HW used for testing there as well. And when tweaking patches, remember to update the benchmark numbers in the commit message if the tweak changes the results notably. The patchset is almost ok to be pushed; there are a couple of issues left. I was about to just fix up the last issues myself and push them, but patch 5 had a few more issues than I wanted to fix silently. // Martin
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 483a9d5253..5a1d520eec 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \ void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \ +NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride, + const uint8_t *_src, ptrdiff_t _srcstride, + int height, intptr_t mx, intptr_t my, int width),); NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src, ptrdiff_t _srcstride, @@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[8][0][1] = c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon; + NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); + NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,); NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,); diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S index ed659cfe9b..6ca05b7201 100644 --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S @@ -490,6 +490,110 @@ put_hevc qpel put_hevc qpel_uni put_hevc qpel_bi +function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1 +1: + ldr s0, [x2] + ldr s1, [x2, x3] + add x2, x2, x3, lsl #1 + str s0, [x0] + str s1, [x0, x1] + add x0, x0, x1, lsl #1 + subs w4, w4, #2 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1 + sub x1, x1, #4 +1: + ldr d0, [x2] + ldr d1, [x2, x3] + add x2, x2, x3, lsl #1 + str s0, [x0], #4 + st1 {v0.h}[2], [x0], x1 + str s1, [x0], #4 + st1 {v1.h}[2], [x0], x1 + subs w4, 
w4, #2 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1 +1: + ldr d0, [x2] + ldr d1, [x2, x3] + add x2, x2, x3, lsl #1 + str d0, [x0] + str d1, [x0, x1] + add x0, x0, x1, lsl #1 + subs w4, w4, #2 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1 + sub x1, x1, #8 +1: + ldr q0, [x2] + ldr q1, [x2, x3] + add x2, x2, x3, lsl #1 + str d0, [x0], #8 + st1 {v0.s}[2], [x0], x1 + str d1, [x0], #8 + st1 {v1.s}[2], [x0], x1 + subs w4, w4, #2 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1 +1: + ldr q0, [x2] + ldr q1, [x2, x3] + add x2, x2, x3, lsl #1 + str q0, [x0] + str q1, [x0, x1] + add x0, x0, x1, lsl #1 + subs w4, w4, #2 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1 +1: + ld1 {v0.8b, v1.8b, v2.8b}, [x2], x3 + st1 {v0.8b, v1.8b, v2.8b}, [x0], x1 + subs w4, w4, #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b}, [x2], x3 + st1 {v0.16b, v1.16b}, [x0], x1 + subs w4, w4, #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3 + st1 {v0.16b, v1.16b, v2.16b}, [x0], x1 + subs w4, w4, #1 + b.hi 1b + ret +endfunc + +function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1 +1: + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 + subs w4, w4, #1 + b.hi 1b + ret +endfunc function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1 mov w10, #-6