Message ID | 20210107121020.86179-3-josh@itanimul.li |
---|---|
State | Accepted |
Headers | show |
Series | AArch64 NEON for HEVC | expand |
Context | Check | Description |
---|---|---|
andriy/x86_make | success | Make finished |
andriy/x86_make_fate | success | Make fate finished |
andriy/PPC64_make | success | Make finished |
andriy/PPC64_make_fate | success | Make fate finished |
On Thu, 7 Jan 2021, Josh Dekker wrote: > Signed-off-by: Josh Dekker <josh@itanimul.li> > --- > libavcodec/aarch64/Makefile | 3 +- > libavcodec/aarch64/hevcdsp_idct_neon.S | 74 ++++++++++++++++++++++++++ > libavcodec/aarch64/hevcdsp_init.c | 19 +++++++ > 3 files changed, 95 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S > > diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile > index 4bdd554e7e..42d80bf74c 100644 > --- a/libavcodec/aarch64/Makefile > +++ b/libavcodec/aarch64/Makefile > @@ -54,7 +54,8 @@ NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o > # decoders/encoders > NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o > NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o > -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_add_res_neon.o > +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_add_res_neon.o \ > + aarch64/hevcdsp_idct_neon.o > NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o > NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o > NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ > diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S > new file mode 100644 > index 0000000000..cd886bb6dc > --- /dev/null > +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S > @@ -0,0 +1,74 @@ > +/* -*-arm64-*- > + * > + * AArch64 NEON optimised IDCT functions for HEVC decoding > + * > + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + */ > + > +#include "libavutil/aarch64/asm.S" > + > +.macro idct_dc size bitdepth > +function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1 As a bonus, it'd be nice to have the function prototype in a comment above here, as a quick reference for what the inputs are. > + ldrsh w1, [x0] > + mov w2, #(1 << (13 - \bitdepth)) > + add w1, w1, #1 > + asr w1, w1, #1 > + add w1, w1, w2 As commented on the other patch, please align things like in existing assembly, and add another extra space after the commas here, to keep things aligned even if you'd be using e.g. w10. > + asr w1, w1, #(14 - \bitdepth) > + dup v0.8h, w1 > + dup v1.8h, w1 > +.if \size > 4 > + dup v2.8h, w1 > + dup v3.8h, w1 > +.if \size > 16 /* dc 32x32 */ > + mov x2, #4 > +1: > + subs x2, x2, #1 > +.endif > +.if \size > 8 /* dc 16x16 */ > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 > +.endif /* dc 8x8 */ > + st1 {v0.8h-v3.8h}, [x0], #64 > + st1 {v0.8h-v3.8h}, [x0], #64 In a series of stores like this, each instruction updates the pointer x0, which means that the next instruction can't start executing until the previous one is done writing back the value to that register. So for a sequence like this, it's sometimes useful to have two registers for storing things interleavedly, e.g. like this: add x12, x0, #64 mov x13, #128 st1 {}, [x0], x13 st1 {}, [x12], x13 st1 {}, [x0], x13 st1 {}, [x12], x13 ... Depending on the context and so on, it may or may not be worth doing that. For such a small trivial function like this, I'd definitely try at least. Btw, you didn't mention what core you had benchmarked your functions on. When doing micro tuning like this, it's usually good to benchmark on both big and small cores, as some optimizations can make things faster on one but slower on another one. Functionally, the patch looks ok - but as this goes into the same file as Reimar's patches also add things into (and his patches do things that are fairly 1:1 with the existing arm assembly), it might be good to hold off of pushing this one until his patches are in, to rebase this one on top of those. // Martin
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 4bdd554e7e..42d80bf74c 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -54,7 +54,8 @@ NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o -NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_add_res_neon.o +NEON-OBJS-$(CONFIG_HEVC_DECODER) += aarch64/hevcdsp_add_res_neon.o \ + aarch64/hevcdsp_idct_neon.o NEON-OBJS-$(CONFIG_OPUS_DECODER) += aarch64/opusdsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \ diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S new file mode 100644 index 0000000000..cd886bb6dc --- /dev/null +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S @@ -0,0 +1,74 @@ +/* -*-arm64-*- + * + * AArch64 NEON optimised IDCT functions for HEVC decoding + * + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" + +.macro idct_dc size bitdepth +function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1 + ldrsh w1, [x0] + mov w2, #(1 << (13 - \bitdepth)) + add w1, w1, #1 + asr w1, w1, #1 + add w1, w1, w2 + asr w1, w1, #(14 - \bitdepth) + dup v0.8h, w1 + dup v1.8h, w1 +.if \size > 4 + dup v2.8h, w1 + dup v3.8h, w1 +.if \size > 16 /* dc 32x32 */ + mov x2, #4 +1: + subs x2, x2, #1 +.endif +.if \size > 8 /* dc 16x16 */ + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 +.endif /* dc 8x8 */ + st1 {v0.8h-v3.8h}, [x0], #64 + st1 {v0.8h-v3.8h}, [x0], #64 +.if \size > 16 /* dc 32x32 */ + bne 1b +.endif +.else /* dc 4x4 */ + st1 {v0.8h-v1.8h}, [x0] +.endif + ret +endfunc +.endm + +idct_dc 4 8 +idct_dc 4 10 + +idct_dc 8 8 +idct_dc 8 10 + +idct_dc 16 8 +idct_dc 16 10 + +idct_dc 32 8 +idct_dc 32 10 diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c index f0a617ab39..2cd7ef3a6c 100644 --- a/libavcodec/aarch64/hevcdsp_init.c +++ b/libavcodec/aarch64/hevcdsp_init.c @@ -23,6 +23,15 @@ #include "libavcodec/hevcdsp.h" #include "libavcodec/avcodec.h" +void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs); +void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs); +void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs); + void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, @@ -48,6 +57,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->add_residual[1] = ff_hevc_add_residual_8x8_8_neon; c->add_residual[2] = ff_hevc_add_residual_16x16_8_neon; c->add_residual[3] = ff_hevc_add_residual_32x32_8_neon; + + c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_neon; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_neon; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_neon; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_neon; } if (have_neon(cpu_flags) && bit_depth == 10) { @@ -55,5 +69,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) c->add_residual[1] = ff_hevc_add_residual_8x8_10_neon; c->add_residual[2] = ff_hevc_add_residual_16x16_10_neon; c->add_residual[3] = ff_hevc_add_residual_32x32_10_neon; + + c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_neon; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_neon; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_neon; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_neon; } }
Signed-off-by: Josh Dekker <josh@itanimul.li> --- libavcodec/aarch64/Makefile | 3 +- libavcodec/aarch64/hevcdsp_idct_neon.S | 74 ++++++++++++++++++++++++++ libavcodec/aarch64/hevcdsp_init.c | 19 +++++++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S