diff mbox series

[FFmpeg-devel,2/4] lavc/aarch64: add HEVC idct_dc NEON

Message ID 20210107121020.86179-3-josh@itanimul.li
State New
Headers show
Series AArch64 NEON for HEVC
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Jan. 7, 2021, 12:10 p.m. UTC
Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/Makefile            |  3 +-
 libavcodec/aarch64/hevcdsp_idct_neon.S | 74 ++++++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init.c      | 19 +++++++
 3 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S

Comments

Martin Storsjö Jan. 16, 2021, 11:04 p.m. UTC | #1
On Thu, 7 Jan 2021, Josh Dekker wrote:

> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/Makefile            |  3 +-
> libavcodec/aarch64/hevcdsp_idct_neon.S | 74 ++++++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init.c      | 19 +++++++
> 3 files changed, 95 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_idct_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 4bdd554e7e..42d80bf74c 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -54,7 +54,8 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
> # decoders/encoders
> NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
> NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
> -NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o
> +NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o      \
> +                                           aarch64/hevcdsp_idct_neon.o
> NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
> NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
> NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> new file mode 100644
> index 0000000000..cd886bb6dc
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -0,0 +1,74 @@
> +/* -*-arm64-*-
> + *
> + * AArch64 NEON optimised IDCT functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +.macro idct_dc size bitdepth
> +function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1

As a bonus, it'd be nice to have the function prototype in a comment above 
here, as a quick reference for what the inputs are.

> +    ldrsh w1, [x0]
> +    mov   w2, #(1 << (13 - \bitdepth))
> +    add   w1, w1, #1
> +    asr   w1, w1, #1
> +    add   w1, w1, w2

As commented on the other patch, please align things like in existing 
assembly, and add another extra space after the commas here, to keep 
things aligned even if you'd be using e.g. w10.

> +    asr   w1, w1, #(14 - \bitdepth)
> +    dup   v0.8h, w1
> +    dup   v1.8h, w1
> +.if \size > 4
> +    dup   v2.8h, w1
> +    dup   v3.8h, w1
> +.if \size > 16 /* dc 32x32 */
> +    mov x2, #4
> +1:
> +    subs x2, x2, #1
> +.endif
> +.if \size > 8 /* dc 16x16 */
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +.endif /* dc 8x8 */
> +    st1   {v0.8h-v3.8h}, [x0], #64
> +    st1   {v0.8h-v3.8h}, [x0], #64

In a series of stores like this, each instruction updates the pointer x0, 
which means that the next instruction can't start executing until the 
previous one is done writing back the value to that register. So for a 
sequence like this, it's sometimes useful to have two registers for 
storing things interleavedly, e.g. like this:

add x12,  x0,  #64
mov x13,  #128
st1 {}, [x0],  x13
st1 {}, [x12], x13
st1 {}, [x0],  x13
st1 {}, [x12], x13
...

Depending on the context and so on, it may or may not be worth doing that. 
For such a small trivial function like this, I'd definitely try at least.

Btw, you didn't mention what core you had benchmarked your functions on. 
When doing micro tuning like this, it's usually good to benchmark on both 
big and small cores, as some optimizations can make things faster on one 
but slower on another one.

Functionally, the patch looks ok - but as this goes into the same file as 
Reimar's patches also add things into (and his patches do things that are 
fairly 1:1 with the existing arm assembly), it might be good to hold off 
of pushing this one until his patches are in, to rebase this one on top of 
those.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 4bdd554e7e..42d80bf74c 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -54,7 +54,8 @@  NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 # decoders/encoders
 NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o      \
+                                           aarch64/hevcdsp_idct_neon.o
 NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
new file mode 100644
index 0000000000..cd886bb6dc
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -0,0 +1,74 @@ 
+/* -*-arm64-*-
+ *
+ * AArch64 NEON optimised IDCT functions for HEVC decoding
+ *
+ * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro idct_dc size bitdepth
+function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
+    ldrsh w1, [x0]
+    mov   w2, #(1 << (13 - \bitdepth))
+    add   w1, w1, #1
+    asr   w1, w1, #1
+    add   w1, w1, w2
+    asr   w1, w1, #(14 - \bitdepth)
+    dup   v0.8h, w1
+    dup   v1.8h, w1
+.if \size > 4
+    dup   v2.8h, w1
+    dup   v3.8h, w1
+.if \size > 16 /* dc 32x32 */
+    mov x2, #4
+1:
+    subs x2, x2, #1
+.endif
+.if \size > 8 /* dc 16x16 */
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+.endif /* dc 8x8 */
+    st1   {v0.8h-v3.8h}, [x0], #64
+    st1   {v0.8h-v3.8h}, [x0], #64
+.if \size > 16 /* dc 32x32 */
+    bne 1b
+.endif
+.else /* dc 4x4 */
+    st1   {v0.8h-v1.8h}, [x0]
+.endif
+    ret
+endfunc
+.endm
+
+idct_dc 4 8
+idct_dc 4 10
+
+idct_dc 8 8
+idct_dc 8 10
+
+idct_dc 16 8
+idct_dc 16 10
+
+idct_dc 32 8
+idct_dc 32 10
diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c
index f0a617ab39..2cd7ef3a6c 100644
--- a/libavcodec/aarch64/hevcdsp_init.c
+++ b/libavcodec/aarch64/hevcdsp_init.c
@@ -23,6 +23,15 @@ 
 #include "libavcodec/hevcdsp.h"
 #include "libavcodec/avcodec.h"
 
+void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
+
 void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
                                      ptrdiff_t stride);
 void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
@@ -48,6 +57,11 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
+
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
     }
 
     if (have_neon(cpu_flags) && bit_depth == 10) {
@@ -55,5 +69,10 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
         c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
         c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
+
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_10_neon;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_10_neon;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
 }