diff mbox series

[FFmpeg-devel,v2,3/4] avcodec/aarch64/hevcdsp: add idct_dc NEON

Message ID 20210204113259.20112-4-josh@itanimul.li
State New
Headers show
Series avcodec/aarch64/hevcdsp
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Feb. 4, 2021, 11:32 a.m. UTC
Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 54 +++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 +++++++
 2 files changed, 70 insertions(+)

Comments

Martin Storsjö Feb. 11, 2021, 9:32 a.m. UTC | #1
On Thu, 4 Feb 2021, Josh Dekker wrote:

> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 54 +++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c | 16 +++++++
> 2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 329038a958..d3902a9e0f 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -5,6 +5,7 @@
>  *
>  * Ported from arm/hevcdsp_idct_neon.S by
>  * Copyright (c) 2020 Reimar Döffinger
> + * Copyright (c) 2020 Josh Dekker
>  *
>  * This file is part of FFmpeg.
>  *
> @@ -568,3 +569,56 @@ tr_16x4 secondpass_10, 20 - 10, 512, 1
> 
> idct_16x16 8
> idct_16x16 10
> +
> +// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
> +.macro idct_dc size bitdepth

This needs a comma between size and bitdepth. Normally, commas are 
optional, but when targeting darwin, commas between macro arguments (both 
in the declaration like this, and when invoking macros) are mandatory (due 
to backwards compatibility with a different macro syntax in legacy gas on 
darwin platforms, I think).

> +function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
> +        ldrsh   w1, [x0]

The indentation of your new code differs from the rest of the existing 
code in the file

> +        mov     w2,  #(1 << (13 - \bitdepth))
> +        add     w1,  w1, #1
> +        asr     w1,  w1, #1
> +        add     w1,  w1, w2
> +        asr     w1,  w1, #(14 - \bitdepth)

The function is a bit slower than expected in some cases; on Cortex A72 
and A73, it's actually slower than what GCC produces for small block 
sizes:

                           Cortex A53     A72      A73
hevc_idct_4x4_dc_8_c:           25.5    11.7     14.7
hevc_idct_4x4_dc_8_neon:        15.5    12.5     15.5

However these 5 instructions above can be replaced with just these three:

         mov     w2,  #((1 << (14 - \bitdepth))+1)
         add     w1,  w1, w2
         asr     w1,  w1, #(15 - \bitdepth)

With that change, I get it down to this:

hevc_idct_4x4_dc_8_neon:        12.7    10.2     13.5

So then it's universally faster than the C code at least.

A different alternative is this:

         movi    v1.8h,  #((1 << (14 - \bitdepth))+1)
         ld1r    {v4.8h}, [x0]
         add     v4.8h,  v4.8h,  v1.8h
         sshr    v0.8h,  v4.8h,  #(15 - \bitdepth)
         sshr    v1.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 4
         sshr    v2.8h,  v4.8h,  #(15 - \bitdepth)
         sshr    v3.8h,  v4.8h,  #(15 - \bitdepth)

(The reason for doing 4 sshr instructions, instead of just doing 1 
followed by 3 dups, is that this allows all of them to start possibly in 
parallel, as soon as the result of the add is available, instead of having 
a second dup instruction waiting for the result from the sshr.)

That produces the following numbers:

hevc_idct_4x4_dc_8_neon:        12.7    11.5      9.2

I.e. a tiny bit slower than the previous on A72, and faster on A73, but 
also a viable alternative.

One could also consider this prologue:

         ld1r    {v4.8h}, [x0]
         srshr   v4.8h,  v4.8h,  #1
         srshr   v0.8h,  v4.8h,  #(14 - \bitdepth)
         srshr   v1.8h,  v4.8h,  #(14 - \bitdepth)
.if \size > 4
         srshr   v2.8h,  v4.8h,  #(14 - \bitdepth)
         srshr   v3.8h,  v4.8h,  #(14 - \bitdepth)

But that's a worse combination overall:

hevc_idct_4x4_dc_8_neon:        13.5    12.5     10.2



> +        dup  v0.8h,  w1
> +        dup  v1.8h,  w1
> +.if \size > 4
> +        dup  v2.8h,  w1
> +        dup  v3.8h,  w1
> +.if \size > 16 /* dc 32x32 */
> +        mov     x2,  #4
> +1:
> +        subs    x2,  x2, #1
> +.endif
> +        add    x12,  x0,  #64
> +        mov    x13,  #128
> +.if \size > 8 /* dc 16x16 */
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +.endif /* dc 8x8 */
> +        st1   {v0.8h-v3.8h}, [ x0], x13
> +        st1   {v0.8h-v3.8h}, [x12], x13
> +.if \size > 16 /* dc 32x32 */
> +        bne 1b
> +.endif
> +.else /* dc 4x4 */
> +        st1   {v0.8h-v1.8h}, [x0]
> +.endif
> +        ret
> +endfunc
> +.endm
> +
> +idct_dc 4 8
> +idct_dc 4 10
> +
> +idct_dc 8 8
> +idct_dc 8 10
> +
> +idct_dc 16 8
> +idct_dc 16 10
> +
> +idct_dc 32 8
> +idct_dc 32 10

Missing commas between the macro arguments.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 329038a958..d3902a9e0f 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -5,6 +5,7 @@ 
  *
  * Ported from arm/hevcdsp_idct_neon.S by
  * Copyright (c) 2020 Reimar Döffinger
+ * Copyright (c) 2020 Josh Dekker
  *
  * This file is part of FFmpeg.
  *
@@ -568,3 +569,56 @@  tr_16x4 secondpass_10, 20 - 10, 512, 1
 
 idct_16x16 8
 idct_16x16 10
+
+// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
+.macro idct_dc size bitdepth
+function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
+        ldrsh   w1, [x0]
+        mov     w2,  #(1 << (13 - \bitdepth))
+        add     w1,  w1, #1
+        asr     w1,  w1, #1
+        add     w1,  w1, w2
+        asr     w1,  w1, #(14 - \bitdepth)
+        dup  v0.8h,  w1
+        dup  v1.8h,  w1
+.if \size > 4
+        dup  v2.8h,  w1
+        dup  v3.8h,  w1
+.if \size > 16 /* dc 32x32 */
+        mov     x2,  #4
+1:
+        subs    x2,  x2, #1
+.endif
+        add    x12,  x0,  #64
+        mov    x13,  #128
+.if \size > 8 /* dc 16x16 */
+        st1   {v0.8h-v3.8h}, [ x0], x13
+        st1   {v0.8h-v3.8h}, [x12], x13
+        st1   {v0.8h-v3.8h}, [ x0], x13
+        st1   {v0.8h-v3.8h}, [x12], x13
+        st1   {v0.8h-v3.8h}, [ x0], x13
+        st1   {v0.8h-v3.8h}, [x12], x13
+.endif /* dc 8x8 */
+        st1   {v0.8h-v3.8h}, [ x0], x13
+        st1   {v0.8h-v3.8h}, [x12], x13
+.if \size > 16 /* dc 32x32 */
+        bne 1b
+.endif
+.else /* dc 4x4 */
+        st1   {v0.8h-v1.8h}, [x0]
+.endif
+        ret
+endfunc
+.endm
+
+idct_dc 4 8
+idct_dc 4 10
+
+idct_dc 8 8
+idct_dc 8 10
+
+idct_dc 16 8
+idct_dc 16 10
+
+idct_dc 32 8
+idct_dc 32 10
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4c29daa6d5..fe111bd1ac 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -45,6 +45,14 @@  void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_10_neon(int16_t *coeffs, int col_limit);
+void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_8_neon(int16_t *coeffs);
+void ff_hevc_idct_4x4_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_8x8_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_16x16_dc_10_neon(int16_t *coeffs);
+void ff_hevc_idct_32x32_dc_10_neon(int16_t *coeffs);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -57,6 +65,10 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_8_neon;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_8_neon;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
@@ -65,5 +77,9 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_10_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_10_neon;
+        c->idct_dc[0]                  = ff_hevc_idct_4x4_dc_10_neon;
+        c->idct_dc[1]                  = ff_hevc_idct_8x8_dc_10_neon;
+        c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
+        c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
 }