diff mbox series

[FFmpeg-devel] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions.

Message ID 20210110092712.1010-1-Reimar.Doeffinger@gmx.de
State New
Headers show
Series [FFmpeg-devel] libavcodec/aarch64/hevcdsp_idct_neon.S: Also port add_residual functions. | expand

Checks

Context Check Description
andriy/configure warning Failed to apply patch

Commit Message

Reimar Döffinger Jan. 10, 2021, 9:27 a.m. UTC
From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>

Speedup is fairly small, around 1.5%, but these are fairly simple.
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 190 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
 2 files changed, 214 insertions(+)

Comments

Martin Storsjö Jan. 15, 2021, 10:59 p.m. UTC | #1
On Sun, 10 Jan 2021, Reimar.Doeffinger@gmx.de wrote:

> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>
> Speedup is fairly small, around 1.5%, but these are fairly simple.
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 190 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
> 2 files changed, 214 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index 9f67e45..edd03a0 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -36,6 +36,196 @@ const trans, align=4
>         .short 31, 22, 13, 4
> endconst
> 
> +.macro clip10 in1, in2, c1, c2
> +        smax        \in1, \in1, \c1
> +        smax        \in2, \in2, \c1
> +        smin        \in1, \in1, \c2
> +        smin        \in2, \in2, \c2
> +.endm
> +
> +function ff_hevc_add_residual_4x4_8_neon, export=1
> +        ld1             {v0.8H-v1.8H}, [x1]
> +        ld1             {v2.S}[0], [x0], x2
> +        ld1             {v2.S}[1], [x0], x2
> +        ld1             {v2.S}[2], [x0], x2
> +        ld1             {v2.S}[3], [x0], x2
> +        sub             x0, x0, x2, lsl #2
> +        uxtl            v8.8H, v2.8B
> +        uxtl2           v9.8H, v2.16B
> +        sqadd           v0.8H, v0.8H, v8.8H

FWIW, as a matter of taste, I dislike the shouty uppercase version of e.g. 
element specifiers, like .8H here. The code base contains both styles, but 
I'd say the lowercase form is more prevalent.

Overall, this patch looks good, nothing much to comment on I think. Not 
tested fully though, as it depends on the other patch, which still has a 
few issues (and fails checkasm).

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 9f67e45..edd03a0 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@  const trans, align=4
         .short 31, 22, 13, 4
 endconst
 
+.macro clip10 in1, in2, c1, c2
+        smax        \in1, \in1, \c1     // signed clamp from below: in1 = max(in1, c1)
+        smax        \in2, \in2, \c1     // same lower bound applied to in2
+        smin        \in1, \in1, \c2     // signed clamp from above: in1 = min(in1, c2)
+        smin        \in2, \in2, \c2     // same upper bound applied to in2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1
+        ld1             {v0.8h-v1.8h}, [x1]           // v0-v1 = 16 x int16 residuals read from x1 (coeffs)
+        ld1             {v2.s}[0], [x0], x2           // gather four 4-byte rows from x0 (dst), stepping by x2 (stride)
+        ld1             {v2.s}[1], [x0], x2
+        ld1             {v2.s}[2], [x0], x2
+        ld1             {v2.s}[3], [x0], x2
+        sub             x0, x0, x2, lsl #2            // rewind x0 by 4 * x2 so the stores start at row 0 again
+        uxtl            v6.8h, v2.8b                  // widen pixels to 16 bit; v6/v7 are caller-saved scratch,
+        uxtl2           v7.8h, v2.16b                 // unlike v8-v15 whose low 64 bits are callee-saved in AAPCS64
+        sqadd           v0.8h, v0.8h, v6.8h           // saturating add: residual + pixel, rows 0-1
+        sqadd           v1.8h, v1.8h, v7.8h           // rows 2-3
+        sqxtun          v0.8b, v0.8h                  // narrow to bytes with unsigned saturation (0..255)
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.s}[0], [x0], x2           // scatter the four rows back to the destination
+        st1             {v0.s}[1], [x0], x2
+        st1             {v0.s}[2], [x0], x2
+        st1             {v0.s}[3], [x0], x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+        mov             x12, x0                       // x12 = read cursor; x0 is kept untouched for the stores
+        ld1             {v0.8h-v1.8h}, [x1]           // 16 x int16 residuals
+        ld1             {v2.d}[0], [x12], x2          // each row is read as one 8-byte lane (4 x 16 bit)
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h           // rows 0-1, issued before the last row load
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0                     // lower clamp bound
+        sqadd           v1.8h, v1.8h, v3.8h           // rows 2-3
+        mvni            v5.8h, #0xfc, lsl #8          // upper clamp bound: ~0xfc00 = 0x03ff
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.d}[0], [x0], x2
+        st1             {v0.d}[1], [x0], x2
+        st1             {v1.d}[0], [x0], x2
+        st1             {v1.d}[1], [x0], x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+        add             x12, x0, x2                   // x12 = second row; x0 and x12 handle alternating rows
+        add             x2,  x2, x2                   // doubled step: each pointer skips the row of the other
+        mov             x3,  #8                       // rows remaining
+1:      subs            x3,  x3, #2                   // two rows per iteration
+        ld1             {v2.d}[0], [x0]               // 8 pixels of the row at x0
+        ld1             {v2.d}[1], [x12]              // 8 pixels of the row at x12
+        uxtl            v3.8h,  v2.8b                 // widen the x0 row to 16 bit
+        ld1             {v0.8h-v1.8h}, [x1], #32      // 16 residuals, x1 post-incremented by 32 bytes
+        uxtl2           v2.8h,  v2.16b                // widen the x12 row in place
+        sqadd           v0.8h,  v0.8h,  v3.8h
+        sqadd           v1.8h,  v1.8h,  v2.8h
+        sqxtun          v0.8b,  v0.8h                 // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x12], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1
+        add             x12, x0, x2                   // x12 = second row; x0 and x12 handle alternating rows
+        add             x2,  x2, x2                   // doubled step for both row pointers
+        mov             x3,  #8                       // rows remaining
+        movi            v4.8h, #0                     // lower clamp bound
+        mvni            v5.8h, #0xfc, lsl #8          // upper clamp bound: ~0xfc00 = 0x03ff
+1:      subs            x3,  x3, #2                   // two rows per iteration
+        ld1             {v0.8h-v1.8h}, [x1], #32      // 16 residuals, x1 post-incremented by 32 bytes
+        ld1             {v2.8h}, [x0]                 // row at x0: 8 x 16-bit pixels
+        sqadd           v0.8h, v0.8h, v2.8h
+        ld1             {v3.8h}, [x12]                // row at x12
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+        mov             x3,  #16                      // rows remaining
+        add             x12, x0, x2                   // x12 = second row; x0 and x12 handle alternating rows
+        add             x2,  x2, x2                   // doubled step for both row pointers
+1:      subs            x3,  x3, #2                   // two rows per iteration
+        ld1             {v16.16b}, [x0]               // row at x0: 16 pixels
+        ld1             {v0.8h-v3.8h}, [x1], #64      // 32 residuals, x1 post-incremented by 64 bytes
+        ld1             {v19.16b}, [x12]              // row at x12: 16 pixels
+        uxtl            v17.8h, v16.8b                // widen both rows to 16 bit
+        uxtl2           v18.8h, v16.16b
+        uxtl            v20.8h, v19.8b
+        uxtl2           v21.8h, v19.16b
+        sqadd           v0.8h,  v0.8h, v17.8h
+        sqadd           v1.8h,  v1.8h, v18.8h
+        sqadd           v2.8h,  v2.8h, v20.8h
+        sqadd           v3.8h,  v3.8h, v21.8h
+        sqxtun          v0.8b,  v0.8h                 // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v1.8b,  v2.8h
+        sqxtun2         v1.16b, v3.8h
+        st1             {v0.16b}, [x0],  x2
+        st1             {v1.16b}, [x12], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1
+        mov             x3,  #16                      // rows remaining
+        movi            v20.8h, #0                    // lower clamp bound
+        mvni            v21.8h, #0xfc, lsl #8         // upper clamp bound: ~0xfc00 = 0x03ff
+        add             x12, x0, x2                   // x12 = second row; x0 and x12 handle alternating rows
+        add             x2,  x2, x2                   // doubled step for both row pointers
+1:      subs            x3,  x3, #2                   // two rows per iteration
+        ld1             {v16.8h-v17.8h}, [x0]         // row at x0: 16 x 16-bit pixels
+        ld1             {v0.8h-v3.8h}, [x1], #64      // 32 residuals, x1 post-incremented by 64 bytes
+        sqadd           v0.8h, v0.8h, v16.8h
+        ld1             {v18.8h-v19.8h}, [x12]        // row at x12
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_8_neon, export=1
+        add             x12, x0, x2                   // x12 = second row; x0 and x12 handle alternating rows
+        add             x2,  x2, x2                   // doubled step for both row pointers
+        mov             x3,  #32                      // rows remaining
+1:      subs            x3,  x3, #2                   // two rows per iteration
+        ld1             {v20.16b-v21.16b}, [x0]       // row at x0: 32 pixels
+        uxtl            v16.8h, v20.8b
+        uxtl2           v17.8h, v20.16b               // last read of the raw v20; it is reused below
+        ld1             {v22.16b-v23.16b}, [x12]      // row at x12: 32 pixels
+        uxtl            v18.8h, v21.8b
+        uxtl2           v19.8h, v21.16b               // last read of the raw v21; it is reused below
+        uxtl            v20.8h, v22.8b
+        ld1             {v0.8h-v3.8h}, [x1], #64      // residuals for the x0 row
+        ld1             {v4.8h-v7.8h}, [x1], #64      // residuals for the x12 row
+        uxtl2           v21.8h, v22.16b
+        uxtl            v22.8h, v23.8b                // v22 raw bytes are fully consumed by now
+        uxtl2           v23.8h, v23.16b               // widen in place
+        sqadd           v0.8h, v0.8h, v16.8h
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        sqadd           v4.8h, v4.8h, v20.8h
+        sqadd           v5.8h, v5.8h, v21.8h
+        sqadd           v6.8h, v6.8h, v22.8h
+        sqadd           v7.8h, v7.8h, v23.8h
+        sqxtun          v0.8b,  v0.8h                 // narrow with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v1.8b,  v2.8h
+        sqxtun2         v1.16b, v3.8h
+        sqxtun          v2.8b,  v4.8h
+        sqxtun2         v2.16b, v5.8h
+        st1             {v0.16b-v1.16b}, [x0], x2     // first row stored while the second is still narrowing
+        sqxtun          v3.8b,  v6.8h
+        sqxtun2         v3.16b, v7.8h
+        st1             {v2.16b-v3.16b}, [x12], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_10_neon, export=1
+        mov             x3,  #32                      // rows remaining
+        movi            v20.8h, #0                    // lower clamp bound
+        mvni            v21.8h, #0xfc, lsl #8         // upper clamp bound: ~0xfc00 = 0x03ff
+1:      subs            x3,  x3, #1                   // one 64-byte row per iteration
+        ld1             {v0.8h-v3.8h}, [x1], #64      // 32 residuals, x1 post-incremented by 64 bytes
+        ld1             {v16.8h-v19.8h}, [x0]         // 32 x 16-bit pixels
+        sqadd           v0.8h, v0.8h, v16.8h
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v3.8h}, [x0], x2
+        b.ne            1b                            // tests the flags set by subs at the loop head
+        ret
+endfunc
+
 .macro sum_sub out, in, c, op, p
   .ifc \op, +
         smlal\p         \out, \in, \c
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 19d9a7f..4c29daa 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,6 +25,22 @@ 
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -35,10 +51,18 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
     if (!have_neon(av_get_cpu_flags())) return;
 
     if (bit_depth == 8) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
     }
     if (bit_depth == 10) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_10_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_10_neon;
     }