diff mbox series

[FFmpeg-devel,v2,2/4] avcodec/aarch64/hevcdsp: port add_residual functions

Message ID 20210204113259.20112-3-josh@itanimul.li
State New
Headers show
Series avcodec/aarch64/hevcdsp
Related show

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Josh Dekker Feb. 4, 2021, 11:32 a.m. UTC
From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>

Speedup is fairly small, around 1.5%, but these are fairly simple.

Signed-off-by: Josh Dekker <josh@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 190 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
 2 files changed, 214 insertions(+)

Comments

Martin Storsjö Feb. 11, 2021, 9:02 a.m. UTC | #1
On Thu, 4 Feb 2021, Josh Dekker wrote:

> From: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
>
> Speedup is fairly small, around 1.5%, but these are fairly simple.
>
> Signed-off-by: Josh Dekker <josh@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 190 ++++++++++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  24 +++
> 2 files changed, 214 insertions(+)
>
> diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
> index c70d6a906d..329038a958 100644
> --- a/libavcodec/aarch64/hevcdsp_idct_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
> @@ -36,6 +36,196 @@ const trans, align=4
>         .short 31, 22, 13, 4
> endconst
> 
> +.macro clip10 in1, in2, c1, c2
> +        smax        \in1, \in1, \c1
> +        smax        \in2, \in2, \c1
> +        smin        \in1, \in1, \c2
> +        smin        \in2, \in2, \c2
> +.endm
> +
> +function ff_hevc_add_residual_4x4_8_neon, export=1
> +        ld1             {v0.8h-v1.8h}, [x1]
> +        ld1             {v2.s}[0], [x0], x2
> +        ld1             {v2.s}[1], [x0], x2
> +        ld1             {v2.s}[2], [x0], x2
> +        ld1             {v2.s}[3], [x0], x2
> +        sub             x0, x0, x2, lsl #2
> +        uxtl            v6.8h,  v2.8B
> +        uxtl2           v7.8h,  v2.16B

Personal preference: I prefer the non-shouty forms like v2.16b instead of 
v2.16B.

> +        sqadd           v0.8h,  v0.8h, v6.8h
> +        sqadd           v1.8h,  v1.8h, v7.8h

Nit: Inconsistent alignment between columns 1-2 and 2-3. (And if one would 
want to make space for full sized operands like v16.16b, they'd all need 
another space.)

> +        sqxtun          v0.8B,  v0.8h
> +        sqxtun2         v0.16B, v1.8h
> +        st1             {v0.s}[0], [x0], x2
> +        st1             {v0.s}[1], [x0], x2
> +        st1             {v0.s}[2], [x0], x2
> +        st1             {v0.s}[3], [x0], x2
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_4x4_10_neon, export=1
> +        mov             x12, x0
> +        ld1             {v0.8h-v1.8h}, [x1]
> +        ld1             {v2.d}[0], [x12], x2
> +        ld1             {v2.d}[1], [x12], x2
> +        ld1             {v3.d}[0], [x12], x2
> +        sqadd           v0.8h, v0.8h, v2.8h
> +        ld1             {V3.d}[1], [x12], x2
> +        movi            v4.8h, #0
> +        sqadd           v1.8h, v1.8h, v3.8h
> +        mvni            v5.8h, #0xFC, LSL #8 // movi #0x3FF
> +        clip10          v0.8h, v1.8h, v4.8h, v5.8h
> +        st1             {v0.d}[0], [x0], x2
> +        st1             {v0.d}[1], [x0], x2
> +        st1             {v1.d}[0], [x0], x2
> +        st1             {v1.d}[1], [x0], x2
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_8_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,   #8
> +1:      subs            x3,   x3, #2

Nit: Odd vertical alignment here?

> +        ld1             {v2.d}[0],   [x0]
> +        ld1             {v2.d}[1],   [x12]
> +        uxtl            v3.8h,   v2.8B
> +        ld1             {v0.8h-v1.8h}, [x1], #32
> +        uxtl2           v2.8h,   v2.16B
> +        sqadd           v0.8h,   v0.8h,   v3.8h
> +        sqadd           v1.8h,   v1.8h,   v2.8h
> +        sqxtun          v0.8B,   v0.8h
> +        sqxtun2         v0.16B,  v1.8h
> +        st1             {v0.d}[0],   [x0], x2
> +        st1             {v0.d}[1],   [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_8x8_10_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,  #8
> +        movi            v4.8h, #0
> +        mvni            v5.8h, #0xFC, LSL #8 // movi #0x3FF
> +1:      subs            x3,  x3, #2
> +        ld1             {v0.8h-v1.8h}, [x1], #32
> +        ld1             {v2.8h},    [x0]
> +        sqadd           v0.8h, v0.8h, v2.8h
> +        ld1             {v3.8h},    [x12]
> +        sqadd           v1.8h, v1.8h, v3.8h
> +        clip10          v0.8h, v1.8h, v4.8h, v5.8h
> +        st1             {v0.8h}, [x0], x2
> +        st1             {v1.8h}, [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_8_neon, export=1
> +        mov             x3,  #16
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +1:      subs            x3,  x3, #2
> +        ld1             {v16.16B},     [x0]
> +        ld1             {v0.8h-v3.8h}, [x1], #64
> +        ld1             {v19.16B},    [x12]
> +        uxtl            v17.8h, v16.8B
> +        uxtl2           v18.8h, v16.16B
> +        uxtl            v20.8h, v19.8B
> +        uxtl2           v21.8h, v19.16B
> +        sqadd           v0.8h,  v0.8h, v17.8h
> +        sqadd           v1.8h,  v1.8h, v18.8h
> +        sqadd           v2.8h,  v2.8h, v20.8h
> +        sqadd           v3.8h,  v3.8h, v21.8h
> +        sqxtun          v0.8B,  v0.8h
> +        sqxtun2         v0.16B, v1.8h
> +        sqxtun          v1.8B,  v2.8h
> +        sqxtun2         v1.16B, v3.8h
> +        st1             {v0.16B},     [x0], x2
> +        st1             {v1.16B},     [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_16x16_10_neon, export=1
> +        mov             x3,  #16
> +        movi            v20.8h, #0
> +        mvni            v21.8h, #0xFC, LSL #8 // movi #0x3FF
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +1:      subs            x3,  x3, #2
> +        ld1             {v16.8h-v17.8h}, [x0]
> +        ld1             {v0.8h-v3.8h},  [x1], #64
> +        sqadd           v0.8h, v0.8h, v16.8h
> +        ld1             {v18.8h-v19.8h}, [x12]
> +        sqadd           v1.8h, v1.8h, v17.8h
> +        sqadd           v2.8h, v2.8h, v18.8h
> +        sqadd           v3.8h, v3.8h, v19.8h
> +        clip10          v0.8h, v1.8h, v20.8h, v21.8h
> +        clip10          v2.8h, v3.8h, v20.8h, v21.8h
> +        st1             {v0.8h-v1.8h},   [x0], x2
> +        st1             {v2.8h-v3.8h},   [x12], x2
> +        bne             1b
> +        ret
> +endfunc
> +
> +function ff_hevc_add_residual_32x32_8_neon, export=1
> +        add             x12, x0, x2
> +        add             x2,  x2, x2
> +        mov             x3,  #32
> +1:      subs            x3,  x3, #2
> +        ld1             {v20.16B, v21.16B}, [x0]
> +        uxtl            v16.8h,  v20.8B
> +        uxtl2           v17.8h,  v20.16B
> +        ld1             {v22.16B, v23.16B}, [x12]
> +        uxtl            v18.8h,  v21.8B
> +        uxtl2           v19.8h,  v21.16B
> +        uxtl            v20.8h,  v22.8B
> +        ld1             {v0.8h-v3.8h}, [x1], #64
> +        ld1             {v4.8h-v7.8h}, [x1], #64
> +        uxtl2           v21.8h,  v22.16B
> +        uxtl            v22.8h,  v23.8B
> +        uxtl2           v23.8h,  v23.16B
> +        sqadd           v0.8h, v0.8h,  v16.8h
> +        sqadd           v1.8h, v1.8h,  v17.8h

Here, the vertical alignment is visibly inconsistent across instructions 
where they could line up better.


Other than that, I've got nothing to complain about functionally, and it 
gives a very good speedup (3-14x depending on block size and core type).

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index c70d6a906d..329038a958 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -36,6 +36,196 @@  const trans, align=4
         .short 31, 22, 13, 4
 endconst
 
+.macro clip10 in1, in2, c1, c2
+        smax            \in1, \in1, \c1                 // raise both inputs to at least c1 ...
+        smax            \in2, \in2, \c1
+        smin            \in1, \in1, \c2                 // ... then cap them at c2 (signed compares)
+        smin            \in2, \in2, \c2
+.endm
+
+function ff_hevc_add_residual_4x4_8_neon, export=1
+        ld1             {v0.8h-v1.8h}, [x1]             // x1 = coeffs: all 16 int16 residuals
+        ld1             {v2.s}[0], [x0], x2             // x0 = dst, x2 = stride: gather four
+        ld1             {v2.s}[1], [x0], x2             // 4-byte rows into the lanes of v2
+        ld1             {v2.s}[2], [x0], x2
+        ld1             {v2.s}[3], [x0], x2
+        sub             x0,  x0,  x2, lsl #2            // rewind x0 by 4 * stride for the stores
+        uxtl            v6.8h,  v2.8b                   // zero-extend dst bytes to 16 bit
+        uxtl2           v7.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h,  v6.8h           // residual + dst, signed saturating
+        sqadd           v1.8h,  v1.8h,  v7.8h
+        sqxtun          v0.8b,  v0.8h                   // narrow to bytes with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.s}[0], [x0], x2             // write the four rows back
+        st1             {v0.s}[1], [x0], x2
+        st1             {v0.s}[2], [x0], x2
+        st1             {v0.s}[3], [x0], x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+        mov             x12, x0                         // x12 walks the loads, x0 is kept for the stores
+        ld1             {v0.8h-v1.8h}, [x1]             // x1 = coeffs: all 16 int16 residuals
+        ld1             {v2.d}[0], [x12], x2            // dst rows 0-1 (8 bytes each), x2 = stride
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2            // dst rows 2-3
+        sqadd           v0.8h,  v0.8h,  v2.8h           // rows 0-1: residual + dst, signed saturating
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h,  #0                      // lower clip bound
+        sqadd           v1.8h,  v1.8h,  v3.8h           // rows 2-3
+        mvni            v5.8h,  #0xFC, lsl #8           // upper clip bound: ~0xFC00 = 0x3FF
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h      // clamp every lane to [0, 0x3FF]
+        st1             {v0.d}[0], [x0], x2             // write the four rows back
+        st1             {v0.d}[1], [x0], x2
+        st1             {v1.d}[0], [x0], x2
+        st1             {v1.d}[1], [x0], x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_8_neon, export=1
+        add             x12, x0,  x2                    // x12 = dst + stride (second row pointer)
+        add             x2,  x2,  x2                    // both pointers step two rows at a time
+        mov             x3,  #8                         // rows left to process
+1:      subs            x3,  x3,  #2                    // two rows per iteration
+        ld1             {v2.d}[0], [x0]                 // row via x0 -> low half of v2
+        ld1             {v2.d}[1], [x12]                // row via x12 -> high half of v2
+        uxtl            v3.8h,  v2.8b                   // zero-extend dst bytes to 16 bit
+        ld1             {v0.8h-v1.8h}, [x1], #32        // 16 int16 residuals, x1 advances 32 bytes
+        uxtl2           v2.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h,  v3.8h           // residual + dst, signed saturating
+        sqadd           v1.8h,  v1.8h,  v2.8h
+        sqxtun          v0.8b,  v0.8h                   // narrow to bytes with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        st1             {v0.d}[0], [x0],  x2
+        st1             {v0.d}[1], [x12], x2
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_10_neon, export=1
+        add             x12, x0,  x2                    // x12 = dst + stride (second row pointer)
+        add             x2,  x2,  x2                    // both pointers step two rows at a time
+        mov             x3,  #8                         // rows left to process
+        movi            v4.8h,  #0                      // lower clip bound
+        mvni            v5.8h,  #0xFC, lsl #8           // upper clip bound: ~0xFC00 = 0x3FF
+1:      subs            x3,  x3,  #2                    // two rows per iteration
+        ld1             {v0.8h-v1.8h}, [x1], #32        // 16 int16 residuals, x1 advances 32 bytes
+        ld1             {v2.8h}, [x0]                   // row via x0: 8 samples in 16-bit lanes
+        sqadd           v0.8h,  v0.8h,  v2.8h           // residual + dst, signed saturating
+        ld1             {v3.8h}, [x12]                  // row via x12
+        sqadd           v1.8h,  v1.8h,  v3.8h
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h      // clamp every lane to [0, 0x3FF]
+        st1             {v0.8h}, [x0],  x2
+        st1             {v1.8h}, [x12], x2
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_8_neon, export=1
+        mov             x3,  #16                        // rows left to process
+        add             x12, x0,  x2                    // x12 = dst + stride (second row pointer)
+        add             x2,  x2,  x2                    // both pointers step two rows at a time
+1:      subs            x3,  x3,  #2                    // two rows per iteration
+        ld1             {v16.16b}, [x0]                 // row via x0: 16 dst bytes
+        ld1             {v0.8h-v3.8h}, [x1], #64        // 32 int16 residuals, x1 advances 64 bytes
+        ld1             {v19.16b}, [x12]                // row via x12: 16 dst bytes
+        uxtl            v17.8h, v16.8b                  // zero-extend dst bytes to 16 bit
+        uxtl2           v18.8h, v16.16b
+        uxtl            v20.8h, v19.8b
+        uxtl2           v21.8h, v19.16b
+        sqadd           v0.8h,  v0.8h,  v17.8h          // residual + dst, signed saturating
+        sqadd           v1.8h,  v1.8h,  v18.8h
+        sqadd           v2.8h,  v2.8h,  v20.8h
+        sqadd           v3.8h,  v3.8h,  v21.8h
+        sqxtun          v0.8b,  v0.8h                   // narrow to bytes with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v1.8b,  v2.8h
+        sqxtun2         v1.16b, v3.8h
+        st1             {v0.16b}, [x0],  x2
+        st1             {v1.16b}, [x12], x2
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_10_neon, export=1
+        mov             x3,  #16                        // rows left to process
+        movi            v20.8h, #0                      // lower clip bound
+        mvni            v21.8h, #0xFC, lsl #8           // upper clip bound: ~0xFC00 = 0x3FF
+        add             x12, x0,  x2                    // x12 = dst + stride (second row pointer)
+        add             x2,  x2,  x2                    // both pointers step two rows at a time
+1:      subs            x3,  x3,  #2                    // two rows per iteration
+        ld1             {v16.8h-v17.8h}, [x0]           // row via x0: 16 samples in 16-bit lanes
+        ld1             {v0.8h-v3.8h}, [x1], #64        // 32 int16 residuals, x1 advances 64 bytes
+        sqadd           v0.8h,  v0.8h,  v16.8h          // residual + dst, signed saturating
+        ld1             {v18.8h-v19.8h}, [x12]          // row via x12
+        sqadd           v1.8h,  v1.8h,  v17.8h
+        sqadd           v2.8h,  v2.8h,  v18.8h
+        sqadd           v3.8h,  v3.8h,  v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h    // clamp every lane to [0, 0x3FF]
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_8_neon, export=1
+        add             x12, x0,  x2                    // x12 = dst + stride (second row pointer)
+        add             x2,  x2,  x2                    // both pointers step two rows at a time
+        mov             x3,  #32                        // rows left to process
+1:      subs            x3,  x3,  #2                    // two rows per iteration
+        ld1             {v20.16b, v21.16b}, [x0]        // row via x0: 32 dst bytes
+        uxtl            v16.8h, v20.8b                  // zero-extend dst bytes to 16 bit
+        uxtl2           v17.8h, v20.16b
+        ld1             {v22.16b, v23.16b}, [x12]       // row via x12: 32 dst bytes
+        uxtl            v18.8h, v21.8b
+        uxtl2           v19.8h, v21.16b
+        uxtl            v20.8h, v22.8b                  // v20 is reused once its bytes are widened
+        ld1             {v0.8h-v3.8h}, [x1], #64        // residuals for the row at x0
+        ld1             {v4.8h-v7.8h}, [x1], #64        // residuals for the row at x12
+        uxtl2           v21.8h, v22.16b
+        uxtl            v22.8h, v23.8b
+        uxtl2           v23.8h, v23.16b
+        sqadd           v0.8h,  v0.8h,  v16.8h          // residual + dst, signed saturating
+        sqadd           v1.8h,  v1.8h,  v17.8h
+        sqadd           v2.8h,  v2.8h,  v18.8h
+        sqadd           v3.8h,  v3.8h,  v19.8h
+        sqadd           v4.8h,  v4.8h,  v20.8h
+        sqadd           v5.8h,  v5.8h,  v21.8h
+        sqadd           v6.8h,  v6.8h,  v22.8h
+        sqadd           v7.8h,  v7.8h,  v23.8h
+        sqxtun          v0.8b,  v0.8h                   // narrow to bytes with unsigned saturation
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v1.8b,  v2.8h
+        sqxtun2         v1.16b, v3.8h
+        sqxtun          v2.8b,  v4.8h
+        sqxtun2         v2.16b, v5.8h
+        st1             {v0.16b, v1.16b}, [x0],  x2     // first row is stored while the second narrows
+        sqxtun          v3.8b,  v6.8h
+        sqxtun2         v3.16b, v7.8h
+        st1             {v2.16b, v3.16b}, [x12], x2
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_10_neon, export=1
+        mov             x3,  #32                        // rows left to process
+        movi            v20.8h, #0                      // lower clip bound
+        mvni            v21.8h, #0xFC, lsl #8           // upper clip bound: ~0xFC00 = 0x3FF
+1:      subs            x3,  x3,  #1                    // one row per iteration (a row fills 4 vectors)
+        ld1             {v0.8h-v3.8h}, [x1], #64        // 32 int16 residuals, x1 advances 64 bytes
+        ld1             {v16.8h-v19.8h}, [x0]           // 32 dst samples in 16-bit lanes
+        sqadd           v0.8h,  v0.8h,  v16.8h          // residual + dst, signed saturating
+        sqadd           v1.8h,  v1.8h,  v17.8h
+        sqadd           v2.8h,  v2.8h,  v18.8h
+        sqadd           v3.8h,  v3.8h,  v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h    // clamp every lane to [0, 0x3FF]
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v3.8h}, [x0], x2         // write the row back, x0 += stride
+        bne             1b                              // loop until the subs above reached zero
+        ret
+endfunc
+
 .macro sum_sub out, in, c, op, p
   .ifc \op, +
         smlal\p         \out, \in, \c
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 19d9a7f9ed..4c29daa6d5 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,6 +25,22 @@ 
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                     ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                      ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
+                                       ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
+                                        ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -35,10 +51,18 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
     if (!have_neon(av_get_cpu_flags())) return;
 
     if (bit_depth == 8) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_8_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_8_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_8_neon;
     }
     if (bit_depth == 10) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_10_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_10_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_10_neon;
         c->idct[1]                     = ff_hevc_idct_8x8_10_neon;
         c->idct[2]                     = ff_hevc_idct_16x16_10_neon;
     }