diff mbox series

[FFmpeg-devel,v2] lavc/aarch64: hevc_add_res add 12bit variants

Message ID 20220816050153.16674-1-jdek@itanimul.li
State New
Headers show
Series [FFmpeg-devel,v2] lavc/aarch64: hevc_add_res add 12bit variants | expand

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/configure_x86 warning Failed to apply patch

Commit Message

J. Dekker Aug. 16, 2022, 5:01 a.m. UTC
hevc_add_res_4x4_12_c: 46.0
hevc_add_res_4x4_12_neon: 18.7
hevc_add_res_8x8_12_c: 194.7
hevc_add_res_8x8_12_neon: 25.2
hevc_add_res_16x16_12_c: 716.0
hevc_add_res_16x16_12_neon: 69.7
hevc_add_res_32x32_12_c: 3820.7
hevc_add_res_32x32_12_neon: 261.0

Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
 2 files changed, 105 insertions(+), 85 deletions(-)

Comments

Martin Storsjö Aug. 16, 2022, 11:38 a.m. UTC | #1
On Tue, 16 Aug 2022, J. Dekker wrote:

> hevc_add_res_4x4_12_c: 46.0
> hevc_add_res_4x4_12_neon: 18.7
> hevc_add_res_8x8_12_c: 194.7
> hevc_add_res_8x8_12_neon: 25.2
> hevc_add_res_16x16_12_c: 716.0
> hevc_add_res_16x16_12_neon: 69.7
> hevc_add_res_32x32_12_c: 3820.7
> hevc_add_res_32x32_12_neon: 261.0
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_idct_neon.S    | 156 ++++++++++++----------
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  34 ++---
> 2 files changed, 105 insertions(+), 85 deletions(-)
>
> -function ff_hevc_add_residual_32x32_10_neon, export=1
> +.macro add_res bitdepth
> +function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
> +        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8
> +        b               X(ff_hevc_add_residual_4x4_16_neon)

When a function isn't exported, you shouldn't use X() to reference its 
symbol. On Darwin, X() adds the underscore prefix, but that prefixed symbol 
name is only defined for exported functions. Also, you should probably 
remove the ff_ prefix from symbols that aren't exported, for clarity.

This issue causes the patch in its current form to break compilation on 
macOS.

> -void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                     ptrdiff_t stride);
> -void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                      ptrdiff_t stride);
> -void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                     ptrdiff_t stride);
> -void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                      ptrdiff_t stride);
> -void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                       ptrdiff_t stride);
> -void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                        ptrdiff_t stride);
> -void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
> -                                       ptrdiff_t stride);
> -void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
> -                                        ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);
> +void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride);

Note that these have been amended to include "const" on the coeffs 
parameter recently.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 484eea8437..5fb5990f3d 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -37,11 +37,11 @@  const trans, align=4
         .short          31, 22, 13, 4
 endconst
 
-.macro clip10 in1, in2, c1, c2
-        smax            \in1, \in1, \c1
-        smax            \in2, \in2, \c1
-        smin            \in1, \in1, \c2
-        smin            \in2, \in2, \c2
+.macro clip2 in1, in2, min, max
+        smax            \in1, \in1, \min // raise each lane of both inputs to at least the min operand
+        smax            \in2, \in2, \min
+        smin            \in1, \in1, \max // then cap each lane at the max operand (signed compare)
+        smin            \in2, \in2, \max
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -64,25 +64,6 @@  function ff_hevc_add_residual_4x4_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_4x4_10_neon, export=1
-        mov             x12,  x0
-        ld1             {v0.8h-v1.8h}, [x1]
-        ld1             {v2.d}[0], [x12], x2
-        ld1             {v2.d}[1], [x12], x2
-        ld1             {v3.d}[0], [x12], x2
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.d}[1], [x12], x2
-        movi            v4.8h, #0
-        sqadd           v1.8h, v1.8h, v3.8h
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.d}[0], [x0],  x2
-        st1             {v0.d}[1], [x0],  x2
-        st1             {v1.d}[0], [x0],  x2
-        st1             {v1.d}[1], [x0],  x2
-        ret
-endfunc
-
 function ff_hevc_add_residual_8x8_8_neon, export=1
         add             x12, x0, x2
         add             x2, x2, x2
@@ -103,25 +84,6 @@  function ff_hevc_add_residual_8x8_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_8x8_10_neon, export=1
-        add             x12, x0, x2
-        add             x2,  x2, x2
-        mov             x3,  #8
-        movi            v4.8h, #0
-        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs            x3,  x3, #2
-        ld1             {v0.8h-v1.8h}, [x1], #32
-        ld1             {v2.8h}, [x0]
-        sqadd           v0.8h, v0.8h, v2.8h
-        ld1             {v3.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v3.8h
-        clip10          v0.8h, v1.8h, v4.8h, v5.8h
-        st1             {v0.8h}, [x0],  x2
-        st1             {v1.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_16x16_8_neon, export=1
         mov             x3,  #16
         add             x12, x0, x2
@@ -148,28 +110,6 @@  function ff_hevc_add_residual_16x16_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov             x3,  #16
-        movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-        add             x12,  x0, x2
-        add             x2,  x2, x2
-1:      subs            x3,  x3, #2
-        ld1             {v16.8h-v17.8h}, [x0]
-        ld1             {v0.8h-v3.8h},   [x1], #64
-        sqadd           v0.8h, v0.8h, v16.8h
-        ld1             {v18.8h-v19.8h}, [x12]
-        sqadd           v1.8h, v1.8h, v17.8h
-        sqadd           v2.8h, v2.8h, v18.8h
-        sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
-        st1             {v0.8h-v1.8h}, [x0],  x2
-        st1             {v2.8h-v3.8h}, [x12], x2
-        bne             1b
-        ret
-endfunc
-
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
         add             x2,  x2, x2
@@ -209,10 +149,88 @@  function ff_hevc_add_residual_32x32_8_neon, export=1
         ret
 endfunc
 
-function ff_hevc_add_residual_32x32_10_neon, export=1
+.macro add_res bitdepth
+function ff_hevc_add_residual_4x4_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 // v21 lanes = (1 << bitdepth) - 1, the clip maximum read by the shared body
+        b               ff_hevc_add_residual_4x4_16_neon // tail-branch to the export=0 body: plain label, no X() symbol prefix
+endfunc
+function ff_hevc_add_residual_8x8_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 // v21 lanes = (1 << bitdepth) - 1
+        b               ff_hevc_add_residual_8x8_16_neon // local label, see 4x4 entry point
+endfunc
+function ff_hevc_add_residual_16x16_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 // v21 lanes = (1 << bitdepth) - 1
+        b               ff_hevc_add_residual_16x16_16_neon // local label, see 4x4 entry point
+endfunc
+function ff_hevc_add_residual_32x32_\bitdepth\()_neon, export=1
+        mvni            v21.8h, #((0xFF << (\bitdepth - 8)) & 0xFF), lsl #8 // v21 lanes = (1 << bitdepth) - 1
+        b               ff_hevc_add_residual_32x32_16_neon // local label, see 4x4 entry point
+endfunc
+.endm
+
+add_res 10
+add_res 12
+
+function ff_hevc_add_residual_4x4_16_neon, export=0
+        mov             x12,  x0 // x12 = read cursor over the destination rows; x0 is kept as the write cursor
+        ld1             {v0.8h-v1.8h}, [x1] // 16 halfword residuals from x1: v0 = rows 0-1, v1 = rows 2-3
+        ld1             {v2.d}[0], [x12], x2 // four destination rows of 8 bytes each, stepping by x2
+        ld1             {v2.d}[1], [x12], x2
+        ld1             {v3.d}[0], [x12], x2
+        sqadd           v0.8h, v0.8h, v2.8h // rows 0-1: signed saturating residual + destination
+        ld1             {v3.d}[1], [x12], x2
+        movi            v4.8h, #0 // clip minimum; the clip maximum is read from v21, which this function never writes (caller must set it)
+        sqadd           v1.8h, v1.8h, v3.8h // rows 2-3
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.d}[0], [x0],  x2 // write the four clipped rows back, stepping by x2
+        st1             {v0.d}[1], [x0],  x2
+        st1             {v1.d}[0], [x0],  x2
+        st1             {v1.d}[1], [x0],  x2
+        ret
+endfunc
+
+function ff_hevc_add_residual_8x8_16_neon, export=0
+        add             x12, x0, x2 // x12 = x0 + stride: the second row of each row pair
+        add             x2,  x2, x2 // double the stride so x0 and x12 each advance two rows per step
+        mov             x3,  #8 // rows remaining
+        movi            v4.8h, #0 // clip minimum; the clip maximum is read from v21, which this function never writes (caller must set it)
+1:      subs            x3,  x3, #2 // two rows per iteration; the flags set here feed the bne at the loop end
+        ld1             {v0.8h-v1.8h}, [x1], #32 // two rows of residuals (8 halfwords each), x1 post-incremented
+        ld1             {v2.8h}, [x0] // destination row from x0
+        sqadd           v0.8h, v0.8h, v2.8h // signed saturating residual + destination
+        ld1             {v3.8h}, [x12] // destination row from x12
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip2           v0.8h, v1.8h, v4.8h, v21.8h
+        st1             {v0.8h}, [x0],  x2 // store both rows and advance each pointer by the doubled stride
+        st1             {v1.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_16x16_16_neon, export=0
+        mov             x3,  #16 // rows remaining
+        movi            v20.8h, #0 // clip minimum; the clip maximum is read from v21, which this function never writes (caller must set it)
+        add             x12,  x0, x2 // x12 = x0 + stride: the second row of each row pair
+        add             x2,  x2, x2 // double the stride so x0 and x12 each advance two rows per step
+1:      subs            x3,  x3, #2 // two rows per iteration; the flags set here feed the bne at the loop end
+        ld1             {v16.8h-v17.8h}, [x0] // destination row from x0 (16 halfwords)
+        ld1             {v0.8h-v3.8h},   [x1], #64 // two rows of residuals, x1 post-incremented
+        sqadd           v0.8h, v0.8h, v16.8h // signed saturating residual + destination
+        ld1             {v18.8h-v19.8h}, [x12] // destination row from x12
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
+        st1             {v0.8h-v1.8h}, [x0],  x2 // store both rows and advance each pointer by the doubled stride
+        st1             {v2.8h-v3.8h}, [x12], x2
+        bne             1b
+        ret
+endfunc
+
+function ff_hevc_add_residual_32x32_16_neon, export=0
         mov             x3, #32
         movi            v20.8h, #0
-        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
 1:      subs            x3, x3, #1
         ld1             {v0.8h -v3.8h},  [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
@@ -220,8 +238,8 @@  function ff_hevc_add_residual_32x32_10_neon, export=1
         sqadd           v1.8h, v1.8h, v17.8h
         sqadd           v2.8h, v2.8h, v18.8h
         sqadd           v3.8h, v3.8h, v19.8h
-        clip10          v0.8h, v1.8h, v20.8h, v21.8h
-        clip10          v2.8h, v3.8h, v20.8h, v21.8h
+        clip2           v0.8h, v1.8h, v20.8h, v21.8h
+        clip2           v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v3.8h}, [x0], x2
         bne             1b
         ret
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 2002530266..f37e47121e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -25,22 +25,18 @@ 
 #include "libavutil/aarch64/cpu.h"
 #include "libavcodec/hevcdsp.h"
 
-void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                     ptrdiff_t stride);
-void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                      ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, int16_t *coeffs,
-                                       ptrdiff_t stride);
-void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, int16_t *coeffs,
-                                        ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_4x4_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_8x8_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_16x16_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_8_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_10_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
+void ff_hevc_add_residual_32x32_12_neon(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t stride);
 void ff_hevc_idct_8x8_8_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_8x8_10_neon(int16_t *coeffs, int col_limit);
 void ff_hevc_idct_16x16_8_neon(int16_t *coeffs, int col_limit);
@@ -100,4 +96,10 @@  av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_10_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_10_neon;
     }
+    if (bit_depth == 12) {
+        c->add_residual[0]             = ff_hevc_add_residual_4x4_12_neon;
+        c->add_residual[1]             = ff_hevc_add_residual_8x8_12_neon;
+        c->add_residual[2]             = ff_hevc_add_residual_16x16_12_neon;
+        c->add_residual[3]             = ff_hevc_add_residual_32x32_12_neon;
+    }
 }