diff mbox series

[FFmpeg-devel,v2,12/14] aarch64/vvc: Add put_epel_h i8mm

Message ID tencent_9B246846014969BE7DEEAE823A95C341B50A@qq.com
State New
Headers show
Series aarch64/vvc: Add SIMD | expand

Commit Message

Zhao Zhili Sept. 11, 2024, 6:06 p.m. UTC
From: Zhao Zhili <zhilizhao@tencent.com>

put_chroma_h_8_4x4_c:                                    0.4 ( 1.00x)
put_chroma_h_8_4x4_neon:                                 0.0 ( 0.00x)
put_chroma_h_8_4x4_i8mm:                                 0.1 ( 2.67x)
put_chroma_h_8_8x8_c:                                    1.6 ( 1.00x)
put_chroma_h_8_8x8_neon:                                 0.1 (11.00x)
put_chroma_h_8_8x8_i8mm:                                 0.1 (11.00x)
put_chroma_h_8_16x16_c:                                  6.9 ( 1.00x)
put_chroma_h_8_16x16_neon:                               1.1 ( 6.00x)
put_chroma_h_8_16x16_i8mm:                               0.7 (10.62x)
put_chroma_h_8_32x32_c:                                 27.6 ( 1.00x)
put_chroma_h_8_32x32_neon:                               4.7 ( 5.95x)
put_chroma_h_8_32x32_i8mm:                               4.4 ( 6.28x)
put_chroma_h_8_64x64_c:                                116.2 ( 1.00x)
put_chroma_h_8_64x64_neon:                              19.1 ( 6.07x)
put_chroma_h_8_64x64_i8mm:                              17.1 ( 6.77x)
put_chroma_h_8_128x128_c:                              466.6 ( 1.00x)
put_chroma_h_8_128x128_neon:                            81.4 ( 5.73x)
put_chroma_h_8_128x128_i8mm:                            71.7 ( 6.51x)
---
 libavcodec/aarch64/h26x/dsp.h       |  6 ++-
 libavcodec/aarch64/h26x/epel_neon.S | 60 ++++++++++++++++++++++++++---
 libavcodec/aarch64/vvc/dsp_init.c   |  7 ++++
 3 files changed, 66 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 6978b900fe..90a42d7108 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -273,7 +273,11 @@  NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         const int8_t *hf, const int8_t *vf, int width),);
 
-NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst,
+NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm);
+
+NEON8_FNPROTO_PARTIAL_6(epel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride, int height,
         const int8_t *hf, const int8_t *vf, int width), _i8mm);
 
diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S
index 80a0b66a52..cad8f2a5f4 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -1910,6 +1910,12 @@  endfunc
 
 #if HAVE_I8MM
 ENABLE_I8MM
+
+function ff_vvc_put_epel_h4_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        b               1f                      // no loop of its own: continue at the next "1:" label, the row loop of ff_hevc_put_hevc_epel_h4_8_neon_i8mm placed directly below
+endfunc
+
 function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
         EPEL_H_HEADER
 1:      ld1             {v4.8b}, [x1], x2
@@ -1953,6 +1959,11 @@  function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_h8_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        b               1f                      // no loop of its own: continue at the next "1:" label, the row loop of ff_hevc_put_hevc_epel_h8_8_neon_i8mm placed directly below
+endfunc
+
 function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
         EPEL_H_HEADER
 1:      ld1             {v4.16b}, [x1], x2
@@ -2003,6 +2014,11 @@  function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_h16_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        b               1f                      // no loop of its own: continue at the next "1:" label, the row loop of ff_hevc_put_hevc_epel_h16_8_neon_i8mm placed directly below
+endfunc
+
 function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
         EPEL_H_HEADER
 1:      ld1             {v0.16b, v1.16b}, [x1], x2
@@ -2077,6 +2093,11 @@  function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_epel_h32_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        b               1f                      // no loop of its own: continue at the next "1:" label, the row loop of ff_hevc_put_hevc_epel_h32_8_neon_i8mm placed directly below
+endfunc
+
 function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
         EPEL_H_HEADER
 1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2
@@ -2176,11 +2197,8 @@  function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
         ret
 endfunc
 
-function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
-        EPEL_H_HEADER
-        sub             x2, x2, #64
-1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
-        subs            w3, w3, #1   // height
+.macro put_epel_h64_8_neon_i8mm
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
         ext             v4.16b, v0.16b, v1.16b, #1
         ext             v5.16b, v0.16b, v1.16b, #2
         ext             v6.16b, v0.16b, v1.16b, #3
@@ -2243,7 +2261,37 @@  function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
         xtn2            v22.8h, v26.4s
         xtn             v23.4h, v23.4s
         xtn2            v23.8h, v27.4s
-        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10
+.endm
+
+function ff_vvc_put_epel_h64_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        mov             x10, #(VVC_MAX_PB_SIZE * 2 - 64) // x10 = post-increment applied to x0 (dst) by the last st4 of put_epel_h64_8_neon_i8mm; presumably the dst row pitch in bytes minus 64 bytes stepped by an earlier store - confirm against the full macro
+        sub             x2, x2, #64             // x2 = _srcstride - 64: the macro's first ld1 already post-increments x1 (_src) by 64
+        b               1f                      // continue at the "1:" row loop of ff_hevc_put_hevc_epel_h64_8_neon_i8mm placed directly below
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
+        EPEL_H_HEADER                           // setup macro defined outside this hunk
+        mov             x10, #64                // x10 = post-increment applied to x0 (dst) by the last st4 of the macro; 64 matches the immediate the pre-patch code used there
+        sub             x2, x2, #64             // x2 = _srcstride - 64: the macro's first ld1 already post-increments x1 (_src) by 64
+1:                                              // row loop; ff_vvc_put_epel_h64_8_neon_i8mm also enters here through its "b 1f"
+        subs            w3, w3, #1   // height; the flags set here are tested by b.ne after the macro body
+        put_epel_h64_8_neon_i8mm                // filter one row; x0/x1 are advanced by the macro's post-indexed accesses
+        b.ne            1b                      // repeat until the height counter reaches zero
+        ret
+endfunc
+
+function ff_vvc_put_epel_h128_8_neon_i8mm, export=1
+        VVC_EPEL_H_HEADER                       // setup macro defined outside this hunk; presumably the VVC counterpart of EPEL_H_HEADER - confirm
+        sub             x11, x2, #128           // x11 = _srcstride - 128, added to x1 once per row after both macro passes
+        mov             x10, #64                // x10 = post-increment applied to x0 (dst) by the last st4 of each macro pass
+        mov             x2, #0                  // presumably turns the macro's x2-based src step (not visible in this hunk) into a no-op so the two passes read adjacent data - confirm against the full macro
+1:                                              // row loop: each row is processed as two back-to-back macro passes
+        put_epel_h64_8_neon_i8mm                // first pass of the row
+        subs            w3, w3, #1              // height counter; the flags set here are tested by b.ne at the end of the loop
+        put_epel_h64_8_neon_i8mm                // second pass of the row
+        add             x1, x1, x11             // advance src by x11 for the next row; plain add does not modify the flags from subs
         b.ne            1b
         ret
 endfunc
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index c8c13eb068..c947885145 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -127,6 +127,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
             c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
             c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
+
+            c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon_i8mm;
+            c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon_i8mm;
+            c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon_i8mm;
+            c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
+            c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
+            c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
         }
     } else if (bd == 10) {
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;