[FFmpeg-devel,v2,05/14] aarch64/vvc: Add put_qpel_hx i8mm

Message ID	tencent_BCE622908426C0E7D62A8B933D1C730B8206@qq.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Message-ID: <tencent_BCE622908426C0E7D62A8B933D1C730B8206@qq.com> From: Zhao Zhili <quinkblack@foxmail.com> To: ffmpeg-devel@ffmpeg.org Date: Thu, 12 Sep 2024 02:06:09 +0800 In-Reply-To: <20240911180618.28921-1-quinkblack@foxmail.com> References: <20240911180618.28921-1-quinkblack@foxmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v2 05/14] aarch64/vvc: Add put_qpel_hx i8mm Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Zhao Zhili <zhilizhao@tencent.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	aarch64/vvc: Add SIMD \| expand [FFmpeg-devel,v2,00/14] aarch64/vvc: Add SIMD [FFmpeg-devel,v2,02/14] aarch64/hevc: Move epel/qpel to h26x directory [FFmpeg-devel,v2,03/14] aarch64/vvc: Add put_qpel_h_* and put_qpel_uni_h_* [FFmpeg-devel,v2,04/14] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w [FFmpeg-devel,v2,05/14] aarch64/vvc: Add put_qpel_hx i8mm [FFmpeg-devel,v2,06/14] avcodec/hevc: ff_hevc_(qpel/epel)_filters are signed type [FFmpeg-devel,v2,07/14] aarch64/h26x: Remove duplicate b.eq instruction [FFmpeg-devel,v2,08/14] aarch64/vvc: Add put_qpel_vx [FFmpeg-devel,v2,09/14] aarch64/vvc: Add put_qpel_hv [FFmpeg-devel,v2,10/14] aarch64/vvc: Add sad [FFmpeg-devel,v2,11/14] aarch64/vvc: Add put_epel_h [FFmpeg-devel,v2,12/14] aarch64/vvc: Add put_epel_h i8mm [FFmpeg-devel,v2,13/14] aarch64/vvc: Add put_epel_hv [FFmpeg-devel,v2,14/14] aarch64/vvc: Add avg

Message ID

tencent_BCE622908426C0E7D62A8B933D1C730B8206@qq.com

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
Message-ID: <tencent_BCE622908426C0E7D62A8B933D1C730B8206@qq.com>
From: Zhao Zhili <quinkblack@foxmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Thu, 12 Sep 2024 02:06:09 +0800
In-Reply-To: <20240911180618.28921-1-quinkblack@foxmail.com>
References: <20240911180618.28921-1-quinkblack@foxmail.com>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH v2 05/14] aarch64/vvc: Add put_qpel_hx i8mm
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Zhao Zhili <zhilizhao@tencent.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

aarch64/vvc: Add SIMD | expand

Commit Message

Zhao Zhili Sept. 11, 2024, 6:06 p.m. UTC

From: Zhao Zhili <zhilizhao@tencent.com>

Benchmark on an Android Pixel 8, built with -fno-vectorize:

put_luma_h_8_4x4_c:                                      0.2 ( 1.00x)
put_luma_h_8_4x4_neon:                                   0.2 ( 1.00x)
put_luma_h_8_4x4_i8mm:                                   0.0 ( 0.00x)
put_luma_h_8_8x8_c:                                      1.5 ( 1.00x)
put_luma_h_8_8x8_neon:                                   0.5 ( 3.00x)
put_luma_h_8_8x8_i8mm:                                   0.5 ( 3.00x)
put_luma_h_8_16x16_c:                                    6.2 ( 1.00x)
put_luma_h_8_16x16_neon:                                 2.0 ( 3.12x)
put_luma_h_8_16x16_i8mm:                                 1.5 ( 4.17x)
put_luma_h_8_32x32_c:                                   25.5 ( 1.00x)
put_luma_h_8_32x32_neon:                                 9.0 ( 2.83x)
put_luma_h_8_32x32_i8mm:                                 6.8 ( 3.78x)
put_luma_h_8_64x64_c:                                   99.8 ( 1.00x)
put_luma_h_8_64x64_neon:                                35.2 ( 2.83x)
put_luma_h_8_64x64_i8mm:                                27.2 ( 3.66x)
put_luma_h_8_128x128_c:                                422.0 ( 1.00x)
put_luma_h_8_128x128_neon:                             138.5 ( 3.05x)
put_luma_h_8_128x128_i8mm:                             109.2 ( 3.86x)
---
 libavcodec/aarch64/h26x/dsp.h       |  4 ++
 libavcodec/aarch64/h26x/qpel_neon.S | 68 ++++++++++++++++++++++++++---
 libavcodec/aarch64/vvc/dsp_init.c   |  9 ++++
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 076d01b477..323a253257 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -270,4 +270,8 @@  NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         const int8_t *hf, const int8_t *vf, int width),);
 
+NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst, // NOTE(review): macro defined elsewhere; presumably emits one qpel_h prototype per block width
+        const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width), _i8mm); // trailing argument: presumably the "_i8mm" name suffix of the generated prototypes
+
 #endif
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index 47b3948f8b..1fa5a1dd0e 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -3516,6 +3516,17 @@  endfunc
         sub             x1, x1, #3
 .endm
 
+.macro VVC_QPEL_H_HEADER // common setup for the VVC horizontal qpel entry points
+        ld1r            {v31.2d}, [x4]          // v31 = the 8 bytes at x4 (hf), replicated into both 64-bit halves
+        sub             x1, x1, #3              // rewind src by 3 bytes; presumably the left context of the 8-tap filter
+.endm
+
+function ff_vvc_put_qpel_h4_8_neon_i8mm, export=1 // VVC entry point: VVC-specific setup, then branch forward to a "1:" label
+        VVC_QPEL_H_HEADER
+        mov             x10, #VVC_MAX_PB_SIZE * 2 // presumably the dst row pitch in bytes (int16_t dst) - confirm against the loop body
+        b               1f                      // next "1:" label; expected inside the HEVC h4 function that follows
+endfunc
+
 function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
         QPEL_H_HEADER
         mov             x10, #HEVC_MAX_PB_SIZE * 2
@@ -3572,6 +3583,12 @@  function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_qpel_h8_8_neon_i8mm, export=1 // VVC entry point: VVC-specific setup, then branch forward to a "1:" label
+        VVC_QPEL_H_HEADER
+        mov             x10, #VVC_MAX_PB_SIZE * 2 // presumably the dst row pitch in bytes (int16_t dst) - confirm against the loop body
+        b               1f                      // next "1:" label; expected inside the HEVC h8 function that follows
+endfunc
+
 function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
         QPEL_H_HEADER
         mov             x10, #HEVC_MAX_PB_SIZE * 2
@@ -3656,6 +3673,12 @@  function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_qpel_h16_8_neon_i8mm, export=1 // VVC entry point: VVC-specific setup, then branch forward to a "1:" label
+        VVC_QPEL_H_HEADER
+        mov             x10, #VVC_MAX_PB_SIZE * 2 // presumably the dst row pitch in bytes (int16_t dst) - confirm against the loop body
+        b               1f                      // next "1:" label; expected inside the HEVC h16 function that follows
+endfunc
+
 function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
         QPEL_H_HEADER
         mov             x10, #HEVC_MAX_PB_SIZE * 2
@@ -3746,6 +3769,13 @@  function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
         ret
 endfunc
 
+function ff_vvc_put_qpel_h32_8_neon_i8mm, export=1 // VVC entry point: VVC-specific setup, then branch forward to a "1:" label
+        VVC_QPEL_H_HEADER
+        mov             x10, #VVC_MAX_PB_SIZE * 2 // presumably the dst row pitch in bytes (int16_t dst) - confirm against the loop body
+        add             x15, x0, #32            // x15 = dst + 32 bytes; NOTE(review): presumably a second store pointer for the h32 loop (body not visible here)
+        b               1f                      // next "1:" label; expected inside the HEVC h32 function that follows
+endfunc
+
 function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
         QPEL_H_HEADER
         mov             x10, #HEVC_MAX_PB_SIZE * 2
@@ -3881,10 +3911,7 @@  function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
         ret
 endfunc
 
-function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
-        QPEL_H_HEADER
-        sub             x2, x2, #64
-1:
+.macro put_qpel_h64_8_neon_i8mm
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
         ext             v1.16b, v16.16b, v17.16b, #1
         ext             v2.16b, v16.16b, v17.16b, #2
@@ -3975,11 +4002,42 @@  function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
         sqxtn2          v20.8h, v26.4s
         sqxtn           v21.4h, v23.4s
         sqxtn2          v21.8h, v27.4s
-        stp             q20, q21, [x0], #32
+        stp             q20, q21, [x0]
+        add             x0, x0, x10
+.endm
+
+function ff_vvc_put_qpel_h64_8_neon_i8mm, export=1 // VVC entry point: VVC-specific setup, then join the h64 loop of the function below
+        VVC_QPEL_H_HEADER
+        mov             x10, #(VVC_MAX_PB_SIZE * 2 - 32 * 3) // final dst advance applied by the macro; the 32 * 3 presumably covers three earlier 32-byte post-incremented stores (not visible here)
+        sub             x2, x2, #64             // the macro's first ld1 already post-increments x1 by 64
+        b               1f                      // enters at "1:" in ff_hevc_put_hevc_qpel_h64_8_neon_i8mm
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1 // loop body now lives in the put_qpel_h64_8_neon_i8mm macro
+        QPEL_H_HEADER
+        mov             x10, #32                // final dst advance in the macro; same step as the former "stp q20, q21, [x0], #32"
+        sub             x2, x2, #64             // the macro's first ld1 already post-increments x1 by 64
+1:                                              // loop entry; also the target of "b 1f" in ff_vvc_put_qpel_h64_8_neon_i8mm
+        put_qpel_h64_8_neon_i8mm                // one macro expansion per loop iteration
+        subs            w3, w3, #1              // w3 = remaining iterations (presumably rows); sets the flags for b.ne
+        b.ne            1b                      // repeat until w3 reaches zero
+        ret
+endfunc
+
+function ff_vvc_put_qpel_h128_8_neon_i8mm, export=1 // VVC variant that expands the h64 macro twice per loop iteration
+        VVC_QPEL_H_HEADER
+        sub             x11, x2, #128           // x11 = srcstride - 128; added to x1 after both macro passes
+        mov             x10, #32                // final dst advance applied by each macro pass
+        mov             x2, #0                  // NOTE(review): presumably cancels the macro's in-body src step by x2 so x1 keeps walking along the row - confirm against the hidden macro lines
+1:
+        put_qpel_h64_8_neon_i8mm                // first macro pass of this iteration
         subs            w3, w3, #1
+        put_qpel_h64_8_neon_i8mm                // second pass; sits between subs and b.ne, so it must leave NZCV untouched - NOTE(review): confirm
+        add             x1, x1, x11             // advance src by x11 (srcstride - 128); add does not modify the flags
         b.ne            1b
         ret
 endfunc
+
 DISABLE_I8MM
 #endif
 
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 457be8c725..bcc7df8f6c 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -88,6 +88,15 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
             c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
         c->alf.filter[LUMA] = alf_filter_luma_8_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
+
+        if (have_i8mm(cpu_flags)) {
+            c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
+            c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon_i8mm;
+            c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon_i8mm;
+            c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
+            c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
+            c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
+        }
     } else if (bd == 10) {
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;

[FFmpeg-devel,v2,05/14] aarch64/vvc: Add put_qpel_hx i8mm

Commit Message

Patch