[FFmpeg-devel,v2,10/14] aarch64/vvc: Add sad

Message ID	tencent_26CC69008EA7C5485314C84633E1536E1505@qq.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Message-ID: <tencent_26CC69008EA7C5485314C84633E1536E1505@qq.com> From: Zhao Zhili <quinkblack@foxmail.com> To: ffmpeg-devel@ffmpeg.org Date: Thu, 12 Sep 2024 02:06:14 +0800 In-Reply-To: <20240911180618.28921-1-quinkblack@foxmail.com> References: <20240911180618.28921-1-quinkblack@foxmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v2 10/14] aarch64/vvc: Add sad Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Zhao Zhili <zhilizhao@tencent.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	aarch64/vvc: Add SIMD \| expand [FFmpeg-devel,v2,00/14] aarch64/vvc: Add SIMD [FFmpeg-devel,v2,02/14] aarch64/hevc: Move epel/qpel to h26x directory [FFmpeg-devel,v2,03/14] aarch64/vvc: Add put_qpel_h_* and put_qpel_uni_h_* [FFmpeg-devel,v2,04/14] aarch64/vvc: Add put_pel/put_pel_uni/put_pel_uni_w [FFmpeg-devel,v2,05/14] aarch64/vvc: Add put_qpel_hx i8mm [FFmpeg-devel,v2,06/14] avcodec/hevc: ff_hevc_(qpel/epel)_filters are signed type [FFmpeg-devel,v2,07/14] aarch64/h26x: Remove duplicate b.eq instruction [FFmpeg-devel,v2,08/14] aarch64/vvc: Add put_qpel_vx [FFmpeg-devel,v2,09/14] aarch64/vvc: Add put_qpel_hv [FFmpeg-devel,v2,10/14] aarch64/vvc: Add sad [FFmpeg-devel,v2,11/14] aarch64/vvc: Add put_epel_h [FFmpeg-devel,v2,12/14] aarch64/vvc: Add put_epel_h i8mm [FFmpeg-devel,v2,13/14] aarch64/vvc: Add put_epel_hv [FFmpeg-devel,v2,14/14] aarch64/vvc: Add avg

Message ID

tencent_26CC69008EA7C5485314C84633E1536E1505@qq.com

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
Message-ID: <tencent_26CC69008EA7C5485314C84633E1536E1505@qq.com>
From: Zhao Zhili <quinkblack@foxmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Thu, 12 Sep 2024 02:06:14 +0800
In-Reply-To: <20240911180618.28921-1-quinkblack@foxmail.com>
References: <20240911180618.28921-1-quinkblack@foxmail.com>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH v2 10/14] aarch64/vvc: Add sad
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Zhao Zhili <zhilizhao@tencent.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

aarch64/vvc: Add SIMD | expand

Commit Message

Zhao Zhili Sept. 11, 2024, 6:06 p.m. UTC

From: Zhao Zhili <zhilizhao@tencent.com>

sad_8x16_c:                                              0.8 ( 1.00x)
sad_8x16_neon:                                           0.2 ( 3.00x)
sad_16x8_c:                                              0.5 ( 1.00x)
sad_16x8_neon:                                           0.2 ( 2.00x)
sad_16x16_c:                                             1.5 ( 1.00x)
sad_16x16_neon:                                          0.2 ( 6.00x)
---
 libavcodec/aarch64/vvc/Makefile   |  1 +
 libavcodec/aarch64/vvc/dsp_init.c |  5 +++
 libavcodec/aarch64/vvc/sad.S      | 75 +++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 libavcodec/aarch64/vvc/sad.S

diff --git a/libavcodec/aarch64/vvc/Makefile b/libavcodec/aarch64/vvc/Makefile
index a1c1f03e27..7ba13a2165 100644
--- a/libavcodec/aarch64/vvc/Makefile
+++ b/libavcodec/aarch64/vvc/Makefile
@@ -3,6 +3,7 @@  clean::
 
 OBJS-$(CONFIG_VVC_DECODER)              += aarch64/vvc/dsp_init.o
 NEON-OBJS-$(CONFIG_VVC_DECODER)         += aarch64/vvc/alf.o \
+                                           aarch64/vvc/sad.o \
                                            aarch64/h26x/epel_neon.o \
                                            aarch64/h26x/qpel_neon.o \
                                            aarch64/h26x/sao_neon.o
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index 934d918ffd..714d642634 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -39,6 +39,9 @@ 
 #include "alf_template.c"
 #undef BIT_DEPTH
 
+int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
+                    const int block_w, const int block_h); /* NEON sum of absolute differences, implemented in aarch64/vvc/sad.S */
+
 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -125,4 +128,6 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
     }
+
+    c->inter.sad = ff_vvc_sad_neon;
 }
diff --git a/libavcodec/aarch64/vvc/sad.S b/libavcodec/aarch64/vvc/sad.S
new file mode 100644
index 0000000000..beca876faf
--- /dev/null
+++ b/libavcodec/aarch64/vvc/sad.S
@@ -0,0 +1,75 @@ 
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkblack@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define VVC_MAX_PB_SIZE 128
+
+function ff_vvc_sad_neon, export=1
+        // int ff_vvc_sad_neon(src0, src1, dx, dy, block_w, block_h): returns the sum of
+        // |src0[x] - src1[x]| over one row in two. Scratch: w6-w8, v0-v3, v16-v17.
+        src0            .req x0         // const int16_t *, first input (advanced in place)
+        src1            .req x1         // const int16_t *, second input (advanced in place)
+        dx              .req w2         // signed horizontal offset, in samples
+        dy              .req w3         // signed vertical offset, in rows
+        block_w         .req w4         // below 16 selects the 8-wide loop, else the 16-wide one
+        block_h         .req w5         // rows to cover; decremented by two per iteration
+
+        sub             w7, dx, #4
+        sub             w8, dy, #4
+        add             w6, dx, dy, lsl #7              // w6 = dy * VVC_MAX_PB_SIZE + dx
+        add             w7, w7, w8, lsl #7              // w7 = (dy - 4) * VVC_MAX_PB_SIZE + (dx - 4)
+        add             src0, src0, w6, sxtw #1         // src0 += w6 samples (sign-extended, 2 bytes each)
+        sub             src1, src1, w7, sxtw #1         // src1 += (4 - dy) * VVC_MAX_PB_SIZE + (4 - dx) samples
+
+        cmp             block_w, #16
+        movi            v16.4s, #0                      // v16 = four 32-bit partial sums
+        b.ge            2f
+1:
+        // block_w == 8 (taken for any block_w below 16): one 8-sample row per pass
+        ldr             q0, [src0]
+        ldr             q2, [src1]
+        subs            block_h, block_h, #2            // flags are consumed by b.gt below
+        sabal           v16.4s, v0.4h, v2.4h            // samples 0-3
+        sabal2          v16.4s, v0.8h, v2.8h            // samples 4-7
+
+        add             src0, src0, #(2 * VVC_MAX_PB_SIZE * 2)  // step two rows of int16_t
+        add             src1, src1, #(2 * VVC_MAX_PB_SIZE * 2)
+        b.gt            1b                              // b.gt rather than b.ne: also stops on odd block_h
+        b               4f
+2:
+        // block_w == 16, no block_w > 16 according to the spec
+        movi            v17.4s, #0                      // second accumulator, used for samples 8-15
+3:
+        ldp             q0, q1, [src0], #(2 * VVC_MAX_PB_SIZE * 2)      // load 16 samples, then step two rows
+        ldp             q2, q3, [src1], #(2 * VVC_MAX_PB_SIZE * 2)
+        subs            block_h, block_h, #2            // flags are consumed by b.gt below
+        sabal           v16.4s, v0.4h, v2.4h            // samples 0-3
+        sabal2          v16.4s, v0.8h, v2.8h            // samples 4-7
+        sabal           v17.4s, v1.4h, v3.4h            // samples 8-11
+        sabal2          v17.4s, v1.8h, v3.8h            // samples 12-15
+
+        b.gt            3b                              // same termination rule as the 8-wide loop
+        add             v16.4s, v16.4s, v17.4s          // merge the two accumulators
+4:
+        addv            s16, v16.4s                     // horizontal sum of the four partial sums
+        mov             w0, v16.s[0]                    // return value
+        ret
+endfunc

[FFmpeg-devel,v2,10/14] aarch64/vvc: Add sad

Commit Message

Patch