[FFmpeg-devel,2/3] aarch64/vvc: Add put_qpel_vx

Message ID	tencent_FC9A5C40229A9F9FB9ABEE4E1970BC2D7408@qq.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Message-ID: <tencent_FC9A5C40229A9F9FB9ABEE4E1970BC2D7408@qq.com> From: Zhao Zhili <quinkblack@foxmail.com> To: ffmpeg-devel@ffmpeg.org Date: Wed, 11 Sep 2024 01:35:05 +0800 In-Reply-To: <20240910173506.28876-1-quinkblack@foxmail.com> References: <20240910173506.28876-1-quinkblack@foxmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH 2/3] aarch64/vvc: Add put_qpel_vx Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Zhao Zhili <zhilizhao@tencent.com> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,1/3] aarch64/h26x: Remove duplicate b.eq instruction \| expand [FFmpeg-devel,1/3] aarch64/h26x: Remove duplicate b.eq instruction [FFmpeg-devel,2/3] aarch64/vvc: Add put_qpel_vx [FFmpeg-devel,3/3] aarch64/vvc: Add put_qpel_hv

Message ID

tencent_FC9A5C40229A9F9FB9ABEE4E1970BC2D7408@qq.com

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
Message-ID: <tencent_FC9A5C40229A9F9FB9ABEE4E1970BC2D7408@qq.com>
From: Zhao Zhili <quinkblack@foxmail.com>
To: ffmpeg-devel@ffmpeg.org
Date: Wed, 11 Sep 2024 01:35:05 +0800
In-Reply-To: <20240910173506.28876-1-quinkblack@foxmail.com>
References: <20240910173506.28876-1-quinkblack@foxmail.com>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH 2/3] aarch64/vvc: Add put_qpel_vx
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: Zhao Zhili <zhilizhao@tencent.com>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel,1/3] aarch64/h26x: Remove duplicate b.eq instruction | expand

Context	Check	Description
yinshiyou/configure_loongarch64	warning	Failed to apply patch
andriy/configure_x86	warning	Failed to apply patch

Context

Check

Description

yinshiyou/configure_loongarch64

warning

Failed to apply patch

andriy/configure_x86

warning

Failed to apply patch

Commit Message

Zhao Zhili Sept. 10, 2024, 5:35 p.m. UTC

From: Zhao Zhili <zhilizhao@tencent.com>

put_luma_v_8_4x4_c:                                      1.0 ( 1.00x)
put_luma_v_8_4x4_neon:                                   0.0 ( 0.00x)
put_luma_v_8_8x8_c:                                      3.5 ( 1.00x)
put_luma_v_8_8x8_neon:                                   0.5 ( 7.00x)
put_luma_v_8_16x16_c:                                   13.8 ( 1.00x)
put_luma_v_8_16x16_neon:                                 1.2 (11.00x)
put_luma_v_8_32x32_c:                                   54.2 ( 1.00x)
put_luma_v_8_32x32_neon:                                 5.0 (10.85x)
put_luma_v_8_64x64_c:                                  217.5 ( 1.00x)
put_luma_v_8_64x64_neon:                                18.8 (11.60x)
put_luma_v_8_128x128_c:                                886.2 ( 1.00x)
put_luma_v_8_128x128_neon:                              74.0 (11.98x)
---
 libavcodec/aarch64/h26x/dsp.h       |   8 +++
 libavcodec/aarch64/h26x/qpel_neon.S | 100 ++++++++++++++++++++++++++++
 libavcodec/aarch64/vvc/dsp_init.c   |   7 ++
 3 files changed, 115 insertions(+)

Comments

Martin Storsjö Sept. 11, 2024, 12:27 p.m. UTC | #1

On Wed, 11 Sep 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao@tencent.com>
>
> put_luma_v_8_4x4_c:                                      1.0 ( 1.00x)
> put_luma_v_8_4x4_neon:                                   0.0 ( 0.00x)
> put_luma_v_8_8x8_c:                                      3.5 ( 1.00x)
> put_luma_v_8_8x8_neon:                                   0.5 ( 7.00x)
> put_luma_v_8_16x16_c:                                   13.8 ( 1.00x)
> put_luma_v_8_16x16_neon:                                 1.2 (11.00x)
> put_luma_v_8_32x32_c:                                   54.2 ( 1.00x)
> put_luma_v_8_32x32_neon:                                 5.0 (10.85x)
> put_luma_v_8_64x64_c:                                  217.5 ( 1.00x)
> put_luma_v_8_64x64_neon:                                18.8 (11.60x)
> put_luma_v_8_128x128_c:                                886.2 ( 1.00x)
> put_luma_v_8_128x128_neon:                              74.0 (11.98x)
> ---
> libavcodec/aarch64/h26x/dsp.h       |   8 +++
> libavcodec/aarch64/h26x/qpel_neon.S | 100 ++++++++++++++++++++++++++++
> libavcodec/aarch64/vvc/dsp_init.c   |   7 ++
> 3 files changed, 115 insertions(+)

This doesn't look harmful, and it looks like the rest of these functions, so
I guess it's acceptable. Let it be known that I very much dislike the
structure of these functions, but you're adding more in the same style as
the old ones, so I guess that's OK.

// Martin

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index 323a253257..881091f39a 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -274,4 +274,12 @@  NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst,
         const uint8_t *_src, ptrdiff_t _srcstride, int height,
         const int8_t *hf, const int8_t *vf, int width), _i8mm);
 
+void ff_vvc_put_qpel_v4_8_neon(int16_t *dst, const uint8_t *_src,
+                               ptrdiff_t _srcstride, int height,
+                               const int8_t *hf, const int8_t *vf, int width);
+
+void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src,
+                               ptrdiff_t _srcstride, int height,
+                               const int8_t *hf, const int8_t *vf, int width);
+
 #endif
diff --git a/libavcodec/aarch64/h26x/qpel_neon.S b/libavcodec/aarch64/h26x/qpel_neon.S
index 7868811b3b..671942109a 100644
--- a/libavcodec/aarch64/h26x/qpel_neon.S
+++ b/libavcodec/aarch64/h26x/qpel_neon.S
@@ -86,6 +86,11 @@  endconst
         sxtl            v0.8h, v0.8b
 .endm
 
+.macro vvc_load_qpel_filterh freg
+        ld1             {v0.8b}, [\freg]            // read eight signed 8-bit filter taps from the address held in freg
+        sxtl            v0.8h, v0.8b                // sign-extend them to 16 bit so each tap is addressable as v0.h[0..7]
+.endm
+
 .macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
         smull           \dst\().4s, \src0\().4h, v0.h[0]
         smlal           \dst\().4s, \src1\().4h, v0.h[1]
@@ -95,11 +100,15 @@  endconst
         smlal           \dst\().4s, \src5\().4h, v0.h[5]
         smlal           \dst\().4s, \src6\().4h, v0.h[6]
         smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.ifc \op, sqxtn
+        sqxtn           \dst\().4h, \dst\().4s
+.else
 .ifc \op, sshr
         sshr            \dst\().4s, \dst\().4s, \shift
 .else
         \op             \dst\().4h, \dst\().4s, \shift
 .endif
+.endif
 .endm
 
 .macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
@@ -111,11 +120,15 @@  endconst
         smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
         smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
         smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
+.ifc \op, sqxtn2
+        sqxtn2          \dst\().8h, \dstt\().4s
+.else
 .ifc \op, sshr
         sshr            \dst\().4s, \dstt\().4s, \shift
 .else
         \op             \dst\().8h, \dstt\().4s, \shift
 .endif
+.endif
 .endm
 
 .macro calc_all
@@ -1000,6 +1013,93 @@  function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
         ret
 endfunc
 
+/* ff_hevc_put_hevc_qpel_vx requires the filter coefficients to have the
+ * sign pattern [-, +, -, +, +, -, +, -]. VVC filters do not meet that
+ * requirement, so these functions use signed 16-bit multiply-accumulates.
+ */
+function ff_vvc_put_qpel_v4_8_neon, export=1
+        vvc_load_qpel_filterh x5                    // x5 = vf (6th argument), taps end up sign-extended in v0.8h
+        sub             x1, x1, x2, lsl #1          // x1 = src - 2 * srcstride
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step added to dst after every stored row
+        sub             x1, x1, x2                  // x1 = src - 3 * srcstride, three rows above the block start
+        ldr             s16, [x1]                   // rows 0 and 1 of the filter window, four pixels each
+        ldr             s17, [x1, x2]
+        add             x1, x1, x2, lsl #1
+        ldr             s18, [x1]                   // rows 2 and 3
+        ldr             s19, [x1, x2]
+        uxtl            v16.8h, v16.8b              // zero-extend the 8-bit pixels to 16 bit for the signed multiplies
+        uxtl            v17.8h, v17.8b
+        add             x1, x1, x2, lsl #1
+        ldr             s20, [x1]                   // rows 4 and 5
+        ldr             s21, [x1, x2]
+        uxtl            v18.8h, v18.8b
+        uxtl            v19.8h, v19.8b
+        add             x1, x1, x2, lsl #1
+        ldr             s22, [x1]                   // row 6, the last of the seven preloaded rows
+        add             x1, x1, x2                  // x1 now addresses row 7, which the calc macro loads
+        uxtl            v20.8h, v20.8b
+        uxtl            v21.8h, v21.8b
+        uxtl            v22.8h, v22.8b
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().s}[0], [x1], x2    // load the next source row (4 pixels) and advance x1 by one stride
+        uxtl            \tmp\().8h, \tmp\().8b      // widen the new row to 16 bit
+        calc_qpelh      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn  // 8-tap sum of the low four lanes, narrowed to int16 with saturation and no shift
+        subs            w3, w3, #1                  // one output row done, sets the flags for the loop exit test
+        st1             {v24.4h}, [x0], x9          // store four int16 results and step dst by x9 bytes
+.endm
+1:                                                  // NOTE(review): calc_all is defined outside this hunk - presumably it expands calc with rotating row registers and loops back here, confirm
+        calc_all
+.purgem calc
+2:                                                  // NOTE(review): presumably the exit target that calc_all branches to once w3 reaches zero, confirm
+        ret
+endfunc
+
+function ff_vvc_put_qpel_v8_8_neon, export=1
+        vvc_load_qpel_filterh x5                    // x5 = vf (6th argument), taps end up sign-extended in v0.8h
+        sub             x1, x1, x2, lsl #1          // x1 = src - 2 * srcstride
+        sub             x1, x1, x2                  // x1 = src - 3 * srcstride, three rows above the block start
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step added to dst after every stored row
+0:                                                  // outer loop, one pass per 8-pixel-wide column strip
+        mov             x8, x1                      // x8 = working source pointer for this strip
+        ldr             d16, [x8]                   // rows 0 and 1 of the filter window, eight pixels each
+        ldr             d17, [x8, x2]
+        mov             x10, x0                     // x10 = working destination pointer for this strip
+        mov             w11, w3                     // w11 = rows left in this strip, reloaded from height each pass
+        add             x8, x8, x2, lsl #1
+        ldr             d18, [x8]                   // rows 2 and 3
+        ldr             d19, [x8, x2]
+        uxtl            v16.8h, v16.8b              // zero-extend the 8-bit pixels to 16 bit for the signed multiplies
+        uxtl            v17.8h, v17.8b
+        add             x8, x8, x2, lsl #1
+        ldr             d20, [x8]                   // rows 4 and 5
+        ldr             d21, [x8, x2]
+        uxtl            v18.8h, v18.8b
+        uxtl            v19.8h, v19.8b
+        add             x8, x8, x2, lsl #1
+        ldr             d22, [x8]                   // row 6, the last of the seven preloaded rows
+        add             x8, x8, x2                  // x8 now addresses row 7, which the calc macro loads
+        uxtl            v20.8h, v20.8b
+        uxtl            v21.8h, v21.8b
+        uxtl            v22.8h, v22.8b
+.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
+        ld1             {\tmp\().8b}, [x8], x2      // load the next source row (8 pixels) and advance x8 by one stride
+        uxtl            \tmp\().8h, \tmp\().8b      // widen the new row to 16 bit
+        calc_qpelh      v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn  // lanes 0-3, saturating narrow into the low half of v24
+        calc_qpelh2     v24, v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn2  // lanes 4-7 accumulated in v25, saturating narrow into the high half of v24
+        subs            w11, w11, #1                // one output row done, sets the flags for the loop exit test
+        st1             {v24.8h}, [x10], x9         // store eight int16 results and step the strip dst pointer by x9 bytes
+.endm
+1:                                                  // NOTE(review): calc_all is defined outside this hunk - presumably it expands calc with rotating row registers and loops back here, confirm
+        calc_all
+.purgem calc
+2:                                                  // NOTE(review): presumably reached from calc_all once w11 hits zero, confirm
+        sub             w6, w6, #8                  // w6 = width (7th argument), eight columns are now finished
+        add             x0, x0, #16                 // dst moves right by 16 bytes, i.e. eight int16 samples
+        add             x1, x1, #8                  // src moves right by eight pixels
+        cbnz            w6, 0b                      // loop until width is exactly zero, so width has to be a positive multiple of 8
+        ret
+endfunc
+
 function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
         load_qpel_filterb x7, x6
         sub             x2, x2, x3, lsl #1
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index bcc7df8f6c..ba3a49aa1a 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -60,6 +60,13 @@  void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][0][1] =
         c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
 
+        c->inter.put[0][1][1][0] = ff_vvc_put_qpel_v4_8_neon;
+        c->inter.put[0][2][1][0] =
+        c->inter.put[0][3][1][0] =
+        c->inter.put[0][4][1][0] =
+        c->inter.put[0][5][1][0] =
+        c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
+
         c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
         c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
         c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;

[FFmpeg-devel,2/3] aarch64/vvc: Add put_qpel_vx

Checks

Commit Message

Comments

Patch