[FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels

Message ID	20240705182816.27464-1-remi@remlab.net
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net> To: ffmpeg-devel@ffmpeg.org Date: Fri, 5 Jul 2024 21:28:16 +0300 Message-ID: <20240705182816.27464-1-remi@remlab.net> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V 8-bit h264_weight_pixels Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels \| expand [FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels [FFmpeg-devel,2/2] lavc/h264dsp: R-V V 8-bit h264_biweight_pixels

Message ID

20240705182816.27464-1-remi@remlab.net

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
From: =?utf-8?q?R=C3=A9mi_Denis-Courmont?= <remi@remlab.net>
To: ffmpeg-devel@ffmpeg.org
Date: Fri,  5 Jul 2024 21:28:16 +0300
Message-ID: <20240705182816.27464-1-remi@remlab.net>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V 8-bit h264_weight_pixels
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels | expand

Checks

Context	Check	Description
yinshiyou/make_loongarch64	success	Make finished
yinshiyou/make_fate_loongarch64	fail	Make fate failed
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

Context

Check

Description

yinshiyou/make_loongarch64

success

Make finished

yinshiyou/make_fate_loongarch64

fail

Make fate failed

andriy/make_x86

success

Make finished

andriy/make_fate_x86

success

Make fate finished

Commit Message

Rémi Denis-Courmont July 5, 2024, 6:28 p.m. UTC

There are two implementations here:
- a generic scalable one processing one column at a time,
- a specialised one processing one (fixed-size) row at a time.

Unsurprisingly, the generic one works out better with smaller widths.
With larger widths, the gains from filling vectors are outweighed by
the extra cost of strided loads and stores. In other words, memory
accesses become the bottleneck.

T-Head C908:
h264_weight2_8_c:        54.2
h264_weight2_8_rvv_i32:  17.5
h264_weight4_8_c:       102.0
h264_weight4_8_rvv_i32:  34.7
h264_weight8_8_c:       213.7
h264_weight8_8_rvv_i32:  79.7
h264_weight16_8_c:      401.0
h264_weight16_8_rvv_i32: 74.2

SpacemiT X60:
h264_weight2_8_c:        48.5
h264_weight2_8_rvv_i32:  11.7
h264_weight4_8_c:        90.5
h264_weight4_8_rvv_i32:  23.7
h264_weight8_8_c:       175.0
h264_weight8_8_rvv_i32:  58.0
h264_weight16_8_c:      342.2
h264_weight16_8_rvv_i32: 66.0
---
 libavcodec/riscv/h264dsp_init.c |  7 +++
 libavcodec/riscv/h264dsp_rvv.S  | 77 +++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

Comments

Rémi Denis-Courmont July 6, 2024, 10:52 a.m. UTC | #1

Superseded.

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index bf9743eb6b..e1b725dcbb 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -21,12 +21,15 @@ 
 #include "config.h"
 
 #include <stdint.h>
+#include <string.h>
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h264dsp.h"
 
+extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
+
 void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@@ -60,6 +63,10 @@  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 # if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32) {
         if (bit_depth == 8 && ff_rv_vlen_least(128)) {
+            memcpy(dsp->weight_h264_pixels_tab,
+                   ff_h264_weight_funcs_8_rvv,
+                   sizeof (dsp->weight_h264_pixels_tab));
+
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma_mbaff =
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 96a8a0a8a3..ab85bfbd69 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -26,6 +26,83 @@ 
 
 #include "libavutil/riscv/asm.S"
 
+func ff_h264_weight_pixels_simple_8_rvv, zve32x
+        csrwi   vxrm, 0  # Row-at-a-time weighting of a6 bytes per row, in place; fixed-point rounding mode 0 (round-to-nearest-up)
+        sll     a5, a5, a3  # a5 <<= a3 once, outside the loop (presumably offset << log2_denom; confirm against h264_weight_func)
+1:
+        vsetvli zero, a6, e32, m4, ta, ma  # vl = a6 (a6 is loaded by the per-width entry stubs; assumes it fits in VLMAX)
+        vle8.v  v8, (a0)  # load one row of bytes from a0
+        addi    a2, a2, -1  # a2 counts the rows left
+        vmv.v.x v16, a5  # every 32-bit accumulator starts at the shifted a5
+        vsetvli zero, zero, e16, m2, ta, ma  # keep vl, switch to 16-bit elements (same SEW/LMUL ratio)
+        vzext.vf2   v24, v8  # zero-extend the bytes to 16 bits
+        vwmaccsu.vx v16, a4, v24  # 32-bit v16 += signed a4 * unsigned v24
+        vnclip.wi   v16, v16, 0  # narrow to 16 bits with signed saturation, no shift
+        vmax.vx v16, v16, zero  # clamp negative results to 0
+        vsetvli zero, zero, e8, m1, ta, ma  # keep vl, switch to 8-bit elements
+        vnclipu.wx  v8, v16, a3  # shift right by a3 with rounding, saturate to unsigned 8 bits
+        vse8.v  v8, (a0)  # write the row back where it was read
+        add     a0, a0, a1  # advance a0 by a1 bytes to the next row
+        bnez    a2, 1b  # repeat until a2 reaches 0
+
+        ret
+endfunc
+
+func ff_h264_weight_pixels_8_rvv, zve32x
+        csrwi   vxrm, 0  # Column-at-a-time weighting, in place, using strided accesses; rounding mode 0 (round-to-nearest-up)
+        sll     a5, a5, a3  # a5 <<= a3 once, outside the loops (presumably offset << log2_denom; confirm against h264_weight_func)
+1:
+        mv      t0, a0  # t0 = address of the current column, starting at a0
+        mv      t6, a6  # t6 = columns left (a6 is loaded by the per-width entry stubs)
+2:
+        vsetvli t2, a2, e32, m8, ta, ma  # t2 = vl = number of rows handled in this pass (at most a2)
+        vlse8.v v8, (t0), a1  # strided load: one byte every a1 bytes, i.e. one column
+        addi    t6, t6, -1  # one fewer column to go
+        vmv.v.x v16, a5  # every 32-bit accumulator starts at the shifted a5
+        vsetvli zero, zero, e16, m4, ta, ma  # keep vl, switch to 16-bit elements (same SEW/LMUL ratio)
+        vzext.vf2   v24, v8  # zero-extend the bytes to 16 bits
+        vwmaccsu.vx v16, a4, v24  # 32-bit v16 += signed a4 * unsigned v24
+        vnclip.wi   v16, v16, 0  # narrow to 16 bits with signed saturation, no shift
+        vmax.vx v16, v16, zero  # clamp negative results to 0
+        vsetvli zero, zero, e8, m2, ta, ma  # keep vl, switch to 8-bit elements
+        vnclipu.wx  v8, v16, a3  # shift right by a3 with rounding, saturate to unsigned 8 bits
+        vsse8.v v8, (t0), a1  # strided store back to the same column
+        addi    t0, t0, 1  # move one byte over to the next column
+        bnez    t6, 2b  # inner loop: until all a6 columns are done
+
+        mul     t3, a1, t2  # t3 = a1 * t2, the bytes spanned by the rows just processed
+        sub     a2, a2, t2  # rows still to process
+        add     a0, a0, t3  # advance a0 past the processed rows
+        bnez    a2, 1b  # outer loop: another pass if rows remain
+
+        ret
+endfunc
+
+.irp    w, 16, 8, 4, 2
+func ff_h264_weight_pixels\w\()_8_rvv, zve32x
+        li      a6, \w  # entry stub: put this width in a6, then tail-jump to a shared routine that reads it
+        .if     \w == 16
+        j       ff_h264_weight_pixels_simple_8_rvv  # width 16 uses the row-at-a-time routine
+        .else
+        j       ff_h264_weight_pixels_8_rvv  # widths 8, 4 and 2 use the column-at-a-time routine
+        .endif
+endfunc
+.endr
+
+        .global ff_h264_weight_funcs_8_rvv  # table of the four entry points, in the order 16, 8, 4, 2
+        .hidden ff_h264_weight_funcs_8_rvv  # declared extern in h264dsp_init.c and copied into the DSP context there
+const ff_h264_weight_funcs_8_rvv
+        .irp    w, 16, 8, 4, 2
+#if __riscv_xlen == 32
+        .word   ff_h264_weight_pixels\w\()_8_rvv  # 32-bit pointer entry
+#elif __riscv_xlen == 64
+        .dword  ff_h264_weight_pixels\w\()_8_rvv  # 64-bit pointer entry
+#else
+        .qword  ff_h264_weight_pixels\w\()_8_rvv  # 128-bit pointer entry for any other XLEN
+#endif
+        .endr
+endconst
+
         .variant_cc ff_h264_loop_filter_luma_8_rvv
 func ff_h264_loop_filter_luma_8_rvv, zve32x
         # p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13

[FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels

Checks

Commit Message

Comments

Patch