diff mbox series

[FFmpeg-devel,2/2] lavc/h264dsp: R-V V 8-bit h264_biweight_pixels

Message ID 20240706105234.145689-2-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,PATCHv2,1/2] lavc/h264dsp: R-V V 8-bit h264_weight_pixels | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont July 6, 2024, 10:52 a.m. UTC
T-Head C908:
h264_biweight2_8_c:        58.0
h264_biweight2_8_rvv_i32:  11.2
h264_biweight4_8_c:       106.0
h264_biweight4_8_rvv_i32:  22.7
h264_biweight8_8_c:       205.7
h264_biweight8_8_rvv_i32:  50.0
h264_biweight16_8_c:      403.5
h264_biweight16_8_rvv_i32: 83.2

SpacemiT X60:
h264_weight2_8_c:          48.2
h264_weight2_8_rvv_i32:     8.2
h264_weight4_8_c:          90.5
h264_weight4_8_rvv_i32:    16.5
h264_weight8_8_c:         175.2
h264_weight8_8_rvv_i32:    38.0
h264_weight16_8_c:        342.2
h264_weight16_8_rvv_i32:   66.0
---
 libavcodec/riscv/h264dsp_init.c | 14 ++++--
 libavcodec/riscv/h264dsp_rvv.S  | 88 +++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index e1b725dcbb..88afec8df0 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -28,7 +28,10 @@ 
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h264dsp.h"
 
-extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
+extern const struct { /* one entry per block width; table is emitted in h264dsp_rvv.S for widths 16, 8, 4, 2 */
+    const h264_weight_func weight; /* first pointer of each entry: ff_h264_weight_pixels<w>_8_rvv */
+    const h264_biweight_func biweight; /* second pointer of each entry: ff_h264_biweight_pixels<w>_8_rvv */
+} ff_h264_weight_funcs_8_rvv[]; /* layout must stay in sync with the pointer pairs emitted by the assembly table */
 
 void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
@@ -63,9 +66,12 @@  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 # if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32) {
         if (bit_depth == 8 && ff_rv_vlen_least(128)) {
-            memcpy(dsp->weight_h264_pixels_tab,
-                   ff_h264_weight_funcs_8_rvv,
-                   sizeof (dsp->weight_h264_pixels_tab));
+            for (int i = 0; i < 4; i++) {
+                dsp->weight_h264_pixels_tab[i] =
+                    ff_h264_weight_funcs_8_rvv[i].weight;
+                dsp->biweight_h264_pixels_tab[i] =
+                    ff_h264_weight_funcs_8_rvv[i].biweight;
+            }
 
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index bbcbf2e4de..5c3931569b 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -48,6 +48,35 @@  func ff_h264_weight_pixels_simple_8_rvv, zve32x
         ret
 endfunc
 
+        .variant_cc ff_h264_biweight_pixels_simple_8_rvv // nonstandard calling convention: t6 carries the per-row element count
+func ff_h264_biweight_pixels_simple_8_rvv, zve32x // row-at-a-time blend of (a0) and (a1) into (a0); NOTE(review): a0..a7 presumably follow the h264_biweight_func C parameter order - confirm
+        csrwi   vxrm, 2 // fixed-point rounding mode 2 = round-down (truncate) for the narrowing clips below
+        addi    a7, a7, 1
+        ori     a7, a7, 1 // a7 = (a7 + 1) | 1
+        sll     a7, a7, a4 // ... << a4: bias term, loaded into the accumulator before the products are added
+1: // per-row loop, a3 rows in total
+        vsetvli zero, t6, e32, m4, ta, ma // vl = t6, configured for the widest (32-bit) element size used in the loop
+        vle8.v  v8, (a0) // one row of bytes from a0 (also the store target)
+        addi    a3, a3, -1 // one fewer row remaining
+        vle8.v  v12, (a1) // one row of bytes from a1
+        add     a1, a1, a2 // step a1 to the next row (stride a2)
+        vmv.v.x v16, a7 // 32-bit accumulators start at the bias
+        vsetvli zero, zero, e16, m2, ta, ma // same vl, 16-bit elements
+        vzext.vf2   v24, v8 // zero-extend the a0 bytes to 16 bits
+        vzext.vf2   v28, v12 // zero-extend the a1 bytes to 16 bits
+        vwmaccsu.vx v16, a5, v24 // acc += signed(a5) * unsigned(a0 row), widened to 32 bits
+        vwmaccsu.vx v16, a6, v28 // acc += signed(a6) * unsigned(a1 row), widened to 32 bits
+        vnclip.wx   v16, v16, a4 // acc >> a4, saturated to signed 16 bits
+        vmax.vx v16, v16, zero // clamp negative results to 0 before the unsigned narrowing
+        vsetvli zero, zero, e8, m1, ta, ma // same vl, 8-bit elements
+        vnclipu.wi  v8, v16, 1 // one further >> 1, saturated to unsigned 8 bits
+        vse8.v  v8, (a0) // write the blended row back over a0
+        add     a0, a0, a2 // step a0 to the next row (stride a2)
+        bnez    a3, 1b // loop until every row is done
+
+        ret
+endfunc
+
 func ff_h264_weight_pixels_8_rvv, zve32x
         csrwi   vxrm, 0
         sll     a5, a5, a3
@@ -84,6 +113,53 @@  func ff_h264_weight_pixels_8_rvv, zve32x
         ret
 endfunc
 
+        .variant_cc ff_h264_biweight_pixels_8_rvv // nonstandard calling convention: t6 carries the block width in bytes
+func ff_h264_biweight_pixels_8_rvv, zve32x // column-pair blend of (a0) and (a1) into (a0), vectorised down the rows; NOTE(review): a0..a7 presumably follow the h264_biweight_func C parameter order - confirm
+        csrwi   vxrm, 2 // fixed-point rounding mode 2 = round-down (truncate) for the narrowing clips below
+        addi    a7, a7, 1
+        ori     a7, a7, 1 // a7 = (a7 + 1) | 1
+        sll     a7, a7, a4 // ... << a4: bias term, loaded into the accumulators before the products are added
+1: // outer loop: one pass per group of up to VLMAX rows
+        mv      t0, a0 // t0 = column cursor into the a0 rows
+        mv      t1, a1 // t1 = column cursor into the a1 rows
+        mv      t5, t6 // t5 = columns left in this pass
+2: // inner loop: two adjacent columns per iteration
+        vsetvli t2, a3, e32, m8, ta, ma // t2 = vl = rows handled this pass (a3 rows remain), sized for 32-bit elements
+        vlsseg2e8.v v0, (t0), a2 // per row (stride a2) load 2 adjacent bytes from the a0 side: column 0 -> v0, column 1 -> v2
+        vlsseg2e8.v v4, (t1), a2 // same for the a1 side: column 0 -> v4, column 1 -> v6
+        addi    t5, t5, -2 // two columns consumed
+        vmv.v.x v16, a7 // 32-bit accumulators for column 0 start at the bias
+        vmv.v.x v24, a7 // 32-bit accumulators for column 1 start at the bias
+        vsetvli zero, zero, e16, m4, ta, ma // same vl, 16-bit elements
+        vzext.vf2   v8, v0 // zero-extend a0-side column 0 to 16 bits
+        vzext.vf2   v12, v2 // zero-extend a0-side column 1 to 16 bits
+        vwmaccsu.vx v16, a5, v8 // acc0 += signed(a5) * unsigned(a0-side column 0)
+        vwmaccsu.vx v24, a5, v12 // acc1 += signed(a5) * unsigned(a0-side column 1)
+        vzext.vf2   v8, v4 // reuse v8/v12: zero-extend a1-side column 0
+        vzext.vf2   v12, v6 // zero-extend a1-side column 1
+        vwmaccsu.vx v16, a6, v8 // acc0 += signed(a6) * unsigned(a1-side column 0)
+        vwmaccsu.vx v24, a6, v12 // acc1 += signed(a6) * unsigned(a1-side column 1)
+        vnclip.wx   v8, v16, a4 // acc0 >> a4, saturated to signed 16 bits
+        vnclip.wx   v12, v24, a4 // acc1 >> a4, saturated to signed 16 bits
+        vmax.vx v8, v8, zero // clamp negative results to 0 before the unsigned narrowing
+        vmax.vx v12, v12, zero
+        vsetvli zero, zero, e8, m2, ta, ma // same vl, 8-bit elements
+        vnclipu.wi  v0, v8, 1 // one further >> 1, saturated to unsigned 8 bits (column 0)
+        vnclipu.wi  v2, v12, 1 // likewise for column 1
+        vssseg2e8.v v0, (t0), a2 // strided segment store: write both columns back over the a0 rows
+        addi    t0, t0, 2 // advance both cursors by the two columns just processed
+        addi    t1, t1, 2
+        bnez    t5, 2b // more columns in this pass
+
+        mul     t3, a2, t2 // t3 = byte distance covered by the t2 rows just processed
+        sub     a3, a3, t2 // rows still to do
+        add     a0, a0, t3 // move both base pointers past the processed rows
+        add     a1, a1, t3
+        bnez    a3, 1b // another pass if rows remain
+
+        ret
+endfunc
+
 .irp    w, 16, 8, 4, 2
 func ff_h264_weight_pixels\w\()_8_rvv, zve32x
         li      a6, \w
@@ -93,6 +169,15 @@  func ff_h264_weight_pixels\w\()_8_rvv, zve32x
         j       ff_h264_weight_pixels_8_rvv
         .endif
 endfunc
+
+func ff_h264_biweight_pixels\w\()_8_rvv, zve32x // per-width entry point; instantiated by the enclosing .irp for each width \w
+        li      t6, \w // pass the block width in t6, the extra (nonstandard) argument both shared routines read
+        .if     \w == 16 // 16-wide blocks use the row-at-a-time routine
+        j       ff_h264_biweight_pixels_simple_8_rvv // tail-jump: the shared routine returns straight to our caller
+        .else // narrower blocks use the column-pair routine that vectorises down the rows
+        j       ff_h264_biweight_pixels_8_rvv // tail-jump: the shared routine returns straight to our caller
+        .endif
+endfunc
 .endr
 
         .global ff_h264_weight_funcs_8_rvv
@@ -101,10 +186,13 @@  const ff_h264_weight_funcs_8_rvv
         .irp    w, 16, 8, 4, 2
 #if __riscv_xlen == 32
         .word   ff_h264_weight_pixels\w\()_8_rvv
+        .word   ff_h264_biweight_pixels\w\()_8_rvv
 #elif __riscv_xlen == 64
         .dword  ff_h264_weight_pixels\w\()_8_rvv
+        .dword  ff_h264_biweight_pixels\w\()_8_rvv
 #else
         .qword  ff_h264_weight_pixels\w\()_8_rvv
+        .qword  ff_h264_biweight_pixels\w\()_8_rvv
 #endif
         .endr
 endconst