diff mbox series

[FFmpeg-devel,2/2] lavc/h264dsp: R-V V 8-bit h264_biweight_pixels

Message ID 20240705202349.51307-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel] lavc/h264dsp: R-V V 8-bit h264_weight_pixels | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 fail Make fate failed
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont July 5, 2024, 8:23 p.m. UTC
T-Head C908:
h264_biweight2_8_c:        65.2
h264_biweight2_8_rvv_i32:  24.0
h264_biweight4_8_c:       135.2
h264_biweight4_8_rvv_i32:  48.0
h264_biweight8_8_c:       231.5
h264_biweight8_8_rvv_i32: 104.7
h264_biweight16_8_c:      454.0
h264_biweight16_8_rvv_i32: 93.7

SpacemiT X60:
h264_biweight2_8_c:        57.7
h264_biweight2_8_rvv_i32:  16.7
h264_biweight4_8_c:       106.0
h264_biweight4_8_rvv_i32:  33.7
h264_biweight8_8_c:       205.7
h264_biweight8_8_rvv_i32:  77.7
h264_biweight16_8_c:      403.5
h264_biweight16_8_rvv_i32: 83.2
---
 libavcodec/riscv/h264dsp_init.c | 14 ++++--
 libavcodec/riscv/h264dsp_rvv.S  | 78 +++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index e1b725dcbb..88afec8df0 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -28,7 +28,10 @@ 
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h264dsp.h"
 
-extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
+extern const struct {
+    const h264_weight_func weight;     /* ff_h264_weight_pixels<w>_8_rvv */
+    const h264_biweight_func biweight; /* ff_h264_biweight_pixels<w>_8_rvv */
+} ff_h264_weight_funcs_8_rvv[]; /* built in h264dsp_rvv.S, w = 16, 8, 4, 2 */
 
 void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
@@ -63,9 +66,12 @@  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 # if HAVE_RVV
     if (flags & AV_CPU_FLAG_RVV_I32) {
         if (bit_depth == 8 && ff_rv_vlen_least(128)) {
-            memcpy(dsp->weight_h264_pixels_tab,
-                   ff_h264_weight_funcs_8_rvv,
-                   sizeof (dsp->weight_h264_pixels_tab));
+            for (int i = 0; i < 4; i++) { /* table order: 16, 8, 4, 2 wide */
+                dsp->weight_h264_pixels_tab[i] =
+                    ff_h264_weight_funcs_8_rvv[i].weight;
+                dsp->biweight_h264_pixels_tab[i] =
+                    ff_h264_weight_funcs_8_rvv[i].biweight;
+            }
 
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index ab85bfbd69..6cbc699b21 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -48,6 +48,43 @@  func ff_h264_weight_pixels_simple_8_rvv, zve32x
         ret
 endfunc
 
+        // Row-wise 8-bit biweight helper, entered by a tail jump with the
+        // block width in t6 (ff_h264_biweight_pixels16_8_rvv sets it to 16):
+        //   dst[x] = clip_u8((dst[x] * a5 + src[x] * a6 + off)
+        //                    >> (log2_denom + 1))
+        //   off    = ((offset + 1) | 1) << log2_denom
+        // In: a0 = dst, a1 = src, a2 = stride, a3 = height, a4 = log2_denom,
+        //     a5 = weight for dst, a6 = weight for src, a7 = offset.
+        .variant_cc ff_h264_biweight_pixels_simple_8_rvv
+func ff_h264_biweight_pixels_simple_8_rvv, zve32x
+        csrwi   vxrm, 2                 // round-down: the shift must truncate
+        addi    a7, a7, 1
+        ori     a7, a7, 1               // a7 = (offset + 1) | 1
+        sll     a7, a7, a4              // scale by log2_denom (a3 is the height)
+        addi    a4, a4, 1               // a4 = log2_denom + 1, the final shift
+1:
+        vsetvli zero, t6, e32, m4, ta, ma
+        vle8.v  v8, (a0)                // v8 = dst row
+        addi    a3, a3, -1              // one row fewer to do
+        vle8.v  v12, (a1)               // v12 = src row
+        add     a1, a1, a2
+        vmv.v.x v16, a7                 // 32-bit accumulators start at off
+        vsetvli zero, zero, e16, m2, ta, ma
+        vzext.vf2   v24, v8
+        vzext.vf2   v28, v12
+        vwmaccsu.vx v16, a5, v24        // acc += a5 * dst
+        vwmaccsu.vx v16, a6, v28        // acc += a6 * src
+        vnclip.wx   v16, v16, a4        // shift at 32 bits, then saturate to s16
+        vmax.vx v16, v16, zero          // clamp negative results to 0
+        vsetvli zero, zero, e8, m1, ta, ma
+        vnclipu.wi  v8, v16, 0          // saturate to u8
+        vse8.v  v8, (a0)
+        add     a0, a0, a2
+        bnez    a3, 1b
+
+        ret
+endfunc
+
 func ff_h264_weight_pixels_8_rvv, zve32x
         csrwi   vxrm, 0
         sll     a5, a5, a3
@@ -78,6 +106,54 @@  func ff_h264_weight_pixels_8_rvv, zve32x
         ret
 endfunc
 
+        // Column-wise 8-bit biweight helper for the narrower blocks, entered
+        // by a tail jump with the block width in t6. Each vector holds one
+        // pixel column of up to t2 rows, accessed with stride a2:
+        //   dst[x] = clip_u8((dst[x] * a5 + src[x] * a6 + off)
+        //                    >> (log2_denom + 1))
+        //   off    = ((offset + 1) | 1) << log2_denom
+        // In: a0 = dst, a1 = src, a2 = stride, a3 = height, a4 = log2_denom,
+        //     a5 = weight for dst, a6 = weight for src, a7 = offset.
+        .variant_cc ff_h264_biweight_pixels_8_rvv
+func ff_h264_biweight_pixels_8_rvv, zve32x
+        csrwi   vxrm, 2                 // round-down: the shift must truncate
+        addi    a7, a7, 1
+        ori     a7, a7, 1               // a7 = (offset + 1) | 1
+        sll     a7, a7, a4              // scale by log2_denom (a3 is the height)
+        addi    a4, a4, 1               // a4 = log2_denom + 1, the final shift
+1:
+        mv      t0, a0                  // t0/t1 walk the columns of this strip
+        mv      t1, a1
+        mv      t5, t6                  // t5 = columns left
+2:
+        vsetvli t2, a3, e32, m8, ta, ma // t2 = rows handled in this strip
+        vlse8.v v8, (t0), a2            // v8 = dst column
+        vlse8.v v12, (t1), a2           // v12 = src column
+        addi    t5, t5, -1
+        vmv.v.x v16, a7                 // 32-bit accumulators start at off
+        vsetvli zero, zero, e16, m4, ta, ma
+        vzext.vf2   v24, v8
+        vzext.vf2   v28, v12
+        vwmaccsu.vx v16, a5, v24        // acc += a5 * dst
+        vwmaccsu.vx v16, a6, v28        // acc += a6 * src
+        vnclip.wx   v16, v16, a4        // shift at 32 bits, then saturate to s16
+        vmax.vx v16, v16, zero          // clamp negative results to 0
+        vsetvli zero, zero, e8, m2, ta, ma
+        vnclipu.wi  v8, v16, 0          // saturate to u8
+        vsse8.v v8, (t0), a2
+        addi    t0, t0, 1
+        addi    t1, t1, 1
+        bnez    t5, 2b
+
+        mul     t3, a2, t2              // bytes covered by the rows just done
+        sub     a3, a3, t2
+        add     a0, a0, t3
+        add     a1, a1, t3
+        bnez    a3, 1b
+
+        ret
+endfunc
+
 .irp    w, 16, 8, 4, 2
 func ff_h264_weight_pixels\w\()_8_rvv, zve32x
         li      a6, \w
@@ -87,6 +153,15 @@  func ff_h264_weight_pixels\w\()_8_rvv, zve32x
         j       ff_h264_weight_pixels_8_rvv
         .endif
 endfunc
+
+func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
+        li      t6, \w                  // block width for the shared loop
+        .if     \w != 16
+        j       ff_h264_biweight_pixels_8_rvv
+        .else
+        j       ff_h264_biweight_pixels_simple_8_rvv
+        .endif
+endfunc
 .endr
 
         .global ff_h264_weight_funcs_8_rvv
@@ -95,10 +170,13 @@  const ff_h264_weight_funcs_8_rvv
         .irp    w, 16, 8, 4, 2
#if __riscv_xlen == 32
         .word   ff_h264_weight_pixels\w\()_8_rvv
+        .word   ff_h264_biweight_pixels\w\()_8_rvv // .biweight of the pair
#elif __riscv_xlen == 64
         .dword  ff_h264_weight_pixels\w\()_8_rvv
+        .dword  ff_h264_biweight_pixels\w\()_8_rvv // .biweight of the pair
#else
         .qword  ff_h264_weight_pixels\w\()_8_rvv
+        .qword  ff_h264_biweight_pixels\w\()_8_rvv // .biweight of the pair
#endif
         .endr
 endconst