diff mbox series

[FFmpeg-devel,1/3] lavc/h264dsp: optimise R-V V weight for shorter heights

Message ID 20240901161744.102668-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,1/3] lavc/h264dsp: optimise R-V V weight for shorter heights | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont Sept. 1, 2024, 4:17 p.m. UTC
The height is a power of two, at most 16 rows, whereas the current code
was optimised for large sample counts.

T-Head C908:
h264_weight2_8_c:                                      211.7 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       184.0 ( 1.15x)
h264_weight2_8_rvv_i32:                   after         54.2 ( 3.90x)
h264_weight4_8_c:                                      285.7 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       341.2 ( 0.86x)
h264_weight4_8_rvv_i32:                   after         82.2 ( 3.47x)
h264_weight8_8_c:                                      498.7 ( 1.00x)
h264_weight8_8_rvv_i32:                   before       683.7 ( 0.73x)
h264_weight8_8_rvv_i64:                   after        128.5 ( 3.95x)
h264_weight16_8_c:                                     878.2 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    239.5 ( 3.67x)

SpacemiT X60:
h264_weight2_8_c:                                      207.2 ( 1.00x)
h264_weight2_8_rvv_i32:                   before       259.6 ( 0.80x)
h264_weight2_8_rvv_i32:                   after         82.2 ( 2.52x)
h264_weight4_8_c:                                      290.8 ( 1.00x)
h264_weight4_8_rvv_i32:                   before       509.6 ( 0.57x)
h264_weight4_8_rvv_i32:                   after         61.5 ( 4.73x)
h264_weight8_8_c:                                      498.8 ( 1.00x)
h264_weight8_8_rvv_i32:                   before      1019.8 ( 0.49x)
h264_weight8_8_rvv_i64:                   after         71.8 ( 6.95x)
h264_weight16_8_c:                                     874.0 ( 1.00x)
h264_weight16_8_rvv_i32:                  unchanged    249.0 ( 3.51x)
---
 libavcodec/riscv/h264dsp_init.c | 18 +++++++--
 libavcodec/riscv/h264dsp_rvv.S  | 72 +++++++++++++--------------------
 2 files changed, 42 insertions(+), 48 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 9ffc9b0333..6391667a40 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -96,13 +96,23 @@  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
     if (flags & AV_CPU_FLAG_RVV_I32) {
         const bool zvl128b = ff_rv_vlen_least(128);
 
+        if (bit_depth == 8) {
+            if (zvl128b)
+                dsp->weight_h264_pixels_tab[0] =
+                    ff_h264_weight_funcs_8_rvv[0].weight;
+            if (flags & AV_CPU_FLAG_RVV_I64)
+                dsp->weight_h264_pixels_tab[1] =
+                    ff_h264_weight_funcs_8_rvv[1].weight;
+            dsp->weight_h264_pixels_tab[2] =
+                 ff_h264_weight_funcs_8_rvv[2].weight;
+            dsp->weight_h264_pixels_tab[3] =
+                 ff_h264_weight_funcs_8_rvv[3].weight;
+        }
+
         if (bit_depth == 8 && zvl128b) {
-            for (int i = 0; i < 4; i++) {
-                dsp->weight_h264_pixels_tab[i] =
-                    ff_h264_weight_funcs_8_rvv[i].weight;
+            for (int i = 0; i < 4; i++)
                 dsp->biweight_h264_pixels_tab[i] =
                     ff_h264_weight_funcs_8_rvv[i].biweight;
-            }
 
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 422ac02222..5c79d1d2b0 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -28,9 +28,13 @@ 
 
 #include "libavutil/riscv/asm.S"
 
-func ff_h264_weight_pixels_simple_8_rvv, zve32x
+.macro  h264_weight depth, w, b=
+func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x
+        lpad    0
         csrwi   vxrm, 0
         sll     a5, a5, a3
+        .ifb    \b
+        li      a6, \w
 1:
         vsetvli     zero, a6, e16, m2, ta, ma
         vle8.v  v8, (a0)
@@ -43,10 +47,33 @@  func ff_h264_weight_pixels_simple_8_rvv, zve32x
         vnclipu.wx  v8, v16, a3
         vse8.v  v8, (a0)
         add     a0, a0, a1
+        .else
+1:
+        vsetvli     t1, a2, e\b, m2, ta, ma
+        vlse\b\().v v8, (a0), a1
+        vsetvli     t0, zero, e16, m4, ta, ma
+        vzext.vf2   v24, v8
+        sub     a2, a2, t1
+        vmul.vx     v16, v24, a4
+        mul     t2, t1, a1
+        vsadd.vx    v16, v16, a5
+        vmax.vx     v16, v16, zero
+        vsetvli     zero, zero, e8, m2, ta, ma
+        vnclipu.wx  v8, v16, a3
+        vsetvli     zero, t1, e\b, m2, ta, ma
+        vsse\b\().v v8, (a0), a1
+        add     a0, a0, t2
+        .endif
         bnez    a2, 1b
 
         ret
 endfunc
+.endm
+
+h264_weight 8, 2, 16
+h264_weight 8, 4, 32
+h264_weight 8, 8, 64
+h264_weight 8, 16
 
         .variant_cc ff_h264_biweight_pixels_simple_8_rvv
 func ff_h264_biweight_pixels_simple_8_rvv, zve32x
@@ -76,39 +103,6 @@  func ff_h264_biweight_pixels_simple_8_rvv, zve32x
         ret
 endfunc
 
-func ff_h264_weight_pixels_8_rvv, zve32x
-        csrwi   vxrm, 0
-        sll     a5, a5, a3
-1:
-        mv      t0, a0
-        mv      t6, a6
-2:
-        vsetvli     t2, a2, e16, m8, ta, ma
-        vlsseg2e8.v v0, (t0), a1
-        addi    t6, t6, -2
-        vzext.vf2   v16, v0
-        vzext.vf2   v24, v4
-        vmul.vx     v16, v16, a4
-        vmul.vx     v24, v24, a4
-        vsadd.vx    v16, v16, a5
-        vsadd.vx    v24, v24, a5
-        vmax.vx     v16, v16, zero
-        vmax.vx     v24, v24, zero
-        vsetvli     zero, zero, e8, m4, ta, ma
-        vnclipu.wx  v0, v16, a3
-        vnclipu.wx  v4, v24, a3
-        vssseg2e8.v v0, (t0), a1
-        addi    t0, t0, 2
-        bnez    t6, 2b
-
-        mul     t3, a1, t2
-        sub     a2, a2, t2
-        add     a0, a0, t3
-        bnez    a2, 1b
-
-        ret
-endfunc
-
         .variant_cc ff_h264_biweight_pixels_8_rvv
 func ff_h264_biweight_pixels_8_rvv, zve32x
         csrwi   vxrm, 2
@@ -153,16 +147,6 @@  func ff_h264_biweight_pixels_8_rvv, zve32x
 endfunc
 
 .irp    w, 16, 8, 4, 2
-func ff_h264_weight_pixels\w\()_8_rvv, zve32x
-        lpad    0
-        li      a6, \w
-        .if     \w == 16
-        j       ff_h264_weight_pixels_simple_8_rvv
-        .else
-        j       ff_h264_weight_pixels_8_rvv
-        .endif
-endfunc
-
 func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
         lpad    0
         li      t6, \w