diff mbox series

[FFmpeg-devel,2/2] lavc/vvc_mc: R-V V dmvr

Message ID tencent_06B5AA8E85C1D077F1CD5C1A165458380F06@qq.com
State New
Headers show
Series None | expand

Commit Message

uk7b@foxmail.com Sept. 28, 2024, 9:41 a.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                     k230               banana_f3
dmvr_8_12x20_c:                       626.5 ( 1.00x)    621.7 ( 1.00x)
dmvr_8_12x20_rvv_i32:                 126.3 ( 4.96x)    79.9 ( 7.78x)
dmvr_8_20x12_c:                       608.0 ( 1.00x)    652.9 ( 1.00x)
dmvr_8_20x12_rvv_i32:                 135.5 ( 4.49x)    90.4 ( 7.22x)
dmvr_8_20x20_c:                      1006.0 ( 1.00x)    1079.9 ( 1.00x)
dmvr_8_20x20_rvv_i32:                 228.3 ( 4.41x)    142.4 ( 7.58x)
dmvr_h_8_12x20_c:                    2005.8 ( 1.00x)    2007.2 ( 1.00x)
dmvr_h_8_12x20_rvv_i32:               274.5 ( 7.31x)    184.2 (10.90x)
dmvr_h_8_20x12_c:                    1987.5 ( 1.00x)    2006.9 ( 1.00x)
dmvr_h_8_20x12_rvv_i32:               302.3 ( 6.58x)    173.7 (11.56x)
dmvr_h_8_20x20_c:                    3302.3 ( 1.00x)    3340.4 ( 1.00x)
dmvr_h_8_20x20_rvv_i32:               487.5 ( 6.77x)    267.4 (12.49x)
dmvr_hv_8_12x20_c:                   3607.8 ( 1.00x)    3600.7 ( 1.00x)
dmvr_hv_8_12x20_rvv_i32:              459.8 ( 7.85x)    371.7 ( 9.69x)
dmvr_hv_8_20x12_c:                   3626.3 ( 1.00x)    3621.7 ( 1.00x)
dmvr_hv_8_20x12_rvv_i32:              422.8 ( 8.58x)    298.7 (12.13x)
dmvr_hv_8_20x20_c:                   5931.8 ( 1.00x)    5934.4 ( 1.00x)
dmvr_hv_8_20x20_rvv_i32:              672.5 ( 8.82x)    475.9 (12.47x)
dmvr_v_8_12x20_c:                    2154.0 ( 1.00x)    2152.9 ( 1.00x)
dmvr_v_8_12x20_rvv_i32:               274.5 ( 7.85x)    183.9 (11.71x)
dmvr_v_8_20x12_c:                    2774.5 ( 1.00x)    2152.9 ( 1.00x)
dmvr_v_8_20x12_rvv_i32:               302.3 ( 9.18x)    173.7 (12.40x)
dmvr_v_8_20x20_c:                    3552.0 ( 1.00x)    3590.4 ( 1.00x)
dmvr_v_8_20x20_rvv_i32:               487.5 ( 7.29x)    267.4 (13.43x)
---
 libavcodec/riscv/vvc/vvc_mc_rvv.S  | 139 +++++++++++++++++++++++++++++
 libavcodec/riscv/vvc/vvcdsp_init.c |  22 +++++
 2 files changed, 161 insertions(+)
diff mbox series

Patch

diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S b/libavcodec/riscv/vvc/vvc_mc_rvv.S
index 18532616d9..61fe840c4d 100644
--- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
+++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
@@ -285,3 +285,142 @@  endfunc
 func_w_avg 128
 func_w_avg 256
 #endif
+
+func dmvr zve32x, zbb, zba                        // shared body: dst[x] = src[x] * 4, four rows per loop pass; reached via "j dmvr" from the ff_vvc_dmvr_8_rvv_* wrappers, which set vl/vtype first
+        lpad    0
+        li                t0, 4                   // scalar multiplier for the widening multiplies below
+1:
+        add               t1, a1, a2              // t1 = src + 1 * stride
+        addi              t4, a0, 128*2           // t4 = dst row 1 (dst rows are 128*2 bytes apart)
+        add               t2, t1, a2              // t2 = src + 2 * stride
+        addi              t5, a0, 128*2*2         // t5 = dst row 2
+        add               t3, t2, a2              // t3 = src + 3 * stride
+        addi              t6, a0, 128*2*3         // t6 = dst row 3
+        vle8.v            v0, (a1)                // load four source rows of bytes
+        vle8.v            v4, (t1)
+        vle8.v            v8, (t2)
+        vle8.v            v12, (t3)
+        addi              a3, a3, -4              // four rows consumed; a3 must be a positive multiple of 4 for the bnez below to terminate
+        vwmulu.vx         v16, v0, t0             // widen to 16 bit while multiplying by 4
+        vwmulu.vx         v20, v4, t0
+        vwmulu.vx         v24, v8, t0
+        vwmulu.vx         v28, v12, t0
+        vse16.v           v16, (a0)               // store four rows of 16-bit results
+        vse16.v           v20, (t4)
+        vse16.v           v24, (t5)
+        vse16.v           v28, (t6)
+        sh2add            a1, a2, a1              // src += 4 * stride
+        add               a0, a0, 128*2*4         // dst += 4 rows
+        bnez              a3, 1b
+        ret
+endfunc
+
+// Shared loop for the one-dimensional 2-tap DMVR filters; produces two output
+// rows per pass:
+//     dst[y][x] = (f0 * tap0 + f1 * tap1) >> 2   (rounded according to vxrm)
+// Macro arguments:
+//     mn   - register holding the index into ff_vvc_inter_luma_dmvr_filters
+//            (two bytes per entry: f0, f1)
+//     type - h:         tap0 = src[y][x], tap1 = src[y][x + 1]
+//            otherwise: tap0 = src[y][x], tap1 = src[y + 1][x]
+// Register inputs: a0 = dst (rows 128*2 bytes apart), a1 = src, a2 = src
+// stride, a3 = row count (must be a positive even number, the loop only exits
+// when it reaches exactly zero).
+// vl, SEW = 16 and vxrm are expected to be configured by the caller; the
+// func_dmvr wrappers do this before jumping here.
+// Clobbers: t0-t6, v0, v2, v4, v8, v12, v16, v20, v24, v28 register groups.
+.macro dmvr_h_v mn, type
+        lla               t4, ff_vvc_inter_luma_dmvr_filters
+        sh1add            t4, \mn, t4             // t4 = table + 2 * index
+        lbu               t5, (t4)                // t5 = f0
+        lbu               t6, 1(t4)               // t6 = f1
+1:
+        add               t1, a1, a2              // t1 = source row 1
+.ifc \type,h
+        // Horizontal: every output row needs its own x and x + 1 loads, the
+        // two rows share no input data.
+        addi              t0, a1, 1               // t0 = row 0, x + 1
+        addi              t3, t1, 1               // t3 = row 1, x + 1
+.else
+        // Vertical: row 1 is tap1 of output row 0 and tap0 of output row 1,
+        // so only one extra row has to be fetched.
+        add               t3, t1, a2              // t3 = source row 2
+.endif
+        vle8.v            v0, (a1)                // row 0 tap0
+        vle8.v            v4, (t1)                // row 1 tap0
+        vle8.v            v8, (t3)                // row 1 tap1
+.ifc \type,h
+        vle8.v            v2, (t0)                // row 0 tap1 (horizontal only)
+.endif
+        addi              a3, a3, -2
+        vzext.vf2         v12, v0
+        vzext.vf2         v16, v4
+        vzext.vf2         v20, v8
+.ifc \type,h
+        vzext.vf2         v28, v2
+.endif
+        addi              t2, a0, 128*2           // t2 = dst row 1
+        vmul.vx           v12, v12, t5            // row 0: f0 * tap0
+        vmul.vx           v24, v16, t5            // row 1: f0 * tap0
+.ifc \type,h
+        vmacc.vx          v12, t6, v28            // row 0: + f1 * src[0][x + 1]
+.else
+        vmacc.vx          v12, t6, v16            // row 0: + f1 * src[1][x]
+.endif
+        vmacc.vx          v24, t6, v20            // row 1: + f1 * tap1
+        vssrl.vi          v12, v12, 2
+        vssrl.vi          v24, v24, 2
+        vse16.v           v12, (a0)
+        vse16.v           v24, (t2)
+        add               a0, a0, 128*4           // dst += 2 rows
+        sh1add            a1, a2, a1              // src += 2 * stride
+        bnez              a3, 1b
+        ret
+.endm
+
+// Horizontal filter body; the filter index is taken from a4.
+func dmvr_h zve32x, zbb, zba
+        lpad    0
+        dmvr_h_v a4, h
+endfunc
+
+// Vertical filter body; the filter index is taken from a5.
+func dmvr_v zve32x, zbb, zba
+        lpad    0
+        dmvr_h_v a5, v
+endfunc
+
+.macro dmvr_load_h dst, filter0, filter1          // dst = (src[x] * filter0 + src[x + 1] * filter1) >> 2 for the row at a1
+        addi              a6, a1, 1               // a6 = src + 1; the previous value of a6 is lost
+        vle8.v            \dst, (a1)              // bytes src[x]
+        vle8.v            v2, (a6)                // bytes src[x + 1]
+        vzext.vf2         v4, \dst                // zero-extend src[x] to the current SEW
+        vzext.vf2         v8, v2                  // zero-extend src[x + 1]
+        vmul.vx           \dst, v4, \filter0      // dst = src[x] * filter0
+        vmacc.vx          \dst, \filter1, v8      // dst += src[x + 1] * filter1
+        vssrl.vi          \dst, \dst, 2           // scaling shift right by 2, rounding mode taken from vxrm
+.endm
+
+func dmvr_hv zve32x, zbb, zba                      // shared body: horizontal 2-tap pass per row, then vertical 2-tap pass across consecutive filtered rows
+        lpad    0
+        lla               t0, ff_vvc_inter_luma_dmvr_filters
+        sh1add            t1, a4, t0                // t1 = table + 2 * a4
+        sh1add            t2, a5, t0                // t2 = table + 2 * a5
+        lbu               t3, (t1)          // filter[mx][0]
+        lbu               t4, 1(t1)         // filter[mx][1]
+        lbu               t5, (t2)          // filter[my][0]
+        lbu               t6, 1(t2)         // filter[my][1]
+        dmvr_load_h       v12, t3, t4               // v12 = horizontally filtered row 0 (the macro overwrites a6, v2, v4, v8)
+        add               a1, a1, a2                // step src to the next row
+1:
+        vmul.vx           v28, v12, t5              // v28 = previous filtered row * filter[my][0]
+        addi              a3, a3, -1                // one output row per pass
+        dmvr_load_h       v12, t3, t4               // v12 = horizontally filtered current row, kept for the next pass
+        vmacc.vx          v28, t6, v12              // v28 += current filtered row * filter[my][1]
+        vssrl.vi          v28, v28, 4               // scaling shift right by 4, rounding mode taken from vxrm
+        vse16.v           v28, (a0)
+        add               a1, a1, a2                // src += stride
+        addi              a0, a0, 128*2             // dst += one row (128*2 bytes)
+        bnez              a3, 1b                    // exits only when a3 reaches exactly 0
+        ret
+endfunc
+
+// Emit the exported entry point ff_vvc_<name>_8_rvv_<vlen>.
+// It configures the vector unit for the block width passed in a6 (20, any
+// other value is handled as 12) and then tail-jumps to the shared body <name>.
+// The plain copy body (dmvr) works on 8-bit elements; the filtering bodies
+// work on 16-bit elements and additionally need vxrm = 0 for their scaling
+// shifts.
+.macro func_dmvr vlen, name
+func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x, zbb, zba
+        lpad    0
+        .ifnc \name, dmvr
+        csrwi             vxrm, 0
+        .endif
+        li                t0, 20
+        beq               a6, t0, DMVR20\vlen\name
+        .ifc \name, dmvr
+        vsetvlstatic8     12, \vlen
+        .else
+        vsetvlstatic16    12, \vlen
+        .endif
+        j                 \name
+DMVR20\vlen\name:
+        .ifc \name, dmvr
+        vsetvlstatic8     20, \vlen
+        .else
+        vsetvlstatic16    20, \vlen
+        .endif
+        j                 \name
+endfunc
+.endm
+
+// Instantiate all four DMVR entry points for each supported vector length.
+.irp vlen,256,128
+.irp name,dmvr,dmvr_h,dmvr_v,dmvr_hv
+func_dmvr \vlen, \name
+.endr
+.endr
diff --git a/libavcodec/riscv/vvc/vvcdsp_init.c b/libavcodec/riscv/vvc/vvcdsp_init.c
index ac1e7dda7d..7df3ce58db 100644
--- a/libavcodec/riscv/vvc/vvcdsp_init.c
+++ b/libavcodec/riscv/vvc/vvcdsp_init.c
@@ -37,6 +37,26 @@  void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,
 AVG_PROTOTYPES(8, rvv_128)
 AVG_PROTOTYPES(8, rvv_256)
 
+/*
+ * Declare the four DMVR entry points (plain copy, horizontal, vertical and
+ * combined horizontal + vertical) for one variant.
+ * bd is the bit depth and opt the variant suffix; both are pasted into the
+ * symbol name, e.g. DMVR_PROTOTYPES(8, rvv_128) declares
+ * ff_vvc_dmvr_8_rvv_128() and its _h, _v and _hv siblings.
+ * The definition deliberately ends without a line continuation so that it
+ * cannot absorb whatever line follows it.
+ */
+#define DMVR_PROTOTYPES(bd, opt)                                                                    \
+void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,               \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,             \
+     int height, intptr_t mx, intptr_t my, int width);                                              \
+void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride,            \
+     int height, intptr_t mx, intptr_t my, int width);
+
+DMVR_PROTOTYPES(8, rvv_128)
+DMVR_PROTOTYPES(8, rvv_256)
+
+/*
+ * Point the four c->inter.dmvr slots at the bd-bit kernels of variant opt.
+ * Expects a VVCDSPContext pointer named c in the expanding scope.
+ */
+#define DMVR_INIT(bd, opt)                                      \
+    do {                                                        \
+        c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_##opt;    /* plain copy   */ \
+        c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_##opt;  /* horizontal   */ \
+        c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_##opt;  /* vertical     */ \
+        c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_##opt; /* both passes  */ \
+    } while (0)
+
 void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
 {
 #if HAVE_RVV
@@ -51,6 +71,7 @@  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
 # if (__riscv_xlen == 64)
                 c->inter.w_avg    = ff_vvc_w_avg_8_rvv_256;
 # endif
+                DMVR_INIT(8, rvv_256);
                 break;
             default:
                 break;
@@ -63,6 +84,7 @@  void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
 # if (__riscv_xlen == 64)
                 c->inter.w_avg    = ff_vvc_w_avg_8_rvv_128;
 # endif
+                DMVR_INIT(8, rvv_128);
                 break;
             default:
                 break;