diff mbox series

[FFmpeg-devel,3/7] lavc/vp9dsp: R-V V mc tap h

Message ID CAEa-L+uiHaO=rrkE=v1nLHEVLmXHNVbH-tTW7gaavHNQGZFoqA@mail.gmail.com
State New
Headers show
Series [FFmpeg-devel,1/7] lavc/vp9dsp: R-V mc copy_avg | expand

Checks

Context Check Description
andriy/configure_x86 warning Failed to apply patch
yinshiyou/configure_loongarch64 warning Failed to apply patch

Commit Message

flow gg March 22, 2024, 6:04 a.m. UTC
The instruction ordering is not ideal in places: when len == 32, operations
such as hv need nearly every vector register, which leaves little room to
reschedule.
A separate function for len < 32 would be possible, but it is unlikely to make
a significant difference, so it has not been done yet.
diff mbox series

Patch

From d9044b400f5a161928a920f0399e5e0715f0c8e6 Mon Sep 17 00:00:00 2001
From: sunyuechi <sunyuechi@iscas.ac.cn>
Date: Thu, 21 Mar 2024 22:53:59 +0800
Subject: [PATCH 3/7] lavc/vp9dsp: R-V V mc tap h

C908:
vp9_avg_8tap_smooth_4h_8bpp_c: 12.7
vp9_avg_8tap_smooth_4h_8bpp_rvv_i64: 5.0
vp9_avg_8tap_smooth_8h_8bpp_c: 48.5
vp9_avg_8tap_smooth_8h_8bpp_rvv_i64: 9.2
vp9_avg_8tap_smooth_16h_8bpp_c: 191.7
vp9_avg_8tap_smooth_16h_8bpp_rvv_i64: 21.0
vp9_avg_8tap_smooth_32h_8bpp_c: 780.0
vp9_avg_8tap_smooth_32h_8bpp_rvv_i64: 66.5
vp9_avg_8tap_smooth_64h_8bpp_c: 3123.7
vp9_avg_8tap_smooth_64h_8bpp_rvv_i64: 264.2
vp9_put_8tap_smooth_4h_8bpp_c: 11.0
vp9_put_8tap_smooth_4h_8bpp_rvv_i64: 4.2
vp9_put_8tap_smooth_8h_8bpp_c: 42.0
vp9_put_8tap_smooth_8h_8bpp_rvv_i64: 8.2
vp9_put_8tap_smooth_16h_8bpp_c: 165.5
vp9_put_8tap_smooth_16h_8bpp_rvv_i64: 19.7
vp9_put_8tap_smooth_32h_8bpp_c: 659.0
vp9_put_8tap_smooth_32h_8bpp_rvv_i64: 64.0
vp9_put_8tap_smooth_64h_8bpp_c: 2682.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i64: 272.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 232 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp_init.c |   8 +-
 2 files changed, 239 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index a97807633e..eacc174bc4 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -123,6 +123,230 @@  func ff_copy\len\()_rvv, zve32x
 endfunc
 .endr
 
+subpel_filters_regular:                /* 16 rows of 8 one-byte taps; epel_filter selects a row at byte offset a5 * 8 */
+        .byte  0,  0,   0, 128,   0,   0,  0,  0   /* 128 is stored as 0x80; tap 3 is only used as an unsigned multiplier (vwmulu.vx) in epel_load */
+        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
+        .byte -1,  3, -10, 122,  18,  -6,  2,  0
+        .byte -1,  4, -13, 118,  27,  -9,  3, -1
+        .byte -1,  4, -16, 112,  37, -11,  4, -1
+        .byte -1,  5, -18, 105,  48, -14,  4, -1
+        .byte -1,  5, -19,  97,  58, -16,  5, -1
+        .byte -1,  6, -19,  88,  68, -18,  5, -1
+        .byte -1,  6, -19,  78,  78, -19,  6, -1
+        .byte -1,  5, -18,  68,  88, -19,  6, -1
+        .byte -1,  5, -16,  58,  97, -19,  5, -1
+        .byte -1,  4, -14,  48, 105, -18,  5, -1
+        .byte -1,  4, -11,  37, 112, -16,  4, -1
+        .byte -1,  3,  -9,  27, 118, -13,  4, -1
+        .byte  0,  2,  -6,  18, 122, -10,  3, -1
+        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
+subpel_filters_sharp:                  /* same layout; taps 1 and 6 are non-negative here, taps 2 and 5 non-positive */
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
+        .byte -2,  5, -13, 125,  17,  -6,  3, -1
+        .byte -3,  7, -17, 121,  27, -10,  5, -2
+        .byte -4,  9, -20, 115,  37, -13,  6, -2
+        .byte -4, 10, -23, 108,  48, -16,  8, -3
+        .byte -4, 10, -24, 100,  59, -19,  9, -3
+        .byte -4, 11, -24,  90,  70, -21, 10, -4
+        .byte -4, 11, -23,  80,  80, -23, 11, -4
+        .byte -4, 10, -21,  70,  90, -24, 11, -4
+        .byte -3,  9, -19,  59, 100, -24, 10, -4
+        .byte -3,  8, -16,  48, 108, -23, 10, -4
+        .byte -2,  6, -13,  37, 115, -20,  9, -4
+        .byte -2,  5, -10,  27, 121, -17,  7, -3
+        .byte -1,  3,  -6,  17, 125, -13,  5, -2
+        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
+subpel_filters_smooth:                 /* same layout; taps 2 and 5 are non-negative here, taps 1 and 6 non-positive, hence the separate smooth branch in epel_load */
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -3, -1,  32,  64,  38,   1, -3,  0
+        .byte -2, -2,  29,  63,  41,   2, -3,  0
+        .byte -2, -2,  26,  63,  43,   4, -4,  0
+        .byte -2, -3,  24,  62,  46,   5, -4,  0
+        .byte -2, -3,  21,  60,  49,   7, -4,  0
+        .byte -1, -4,  18,  59,  51,   9, -4,  0
+        .byte -1, -4,  16,  57,  53,  12, -4, -1
+        .byte -1, -4,  14,  55,  55,  14, -4, -1
+        .byte -1, -4,  12,  53,  57,  16, -4, -1
+        .byte  0, -4,   9,  51,  59,  18, -4, -1
+        .byte  0, -4,   7,  49,  60,  21, -3, -2
+        .byte  0, -4,   5,  46,  62,  24, -3, -2
+        .byte  0, -4,   4,  43,  63,  26, -2, -2
+        .byte  0, -3,   2,  41,  63,  29, -2, -2
+        .byte  0, -3,   1,  38,  64,  32, -1, -3
+
+.macro epel_filter name type regtype    /* load row a5 of subpel_filters_<name>: taps 0-6 into <regtype>0-6, tap 7 into a7 (regtype t) or s7 (regtype s); <type> is not used here */
+        lla             \regtype\()2, subpel_filters_\name      /* table base */
+        slli            \regtype\()0, a5, 3                     /* row offset = a5 * 8 bytes; replaces li 8 + mul, same value */
+        add             \regtype\()0, \regtype\()0, \regtype\()2        /* <regtype>0 = address of the selected row */
+        .irp n 1,2,3,4,5,6
+        lb              \regtype\n, \n(\regtype\()0)            /* sign-extending loads; the base copy in <regtype>2 is overwritten when n = 2 */
+        .endr
+.ifc \regtype,t
+        lb              a7, 7(\regtype\()0)                     /* t registers end at t6, so tap 7 goes to a7 */
+.endif
+.ifc \regtype,s
+        lb              s7, 7(\regtype\()0)                     /* string compare, matching the .ifc used for the t case */
+.endif
+        lb              \regtype\()0, 0(\regtype\()0)           /* tap 0 last: this overwrites the row pointer */
+.endm
+
+.macro epel_load dst len do name type from_mem regtype  /* apply the 8 taps held in <regtype>0-6 plus a7 or s7 to one vector of pixels; result goes to <dst> (put) or is averaged with the bytes at (a0) (avg); vl and vtype (e8) are set by the caller; <type> is not used here */
+        li              a5, 64                  /* rounding bias for the >> 7 below */
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)               /* tap 3 input: a2 + 0 */
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)               /* tap 2 input: a2 - 1 */
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)               /* tap 4 input: a2 + 1 */
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)               /* tap 5 input: a2 + 2 */
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)               /* tap 6 input: a2 + 3 */
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)               /* tap 7 input: a2 + 4 */
+        /* v16 = 16-bit partial sum; vwmulu/vwmaccu read the tap as unsigned, vwmaccsu as signed, chosen per table */
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+        /* tap 7 lives in a7 or s7, matching epel_filter */
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.endif
+.ifc \regtype,s
+        vwmaccsu.vx     v16, s7, v30
+.endif
+        addi            a2, a2, -6              /* v28 and v26 are free again: reuse them for taps 1 and 0 */
+        vle8.v          v28, (a2)               /* tap 1 input: a2 - 2 */
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)               /* tap 0 input: a2 - 3 */
+        addi            a2, a2, 3               /* a2 is back at its entry value */
+        /* tap 3 (and tap 4 unless smooth) are kept as separate 16-bit products and summed at 32 bits further down */
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else                                           /* inputs already in registers: v0, v2, ..., v14 hold the tap 0-7 inputs */
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+        /* tap 7 input is v14 on this path */
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.endif
+.ifc \regtype,s
+        vwmaccsu.vx     v16, s7, v14
+.endif
+.endif
+        vwadd.wx        v16, v16, a5            /* still e8: adds the bias to the 16-bit sum in place */
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma    /* SEW 8 -> 16 with LMUL doubled, vl kept */
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+        /* add the separate product(s) at 32 bits, shift right by 7 back to 16 bits, clamp negatives to 0 */
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+.ifc \len,4
+        vsetvli         zero, zero, e8, mf4, ta, ma     /* back to the e8 setting the caller established */
+.elseif \len == 8
+        vsetvli         zero, zero, e8, mf2, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e8, m1, ta, ma
+.else
+        vsetvli         zero, zero, e8, m2, ta, ma
+.endif
+        /* saturate to unsigned 8 bits; avg then averages with the bytes at (a0) using the current vxrm rounding mode */
+.ifc \do,put
+        vnclipu.wi      \dst, v24, 0
+.endif
+.ifc \do,avg
+        vle8.v          \dst, (a0)
+        vnclipu.wi      v24, v24, 0
+        vaaddu.vv       \dst, \dst, v24
+.endif
+.endm
+
+.macro epel_load_inc dst len do name type from_mem regtype      /* epel_load with the same arguments, then a2 += a3 */
+        epel_load       dst=\dst, len=\len, do=\do, name=\name, type=\type, from_mem=\from_mem, regtype=\regtype
+        add             a2, a3, a2              /* step a2 by a3 after the row has been produced */
+.endm
+
+.macro epel len do name type            /* function body: loads the taps once, then produces a4 rows, stepping a0 by a1 and a2 by a3 per row */
+        epel_filter     name=\name, type=\type, regtype=t
+        /* e8 configuration: vl = len for 4/8/16, vl = 32 for both 32 and 64 (64 is done as two halves) */
+.if \len == 4
+        vsetivli        zero, 4, e8, mf4, ta, ma
+.elseif \len == 8
+        vsetivli        zero, 8, e8, mf2, ta, ma
+.elseif \len == 16
+        vsetivli        zero, 16, e8, m1, ta, ma
+.else
+        li              a5, 32                  /* a5 is free here: epel_filter has already consumed it */
+        vsetvli         zero, a5, e8, m2, ta, ma
+.endif
+.ifc \do,avg
+        csrwi           vxrm, 0                 /* rounding mode 0 (round-to-nearest-up) for the vaaddu in epel_load */
+.endif
+        /* row loop; epel_load leaves a0, a2 and a4 unchanged */
+1:
+        epel_load       v30 \len \do \name \type 1 t
+        vse8.v          v30, (a0)
+.if \len == 64
+        addi            a2, a2, 32              /* second 32-pixel half of the row */
+        addi            a0, a0, 32
+        epel_load       v30 \len \do \name \type 1 t
+        vse8.v          v30, (a0)
+        addi            a2, a2, -32
+        addi            a0, a0, -32
+.endif
+        add             a0, a0, a1
+        add             a2, a2, a3
+        addi            a4, a4, -1
+        bnez            a4, 1b
+        /* all rows written */
+        ret
+.endm
+
+.macro gen_epel len do name type        /* define the function ff_<do>_8tap_<name>_<len><type>_rvv whose body is the epel macro */
+func ff_\do\()_8tap_\name\()_\len\()\type\()_rvv, zve32x
+        epel            len=\len, do=\do, name=\name, type=\type
+endfunc
+.endm
+
 .irp len 64, 32, 16, 8, 4
 func ff_avg\len\()_rvv, zve32x
         copy_avg \len avg
@@ -134,4 +358,12 @@  endfunc
 func ff_avg_bilin_\len\()h_rvv, zve32x
         bilin_h \len avg
 endfunc
+
+.irp name, regular, sharp, smooth       /* one function per filter table, per put/avg, for the len supplied by the enclosing .irp */
+        .irp do, put, avg
+                .irp type, h
+                        gen_epel len=\len, do=\do, name=\name, type=\type
+                .endr
+        .endr
+.endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index d6d6fb52cc..413b203e5f 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -109,7 +109,13 @@  static av_cold void vp9dsp_mc_init_rvv(VP9DSPContext *dsp, int bpp)
 
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type)  \
     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] =   \
-        ff_##type##_bilin_##sz##dir##_rvv;
+        ff_##type##_bilin_##sz##dir##_rvv;                   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   /* 8-tap entries reuse the slot indices of the bilinear entry */ \
+        ff_##type##_8tap_smooth_##sz##dir##_rvv;             \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_regular_##sz##dir##_rvv;            \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_8tap_sharp_##sz##dir##_rvv; /* NOTE(review): every dir this macro is expanded with needs matching ff_*_8tap_* symbols; the asm in this patch only generates type h - confirm against the callers */
 
 #define init_subpel2(idx, idxh, idxv, dir, type)      \
     init_subpel1(0, idx, idxh, idxv, 64, dir, type);  \
-- 
2.44.0