
[FFmpeg-devel,v2,3/5] lavc/vp9dsp: R-V V mc tap h v

Message ID tencent_29164343A9C58A73E5583A61F23CD2AAF008@qq.com
State New
Series [FFmpeg-devel,v2,1/5] lavc/vp9dsp: R-V V mc avg

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

uk7b@foxmail.com May 21, 2024, 5:13 p.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 243 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++----
 libavcodec/riscv/vp9dsp_init.c |  38 +++++-
 3 files changed, 328 insertions(+), 25 deletions(-)

Comments

Rémi Denis-Courmont May 25, 2024, 10:17 a.m. UTC | #1
On Tuesday, May 21, 2024 at 20.13.17 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> [benchmark table and diffstat snipped; identical to the commit message above]
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 739380d9a9..adba4afb90 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
> 
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -92,10 +104,241 @@ func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +const subpel_filters_regular
> +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> +        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
> +        .byte -1,  3, -10, 122,  18,  -6,  2,  0
> +        .byte -1,  4, -13, 118,  27,  -9,  3, -1
> +        .byte -1,  4, -16, 112,  37, -11,  4, -1
> +        .byte -1,  5, -18, 105,  48, -14,  4, -1
> +        .byte -1,  5, -19,  97,  58, -16,  5, -1
> +        .byte -1,  6, -19,  88,  68, -18,  5, -1
> +        .byte -1,  6, -19,  78,  78, -19,  6, -1
> +        .byte -1,  5, -18,  68,  88, -19,  6, -1
> +        .byte -1,  5, -16,  58,  97, -19,  5, -1
> +        .byte -1,  4, -14,  48, 105, -18,  5, -1
> +        .byte -1,  4, -11,  37, 112, -16,  4, -1
> +        .byte -1,  3,  -9,  27, 118, -13,  4, -1
> +        .byte  0,  2,  -6,  18, 122, -10,  3, -1
> +        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
> +subpel_filters_sharp:
> +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> +        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
> +        .byte -2,  5, -13, 125,  17,  -6,  3, -1
> +        .byte -3,  7, -17, 121,  27, -10,  5, -2
> +        .byte -4,  9, -20, 115,  37, -13,  6, -2
> +        .byte -4, 10, -23, 108,  48, -16,  8, -3
> +        .byte -4, 10, -24, 100,  59, -19,  9, -3
> +        .byte -4, 11, -24,  90,  70, -21, 10, -4
> +        .byte -4, 11, -23,  80,  80, -23, 11, -4
> +        .byte -4, 10, -21,  70,  90, -24, 11, -4
> +        .byte -3,  9, -19,  59, 100, -24, 10, -4
> +        .byte -3,  8, -16,  48, 108, -23, 10, -4
> +        .byte -2,  6, -13,  37, 115, -20,  9, -4
> +        .byte -2,  5, -10,  27, 121, -17,  7, -3
> +        .byte -1,  3,  -6,  17, 125, -13,  5, -2
> +        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
> +subpel_filters_smooth:
> +        .byte  0,  0,   0, 128,   0,   0,  0,  0
> +        .byte -3, -1,  32,  64,  38,   1, -3,  0
> +        .byte -2, -2,  29,  63,  41,   2, -3,  0
> +        .byte -2, -2,  26,  63,  43,   4, -4,  0
> +        .byte -2, -3,  24,  62,  46,   5, -4,  0
> +        .byte -2, -3,  21,  60,  49,   7, -4,  0
> +        .byte -1, -4,  18,  59,  51,   9, -4,  0
> +        .byte -1, -4,  16,  57,  53,  12, -4, -1
> +        .byte -1, -4,  14,  55,  55,  14, -4, -1
> +        .byte -1, -4,  12,  53,  57,  16, -4, -1
> +        .byte  0, -4,   9,  51,  59,  18, -4, -1
> +        .byte  0, -4,   7,  49,  60,  21, -3, -2
> +        .byte  0, -4,   5,  46,  62,  24, -3, -2
> +        .byte  0, -4,   4,  43,  63,  26, -2, -2
> +        .byte  0, -3,   2,  41,  63,  29, -2, -2
> +        .byte  0, -3,   1,  38,  64,  32, -1, -3
> +endconst

Is there a reason that you cannot use the tables from C code?

> +
> +.macro epel_filter name type regtype
> +        lla             \regtype\()2, subpel_filters_\name

It should be possible to spare one ADDI by using just AUIPC here, and folding 
the immediate offset into the LB's below (see also H.263 loop filter).
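On a single load, the referenced pattern looks roughly like this (a sketch
assuming the GNU as %pcrel_hi/%pcrel_lo operators; lla expands to auipc +
addi, and the addi disappears once the low 12 bits are folded into a
dependent access):

1:      auipc           t2, %pcrel_hi(subpel_filters_regular)
        slli            t0, a5, 3                # row offset: mx * 8
        add             t0, t0, t2
        lb              t3, %pcrel_lo(1b)(t0)    # byte 0 of the selected row

With eight loads per filter row, the remaining tap offsets still have to be
applied somewhere, so the net saving depends on how the loads are arranged.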

> +        li              \regtype\()1, 8
> +.ifc \type,v
> +        mul             \regtype\()0, a6, \regtype\()1
> +.else
> +        mul             \regtype\()0, a5, \regtype\()1

slli 3 ?
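For reference, the suggestion replaces the li + mul pair with a single shift,
since the filter rows are 8 bytes apart (a sketch using the same operands as
the macro above):

        # before: two instructions and a scratch register
        li              t1, 8
        mul             t0, a5, t1
        # after: one instruction, no scratch register needed
        slli            t0, a5, 3                # t0 = mx * 8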

> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +        .irp n,1,2,3,4,5,6
> +        lb              \regtype\n, \n(\regtype\()0)
> +        .endr
> +.ifc \regtype,t
> +        lb              a7, 7(\regtype\()0)
> +.else
> +        lb              s7, 7(\regtype\()0)
> +.endif
> +        lb              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst len op name type from_mem regtype
> +        li              a5, 64
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        sub             a2, a2, a3
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a2
> +        vle8.v          v24, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a2, a2, -1
> +        vle8.v          v20, (a2)
> +        addi            a2, a2, 2
> +        vle8.v          v24, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        .rept 6
> +        sub             a2, a2, a3
> +        .endr
> +        vle8.v          v28, (a2)
> +        sub             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        sh1add          a2, a3, a2
> +        add             a2, a2, a3
> +.else
> +        addi            a2, a2, -6
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, -1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst len op name type from_mem regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len op name type vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
>          .irp op, put, avg
>                  bilin_h_v \len, \op, h, a5
>                  bilin_h_v \len, \op, v, a6
> +                .irp name, regular, sharp, smooth

AFAICT, regular and sharp are identical, except for the base address of the
filter table, so it should be possible to share the byte code. Similarly, it
should be possible to share most of the horizontal and vertical code (maybe
also for bilinear, not just epel) with separate load/store around shared
inner procedures. The H.263 loop filter already does that with almost no
overhead, though H.263 is obviously simpler than VP9.

A French philosopher famously said that Perfect is the enemy of Good.
Generally, as with VVC, nested repetition macros for finely specialised
functions tend to generate way too much byte code, and this ends up being
worse rather than better in the big picture.
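
Structurally, the proposed sharing could look like the following (a
hypothetical sketch, not the patch's code; the reply below explains why the
differing arithmetic currently prevents it): each filter gets a thin entry
point that only materialises its table base before falling into one common
body.

func ff_put_vp9_8tap_regular_16h_rvv, zve32x
        lla             t2, subpel_filters_regular
        j               1f
endfunc

func ff_put_vp9_8tap_sharp_16h_rvv, zve32x
        lla             t2, subpel_filters_sharp
1:
        # shared filtering body, reading coefficients via t2, so the
        # inner loop is emitted once rather than once per filter name
        ret
endfunc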

> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
>          .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
> 
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);             \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
>                              const uint8_t *src, ptrdiff_t srcstride,   \
>                              int h, int mx, int my);
> 
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> 
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..314a1e5808 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>  # endif
> 
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    if (ff_rv_vlen_least(128)) {
> 
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -85,7 +86,42 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> 
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (ff_rv_vlen_least(256)) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
>      }
> +    }
> +
>  #endif
>  #endif
>  }
flow gg May 25, 2024, 10:47 a.m. UTC | #2
One more thing I remember: after adjusting the signs, vmacc can be used;
otherwise, because of the sign handling, a separate mul + add pair is needed.
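
For context, RVV provides widening multiply-accumulate instructions in
several signedness flavours, which is what the sign adjustment selects
between (sketch; the scalar holds a filter coefficient, the vector holds
pixels):

        vwmaccu.vx      v16, t1, v20    # unsigned scalar * unsigned vector
        vwmacc.vx       v16, t1, v20    # signed scalar   * signed vector
        vwmaccsu.vx     v16, t1, v20    # signed scalar   * unsigned vector
        vwmaccus.vx     v16, t1, v20    # unsigned scalar * signed vector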

flow gg <hlefthleft@gmail.com> wrote on Saturday, May 25, 2024 at 18:38:

> > Is there a reason that you cannot use the tables from C code?
>
> As with VP8, the tables are kept in assembly so the positive and negative
> coefficients can be rearranged, preventing a low-probability overflow
> during the calculations.
>
> > AFAICT, regular and sharp are identical, except for the base address of
> > the filter table, so it should be possible to share the byte code
>
> Initially they used the same code, but after running the tests hundreds of
> times, there were always a few failures...
>
> Because the data in each table is different, when regular, sharp, and
> smooth share the same code there is always a small amount of overflow;
> different signed and unsigned calculations are needed.
>
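A back-of-the-envelope check of that overflow concern (a standalone sketch,
not FFmpeg code, using the sharp filter row with the largest positive
weights from the tables above):

#include <stdio.h>

int main(void)
{
    /* sharp filter, row 7: the largest positive coefficient sum */
    static const int taps[8] = {-4, 11, -24, 90, 70, -21, 10, -4};
    int max = 0, min = 0;

    for (int i = 0; i < 8; i++) {
        if (taps[i] > 0)
            max += taps[i] * 255;   /* worst case: pixel = 255 */
        else
            min += taps[i] * 255;
    }
    /* prints "max 46155, min -13515": the positive partial sum alone
     * exceeds INT16_MAX (32767), so one signed 16-bit accumulator
     * cannot hold every grouping of taps; splitting the accumulators
     * and mixing vwmaccu/vwmaccsu avoids exactly this. */
    printf("max %d, min %d\n", max, min);
    return 0;
}
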
> > A French philosopher famously said that Perfect is the enemy of Good.
> > Generally, as with VVC, nested repetition macros for finely specialised
> > functions tend to generate way too much byte code, and this ends up being
> > worse rather than better in the big picture.
>
> Got it, I will try to update.
>
> Rémi Denis-Courmont <remi@remlab.net> wrote on Saturday, May 25, 2024 at 18:17:
>
>> [Rémi's review quoted in full, snipped; identical to comment #1 above]
>
Rémi Denis-Courmont May 25, 2024, 5:36 p.m. UTC | #3
On Saturday, May 25, 2024 at 13.38.39 EEST, flow gg wrote:
> > Is there a reason that you cannot use the tables from C code?
> 
> Similar to VP8, to adjust the positive and negative data and prevent small
> probability overflow during calculations.
> 
> > AFAICT, regular and sharp are identical, except for the base address of
> > the filter table, so it should be possible to share the byte code
> 
> Initially, they used the same code, but after testing hundreds of times,
> there were always a few failures...

AFAICT, the C reference and the AArch64 assembler are exactly the same for all
3 filters, except for the offset in the filter table. So logically, it ought
to be possible to merge regular and sharp with almost no changes, and to merge
smooth with a few fixes.

As for the tables themselves, it seems the "problem" is that C uses 16-bit
values for no apparent reason. We should probably change the C code to use
8-bit values on all platforms - except AArch64, because the NEON code probably
relies on the 16-bit format.
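
One wrinkle with narrowing the tables to int8_t (a standalone check, not
FFmpeg code): the identity row of every filter carries a middle tap of 128,
one past INT8_MAX, so that tap needs unsigned treatment - which is presumably
also why the assembly above multiplies the middle taps with vwmulu rather
than a signed multiply:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* row 0 of each subpel filter table above */
    static const int16_t identity[8] = {0, 0, 0, 128, 0, 0, 0, 0};

    for (int i = 0; i < 8; i++)
        if (identity[i] > INT8_MAX)
            printf("tap %d = %d does not fit in int8_t\n", i, identity[i]);
    return 0;
}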

Patch

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 739380d9a9..adba4afb90 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@ 
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -92,10 +104,241 @@  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
 endfunc
 .endm
 
+const subpel_filters_regular
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte  0,  1,  -5, 126,   8,  -3,  1,  0
+        .byte -1,  3, -10, 122,  18,  -6,  2,  0
+        .byte -1,  4, -13, 118,  27,  -9,  3, -1
+        .byte -1,  4, -16, 112,  37, -11,  4, -1
+        .byte -1,  5, -18, 105,  48, -14,  4, -1
+        .byte -1,  5, -19,  97,  58, -16,  5, -1
+        .byte -1,  6, -19,  88,  68, -18,  5, -1
+        .byte -1,  6, -19,  78,  78, -19,  6, -1
+        .byte -1,  5, -18,  68,  88, -19,  6, -1
+        .byte -1,  5, -16,  58,  97, -19,  5, -1
+        .byte -1,  4, -14,  48, 105, -18,  5, -1
+        .byte -1,  4, -11,  37, 112, -16,  4, -1
+        .byte -1,  3,  -9,  27, 118, -13,  4, -1
+        .byte  0,  2,  -6,  18, 122, -10,  3, -1
+        .byte  0,  1,  -3,   8, 126,  -5,  1,  0
+subpel_filters_sharp:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -1,  3,  -7, 127,   8,  -3,  1,  0
+        .byte -2,  5, -13, 125,  17,  -6,  3, -1
+        .byte -3,  7, -17, 121,  27, -10,  5, -2
+        .byte -4,  9, -20, 115,  37, -13,  6, -2
+        .byte -4, 10, -23, 108,  48, -16,  8, -3
+        .byte -4, 10, -24, 100,  59, -19,  9, -3
+        .byte -4, 11, -24,  90,  70, -21, 10, -4
+        .byte -4, 11, -23,  80,  80, -23, 11, -4
+        .byte -4, 10, -21,  70,  90, -24, 11, -4
+        .byte -3,  9, -19,  59, 100, -24, 10, -4
+        .byte -3,  8, -16,  48, 108, -23, 10, -4
+        .byte -2,  6, -13,  37, 115, -20,  9, -4
+        .byte -2,  5, -10,  27, 121, -17,  7, -3
+        .byte -1,  3,  -6,  17, 125, -13,  5, -2
+        .byte  0,  1,  -3,   8, 127,  -7,  3, -1
+subpel_filters_smooth:
+        .byte  0,  0,   0, 128,   0,   0,  0,  0
+        .byte -3, -1,  32,  64,  38,   1, -3,  0
+        .byte -2, -2,  29,  63,  41,   2, -3,  0
+        .byte -2, -2,  26,  63,  43,   4, -4,  0
+        .byte -2, -3,  24,  62,  46,   5, -4,  0
+        .byte -2, -3,  21,  60,  49,   7, -4,  0
+        .byte -1, -4,  18,  59,  51,   9, -4,  0
+        .byte -1, -4,  16,  57,  53,  12, -4, -1
+        .byte -1, -4,  14,  55,  55,  14, -4, -1
+        .byte -1, -4,  12,  53,  57,  16, -4, -1
+        .byte  0, -4,   9,  51,  59,  18, -4, -1
+        .byte  0, -4,   7,  49,  60,  21, -3, -2
+        .byte  0, -4,   5,  46,  62,  24, -3, -2
+        .byte  0, -4,   4,  43,  63,  26, -2, -2
+        .byte  0, -3,   2,  41,  63,  29, -2, -2
+        .byte  0, -3,   1,  38,  64,  32, -1, -3
+endconst
+
+.macro epel_filter name type regtype
+        lla             \regtype\()2, subpel_filters_\name
+        li              \regtype\()1, 8
+.ifc \type,v
+        mul             \regtype\()0, a6, \regtype\()1
+.else
+        mul             \regtype\()0, a5, \regtype\()1
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+        .irp n,1,2,3,4,5,6
+        lb              \regtype\n, \n(\regtype\()0)
+        .endr
+.ifc \regtype,t
+        lb              a7, 7(\regtype\()0)
+.else
+        lb              s7, 7(\regtype\()0)
+.endif
+        lb              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst len op name type from_mem regtype
+        li              a5, 64
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        sub             a2, a2, a3
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a2
+        vle8.v          v24, (a2)
+        add             a2, a2, a3
+        vle8.v          v26, (a2)
+        add             a2, a2, a3
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        .rept 6
+        sub             a2, a2, a3
+        .endr
+        vle8.v          v28, (a2)
+        sub             a2, a2, a3
+        vle8.v          v26, (a2)
+        sh1add          a2, a3, a2
+        add             a2, a2, a3
+.else
+        addi            a2, a2, -6
+        vle8.v          v28, (a2)
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 3
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
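+        /* Add the 1 << 6 rounding constant to the 16-bit partial sum,
+         * combine the partial sums into 32 bits, shift right by 7,
+         * clamp negatives to zero and narrow-clip to unsigned 8 bits. */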
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
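+/* As epel_load, then advance the source pointer by one line. */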
+.macro epel_load_inc dst len op name type from_mem regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
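+/* Emit one ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen
+ * function.  With VLEN < 256, a 64-pixel row does not fit in a single
+ * LMUL=2 register group, so it is processed as two 32-pixel halves. */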
+.macro epel len op name type vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        epel_filter     \name, \type, t
+.if \vlen < 256
+        vsetvlstatic8   \len, a5, 32, m2
+.else
+        vsetvlstatic8   \len, a5, 64, m2
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
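+/* Instantiate put/avg x {regular,sharp,smooth} x {h,v} for every
+ * block size, in VLEN >= 128 and VLEN >= 256 variants; vp9dsp_init.c
+ * picks one at run time via ff_rv_vlen_least(). */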
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
         .irp op, put, avg
                 bilin_h_v \len, \op, h, a5
                 bilin_h_v \len, \op, v, a6
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
         .endr
 .endr
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@  void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@  void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 9606d8545f..314a1e5808 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,8 @@  static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    if (ff_rv_vlen_least(128)) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -85,7 +86,42 @@  static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)   \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =          \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;          \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =          \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;         \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =          \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
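+    /* dsp->mc[size_idx][filter][put/avg][mx != 0][my != 0].  The
+     * vertical filters use sh1add (a Zba instruction), hence the
+     * extra AV_CPU_FLAG_RVB_ADDR check below. */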
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (ff_rv_vlen_least(256)) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
     }
+    }
+
 #endif
 #endif
 }