diff mbox series

[FFmpeg-devel,v3,4/5] lavc/vp9dsp: R-V V mc tap h v

Message ID tencent_DFFD58B561A43C3CAA4F1682433CC5397308@qq.com
State New
Headers show
Series [FFmpeg-devel,v3,1/5] lavc/vp9dsp: R-V V rename ff_avg to ff_vp9_avg | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

uk7b@foxmail.com May 29, 2024, 5:15 p.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908   X60
vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
---
 libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
 libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
 libavcodec/riscv/vp9dsp_init.c |  37 +++++-
 3 files changed, 288 insertions(+), 25 deletions(-)

Comments

flow gg May 29, 2024, 5:19 p.m. UTC | #1
Part of the patch has been modified according to the previous review, but there
are still some parts that haven't been updated yet.

> Similarly, it
> should be possible to share most of the horizontal and vertical code (maybe
> also for bilinear, not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Here, bilin has been modified following the approach of your vp8 changes, but
there are some issues with epel. I would like to share most of the horizontal
and vertical code as h263 does, but because there are different variants
(op/name/len), such a change seems hard. Making similar modifications
for bilin also seems somewhat hard, so maybe this can be left for future
optimization :'(

> It should be possible to spare one ADDI by using just AUIPC here, and folding
> the immediate offset into the LB's below (see also H.263 loop filter).

I'm not sure where the problem lies: this works for smooth, but for
sharp and regular it gives this error:
dangerous relocation: %pcrel_lo overflow with an addend, the value of
%pcrel_hi is 0xa5000 without any addend, but may be 0xa6000 after adding
the %pcrel_lo addend

<uk7b@foxmail.com> 于2024年5月30日周四 01:16写道:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  37 +++++-
>  3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> +        addi            \regtype\()2, \regtype\()2, 16*8*2
> +.endif
> +.ifc \name,sharp
> +        addi            \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +        li              a5, 64
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        sub             a2, a2, a3
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a2
> +        vle8.v          v24, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a2, a2, -1
> +        vle8.v          v20, (a2)
> +        addi            a2, a2, 2
> +        vle8.v          v24, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v30, (a2)
> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        .rept 6
> +        sub             a2, a2, a3
> +        .endr
> +        vle8.v          v28, (a2)
> +        sub             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        sh1add          a2, a3, a2
> +        add             a2, a2, a3
> +.else
> +        addi            a2, a2, -6
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, -1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
>
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
>
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>      \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)
>     \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
>    \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,
>     \
> +                                        ptrdiff_t dststride,
>    \
>                                          const uint8_t *src,
>     \
>                                          ptrdiff_t srcstride,
>    \
>                                          int h, int mx, int my);
>     \
>
>     \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t
> dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,
>    \
> +                                         ptrdiff_t dststride,
>     \
>                                           const uint8_t *src,
>    \
>                                           ptrdiff_t srcstride,
>     \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t
> dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
>
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
>
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  # endif
>
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    if (ff_rv_vlen_least(128)) {
>
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
>
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (ff_rv_vlen_least(256)) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif
> --
> 2.45.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 29, 2024, 8:29 p.m. UTC | #2
Le keskiviikkona 29. toukokuuta 2024, 20.15.39 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
>                                                      C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c                      :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c                      :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32                :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c                      :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32                :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c                      :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32                :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c                     :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32               :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c                     :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32               :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c                     :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32               :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c                     :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32               :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c                     : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32               :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c                     : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32               :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c                      :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c                      :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32                :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c                      :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c                      :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32                :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c                     :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32               :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c                     :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32               :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c                     :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c                     :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32               :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c                     : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32               :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c                     : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32               :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vp9dsp.h      |  72 ++++++++----
>  libavcodec/riscv/vp9dsp_init.c |  37 +++++-
>  3 files changed, 288 insertions(+), 25 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
> 
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +        vsetvli         zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +        vsetvli         zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +        vsetvli         zero, zero, e16, m2, ta, ma
> +.else
> +        vsetvli         zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>          csrwi           vxrm, 0
> @@ -127,8 +139,200 @@ func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro epel_filter name, type, regtype
> +        lla             \regtype\()2, ff_vp9_subpel_filters
> +
> +.ifc \name,regular
> +        addi            \regtype\()2, \regtype\()2, 16*8*2

You can directly LLA filters + 16 * 8 * 2 and save one add. Same below. You can 
also use .equ to alias the filter addresses, and avoid if's.

> +.endif
> +.ifc \name,sharp
> +        addi            \regtype\()2, \regtype\()2, 16*8*2*2
> +.endif
> +
> +.ifc \type,v
> +        slli            \regtype\()0, a6, 4
> +.else
> +        slli            \regtype\()0, a5, 4
> +.endif

Use a macro parameter for the stride register.

> +        add             \regtype\()0, \regtype\()0, \regtype\()2
> +
> +        lh              \regtype\()1, 2(\regtype\()0)
> +        lh              \regtype\()2, 4(\regtype\()0)
> +        lh              \regtype\()3, 6(\regtype\()0)
> +        lh              \regtype\()4, 8(\regtype\()0)
> +        lh              \regtype\()5, 10(\regtype\()0)
> +        lh              \regtype\()6, 12(\regtype\()0)
> +
> +.ifc \regtype,t
> +        lh              a7, 14(\regtype\()0)
> +.else
> +        lh              s7, 14(\regtype\()0)
> +.endif
> +        lh              \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +        li              a5, 64
> +.ifc \from_mem, 1
> +        vle8.v          v22, (a2)
> +.ifc \type,v
> +        sub             a2, a2, a3
> +        vle8.v          v20, (a2)
> +        sh1add          a2, a3, a2
> +        vle8.v          v24, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v28, (a2)
> +        add             a2, a2, a3
> +        vle8.v          v30, (a2)
> +.else
> +        addi            a2, a2, -1
> +        vle8.v          v20, (a2)
> +        addi            a2, a2, 2
> +        vle8.v          v24, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, 1
> +        vle8.v          v30, (a2)

That's a lot of address dependencies, which is going to hurt performance. It 
might help to just spill more S registers if needed.

> +.endif
> +
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v24, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v20
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +        vwmaccsu.vx     v16, \regtype\()6, v28
> +.else
> +        vwmulu.vx       v16, v28, \regtype\()6
> +        vwmaccsu.vx     v16, \regtype\()2, v20
> +        vwmaccsu.vx     v16, \regtype\()5, v26
> +.endif
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v30
> +.else
> +        vwmaccsu.vx     v16, s7, v30
> +.endif
> +
> +.ifc \type,v
> +        .rept 6
> +        sub             a2, a2, a3
> +        .endr

This can be done in 3 instructions, even without mul. Of course you'll again 
need a spare register.

> +        vle8.v          v28, (a2)
> +        sub             a2, a2, a3
> +        vle8.v          v26, (a2)
> +        sh1add          a2, a3, a2
> +        add             a2, a2, a3
> +.else
> +        addi            a2, a2, -6
> +        vle8.v          v28, (a2)
> +        addi            a2, a2, -1
> +        vle8.v          v26, (a2)
> +        addi            a2, a2, 3
> +.endif
> +
> +.ifc \name,smooth
> +        vwmaccsu.vx     v16, \regtype\()1, v28
> +.else
> +        vwmaccu.vx      v16, \regtype\()1, v28
> +        vwmulu.vx       v28, v24, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v26
> +        vwmulu.vx       v20, v22, \regtype\()3
> +.else
> +.ifc \name,smooth
> +        vwmulu.vx       v16, v8, \regtype\()4
> +        vwmaccu.vx      v16, \regtype\()2, v4
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +.else
> +        vwmulu.vx       v16, v2, \regtype\()1
> +        vwmaccu.vx      v16, \regtype\()6, v12
> +        vwmaccsu.vx     v16, \regtype\()5, v10
> +        vwmaccsu.vx     v16, \regtype\()2, v4
> +        vwmulu.vx       v28, v8, \regtype\()4
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()0, v0
> +        vwmulu.vx       v20, v6, \regtype\()3
> +
> +.ifc \regtype,t
> +        vwmaccsu.vx     v16, a7, v14
> +.else
> +        vwmaccsu.vx     v16, s7, v14
> +.endif
> +
> +.endif
> +        vwadd.wx        v16, v16, a5
> +        vsetvlstatic16  \len
> +
> +.ifc \name,smooth
> +        vwadd.vv        v24, v16, v20
> +.else
> +        vwadd.vv        v24, v16, v28
> +        vwadd.wv        v24, v24, v20
> +.endif
> +        vnsra.wi        v24, v24, 7
> +        vmax.vx         v24, v24, zero
> +        vsetvlstatic8   \len, zero, 32, m2
> +
> +        vnclipu.wi      \dst, v24, 0
> +.ifc \op,avg
> +        vle8.v          v24, (a0)
> +        vaaddu.vv       \dst, \dst, v24
> +.endif
> +
> +.endm
> +
> +.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
> +        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
> +        add             a2, a2, a3
> +.endm
> +
> +.macro epel len, op, name, type, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
> +        epel_filter     \name, \type, t
> +.if \vlen < 256
> +        vsetvlstatic8   \len, a5, 32, m2
> +.else
> +        vsetvlstatic8   \len, a5, 64, m2
> +.endif
> +.ifc \op,avg
> +        csrwi           vxrm, 0
> +.endif
> +
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +.if \len == 64 && \vlen < 256
> +        addi            a0, a0, 32
> +        addi            a2, a2, 32
> +        epel_load       v30, \len, \op, \name, \type, 1, t
> +        vse8.v          v30, (a0)
> +        addi            a0, a0, -32
> +        addi            a2, a2, -32
> +.endif
> +        add             a2, a2, a3
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>          copy_avg \len
> +        .irp op, put, avg
> +                .irp name, regular, sharp, smooth
> +                        .irp type, h, v
> +                                epel \len, \op, \name, \type, 128
> +                                epel \len, \op, \name, \type, 256
> +                        .endr
> +                .endr
> +        .endr
>  .endr
> 
>  bilin_h_v  put, h, a5
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 8fb326dae0..5fd64a1b8c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -81,33 +81,39 @@ void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>  void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>                     const uint8_t *a);
> 
> -#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
> -void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
> +void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);             \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
> +void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
> +                                        ptrdiff_t dststride,                 \
>                                          const uint8_t *src,                  \
>                                          ptrdiff_t srcstride,                 \
>                                          int h, int mx, int my);              \
>                                                                               \
> -void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
> +void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
> +                                         ptrdiff_t dststride,                \
>                                           const uint8_t *src,                 \
>                                           ptrdiff_t srcstride,                \
>                                           int h, int mx, int my);
> @@ -146,23 +152,41 @@ void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
>                          const uint8_t *src, ptrdiff_t srcstride,   \
>                          int h, int mx, int my);
> 
> -VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
> -VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
> -VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
> -
> -VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
> -VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
> +
> +VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
> +VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
> 
>  VP9_BILINEAR_RISCV_RVV_FUNC(64);
>  VP9_BILINEAR_RISCV_RVV_FUNC(32);
> diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
> index b3700dfb08..5f759e6bc8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -49,7 +49,8 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>  # endif
> 
>  #if HAVE_RVV
> -    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
> +    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
> +    if (ff_rv_vlen_least(128)) {
> 
>  #define init_fpel(idx1, sz)                                           \
>      dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
> @@ -95,6 +96,40 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
>      dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
> 
>  #undef init_fpel
> +
> +#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
> +    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
> +    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
> +    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
> +        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
> +
> +#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
> +    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
> +    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
> +    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
> +    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
> +    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
> +
> +    init_subpel2(0, 1, 0, h, put, 128);
> +    init_subpel2(1, 1, 0, h, avg, 128);
> +
> +    if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +        init_subpel2(0, 0, 1, v, put, 128);
> +        init_subpel2(1, 0, 1, v, avg, 128);
> +    }
> +
> +    }
> +    if (ff_rv_vlen_least(256)) {
> +        init_subpel2(0, 1, 0, h, put, 256);
> +        init_subpel2(1, 1, 0, h, avg, 256);
> +
> +        if (flags & AV_CPU_FLAG_RVB_ADDR) {
> +            init_subpel2(0, 0, 1, v, put, 256);
> +            init_subpel2(1, 0, 1, v, avg, 256);
> +        }
> +    }
>      }
>  #endif
>  #endif
diff mbox series

Patch

diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
index 990271736b..53dd833dac 100644
--- a/libavcodec/riscv/vp9_mc_rvv.S
+++ b/libavcodec/riscv/vp9_mc_rvv.S
@@ -36,6 +36,18 @@ 
 .endif
 .endm
 
+.macro vsetvlstatic16 len
+.ifc \len,4
+        vsetvli         zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+        vsetvli         zero, zero, e16, m1, ta, ma
+.elseif \len == 16
+        vsetvli         zero, zero, e16, m2, ta, ma
+.else
+        vsetvli         zero, zero, e16, m4, ta, ma
+.endif
+.endm
+
 .macro copy_avg len
 func ff_vp9_avg\len\()_rvv, zve32x
         csrwi           vxrm, 0
@@ -127,8 +139,200 @@  func ff_\op\()_vp9_bilin_4hv_rvv, zve32x
 endfunc
 .endm
 
+.macro epel_filter name, type, regtype
+        lla             \regtype\()2, ff_vp9_subpel_filters
+
+.ifc \name,regular
+        addi            \regtype\()2, \regtype\()2, 16*8*2
+.endif
+.ifc \name,sharp
+        addi            \regtype\()2, \regtype\()2, 16*8*2*2
+.endif
+
+.ifc \type,v
+        slli            \regtype\()0, a6, 4
+.else
+        slli            \regtype\()0, a5, 4
+.endif
+        add             \regtype\()0, \regtype\()0, \regtype\()2
+
+        lh              \regtype\()1, 2(\regtype\()0)
+        lh              \regtype\()2, 4(\regtype\()0)
+        lh              \regtype\()3, 6(\regtype\()0)
+        lh              \regtype\()4, 8(\regtype\()0)
+        lh              \regtype\()5, 10(\regtype\()0)
+        lh              \regtype\()6, 12(\regtype\()0)
+
+.ifc \regtype,t
+        lh              a7, 14(\regtype\()0)
+.else
+        lh              s7, 14(\regtype\()0)
+.endif
+        lh              \regtype\()0, 0(\regtype\()0)
+.endm
+
+.macro epel_load dst, len, op, name, type, from_mem, regtype
+        li              a5, 64
+.ifc \from_mem, 1
+        vle8.v          v22, (a2)
+.ifc \type,v
+        sub             a2, a2, a3
+        vle8.v          v20, (a2)
+        sh1add          a2, a3, a2
+        vle8.v          v24, (a2)
+        add             a2, a2, a3
+        vle8.v          v26, (a2)
+        add             a2, a2, a3
+        vle8.v          v28, (a2)
+        add             a2, a2, a3
+        vle8.v          v30, (a2)
+.else
+        addi            a2, a2, -1
+        vle8.v          v20, (a2)
+        addi            a2, a2, 2
+        vle8.v          v24, (a2)
+        addi            a2, a2, 1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 1
+        vle8.v          v28, (a2)
+        addi            a2, a2, 1
+        vle8.v          v30, (a2)
+.endif
+
+.ifc \name,smooth
+        vwmulu.vx       v16, v24, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v20
+        vwmaccu.vx      v16, \regtype\()5, v26
+        vwmaccsu.vx     v16, \regtype\()6, v28
+.else
+        vwmulu.vx       v16, v28, \regtype\()6
+        vwmaccsu.vx     v16, \regtype\()2, v20
+        vwmaccsu.vx     v16, \regtype\()5, v26
+.endif
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v30
+.else
+        vwmaccsu.vx     v16, s7, v30
+.endif
+
+.ifc \type,v
+        .rept 6
+        sub             a2, a2, a3
+        .endr
+        vle8.v          v28, (a2)
+        sub             a2, a2, a3
+        vle8.v          v26, (a2)
+        sh1add          a2, a3, a2
+        add             a2, a2, a3
+.else
+        addi            a2, a2, -6
+        vle8.v          v28, (a2)
+        addi            a2, a2, -1
+        vle8.v          v26, (a2)
+        addi            a2, a2, 3
+.endif
+
+.ifc \name,smooth
+        vwmaccsu.vx     v16, \regtype\()1, v28
+.else
+        vwmaccu.vx      v16, \regtype\()1, v28
+        vwmulu.vx       v28, v24, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v26
+        vwmulu.vx       v20, v22, \regtype\()3
+.else
+.ifc \name,smooth
+        vwmulu.vx       v16, v8, \regtype\()4
+        vwmaccu.vx      v16, \regtype\()2, v4
+        vwmaccu.vx      v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()1, v2
+.else
+        vwmulu.vx       v16, v2, \regtype\()1
+        vwmaccu.vx      v16, \regtype\()6, v12
+        vwmaccsu.vx     v16, \regtype\()5, v10
+        vwmaccsu.vx     v16, \regtype\()2, v4
+        vwmulu.vx       v28, v8, \regtype\()4
+.endif
+        vwmaccsu.vx     v16, \regtype\()0, v0
+        vwmulu.vx       v20, v6, \regtype\()3
+
+.ifc \regtype,t
+        vwmaccsu.vx     v16, a7, v14
+.else
+        vwmaccsu.vx     v16, s7, v14
+.endif
+
+.endif
+        vwadd.wx        v16, v16, a5
+        vsetvlstatic16  \len
+
+.ifc \name,smooth
+        vwadd.vv        v24, v16, v20
+.else
+        vwadd.vv        v24, v16, v28
+        vwadd.wv        v24, v24, v20
+.endif
+        vnsra.wi        v24, v24, 7
+        vmax.vx         v24, v24, zero
+        vsetvlstatic8   \len, zero, 32, m2
+
+        vnclipu.wi      \dst, v24, 0
+.ifc \op,avg
+        vle8.v          v24, (a0)
+        vaaddu.vv       \dst, \dst, v24
+.endif
+
+.endm
+
+.macro epel_load_inc dst, len, op, name, type, from_mem, regtype
+        epel_load       \dst, \len, \op, \name, \type, \from_mem, \regtype
+        add             a2, a2, a3
+.endm
+
+.macro epel len, op, name, type, vlen
+func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
+        epel_filter     \name, \type, t
+.if \vlen < 256
+        vsetvlstatic8   \len, a5, 32, m2
+.else
+        vsetvlstatic8   \len, a5, 64, m2
+.endif
+.ifc \op,avg
+        csrwi           vxrm, 0
+.endif
+
+1:
+        addi            a4, a4, -1
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+.if \len == 64 && \vlen < 256
+        addi            a0, a0, 32
+        addi            a2, a2, 32
+        epel_load       v30, \len, \op, \name, \type, 1, t
+        vse8.v          v30, (a0)
+        addi            a0, a0, -32
+        addi            a2, a2, -32
+.endif
+        add             a2, a2, a3
+        add             a0, a0, a1
+        bnez            a4, 1b
+
+        ret
+endfunc
+.endm
+
 .irp len, 64, 32, 16, 8, 4
         copy_avg \len
+        .irp op, put, avg
+                .irp name, regular, sharp, smooth
+                        .irp type, h, v
+                                epel \len, \op, \name, \type, 128
+                                epel \len, \op, \name, \type, 256
+                        .endr
+                .endr
+        .endr
 .endr
 
 bilin_h_v  put, h, a5
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 8fb326dae0..5fd64a1b8c 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -81,33 +81,39 @@  void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
 void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
                    const uint8_t *a);
 
-#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)                         \
-void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+#define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx, min_vlen)              \
+void ff_put_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_put_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_put_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_put_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);             \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##h_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##v_rvv(uint8_t *dst, ptrdiff_t dststride,   \
+void ff_avg_vp9_8tap_##type##_##SIZE##v_rvv##min_vlen(uint8_t *dst,          \
+                                        ptrdiff_t dststride,                 \
                                         const uint8_t *src,                  \
                                         ptrdiff_t srcstride,                 \
                                         int h, int mx, int my);              \
                                                                              \
-void ff_avg_8tap_##type##_##SIZE##hv_rvv(uint8_t *dst, ptrdiff_t dststride,  \
+void ff_avg_vp9_8tap_##type##_##SIZE##hv_rvv##min_vlen(uint8_t *dst,         \
+                                         ptrdiff_t dststride,                \
                                          const uint8_t *src,                 \
                                          ptrdiff_t srcstride,                \
                                          int h, int mx, int my);
@@ -146,23 +152,41 @@  void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,     \
                         const uint8_t *src, ptrdiff_t srcstride,   \
                         int h, int mx, int my);
 
-VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR);
-VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP);
-VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP);
-
-VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH);
-VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH);
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 128);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 128);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, regular, FILTER_8TAP_REGULAR, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, regular, FILTER_8TAP_REGULAR, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, sharp, FILTER_8TAP_SHARP, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, sharp, FILTER_8TAP_SHARP, 256);
+
+VP9_8TAP_RISCV_RVV_FUNC(64, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(32, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(16, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(8, smooth, FILTER_8TAP_SMOOTH, 256);
+VP9_8TAP_RISCV_RVV_FUNC(4, smooth, FILTER_8TAP_SMOOTH, 256);
 
 VP9_BILINEAR_RISCV_RVV_FUNC(64);
 VP9_BILINEAR_RISCV_RVV_FUNC(32);
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index b3700dfb08..5f759e6bc8 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -49,7 +49,8 @@  static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
 # endif
 
 #if HAVE_RVV
-    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
+    if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32)) {
+    if (ff_rv_vlen_least(128)) {
 
 #define init_fpel(idx1, sz)                                           \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_vp9_avg##sz##_rvv;  \
@@ -95,6 +96,40 @@  static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
     dsp->mc[4][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_4hv_rvv;
 
 #undef init_fpel
+
+#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, vlen)  \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_smooth_##sz##dir##_rvv##vlen;       \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_regular_##sz##dir##_rvv##vlen;      \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] =   \
+        ff_##type##_vp9_8tap_sharp_##sz##dir##_rvv##vlen;
+
+#define init_subpel2(idx, idxh, idxv, dir, type, vlen)      \
+    init_subpel1(0, idx, idxh, idxv, 64, dir, type, vlen);  \
+    init_subpel1(1, idx, idxh, idxv, 32, dir, type, vlen);  \
+    init_subpel1(2, idx, idxh, idxv, 16, dir, type, vlen);  \
+    init_subpel1(3, idx, idxh, idxv,  8, dir, type, vlen);  \
+    init_subpel1(4, idx, idxh, idxv,  4, dir, type, vlen)
+
+    init_subpel2(0, 1, 0, h, put, 128);
+    init_subpel2(1, 1, 0, h, avg, 128);
+
+    if (flags & AV_CPU_FLAG_RVB_ADDR) {
+        init_subpel2(0, 0, 1, v, put, 128);
+        init_subpel2(1, 0, 1, v, avg, 128);
+    }
+
+    }
+    if (ff_rv_vlen_least(256)) {
+        init_subpel2(0, 1, 0, h, put, 256);
+        init_subpel2(1, 1, 0, h, avg, 256);
+
+        if (flags & AV_CPU_FLAG_RVB_ADDR) {
+            init_subpel2(0, 0, 1, v, put, 256);
+            init_subpel2(1, 0, 1, v, avg, 256);
+        }
+    }
     }
 #endif
 #endif