
[FFmpeg-devel,v3,6/9] lavc/vp8dsp: R-V V put_epel hv

Message ID tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com
State New
Series [FFmpeg-devel,v3,1/9] lavc/vp8dsp: R-V put_vp8_pixels

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

uk7b@foxmail.com May 6, 2024, 3:38 a.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_epel4_h4v4_c: 20.0
vp8_put_epel4_h4v4_rvv_i32: 11.0
vp8_put_epel4_h4v6_c: 25.2
vp8_put_epel4_h4v6_rvv_i32: 13.5
vp8_put_epel4_h6v4_c: 22.2
vp8_put_epel4_h6v4_rvv_i32: 14.5
vp8_put_epel4_h6v6_c: 29.0
vp8_put_epel4_h6v6_rvv_i32: 15.7
vp8_put_epel8_h4v4_c: 73.0
vp8_put_epel8_h4v4_rvv_i32: 22.2
vp8_put_epel8_h4v6_c: 90.5
vp8_put_epel8_h4v6_rvv_i32: 26.7
vp8_put_epel8_h6v4_c: 85.0
vp8_put_epel8_h6v4_rvv_i32: 27.2
vp8_put_epel8_h6v6_c: 104.7
vp8_put_epel8_h6v6_rvv_i32: 29.5
vp8_put_epel16_h4v4_c: 145.5
vp8_put_epel16_h4v4_rvv_i32: 26.5
vp8_put_epel16_h4v6_c: 190.7
vp8_put_epel16_h4v6_rvv_i32: 47.5
vp8_put_epel16_h6v4_c: 173.7
vp8_put_epel16_h6v4_rvv_i32: 33.2
vp8_put_epel16_h6v6_c: 222.2
vp8_put_epel16_h6v6_rvv_i32: 35.5
---
 libavcodec/riscv/vp8dsp_init.c |  13 ++++
 libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
 2 files changed, 109 insertions(+), 21 deletions(-)

Comments

Rémi Denis-Courmont May 6, 2024, 7:24 p.m. UTC | #1
On Monday, May 6, 2024, 6:38:06 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp8_put_epel4_h4v4_c: 20.0
> vp8_put_epel4_h4v4_rvv_i32: 11.0
> vp8_put_epel4_h4v6_c: 25.2
> vp8_put_epel4_h4v6_rvv_i32: 13.5
> vp8_put_epel4_h6v4_c: 22.2
> vp8_put_epel4_h6v4_rvv_i32: 14.5
> vp8_put_epel4_h6v6_c: 29.0
> vp8_put_epel4_h6v6_rvv_i32: 15.7
> vp8_put_epel8_h4v4_c: 73.0
> vp8_put_epel8_h4v4_rvv_i32: 22.2
> vp8_put_epel8_h4v6_c: 90.5
> vp8_put_epel8_h4v6_rvv_i32: 26.7
> vp8_put_epel8_h6v4_c: 85.0
> vp8_put_epel8_h6v4_rvv_i32: 27.2
> vp8_put_epel8_h6v6_c: 104.7
> vp8_put_epel8_h6v6_rvv_i32: 29.5
> vp8_put_epel16_h4v4_c: 145.5
> vp8_put_epel16_h4v4_rvv_i32: 26.5
> vp8_put_epel16_h4v6_c: 190.7
> vp8_put_epel16_h4v6_rvv_i32: 47.5
> vp8_put_epel16_h6v4_c: 173.7
> vp8_put_epel16_h6v4_rvv_i32: 33.2
> vp8_put_epel16_h6v6_c: 222.2
> vp8_put_epel16_h6v6_rvv_i32: 35.5
> ---
>  libavcodec/riscv/vp8dsp_init.c |  13 ++++
>  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
>  2 files changed, 109 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index dc3e087f01..463c8fa0a2 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>          c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
>          c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
>          c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> +
> +        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> +        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> +        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
>      }
>  #endif
>  #endif
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index bf268e4d8d..baa8152830 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -161,26 +161,26 @@ const subpel_filters
>          .byte 0,  -1,  12, 123,  -6, 0
>  endconst
> 
> -.macro epel_filter size type
> -        lla             t2, subpel_filters
> +.macro epel_filter size type regtype
> +        lla             \regtype\()2, subpel_filters
>  .ifc \type,v
> -        addi            t0, a6, -1
> +        addi            \regtype\()0, a6, -1

IMO, passing a complete register name, if you really need to vary it, would be 
simpler and more flexible than an ABI register type prefix.
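
For instance, a hypothetical variant that takes the six registers explicitly
could look like this (sketch only; the parameter names f0-f5 are made up here,
and the .irp shorthand has to be expanded because the names are no longer
formed by suffixing a prefix):

.macro epel_filter size type f0 f1 f2 f3 f4 f5
        lla             \f2, subpel_filters
.ifc \type,v
        addi            \f0, a6, -1
.else
        addi            \f0, a5, -1
.endif
        li              \f1, 6
        mul             \f0, \f0, \f1
        add             \f0, \f0, \f2
        lb              \f1, 1(\f0)
        lb              \f2, 2(\f0)
        lb              \f3, 3(\f0)
        lb              \f4, 4(\f0)
.ifc \size,6
        lb              \f5, 5(\f0)
        lb              \f0, (\f0)
.endif
.endm

Every epel_load / epel_load_inc call would then have to spell the same six
names out again.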

>  .elseif \type == h
> -        addi            t0, a5, -1
> +        addi            \regtype\()0, a5, -1
>  .endif
> -        li              t1, 6
> -        mul             t0, t0, t1
> -        add             t0, t0, t2
> +        li              \regtype\()1, 6
> +        mul             \regtype\()0, \regtype\()0, \regtype\()1
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
>          .irp n 1,2,3,4
> -        lb              t\n, \n(t0)
> +        lb              \regtype\n, \n(\regtype\()0)
>          .endr
>  .ifc \size,6
> -        lb              t5, 5(t0)
> -        lb              t0, (t0)
> +        lb              \regtype\()5, 5(\regtype\()0)
> +        lb              \regtype\()0, (\regtype\()0)
>  .endif
>  .endm
> 
> -.macro epel_load dst len size type
> +.macro epel_load dst len size type from_mem regtype
>  .ifc \type,v
>          mv              a5, a3
>  .else
> @@ -189,24 +189,35 @@ endconst
>          sub             t6, a2, a5
>          add             a7, a2, a5
> 
> +.if \from_mem
>          vle8.v          v24, (a2)
>          vle8.v          v22, (t6)
>          vle8.v          v26, (a7)
>          add             a7, a7, a5
>          vle8.v          v28, (a7)
> -        vwmulu.vx       v16, v24, t2
> -        vwmulu.vx       v20, v26, t3
> +        vwmulu.vx       v16, v24, \regtype\()2
> +        vwmulu.vx       v20, v26, \regtype\()3
>  .ifc \size,6
>          sub             t6, t6, a5
>          add             a7, a7, a5
>          vle8.v          v24, (t6)
>          vle8.v          v26, (a7)
> -        vwmaccu.vx      v16, t0, v24
> -        vwmaccu.vx      v16, t5, v26
> +        vwmaccu.vx      v16, \regtype\()0, v24
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()1, v22
> +        vwmaccsu.vx     v16, \regtype\()4, v28
> +.else
> +        vwmulu.vx       v16, v4, \regtype\()2
> +        vwmulu.vx       v20, v6, \regtype\()3
> +        .ifc \size,6
> +        vwmaccu.vx      v16, \regtype\()0, v0
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        .endif
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +        vwmaccsu.vx     v16, \regtype\()4, v8
>  .endif
>          li              t6, 64
> -        vwmaccsu.vx     v16, t1, v22
> -        vwmaccsu.vx     v16, t4, v28
>          vwadd.wx        v16, v16, t6
>          vsetvlstatic16  \len
>          vwadd.vv        v24, v16, v20
> @@ -216,18 +227,18 @@ endconst
>          vnclipu.wi      \dst, v24, 0
>  .endm
> 
> -.macro epel_load_inc dst len size type
> -        epel_load       \dst \len \size \type
> +.macro epel_load_inc dst len size type from_mem regtype
> +        epel_load       \dst \len \size \type \from_mem \regtype
>          add             a2, a2, a3
>  .endm
> 
>  .macro epel len size type
>  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> -        epel_filter     \size \type
> +        epel_filter     \size \type t
>          vsetvlstatic8   \len
>  1:
>          addi            a4, a4, -1
> -        epel_load_inc   v30 \len \size \type
> +        epel_load_inc   v30 \len \size \type 1 t
>          vse8.v          v30, (a0)
>          add             a0, a0, a1
>          bnez            a4, 1b
> @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro epel_hv len hsize vsize
> +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> +        addi            sp, sp, -48
> +        .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64

This code actually requires ==, not >=.

> +        sd              s\n, \n\()<<3(sp)
> +#else
> +        sw              s\n, \n\()<<3(sp)

You can do that but you only need half the stack space and offsets.

(And that's why I avoid S and FS registers like the plague, but sometimes you 
just can't.)
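
A minimal sketch of a prologue/epilogue addressing both points (an exact XLEN
test, and half-size slots with a smaller frame on RV32) could look like the
following; this is an illustration only, not the actual follow-up patch, and
the non-64-bit branch simply assumes RV32:

#if __riscv_xlen == 64
        addi            sp, sp, -48
        .irp n 0,1,2,3,4,5
        sd              s\n, \n\()<<3(sp)
        .endr
#else
        addi            sp, sp, -32        /* 6 * 4 bytes, kept 16-byte aligned */
        .irp n 0,1,2,3,4,5
        sw              s\n, \n\()<<2(sp)
        .endr
#endif
        /* ... filter setup and the main loop ... */
#if __riscv_xlen == 64
        .irp n 0,1,2,3,4,5
        ld              s\n, \n\()<<3(sp)
        .endr
        addi            sp, sp, 48
#else
        .irp n 0,1,2,3,4,5
        lw              s\n, \n\()<<2(sp)
        .endr
        addi            sp, sp, 32
#endif
        ret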

> +#endif
> +        .endr
> +        sub             a2, a2, a3
> +        epel_filter     \hsize h t
> +        epel_filter     \vsize v s
> +        vsetvlstatic8   \len
> +.if \hsize == 6 || \vsize == 6
> +        sub             a2, a2, a3
> +        epel_load_inc   v0 \len \hsize h 1 t
> +.endif
> +        epel_load_inc   v2 \len \hsize h 1 t
> +        epel_load_inc   v4 \len \hsize h 1 t
> +        epel_load_inc   v6 \len \hsize h 1 t
> +        epel_load_inc   v8 \len \hsize h 1 t
> +.if \hsize == 6 || \vsize == 6
> +        epel_load_inc   v10 \len \hsize h 1 t
> +.endif
> +        addi            a4, a4, -1
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30 \len \vsize v 0 s
> +        vse8.v          v30, (a0)
> +.if \hsize == 6 || \vsize == 6
> +        vmv.v.v         v0, v2
> +.endif
> +        vmv.v.v         v2, v4
> +        vmv.v.v         v4, v6
> +        vmv.v.v         v6, v8
> +.if \hsize == 6 || \vsize == 6
> +        vmv.v.v         v8, v10
> +        epel_load_inc   v10 \len \hsize h 1 t
> +.else
> +        epel_load_inc   v8 \len 4 h 1 t
> +.endif
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +        epel_load       v30 \len \vsize v 0 s
> +        vse8.v          v30, (a0)
> +
> +        .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64
> +        ld              s\n, \n\()<<3(sp)
> +#else
> +        lw              s\n, \n\()<<3(sp)
> +#endif
> +        .endr
> +        addi            sp, sp, 48
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len 16,8,4
>  put_vp8_bilin_h_v \len h a5
>  put_vp8_bilin_h_v \len v a6
> @@ -244,4 +315,8 @@ epel \len 6 h
>  epel \len 4 h
>  epel \len 6 v
>  epel \len 4 v
> +epel_hv \len 6 6
> +epel_hv \len 4 4
> +epel_hv \len 6 4
> +epel_hv \len 4 6
>  .endr
flow gg May 7, 2024, 2:31 a.m. UTC | #2
> IMO, passing a complete register name, if you really need to vary it, would be
> simpler and more flexible than an ABI register type prefix.

If full register names were passed here, some call sites would need four
extra parameters and others six, and the same lists would be repeated over
and over.
I find it easy to lose track of which registers each invocation is passing.
With a prefix, only one extra parameter is needed, which I think is less
error-prone.
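For example, the two spellings at the epel_hv call sites would roughly be
(illustration only, assuming the macros were rewritten to take explicit
names):

        /* register-type prefix (this patch): one extra argument */
        epel_filter     \hsize h t
        epel_filter     \vsize v s

        /* explicit register names: six extra arguments, repeated again
         * at every epel_load / epel_load_inc call site */
        epel_filter     \hsize h t0 t1 t2 t3 t4 t5
        epel_filter     \vsize v s0 s1 s2 s3 s4 s5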

> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.

Ok, fixed it


Patch

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dc3e087f01..463c8fa0a2 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -97,6 +97,19 @@  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
         c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
         c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+
+        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index bf268e4d8d..baa8152830 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -161,26 +161,26 @@  const subpel_filters
         .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel_filter size type
-        lla             t2, subpel_filters
+.macro epel_filter size type regtype
+        lla             \regtype\()2, subpel_filters
 .ifc \type,v
-        addi            t0, a6, -1
+        addi            \regtype\()0, a6, -1
 .elseif \type == h
-        addi            t0, a5, -1
+        addi            \regtype\()0, a5, -1
 .endif
-        li              t1, 6
-        mul             t0, t0, t1
-        add             t0, t0, t2
+        li              \regtype\()1, 6
+        mul             \regtype\()0, \regtype\()0, \regtype\()1
+        add             \regtype\()0, \regtype\()0, \regtype\()2
         .irp n 1,2,3,4
-        lb              t\n, \n(t0)
+        lb              \regtype\n, \n(\regtype\()0)
         .endr
 .ifc \size,6
-        lb              t5, 5(t0)
-        lb              t0, (t0)
+        lb              \regtype\()5, 5(\regtype\()0)
+        lb              \regtype\()0, (\regtype\()0)
 .endif
 .endm
 
-.macro epel_load dst len size type
+.macro epel_load dst len size type from_mem regtype
 .ifc \type,v
         mv              a5, a3
 .else
@@ -189,24 +189,35 @@  endconst
         sub             t6, a2, a5
         add             a7, a2, a5
 
+.if \from_mem
         vle8.v          v24, (a2)
         vle8.v          v22, (t6)
         vle8.v          v26, (a7)
         add             a7, a7, a5
         vle8.v          v28, (a7)
-        vwmulu.vx       v16, v24, t2
-        vwmulu.vx       v20, v26, t3
+        vwmulu.vx       v16, v24, \regtype\()2
+        vwmulu.vx       v20, v26, \regtype\()3
 .ifc \size,6
         sub             t6, t6, a5
         add             a7, a7, a5
         vle8.v          v24, (t6)
         vle8.v          v26, (a7)
-        vwmaccu.vx      v16, t0, v24
-        vwmaccu.vx      v16, t5, v26
+        vwmaccu.vx      v16, \regtype\()0, v24
+        vwmaccu.vx      v16, \regtype\()5, v26
+.endif
+        vwmaccsu.vx     v16, \regtype\()1, v22
+        vwmaccsu.vx     v16, \regtype\()4, v28
+.else
+        vwmulu.vx       v16, v4, \regtype\()2
+        vwmulu.vx       v20, v6, \regtype\()3
+        .ifc \size,6
+        vwmaccu.vx      v16, \regtype\()0, v0
+        vwmaccu.vx      v16, \regtype\()5, v10
+        .endif
+        vwmaccsu.vx     v16, \regtype\()1, v2
+        vwmaccsu.vx     v16, \regtype\()4, v8
 .endif
         li              t6, 64
-        vwmaccsu.vx     v16, t1, v22
-        vwmaccsu.vx     v16, t4, v28
         vwadd.wx        v16, v16, t6
         vsetvlstatic16  \len
         vwadd.vv        v24, v16, v20
@@ -216,18 +227,18 @@  endconst
         vnclipu.wi      \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size type
-        epel_load       \dst \len \size \type
+.macro epel_load_inc dst len size type from_mem regtype
+        epel_load       \dst \len \size \type \from_mem \regtype
         add             a2, a2, a3
 .endm
 
 .macro epel len size type
 func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
-        epel_filter     \size \type
+        epel_filter     \size \type t
         vsetvlstatic8   \len
 1:
         addi            a4, a4, -1
-        epel_load_inc   v30 \len \size \type
+        epel_load_inc   v30 \len \size \type 1 t
         vse8.v          v30, (a0)
         add             a0, a0, a1
         bnez            a4, 1b
@@ -236,6 +247,66 @@  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
 endfunc
 .endm
 
+.macro epel_hv len hsize vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+        addi            sp, sp, -48
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        sd              s\n, \n\()<<3(sp)
+#else
+        sw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        sub             a2, a2, a3
+        epel_filter     \hsize h t
+        epel_filter     \vsize v s
+        vsetvlstatic8   \len
+.if \hsize == 6 || \vsize == 6
+        sub             a2, a2, a3
+        epel_load_inc   v0 \len \hsize h 1 t
+.endif
+        epel_load_inc   v2 \len \hsize h 1 t
+        epel_load_inc   v4 \len \hsize h 1 t
+        epel_load_inc   v6 \len \hsize h 1 t
+        epel_load_inc   v8 \len \hsize h 1 t
+.if \hsize == 6 || \vsize == 6
+        epel_load_inc   v10 \len \hsize h 1 t
+.endif
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30 \len \vsize v 0 s
+        vse8.v          v30, (a0)
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v0, v2
+.endif
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v8, v10
+        epel_load_inc   v10 \len \hsize h 1 t
+.else
+        epel_load_inc   v8 \len 4 h 1 t
+.endif
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30 \len \vsize v 0 s
+        vse8.v          v30, (a0)
+
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        ld              s\n, \n\()<<3(sp)
+#else
+        lw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        addi            sp, sp, 48
+
+        ret
+endfunc
+.endm
+
 .irp len 16,8,4
 put_vp8_bilin_h_v \len h a5
 put_vp8_bilin_h_v \len v a6
@@ -244,4 +315,8 @@  epel \len 6 h
 epel \len 4 h
 epel \len 6 v
 epel \len 4 v
+epel_hv \len 6 6
+epel_hv \len 4 4
+epel_hv \len 6 4
+epel_hv \len 4 6
 .endr