[FFmpeg-devel,v3,6/9] lavc/vp8dsp: R-V V put_epel hv

Message ID	tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com
State	New
Headers	show Delivered-To: ffmpegpatchwork2@gmail.com Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100; Message-ID: <tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com> From: uk7b@foxmail.com To: ffmpeg-devel@ffmpeg.org Date: Mon, 6 May 2024 11:38:06 +0800 In-Reply-To: <20240506033809.3790245-1-uk7b@foxmail.com> References: <20240506033809.3790245-1-uk7b@foxmail.com> MIME-Version: 1.0 Subject: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv Precedence: list Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: sunyuechi <sunyuechi@iscas.ac.cn> Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Errors-To: ffmpeg-devel-bounces@ffmpeg.org Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>
Series	[FFmpeg-devel,v3,1/9] lavc/vp8dsp: R-V put_vp8_pixels \| expand [FFmpeg-devel,v3,1/9] lavc/vp8dsp: R-V put_vp8_pixels [FFmpeg-devel,v3,2/9] lavc/vp8dsp: R-V V put_bilin_h v [FFmpeg-devel,v3,3/9] lavc/vp8dsp: R-V V put_bilin_hv [FFmpeg-devel,v3,4/9] lavc/vp8dsp: R-V V put_epel h [FFmpeg-devel,v3,5/9] lavc/vp8dsp: R-V V put_epel v [FFmpeg-devel,v3,6/9] lavc/vp8dsp: R-V V put_epel hv [FFmpeg-devel,v3,7/9] lavc/vp8dsp: R-V V loop_filter_simple [FFmpeg-devel,v3,8/9] lavc/vp8dsp: R-V V loop_filter_inner [FFmpeg-devel,v3,9/9] lavc/vp8dsp: R-V V loop_filter

Message ID

tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com

State

New

Headers

Received-SPF: pass (google.com: domain of ffmpeg-devel-bounces@ffmpeg.org
 designates 79.124.17.100 as permitted sender) client-ip=79.124.17.100;
Message-ID: <tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com>
From: uk7b@foxmail.com
To: ffmpeg-devel@ffmpeg.org
Date: Mon,  6 May 2024 11:38:06 +0800
In-Reply-To: <20240506033809.3790245-1-uk7b@foxmail.com>
References: <20240506033809.3790245-1-uk7b@foxmail.com>
MIME-Version: 1.0
Subject: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv
Precedence: list
Reply-To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org>
Cc: sunyuechi <sunyuechi@iscas.ac.cn>
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit
Errors-To: ffmpeg-devel-bounces@ffmpeg.org
Sender: "ffmpeg-devel" <ffmpeg-devel-bounces@ffmpeg.org>

Series

[FFmpeg-devel,v3,1/9] lavc/vp8dsp: R-V put_vp8_pixels | expand

Context	Check	Description
andriy/make_x86	success	Make finished
andriy/make_fate_x86	success	Make fate finished

Context

Check

Description

andriy/make_x86

success

Make finished

andriy/make_fate_x86

success

Make fate finished

Commit Message

uk7b@foxmail.com May 6, 2024, 3:38 a.m. UTC

From: sunyuechi <sunyuechi@iscas.ac.cn>

C908:
vp8_put_epel4_h4v4_c: 20.0
vp8_put_epel4_h4v4_rvv_i32: 11.0
vp8_put_epel4_h4v6_c: 25.2
vp8_put_epel4_h4v6_rvv_i32: 13.5
vp8_put_epel4_h6v4_c: 22.2
vp8_put_epel4_h6v4_rvv_i32: 14.5
vp8_put_epel4_h6v6_c: 29.0
vp8_put_epel4_h6v6_rvv_i32: 15.7
vp8_put_epel8_h4v4_c: 73.0
vp8_put_epel8_h4v4_rvv_i32: 22.2
vp8_put_epel8_h4v6_c: 90.5
vp8_put_epel8_h4v6_rvv_i32: 26.7
vp8_put_epel8_h6v4_c: 85.0
vp8_put_epel8_h6v4_rvv_i32: 27.2
vp8_put_epel8_h6v6_c: 104.7
vp8_put_epel8_h6v6_rvv_i32: 29.5
vp8_put_epel16_h4v4_c: 145.5
vp8_put_epel16_h4v4_rvv_i32: 26.5
vp8_put_epel16_h4v6_c: 190.7
vp8_put_epel16_h4v6_rvv_i32: 47.5
vp8_put_epel16_h6v4_c: 173.7
vp8_put_epel16_h6v4_rvv_i32: 33.2
vp8_put_epel16_h6v6_c: 222.2
vp8_put_epel16_h6v6_rvv_i32: 35.5
---
 libavcodec/riscv/vp8dsp_init.c |  13 ++++
 libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
 2 files changed, 109 insertions(+), 21 deletions(-)

Comments

Rémi Denis-Courmont May 6, 2024, 7:24 p.m. UTC | #1

Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit :
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> C908:
> vp8_put_epel4_h4v4_c: 20.0
> vp8_put_epel4_h4v4_rvv_i32: 11.0
> vp8_put_epel4_h4v6_c: 25.2
> vp8_put_epel4_h4v6_rvv_i32: 13.5
> vp8_put_epel4_h6v4_c: 22.2
> vp8_put_epel4_h6v4_rvv_i32: 14.5
> vp8_put_epel4_h6v6_c: 29.0
> vp8_put_epel4_h6v6_rvv_i32: 15.7
> vp8_put_epel8_h4v4_c: 73.0
> vp8_put_epel8_h4v4_rvv_i32: 22.2
> vp8_put_epel8_h4v6_c: 90.5
> vp8_put_epel8_h4v6_rvv_i32: 26.7
> vp8_put_epel8_h6v4_c: 85.0
> vp8_put_epel8_h6v4_rvv_i32: 27.2
> vp8_put_epel8_h6v6_c: 104.7
> vp8_put_epel8_h6v6_rvv_i32: 29.5
> vp8_put_epel16_h4v4_c: 145.5
> vp8_put_epel16_h4v4_rvv_i32: 26.5
> vp8_put_epel16_h4v6_c: 190.7
> vp8_put_epel16_h4v6_rvv_i32: 47.5
> vp8_put_epel16_h6v4_c: 173.7
> vp8_put_epel16_h6v4_rvv_i32: 33.2
> vp8_put_epel16_h6v6_c: 222.2
> vp8_put_epel16_h6v6_rvv_i32: 35.5
> ---
>  libavcodec/riscv/vp8dsp_init.c |  13 ++++
>  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
>  2 files changed, 109 insertions(+), 21 deletions(-)
> 
> diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> index dc3e087f01..463c8fa0a2 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>          c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
>          c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
>          c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> +
> +        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> +        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> +        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> +        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> +        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> +        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
>      }
>  #endif
>  #endif
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index bf268e4d8d..baa8152830 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -161,26 +161,26 @@ const subpel_filters
>          .byte 0,  -1,  12, 123,  -6, 0
>  endconst
> 
> -.macro epel_filter size type
> -        lla             t2, subpel_filters
> +.macro epel_filter size type regtype
> +        lla             \regtype\()2, subpel_filters
>  .ifc \type,v
> -        addi            t0, a6, -1
> +        addi            \regtype\()0, a6, -1

IMO, passing a complete register name, if you really need to vary it, would be 
simpler and more flexible than an ABI register type prefix.

>  .elseif \type == h
> -        addi            t0, a5, -1
> +        addi            \regtype\()0, a5, -1
>  .endif
> -        li              t1, 6
> -        mul             t0, t0, t1
> -        add             t0, t0, t2
> +        li              \regtype\()1, 6
> +        mul             \regtype\()0, \regtype\()0, \regtype\()1
> +        add             \regtype\()0, \regtype\()0, \regtype\()2
>          .irp n 1,2,3,4
> -        lb              t\n, \n(t0)
> +        lb              \regtype\n, \n(\regtype\()0)
>          .endr
>  .ifc \size,6
> -        lb              t5, 5(t0)
> -        lb              t0, (t0)
> +        lb              \regtype\()5, 5(\regtype\()0)
> +        lb              \regtype\()0, (\regtype\()0)
>  .endif
>  .endm
> 
> -.macro epel_load dst len size type
> +.macro epel_load dst len size type from_mem regtype
>  .ifc \type,v
>          mv              a5, a3
>  .else
> @@ -189,24 +189,35 @@ endconst
>          sub             t6, a2, a5
>          add             a7, a2, a5
> 
> +.if \from_mem
>          vle8.v          v24, (a2)
>          vle8.v          v22, (t6)
>          vle8.v          v26, (a7)
>          add             a7, a7, a5
>          vle8.v          v28, (a7)
> -        vwmulu.vx       v16, v24, t2
> -        vwmulu.vx       v20, v26, t3
> +        vwmulu.vx       v16, v24, \regtype\()2
> +        vwmulu.vx       v20, v26, \regtype\()3
>  .ifc \size,6
>          sub             t6, t6, a5
>          add             a7, a7, a5
>          vle8.v          v24, (t6)
>          vle8.v          v26, (a7)
> -        vwmaccu.vx      v16, t0, v24
> -        vwmaccu.vx      v16, t5, v26
> +        vwmaccu.vx      v16, \regtype\()0, v24
> +        vwmaccu.vx      v16, \regtype\()5, v26
> +.endif
> +        vwmaccsu.vx     v16, \regtype\()1, v22
> +        vwmaccsu.vx     v16, \regtype\()4, v28
> +.else
> +        vwmulu.vx       v16, v4, \regtype\()2
> +        vwmulu.vx       v20, v6, \regtype\()3
> +        .ifc \size,6
> +        vwmaccu.vx      v16, \regtype\()0, v0
> +        vwmaccu.vx      v16, \regtype\()5, v10
> +        .endif
> +        vwmaccsu.vx     v16, \regtype\()1, v2
> +        vwmaccsu.vx     v16, \regtype\()4, v8
>  .endif
>          li              t6, 64
> -        vwmaccsu.vx     v16, t1, v22
> -        vwmaccsu.vx     v16, t4, v28
>          vwadd.wx        v16, v16, t6
>          vsetvlstatic16  \len
>          vwadd.vv        v24, v16, v20
> @@ -216,18 +227,18 @@ endconst
>          vnclipu.wi      \dst, v24, 0
>  .endm
> 
> -.macro epel_load_inc dst len size type
> -        epel_load       \dst \len \size \type
> +.macro epel_load_inc dst len size type from_mem regtype
> +        epel_load       \dst \len \size \type \from_mem \regtype
>          add             a2, a2, a3
>  .endm
> 
>  .macro epel len size type
>  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> -        epel_filter     \size \type
> +        epel_filter     \size \type t
>          vsetvlstatic8   \len
>  1:
>          addi            a4, a4, -1
> -        epel_load_inc   v30 \len \size \type
> +        epel_load_inc   v30 \len \size \type 1 t
>          vse8.v          v30, (a0)
>          add             a0, a0, a1
>          bnez            a4, 1b
> @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
>  endfunc
>  .endm
> 
> +.macro epel_hv len hsize vsize
> +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> +        addi            sp, sp, -48
> +        .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64

This code actually requires ==, not >=.

> +        sd              s\n, \n\()<<3(sp)
> +#else
> +        sw              s\n, \n\()<<3(sp)

You can do that but you only need half the stack space and offsets.

(And that's why I avoid S and FS registers like the plague, but sometimes you 
just can't.)

> +#endif
> +        .endr
> +        sub             a2, a2, a3
> +        epel_filter     \hsize h t
> +        epel_filter     \vsize v s
> +        vsetvlstatic8   \len
> +.if \hsize == 6 || \vsize == 6
> +        sub             a2, a2, a3
> +        epel_load_inc   v0 \len \hsize h 1 t
> +.endif
> +        epel_load_inc   v2 \len \hsize h 1 t
> +        epel_load_inc   v4 \len \hsize h 1 t
> +        epel_load_inc   v6 \len \hsize h 1 t
> +        epel_load_inc   v8 \len \hsize h 1 t
> +.if \hsize == 6 || \vsize == 6
> +        epel_load_inc   v10 \len \hsize h 1 t
> +.endif
> +        addi            a4, a4, -1
> +1:
> +        addi            a4, a4, -1
> +        epel_load       v30 \len \vsize v 0 s
> +        vse8.v          v30, (a0)
> +.if \hsize == 6 || \vsize == 6
> +        vmv.v.v         v0, v2
> +.endif
> +        vmv.v.v         v2, v4
> +        vmv.v.v         v4, v6
> +        vmv.v.v         v6, v8
> +.if \hsize == 6 || \vsize == 6
> +        vmv.v.v         v8, v10
> +        epel_load_inc   v10 \len \hsize h 1 t
> +.else
> +        epel_load_inc   v8 \len 4 h 1 t
> +.endif
> +        add             a0, a0, a1
> +        bnez            a4, 1b
> +        epel_load       v30 \len \vsize v 0 s
> +        vse8.v          v30, (a0)
> +
> +        .irp n 0,1,2,3,4,5
> +#if __riscv_xlen >= 64
> +        ld              s\n, \n\()<<3(sp)
> +#else
> +        lw              s\n, \n\()<<3(sp)
> +#endif
> +        .endr
> +        addi            sp, sp, 48
> +
> +        ret
> +endfunc
> +.endm
> +
>  .irp len 16,8,4
>  put_vp8_bilin_h_v \len h a5
>  put_vp8_bilin_h_v \len v a6
> @@ -244,4 +315,8 @@ epel \len 6 h
>  epel \len 4 h
>  epel \len 6 v
>  epel \len 4 v
> +epel_hv \len 6 6
> +epel_hv \len 4 4
> +epel_hv \len 6 4
> +epel_hv \len 4 6
>  .endr

flow gg May 7, 2024, 2:31 a.m. UTC | #2

> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.

If the full register name is passed here, some require four parameters,
some require six parameters, and there is often repetition.
I feel it's easy to get confused about the differences between the
parameters passed each time.
If we use a prefix instead, we would only need one parameter, which I think
would be less error-prone.

> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.

Ok, fixed it

Rémi Denis-Courmont <remi@remlab.net> 于2024年5月7日周二 03:25写道：

> Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit :
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> >  libavcodec/riscv/vp8dsp_init.c |  13 ++++
> >  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++++++++++++++++++++++++++------
> >  2 files changed, 109 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >          c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> >          c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> >          c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > +        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > +        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> >      }
> >  #endif
> >  #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> > index bf268e4d8d..baa8152830 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> >          .byte 0,  -1,  12, 123,  -6, 0
> >  endconst
> >
> > -.macro epel_filter size type
> > -        lla             t2, subpel_filters
> > +.macro epel_filter size type regtype
> > +        lla             \regtype\()2, subpel_filters
> >  .ifc \type,v
> > -        addi            t0, a6, -1
> > +        addi            \regtype\()0, a6, -1
>
> IMO, passing a complete register name, if you really need to vary it,
> would be
> simpler and more flexible than an ABI register type prefix.
>
> >  .elseif \type == h
> > -        addi            t0, a5, -1
> > +        addi            \regtype\()0, a5, -1
> >  .endif
> > -        li              t1, 6
> > -        mul             t0, t0, t1
> > -        add             t0, t0, t2
> > +        li              \regtype\()1, 6
> > +        mul             \regtype\()0, \regtype\()0, \regtype\()1
> > +        add             \regtype\()0, \regtype\()0, \regtype\()2
> >          .irp n 1,2,3,4
> > -        lb              t\n, \n(t0)
> > +        lb              \regtype\n, \n(\regtype\()0)
> >          .endr
> >  .ifc \size,6
> > -        lb              t5, 5(t0)
> > -        lb              t0, (t0)
> > +        lb              \regtype\()5, 5(\regtype\()0)
> > +        lb              \regtype\()0, (\regtype\()0)
> >  .endif
> >  .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> >  .ifc \type,v
> >          mv              a5, a3
> >  .else
> > @@ -189,24 +189,35 @@ endconst
> >          sub             t6, a2, a5
> >          add             a7, a2, a5
> >
> > +.if \from_mem
> >          vle8.v          v24, (a2)
> >          vle8.v          v22, (t6)
> >          vle8.v          v26, (a7)
> >          add             a7, a7, a5
> >          vle8.v          v28, (a7)
> > -        vwmulu.vx       v16, v24, t2
> > -        vwmulu.vx       v20, v26, t3
> > +        vwmulu.vx       v16, v24, \regtype\()2
> > +        vwmulu.vx       v20, v26, \regtype\()3
> >  .ifc \size,6
> >          sub             t6, t6, a5
> >          add             a7, a7, a5
> >          vle8.v          v24, (t6)
> >          vle8.v          v26, (a7)
> > -        vwmaccu.vx      v16, t0, v24
> > -        vwmaccu.vx      v16, t5, v26
> > +        vwmaccu.vx      v16, \regtype\()0, v24
> > +        vwmaccu.vx      v16, \regtype\()5, v26
> > +.endif
> > +        vwmaccsu.vx     v16, \regtype\()1, v22
> > +        vwmaccsu.vx     v16, \regtype\()4, v28
> > +.else
> > +        vwmulu.vx       v16, v4, \regtype\()2
> > +        vwmulu.vx       v20, v6, \regtype\()3
> > +        .ifc \size,6
> > +        vwmaccu.vx      v16, \regtype\()0, v0
> > +        vwmaccu.vx      v16, \regtype\()5, v10
> > +        .endif
> > +        vwmaccsu.vx     v16, \regtype\()1, v2
> > +        vwmaccsu.vx     v16, \regtype\()4, v8
> >  .endif
> >          li              t6, 64
> > -        vwmaccsu.vx     v16, t1, v22
> > -        vwmaccsu.vx     v16, t4, v28
> >          vwadd.wx        v16, v16, t6
> >          vsetvlstatic16  \len
> >          vwadd.vv        v24, v16, v20
> > @@ -216,18 +227,18 @@ endconst
> >          vnclipu.wi      \dst, v24, 0
> >  .endm
> >
> > -.macro epel_load_inc dst len size type
> > -        epel_load       \dst \len \size \type
> > +.macro epel_load_inc dst len size type from_mem regtype
> > +        epel_load       \dst \len \size \type \from_mem \regtype
> >          add             a2, a2, a3
> >  .endm
> >
> >  .macro epel len size type
> >  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> > -        epel_filter     \size \type
> > +        epel_filter     \size \type t
> >          vsetvlstatic8   \len
> >  1:
> >          addi            a4, a4, -1
> > -        epel_load_inc   v30 \len \size \type
> > +        epel_load_inc   v30 \len \size \type 1 t
> >          vse8.v          v30, (a0)
> >          add             a0, a0, a1
> >          bnez            a4, 1b
> > @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro epel_hv len hsize vsize
> > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
> > +        addi            sp, sp, -48
> > +        .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
>
> This code actually requires ==, not >=.
>
> > +        sd              s\n, \n\()<<3(sp)
> > +#else
> > +        sw              s\n, \n\()<<3(sp)
>
> You can do that but you only need half the stack space and offsets.
>
> (And that's why I avoid S and FS registers like the plague, but sometimes
> you
> just can't.)
>
> > +#endif
> > +        .endr
> > +        sub             a2, a2, a3
> > +        epel_filter     \hsize h t
> > +        epel_filter     \vsize v s
> > +        vsetvlstatic8   \len
> > +.if \hsize == 6 || \vsize == 6
> > +        sub             a2, a2, a3
> > +        epel_load_inc   v0 \len \hsize h 1 t
> > +.endif
> > +        epel_load_inc   v2 \len \hsize h 1 t
> > +        epel_load_inc   v4 \len \hsize h 1 t
> > +        epel_load_inc   v6 \len \hsize h 1 t
> > +        epel_load_inc   v8 \len \hsize h 1 t
> > +.if \hsize == 6 || \vsize == 6
> > +        epel_load_inc   v10 \len \hsize h 1 t
> > +.endif
> > +        addi            a4, a4, -1
> > +1:
> > +        addi            a4, a4, -1
> > +        epel_load       v30 \len \vsize v 0 s
> > +        vse8.v          v30, (a0)
> > +.if \hsize == 6 || \vsize == 6
> > +        vmv.v.v         v0, v2
> > +.endif
> > +        vmv.v.v         v2, v4
> > +        vmv.v.v         v4, v6
> > +        vmv.v.v         v6, v8
> > +.if \hsize == 6 || \vsize == 6
> > +        vmv.v.v         v8, v10
> > +        epel_load_inc   v10 \len \hsize h 1 t
> > +.else
> > +        epel_load_inc   v8 \len 4 h 1 t
> > +.endif
> > +        add             a0, a0, a1
> > +        bnez            a4, 1b
> > +        epel_load       v30 \len \vsize v 0 s
> > +        vse8.v          v30, (a0)
> > +
> > +        .irp n 0,1,2,3,4,5
> > +#if __riscv_xlen >= 64
> > +        ld              s\n, \n\()<<3(sp)
> > +#else
> > +        lw              s\n, \n\()<<3(sp)
> > +#endif
> > +        .endr
> > +        addi            sp, sp, 48
> > +
> > +        ret
> > +endfunc
> > +.endm
> > +
> >  .irp len 16,8,4
> >  put_vp8_bilin_h_v \len h a5
> >  put_vp8_bilin_h_v \len v a6
> > @@ -244,4 +315,8 @@ epel \len 6 h
> >  epel \len 4 h
> >  epel \len 6 v
> >  epel \len 4 v
> > +epel_hv \len 6 6
> > +epel_hv \len 4 4
> > +epel_hv \len 6 4
> > +epel_hv \len 4 6
> >  .endr
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index dc3e087f01..463c8fa0a2 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -97,6 +97,19 @@  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
         c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
         c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
         c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
+
+        c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
+        c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
+        c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
     }
 #endif
 #endif
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index bf268e4d8d..baa8152830 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -161,26 +161,26 @@  const subpel_filters
         .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel_filter size type
-        lla             t2, subpel_filters
+.macro epel_filter size type regtype
+        lla             \regtype\()2, subpel_filters
 .ifc \type,v
-        addi            t0, a6, -1
+        addi            \regtype\()0, a6, -1
 .elseif \type == h
-        addi            t0, a5, -1
+        addi            \regtype\()0, a5, -1
 .endif
-        li              t1, 6
-        mul             t0, t0, t1
-        add             t0, t0, t2
+        li              \regtype\()1, 6
+        mul             \regtype\()0, \regtype\()0, \regtype\()1
+        add             \regtype\()0, \regtype\()0, \regtype\()2
         .irp n 1,2,3,4
-        lb              t\n, \n(t0)
+        lb              \regtype\n, \n(\regtype\()0)
         .endr
 .ifc \size,6
-        lb              t5, 5(t0)
-        lb              t0, (t0)
+        lb              \regtype\()5, 5(\regtype\()0)
+        lb              \regtype\()0, (\regtype\()0)
 .endif
 .endm
 
-.macro epel_load dst len size type
+.macro epel_load dst len size type from_mem regtype
 .ifc \type,v
         mv              a5, a3
 .else
@@ -189,24 +189,35 @@  endconst
         sub             t6, a2, a5
         add             a7, a2, a5
 
+.if \from_mem
         vle8.v          v24, (a2)
         vle8.v          v22, (t6)
         vle8.v          v26, (a7)
         add             a7, a7, a5
         vle8.v          v28, (a7)
-        vwmulu.vx       v16, v24, t2
-        vwmulu.vx       v20, v26, t3
+        vwmulu.vx       v16, v24, \regtype\()2
+        vwmulu.vx       v20, v26, \regtype\()3
 .ifc \size,6
         sub             t6, t6, a5
         add             a7, a7, a5
         vle8.v          v24, (t6)
         vle8.v          v26, (a7)
-        vwmaccu.vx      v16, t0, v24
-        vwmaccu.vx      v16, t5, v26
+        vwmaccu.vx      v16, \regtype\()0, v24
+        vwmaccu.vx      v16, \regtype\()5, v26
+.endif
+        vwmaccsu.vx     v16, \regtype\()1, v22
+        vwmaccsu.vx     v16, \regtype\()4, v28
+.else
+        vwmulu.vx       v16, v4, \regtype\()2
+        vwmulu.vx       v20, v6, \regtype\()3
+        .ifc \size,6
+        vwmaccu.vx      v16, \regtype\()0, v0
+        vwmaccu.vx      v16, \regtype\()5, v10
+        .endif
+        vwmaccsu.vx     v16, \regtype\()1, v2
+        vwmaccsu.vx     v16, \regtype\()4, v8
 .endif
         li              t6, 64
-        vwmaccsu.vx     v16, t1, v22
-        vwmaccsu.vx     v16, t4, v28
         vwadd.wx        v16, v16, t6
         vsetvlstatic16  \len
         vwadd.vv        v24, v16, v20
@@ -216,18 +227,18 @@  endconst
         vnclipu.wi      \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size type
-        epel_load       \dst \len \size \type
+.macro epel_load_inc dst len size type from_mem regtype
+        epel_load       \dst \len \size \type \from_mem \regtype
         add             a2, a2, a3
 .endm
 
 .macro epel len size type
 func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
-        epel_filter     \size \type
+        epel_filter     \size \type t
         vsetvlstatic8   \len
 1:
         addi            a4, a4, -1
-        epel_load_inc   v30 \len \size \type
+        epel_load_inc   v30 \len \size \type 1 t
         vse8.v          v30, (a0)
         add             a0, a0, a1
         bnez            a4, 1b
@@ -236,6 +247,66 @@  func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
 endfunc
 .endm
 
+.macro epel_hv len hsize vsize
+func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x
+        addi            sp, sp, -48
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        sd              s\n, \n\()<<3(sp)
+#else
+        sw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        sub             a2, a2, a3
+        epel_filter     \hsize h t
+        epel_filter     \vsize v s
+        vsetvlstatic8   \len
+.if \hsize == 6 || \vsize == 6
+        sub             a2, a2, a3
+        epel_load_inc   v0 \len \hsize h 1 t
+.endif
+        epel_load_inc   v2 \len \hsize h 1 t
+        epel_load_inc   v4 \len \hsize h 1 t
+        epel_load_inc   v6 \len \hsize h 1 t
+        epel_load_inc   v8 \len \hsize h 1 t
+.if \hsize == 6 || \vsize == 6
+        epel_load_inc   v10 \len \hsize h 1 t
+.endif
+        addi            a4, a4, -1
+1:
+        addi            a4, a4, -1
+        epel_load       v30 \len \vsize v 0 s
+        vse8.v          v30, (a0)
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v0, v2
+.endif
+        vmv.v.v         v2, v4
+        vmv.v.v         v4, v6
+        vmv.v.v         v6, v8
+.if \hsize == 6 || \vsize == 6
+        vmv.v.v         v8, v10
+        epel_load_inc   v10 \len \hsize h 1 t
+.else
+        epel_load_inc   v8 \len 4 h 1 t
+.endif
+        add             a0, a0, a1
+        bnez            a4, 1b
+        epel_load       v30 \len \vsize v 0 s
+        vse8.v          v30, (a0)
+
+        .irp n 0,1,2,3,4,5
+#if __riscv_xlen >= 64
+        ld              s\n, \n\()<<3(sp)
+#else
+        lw              s\n, \n\()<<3(sp)
+#endif
+        .endr
+        addi            sp, sp, 48
+
+        ret
+endfunc
+.endm
+
 .irp len 16,8,4
 put_vp8_bilin_h_v \len h a5
 put_vp8_bilin_h_v \len v a6
@@ -244,4 +315,8 @@  epel \len 6 h
 epel \len 4 h
 epel \len 6 v
 epel \len 4 v
+epel_hv \len 6 6
+epel_hv \len 4 4
+epel_hv \len 6 4
+epel_hv \len 4 6
 .endr

[FFmpeg-devel,v3,6/9] lavc/vp8dsp: R-V V put_epel hv

Checks

Commit Message

Comments

Patch