
[FFmpeg-devel] lavc/vc1dsp: R-V V mspel_pixels

Message ID tencent_A53C3DAA76164C43BAC51C73A7580E610508@qq.com
State New
Series [FFmpeg-devel] lavc/vc1dsp: R-V V mspel_pixels

Commit Message

uk7b@foxmail.com May 4, 2024, 10:01 a.m. UTC
From: sunyuechi <sunyuechi@iscas.ac.cn>

vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
---
 libavcodec/riscv/vc1dsp_init.c |  8 +++++
 libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

Comments

flow gg May 4, 2024, 10:08 a.m. UTC | #1
Hi, it's me. I accidentally sent it twice, but it seems to be correct.

<uk7b@foxmail.com> wrote on Saturday, May 4, 2024 at 18:01:

> From: sunyuechi <sunyuechi@iscas.ac.cn>
>
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +++++
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+)
>
> diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
>
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>          if (flags & AV_CPU_FLAG_RVV_I64) {
>              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>          }
>      }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          vsse32.v      v0, (a0), a1
>          ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vse8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vse8.v        v31, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v8, (a1), a2
> +        vsse64.v      v8, (a0), a2
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +        csrwi         vxrm, 0
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        li            t0, 128
> +
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +        vle8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vle8.v        v15, (a0)
> +        vsetvli       zero, t0, e8, m8, ta, ma
> +        vaaddu.vv     v0, v0, v16
> +        vaaddu.vv     v8, v8, v24
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +        vse8.v        v\n, (a0)
> +        sub           a0, a0, a2
> +        .endr
> +        vse8.v        v0, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +        csrwi         vxrm, 0
> +        li            t0, 64
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v16, (a1), a2
> +        vlse64.v      v8, (a0), a2
> +        vsetvli       zero, t0, e8, m4, ta, ma
> +        vaaddu.vv     v16, v16, v8
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsse64.v      v16, (a0), a2
> +
> +        ret
> +endfunc
> --
> 2.45.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 4, 2024, 5:53 p.m. UTC | #2
On Saturday, May 4, 2024 at 13:01:05 EEST, uk7b@foxmail.com wrote:
> From: sunyuechi <sunyuechi@iscas.ac.cn>
> 
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +++++
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
>  2 files changed, 74 insertions(+)
> 
> diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> 
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>          if (flags & AV_CPU_FLAG_RVV_I64) {
>              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>          }
>      }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          vsse32.v      v0, (a0), a1
>          ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)

Is it not faster to compute the address ahead of time, e.g.:

add t1, a2, a1
vle8.v vN, (a1)
sh1add a1, a2, a1
vle8.v vN+1, (t1)

...and so on? Even on a reordering core, you can't eliminate a stall on a data
dependency if there is nothing else to be done.

(Ditto below and in other patches.)
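
Spelled out for the first rows, that pattern could look like this (an untested
sketch; sh1add comes from the Zba extension, which would need to be required
alongside the vector extension):

add    t1, a1, a2      # t1 -> row 1 while a1 -> row 0
vle8.v v16, (a1)
sh1add a1, a2, a1      # a1 += 2 * stride -> row 2
vle8.v v17, (t1)
sh1add t1, a2, t1      # t1 += 2 * stride -> row 3
vle8.v v18, (a1)
sh1add a1, a2, a1      # a1 -> row 4
vle8.v v19, (t1)
# ...and so on up to v31, with the stores interleaved the same way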

> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vse8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vse8.v        v31, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vlse64.v      v8, (a1), a2
> +        vsse64.v      v8, (a0), a2

Copying 64-bit quantities should not need RVV at all. Maybe the C version 
needs to be improved instead, but if that is not possible, then an RVI version 
may be more portable and work just as well.
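
For reference, a minimal RV64I sketch of such a copy (untested; it assumes
XLEN >= 64 and the same 8-byte alignment that the vlse64/vsse64 pair above
already relies on):

func ff_put_pixels8x8_rvi
        .rept 8
        ld            t0, (a1)        # copy one 8-byte row
        sd            t0, (a0)
        add           a1, a1, a2
        add           a0, a0, a2
        .endr
        ret
endfunc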

> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +        csrwi         vxrm, 0
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        li            t0, 128
> +
> +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +        vle8.v        v\n, (a1)
> +        add           a1, a1, a2
> +        .endr
> +        vle8.v        v31, (a1)
> +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +        vle8.v        v\n, (a0)
> +        add           a0, a0, a2
> +        .endr
> +        vle8.v        v15, (a0)
> +        vsetvli       zero, t0, e8, m8, ta, ma
> +        vaaddu.vv     v0, v0, v16
> +        vaaddu.vv     v8, v8, v24
> +        vsetivli      zero, 16, e8, m1, ta, ma
> +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +        vse8.v        v\n, (a0)
> +        sub           a0, a0, a2
> +        .endr
> +        vse8.v        v0, (a0)
> +
> +        ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +        csrwi         vxrm, 0
> +        li            t0, 64
> +        vsetivli      zero, 8, e8, mf2, ta, ma

Does MF2 actually improve perfs over M1 here?

> +        vlse64.v      v16, (a1), a2
> +        vlse64.v      v8, (a0), a2
> +        vsetvli       zero, t0, e8, m4, ta, ma
> +        vaaddu.vv     v16, v16, v8
> +        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsse64.v      v16, (a0), a2
> +
> +        ret
> +endfunc
flow gg May 5, 2024, 9:18 a.m. UTC | #3
> Is it not faster to compute the address ahead of time, e.g.:
> Ditto below and in other patches.

Yes, updated here, and I will check the other patches.

> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version may be more portable and work just as well.

The C version shares the same logic with other places, which might make it
difficult to modify. I've updated this to use RVI.

> Does MF2 actually improve perfs over M1 here?

The difference here seems very small, but in the cases where both mf2 and m1
are correct, the tests have only ever shown mf2 to be faster, so I want to
use mf2.

Rémi Denis-Courmont <remi@remlab.net> wrote on Sunday, May 5, 2024 at 01:53:

> On Saturday, May 4, 2024 at 13:01:05 EEST, uk7b@foxmail.com wrote:
> > From: sunyuechi <sunyuechi@iscas.ac.cn>
> >
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> > ---
> >  libavcodec/riscv/vc1dsp_init.c |  8 +++++
> >  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++++++++++++++++++++++++++++++++++
> >  2 files changed, 74 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..610c43a1a3 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
> >  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> >  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> >  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
> > +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> > +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> > +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> > +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >      if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >          dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >          dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> > +        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >          if (flags & AV_CPU_FLAG_RVV_I64) {
> >              dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >              dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> > +            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >          }
> >      }
> >  #endif
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> > index 4a00945ead..48244f91aa 100644
> > --- a/libavcodec/riscv/vc1dsp_rvv.S
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
> >          vsse32.v      v0, (a0), a1
> >          ret
> >  endfunc
> > +
> > +func ff_put_pixels16x16_rvv, zve32x
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> > +        vle8.v        v\n, (a1)
> > +        add           a1, a1, a2
> > +        .endr
> > +        vle8.v        v31, (a1)
>
> Is it not faster to compute the address ahead of time, e.g.:
>
> add t1, a2, a1
> vle8.v vN, (a1)
> sh1add a1, a2, a1
> vle8.v vN+1, (t1)
>
> ...and so on? Even on a reordering core, you can't eliminate a stall on a
> data dependency if there is nothing else to be done.
>
> (Ditto below and in other patches.)
>
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> > +        vse8.v        v\n, (a0)
> > +        add           a0, a0, a2
> > +        .endr
> > +        vse8.v        v31, (a0)
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_put_pixels8x8_rvv, zve64x
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
> > +        vlse64.v      v8, (a1), a2
> > +        vsse64.v      v8, (a0), a2
>
> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version
> may be more portable and work just as well.
>
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_avg_pixels16x16_rvv, zve32x
> > +        csrwi         vxrm, 0
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        li            t0, 128
> > +
> > +        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> > +        vle8.v        v\n, (a1)
> > +        add           a1, a1, a2
> > +        .endr
> > +        vle8.v        v31, (a1)
> > +        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> > +        vle8.v        v\n, (a0)
> > +        add           a0, a0, a2
> > +        .endr
> > +        vle8.v        v15, (a0)
> > +        vsetvli       zero, t0, e8, m8, ta, ma
> > +        vaaddu.vv     v0, v0, v16
> > +        vaaddu.vv     v8, v8, v24
> > +        vsetivli      zero, 16, e8, m1, ta, ma
> > +        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> > +        vse8.v        v\n, (a0)
> > +        sub           a0, a0, a2
> > +        .endr
> > +        vse8.v        v0, (a0)
> > +
> > +        ret
> > +endfunc
> > +
> > +func ff_avg_pixels8x8_rvv, zve64x
> > +        csrwi         vxrm, 0
> > +        li            t0, 64
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
>
> Does MF2 actually improve perfs over M1 here?
>
> > +        vlse64.v      v16, (a1), a2
> > +        vlse64.v      v8, (a0), a2
> > +        vsetvli       zero, t0, e8, m4, ta, ma
> > +        vaaddu.vv     v16, v16, v8
> > +        vsetivli      zero, 8, e8, mf2, ta, ma
> > +        vsse64.v      v16, (a0), a2
> > +
> > +        ret
> > +endfunc
>
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 5, 2024, 7:26 p.m. UTC | #4
On Sunday, May 5, 2024 at 12:18:56 EEST, flow gg wrote:
> > Does MF2 actually improve perfs over M1 here?
> 
> The difference here seems very small, but in the cases where both mf2 and
> m1 are correct, the tests have only ever shown mf2 to be faster, so I want
> to use mf2.

I can live with that. But this is a slippery slope, because larger vector sizes
would involve even smaller fractions. Then we would need to compute the value
at run time, which might negate the performance gains from fractional
multipliers.

The fastest approach that I can think of is a symbolic LA (which expands to
1xAUIPC + 1xload) of a precomputed VTYPE value from a static variable.
Furthermore, this requires VSETVL, which precludes an immediate constant VL;
indeed, no VSETIVL instruction exists.
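
Concretely, a hedged sketch of that approach, where ff_vc1_vtype is a
hypothetical variable computed once at init time:

lw       t0, ff_vc1_vtype     # pseudo-instruction: 1 x AUIPC + 1 x LW
li       t1, 8                # VL must go through a register, too...
vsetvl   zero, t1, t0         # ...as VSETVL takes both VL and VTYPE from registers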

AFAIU, BananaPi F3 has 256-bit vectors already now.
flow gg May 10, 2024, 8:22 a.m. UTC | #5
Hi, I got a BananaPi F3, made some fixes, and posted the update in a reply.

Rémi Denis-Courmont <remi@remlab.net> wrote on Monday, May 6, 2024 at 03:26:

> On Sunday, May 5, 2024 at 12:18:56 EEST, flow gg wrote:
> > > Does MF2 actually improve perfs over M1 here?
> >
> > The difference here seems very small, but in the cases where both mf2
> > and m1 are correct, the tests have only ever shown mf2 to be faster, so
> > I want to use mf2.
>
> I can live with that. But this is a slippery slope, because larger vector
> sizes would involve even smaller fractions. Then we would need to compute
> the value at run time, which might negate the performance gains from
> fractional multipliers.
>
> The fastest approach that I can think of is a symbolic LA (which expands
> to 1xAUIPC + 1xload) of a precomputed VTYPE value from a static variable.
> Furthermore, this requires VSETVL, which precludes an immediate constant
> VL; indeed, no VSETIVL instruction exists.
>
> AFAIU, BananaPi F3 has 256-bit vectors already now.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 10, 2024, 3:34 p.m. UTC | #6
On Friday, May 10, 2024 at 11:22:53 EEST, flow gg wrote:
> Hi, I got a BananaPi F3, made some fixes, and posted the update in a reply.

So... does it benefit from halving the logical multiplier to process
fixed-size blocks as compared to the C908, or can we stick to the same code
regardless of vector size?

Also beware that K60 cores have in-order pipelines, so data dependencies will 
probably hurt more than on C908.
flow gg May 11, 2024, 10:02 a.m. UTC | #7
The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
vc1,
or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
results in a 10-20% performance decrease on both k230 and banana_f3.

I think we should just continue using it as is...

Rémi Denis-Courmont <remi@remlab.net> wrote on Friday, May 10, 2024 at 23:34:

> On Friday, May 10, 2024 at 11:22:53 EEST, flow gg wrote:
> > Hi, I got a BananaPi F3, made some fixes, and posted the update in a reply.
>
> So... does it benefit from halving the logical multiplier to process
> fixed-size blocks as compared to the C908, or can we stick to the same
> code regardless of vector size?
>
> Also beware that K60 cores have in-order pipelines, so data dependencies
> will
> probably hurt more than on C908.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Rémi Denis-Courmont May 11, 2024, 10:24 a.m. UTC | #8
On Saturday, May 11, 2024 at 13:02:02 EEST, flow gg wrote:
> The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
> vc1,
> or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
> results in a 10-20% performance decrease on both k230 and banana_f3.

The questions remain: how does changing from MF2 to MF4 affect performance on
Zvl256b, and if it does, how do we deal with that without breaking support for
Zvl128b?
flow gg May 11, 2024, 10:47 a.m. UTC | #9
On the banana_f3, further reducing the fractional multiplier resulted in
another performance improvement. I think in the end we might need to select
different functions depending on VLEN in init...
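
A rough sketch of such VLEN-based selection in ff_vc1dsp_init_riscv (the
*_rvv_256 names are hypothetical, just to illustrate the shape):

        if (flags & AV_CPU_FLAG_RVV_I64) {
            if (ff_get_rv_vlenb() >= 32) { /* VLEN >= 256 bits */
                /* variant tuned with a smaller fractional LMUL */
                dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv_256;
            } else {                       /* VLEN == 128 bits */
                dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
            }
        }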

Rémi Denis-Courmont <remi@remlab.net> wrote on Saturday, May 11, 2024 at 18:24:

> On Saturday, May 11, 2024 at 13:02:02 EEST, flow gg wrote:
> > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
> > vc1,
> > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
> > results in a 10-20% performance decrease on both k230 and banana_f3.
>
> The questions remain: how does changing from MF2 to MF4 affect performance
> on Zvl256b, and if it does, how do we deal with that without breaking
> support for Zvl128b?
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>

Patch

diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
index e47b644f80..610c43a1a3 100644
--- a/libavcodec/riscv/vc1dsp_init.c
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -29,6 +29,10 @@  void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block
 void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
+void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
 
 av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
 {
@@ -38,9 +42,13 @@  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
     if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
         dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
         dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+        dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
+        dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
         if (flags & AV_CPU_FLAG_RVV_I64) {
             dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
             dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+            dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
+            dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
         }
     }
 #endif
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4a00945ead..48244f91aa 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -111,3 +111,69 @@  func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vsse32.v      v0, (a0), a1
         ret
 endfunc
+
+func ff_put_pixels16x16_rvv, zve32x
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vse8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vse8.v        v31, (a0)
+
+        ret
+endfunc
+
+func ff_put_pixels8x8_rvv, zve64x
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v8, (a1), a2
+        vsse64.v      v8, (a0), a2
+
+        ret
+endfunc
+
+func ff_avg_pixels16x16_rvv, zve32x
+        csrwi         vxrm, 0
+        vsetivli      zero, 16, e8, m1, ta, ma
+        li            t0, 128
+
+        .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+        vle8.v        v\n, (a1)
+        add           a1, a1, a2
+        .endr
+        vle8.v        v31, (a1)
+        .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+        vle8.v        v\n, (a0)
+        add           a0, a0, a2
+        .endr
+        vle8.v        v15, (a0)
+        vsetvli       zero, t0, e8, m8, ta, ma
+        vaaddu.vv     v0, v0, v16
+        vaaddu.vv     v8, v8, v24
+        vsetivli      zero, 16, e8, m1, ta, ma
+        .irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        vse8.v        v\n, (a0)
+        sub           a0, a0, a2
+        .endr
+        vse8.v        v0, (a0)
+
+        ret
+endfunc
+
+func ff_avg_pixels8x8_rvv, zve64x
+        csrwi         vxrm, 0
+        li            t0, 64
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vlse64.v      v16, (a1), a2
+        vlse64.v      v8, (a0), a2
+        vsetvli       zero, t0, e8, m4, ta, ma
+        vaaddu.vv     v16, v16, v8
+        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsse64.v      v16, (a0), a2
+
+        ret
+endfunc