Message ID | tencent_A53C3DAA76164C43BAC51C73A7580E610508@qq.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel] lavc/vc1dsp: R-V V mspel_pixels | expand |
Hi, it's me. I accidentally repeated it but it seems to be correct. <uk7b@foxmail.com> 于2024年5月4日周六 18:01写道: > From: sunyuechi <sunyuechi@iscas.ac.cn> > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > --- > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > 2 files changed, 74 insertions(+) > > diff --git a/libavcodec/riscv/vc1dsp_init.c > b/libavcodec/riscv/vc1dsp_init.c > index e47b644f80..610c43a1a3 100644 > --- a/libavcodec/riscv/vc1dsp_init.c > +++ b/libavcodec/riscv/vc1dsp_init.c > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > ptrdiff_t stride, int16_t *block > void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > { > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > + 
dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > if (flags & AV_CPU_FLAG_RVV_I64) { > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > } > } > #endif > diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S > index 4a00945ead..48244f91aa 100644 > --- a/libavcodec/riscv/vc1dsp_rvv.S > +++ b/libavcodec/riscv/vc1dsp_rvv.S > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > vsse32.v v0, (a0), a1 > ret > endfunc > + > +func ff_put_pixels16x16_rvv, zve32x > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vse8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vse8.v v31, (a0) > + > + ret > +endfunc > + > +func ff_put_pixels8x8_rvv, zve64x > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v8, (a1), a2 > + vsse64.v v8, (a0), a2 > + > + ret > +endfunc > + > +func ff_avg_pixels16x16_rvv, zve32x > + csrwi vxrm, 0 > + vsetivli zero, 16, e8, m1, ta, ma > + li t0, 128 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > + vle8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vle8.v v15, (a0) > + vsetvli zero, t0, e8, m8, ta, ma > + vaaddu.vv v0, v0, v16 > + vaaddu.vv v8, v8, v24 > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > + vse8.v v\n, (a0) > + sub a0, a0, a2 > + .endr > + vse8.v v0, (a0) > + > + ret > +endfunc > + > +func ff_avg_pixels8x8_rvv, zve64x > + csrwi vxrm, 0 > + li 
t0, 64 > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v16, (a1), a2 > + vlse64.v v8, (a0), a2 > + vsetvli zero, t0, e8, m4, ta, ma > + vaaddu.vv v16, v16, v8 > + vsetivli zero, 8, e8, mf2, ta, ma > + vsse64.v v16, (a0), a2 > + > + ret > +endfunc > -- > 2.45.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > --- > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > 2 files changed, 74 insertions(+) > > diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c > index e47b644f80..610c43a1a3 100644 > --- a/libavcodec/riscv/vc1dsp_init.c > +++ b/libavcodec/riscv/vc1dsp_init.c > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, > ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t > *dest, ptrdiff_t stride, int16_t *block); void > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, > const uint8_t *src, ptrdiff_t line_size, int rnd); +void > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t > *src, ptrdiff_t line_size, int rnd); > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > { > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[0][0] = 
ff_put_pixels16x16_rvv; > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > if (flags & AV_CPU_FLAG_RVV_I64) { > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > } > } > #endif > diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S > index 4a00945ead..48244f91aa 100644 > --- a/libavcodec/riscv/vc1dsp_rvv.S > +++ b/libavcodec/riscv/vc1dsp_rvv.S > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > vsse32.v v0, (a0), a1 > ret > endfunc > + > +func ff_put_pixels16x16_rvv, zve32x > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) Is it not faster to compute the address ahead of time, e.g.: add t1, a2, a1 vle8.v vN, (a1) sh1add a1, a2, a1 vle8.v vN+1, (t1) ...and so on? Even on a reordering core, you can't eliminate stall on data dependency if there is nothing else to be done. (Ditto below and in other patches.) > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vse8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vse8.v v31, (a0) > + > + ret > +endfunc > + > +func ff_put_pixels8x8_rvv, zve64x > + vsetivli zero, 8, e8, mf2, ta, ma > + vlse64.v v8, (a1), a2 > + vsse64.v v8, (a0), a2 Copying 64-bit quantities should not need RVV at all. Maybe the C version needs to be improved instead, but if that is not possible, then an RVI version may be more portable and work just as well. 
> + > + ret > +endfunc > + > +func ff_avg_pixels16x16_rvv, zve32x > + csrwi vxrm, 0 > + vsetivli zero, 16, e8, m1, ta, ma > + li t0, 128 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 > + vle8.v v\n, (a1) > + add a1, a1, a2 > + .endr > + vle8.v v31, (a1) > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > + vle8.v v\n, (a0) > + add a0, a0, a2 > + .endr > + vle8.v v15, (a0) > + vsetvli zero, t0, e8, m8, ta, ma > + vaaddu.vv v0, v0, v16 > + vaaddu.vv v8, v8, v24 > + vsetivli zero, 16, e8, m1, ta, ma > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > + vse8.v v\n, (a0) > + sub a0, a0, a2 > + .endr > + vse8.v v0, (a0) > + > + ret > +endfunc > + > +func ff_avg_pixels8x8_rvv, zve64x > + csrwi vxrm, 0 > + li t0, 64 > + vsetivli zero, 8, e8, mf2, ta, ma Does MF2 actually improve perfs over M1 here? > + vlse64.v v16, (a1), a2 > + vlse64.v v8, (a0), a2 > + vsetvli zero, t0, e8, m4, ta, ma > + vaaddu.vv v16, v16, v8 > + vsetivli zero, 8, e8, mf2, ta, ma > + vsse64.v v16, (a0), a2 > + > + ret > +endfunc
> Is it not faster to compute the address ahead of time, e.g.: > Ditto below and in other patches. Yes, update here and I will check other patches > Copying 64-bit quantities should not need RVV at all. Maybe the C version needs to be improved instead, but if that is not possible, then an RVI version may be more portable and work just as well. The logic in the c version is the same in other places, which might be difficult to modify. I've updated it using rvi. > Does MF2 actually improve perfs over M1 here? The difference here seems very small, but when both mf2 and m1 are correct, the test results have only shown mf2 to be better, so I want to use mf2. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月5日周日 01:53写道: > Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 > > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 > > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 > > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 > > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 > > --- > > libavcodec/riscv/vc1dsp_init.c | 8 +++++ > > libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ > > 2 files changed, 74 insertions(+) > > > > diff --git a/libavcodec/riscv/vc1dsp_init.c > b/libavcodec/riscv/vc1dsp_init.c > > index e47b644f80..610c43a1a3 100644 > > --- a/libavcodec/riscv/vc1dsp_init.c > > +++ b/libavcodec/riscv/vc1dsp_init.c > > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, > ptrdiff_t > > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, > > ptrdiff_t stride, int16_t *block); void > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t > > *dest, ptrdiff_t stride, int16_t *block); void > > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t 
stride, int16_t > > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, > > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, > > const uint8_t *src, ptrdiff_t line_size, int rnd); +void > > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t > > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const > uint8_t > > *src, ptrdiff_t line_size, int rnd); > > > > av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > > { > > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) > > if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { > > dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; > > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; > > + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; > > if (flags & AV_CPU_FLAG_RVV_I64) { > > dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; > > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; > > + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; > > + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; > > } > > } > > #endif > > diff --git a/libavcodec/riscv/vc1dsp_rvv.S > b/libavcodec/riscv/vc1dsp_rvv.S > > index 4a00945ead..48244f91aa 100644 > > --- a/libavcodec/riscv/vc1dsp_rvv.S > > +++ b/libavcodec/riscv/vc1dsp_rvv.S > > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x > > vsse32.v v0, (a0), a1 > > ret > > endfunc > > + > > +func ff_put_pixels16x16_rvv, zve32x > > + vsetivli zero, 16, e8, m1, ta, ma > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vle8.v v\n, (a1) > > + add a1, a1, a2 > > + .endr > > + vle8.v v31, (a1) > > Is it not faster to compute the address ahead of time, e.g.: > > add t1, a2, a1 > vle8.v vN, (a1) > sh1add a1, a2, a1 > vle8.v vN+1, (t1) > > ...and so on? 
Even on a reordering core, you can't eliminate stall on data > dependency if there is nothing else to be done. > > (Ditto below and in other patches.) > > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vse8.v v\n, (a0) > > + add a0, a0, a2 > > + .endr > > + vse8.v v31, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_put_pixels8x8_rvv, zve64x > > + vsetivli zero, 8, e8, mf2, ta, ma > > + vlse64.v v8, (a1), a2 > > + vsse64.v v8, (a0), a2 > > Copying 64-bit quantities should not need RVV at all. Maybe the C version > needs to be improved instead, but if that is not possible, then an RVI > version > may be more portable and work just as well. > > > + > > + ret > > +endfunc > > + > > +func ff_avg_pixels16x16_rvv, zve32x > > + csrwi vxrm, 0 > > + vsetivli zero, 16, e8, m1, ta, ma > > + li t0, 128 > > + > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30 > > + vle8.v v\n, (a1) > > + add a1, a1, a2 > > + .endr > > + vle8.v v31, (a1) > > + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 > > + vle8.v v\n, (a0) > > + add a0, a0, a2 > > + .endr > > + vle8.v v15, (a0) > > + vsetvli zero, t0, e8, m8, ta, ma > > + vaaddu.vv v0, v0, v16 > > + vaaddu.vv v8, v8, v24 > > + vsetivli zero, 16, e8, m1, ta, ma > > + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 > > + vse8.v v\n, (a0) > > + sub a0, a0, a2 > > + .endr > > + vse8.v v0, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_avg_pixels8x8_rvv, zve64x > > + csrwi vxrm, 0 > > + li t0, 64 > > + vsetivli zero, 8, e8, mf2, ta, ma > > Does MF2 actually improve perfs over M1 here? 
> > > + vlse64.v v16, (a1), a2 > > + vlse64.v v8, (a0), a2 > > + vsetvli zero, t0, e8, m4, ta, ma > > + vaaddu.vv v16, v16, v8 > > + vsetivli zero, 8, e8, mf2, ta, ma > > + vsse64.v v16, (a0), a2 > > + > > + ret > > +endfunc > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit : > > Does MF2 actually improve perfs over M1 here? > > The difference here seems very small, but when both mf2 and m1 are correct, > the test results have only shown mf2 to be better, so I want to use mf2. I can live with that. But this is a slippery slope because large vector sizes would involve even smaller fractions. Then we would need to compute the value which might negate the performance gains from fractional multipliers. The fastest approach that I can think of is a symbolic LA (which expands to 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable. Furthermore, this requires VSETVL, which precludes an immediate constant VL. Indeed, the VSETIVL instruction does not exist. AFAIU, BananaPi F3 has 256-bit vectors already now.
Hi, I got BananaPi F3, made some fixes, updated in reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月6日周一 03:26写道: > Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit : > > > Does MF2 actually improve perfs over M1 here? > > > > The difference here seems very small, but when both mf2 and m1 are > correct, > > the test results have only shown mf2 to be better, so I want to use mf2. > > I can live with that. But this is a slippery slope because large vector > sizes > would involve even smaller fractions. Then we would need to compute the > value > which might negate the performance gains from fractional multipliers. > > The fastest approach that I can think of is a symbolic LA (which expands > to > 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable. > Furthermore, this requires VSETVL, which precludes immediate constant VL > Indeed, the VSETIVL instruction does not exist. > > AFAIU, BananaPi F3 has 256-bit vectors already now. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit :
> Hi, I got BananaPi F3, made some fixes, updated in reply
So... Does it benefit from halving the logical multiplier to process fixed-sized
block as compared to C908, or can we stick to the same code regardless of
vector sizes?
Also beware that K60 cores have in-order pipelines, so data dependencies will
probably hurt more than on C908.
The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in vc1, or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, results in a 10-20% performance decrease on both k230 and banana_f3. I think we should just continue using it as is... Rémi Denis-Courmont <remi@remlab.net> 于2024年5月10日周五 23:34写道: > Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit : > > Hi, I got BananaPi F3, made some fixes, updated in reply > > So... Does it benefit from halving the logical multiplier to process > fixed-sized > block as compared to C908, or can we stick to the same code regardless of > vector sizes? > > Also beware that K60 cores have in-order pipelines, so data dependencies > will > probably hurt more than on C908. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit : > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in > vc1, > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, > results in a 10-20% performance decrease on both k230 and banana_f3. The questions remain: how changing from MF2 to MF4 affects performance on Zvl256b, and if it does, how to deal with that without breaking support for Zvl128b.
In banana_f3, further reducing the value of mf resulted in another performance improvement. I think in the end we might need to use different functions depending on vlen in init. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月11日周六 18:24写道: > Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit : > > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in > > vc1, > > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8, > > results in a 10-20% performance decrease on both k230 and banana_f3. > > The questions remain, how changing from MF2 to MF4 affects performance on > Zvl256b, and if it does, how to deal with that without breaking support > for > Zvl128b. > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c index e47b644f80..610c43a1a3 100644 --- a/libavcodec/riscv/vc1dsp_init.c +++ b/libavcodec/riscv/vc1dsp_init.c @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) { @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp) if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) { dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv; dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv; + dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv; + dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv; if (flags & AV_CPU_FLAG_RVV_I64) { dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv; dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv; + dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv; + dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv; } } #endif diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S index 4a00945ead..48244f91aa 100644 --- a/libavcodec/riscv/vc1dsp_rvv.S +++ b/libavcodec/riscv/vc1dsp_rvv.S @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x vsse32.v v0, (a0), a1 ret endfunc + +func ff_put_pixels16x16_rvv, zve32x + vsetivli zero, 16, e8, m1, ta, ma + .irp n 16, 17, 18, 
19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vle8.v v\n, (a1) + add a1, a1, a2 + .endr + vle8.v v31, (a1) + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vse8.v v\n, (a0) + add a0, a0, a2 + .endr + vse8.v v31, (a0) + + ret +endfunc + +func ff_put_pixels8x8_rvv, zve64x + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v8, (a1), a2 + vsse64.v v8, (a0), a2 + + ret +endfunc + +func ff_avg_pixels16x16_rvv, zve32x + csrwi vxrm, 0 + vsetivli zero, 16, e8, m1, ta, ma + li t0, 128 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + vle8.v v\n, (a1) + add a1, a1, a2 + .endr + vle8.v v31, (a1) + .irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + vle8.v v\n, (a0) + add a0, a0, a2 + .endr + vle8.v v15, (a0) + vsetvli zero, t0, e8, m8, ta, ma + vaaddu.vv v0, v0, v16 + vaaddu.vv v8, v8, v24 + vsetivli zero, 16, e8, m1, ta, ma + .irp n 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 + vse8.v v\n, (a0) + sub a0, a0, a2 + .endr + vse8.v v0, (a0) + + ret +endfunc + +func ff_avg_pixels8x8_rvv, zve64x + csrwi vxrm, 0 + li t0, 64 + vsetivli zero, 8, e8, mf2, ta, ma + vlse64.v v16, (a1), a2 + vlse64.v v8, (a0), a2 + vsetvli zero, t0, e8, m4, ta, ma + vaaddu.vv v16, v16, v8 + vsetivli zero, 8, e8, mf2, ta, ma + vsse64.v v16, (a0), a2 + + ret +endfunc
From: sunyuechi <sunyuechi@iscas.ac.cn> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7 vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5 vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7 vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5 vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7 --- libavcodec/riscv/vc1dsp_init.c | 8 +++++ libavcodec/riscv/vc1dsp_rvv.S | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+)