Message ID | tencent_9DE07314580961CD3C39AA26161543F9C508@qq.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v3,1/9] lavc/vp9dsp: R-V ipred vert | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Le maanantaina 13. toukokuuta 2024, 19.59.21 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp9_tm_4x4_8bpp_c: 116.5 > vp9_tm_4x4_8bpp_rvv_i32: 43.5 > vp9_tm_8x8_8bpp_c: 416.2 > vp9_tm_8x8_8bpp_rvv_i32: 86.0 > vp9_tm_16x16_8bpp_c: 1665.5 > vp9_tm_16x16_8bpp_rvv_i32: 187.2 > vp9_tm_32x32_8bpp_c: 6974.2 > vp9_tm_32x32_8bpp_rvv_i32: 625.7 > --- > libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ > libavcodec/riscv/vp9dsp.h | 8 ++ > libavcodec/riscv/vp9dsp_init.c | 4 + > 3 files changed, 153 insertions(+) > > diff --git a/libavcodec/riscv/vp9_intra_rvv.S > b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 > --- a/libavcodec/riscv/vp9_intra_rvv.S > +++ b/libavcodec/riscv/vp9_intra_rvv.S > @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x > > ret > endfunc > + > +.macro tm_sum dst, top, offset > + lbu t3, \offset(a2) > + sub t3, t3, a4 > + vadd.vx \dst, \top, t3 The macro saves some copycat code, but it seems to prevent good scheduling. Consuming t3 right after loading it is not ideal. > +.endm > + > +func ff_tm_32x32_rvv, zve32x > + lbu a4, -1(a3) > + li t5, 32 > + > + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 > + vsetvli zero, t5, e16, m4, ta, ma AFAICT, you do not need to reset the vector configuration every time. 
> + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + > + tm_sum v0, v28, \n1 > + tm_sum v4, v28, \n2 > + tm_sum v8, v28, \n3 > + tm_sum v12, v28, \n4 > + tm_sum v16, v28, \n5 > + tm_sum v20, v28, \n6 > + tm_sum v24, v28, \n7 > + tm_sum v28, v28, \n8 > + > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m2, ta, ma > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + .endm > + > + tm_sum32 31, 30, 29, 28, 27, 26, 25, 24 > + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 > + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 > + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 > + > + ret > +endfunc > + > +func ff_tm_16x16_rvv, zve32x > + vsetivli zero, 16, e16, m2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v30, v8 > + lbu a4, -1(a3) > + > + tm_sum v0, v30, 15 > + tm_sum v2, v30, 14 > + tm_sum v4, v30, 13 > + tm_sum v6, v30, 12 > + tm_sum v8, v30, 11 > + tm_sum v10, v30, 10 > + tm_sum v12, v30, 9 > + tm_sum v14, v30, 8 > + tm_sum v16, v30, 7 > + tm_sum v18, v30, 6 > + tm_sum v20, v30, 5 > + tm_sum v22, v30, 4 > + tm_sum v24, v30, 3 > + tm_sum v26, v30, 2 > + tm_sum v28, v30, 1 > + tm_sum v30, v30, 0 > + > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, m1, ta, ma > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v30, v30, 0 > + vse8.v v30, (a0) > + > + ret > +endfunc > + > +func ff_tm_8x8_rvv, zve32x > + vsetivli zero, 8, e16, m1, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum v16, v28, 7 > + tm_sum v17, v28, 6 > + tm_sum v18, v28, 5 > + tm_sum v19, v28, 4 > + tm_sum v20, v28, 3 > + tm_sum v21, v28, 2 > + tm_sum v22, v28, 1 > + tm_sum v23, v28, 0 > + > + .irp n 16, 17, 18, 19, 20, 21, 22, 23 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, 
mf2, ta, ma > + .irp n 16, 17, 18, 19, 20, 21, 22 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v23, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > + > +func ff_tm_4x4_rvv, zve32x > + vsetivli zero, 4, e16, mf2, ta, ma > + vle8.v v8, (a3) > + vzext.vf2 v28, v8 > + lbu a4, -1(a3) > + > + tm_sum v16, v28, 3 > + tm_sum v17, v28, 2 > + tm_sum v18, v28, 1 > + tm_sum v19, v28, 0 > + > + .irp n 16, 17, 18, 19 > + vmax.vx v\n, v\n, zero > + .endr > + > + vsetvli zero, zero, e8, mf4, ta, ma > + .irp n 16, 17, 18 > + vnclipu.wi v\n, v\n, 0 > + vse8.v v\n, (a0) > + add a0, a0, a1 > + .endr > + vnclipu.wi v24, v19, 0 > + vse8.v v24, (a0) > + > + ret > +endfunc > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > index 0ad961c7e0..79330b4968 100644 > --- a/libavcodec/riscv/vp9dsp.h > +++ b/libavcodec/riscv/vp9dsp.h > @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const > uint8_t *l, const uint8_t *a); > void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > const uint8_t *a); > +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > + const uint8_t *a); > > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c > b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 > --- a/libavcodec/riscv/vp9dsp_init.c > +++ b/libavcodec/riscv/vp9dsp_init.c > @@ -89,6 +89,10 @@ static av_cold void > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; > dsp->intra_pred[TX_16X16][HOR_PRED] 
= ff_h_16x16_rvv; > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; > + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; > + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; > + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; > + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; > } > #endif > #endif
Why is it unnecessary to reset the vector configuration every time? I think it is necessary to reset e16/e8 each time. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月15日周三 01:46写道: > Le maanantaina 13. toukokuuta 2024, 19.59.21 EEST uk7b@foxmail.com a > écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp9_tm_4x4_8bpp_c: 116.5 > > vp9_tm_4x4_8bpp_rvv_i32: 43.5 > > vp9_tm_8x8_8bpp_c: 416.2 > > vp9_tm_8x8_8bpp_rvv_i32: 86.0 > > vp9_tm_16x16_8bpp_c: 1665.5 > > vp9_tm_16x16_8bpp_rvv_i32: 187.2 > > vp9_tm_32x32_8bpp_c: 6974.2 > > vp9_tm_32x32_8bpp_rvv_i32: 625.7 > > --- > > libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ > > libavcodec/riscv/vp9dsp.h | 8 ++ > > libavcodec/riscv/vp9dsp_init.c | 4 + > > 3 files changed, 153 insertions(+) > > > > diff --git a/libavcodec/riscv/vp9_intra_rvv.S > > b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 > > --- a/libavcodec/riscv/vp9_intra_rvv.S > > +++ b/libavcodec/riscv/vp9_intra_rvv.S > > @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x > > > > ret > > endfunc > > + > > +.macro tm_sum dst, top, offset > > + lbu t3, \offset(a2) > > + sub t3, t3, a4 > > + vadd.vx \dst, \top, t3 > > The macro saves some copycat code, but it seems to prevent good > scheduling. > Consuming t3 right after loading it is not ideal. > > > +.endm > > + > > +func ff_tm_32x32_rvv, zve32x > > + lbu a4, -1(a3) > > + li t5, 32 > > + > > + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 > > + vsetvli zero, t5, e16, m4, ta, ma > > AFAICT, you do not need to reset the vector configuration every time. 
> > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + > > + tm_sum v0, v28, \n1 > > + tm_sum v4, v28, \n2 > > + tm_sum v8, v28, \n3 > > + tm_sum v12, v28, \n4 > > + tm_sum v16, v28, \n5 > > + tm_sum v20, v28, \n6 > > + tm_sum v24, v28, \n7 > > + tm_sum v28, v28, \n8 > > + > > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, m2, ta, ma > > + .irp n 0, 4, 8, 12, 16, 20, 24, 28 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + .endm > > + > > + tm_sum32 31, 30, 29, 28, 27, 26, 25, 24 > > + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 > > + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 > > + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 > > + > > + ret > > +endfunc > > + > > +func ff_tm_16x16_rvv, zve32x > > + vsetivli zero, 16, e16, m2, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v30, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v0, v30, 15 > > + tm_sum v2, v30, 14 > > + tm_sum v4, v30, 13 > > + tm_sum v6, v30, 12 > > + tm_sum v8, v30, 11 > > + tm_sum v10, v30, 10 > > + tm_sum v12, v30, 9 > > + tm_sum v14, v30, 8 > > + tm_sum v16, v30, 7 > > + tm_sum v18, v30, 6 > > + tm_sum v20, v30, 5 > > + tm_sum v22, v30, 4 > > + tm_sum v24, v30, 3 > > + tm_sum v26, v30, 2 > > + tm_sum v28, v30, 1 > > + tm_sum v30, v30, 0 > > + > > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, m1, ta, ma > > + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v30, v30, 0 > > + vse8.v v30, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_tm_8x8_rvv, zve32x > > + vsetivli zero, 8, e16, m1, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v16, v28, 7 > > + tm_sum v17, v28, 6 > > + tm_sum v18, v28, 5 > > + tm_sum v19, v28, 4 > > + tm_sum v20, v28, 3 > > + tm_sum v21, 
v28, 2 > > + tm_sum v22, v28, 1 > > + tm_sum v23, v28, 0 > > + > > + .irp n 16, 17, 18, 19, 20, 21, 22, 23 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, mf2, ta, ma > > + .irp n 16, 17, 18, 19, 20, 21, 22 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v24, v23, 0 > > + vse8.v v24, (a0) > > + > > + ret > > +endfunc > > + > > +func ff_tm_4x4_rvv, zve32x > > + vsetivli zero, 4, e16, mf2, ta, ma > > + vle8.v v8, (a3) > > + vzext.vf2 v28, v8 > > + lbu a4, -1(a3) > > + > > + tm_sum v16, v28, 3 > > + tm_sum v17, v28, 2 > > + tm_sum v18, v28, 1 > > + tm_sum v19, v28, 0 > > + > > + .irp n 16, 17, 18, 19 > > + vmax.vx v\n, v\n, zero > > + .endr > > + > > + vsetvli zero, zero, e8, mf4, ta, ma > > + .irp n 16, 17, 18 > > + vnclipu.wi v\n, v\n, 0 > > + vse8.v v\n, (a0) > > + add a0, a0, a1 > > + .endr > > + vnclipu.wi v24, v19, 0 > > + vse8.v v24, (a0) > > + > > + ret > > +endfunc > > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h > > index 0ad961c7e0..79330b4968 100644 > > --- a/libavcodec/riscv/vp9dsp.h > > +++ b/libavcodec/riscv/vp9dsp.h > > @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, > const > > uint8_t *l, const uint8_t *a); > > void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > const uint8_t *a); > > +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, > > + const uint8_t *a); > > > > #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) > > > \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t > > dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c > > b/libavcodec/riscv/vp9dsp_init.c 
index eab3e9cb0a..184fadbaf7 100644 > > --- a/libavcodec/riscv/vp9dsp_init.c > > +++ b/libavcodec/riscv/vp9dsp_init.c > > @@ -89,6 +89,10 @@ static av_cold void > > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) > > dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; > > dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; > > dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; > > + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; > > + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; > > + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; > > + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; > > } > > #endif > > #endif > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
Le tiistaina 14. toukokuuta 2024, 20.57.17 EEST flow gg a écrit : > Why is it unnecessary to reset the vector configuration every time? I think > it is necessary to reset e16/e8 each time. I misread the placement of .endm. OTOH, it seems that you could just write the tm_sum32 with a single parameter, as the other ones are just relative by constant +/-1.
> The macro saves some copycat code, but it seems to prevent good scheduling. > Consuming t3 right after loading it is not ideal. > OTOH, it seems that you could just write the tm_sum32 with a single parameter, > as the other ones are just relative by constant +/-1. Okay, updated it in the reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月15日周三 02:08写道: > Le tiistaina 14. toukokuuta 2024, 20.57.17 EEST flow gg a écrit : > > Why is it unnecessary to reset the vector configuration every time? I > think > > it is necessary to reset e16/e8 each time. > > I misread the placement of .endm > > OTOH, it seems that you could just write the tm_sum32 with a single > parameter, > as the other ones are just relative by constant +/-1. > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644 --- a/libavcodec/riscv/vp9_intra_rvv.S +++ b/libavcodec/riscv/vp9_intra_rvv.S @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x ret endfunc + +.macro tm_sum dst, top, offset + lbu t3, \offset(a2) + sub t3, t3, a4 + vadd.vx \dst, \top, t3 +.endm + +func ff_tm_32x32_rvv, zve32x + lbu a4, -1(a3) + li t5, 32 + + .macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8 + vsetvli zero, t5, e16, m4, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + + tm_sum v0, v28, \n1 + tm_sum v4, v28, \n2 + tm_sum v8, v28, \n3 + tm_sum v12, v28, \n4 + tm_sum v16, v28, \n5 + tm_sum v20, v28, \n6 + tm_sum v24, v28, \n7 + tm_sum v28, v28, \n8 + + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m2, ta, ma + .irp n 0, 4, 8, 12, 16, 20, 24, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + .endm + + tm_sum32 31, 30, 29, 28, 27, 26, 25, 24 + tm_sum32 23, 22, 21, 20, 19, 18, 17, 16 + tm_sum32 15, 14, 13, 12, 11, 10, 9, 8 + tm_sum32 7, 6, 5, 4, 3, 2, 1, 0 + + ret +endfunc + +func ff_tm_16x16_rvv, zve32x + vsetivli zero, 16, e16, m2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v30, v8 + lbu a4, -1(a3) + + tm_sum v0, v30, 15 + tm_sum v2, v30, 14 + tm_sum v4, v30, 13 + tm_sum v6, v30, 12 + tm_sum v8, v30, 11 + tm_sum v10, v30, 10 + tm_sum v12, v30, 9 + tm_sum v14, v30, 8 + tm_sum v16, v30, 7 + tm_sum v18, v30, 6 + tm_sum v20, v30, 5 + tm_sum v22, v30, 4 + tm_sum v24, v30, 3 + tm_sum v26, v30, 2 + tm_sum v28, v30, 1 + tm_sum v30, v30, 0 + + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, m1, ta, ma + .irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v30, v30, 0 + vse8.v v30, (a0) + + ret +endfunc + +func ff_tm_8x8_rvv, zve32x + vsetivli zero, 8, e16, m1, ta, ma + vle8.v v8, 
(a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum v16, v28, 7 + tm_sum v17, v28, 6 + tm_sum v18, v28, 5 + tm_sum v19, v28, 4 + tm_sum v20, v28, 3 + tm_sum v21, v28, 2 + tm_sum v22, v28, 1 + tm_sum v23, v28, 0 + + .irp n 16, 17, 18, 19, 20, 21, 22, 23 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf2, ta, ma + .irp n 16, 17, 18, 19, 20, 21, 22 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v23, 0 + vse8.v v24, (a0) + + ret +endfunc + +func ff_tm_4x4_rvv, zve32x + vsetivli zero, 4, e16, mf2, ta, ma + vle8.v v8, (a3) + vzext.vf2 v28, v8 + lbu a4, -1(a3) + + tm_sum v16, v28, 3 + tm_sum v17, v28, 2 + tm_sum v18, v28, 1 + tm_sum v19, v28, 0 + + .irp n 16, 17, 18, 19 + vmax.vx v\n, v\n, zero + .endr + + vsetvli zero, zero, e8, mf4, ta, ma + .irp n 16, 17, 18 + vnclipu.wi v\n, v\n, 0 + vse8.v v\n, (a0) + add a0, a0, a1 + .endr + vnclipu.wi v24, v19, 0 + vse8.v v24, (a0) + + ret +endfunc diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h index 0ad961c7e0..79330b4968 100644 --- a/libavcodec/riscv/vp9dsp.h +++ b/libavcodec/riscv/vp9dsp.h @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a); +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); +void ff_tm_4x4_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, + const uint8_t *a); #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride, \ diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c index eab3e9cb0a..184fadbaf7 100644 --- a/libavcodec/riscv/vp9dsp_init.c +++ b/libavcodec/riscv/vp9dsp_init.c 
@@ -89,6 +89,10 @@ static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp) dsp->intra_pred[TX_32X32][HOR_PRED] = ff_h_32x32_rvv; dsp->intra_pred[TX_16X16][HOR_PRED] = ff_h_16x16_rvv; dsp->intra_pred[TX_8X8][HOR_PRED] = ff_h_8x8_rvv; + dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_tm_32x32_rvv; + dsp->intra_pred[TX_16X16][TM_VP8_PRED] = ff_tm_16x16_rvv; + dsp->intra_pred[TX_8X8][TM_VP8_PRED] = ff_tm_8x8_rvv; + dsp->intra_pred[TX_4X4][TM_VP8_PRED] = ff_tm_4x4_rvv; } #endif #endif
From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp9_tm_4x4_8bpp_c: 116.5 vp9_tm_4x4_8bpp_rvv_i32: 43.5 vp9_tm_8x8_8bpp_c: 416.2 vp9_tm_8x8_8bpp_rvv_i32: 86.0 vp9_tm_16x16_8bpp_c: 1665.5 vp9_tm_16x16_8bpp_rvv_i32: 187.2 vp9_tm_32x32_8bpp_c: 6974.2 vp9_tm_32x32_8bpp_rvv_i32: 625.7 --- libavcodec/riscv/vp9_intra_rvv.S | 141 +++++++++++++++++++++++++++++++ libavcodec/riscv/vp9dsp.h | 8 ++ libavcodec/riscv/vp9dsp_init.c | 4 + 3 files changed, 153 insertions(+)