Message ID | tencent_E0584A61BE3C48AAB01631430EC535733E08@qq.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v4,1/9] lavc/vp8dsp: R-V put_vp8_pixels | expand |

Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Le tiistaina 7. toukokuuta 2024, 19.54.09 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp8_put_epel4_h4v4_c: 20.0 > vp8_put_epel4_h4v4_rvv_i32: 11.0 > vp8_put_epel4_h4v6_c: 25.2 > vp8_put_epel4_h4v6_rvv_i32: 13.5 > vp8_put_epel4_h6v4_c: 22.2 > vp8_put_epel4_h6v4_rvv_i32: 14.5 > vp8_put_epel4_h6v6_c: 29.0 > vp8_put_epel4_h6v6_rvv_i32: 15.7 > vp8_put_epel8_h4v4_c: 73.0 > vp8_put_epel8_h4v4_rvv_i32: 22.2 > vp8_put_epel8_h4v6_c: 90.5 > vp8_put_epel8_h4v6_rvv_i32: 26.7 > vp8_put_epel8_h6v4_c: 85.0 > vp8_put_epel8_h6v4_rvv_i32: 27.2 > vp8_put_epel8_h6v6_c: 104.7 > vp8_put_epel8_h6v6_rvv_i32: 29.5 > vp8_put_epel16_h4v4_c: 145.5 > vp8_put_epel16_h4v4_rvv_i32: 26.5 > vp8_put_epel16_h4v6_c: 190.7 > vp8_put_epel16_h4v6_rvv_i32: 47.5 > vp8_put_epel16_h6v4_c: 173.7 > vp8_put_epel16_h6v4_rvv_i32: 33.2 > vp8_put_epel16_h6v6_c: 222.2 > vp8_put_epel16_h6v6_rvv_i32: 35.5 > --- > libavcodec/riscv/vp8dsp_init.c | 13 ++++ > libavcodec/riscv/vp8dsp_rvv.S | 123 +++++++++++++++++++++++++++------ > 2 files changed, 115 insertions(+), 21 deletions(-) > > diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c > index dc3e087f01..463c8fa0a2 100644 > --- a/libavcodec/riscv/vp8dsp_init.c > +++ b/libavcodec/riscv/vp8dsp_init.c > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) > c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; > c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; > c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; > + > + c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv; > + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; > + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; > + 
c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv; > + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; > + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; > } > #endif > #endif > diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S > index 4d7a9f6a2d..fba72f8c15 100644 > --- a/libavcodec/riscv/vp8dsp_rvv.S > +++ b/libavcodec/riscv/vp8dsp_rvv.S > @@ -161,26 +161,26 @@ const subpel_filters > .byte 0, -1, 12, 123, -6, 0 > endconst > > -.macro epel_filter size type > - lla t2, subpel_filters > +.macro epel_filter size type regtype > + lla \regtype\()2, subpel_filters > .ifc \type,v > - addi t0, a6, -1 > + addi \regtype\()0, a6, -1 > .else > - addi t0, a5, -1 > + addi \regtype\()0, a5, -1 > .endif > - li t1, 6 > - mul t0, t0, t1 > - add t0, t0, t2 > + li \regtype\()1, 6 > + mul \regtype\()0, \regtype\()0, \regtype\()1 > + add \regtype\()0, \regtype\()0, \regtype\()2 > .irp n 1,2,3,4 > - lb t\n, \n(t0) > + lb \regtype\n, \n(\regtype\()0) > .endr > .ifc \size,6 > - lb t5, 5(t0) > - lb t0, (t0) > + lb \regtype\()5, 5(\regtype\()0) > + lb \regtype\()0, (\regtype\()0) > .endif > .endm > > -.macro epel_load dst len size type > +.macro epel_load dst len size type from_mem regtype > .ifc \type,v > mv a5, a3 > .else > @@ -189,24 +189,35 @@ endconst > sub t6, a2, a5 > add a7, a2, a5 > > +.if \from_mem > vle8.v v24, (a2) > vle8.v v22, (t6) > vle8.v v26, (a7) > add a7, a7, a5 > vle8.v v28, (a7) > - vwmulu.vx v16, v24, t2 > - vwmulu.vx v20, v26, t3 > + vwmulu.vx v16, v24, \regtype\()2 > + vwmulu.vx v20, v26, \regtype\()3 > .ifc \size,6 > sub t6, t6, a5 > add a7, a7, a5 > vle8.v v24, (t6) > vle8.v v26, (a7) > - vwmaccu.vx v16, t0, v24 > - vwmaccu.vx v16, t5, v26 > + vwmaccu.vx v16, \regtype\()0, v24 > + 
vwmaccu.vx v16, \regtype\()5, v26 > +.endif > + vwmaccsu.vx v16, \regtype\()1, v22 > + vwmaccsu.vx v16, \regtype\()4, v28 > +.else > + vwmulu.vx v16, v4, \regtype\()2 > + vwmulu.vx v20, v6, \regtype\()3 > + .ifc \size,6 > + vwmaccu.vx v16, \regtype\()0, v0 > + vwmaccu.vx v16, \regtype\()5, v10 > + .endif > + vwmaccsu.vx v16, \regtype\()1, v2 > + vwmaccsu.vx v16, \regtype\()4, v8 > .endif > li t6, 64 > - vwmaccsu.vx v16, t1, v22 > - vwmaccsu.vx v16, t4, v28 > vwadd.wx v16, v16, t6 > vsetvlstatic16 \len > vwadd.vv v24, v16, v20 > @@ -216,18 +227,18 @@ endconst > vnclipu.wi \dst, v24, 0 > .endm > > -.macro epel_load_inc dst len size type > - epel_load \dst \len \size \type > +.macro epel_load_inc dst len size type from_mem regtype > + epel_load \dst \len \size \type \from_mem \regtype > add a2, a2, a3 > .endm > > .macro epel len size type > func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x > - epel_filter \size \type > + epel_filter \size \type t > vsetvlstatic8 \len > 1: > addi a4, a4, -1 > - epel_load_inc v30 \len \size \type > + epel_load_inc v30 \len \size \type 1 t > vse8.v v30, (a0) > add a0, a0, a1 > bnez a4, 1b > @@ -236,6 +247,72 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, > zve32x endfunc > .endm > > +.macro epel_hv len hsize vsize > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x > +#if __riscv_xlen == 64 > + addi sp, sp, -48 > + .irp n 0,1,2,3,4,5 > + sd s\n, \n\()<<3(sp) > + .endr > +#else > + addi sp, sp, -24 > + .irp n 0,1,2,3,4,5 > + sw s\n, \n\()<<2(sp) > + .endr > +#endif > + sub a2, a2, a3 > + epel_filter \hsize h t > + epel_filter \vsize v s > + vsetvlstatic8 \len > +.if \hsize == 6 || \vsize == 6 > + sub a2, a2, a3 > + epel_load_inc v0 \len \hsize h 1 t > +.endif > + epel_load_inc v2 \len \hsize h 1 t > + epel_load_inc v4 \len \hsize h 1 t > + epel_load_inc v6 \len \hsize h 1 t > + epel_load_inc v8 \len \hsize h 1 t > +.if \hsize == 6 || \vsize == 6 > + epel_load_inc v10 \len \hsize h 1 t > +.endif > + addi a4, a4, -1 
> +1: > + addi a4, a4, -1 > + epel_load v30 \len \vsize v 0 s > + vse8.v v30, (a0) > +.if \hsize == 6 || \vsize == 6 > + vmv.v.v v0, v2 > +.endif > + vmv.v.v v2, v4 > + vmv.v.v v4, v6 > + vmv.v.v v6, v8 > +.if \hsize == 6 || \vsize == 6 > + vmv.v.v v8, v10 > + epel_load_inc v10 \len \hsize h 1 t > +.else > + epel_load_inc v8 \len 4 h 1 t > +.endif > + add a0, a0, a1 > + bnez a4, 1b > + epel_load v30 \len \vsize v 0 s > + vse8.v v30, (a0) > + > +#if __riscv_xlen == 64 > + .irp n 0,1,2,3,4,5 > + ld s\n, \n\()<<3(sp) > + .endr > + addi sp, sp, 48 > +#else > + .irp n 0,1,2,3,4,5 > + lw s\n, \n\()<<2(sp) > + .endr > + addi sp, sp, 24 > +#endif You can either exclude RV128 or support it. IMHO, at this point, we can ignore RV32 and RV128, as there are no signs of such vector-capable hardware on the horizon. > + > + ret > +endfunc > +.endm > + > .irp len 16,8,4 > put_vp8_bilin_h_v \len h a5 > put_vp8_bilin_h_v \len v a6 > @@ -244,4 +321,8 @@ epel \len 6 h > epel \len 4 h > epel \len 6 v > epel \len 4 v > +epel_hv \len 6 6 > +epel_hv \len 4 4 > +epel_hv \len 6 4 > +epel_hv \len 4 6 > .endr
Okay, updated it in the reply Rémi Denis-Courmont <remi@remlab.net> 于2024年5月10日周五 23:41写道: > Le tiistaina 7. toukokuuta 2024, 19.54.09 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp8_put_epel4_h4v4_c: 20.0 > > vp8_put_epel4_h4v4_rvv_i32: 11.0 > > vp8_put_epel4_h4v6_c: 25.2 > > vp8_put_epel4_h4v6_rvv_i32: 13.5 > > vp8_put_epel4_h6v4_c: 22.2 > > vp8_put_epel4_h6v4_rvv_i32: 14.5 > > vp8_put_epel4_h6v6_c: 29.0 > > vp8_put_epel4_h6v6_rvv_i32: 15.7 > > vp8_put_epel8_h4v4_c: 73.0 > > vp8_put_epel8_h4v4_rvv_i32: 22.2 > > vp8_put_epel8_h4v6_c: 90.5 > > vp8_put_epel8_h4v6_rvv_i32: 26.7 > > vp8_put_epel8_h6v4_c: 85.0 > > vp8_put_epel8_h6v4_rvv_i32: 27.2 > > vp8_put_epel8_h6v6_c: 104.7 > > vp8_put_epel8_h6v6_rvv_i32: 29.5 > > vp8_put_epel16_h4v4_c: 145.5 > > vp8_put_epel16_h4v4_rvv_i32: 26.5 > > vp8_put_epel16_h4v6_c: 190.7 > > vp8_put_epel16_h4v6_rvv_i32: 47.5 > > vp8_put_epel16_h6v4_c: 173.7 > > vp8_put_epel16_h6v4_rvv_i32: 33.2 > > vp8_put_epel16_h6v6_c: 222.2 > > vp8_put_epel16_h6v6_rvv_i32: 35.5 > > --- > > libavcodec/riscv/vp8dsp_init.c | 13 ++++ > > libavcodec/riscv/vp8dsp_rvv.S | 123 +++++++++++++++++++++++++++------ > > 2 files changed, 115 insertions(+), 21 deletions(-) > > > > diff --git a/libavcodec/riscv/vp8dsp_init.c > b/libavcodec/riscv/vp8dsp_init.c > > index dc3e087f01..463c8fa0a2 100644 > > --- a/libavcodec/riscv/vp8dsp_init.c > > +++ b/libavcodec/riscv/vp8dsp_init.c > > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) > > c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; > > c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; > > c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; > > + > > + c->put_vp8_epel_pixels_tab[0][2][2] = > ff_put_vp8_epel16_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[0][2][1] = > 
ff_put_vp8_epel16_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[0][1][1] = > ff_put_vp8_epel16_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[0][1][2] = > ff_put_vp8_epel16_h6v4_rvv; > > + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; > > + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; > > } > > #endif > > #endif > > diff --git a/libavcodec/riscv/vp8dsp_rvv.S > b/libavcodec/riscv/vp8dsp_rvv.S > > index 4d7a9f6a2d..fba72f8c15 100644 > > --- a/libavcodec/riscv/vp8dsp_rvv.S > > +++ b/libavcodec/riscv/vp8dsp_rvv.S > > @@ -161,26 +161,26 @@ const subpel_filters > > .byte 0, -1, 12, 123, -6, 0 > > endconst > > > > -.macro epel_filter size type > > - lla t2, subpel_filters > > +.macro epel_filter size type regtype > > + lla \regtype\()2, subpel_filters > > .ifc \type,v > > - addi t0, a6, -1 > > + addi \regtype\()0, a6, -1 > > .else > > - addi t0, a5, -1 > > + addi \regtype\()0, a5, -1 > > .endif > > - li t1, 6 > > - mul t0, t0, t1 > > - add t0, t0, t2 > > + li \regtype\()1, 6 > > + mul \regtype\()0, \regtype\()0, \regtype\()1 > > + add \regtype\()0, \regtype\()0, \regtype\()2 > > .irp n 1,2,3,4 > > - lb t\n, \n(t0) > > + lb \regtype\n, \n(\regtype\()0) > > .endr > > .ifc \size,6 > > - lb t5, 5(t0) > > - lb t0, (t0) > > + lb \regtype\()5, 5(\regtype\()0) > > + lb \regtype\()0, (\regtype\()0) > > .endif > > .endm > > > > -.macro epel_load dst len size type > > +.macro epel_load dst len size type from_mem regtype > > .ifc \type,v > > mv a5, a3 > > .else > > @@ -189,24 +189,35 @@ endconst > > sub t6, a2, a5 > > add a7, a2, a5 > > > > +.if \from_mem > > vle8.v v24, (a2) > > vle8.v v22, (t6) > > vle8.v v26, (a7) > > add a7, a7, a5 > > vle8.v v28, (a7) > > - vwmulu.vx 
v16, v24, t2 > > - vwmulu.vx v20, v26, t3 > > + vwmulu.vx v16, v24, \regtype\()2 > > + vwmulu.vx v20, v26, \regtype\()3 > > .ifc \size,6 > > sub t6, t6, a5 > > add a7, a7, a5 > > vle8.v v24, (t6) > > vle8.v v26, (a7) > > - vwmaccu.vx v16, t0, v24 > > - vwmaccu.vx v16, t5, v26 > > + vwmaccu.vx v16, \regtype\()0, v24 > > + vwmaccu.vx v16, \regtype\()5, v26 > > +.endif > > + vwmaccsu.vx v16, \regtype\()1, v22 > > + vwmaccsu.vx v16, \regtype\()4, v28 > > +.else > > + vwmulu.vx v16, v4, \regtype\()2 > > + vwmulu.vx v20, v6, \regtype\()3 > > + .ifc \size,6 > > + vwmaccu.vx v16, \regtype\()0, v0 > > + vwmaccu.vx v16, \regtype\()5, v10 > > + .endif > > + vwmaccsu.vx v16, \regtype\()1, v2 > > + vwmaccsu.vx v16, \regtype\()4, v8 > > .endif > > li t6, 64 > > - vwmaccsu.vx v16, t1, v22 > > - vwmaccsu.vx v16, t4, v28 > > vwadd.wx v16, v16, t6 > > vsetvlstatic16 \len > > vwadd.vv v24, v16, v20 > > @@ -216,18 +227,18 @@ endconst > > vnclipu.wi \dst, v24, 0 > > .endm > > > > -.macro epel_load_inc dst len size type > > - epel_load \dst \len \size \type > > +.macro epel_load_inc dst len size type from_mem regtype > > + epel_load \dst \len \size \type \from_mem \regtype > > add a2, a2, a3 > > .endm > > > > .macro epel len size type > > func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x > > - epel_filter \size \type > > + epel_filter \size \type t > > vsetvlstatic8 \len > > 1: > > addi a4, a4, -1 > > - epel_load_inc v30 \len \size \type > > + epel_load_inc v30 \len \size \type 1 t > > vse8.v v30, (a0) > > add a0, a0, a1 > > bnez a4, 1b > > @@ -236,6 +247,72 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, > > zve32x endfunc > > .endm > > > > +.macro epel_hv len hsize vsize > > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x > > +#if __riscv_xlen == 64 > > + addi sp, sp, -48 > > + .irp n 0,1,2,3,4,5 > > + sd s\n, \n\()<<3(sp) > > + .endr > > +#else > > + addi sp, sp, -24 > > + .irp n 0,1,2,3,4,5 > > + sw s\n, \n\()<<2(sp) > > + .endr > > +#endif > > + sub a2, 
a2, a3 > > + epel_filter \hsize h t > > + epel_filter \vsize v s > > + vsetvlstatic8 \len > > +.if \hsize == 6 || \vsize == 6 > > + sub a2, a2, a3 > > + epel_load_inc v0 \len \hsize h 1 t > > +.endif > > + epel_load_inc v2 \len \hsize h 1 t > > + epel_load_inc v4 \len \hsize h 1 t > > + epel_load_inc v6 \len \hsize h 1 t > > + epel_load_inc v8 \len \hsize h 1 t > > +.if \hsize == 6 || \vsize == 6 > > + epel_load_inc v10 \len \hsize h 1 t > > +.endif > > + addi a4, a4, -1 > > +1: > > + addi a4, a4, -1 > > + epel_load v30 \len \vsize v 0 s > > + vse8.v v30, (a0) > > +.if \hsize == 6 || \vsize == 6 > > + vmv.v.v v0, v2 > > +.endif > > + vmv.v.v v2, v4 > > + vmv.v.v v4, v6 > > + vmv.v.v v6, v8 > > +.if \hsize == 6 || \vsize == 6 > > + vmv.v.v v8, v10 > > + epel_load_inc v10 \len \hsize h 1 t > > +.else > > + epel_load_inc v8 \len 4 h 1 t > > +.endif > > + add a0, a0, a1 > > + bnez a4, 1b > > + epel_load v30 \len \vsize v 0 s > > + vse8.v v30, (a0) > > + > > +#if __riscv_xlen == 64 > > + .irp n 0,1,2,3,4,5 > > + ld s\n, \n\()<<3(sp) > > + .endr > > + addi sp, sp, 48 > > +#else > > + .irp n 0,1,2,3,4,5 > > + lw s\n, \n\()<<2(sp) > > + .endr > > + addi sp, sp, 24 > > +#endif > > You can either exclude RV128 or support it. IMHO, at this point, we can > ignore > RV32 and RV128, as there are no signs of such vector-capable hardware on > the > horizon. 
> > > + > > + ret > > +endfunc > > +.endm > > + > > .irp len 16,8,4 > > put_vp8_bilin_h_v \len h a5 > > put_vp8_bilin_h_v \len v a6 > > @@ -244,4 +321,8 @@ epel \len 6 h > > epel \len 4 h > > epel \len 6 v > > epel \len 4 v > > +epel_hv \len 6 6 > > +epel_hv \len 4 4 > > +epel_hv \len 6 4 > > +epel_hv \len 4 6 > > .endr > > > -- > Rémi Denis-Courmont > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c index dc3e087f01..463c8fa0a2 100644 --- a/libavcodec/riscv/vp8dsp_init.c +++ b/libavcodec/riscv/vp8dsp_init.c @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; + + c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv; + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; + c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv; + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; + c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv; + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; + c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv; + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; } #endif #endif diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 4d7a9f6a2d..fba72f8c15 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -161,26 +161,26 @@ const subpel_filters .byte 0, -1, 12, 123, -6, 0 endconst -.macro epel_filter size type - lla t2, subpel_filters +.macro epel_filter size type regtype + lla \regtype\()2, subpel_filters .ifc \type,v - addi t0, a6, -1 + addi \regtype\()0, a6, -1 .else - addi t0, a5, -1 + addi \regtype\()0, a5, -1 .endif - li t1, 6 - mul t0, t0, t1 - add t0, t0, t2 + li \regtype\()1, 6 + mul \regtype\()0, \regtype\()0, \regtype\()1 + add \regtype\()0, \regtype\()0, \regtype\()2 .irp n 1,2,3,4 - lb t\n, \n(t0) + lb \regtype\n, 
\n(\regtype\()0) .endr .ifc \size,6 - lb t5, 5(t0) - lb t0, (t0) + lb \regtype\()5, 5(\regtype\()0) + lb \regtype\()0, (\regtype\()0) .endif .endm -.macro epel_load dst len size type +.macro epel_load dst len size type from_mem regtype .ifc \type,v mv a5, a3 .else @@ -189,24 +189,35 @@ endconst sub t6, a2, a5 add a7, a2, a5 +.if \from_mem vle8.v v24, (a2) vle8.v v22, (t6) vle8.v v26, (a7) add a7, a7, a5 vle8.v v28, (a7) - vwmulu.vx v16, v24, t2 - vwmulu.vx v20, v26, t3 + vwmulu.vx v16, v24, \regtype\()2 + vwmulu.vx v20, v26, \regtype\()3 .ifc \size,6 sub t6, t6, a5 add a7, a7, a5 vle8.v v24, (t6) vle8.v v26, (a7) - vwmaccu.vx v16, t0, v24 - vwmaccu.vx v16, t5, v26 + vwmaccu.vx v16, \regtype\()0, v24 + vwmaccu.vx v16, \regtype\()5, v26 +.endif + vwmaccsu.vx v16, \regtype\()1, v22 + vwmaccsu.vx v16, \regtype\()4, v28 +.else + vwmulu.vx v16, v4, \regtype\()2 + vwmulu.vx v20, v6, \regtype\()3 + .ifc \size,6 + vwmaccu.vx v16, \regtype\()0, v0 + vwmaccu.vx v16, \regtype\()5, v10 + .endif + vwmaccsu.vx v16, \regtype\()1, v2 + vwmaccsu.vx v16, \regtype\()4, v8 .endif li t6, 64 - vwmaccsu.vx v16, t1, v22 - vwmaccsu.vx v16, t4, v28 vwadd.wx v16, v16, t6 vsetvlstatic16 \len vwadd.vv v24, v16, v20 @@ -216,18 +227,18 @@ endconst vnclipu.wi \dst, v24, 0 .endm -.macro epel_load_inc dst len size type - epel_load \dst \len \size \type +.macro epel_load_inc dst len size type from_mem regtype + epel_load \dst \len \size \type \from_mem \regtype add a2, a2, a3 .endm .macro epel len size type func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x - epel_filter \size \type + epel_filter \size \type t vsetvlstatic8 \len 1: addi a4, a4, -1 - epel_load_inc v30 \len \size \type + epel_load_inc v30 \len \size \type 1 t vse8.v v30, (a0) add a0, a0, a1 bnez a4, 1b @@ -236,6 +247,72 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x endfunc .endm +.macro epel_hv len hsize vsize +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x +#if __riscv_xlen == 64 + addi sp, sp, -48 + 
.irp n 0,1,2,3,4,5 + sd s\n, \n\()<<3(sp) + .endr +#else + addi sp, sp, -24 + .irp n 0,1,2,3,4,5 + sw s\n, \n\()<<2(sp) + .endr +#endif + sub a2, a2, a3 + epel_filter \hsize h t + epel_filter \vsize v s + vsetvlstatic8 \len +.if \hsize == 6 || \vsize == 6 + sub a2, a2, a3 + epel_load_inc v0 \len \hsize h 1 t +.endif + epel_load_inc v2 \len \hsize h 1 t + epel_load_inc v4 \len \hsize h 1 t + epel_load_inc v6 \len \hsize h 1 t + epel_load_inc v8 \len \hsize h 1 t +.if \hsize == 6 || \vsize == 6 + epel_load_inc v10 \len \hsize h 1 t +.endif + addi a4, a4, -1 +1: + addi a4, a4, -1 + epel_load v30 \len \vsize v 0 s + vse8.v v30, (a0) +.if \hsize == 6 || \vsize == 6 + vmv.v.v v0, v2 +.endif + vmv.v.v v2, v4 + vmv.v.v v4, v6 + vmv.v.v v6, v8 +.if \hsize == 6 || \vsize == 6 + vmv.v.v v8, v10 + epel_load_inc v10 \len \hsize h 1 t +.else + epel_load_inc v8 \len 4 h 1 t +.endif + add a0, a0, a1 + bnez a4, 1b + epel_load v30 \len \vsize v 0 s + vse8.v v30, (a0) + +#if __riscv_xlen == 64 + .irp n 0,1,2,3,4,5 + ld s\n, \n\()<<3(sp) + .endr + addi sp, sp, 48 +#else + .irp n 0,1,2,3,4,5 + lw s\n, \n\()<<2(sp) + .endr + addi sp, sp, 24 +#endif + + ret +endfunc +.endm + .irp len 16,8,4 put_vp8_bilin_h_v \len h a5 put_vp8_bilin_h_v \len v a6 @@ -244,4 +321,8 @@ epel \len 6 h epel \len 4 h epel \len 6 v epel \len 4 v +epel_hv \len 6 6 +epel_hv \len 4 4 +epel_hv \len 6 4 +epel_hv \len 4 6 .endr
From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp8_put_epel4_h4v4_c: 20.0 vp8_put_epel4_h4v4_rvv_i32: 11.0 vp8_put_epel4_h4v6_c: 25.2 vp8_put_epel4_h4v6_rvv_i32: 13.5 vp8_put_epel4_h6v4_c: 22.2 vp8_put_epel4_h6v4_rvv_i32: 14.5 vp8_put_epel4_h6v6_c: 29.0 vp8_put_epel4_h6v6_rvv_i32: 15.7 vp8_put_epel8_h4v4_c: 73.0 vp8_put_epel8_h4v4_rvv_i32: 22.2 vp8_put_epel8_h4v6_c: 90.5 vp8_put_epel8_h4v6_rvv_i32: 26.7 vp8_put_epel8_h6v4_c: 85.0 vp8_put_epel8_h6v4_rvv_i32: 27.2 vp8_put_epel8_h6v6_c: 104.7 vp8_put_epel8_h6v6_rvv_i32: 29.5 vp8_put_epel16_h4v4_c: 145.5 vp8_put_epel16_h4v4_rvv_i32: 26.5 vp8_put_epel16_h4v6_c: 190.7 vp8_put_epel16_h4v6_rvv_i32: 47.5 vp8_put_epel16_h6v4_c: 173.7 vp8_put_epel16_h6v4_rvv_i32: 33.2 vp8_put_epel16_h6v6_c: 222.2 vp8_put_epel16_h6v6_rvv_i32: 35.5 --- libavcodec/riscv/vp8dsp_init.c | 13 ++++ libavcodec/riscv/vp8dsp_rvv.S | 123 +++++++++++++++++++++++++++------ 2 files changed, 115 insertions(+), 21 deletions(-)