Message ID | tencent_CD8AB9E784B056C3FC789A54472C62862A09@qq.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel,v3,1/9] lavc/vp8dsp: R-V put_vp8_pixels | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit : > From: sunyuechi <sunyuechi@iscas.ac.cn> > > C908: > vp8_put_epel4_h4v4_c: 20.0 > vp8_put_epel4_h4v4_rvv_i32: 11.0 > vp8_put_epel4_h4v6_c: 25.2 > vp8_put_epel4_h4v6_rvv_i32: 13.5 > vp8_put_epel4_h6v4_c: 22.2 > vp8_put_epel4_h6v4_rvv_i32: 14.5 > vp8_put_epel4_h6v6_c: 29.0 > vp8_put_epel4_h6v6_rvv_i32: 15.7 > vp8_put_epel8_h4v4_c: 73.0 > vp8_put_epel8_h4v4_rvv_i32: 22.2 > vp8_put_epel8_h4v6_c: 90.5 > vp8_put_epel8_h4v6_rvv_i32: 26.7 > vp8_put_epel8_h6v4_c: 85.0 > vp8_put_epel8_h6v4_rvv_i32: 27.2 > vp8_put_epel8_h6v6_c: 104.7 > vp8_put_epel8_h6v6_rvv_i32: 29.5 > vp8_put_epel16_h4v4_c: 145.5 > vp8_put_epel16_h4v4_rvv_i32: 26.5 > vp8_put_epel16_h4v6_c: 190.7 > vp8_put_epel16_h4v6_rvv_i32: 47.5 > vp8_put_epel16_h6v4_c: 173.7 > vp8_put_epel16_h6v4_rvv_i32: 33.2 > vp8_put_epel16_h6v6_c: 222.2 > vp8_put_epel16_h6v6_rvv_i32: 35.5 > --- > libavcodec/riscv/vp8dsp_init.c | 13 ++++ > libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------ > 2 files changed, 109 insertions(+), 21 deletions(-) > > diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c > index dc3e087f01..463c8fa0a2 100644 > --- a/libavcodec/riscv/vp8dsp_init.c > +++ b/libavcodec/riscv/vp8dsp_init.c > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) > c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; > c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; > c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; > + > + c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; > + c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv; > + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; > + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; > + 
c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; > + c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv; > + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; > + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; > } > #endif > #endif > diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S > index bf268e4d8d..baa8152830 100644 > --- a/libavcodec/riscv/vp8dsp_rvv.S > +++ b/libavcodec/riscv/vp8dsp_rvv.S > @@ -161,26 +161,26 @@ const subpel_filters > .byte 0, -1, 12, 123, -6, 0 > endconst > > -.macro epel_filter size type > - lla t2, subpel_filters > +.macro epel_filter size type regtype > + lla \regtype\()2, subpel_filters > .ifc \type,v > - addi t0, a6, -1 > + addi \regtype\()0, a6, -1 IMO, passing a complete register name, if you really need to vary it, would be simpler and more flexible than an ABI register type prefix. 
> .elseif \type == h > - addi t0, a5, -1 > + addi \regtype\()0, a5, -1 > .endif > - li t1, 6 > - mul t0, t0, t1 > - add t0, t0, t2 > + li \regtype\()1, 6 > + mul \regtype\()0, \regtype\()0, \regtype\()1 > + add \regtype\()0, \regtype\()0, \regtype\()2 > .irp n 1,2,3,4 > - lb t\n, \n(t0) > + lb \regtype\n, \n(\regtype\()0) > .endr > .ifc \size,6 > - lb t5, 5(t0) > - lb t0, (t0) > + lb \regtype\()5, 5(\regtype\()0) > + lb \regtype\()0, (\regtype\()0) > .endif > .endm > > -.macro epel_load dst len size type > +.macro epel_load dst len size type from_mem regtype > .ifc \type,v > mv a5, a3 > .else > @@ -189,24 +189,35 @@ endconst > sub t6, a2, a5 > add a7, a2, a5 > > +.if \from_mem > vle8.v v24, (a2) > vle8.v v22, (t6) > vle8.v v26, (a7) > add a7, a7, a5 > vle8.v v28, (a7) > - vwmulu.vx v16, v24, t2 > - vwmulu.vx v20, v26, t3 > + vwmulu.vx v16, v24, \regtype\()2 > + vwmulu.vx v20, v26, \regtype\()3 > .ifc \size,6 > sub t6, t6, a5 > add a7, a7, a5 > vle8.v v24, (t6) > vle8.v v26, (a7) > - vwmaccu.vx v16, t0, v24 > - vwmaccu.vx v16, t5, v26 > + vwmaccu.vx v16, \regtype\()0, v24 > + vwmaccu.vx v16, \regtype\()5, v26 > +.endif > + vwmaccsu.vx v16, \regtype\()1, v22 > + vwmaccsu.vx v16, \regtype\()4, v28 > +.else > + vwmulu.vx v16, v4, \regtype\()2 > + vwmulu.vx v20, v6, \regtype\()3 > + .ifc \size,6 > + vwmaccu.vx v16, \regtype\()0, v0 > + vwmaccu.vx v16, \regtype\()5, v10 > + .endif > + vwmaccsu.vx v16, \regtype\()1, v2 > + vwmaccsu.vx v16, \regtype\()4, v8 > .endif > li t6, 64 > - vwmaccsu.vx v16, t1, v22 > - vwmaccsu.vx v16, t4, v28 > vwadd.wx v16, v16, t6 > vsetvlstatic16 \len > vwadd.vv v24, v16, v20 > @@ -216,18 +227,18 @@ endconst > vnclipu.wi \dst, v24, 0 > .endm > > -.macro epel_load_inc dst len size type > - epel_load \dst \len \size \type > +.macro epel_load_inc dst len size type from_mem regtype > + epel_load \dst \len \size \type \from_mem \regtype > add a2, a2, a3 > .endm > > .macro epel len size type > func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x 
> - epel_filter \size \type > + epel_filter \size \type t > vsetvlstatic8 \len > 1: > addi a4, a4, -1 > - epel_load_inc v30 \len \size \type > + epel_load_inc v30 \len \size \type 1 t > vse8.v v30, (a0) > add a0, a0, a1 > bnez a4, 1b > @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, > zve32x endfunc > .endm > > +.macro epel_hv len hsize vsize > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x > + addi sp, sp, -48 > + .irp n 0,1,2,3,4,5 > +#if __riscv_xlen >= 64 This code actually requires ==, not >=. > + sd s\n, \n\()<<3(sp) > +#else > + sw s\n, \n\()<<3(sp) You can do that but you only need half the stack space and offsets. (And that's why I avoid S and FS registers like the plague, but sometimes you just can't.) > +#endif > + .endr > + sub a2, a2, a3 > + epel_filter \hsize h t > + epel_filter \vsize v s > + vsetvlstatic8 \len > +.if \hsize == 6 || \vsize == 6 > + sub a2, a2, a3 > + epel_load_inc v0 \len \hsize h 1 t > +.endif > + epel_load_inc v2 \len \hsize h 1 t > + epel_load_inc v4 \len \hsize h 1 t > + epel_load_inc v6 \len \hsize h 1 t > + epel_load_inc v8 \len \hsize h 1 t > +.if \hsize == 6 || \vsize == 6 > + epel_load_inc v10 \len \hsize h 1 t > +.endif > + addi a4, a4, -1 > +1: > + addi a4, a4, -1 > + epel_load v30 \len \vsize v 0 s > + vse8.v v30, (a0) > +.if \hsize == 6 || \vsize == 6 > + vmv.v.v v0, v2 > +.endif > + vmv.v.v v2, v4 > + vmv.v.v v4, v6 > + vmv.v.v v6, v8 > +.if \hsize == 6 || \vsize == 6 > + vmv.v.v v8, v10 > + epel_load_inc v10 \len \hsize h 1 t > +.else > + epel_load_inc v8 \len 4 h 1 t > +.endif > + add a0, a0, a1 > + bnez a4, 1b > + epel_load v30 \len \vsize v 0 s > + vse8.v v30, (a0) > + > + .irp n 0,1,2,3,4,5 > +#if __riscv_xlen >= 64 > + ld s\n, \n\()<<3(sp) > +#else > + lw s\n, \n\()<<3(sp) > +#endif > + .endr > + addi sp, sp, 48 > + > + ret > +endfunc > +.endm > + > .irp len 16,8,4 > put_vp8_bilin_h_v \len h a5 > put_vp8_bilin_h_v \len v a6 > @@ -244,4 +315,8 @@ epel \len 6 h > epel \len 4 h > 
epel \len 6 v > epel \len 4 v > +epel_hv \len 6 6 > +epel_hv \len 4 4 > +epel_hv \len 6 4 > +epel_hv \len 4 6 > .endr
> IMO, passing a complete register name, if you really need to vary it, would be simpler and more flexible than an ABI register type prefix. If full register names were passed here, some calls would require four parameters, some would require six, and there would often be repetition. I feel it's easy to get confused about the differences between the parameters passed each time. If a prefix is used instead, only one parameter is needed, which I think would be less error-prone. > This code actually requires ==, not >=. > You can do that but you only need half the stack space and offsets. OK, fixed it. Rémi Denis-Courmont <remi@remlab.net> 于2024年5月7日周二 03:25写道: > Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST uk7b@foxmail.com a écrit : > > From: sunyuechi <sunyuechi@iscas.ac.cn> > > > > C908: > > vp8_put_epel4_h4v4_c: 20.0 > > vp8_put_epel4_h4v4_rvv_i32: 11.0 > > vp8_put_epel4_h4v6_c: 25.2 > > vp8_put_epel4_h4v6_rvv_i32: 13.5 > > vp8_put_epel4_h6v4_c: 22.2 > > vp8_put_epel4_h6v4_rvv_i32: 14.5 > > vp8_put_epel4_h6v6_c: 29.0 > > vp8_put_epel4_h6v6_rvv_i32: 15.7 > > vp8_put_epel8_h4v4_c: 73.0 > > vp8_put_epel8_h4v4_rvv_i32: 22.2 > > vp8_put_epel8_h4v6_c: 90.5 > > vp8_put_epel8_h4v6_rvv_i32: 26.7 > > vp8_put_epel8_h6v4_c: 85.0 > > vp8_put_epel8_h6v4_rvv_i32: 27.2 > > vp8_put_epel8_h6v6_c: 104.7 > > vp8_put_epel8_h6v6_rvv_i32: 29.5 > > vp8_put_epel16_h4v4_c: 145.5 > > vp8_put_epel16_h4v4_rvv_i32: 26.5 > > vp8_put_epel16_h4v6_c: 190.7 > > vp8_put_epel16_h4v6_rvv_i32: 47.5 > > vp8_put_epel16_h6v4_c: 173.7 > > vp8_put_epel16_h6v4_rvv_i32: 33.2 > > vp8_put_epel16_h6v6_c: 222.2 > > vp8_put_epel16_h6v6_rvv_i32: 35.5 > > --- > > libavcodec/riscv/vp8dsp_init.c | 13 ++++ > > libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------ > > 2 files changed, 109 insertions(+), 21 deletions(-) > > > > diff --git a/libavcodec/riscv/vp8dsp_init.c > b/libavcodec/riscv/vp8dsp_init.c > > index dc3e087f01..463c8fa0a2 100644 > > --- a/libavcodec/riscv/vp8dsp_init.c > > +++ 
b/libavcodec/riscv/vp8dsp_init.c > > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) > > c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; > > c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; > > c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; > > + > > + c->put_vp8_epel_pixels_tab[0][2][2] = > ff_put_vp8_epel16_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; > > + c->put_vp8_epel_pixels_tab[0][2][1] = > ff_put_vp8_epel16_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; > > + c->put_vp8_epel_pixels_tab[0][1][1] = > ff_put_vp8_epel16_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; > > + c->put_vp8_epel_pixels_tab[0][1][2] = > ff_put_vp8_epel16_h6v4_rvv; > > + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; > > + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; > > } > > #endif > > #endif > > diff --git a/libavcodec/riscv/vp8dsp_rvv.S > b/libavcodec/riscv/vp8dsp_rvv.S > > index bf268e4d8d..baa8152830 100644 > > --- a/libavcodec/riscv/vp8dsp_rvv.S > > +++ b/libavcodec/riscv/vp8dsp_rvv.S > > @@ -161,26 +161,26 @@ const subpel_filters > > .byte 0, -1, 12, 123, -6, 0 > > endconst > > > > -.macro epel_filter size type > > - lla t2, subpel_filters > > +.macro epel_filter size type regtype > > + lla \regtype\()2, subpel_filters > > .ifc \type,v > > - addi t0, a6, -1 > > + addi \regtype\()0, a6, -1 > > IMO, passing a complete register name, if you really need to vary it, > would be > simpler and more flexible than an ABI register type prefix. 
> > > .elseif \type == h > > - addi t0, a5, -1 > > + addi \regtype\()0, a5, -1 > > .endif > > - li t1, 6 > > - mul t0, t0, t1 > > - add t0, t0, t2 > > + li \regtype\()1, 6 > > + mul \regtype\()0, \regtype\()0, \regtype\()1 > > + add \regtype\()0, \regtype\()0, \regtype\()2 > > .irp n 1,2,3,4 > > - lb t\n, \n(t0) > > + lb \regtype\n, \n(\regtype\()0) > > .endr > > .ifc \size,6 > > - lb t5, 5(t0) > > - lb t0, (t0) > > + lb \regtype\()5, 5(\regtype\()0) > > + lb \regtype\()0, (\regtype\()0) > > .endif > > .endm > > > > -.macro epel_load dst len size type > > +.macro epel_load dst len size type from_mem regtype > > .ifc \type,v > > mv a5, a3 > > .else > > @@ -189,24 +189,35 @@ endconst > > sub t6, a2, a5 > > add a7, a2, a5 > > > > +.if \from_mem > > vle8.v v24, (a2) > > vle8.v v22, (t6) > > vle8.v v26, (a7) > > add a7, a7, a5 > > vle8.v v28, (a7) > > - vwmulu.vx v16, v24, t2 > > - vwmulu.vx v20, v26, t3 > > + vwmulu.vx v16, v24, \regtype\()2 > > + vwmulu.vx v20, v26, \regtype\()3 > > .ifc \size,6 > > sub t6, t6, a5 > > add a7, a7, a5 > > vle8.v v24, (t6) > > vle8.v v26, (a7) > > - vwmaccu.vx v16, t0, v24 > > - vwmaccu.vx v16, t5, v26 > > + vwmaccu.vx v16, \regtype\()0, v24 > > + vwmaccu.vx v16, \regtype\()5, v26 > > +.endif > > + vwmaccsu.vx v16, \regtype\()1, v22 > > + vwmaccsu.vx v16, \regtype\()4, v28 > > +.else > > + vwmulu.vx v16, v4, \regtype\()2 > > + vwmulu.vx v20, v6, \regtype\()3 > > + .ifc \size,6 > > + vwmaccu.vx v16, \regtype\()0, v0 > > + vwmaccu.vx v16, \regtype\()5, v10 > > + .endif > > + vwmaccsu.vx v16, \regtype\()1, v2 > > + vwmaccsu.vx v16, \regtype\()4, v8 > > .endif > > li t6, 64 > > - vwmaccsu.vx v16, t1, v22 > > - vwmaccsu.vx v16, t4, v28 > > vwadd.wx v16, v16, t6 > > vsetvlstatic16 \len > > vwadd.vv v24, v16, v20 > > @@ -216,18 +227,18 @@ endconst > > vnclipu.wi \dst, v24, 0 > > .endm > > > > -.macro epel_load_inc dst len size type > > - epel_load \dst \len \size \type > > +.macro epel_load_inc dst len size type from_mem regtype > > + epel_load 
\dst \len \size \type \from_mem \regtype > > add a2, a2, a3 > > .endm > > > > .macro epel len size type > > func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x > > - epel_filter \size \type > > + epel_filter \size \type t > > vsetvlstatic8 \len > > 1: > > addi a4, a4, -1 > > - epel_load_inc v30 \len \size \type > > + epel_load_inc v30 \len \size \type 1 t > > vse8.v v30, (a0) > > add a0, a0, a1 > > bnez a4, 1b > > @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, > > zve32x endfunc > > .endm > > > > +.macro epel_hv len hsize vsize > > +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x > > + addi sp, sp, -48 > > + .irp n 0,1,2,3,4,5 > > +#if __riscv_xlen >= 64 > > This code actually requires ==, not >=. > > > + sd s\n, \n\()<<3(sp) > > +#else > > + sw s\n, \n\()<<3(sp) > > You can do that but you only need half the stack space and offsets. > > (And that's why I avoid S and FS registers like the plague, but sometimes > you > just can't.) > > > +#endif > > + .endr > > + sub a2, a2, a3 > > + epel_filter \hsize h t > > + epel_filter \vsize v s > > + vsetvlstatic8 \len > > +.if \hsize == 6 || \vsize == 6 > > + sub a2, a2, a3 > > + epel_load_inc v0 \len \hsize h 1 t > > +.endif > > + epel_load_inc v2 \len \hsize h 1 t > > + epel_load_inc v4 \len \hsize h 1 t > > + epel_load_inc v6 \len \hsize h 1 t > > + epel_load_inc v8 \len \hsize h 1 t > > +.if \hsize == 6 || \vsize == 6 > > + epel_load_inc v10 \len \hsize h 1 t > > +.endif > > + addi a4, a4, -1 > > +1: > > + addi a4, a4, -1 > > + epel_load v30 \len \vsize v 0 s > > + vse8.v v30, (a0) > > +.if \hsize == 6 || \vsize == 6 > > + vmv.v.v v0, v2 > > +.endif > > + vmv.v.v v2, v4 > > + vmv.v.v v4, v6 > > + vmv.v.v v6, v8 > > +.if \hsize == 6 || \vsize == 6 > > + vmv.v.v v8, v10 > > + epel_load_inc v10 \len \hsize h 1 t > > +.else > > + epel_load_inc v8 \len 4 h 1 t > > +.endif > > + add a0, a0, a1 > > + bnez a4, 1b > > + epel_load v30 \len \vsize v 0 s > > + vse8.v v30, (a0) > > + > > + 
.irp n 0,1,2,3,4,5 > > +#if __riscv_xlen >= 64 > > + ld s\n, \n\()<<3(sp) > > +#else > > + lw s\n, \n\()<<3(sp) > > +#endif > > + .endr > > + addi sp, sp, 48 > > + > > + ret > > +endfunc > > +.endm > > + > > .irp len 16,8,4 > > put_vp8_bilin_h_v \len h a5 > > put_vp8_bilin_h_v \len v a6 > > @@ -244,4 +315,8 @@ epel \len 6 h > > epel \len 4 h > > epel \len 6 v > > epel \len 4 v > > +epel_hv \len 6 6 > > +epel_hv \len 4 4 > > +epel_hv \len 6 4 > > +epel_hv \len 4 6 > > .endr > > > -- > 雷米‧德尼-库尔蒙 > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c index dc3e087f01..463c8fa0a2 100644 --- a/libavcodec/riscv/vp8dsp_init.c +++ b/libavcodec/riscv/vp8dsp_init.c @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c) c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv; c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv; c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv; + + c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv; + c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv; + c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv; + c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv; + c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv; + c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv; + c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv; + c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv; + c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv; + c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv; + c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv; + c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv; } #endif #endif diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index bf268e4d8d..baa8152830 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -161,26 +161,26 @@ const subpel_filters .byte 0, -1, 12, 123, -6, 0 endconst -.macro epel_filter size type - lla t2, subpel_filters +.macro epel_filter size type regtype + lla \regtype\()2, subpel_filters .ifc \type,v - addi t0, a6, -1 + addi \regtype\()0, a6, -1 .elseif \type == h - addi t0, a5, -1 + addi \regtype\()0, a5, -1 .endif - li t1, 6 - mul t0, t0, t1 - add t0, t0, t2 + li \regtype\()1, 6 + mul \regtype\()0, \regtype\()0, \regtype\()1 + add \regtype\()0, \regtype\()0, \regtype\()2 .irp n 1,2,3,4 - lb t\n, \n(t0) + lb 
\regtype\n, \n(\regtype\()0) .endr .ifc \size,6 - lb t5, 5(t0) - lb t0, (t0) + lb \regtype\()5, 5(\regtype\()0) + lb \regtype\()0, (\regtype\()0) .endif .endm -.macro epel_load dst len size type +.macro epel_load dst len size type from_mem regtype .ifc \type,v mv a5, a3 .else @@ -189,24 +189,35 @@ endconst sub t6, a2, a5 add a7, a2, a5 +.if \from_mem vle8.v v24, (a2) vle8.v v22, (t6) vle8.v v26, (a7) add a7, a7, a5 vle8.v v28, (a7) - vwmulu.vx v16, v24, t2 - vwmulu.vx v20, v26, t3 + vwmulu.vx v16, v24, \regtype\()2 + vwmulu.vx v20, v26, \regtype\()3 .ifc \size,6 sub t6, t6, a5 add a7, a7, a5 vle8.v v24, (t6) vle8.v v26, (a7) - vwmaccu.vx v16, t0, v24 - vwmaccu.vx v16, t5, v26 + vwmaccu.vx v16, \regtype\()0, v24 + vwmaccu.vx v16, \regtype\()5, v26 +.endif + vwmaccsu.vx v16, \regtype\()1, v22 + vwmaccsu.vx v16, \regtype\()4, v28 +.else + vwmulu.vx v16, v4, \regtype\()2 + vwmulu.vx v20, v6, \regtype\()3 + .ifc \size,6 + vwmaccu.vx v16, \regtype\()0, v0 + vwmaccu.vx v16, \regtype\()5, v10 + .endif + vwmaccsu.vx v16, \regtype\()1, v2 + vwmaccsu.vx v16, \regtype\()4, v8 .endif li t6, 64 - vwmaccsu.vx v16, t1, v22 - vwmaccsu.vx v16, t4, v28 vwadd.wx v16, v16, t6 vsetvlstatic16 \len vwadd.vv v24, v16, v20 @@ -216,18 +227,18 @@ endconst vnclipu.wi \dst, v24, 0 .endm -.macro epel_load_inc dst len size type - epel_load \dst \len \size \type +.macro epel_load_inc dst len size type from_mem regtype + epel_load \dst \len \size \type \from_mem \regtype add a2, a2, a3 .endm .macro epel len size type func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x - epel_filter \size \type + epel_filter \size \type t vsetvlstatic8 \len 1: addi a4, a4, -1 - epel_load_inc v30 \len \size \type + epel_load_inc v30 \len \size \type 1 t vse8.v v30, (a0) add a0, a0, a1 bnez a4, 1b @@ -236,6 +247,66 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x endfunc .endm +.macro epel_hv len hsize vsize +func ff_put_vp8_epel\len\()_h\hsize\()v\vsize\()_rvv, zve32x + addi sp, sp, -48 + .irp n 
0,1,2,3,4,5 +#if __riscv_xlen >= 64 + sd s\n, \n\()<<3(sp) +#else + sw s\n, \n\()<<3(sp) +#endif + .endr + sub a2, a2, a3 + epel_filter \hsize h t + epel_filter \vsize v s + vsetvlstatic8 \len +.if \hsize == 6 || \vsize == 6 + sub a2, a2, a3 + epel_load_inc v0 \len \hsize h 1 t +.endif + epel_load_inc v2 \len \hsize h 1 t + epel_load_inc v4 \len \hsize h 1 t + epel_load_inc v6 \len \hsize h 1 t + epel_load_inc v8 \len \hsize h 1 t +.if \hsize == 6 || \vsize == 6 + epel_load_inc v10 \len \hsize h 1 t +.endif + addi a4, a4, -1 +1: + addi a4, a4, -1 + epel_load v30 \len \vsize v 0 s + vse8.v v30, (a0) +.if \hsize == 6 || \vsize == 6 + vmv.v.v v0, v2 +.endif + vmv.v.v v2, v4 + vmv.v.v v4, v6 + vmv.v.v v6, v8 +.if \hsize == 6 || \vsize == 6 + vmv.v.v v8, v10 + epel_load_inc v10 \len \hsize h 1 t +.else + epel_load_inc v8 \len 4 h 1 t +.endif + add a0, a0, a1 + bnez a4, 1b + epel_load v30 \len \vsize v 0 s + vse8.v v30, (a0) + + .irp n 0,1,2,3,4,5 +#if __riscv_xlen >= 64 + ld s\n, \n\()<<3(sp) +#else + lw s\n, \n\()<<3(sp) +#endif + .endr + addi sp, sp, 48 + + ret +endfunc +.endm + .irp len 16,8,4 put_vp8_bilin_h_v \len h a5 put_vp8_bilin_h_v \len v a6 @@ -244,4 +315,8 @@ epel \len 6 h epel \len 4 h epel \len 6 v epel \len 4 v +epel_hv \len 6 6 +epel_hv \len 4 4 +epel_hv \len 6 4 +epel_hv \len 4 6 .endr
From: sunyuechi <sunyuechi@iscas.ac.cn> C908: vp8_put_epel4_h4v4_c: 20.0 vp8_put_epel4_h4v4_rvv_i32: 11.0 vp8_put_epel4_h4v6_c: 25.2 vp8_put_epel4_h4v6_rvv_i32: 13.5 vp8_put_epel4_h6v4_c: 22.2 vp8_put_epel4_h6v4_rvv_i32: 14.5 vp8_put_epel4_h6v6_c: 29.0 vp8_put_epel4_h6v6_rvv_i32: 15.7 vp8_put_epel8_h4v4_c: 73.0 vp8_put_epel8_h4v4_rvv_i32: 22.2 vp8_put_epel8_h4v6_c: 90.5 vp8_put_epel8_h4v6_rvv_i32: 26.7 vp8_put_epel8_h6v4_c: 85.0 vp8_put_epel8_h6v4_rvv_i32: 27.2 vp8_put_epel8_h6v6_c: 104.7 vp8_put_epel8_h6v6_rvv_i32: 29.5 vp8_put_epel16_h4v4_c: 145.5 vp8_put_epel16_h4v4_rvv_i32: 26.5 vp8_put_epel16_h4v6_c: 190.7 vp8_put_epel16_h4v6_rvv_i32: 47.5 vp8_put_epel16_h6v4_c: 173.7 vp8_put_epel16_h6v4_rvv_i32: 33.2 vp8_put_epel16_h6v6_c: 222.2 vp8_put_epel16_h6v6_rvv_i32: 35.5 --- libavcodec/riscv/vp8dsp_init.c | 13 ++++ libavcodec/riscv/vp8dsp_rvv.S | 117 +++++++++++++++++++++++++++------ 2 files changed, 109 insertions(+), 21 deletions(-)