@@ -214,7 +214,7 @@ endfunc
lh \regtype\()0, 0(\regtype\()0)
.endm
-.macro epel_load dst, len, op, name, type, from_mem, regtype
+.macro epel_load dst, len, op, name, type, from_mem, regtype, t1=v0, t2=v2, t3=v4, t4=v6, t5=v8, t6=v10, t7=v12, t8=v14
.ifc \from_mem, 1
vle8.v v22, (a2)
.ifc \type,v
@@ -284,25 +284,25 @@ endfunc
vwmulu.vx v20, v22, \regtype\()3
.else
.ifc \name,smooth
- vwmulu.vx v16, v8, \regtype\()4
- vwmaccu.vx v16, \regtype\()2, v4
- vwmaccu.vx v16, \regtype\()5, v10
- vwmaccsu.vx v16, \regtype\()6, v12
- vwmaccsu.vx v16, \regtype\()1, v2
+ vwmulu.vx v16, \t5, \regtype\()4
+ vwmaccu.vx v16, \regtype\()2, \t3
+ vwmaccu.vx v16, \regtype\()5, \t6
+ vwmaccsu.vx v16, \regtype\()6, \t7
+ vwmaccsu.vx v16, \regtype\()1, \t2
.else
- vwmulu.vx v16, v2, \regtype\()1
- vwmaccu.vx v16, \regtype\()6, v12
- vwmaccsu.vx v16, \regtype\()5, v10
- vwmaccsu.vx v16, \regtype\()2, v4
- vwmulu.vx v28, v8, \regtype\()4
+ vwmulu.vx v16, \t2, \regtype\()1
+ vwmaccu.vx v16, \regtype\()6, \t7
+ vwmaccsu.vx v16, \regtype\()5, \t6
+ vwmaccsu.vx v16, \regtype\()2, \t3
+ vwmulu.vx v28, \t5, \regtype\()4
.endif
- vwmaccsu.vx v16, \regtype\()0, v0
- vwmulu.vx v20, v6, \regtype\()3
+ vwmaccsu.vx v16, \regtype\()0, \t1
+ vwmulu.vx v20, \t4, \regtype\()3
.ifc \regtype,t
- vwmaccsu.vx v16, a7, v14
+ vwmaccsu.vx v16, a7, \t8
.else
- vwmaccsu.vx v16, s7, v14
+ vwmaccsu.vx v16, s7, \t8
.endif
.endif
@@ -355,6 +355,76 @@ func ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x, zba
endfunc
.endm
+#if __riscv_xlen == 64
+.macro epel_hv_body len,op,name,t1,t2,t3,t4,t5,t6,t7,t8 // emit one output row, then refill the oldest h-result slot
+ epel_load v30, \len, \op, \name, t, 0, s, \t1,\t2,\t3,\t4,\t5,\t6,\t7,\t8 // vertical pass over buffered rows (from_mem=0, s-register taps; \t1=oldest..\t8=newest)
+ vse8.v v30, (a0) // store the filtered row to dst
+ epel_load \t1, \len, put, \name, h, 1, t // horizontal pass of the next source row into \t1 (oldest slot, now free)
+ add a2, a2, a3 // src += srcstride
+ add a0, a0, a1 // dst += dststride
+.endm
+
+.macro epel_hv_once len, name, op // full hv filter pass: prime 8 h-filtered rows, then loop emitting rows
+ sub a2, a2, a3 // back src up 3 rows so the 8-tap vertical window
+ sub a2, a2, a3 // covers rows -3..+4 around the first output row
+ sub a2, a2, a3 // (presumably matches the epel v-filter alignment — verify)
+ .irp n,0,2,4,6,8,10,12,14
+ epel_load_inc v\n, \len, put, \name, h, 1, t // preload 8 horizontally-filtered rows into v0..v14 (pairs)
+ .endr
+ addi a4, a4, -1 // bias height so bgtz exits after the final full batch
+1:
+ addi a4, a4, -8 // each loop iteration consumes up to 8 rows of height
+ epel_hv_body \len,\op,\name,v0,v2,v4,v6,v8,v10,v12,v14 // register ring rotates by one
+ epel_hv_body \len,\op,\name,v2,v4,v6,v8,v10,v12,v14,v0 // row-pair per body call
+ epel_hv_body \len,\op,\name,v4,v6,v8,v10,v12,v14,v0,v2
+ epel_hv_body \len,\op,\name,v6,v8,v10,v12,v14,v0,v2,v4
+.if \len > 4 // NOTE(review): with \len == 4 only 4 rows are emitted per iteration
+ epel_hv_body \len,\op,\name,v8,v10,v12,v14,v0,v2,v4,v6 // while a4 still steps by 8 — confirm the 4x8 block size case
+ epel_hv_body \len,\op,\name,v10,v12,v14,v0,v2,v4,v6,v8
+ epel_hv_body \len,\op,\name,v12,v14,v0,v2,v4,v6,v8,v10
+ epel_hv_body \len,\op,\name,v14,v0,v2,v4,v6,v8,v10,v12 // ring returns to v0-first for the next iteration
+.endif
+ bgtz a4, 1b
+.endm
+
+.macro epel_hv op, name, len, vlen // generate ff_{put,avg}_vp9_8tap_<name>_<len>hv_rvv<vlen>
+func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x, zba
+ lpad 0
+ addi sp, sp, -64 // spill s0-s7: vertical filter taps live in s-registers
+ .irp n,0,1,2,3,4,5,6,7
+ sd s\n, \n\()<<3(sp)
+ .endr
+.if \len == 64 && \vlen < 256 // 64-wide needs two 32-wide passes when VLEN < 256
+ addi sp, sp, -48 // save a0-a5 so the second pass can rebuild its arguments
+ .irp n,0,1,2,3,4,5
+ sd a\n, \n\()<<3(sp)
+ .endr
+.endif
+ csrwi vxrm, 0 // fixed-point rounding mode 0 (rnu) for vector narrowing ops
+ epel_filter \name, h, t, a7 // horizontal taps -> t-registers (+a7 for tap 7, see epel_load)
+ epel_filter \name, v, s, s7 // vertical taps -> s-registers (+s7)
+ vsetvlstatic8 \len, a6, 64, m2
+ epel_hv_once \len, \name, \op // first (or only) column pass
+.if \len == 64 && \vlen < 256
+ .irp n,0,1,2,3,4,5
+ ld a\n, \n\()<<3(sp) // restore original dst/src/strides/height
+ .endr
+ addi sp, sp, 48
+ addi a0, a0, 32 // advance dst and src to the right 32-pixel half
+ addi a2, a2, 32
+ epel_filter \name, h, t, a7 // t-registers were clobbered; rebuild h taps (s-regs kept)
+ epel_hv_once \len, \name, \op // second column pass
+.endif
+ .irp n,0,1,2,3,4,5,6,7
+ ld s\n, \n\()<<3(sp) // restore callee-saved registers
+ .endr
+ addi sp, sp, 64
+
+ ret
+endfunc
+.endm
+#endif
+
.irp len, 64, 32, 16, 8, 4
copy_avg \len
.irp op, put, avg
@@ -363,6 +433,10 @@ endfunc
epel \len, \op, \name, \type, 128
epel \len, \op, \name, \type, 256
.endr
+ #if __riscv_xlen == 64
+ epel_hv \op, \name, \len, 128
+ epel_hv \op, \name, \len, 256
+ #endif
.endr
.endr
.endr
@@ -118,6 +118,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
init_subpel2(1, 1, 0, h, avg, 128);
init_subpel2(0, 0, 1, v, put, 128);
init_subpel2(1, 0, 1, v, avg, 128);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 128);
+ init_subpel2(1, 1, 1, hv, avg, 128);
+# endif
}
}
if (vlenb >= 32) {
@@ -126,6 +130,10 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext *dsp, int bpp)
init_subpel2(1, 1, 0, h, avg, 256);
init_subpel2(0, 0, 1, v, put, 256);
init_subpel2(1, 0, 1, v, avg, 256);
+# if __riscv_xlen == 64
+ init_subpel2(0, 1, 1, hv, put, 256);
+ init_subpel2(1, 1, 1, hv, avg, 256);
+# endif
}
}
}
From: sunyuechi <sunyuechi@iscas.ac.cn>

                                                     C908       X60
vp9_avg_8tap_smooth_4hv_8bpp_c:                      32.0       28.0
vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32:                15.0       13.2
vp9_avg_8tap_smooth_8hv_8bpp_c:                      98.0       86.2
vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32:                23.7       21.2
vp9_avg_8tap_smooth_16hv_8bpp_c:                    355.7      297.0
vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32:               47.0       41.5
vp9_avg_8tap_smooth_32hv_8bpp_c:                   1272.7     1099.7
vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32:              134.7      119.7
vp9_avg_8tap_smooth_64hv_8bpp_c:                   4937.0     4224.2
vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32:              528.5      228.5
vp9_put_8tap_smooth_4hv_8bpp_c:                      30.2       26.7
vp9_put_8tap_smooth_4hv_8bpp_rvv_i32:                30.5       12.5
vp9_put_8tap_smooth_8hv_8bpp_c:                      91.5       81.2
vp9_put_8tap_smooth_8hv_8bpp_rvv_i32:                22.7       20.2
vp9_put_8tap_smooth_16hv_8bpp_c:                    313.2      277.5
vp9_put_8tap_smooth_16hv_8bpp_rvv_i32:               45.2       40.2
vp9_put_8tap_smooth_32hv_8bpp_c:                   1166.7     1022.2
vp9_put_8tap_smooth_32hv_8bpp_rvv_i32:              131.7      117.2
vp9_put_8tap_smooth_64hv_8bpp_c:                   4560.5     3961.7
vp9_put_8tap_smooth_64hv_8bpp_rvv_i32:              517.0      223.2
---
 libavcodec/riscv/vp9_mc_rvv.S  | 104 ++++++++++++++++++++++++++++-----
 libavcodec/riscv/vp9dsp_init.c |   8 +++
 2 files changed, 97 insertions(+), 15 deletions(-)