Message ID: 20240813140338.143045-7-jdek@itanimul.li
State: New
Series: [FFmpeg-devel,1/7] checkasm: add csv/tsv bench output

Context | Check | Description
---|---|---
yinshiyou/make_loongarch64 | success | Make finished
yinshiyou/make_fate_loongarch64 | success | Make fate finished
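For context, the filter implemented throughout this patch is the standard H.264 six-tap half-pel (qpel) lowpass. In scalar form it is roughly:

    /* dst[x] = clip_uint8((src[x-2] - 5*src[x-1] + 20*src[x]
     *                      + 20*src[x+1] - 5*src[x+2] + src[x+3] + 16) >> 5) */

The w0 = 20 and w1 = -5 constants loaded in the code below are these filter taps; vxrm = 0 (round-to-nearest-up) together with the narrowing clips implements the (x + 16) >> 5 rounding, and the hv path keeps the unclipped 16-bit intermediate and narrows with (x + 512) >> 10 instead.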
Le tiistaina 13. elokuuta 2024, 17.03.36 EEST J. Dekker a écrit : > +#include "libavutil/riscv/asm.S" > + > +.macro vnclipsu.wi shifti, lmul, lmul2, vregs:vararg > + vsetvli zero, zero, e16, \lmul2, ta, ma We don't typically do that for a very good reason. The vsetvli is most often redundant and nonobviously so. (Also in some cases, there are better ways to do signed-to-unsigned clip.) > + .irp x, \vregs > + vmax.vx \x, \x, zero > + .endr > + vsetvli zero, zero, e8, \lmul, ta, ma > + .irp x, \vregs > + vnclipu.wi \x, \x, \shifti > + .endr > +.endm > + > +.macro lowpass_init lmul, sizei, size, w0, w1, backup This is needlessly convoluted. In fact, backup is not even used, which kind of highlights the point. > + vsetivli zero, \sizei, e8, \lmul, ta, ma > + csrwi vxrm, 0 > + li \size, \sizei > + .ifnb \w0 > + li \w0, 20 > + li \w1, -5 > + .endif > +.endm > + > + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ > +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 > + addi \tmp, \src, 3 > + lbu \tmp2, 2(\src) > + vle8.v v31, (\tmp) > + lbu \tmp, 1(\src) > + vslide1up.vx v30, v31, \tmp2 > + lbu \tmp2, 0(\src) > + vslide1up.vx v29, v30, \tmp > + lbu \tmp, -1(\src) > + vslide1up.vx v28, v29, \tmp2 > + lbu \tmp2, -2(\src) > + vslide1up.vx v27, v28, \tmp > + vslide1up.vx v26, v27, \tmp2 That's a lot of sequentially dependent vector instructions to save zero- extending v31 before the MACs. Are you sure it's faster that way? > + vwaddu.vv \vdst, v26, v31 > + vwmaccu.vx \vdst, \w0, v28 > + vwmaccu.vx \vdst, \w0, v29 > + vwmaccsu.vx \vdst, \w1, v27 > + vwmaccsu.vx \vdst, \w1, v30 > +.endm > + > + /* output is unclipped */ > +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, > vsrc5, signed=0 > + .if \signed > + vwadd.vv \vdst, \vsrc0, \vsrc5 > + vwmacc.vx \vdst, \w0, \vsrc2 > + vwmacc.vx \vdst, \w0, \vsrc3 > + vwmacc.vx \vdst, \w1, \vsrc1 > + vwmacc.vx \vdst, \w1, \vsrc4 > + .else > + vwaddu.vv \vdst, \vsrc0, \vsrc5 > + vwmaccu.vx \vdst, \w0, \vsrc2 > + vwmaccu.vx \vdst, \w0, \vsrc3 > + vwmaccsu.vx \vdst, \w1, \vsrc1 > + vwmaccsu.vx \vdst, \w1, \vsrc4 > + .endif > +.endm > + > +.macro qpel_mc00 op, dst, src, stride, size > +func ff_\op\()_h264_qpel_pixels, zve32x > +1: > + add t0, \stride, \src > + add t1, \stride, t0 > + add t2, \stride, t1 > + vle8.v v0, (\src) > + vle8.v v1, (t0) > + vle8.v v2, (t1) > + vle8.v v3, (t2) > + addi \size, \size, -4 > + add \src, \stride, t2 > + add t0, \stride, \dst > + add t1, \stride, t0 > + add t2, \stride, t1 > + .ifc \op, avg > + vle8.v v4, (\dst) > + vle8.v v5, (t0) > + vle8.v v6, (t1) > + vle8.v v7, (t2) > + vaaddu.vv v0, v0, v4 > + vaaddu.vv v1, v1, v5 > + vaaddu.vv v2, v2, v6 > + vaaddu.vv v3, v3, v7 > + .endif > + vse8.v v0, (\dst) > + vse8.v v1, (t0) > + vse8.v v2, (t1) > + vse8.v v3, (t2) > + add \dst, \stride, t2 > + bnez \size, 1b > + ret > +endfunc > +.endm > + > + qpel_mc00 put, a0, a1, a2, a4 > + qpel_mc00 avg, a0, a1, a2, a4 Please don't add constant macro parameters. 
> + > +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, > src_stride, size, w0, w1, src2, src2_stride > +func > ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x > +1: > + add t0, \src_stride, \src > + add t1, \src_stride, t0 > + add t2, \src_stride, t1 > + lowpass_h v0, \src, \w0, \w1 > + lowpass_h v2, t0, \w0, \w1 > + lowpass_h v4, t1, \w0, \w1 > + lowpass_h v6, t2, \w0, \w1 > + add \src, \src_stride, t2 > + addi \size, \size, -4 > + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 > + .ifnb \src2 > + add t0, \src2_stride, \src2 > + add t1, \src2_stride, t0 > + add t2, \src2_stride, t1 > + vle8.v v8, (\src2) > + vle8.v v10, (t0) > + vle8.v v12, (t1) > + vle8.v v14, (t2) > + add \src2, \dst_stride, t2 > + vaaddu.vv v0, v0, v8 > + vaaddu.vv v2, v2, v10 > + vaaddu.vv v4, v4, v12 > + vaaddu.vv v6, v6, v14 > + .endif > + add t0, \dst_stride, \dst > + add t1, \dst_stride, t0 > + add t2, \dst_stride, t1 > + .ifc \op, avg > + vle8.v v1, (\dst) > + vle8.v v3, (t0) > + vle8.v v5, (t1) > + vle8.v v7, (t2) > + vaaddu.vv v0, v0, v1 > + vaaddu.vv v2, v2, v3 > + vaaddu.vv v4, v4, v5 > + vaaddu.vv v6, v6, v7 > + .endif > + vse8.v v0, (\dst) > + vse8.v v2, (t0) > + vse8.v v4, (t1) > + vse8.v v6, (t2) > + add \dst, \dst_stride, t2 > + bnez \size, 1b > + ret > +endfunc > + > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x > + sub t0, \src, \src_stride > + sub t1, t0, \src_stride > + vle8.v v2, (\src) > + vle8.v v1, (t0) > + vle8.v v0, (t1) > + add t0, \src, \src_stride > + add t1, t0, \src_stride > + add \src, t1, \src_stride > + vle8.v v3, (t0) > + vle8.v v4, (t1) > +1: > + add t0, \src_stride, \src > + add t1, \src_stride, t0 > + add t2, \src_stride, t1 > + vle8.v v5, (\src) > + vle8.v v6, (t0) > + vle8.v v7, (t1) > + vle8.v v8, (t2) > + add \src, \src_stride, t2 > + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 > + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 > + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 > + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 > + addi \size, \size, -4 > + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 > + .ifnb \src2 > + add t0, \src2_stride, \src2 > + add t1, \src2_stride, t0 > + add t2, \src2_stride, t1 > + vle8.v v9, (\src2) > + vle8.v v10, (t0) > + vle8.v v11, (t1) > + vle8.v v12, (t2) > + add \src2, \src2_stride, t2 > + vaaddu.vv v24, v24, v9 > + vaaddu.vv v26, v26, v10 > + vaaddu.vv v28, v28, v11 > + vaaddu.vv v30, v30, v12 > + .endif > + add t0, \dst_stride, \dst > + add t1, \dst_stride, t0 > + add t2, \dst_stride, t1 > + .ifc \op, avg > + vle8.v v9, (\dst) > + vle8.v v10, (t0) > + vle8.v v11, (t1) > + vle8.v v12, (t2) > + vaaddu.vv v24, v24, v9 > + vaaddu.vv v26, v26, v10 > + vaaddu.vv v28, v28, v11 > + vaaddu.vv v30, v30, v12 > + .endif > + vse8.v v24, (\dst) > + vse8.v v26, (t0) > + vse8.v v28, (t1) > + vse8.v v30, (t2) > + add \dst, \dst_stride, t2 > + vmv.v.v v0, v4 > + vmv.v.v v1, v5 > + vmv.v.v v2, v6 > + vmv.v.v v3, v7 > + vmv.v.v v4, v8 At this point, any vector move without rationale is an automatic -1 from me. 
> + bnez \size, 1b > + ret > +endfunc > + > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x > + sub t0, \src, \src_stride > + sub t1, t0, \src_stride > + lowpass_h v4, \src, \w0, \w1 > + lowpass_h v2, t0, \w0, \w1 > + lowpass_h v0, t1, \w0, \w1 > + add t0, \src, \src_stride > + add t1, t0, \src_stride > + add \src, t1, \src_stride > + lowpass_h v6, t0, \w0, \w1 > + lowpass_h v8, t1, \w0, \w1 > +1: > + add t0, \src_stride, \src > + add t1, \src_stride, t0 > + add t2, \src_stride, t1 > + lowpass_h v10, \src, \w0, \w1 > + lowpass_h v12, t0, \w0, \w1 > + lowpass_h v14, t1, \w0, \w1 > + lowpass_h v16, t2, \w0, \w1 > + vsetvli zero, zero, e16, \lmul2, ta, ma > + addi \size, \size, -4 > + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, > signed=1 > + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, > v12, signed=1 > + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, > v12, v14, signed=1 > + vnclip.wi v0, v20, 10 > + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, > signed= > + vnclip.wi v2, v24, 10 > + vnclip.wi v4, v28, 10 > + vnclip.wi v6, v20, 10 > + vmax.vx v18, v0, zero > + vmax.vx v20, v2, zero > + vmax.vx v22, v4, zero > + vmax.vx v24, v6, zero > + vmv.v.v v0, v8 > + vmv.v.v v2, v10 > + vmv.v.v v4, v12 > + vmv.v.v v6, v14 > + vmv.v.v v8, v16 > + add \src, \src_stride, t2 > + vsetvli zero, zero, e8, \lmul, ta, ma > + vnclipu.wi v18, v18, 0 > + vnclipu.wi v20, v20, 0 > + vnclipu.wi v22, v22, 0 > + vnclipu.wi v24, v24, 0 > + .ifnb \src2 > + add t0, \src2_stride, \src2 > + add t1, \src2_stride, t0 > + add t2, \src2_stride, t1 > + vle8.v v26, (\src2) > + vle8.v v27, (t0) > + vle8.v v28, (t1) > + vle8.v v29, (t2) > + add \src2, \src2_stride, t2 > + vaaddu.vv v18, v18, v26 > + vaaddu.vv v20, v20, v27 > + vaaddu.vv v22, v22, v28 > + vaaddu.vv v24, v24, v29 > + .endif > + add t0, \dst_stride, \dst > + add t1, \dst_stride, t0 > + add t2, \dst_stride, t1 > + .ifc \op, avg > + vle8.v v26, (\dst) > + vle8.v v27, (t0) > + vle8.v v28, (t1) > + vle8.v v29, (t2) > + vaaddu.vv v18, v18, v26 > + vaaddu.vv v20, v20, v27 > + vaaddu.vv v22, v22, v28 > + vaaddu.vv v24, v24, v29 > + .endif > + vse8.v v18, (\dst) > + vse8.v v20, (t0) > + vse8.v v22, (t1) > + vse8.v v24, (t2) > + add \dst, \dst_stride, t2 > + bnez \size, 1b > + ret > +endfunc > +.endm > + > +/* Note: We could possibly specialize for the width 8 / width 4 cases by > + loading 32 bit integers, but this makes the convolutions more > complicated + to implement, so it's not necessarily any faster. 
*/ > + > +.macro h264_qpel lmul, lmul2 > + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, > t6 > + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, > t5, t6, a5, a6 > + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, > a2, a3, a4, t5, t6 > + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, > a1, a2, a3, a4, t5, t6, a5, a6 > +.endm > + > + h264_qpel m1, m2 > + h264_qpel mf2, m1 > + h264_qpel mf4, mf2 > + h264_qpel mf8, mf4 > + > +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, > src_stride, size, w0, w1, src2, src2_stride, tmp > +func > ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, > + j ff_\op\()_h264_qpel_pixels > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + mv \src2, \src > + mv \src2_stride, \src_stride > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + addi \src2, \src, 1 > + mv \src2_stride, \src_stride > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + mv \src2, \src > + mv \src2_stride, \src_stride > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + j ff_\op\()_h264_qpel_v_lowpass_\lmul > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + add \src2, \src, \src_stride > + mv \src2_stride, \src_stride > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src It's all but impossible to tell if spilling is actually necessary when you alias registers like this. > + mv \tmp, ra Use t0 for subprocedure return. See specs. 
> + mv \src_stride, \dst_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul You can use jal here > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + addi \src, \src, 1 > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + add \src, \src, \src_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + add \src, \src, \src_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + addi \src, \src, 1 > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + mv \src_stride, \dst_stride > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + add \src, \src, \src_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_h_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + mv \src_stride, \dst_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_v_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv 
\src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > +endfunc > + > +func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > + push \dst, \src > + mv \tmp, ra > + addi \src, \src, 1 > + mv \src_stride, \dst_stride > + addi \dst, sp, -(\sizei * \sizei) > + li \dst_stride, \sizei > + call ff_put_h264_qpel_v_lowpass_\lmul > + addi \src2, sp, -(\sizei * \sizei) > + mv \src2_stride, \dst_stride > + pop \dst, \src > + mv \dst_stride, \src_stride > + li \size, \sizei > + mv ra, \tmp > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > +endfunc > +.endm > + > + ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, > a5, a6, a7 > + ff_h264_qpel_fns put, mf4, 8, rvv256, a0, a1, a2, a3, > a4, t5, t6, a5, a6, a7 > + ff_h264_qpel_fns put, mf8, 4, rvv256, a0, > a1, a2, a3, a4, t5, t6, a5, a6, a7 > + > + ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, > a5, a6, a7 > + ff_h264_qpel_fns avg, mf4, 8, rvv256, a0, a1, a2, a3, > a4, t5, t6, a5, a6, a7 > + ff_h264_qpel_fns avg, mf8, 4, rvv256, a0, > a1, a2, a3, a4, t5, t6, a5, a6, a7 > + > + ff_h264_qpel_fns put, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, > a5, a6, a7 > + ff_h264_qpel_fns put, mf2, 8, rvv, a0, a1, a2, a3, > a4, t5, t6, a5, a6, a7 > + ff_h264_qpel_fns put, mf4, 4, rvv, a0, > a1, a2, a3, a4, t5, t6, a5, a6, a7 > + > + ff_h264_qpel_fns avg, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, > a5, a6, a7 > + ff_h264_qpel_fns avg, mf2, 8, rvv, a0, a1, a2, a3, > a4, t5, t6, a5, a6, a7 > + ff_h264_qpel_fns avg, mf4, 4, rvv, a0, > a1, a2, a3, a4, t5, t6, a5, a6, a7
On Mon, 19 Aug 2024 21:27:38 +0300 Rémi Denis-Courmont <remi@remlab.net> wrote: > Le tiistaina 13. elokuuta 2024, 17.03.36 EEST J. Dekker a écrit : > > +#include "libavutil/riscv/asm.S" > > + > > +.macro vnclipsu.wi shifti, lmul, lmul2, vregs:vararg > > + vsetvli zero, zero, e16, \lmul2, ta, ma > > We don't typically do that for a very good reason. The vsetvli is most often > redundant and nonobviously so. (Also in some cases, there are better ways to > do signed-to-unsigned clip.) I could rename the macro to discourage its use outside of this function, then. > > > + .irp x, \vregs > > + vmax.vx \x, \x, zero > > + .endr > > + vsetvli zero, zero, e8, \lmul, ta, ma > > + .irp x, \vregs > > + vnclipu.wi \x, \x, \shifti > > + .endr > > +.endm > > + > > +.macro lowpass_init lmul, sizei, size, w0, w1, backup > > This is needlessly convoluted. In fact, backup is not even used, which kind of > highlights the point. That parameter was simply left over from a previous version of the code. Are you suggesting we simply duplicate the contents of this macro into all of the functions that use it? > > > + vsetivli zero, \sizei, e8, \lmul, ta, ma > > + csrwi vxrm, 0 > > + li \size, \sizei > > + .ifnb \w0 > > + li \w0, 20 > > + li \w1, -5 > > + .endif > > +.endm > > + > > + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ > > +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 > > + addi \tmp, \src, 3 > > + lbu \tmp2, 2(\src) > > + vle8.v v31, (\tmp) > > + lbu \tmp, 1(\src) > > + vslide1up.vx v30, v31, \tmp2 > > + lbu \tmp2, 0(\src) > > + vslide1up.vx v29, v30, \tmp > > + lbu \tmp, -1(\src) > > + vslide1up.vx v28, v29, \tmp2 > > + lbu \tmp2, -2(\src) > > + vslide1up.vx v27, v28, \tmp > > + vslide1up.vx v26, v27, \tmp2 > > That's a lot of sequentially dependent vector instructions to save zero- > extending v31 before the MACs. Are you sure it's faster that way? I'm not sure what you mean. How would your alternative implementation look? It's certainly possible to make these instructions less sequential by emitting multiple `lbu` instructions instead of sliding up. 
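For reference, the six registers that the slide chain in lowpass_h produces are simply the shifted source windows of the six-tap filter:

    /* v26 = src[-2 ..]   v27 = src[-1 ..]   v28 = src[ 0 ..]
       v29 = src[ 1 ..]   v30 = src[ 2 ..]   v31 = src[ 3 ..] */

so v28/v29 take the +20 taps, v27/v30 the -5 taps, and v26/v31 the +1 taps in the accumulation that follows.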
> > > + vwaddu.vv \vdst, v26, v31 > > + vwmaccu.vx \vdst, \w0, v28 > > + vwmaccu.vx \vdst, \w0, v29 > > + vwmaccsu.vx \vdst, \w1, v27 > > + vwmaccsu.vx \vdst, \w1, v30 > > +.endm > > + > > + /* output is unclipped */ > > +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, > > vsrc5, signed=0 > > + .if \signed > > + vwadd.vv \vdst, \vsrc0, \vsrc5 > > + vwmacc.vx \vdst, \w0, \vsrc2 > > + vwmacc.vx \vdst, \w0, \vsrc3 > > + vwmacc.vx \vdst, \w1, \vsrc1 > > + vwmacc.vx \vdst, \w1, \vsrc4 > > + .else > > + vwaddu.vv \vdst, \vsrc0, \vsrc5 > > + vwmaccu.vx \vdst, \w0, \vsrc2 > > + vwmaccu.vx \vdst, \w0, \vsrc3 > > + vwmaccsu.vx \vdst, \w1, \vsrc1 > > + vwmaccsu.vx \vdst, \w1, \vsrc4 > > + .endif > > +.endm > > + > > +.macro qpel_mc00 op, dst, src, stride, size > > +func ff_\op\()_h264_qpel_pixels, zve32x > > +1: > > + add t0, \stride, \src > > + add t1, \stride, t0 > > + add t2, \stride, t1 > > + vle8.v v0, (\src) > > + vle8.v v1, (t0) > > + vle8.v v2, (t1) > > + vle8.v v3, (t2) > > + addi \size, \size, -4 > > + add \src, \stride, t2 > > + add t0, \stride, \dst > > + add t1, \stride, t0 > > + add t2, \stride, t1 > > + .ifc \op, avg > > + vle8.v v4, (\dst) > > + vle8.v v5, (t0) > > + vle8.v v6, (t1) > > + vle8.v v7, (t2) > > + vaaddu.vv v0, v0, v4 > > + vaaddu.vv v1, v1, v5 > > + vaaddu.vv v2, v2, v6 > > + vaaddu.vv v3, v3, v7 > > + .endif > > + vse8.v v0, (\dst) > > + vse8.v v1, (t0) > > + vse8.v v2, (t1) > > + vse8.v v3, (t2) > > + add \dst, \stride, t2 > > + bnez \size, 1b > > + ret > > +endfunc > > +.endm > > + > > + qpel_mc00 put, a0, a1, a2, a4 > > + qpel_mc00 avg, a0, a1, a2, a4 > > Please don't add constant macro parameters. Why? It makes the code much easier to modify, and arguably also to understand. This design was certainly invaluable during the development process. If you prefer, we could "bake" the result, but at the cost of future refactorability. Given the state of RISC-V hardware, I'd rather leave the code in a state that lends itself more towards future modifications. 
> > > + > > +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, > > src_stride, size, w0, w1, src2, src2_stride > > +func > > ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x > > +1: > > + add t0, \src_stride, \src > > + add t1, \src_stride, t0 > > + add t2, \src_stride, t1 > > + lowpass_h v0, \src, \w0, \w1 > > + lowpass_h v2, t0, \w0, \w1 > > + lowpass_h v4, t1, \w0, \w1 > > + lowpass_h v6, t2, \w0, \w1 > > + add \src, \src_stride, t2 > > + addi \size, \size, -4 > > + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 > > + .ifnb \src2 > > + add t0, \src2_stride, \src2 > > + add t1, \src2_stride, t0 > > + add t2, \src2_stride, t1 > > + vle8.v v8, (\src2) > > + vle8.v v10, (t0) > > + vle8.v v12, (t1) > > + vle8.v v14, (t2) > > + add \src2, \dst_stride, t2 > > + vaaddu.vv v0, v0, v8 > > + vaaddu.vv v2, v2, v10 > > + vaaddu.vv v4, v4, v12 > > + vaaddu.vv v6, v6, v14 > > + .endif > > + add t0, \dst_stride, \dst > > + add t1, \dst_stride, t0 > > + add t2, \dst_stride, t1 > > + .ifc \op, avg > > + vle8.v v1, (\dst) > > + vle8.v v3, (t0) > > + vle8.v v5, (t1) > > + vle8.v v7, (t2) > > + vaaddu.vv v0, v0, v1 > > + vaaddu.vv v2, v2, v3 > > + vaaddu.vv v4, v4, v5 > > + vaaddu.vv v6, v6, v7 > > + .endif > > + vse8.v v0, (\dst) > > + vse8.v v2, (t0) > > + vse8.v v4, (t1) > > + vse8.v v6, (t2) > > + add \dst, \dst_stride, t2 > > + bnez \size, 1b > > + ret > > +endfunc > > + > > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x > > + sub t0, \src, \src_stride > > + sub t1, t0, \src_stride > > + vle8.v v2, (\src) > > + vle8.v v1, (t0) > > + vle8.v v0, (t1) > > + add t0, \src, \src_stride > > + add t1, t0, \src_stride > > + add \src, t1, \src_stride > > + vle8.v v3, (t0) > > + vle8.v v4, (t1) > > +1: > > + add t0, \src_stride, \src > > + add t1, \src_stride, t0 > > + add t2, \src_stride, t1 > > + vle8.v v5, (\src) > > + vle8.v v6, (t0) > > + vle8.v v7, (t1) > > + vle8.v v8, (t2) > > + add \src, \src_stride, t2 > > + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 > > + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 > > + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 > > + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 > > + addi \size, \size, -4 > > + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 > > + .ifnb \src2 > > + add t0, \src2_stride, \src2 > > + add t1, \src2_stride, t0 > > + add t2, \src2_stride, t1 > > + vle8.v v9, (\src2) > > + vle8.v v10, (t0) > > + vle8.v v11, (t1) > > + vle8.v v12, (t2) > > + add \src2, \src2_stride, t2 > > + vaaddu.vv v24, v24, v9 > > + vaaddu.vv v26, v26, v10 > > + vaaddu.vv v28, v28, v11 > > + vaaddu.vv v30, v30, v12 > > + .endif > > + add t0, \dst_stride, \dst > > + add t1, \dst_stride, t0 > > + add t2, \dst_stride, t1 > > + .ifc \op, avg > > + vle8.v v9, (\dst) > > + vle8.v v10, (t0) > > + vle8.v v11, (t1) > > + vle8.v v12, (t2) > > + vaaddu.vv v24, v24, v9 > > + vaaddu.vv v26, v26, v10 > > + vaaddu.vv v28, v28, v11 > > + vaaddu.vv v30, v30, v12 > > + .endif > > + vse8.v v24, (\dst) > > + vse8.v v26, (t0) > > + vse8.v v28, (t1) > > + vse8.v v30, (t2) > > + add \dst, \dst_stride, t2 > > + vmv.v.v v0, v4 > > + vmv.v.v v1, v5 > > + vmv.v.v v2, v6 > > + vmv.v.v v3, v7 > > + vmv.v.v v4, v8 > > At this point, any vector move without rationale is an automatic -1 from me. There is a rationale; the vectors are reused for the next pass of the (unrolled) vertical convolution. 
The only way to eliminate them would be to make a special path for 8x8 that urolls all 8 lines to avoid this vector move, but IMO the gain in performance does not justify the increase in complexity and binary size. > > > + bnez \size, 1b > > + ret > > +endfunc > > + > > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x > > + sub t0, \src, \src_stride > > + sub t1, t0, \src_stride > > + lowpass_h v4, \src, \w0, \w1 > > + lowpass_h v2, t0, \w0, \w1 > > + lowpass_h v0, t1, \w0, \w1 > > + add t0, \src, \src_stride > > + add t1, t0, \src_stride > > + add \src, t1, \src_stride > > + lowpass_h v6, t0, \w0, \w1 > > + lowpass_h v8, t1, \w0, \w1 > > +1: > > + add t0, \src_stride, \src > > + add t1, \src_stride, t0 > > + add t2, \src_stride, t1 > > + lowpass_h v10, \src, \w0, \w1 > > + lowpass_h v12, t0, \w0, \w1 > > + lowpass_h v14, t1, \w0, \w1 > > + lowpass_h v16, t2, \w0, \w1 > > + vsetvli zero, zero, e16, \lmul2, ta, ma > > + addi \size, \size, -4 > > + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, > > signed=1 > > + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, > > v12, signed=1 > > + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, > > v12, v14, signed=1 > > + vnclip.wi v0, v20, 10 > > + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, > > signed= > > + vnclip.wi v2, v24, 10 > > + vnclip.wi v4, v28, 10 > > + vnclip.wi v6, v20, 10 > > + vmax.vx v18, v0, zero > > + vmax.vx v20, v2, zero > > + vmax.vx v22, v4, zero > > + vmax.vx v24, v6, zero > > + vmv.v.v v0, v8 > > + vmv.v.v v2, v10 > > + vmv.v.v v4, v12 > > + vmv.v.v v6, v14 > > + vmv.v.v v8, v16 > > + add \src, \src_stride, t2 > > + vsetvli zero, zero, e8, \lmul, ta, ma > > + vnclipu.wi v18, v18, 0 > > + vnclipu.wi v20, v20, 0 > > + vnclipu.wi v22, v22, 0 > > + vnclipu.wi v24, v24, 0 > > + .ifnb \src2 > > + add t0, \src2_stride, \src2 > > + add t1, \src2_stride, t0 > > + add t2, \src2_stride, t1 > > + vle8.v v26, (\src2) > > + vle8.v v27, (t0) > > + vle8.v v28, (t1) > > + vle8.v v29, (t2) > > + add \src2, \src2_stride, t2 > > + vaaddu.vv v18, v18, v26 > > + vaaddu.vv v20, v20, v27 > > + vaaddu.vv v22, v22, v28 > > + vaaddu.vv v24, v24, v29 > > + .endif > > + add t0, \dst_stride, \dst > > + add t1, \dst_stride, t0 > > + add t2, \dst_stride, t1 > > + .ifc \op, avg > > + vle8.v v26, (\dst) > > + vle8.v v27, (t0) > > + vle8.v v28, (t1) > > + vle8.v v29, (t2) > > + vaaddu.vv v18, v18, v26 > > + vaaddu.vv v20, v20, v27 > > + vaaddu.vv v22, v22, v28 > > + vaaddu.vv v24, v24, v29 > > + .endif > > + vse8.v v18, (\dst) > > + vse8.v v20, (t0) > > + vse8.v v22, (t1) > > + vse8.v v24, (t2) > > + add \dst, \dst_stride, t2 > > + bnez \size, 1b > > + ret > > +endfunc > > +.endm > > + > > +/* Note: We could possibly specialize for the width 8 / width 4 cases by > > + loading 32 bit integers, but this makes the convolutions more > > complicated + to implement, so it's not necessarily any faster. 
*/ > > + > > +.macro h264_qpel lmul, lmul2 > > + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, > > t6 > > + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, > > t5, t6, a5, a6 > > + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, > > a2, a3, a4, t5, t6 > > + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, > > a1, a2, a3, a4, t5, t6, a5, a6 > > +.endm > > + > > + h264_qpel m1, m2 > > + h264_qpel mf2, m1 > > + h264_qpel mf4, mf2 > > + h264_qpel mf8, mf4 > > + > > +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, > > src_stride, size, w0, w1, src2, src2_stride, tmp > > +func > > ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, > > + j ff_\op\()_h264_qpel_pixels > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + mv \src2, \src > > + mv \src2_stride, \src_stride > > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + addi \src2, \src, 1 > > + mv \src2_stride, \src_stride > > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + mv \src2, \src > > + mv \src2_stride, \src_stride > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + add \src2, \src, \src_stride > > + mv \src2_stride, \src_stride > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > It's all but impossible to tell if spilling is actually necessary when you > alias registers like this. > > > + mv \tmp, ra > > Use t0 for subprocedure return. See specs. The subprocedure is sometimes the main procedure. And in any case, we use t0 inside the subprocedure. > > > + mv \src_stride, \dst_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > You can use jal here Shouldn't the assembler be responsible for inserting the correct procedure call instruction? 
> > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + addi \src, \src, 1 > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + add \src, \src, \src_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + add \src, \src, \src_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + addi \src, \src, 1 > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + mv \src_stride, \dst_stride > > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + add \src, \src, \src_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_h_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + mv \src_stride, \dst_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call 
ff_put_h264_qpel_v_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > +endfunc > > + > > +func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x > > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > + push \dst, \src > > + mv \tmp, ra > > + addi \src, \src, 1 > > + mv \src_stride, \dst_stride > > + addi \dst, sp, -(\sizei * \sizei) > > + li \dst_stride, \sizei > > + call ff_put_h264_qpel_v_lowpass_\lmul > > + addi \src2, sp, -(\sizei * \sizei) > > + mv \src2_stride, \dst_stride > > + pop \dst, \src > > + mv \dst_stride, \src_stride > > + li \size, \sizei > > + mv ra, \tmp > > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > +endfunc > > +.endm > > + > > + ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, > > a5, a6, a7 > > + ff_h264_qpel_fns put, mf4, 8, rvv256, a0, a1, a2, a3, > > a4, t5, t6, a5, a6, a7 > > + ff_h264_qpel_fns put, mf8, 4, rvv256, a0, > > a1, a2, a3, a4, t5, t6, a5, a6, a7 > > + > > + ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, > > a5, a6, a7 > > + ff_h264_qpel_fns avg, mf4, 8, rvv256, a0, a1, a2, a3, > > a4, t5, t6, a5, a6, a7 > > + ff_h264_qpel_fns avg, mf8, 4, rvv256, a0, > > a1, a2, a3, a4, t5, t6, a5, a6, a7 > > + > > + ff_h264_qpel_fns put, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, > > a5, a6, a7 > > + ff_h264_qpel_fns put, mf2, 8, rvv, a0, a1, a2, a3, > > a4, t5, t6, a5, a6, a7 > > + ff_h264_qpel_fns put, mf4, 4, rvv, a0, > > a1, a2, a3, a4, t5, t6, a5, a6, a7 > > + > > + ff_h264_qpel_fns avg, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, > > a5, a6, a7 > > + ff_h264_qpel_fns avg, mf2, 8, rvv, a0, a1, a2, a3, > > a4, t5, t6, a5, a6, a7 > > + ff_h264_qpel_fns avg, mf4, 4, rvv, a0, > > a1, a2, a3, a4, t5, t6, a5, a6, a7 > > > -- > レミ・デニ-クールモン > http://www.remlab.net/ > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Le 27 août 2024 17:12:03 GMT+03:00, Niklas Haas <ffmpeg@haasn.xyz> a écrit : >> > + .irp x, \vregs >> > + vmax.vx \x, \x, zero >> > + .endr >> > + vsetvli zero, zero, e8, \lmul, ta, ma >> > + .irp x, \vregs >> > + vnclipu.wi \x, \x, \shifti >> > + .endr >> > +.endm >> > + >> > +.macro lowpass_init lmul, sizei, size, w0, w1, backup >> >> This is needlessly convoluted. In fact, backup is not even used, which kind >> of highlights the point. > >That parameter was simply left over from a previous version of the code. That would not have happened if this was not a macro. >Are you suggesting we simply duplicate the contents of this macro into all of >thefunctions that use it? What are you implying here? Can you point to any other .S file from Arm, Aarch64, LoongArch or RV that does this? This macro can only realistically be used once per function - at the beginning. Do you typically make macros for declaring and initialising local variables in other languages? Because I don't and I don't know anybody else that does. And to make things worse, it has a conditional. TBH, this patch is unreviewable to me. It's simply too hard to read because of excess macro usage and excess macro parameter on top. >> > + vsetivli zero, \sizei, e8, \lmul, ta, ma >> > + csrwi vxrm, 0 >> > + li \size, \sizei >> > + .ifnb \w0 >> > + li \w0, 20 >> > + li \w1, -5 >> > + .endif >> > +.endm >> > + >> > + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ >> > +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 >> > + addi \tmp, \src, 3 >> > + lbu \tmp2, 2(\src) >> > + vle8.v v31, (\tmp) >> > + lbu \tmp, 1(\src) >> > + vslide1up.vx v30, v31, \tmp2 >> > + lbu \tmp2, 0(\src) >> > + vslide1up.vx v29, v30, \tmp >> > + lbu \tmp, -1(\src) >> > + vslide1up.vx v28, v29, \tmp2 >> > + lbu \tmp2, -2(\src) >> > + vslide1up.vx v27, v28, \tmp >> > + vslide1up.vx v26, v27, \tmp2 >> >> That's a lot of sequentially dependent vector instructions to save zero- >> extending v31 before the MACs. Are you sure it's faster that way? > >I'm not sure what you mean. How would your alternative implementation look? >It's certainly possible to make these instructions less sequential by >emitting multiple `lbu` instructions instead of sliding up. Slides are actually quite slow, but they're unavoidable here. The point is that you wouldn't need v26 up-front if you zero-extended v31 first. And then you would be able to interleave non-dependent instructions. That doesn't affect the number of slides and scalar loads. 
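To make that suggestion concrete, a possible reordering (an untested sketch, assuming vwaddu.vx with x0 and vwaddu.wv are acceptable here) would start the accumulation from a zero-extension of v31, so that v26, the last result of the slide chain, is only needed for the final add and the multiply-accumulates no longer wait on it:

    vwaddu.vx   \vdst, v31, zero      /* \vdst = zext(src[3 ..]) */
    vwmaccu.vx  \vdst, \w0, v28
    vwmaccu.vx  \vdst, \w0, v29
    vwmaccsu.vx \vdst, \w1, v27
    vwmaccsu.vx \vdst, \w1, v30
    vwaddu.wv   \vdst, \vdst, v26     /* add src[-2 ..] last */

The arithmetic is unchanged; only the order in which the operands become live differs.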
>> >> > + vwaddu.vv \vdst, v26, v31 >> > + vwmaccu.vx \vdst, \w0, v28 >> > + vwmaccu.vx \vdst, \w0, v29 >> > + vwmaccsu.vx \vdst, \w1, v27 >> > + vwmaccsu.vx \vdst, \w1, v30 >> > +.endm >> > + >> > + /* output is unclipped */ >> > +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, >> > vsrc5, signed=0 >> > + .if \signed >> > + vwadd.vv \vdst, \vsrc0, \vsrc5 >> > + vwmacc.vx \vdst, \w0, \vsrc2 >> > + vwmacc.vx \vdst, \w0, \vsrc3 >> > + vwmacc.vx \vdst, \w1, \vsrc1 >> > + vwmacc.vx \vdst, \w1, \vsrc4 >> > + .else >> > + vwaddu.vv \vdst, \vsrc0, \vsrc5 >> > + vwmaccu.vx \vdst, \w0, \vsrc2 >> > + vwmaccu.vx \vdst, \w0, \vsrc3 >> > + vwmaccsu.vx \vdst, \w1, \vsrc1 >> > + vwmaccsu.vx \vdst, \w1, \vsrc4 >> > + .endif >> > +.endm >> > + >> > +.macro qpel_mc00 op, dst, src, stride, size >> > +func ff_\op\()_h264_qpel_pixels, zve32x >> > +1: >> > + add t0, \stride, \src >> > + add t1, \stride, t0 >> > + add t2, \stride, t1 >> > + vle8.v v0, (\src) >> > + vle8.v v1, (t0) >> > + vle8.v v2, (t1) >> > + vle8.v v3, (t2) >> > + addi \size, \size, -4 >> > + add \src, \stride, t2 >> > + add t0, \stride, \dst >> > + add t1, \stride, t0 >> > + add t2, \stride, t1 >> > + .ifc \op, avg >> > + vle8.v v4, (\dst) >> > + vle8.v v5, (t0) >> > + vle8.v v6, (t1) >> > + vle8.v v7, (t2) >> > + vaaddu.vv v0, v0, v4 >> > + vaaddu.vv v1, v1, v5 >> > + vaaddu.vv v2, v2, v6 >> > + vaaddu.vv v3, v3, v7 >> > + .endif >> > + vse8.v v0, (\dst) >> > + vse8.v v1, (t0) >> > + vse8.v v2, (t1) >> > + vse8.v v3, (t2) >> > + add \dst, \stride, t2 >> > + bnez \size, 1b >> > + ret >> > +endfunc >> > +.endm >> > + >> > + qpel_mc00 put, a0, a1, a2, a4 >> > + qpel_mc00 avg, a0, a1, a2, a4 >> >> Please don't add constant macro parameters. > >Why? It makes the code prohibitively difficult to read, review and revector. > It makes the code much easier to modify, The opposite actually. And that's not just me. From a quick look, Arm, Aarch64 and LoongArch assembler is also not doing that. Thing is, those parameter are *not* variables, they are *registers*. You need to know which register of which type is used where, and, in the case of vectors, what the number alignment is. That is vastly more relevant than what value a register represents whilst reviewing amnd *also* if revectoring. Besides you can always comment what value is where. You can't reasonably comment what register a macro parameter is. And then constant arguments hide the commonality of the code, leading to unnecessary duplication. We've had it happen already (VP8 IIRC). > and arguably also to understand. To be fair, I also thought that way when I started doing outline assembler a decade and a half ago. But like everyone else in FFmpeg, x264, dav1d, Linux, etc, I grew out of that nonsense. >This design was certainly invaluable during the development process. If you >prefer, we could "bake" the result, but at the cost of future refactorability. > >Given the state of RISC-V hardware, I'd rather leave the code in a state that >lends itself more towards future modifications. I disagree and it seems that all of the existing code RISC-ish ISA assembler in FFmpeg disagrees too... 
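As a purely illustrative expansion (not a proposed final form), this is what one of the thin wrapper entry points would look like with the registers and the init sequence written out directly, taking the put/mf2/16-pixel rvv256 instantiation used later in the patch:

    func ff_put_h264_qpel16_mc20_rvv256, zve32x
            vsetivli    zero, 16, e8, mf2, ta, ma
            csrwi       vxrm, 0
            li          a4, 16          /* row count */
            li          t5, 20          /* filter tap w0 */
            li          t6, -5          /* filter tap w1 */
            mv          a3, a2          /* src_stride = dst_stride */
            j           ff_put_h264_qpel_h_lowpass_mf2
    endfunc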
>> > + >> > +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, >> > src_stride, size, w0, w1, src2, src2_stride >> > +func >> > ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x >> > +1: >> > + add t0, \src_stride, \src >> > + add t1, \src_stride, t0 >> > + add t2, \src_stride, t1 >> > + lowpass_h v0, \src, \w0, \w1 >> > + lowpass_h v2, t0, \w0, \w1 >> > + lowpass_h v4, t1, \w0, \w1 >> > + lowpass_h v6, t2, \w0, \w1 >> > + add \src, \src_stride, t2 >> > + addi \size, \size, -4 >> > + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 >> > + .ifnb \src2 >> > + add t0, \src2_stride, \src2 >> > + add t1, \src2_stride, t0 >> > + add t2, \src2_stride, t1 >> > + vle8.v v8, (\src2) >> > + vle8.v v10, (t0) >> > + vle8.v v12, (t1) >> > + vle8.v v14, (t2) >> > + add \src2, \dst_stride, t2 >> > + vaaddu.vv v0, v0, v8 >> > + vaaddu.vv v2, v2, v10 >> > + vaaddu.vv v4, v4, v12 >> > + vaaddu.vv v6, v6, v14 >> > + .endif >> > + add t0, \dst_stride, \dst >> > + add t1, \dst_stride, t0 >> > + add t2, \dst_stride, t1 >> > + .ifc \op, avg >> > + vle8.v v1, (\dst) >> > + vle8.v v3, (t0) >> > + vle8.v v5, (t1) >> > + vle8.v v7, (t2) >> > + vaaddu.vv v0, v0, v1 >> > + vaaddu.vv v2, v2, v3 >> > + vaaddu.vv v4, v4, v5 >> > + vaaddu.vv v6, v6, v7 >> > + .endif >> > + vse8.v v0, (\dst) >> > + vse8.v v2, (t0) >> > + vse8.v v4, (t1) >> > + vse8.v v6, (t2) >> > + add \dst, \dst_stride, t2 >> > + bnez \size, 1b >> > + ret >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x >> > + sub t0, \src, \src_stride >> > + sub t1, t0, \src_stride >> > + vle8.v v2, (\src) >> > + vle8.v v1, (t0) >> > + vle8.v v0, (t1) >> > + add t0, \src, \src_stride >> > + add t1, t0, \src_stride >> > + add \src, t1, \src_stride >> > + vle8.v v3, (t0) >> > + vle8.v v4, (t1) >> > +1: >> > + add t0, \src_stride, \src >> > + add t1, \src_stride, t0 >> > + add t2, \src_stride, t1 >> > + vle8.v v5, (\src) >> > + vle8.v v6, (t0) >> > + vle8.v v7, (t1) >> > + vle8.v v8, (t2) >> > + add \src, \src_stride, t2 >> > + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 >> > + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 >> > + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 >> > + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 >> > + addi \size, \size, -4 >> > + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 >> > + .ifnb \src2 >> > + add t0, \src2_stride, \src2 >> > + add t1, \src2_stride, t0 >> > + add t2, \src2_stride, t1 >> > + vle8.v v9, (\src2) >> > + vle8.v v10, (t0) >> > + vle8.v v11, (t1) >> > + vle8.v v12, (t2) >> > + add \src2, \src2_stride, t2 >> > + vaaddu.vv v24, v24, v9 >> > + vaaddu.vv v26, v26, v10 >> > + vaaddu.vv v28, v28, v11 >> > + vaaddu.vv v30, v30, v12 >> > + .endif >> > + add t0, \dst_stride, \dst >> > + add t1, \dst_stride, t0 >> > + add t2, \dst_stride, t1 >> > + .ifc \op, avg >> > + vle8.v v9, (\dst) >> > + vle8.v v10, (t0) >> > + vle8.v v11, (t1) >> > + vle8.v v12, (t2) >> > + vaaddu.vv v24, v24, v9 >> > + vaaddu.vv v26, v26, v10 >> > + vaaddu.vv v28, v28, v11 >> > + vaaddu.vv v30, v30, v12 >> > + .endif >> > + vse8.v v24, (\dst) >> > + vse8.v v26, (t0) >> > + vse8.v v28, (t1) >> > + vse8.v v30, (t2) >> > + add \dst, \dst_stride, t2 >> > + vmv.v.v v0, v4 >> > + vmv.v.v v1, v5 >> > + vmv.v.v v2, v6 >> > + vmv.v.v v3, v7 >> > + vmv.v.v v4, v8 >> >> At this point, any vector move without rationale is an automatic -1 from >> me. > >There is a rationale; I can't see any rationale in the comments or description. >the vectors are reused for the next pass of the (unrolled) vertical >convolution. 
The only way to eliminate them would be to >make a special path for 8x8 that urolls all 8 lines to avoid this vector >move, Typically you only need to unroll 2x to eliminate vector copies. And it's not ONE vector copy, it's FOUR vector copies. Without actual numbers, I don't trust that the performance loss is negligible. >but IMO the gain in performance does not justify the increase in complexity >and binary size. >> >> > + bnez \size, 1b >> > + ret >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x >> > + sub t0, \src, \src_stride >> > + sub t1, t0, \src_stride >> > + lowpass_h v4, \src, \w0, \w1 >> > + lowpass_h v2, t0, \w0, \w1 >> > + lowpass_h v0, t1, \w0, \w1 >> > + add t0, \src, \src_stride >> > + add t1, t0, \src_stride >> > + add \src, t1, \src_stride >> > + lowpass_h v6, t0, \w0, \w1 >> > + lowpass_h v8, t1, \w0, \w1 >> > +1: >> > + add t0, \src_stride, \src >> > + add t1, \src_stride, t0 >> > + add t2, \src_stride, t1 >> > + lowpass_h v10, \src, \w0, \w1 >> > + lowpass_h v12, t0, \w0, \w1 >> > + lowpass_h v14, t1, \w0, \w1 >> > + lowpass_h v16, t2, \w0, \w1 >> > + vsetvli zero, zero, e16, \lmul2, ta, ma >> > + addi \size, \size, -4 >> > + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, >> > signed=1 >> > + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, >> > v12, signed=1 >> > + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, >> > v12, v14, signed=1 >> > + vnclip.wi v0, v20, 10 >> > + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, >> > signed= >> > + vnclip.wi v2, v24, 10 >> > + vnclip.wi v4, v28, 10 >> > + vnclip.wi v6, v20, 10 >> > + vmax.vx v18, v0, zero >> > + vmax.vx v20, v2, zero >> > + vmax.vx v22, v4, zero >> > + vmax.vx v24, v6, zero >> > + vmv.v.v v0, v8 >> > + vmv.v.v v2, v10 >> > + vmv.v.v v4, v12 >> > + vmv.v.v v6, v14 >> > + vmv.v.v v8, v16 >> > + add \src, \src_stride, t2 >> > + vsetvli zero, zero, e8, \lmul, ta, ma >> > + vnclipu.wi v18, v18, 0 >> > + vnclipu.wi v20, v20, 0 >> > + vnclipu.wi v22, v22, 0 >> > + vnclipu.wi v24, v24, 0 >> > + .ifnb \src2 >> > + add t0, \src2_stride, \src2 >> > + add t1, \src2_stride, t0 >> > + add t2, \src2_stride, t1 >> > + vle8.v v26, (\src2) >> > + vle8.v v27, (t0) >> > + vle8.v v28, (t1) >> > + vle8.v v29, (t2) >> > + add \src2, \src2_stride, t2 >> > + vaaddu.vv v18, v18, v26 >> > + vaaddu.vv v20, v20, v27 >> > + vaaddu.vv v22, v22, v28 >> > + vaaddu.vv v24, v24, v29 >> > + .endif >> > + add t0, \dst_stride, \dst >> > + add t1, \dst_stride, t0 >> > + add t2, \dst_stride, t1 >> > + .ifc \op, avg >> > + vle8.v v26, (\dst) >> > + vle8.v v27, (t0) >> > + vle8.v v28, (t1) >> > + vle8.v v29, (t2) >> > + vaaddu.vv v18, v18, v26 >> > + vaaddu.vv v20, v20, v27 >> > + vaaddu.vv v22, v22, v28 >> > + vaaddu.vv v24, v24, v29 >> > + .endif >> > + vse8.v v18, (\dst) >> > + vse8.v v20, (t0) >> > + vse8.v v22, (t1) >> > + vse8.v v24, (t2) >> > + add \dst, \dst_stride, t2 >> > + bnez \size, 1b >> > + ret >> > +endfunc >> > +.endm >> > + >> > +/* Note: We could possibly specialize for the width 8 / width 4 cases by >> > + loading 32 bit integers, but this makes the convolutions more >> > complicated + to implement, so it's not necessarily any faster. 
*/ >> > + >> > +.macro h264_qpel lmul, lmul2 >> > + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, >> > t6 >> > + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, >> > t5, t6, a5, a6 >> > + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, >> > a2, a3, a4, t5, t6 >> > + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, >> > a1, a2, a3, a4, t5, t6, a5, a6 >> > +.endm >> > + >> > + h264_qpel m1, m2 >> > + h264_qpel mf2, m1 >> > + h264_qpel mf4, mf2 >> > + h264_qpel mf8, mf4 >> > + >> > +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, >> > src_stride, size, w0, w1, src2, src2_stride, tmp >> > +func >> > ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, >> > + j ff_\op\()_h264_qpel_pixels >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + mv \src2, \src >> > + mv \src2_stride, \src_stride >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + addi \src2, \src, 1 >> > + mv \src2_stride, \src_stride >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + mv \src2, \src >> > + mv \src2_stride, \src_stride >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + add \src2, \src, \src_stride >> > + mv \src2_stride, \src_stride >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> >> It's all but impossible to tell if spilling is actually necessary when you >> alias registers like this. >> >> > + mv \tmp, ra >> >> Use t0 for subprocedure return. See specs. > >The subprocedure is sometimes the main procedure. Sure does not seem that way, but again, the code is so damn hard to follow. >And in any case, we use t0 >inside the subprocedure. Then fix it. > >> >> > + mv \src_stride, \dst_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> >> You can use jal here > >Shouldn't the assembler be responsible for inserting the correct procedure >call instruction? Doesn't work here (GNU as 2.43.1). 
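For reference, a sketch of what the two suggestions (t0 as link register, plain jal) would look like together: x5/t0 is the ABI's designated alternate link register, and a 4-byte jal has a reach of +-1 MiB, which is ample within one object file. This assumes the helper is only ever entered through such a call, which, as noted above, is not currently the case since the same routine also serves as a tail-called entry point:

    jal     t0, ff_put_h264_qpel_h_lowpass_\lmul   /* link in t0, leave ra untouched */
    ...
    /* with the helper ending in */
    jr      t0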
>> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, \dst_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + addi \src, \src, 1 >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, \dst_stride >> > + add \src, \src, \src_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, \dst_stride >> > + add \src, \src, \src_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + addi \src, \src, 1 >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + mv \src_stride, \dst_stride >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, \dst_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, \dst_stride >> > + add \src, \src, \src_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_h_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 >> > + push \dst, \src >> > + mv \tmp, ra >> > + mv \src_stride, 
\dst_stride >> > + addi \dst, sp, -(\sizei * \sizei) >> > + li \dst_stride, \sizei >> > + call ff_put_h264_qpel_v_lowpass_\lmul >> > + addi \src2, sp, -(\sizei * \sizei) >> > + mv \src2_stride, \dst_stride >> > + pop \dst, \src >> > + mv \dst_stride, \src_stride >> > + li \size, \sizei >> > + mv ra, \tmp >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 >> > +endfunc >> > + >> > +func ff_\op\()_h264_qpel\sizei\()_mc
On Tue, 27 Aug 2024 21:47:59 +0300 Rémi Denis-Courmont <remi@remlab.net> wrote: > Le 27 août 2024 17:12:03 GMT+03:00, Niklas Haas <ffmpeg@haasn.xyz> a écrit : > >> > + .irp x, \vregs > >> > + vmax.vx \x, \x, zero > >> > + .endr > >> > + vsetvli zero, zero, e8, \lmul, ta, ma > >> > + .irp x, \vregs > >> > + vnclipu.wi \x, \x, \shifti > >> > + .endr > >> > +.endm > >> > + > >> > +.macro lowpass_init lmul, sizei, size, w0, w1, backup > >> > >> This is needlessly convoluted. In fact, backup is not even used, which kind > >> of highlights the point. > > > >That parameter was simply left over from a previous version of the code. > > That would not have happened if this was not a macro. > > >Are you suggesting we simply duplicate the contents of this macro into all of > >thefunctions that use it? > > What are you implying here? Can you point to any other .S file from Arm, > Aarch64, LoongArch or RV that does this? > > This macro can only realistically be used once per function - at the > beginning. Do you typically make macros for declaring and initialising local > variables in other languages? Because I don't and I don't know anybody else > that does. > > And to make things worse, it has a conditional. TBH, this patch is > unreviewable to me. It's simply too hard to read because of excess macro usage > and excess macro parameter on top. Changed. > > >> > + vsetivli zero, \sizei, e8, \lmul, ta, ma > >> > + csrwi vxrm, 0 > >> > + li \size, \sizei > >> > + .ifnb \w0 > >> > + li \w0, 20 > >> > + li \w1, -5 > >> > + .endif > >> > +.endm > >> > + > >> > + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ > >> > +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 > >> > + addi \tmp, \src, 3 > >> > + lbu \tmp2, 2(\src) > >> > + vle8.v v31, (\tmp) > >> > + lbu \tmp, 1(\src) > >> > + vslide1up.vx v30, v31, \tmp2 > >> > + lbu \tmp2, 0(\src) > >> > + vslide1up.vx v29, v30, \tmp > >> > + lbu \tmp, -1(\src) > >> > + vslide1up.vx v28, v29, \tmp2 > >> > + lbu \tmp2, -2(\src) > >> > + vslide1up.vx v27, v28, \tmp > >> > + vslide1up.vx v26, v27, \tmp2 > >> > >> That's a lot of sequentially dependent vector instructions to save zero- > >> extending v31 before the MACs. Are you sure it's faster that way? > > > >I'm not sure what you mean. How would your alternative implementation look? > >It's certainly possible to make these instructions less sequential by > >emitting multiple `lbu` instructions instead of sliding up. > > Slides are actually quite slow, but they're unavoidable here. The point is > that you wouldn't need v26 up-front if you zero-extended v31 first. And then > you would be able to interleave non-dependent instructions. > > That doesn't affect the number of slides and scalar loads. Right, the reason I did this is that there's afaict no instruction for a "widening accumulate", that is, no equivalent to vwmaccu that doesn't take an extra scalar multiplicand. So the alternative here requires an extra scalar instruction and multiplication. I'll bench it and get back to you. 
> > >> > >> > + vwaddu.vv \vdst, v26, v31 > >> > + vwmaccu.vx \vdst, \w0, v28 > >> > + vwmaccu.vx \vdst, \w0, v29 > >> > + vwmaccsu.vx \vdst, \w1, v27 > >> > + vwmaccsu.vx \vdst, \w1, v30 > >> > +.endm > >> > + > >> > + /* output is unclipped */ > >> > +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, > vsrc4, > >> > vsrc5, signed=0 > >> > + .if \signed > >> > + vwadd.vv \vdst, \vsrc0, \vsrc5 > >> > + vwmacc.vx \vdst, \w0, \vsrc2 > >> > + vwmacc.vx \vdst, \w0, \vsrc3 > >> > + vwmacc.vx \vdst, \w1, \vsrc1 > >> > + vwmacc.vx \vdst, \w1, \vsrc4 > >> > + .else > >> > + vwaddu.vv \vdst, \vsrc0, \vsrc5 > >> > + vwmaccu.vx \vdst, \w0, \vsrc2 > >> > + vwmaccu.vx \vdst, \w0, \vsrc3 > >> > + vwmaccsu.vx \vdst, \w1, \vsrc1 > >> > + vwmaccsu.vx \vdst, \w1, \vsrc4 > >> > + .endif > >> > +.endm > >> > + > >> > +.macro qpel_mc00 op, dst, src, stride, size > >> > +func ff_\op\()_h264_qpel_pixels, zve32x > >> > +1: > >> > + add t0, \stride, \src > >> > + add t1, \stride, t0 > >> > + add t2, \stride, t1 > >> > + vle8.v v0, (\src) > >> > + vle8.v v1, (t0) > >> > + vle8.v v2, (t1) > >> > + vle8.v v3, (t2) > >> > + addi \size, \size, -4 > >> > + add \src, \stride, t2 > >> > + add t0, \stride, \dst > >> > + add t1, \stride, t0 > >> > + add t2, \stride, t1 > >> > + .ifc \op, avg > >> > + vle8.v v4, (\dst) > >> > + vle8.v v5, (t0) > >> > + vle8.v v6, (t1) > >> > + vle8.v v7, (t2) > >> > + vaaddu.vv v0, v0, v4 > >> > + vaaddu.vv v1, v1, v5 > >> > + vaaddu.vv v2, v2, v6 > >> > + vaaddu.vv v3, v3, v7 > >> > + .endif > >> > + vse8.v v0, (\dst) > >> > + vse8.v v1, (t0) > >> > + vse8.v v2, (t1) > >> > + vse8.v v3, (t2) > >> > + add \dst, \stride, t2 > >> > + bnez \size, 1b > >> > + ret > >> > +endfunc > >> > +.endm > >> > + > >> > + qpel_mc00 put, a0, a1, a2, a4 > >> > + qpel_mc00 avg, a0, a1, a2, a4 > >> > >> Please don't add constant macro parameters. > > > >Why? > > It makes the code prohibitively difficult to read, review and revector. Changed. > > > It makes the code much easier to modify, > > The opposite actually. And that's not just me. From a quick look, Arm, Aarch64 > and LoongArch assembler is also not doing that. > > Thing is, those parameter are *not* variables, they are *registers*. You need > to know which register of which type is used where, and, in the case of > vectors, what the number alignment is. That is vastly more relevant than what > value a register represents whilst reviewing amnd *also* if revectoring. > Besides you can always comment what value is where. You can't reasonably > comment what register a macro parameter is. > > And then constant arguments hide the commonality of the code, leading to > unnecessary duplication. We've had it happen already (VP8 IIRC). > > > and arguably also to understand. > > To be fair, I also thought that way when I started doing outline assembler a > decade and a half ago. But like everyone else in FFmpeg, x264, dav1d, Linux, > etc, I grew out of that nonsense. > > >This design was certainly invaluable during the development process. If you > >prefer, we could "bake" the result, but at the cost of future > refactorability. > > > >Given the state of RISC-V hardware, I'd rather leave the code in a state that > >lends itself more towards future modifications. > > I disagree and it seems that all of the existing code RISC-ish ISA assembler > in FFmpeg disagrees too... 
> > >> > + > >> > +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, > >> > src_stride, size, w0, w1, src2, src2_stride > >> > +func > >> > ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x > >> > +1: > >> > + add t0, \src_stride, \src > >> > + add t1, \src_stride, t0 > >> > + add t2, \src_stride, t1 > >> > + lowpass_h v0, \src, \w0, \w1 > >> > + lowpass_h v2, t0, \w0, \w1 > >> > + lowpass_h v4, t1, \w0, \w1 > >> > + lowpass_h v6, t2, \w0, \w1 > >> > + add \src, \src_stride, t2 > >> > + addi \size, \size, -4 > >> > + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 > >> > + .ifnb \src2 > >> > + add t0, \src2_stride, \src2 > >> > + add t1, \src2_stride, t0 > >> > + add t2, \src2_stride, t1 > >> > + vle8.v v8, (\src2) > >> > + vle8.v v10, (t0) > >> > + vle8.v v12, (t1) > >> > + vle8.v v14, (t2) > >> > + add \src2, \dst_stride, t2 > >> > + vaaddu.vv v0, v0, v8 > >> > + vaaddu.vv v2, v2, v10 > >> > + vaaddu.vv v4, v4, v12 > >> > + vaaddu.vv v6, v6, v14 > >> > + .endif > >> > + add t0, \dst_stride, \dst > >> > + add t1, \dst_stride, t0 > >> > + add t2, \dst_stride, t1 > >> > + .ifc \op, avg > >> > + vle8.v v1, (\dst) > >> > + vle8.v v3, (t0) > >> > + vle8.v v5, (t1) > >> > + vle8.v v7, (t2) > >> > + vaaddu.vv v0, v0, v1 > >> > + vaaddu.vv v2, v2, v3 > >> > + vaaddu.vv v4, v4, v5 > >> > + vaaddu.vv v6, v6, v7 > >> > + .endif > >> > + vse8.v v0, (\dst) > >> > + vse8.v v2, (t0) > >> > + vse8.v v4, (t1) > >> > + vse8.v v6, (t2) > >> > + add \dst, \dst_stride, t2 > >> > + bnez \size, 1b > >> > + ret > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x > >> > + sub t0, \src, \src_stride > >> > + sub t1, t0, \src_stride > >> > + vle8.v v2, (\src) > >> > + vle8.v v1, (t0) > >> > + vle8.v v0, (t1) > >> > + add t0, \src, \src_stride > >> > + add t1, t0, \src_stride > >> > + add \src, t1, \src_stride > >> > + vle8.v v3, (t0) > >> > + vle8.v v4, (t1) > >> > +1: > >> > + add t0, \src_stride, \src > >> > + add t1, \src_stride, t0 > >> > + add t2, \src_stride, t1 > >> > + vle8.v v5, (\src) > >> > + vle8.v v6, (t0) > >> > + vle8.v v7, (t1) > >> > + vle8.v v8, (t2) > >> > + add \src, \src_stride, t2 > >> > + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 > >> > + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 > >> > + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 > >> > + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 > >> > + addi \size, \size, -4 > >> > + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 > >> > + .ifnb \src2 > >> > + add t0, \src2_stride, \src2 > >> > + add t1, \src2_stride, t0 > >> > + add t2, \src2_stride, t1 > >> > + vle8.v v9, (\src2) > >> > + vle8.v v10, (t0) > >> > + vle8.v v11, (t1) > >> > + vle8.v v12, (t2) > >> > + add \src2, \src2_stride, t2 > >> > + vaaddu.vv v24, v24, v9 > >> > + vaaddu.vv v26, v26, v10 > >> > + vaaddu.vv v28, v28, v11 > >> > + vaaddu.vv v30, v30, v12 > >> > + .endif > >> > + add t0, \dst_stride, \dst > >> > + add t1, \dst_stride, t0 > >> > + add t2, \dst_stride, t1 > >> > + .ifc \op, avg > >> > + vle8.v v9, (\dst) > >> > + vle8.v v10, (t0) > >> > + vle8.v v11, (t1) > >> > + vle8.v v12, (t2) > >> > + vaaddu.vv v24, v24, v9 > >> > + vaaddu.vv v26, v26, v10 > >> > + vaaddu.vv v28, v28, v11 > >> > + vaaddu.vv v30, v30, v12 > >> > + .endif > >> > + vse8.v v24, (\dst) > >> > + vse8.v v26, (t0) > >> > + vse8.v v28, (t1) > >> > + vse8.v v30, (t2) > >> > + add \dst, \dst_stride, t2 > >> > + vmv.v.v v0, v4 > >> > + vmv.v.v v1, v5 > >> > + vmv.v.v v2, v6 > >> > + vmv.v.v v3, v7 > >> > + vmv.v.v v4, v8 > >> > >> At this point, any 
vector move without rationale is an automatic -1 from > >> me. > > > >There is a rationale; > > I can't see any rationale in the comments or description. > > >the vectors are reused for the next pass of the (unrolled) vertical > >convolution. The only way to eliminate them would be to > >make a special path for 8x8 that urolls all 8 lines to avoid this vector > >move, > > Typically you only need to unroll 2x to eliminate vector copies. And it's not > ONE vector copy, it's FOUR vector copies. Without actual numbers, I don't > trust that the performance loss is negligible. How do you implement a vertical convolution without either redundant loads or vector moves? > > >but IMO the gain in performance does not justify the increase in complexity > >and binary size. > > >> > >> > + bnez \size, 1b > >> > + ret > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x > >> > + sub t0, \src, \src_stride > >> > + sub t1, t0, \src_stride > >> > + lowpass_h v4, \src, \w0, \w1 > >> > + lowpass_h v2, t0, \w0, \w1 > >> > + lowpass_h v0, t1, \w0, \w1 > >> > + add t0, \src, \src_stride > >> > + add t1, t0, \src_stride > >> > + add \src, t1, \src_stride > >> > + lowpass_h v6, t0, \w0, \w1 > >> > + lowpass_h v8, t1, \w0, \w1 > >> > +1: > >> > + add t0, \src_stride, \src > >> > + add t1, \src_stride, t0 > >> > + add t2, \src_stride, t1 > >> > + lowpass_h v10, \src, \w0, \w1 > >> > + lowpass_h v12, t0, \w0, \w1 > >> > + lowpass_h v14, t1, \w0, \w1 > >> > + lowpass_h v16, t2, \w0, \w1 > >> > + vsetvli zero, zero, e16, \lmul2, ta, ma > >> > + addi \size, \size, -4 > >> > + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, > >> > signed=1 > >> > + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, > >> > v12, signed=1 > >> > + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, > >> > v12, v14, signed=1 > >> > + vnclip.wi v0, v20, 10 > >> > + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, > >> > signed= > >> > + vnclip.wi v2, v24, 10 > >> > + vnclip.wi v4, v28, 10 > >> > + vnclip.wi v6, v20, 10 > >> > + vmax.vx v18, v0, zero > >> > + vmax.vx v20, v2, zero > >> > + vmax.vx v22, v4, zero > >> > + vmax.vx v24, v6, zero > >> > + vmv.v.v v0, v8 > >> > + vmv.v.v v2, v10 > >> > + vmv.v.v v4, v12 > >> > + vmv.v.v v6, v14 > >> > + vmv.v.v v8, v16 > >> > + add \src, \src_stride, t2 > >> > + vsetvli zero, zero, e8, \lmul, ta, ma > >> > + vnclipu.wi v18, v18, 0 > >> > + vnclipu.wi v20, v20, 0 > >> > + vnclipu.wi v22, v22, 0 > >> > + vnclipu.wi v24, v24, 0 > >> > + .ifnb \src2 > >> > + add t0, \src2_stride, \src2 > >> > + add t1, \src2_stride, t0 > >> > + add t2, \src2_stride, t1 > >> > + vle8.v v26, (\src2) > >> > + vle8.v v27, (t0) > >> > + vle8.v v28, (t1) > >> > + vle8.v v29, (t2) > >> > + add \src2, \src2_stride, t2 > >> > + vaaddu.vv v18, v18, v26 > >> > + vaaddu.vv v20, v20, v27 > >> > + vaaddu.vv v22, v22, v28 > >> > + vaaddu.vv v24, v24, v29 > >> > + .endif > >> > + add t0, \dst_stride, \dst > >> > + add t1, \dst_stride, t0 > >> > + add t2, \dst_stride, t1 > >> > + .ifc \op, avg > >> > + vle8.v v26, (\dst) > >> > + vle8.v v27, (t0) > >> > + vle8.v v28, (t1) > >> > + vle8.v v29, (t2) > >> > + vaaddu.vv v18, v18, v26 > >> > + vaaddu.vv v20, v20, v27 > >> > + vaaddu.vv v22, v22, v28 > >> > + vaaddu.vv v24, v24, v29 > >> > + .endif > >> > + vse8.v v18, (\dst) > >> > + vse8.v v20, (t0) > >> > + vse8.v v22, (t1) > >> > + vse8.v v24, (t2) > >> > + add \dst, \dst_stride, t2 > >> > + bnez \size, 1b > >> > + ret > >> > +endfunc > >> > +.endm > >> > + > >> > +/* Note: We could possibly specialize for the 
width 8 / width 4 cases by > >> > + loading 32 bit integers, but this makes the convolutions more > >> > complicated + to implement, so it's not necessarily any faster. */ > >> > + > >> > +.macro h264_qpel lmul, lmul2 > >> > + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, > t5, > >> > t6 > >> > + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, > >> > t5, t6, a5, a6 > >> > + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, > >> > a2, a3, a4, t5, t6 > >> > + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, > >> > a1, a2, a3, a4, t5, t6, a5, a6 > >> > +.endm > >> > + > >> > + h264_qpel m1, m2 > >> > + h264_qpel mf2, m1 > >> > + h264_qpel mf4, mf2 > >> > + h264_qpel mf8, mf4 > >> > + > >> > +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, > >> > src_stride, size, w0, w1, src2, src2_stride, tmp > >> > +func > >> > ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, > >> > + j ff_\op\()_h264_qpel_pixels > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + mv \src2, \src > >> > + mv \src2_stride, \src_stride > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + addi \src2, \src, 1 > >> > + mv \src2_stride, \src_stride > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + mv \src2, \src > >> > + mv \src2_stride, \src_stride > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + add \src2, \src, \src_stride > >> > + mv \src2_stride, \src_stride > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > >> It's all but impossible to tell if spilling is actually necessary when you > >> alias registers like this. > >> > >> > + mv \tmp, ra > >> > >> Use t0 for subprocedure return. See specs. > > > >The subprocedure is sometimes the main procedure. > > Sure does not seem that way, but again, the code is so damn hard to follow. > > >And in any case, we use t0 > >inside the subprocedure. > > Then fix it. > > > > >> > >> > + mv \src_stride, \dst_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > >> You can use jal here > > > >Shouldn't the assembler be responsible for inserting the correct procedure > >call instruction? > > Doesn't work here (GNU as 2.43.1). 
> > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + addi \src, \src, 1 > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + add \src, \src, \src_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + add \src, \src, \src_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + addi \src, \src, 1 > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + mv \src_stride, \dst_stride > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + add \src, \src, \src_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j 
ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > >> > + push \dst, \src > >> > + mv \tmp, ra > >> > + mv \src_stride, \dst_stride > >> > + addi \dst, sp, -(\sizei * \sizei) > >> > + li \dst_stride, \sizei > >> > + call ff_put_h264_qpel_v_lowpass_\lmul > >> > + addi \src2, sp, -(\sizei * \sizei) > >> > + mv \src2_stride, \dst_stride > >> > + pop \dst, \src > >> > + mv \dst_stride, \src_stride > >> > + li \size, \sizei > >> > + mv ra, \tmp > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > >> > +endfunc > >> > + > >> > +func ff_\op\()_h264_qpel\sizei\()_mc > > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
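On the question above about doing a vertical convolution without redundant loads or vector moves: one way to picture the 2x-unroll suggestion is that the six-row history becomes a ring whose rotation is a pure renaming, so only the oldest slot is ever rewritten. The scalar C sketch below uses an index where the assembler would hard-code a different register assignment in each unrolled copy of the loop body; it illustrates the general technique only, is not the code under review, and all names are mine:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define W 16   /* block width, standing in for the vector length */

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Vertical 6-tap half-pel filter with the row window kept in a ring:
 * advancing the window is an index update plus one load of the newest row,
 * with no copying of live rows. */
static void lowpass_v_put(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride, int h)
{
    uint8_t ring[6][W];
    int top = 0;

    for (int k = 0; k < 6; k++)                    /* prime with rows -2 .. +3 */
        memcpy(ring[k], src + (k - 2) * src_stride, W);
    src += 4 * src_stride;                         /* next row to load is +4 */

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < W; x++) {
            int acc = ring[top % 6][x] + ring[(top + 5) % 6][x]
                    + 20 * (ring[(top + 2) % 6][x] + ring[(top + 3) % 6][x])
                    -  5 * (ring[(top + 1) % 6][x] + ring[(top + 4) % 6][x]);
            dst[x] = clip_uint8((acc + 16) >> 5);
        }
        if (y + 1 < h) {
            memcpy(ring[top % 6], src, W);         /* overwrite the oldest row */
            src += src_stride;
        }
        top++;
        dst += dst_stride;
    }
}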
On Wed, 28 Aug 2024 13:30:02 +0200 Niklas Haas <ffmpeg@haasn.xyz> wrote: > On Tue, 27 Aug 2024 21:47:59 +0300 Rémi Denis-Courmont <remi@remlab.net> wrote: > > Le 27 août 2024 17:12:03 GMT+03:00, Niklas Haas <ffmpeg@haasn.xyz> a écrit : > > >> > + .irp x, \vregs > > >> > + vmax.vx \x, \x, zero > > >> > + .endr > > >> > + vsetvli zero, zero, e8, \lmul, ta, ma > > >> > + .irp x, \vregs > > >> > + vnclipu.wi \x, \x, \shifti > > >> > + .endr > > >> > +.endm > > >> > + > > >> > +.macro lowpass_init lmul, sizei, size, w0, w1, backup > > >> > > >> This is needlessly convoluted. In fact, backup is not even used, which kind > > >> of highlights the point. > > > > > >That parameter was simply left over from a previous version of the code. > > > > That would not have happened if this was not a macro. > > > > >Are you suggesting we simply duplicate the contents of this macro into all of > > >thefunctions that use it? > > > > What are you implying here? Can you point to any other .S file from Arm, > > Aarch64, LoongArch or RV that does this? > > > > This macro can only realistically be used once per function - at the > > beginning. Do you typically make macros for declaring and initialising local > > variables in other languages? Because I don't and I don't know anybody else > > that does. > > > > And to make things worse, it has a conditional. TBH, this patch is > > unreviewable to me. It's simply too hard to read because of excess macro usage > > and excess macro parameter on top. > > Changed. > > > > > >> > + vsetivli zero, \sizei, e8, \lmul, ta, ma > > >> > + csrwi vxrm, 0 > > >> > + li \size, \sizei > > >> > + .ifnb \w0 > > >> > + li \w0, 20 > > >> > + li \w1, -5 > > >> > + .endif > > >> > +.endm > > >> > + > > >> > + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ > > >> > +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 > > >> > + addi \tmp, \src, 3 > > >> > + lbu \tmp2, 2(\src) > > >> > + vle8.v v31, (\tmp) > > >> > + lbu \tmp, 1(\src) > > >> > + vslide1up.vx v30, v31, \tmp2 > > >> > + lbu \tmp2, 0(\src) > > >> > + vslide1up.vx v29, v30, \tmp > > >> > + lbu \tmp, -1(\src) > > >> > + vslide1up.vx v28, v29, \tmp2 > > >> > + lbu \tmp2, -2(\src) > > >> > + vslide1up.vx v27, v28, \tmp > > >> > + vslide1up.vx v26, v27, \tmp2 > > >> > > >> That's a lot of sequentially dependent vector instructions to save zero- > > >> extending v31 before the MACs. Are you sure it's faster that way? > > > > > >I'm not sure what you mean. How would your alternative implementation look? > > >It's certainly possible to make these instructions less sequential by > > >emitting multiple `lbu` instructions instead of sliding up. > > > > Slides are actually quite slow, but they're unavoidable here. The point is > > that you wouldn't need v26 up-front if you zero-extended v31 first. And then > > you would be able to interleave non-dependent instructions. > > > > That doesn't affect the number of slides and scalar loads. > > Right, the reason I did this is that there's afaict no instruction for a > "widening accumulate", that is, no equivalent to vwmaccu that doesn't take an > extra scalar multiplicand. So the alternative here requires an extra scalar > instruction and multiplication. I'll bench it and get back to you. Single vwaddu: put_h264_qpel_16_mc20_8_rvv_i32: 420.7 ( 4.29x) Zero extend + separate vwmaccu: put_h264_qpel_16_mc20_8_rvv_i32: 433.9 ( 4.17x) So the hit from having a dependent vector instruction is not worth the loss due to an extra accumulate instruction to deal with v26. 
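To make the two benchmarked variants concrete: both compute the same six-tap sum, and the difference is only how early the widening accumulator can be started relative to the slide chain. A scalar sketch, assuming that "zero extend + separate vwmaccu" means starting from the end tap that is available first (the directly loaded one) and folding the last-slid end tap in with a multiply-by-one accumulate; the function names are mine:

#include <assert.h>
#include <stdint.h>

/* s[0..5] are the six taps src[-2] .. src[+3] of one output pixel. */

/* "Single vwaddu": the accumulator starts from BOTH end taps, i.e. only
 * once the last vslide1up has produced the src[-2] vector. */
static int acc_single_vwaddu(const uint8_t *s)
{
    int acc = s[0] + s[5];
    acc += 20 * s[2];
    acc += 20 * s[3];
    acc -=  5 * s[1];
    acc -=  5 * s[4];
    return acc;
}

/* "Zero extend + separate vwmaccu": start from the tap that is ready first
 * and add the late tap at the end, at the cost of one extra accumulate. */
static int acc_zext_then_macc(const uint8_t *s)
{
    int acc = s[5];
    acc += 20 * s[2];
    acc += 20 * s[3];
    acc -=  5 * s[1];
    acc -=  5 * s[4];
    acc +=  1 * s[0];
    return acc;
}

int main(void)
{
    const uint8_t taps[6] = { 7, 1, 200, 3, 90, 55 };
    assert(acc_single_vwaddu(taps) == acc_zext_then_macc(taps));
    return 0;
}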
> > > > > >> > > >> > + vwaddu.vv \vdst, v26, v31 > > >> > + vwmaccu.vx \vdst, \w0, v28 > > >> > + vwmaccu.vx \vdst, \w0, v29 > > >> > + vwmaccsu.vx \vdst, \w1, v27 > > >> > + vwmaccsu.vx \vdst, \w1, v30 > > >> > +.endm > > >> > + > > >> > + /* output is unclipped */ > > >> > +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, > > vsrc4, > > >> > vsrc5, signed=0 > > >> > + .if \signed > > >> > + vwadd.vv \vdst, \vsrc0, \vsrc5 > > >> > + vwmacc.vx \vdst, \w0, \vsrc2 > > >> > + vwmacc.vx \vdst, \w0, \vsrc3 > > >> > + vwmacc.vx \vdst, \w1, \vsrc1 > > >> > + vwmacc.vx \vdst, \w1, \vsrc4 > > >> > + .else > > >> > + vwaddu.vv \vdst, \vsrc0, \vsrc5 > > >> > + vwmaccu.vx \vdst, \w0, \vsrc2 > > >> > + vwmaccu.vx \vdst, \w0, \vsrc3 > > >> > + vwmaccsu.vx \vdst, \w1, \vsrc1 > > >> > + vwmaccsu.vx \vdst, \w1, \vsrc4 > > >> > + .endif > > >> > +.endm > > >> > + > > >> > +.macro qpel_mc00 op, dst, src, stride, size > > >> > +func ff_\op\()_h264_qpel_pixels, zve32x > > >> > +1: > > >> > + add t0, \stride, \src > > >> > + add t1, \stride, t0 > > >> > + add t2, \stride, t1 > > >> > + vle8.v v0, (\src) > > >> > + vle8.v v1, (t0) > > >> > + vle8.v v2, (t1) > > >> > + vle8.v v3, (t2) > > >> > + addi \size, \size, -4 > > >> > + add \src, \stride, t2 > > >> > + add t0, \stride, \dst > > >> > + add t1, \stride, t0 > > >> > + add t2, \stride, t1 > > >> > + .ifc \op, avg > > >> > + vle8.v v4, (\dst) > > >> > + vle8.v v5, (t0) > > >> > + vle8.v v6, (t1) > > >> > + vle8.v v7, (t2) > > >> > + vaaddu.vv v0, v0, v4 > > >> > + vaaddu.vv v1, v1, v5 > > >> > + vaaddu.vv v2, v2, v6 > > >> > + vaaddu.vv v3, v3, v7 > > >> > + .endif > > >> > + vse8.v v0, (\dst) > > >> > + vse8.v v1, (t0) > > >> > + vse8.v v2, (t1) > > >> > + vse8.v v3, (t2) > > >> > + add \dst, \stride, t2 > > >> > + bnez \size, 1b > > >> > + ret > > >> > +endfunc > > >> > +.endm > > >> > + > > >> > + qpel_mc00 put, a0, a1, a2, a4 > > >> > + qpel_mc00 avg, a0, a1, a2, a4 > > >> > > >> Please don't add constant macro parameters. > > > > > >Why? > > > > It makes the code prohibitively difficult to read, review and revector. > > Changed. > > > > > > It makes the code much easier to modify, > > > > The opposite actually. And that's not just me. From a quick look, Arm, Aarch64 > > and LoongArch assembler is also not doing that. > > > > Thing is, those parameter are *not* variables, they are *registers*. You need > > to know which register of which type is used where, and, in the case of > > vectors, what the number alignment is. That is vastly more relevant than what > > value a register represents whilst reviewing amnd *also* if revectoring. > > Besides you can always comment what value is where. You can't reasonably > > comment what register a macro parameter is. > > > > And then constant arguments hide the commonality of the code, leading to > > unnecessary duplication. We've had it happen already (VP8 IIRC). > > > > > and arguably also to understand. > > > > To be fair, I also thought that way when I started doing outline assembler a > > decade and a half ago. But like everyone else in FFmpeg, x264, dav1d, Linux, > > etc, I grew out of that nonsense. > > > > >This design was certainly invaluable during the development process. If you > > >prefer, we could "bake" the result, but at the cost of future > > refactorability. > > > > > >Given the state of RISC-V hardware, I'd rather leave the code in a state that > > >lends itself more towards future modifications. 
> > > > I disagree and it seems that all of the existing code RISC-ish ISA assembler > > in FFmpeg disagrees too... > > > > >> > + > > >> > +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, > > >> > src_stride, size, w0, w1, src2, src2_stride > > >> > +func > > >> > ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x > > >> > +1: > > >> > + add t0, \src_stride, \src > > >> > + add t1, \src_stride, t0 > > >> > + add t2, \src_stride, t1 > > >> > + lowpass_h v0, \src, \w0, \w1 > > >> > + lowpass_h v2, t0, \w0, \w1 > > >> > + lowpass_h v4, t1, \w0, \w1 > > >> > + lowpass_h v6, t2, \w0, \w1 > > >> > + add \src, \src_stride, t2 > > >> > + addi \size, \size, -4 > > >> > + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 > > >> > + .ifnb \src2 > > >> > + add t0, \src2_stride, \src2 > > >> > + add t1, \src2_stride, t0 > > >> > + add t2, \src2_stride, t1 > > >> > + vle8.v v8, (\src2) > > >> > + vle8.v v10, (t0) > > >> > + vle8.v v12, (t1) > > >> > + vle8.v v14, (t2) > > >> > + add \src2, \dst_stride, t2 > > >> > + vaaddu.vv v0, v0, v8 > > >> > + vaaddu.vv v2, v2, v10 > > >> > + vaaddu.vv v4, v4, v12 > > >> > + vaaddu.vv v6, v6, v14 > > >> > + .endif > > >> > + add t0, \dst_stride, \dst > > >> > + add t1, \dst_stride, t0 > > >> > + add t2, \dst_stride, t1 > > >> > + .ifc \op, avg > > >> > + vle8.v v1, (\dst) > > >> > + vle8.v v3, (t0) > > >> > + vle8.v v5, (t1) > > >> > + vle8.v v7, (t2) > > >> > + vaaddu.vv v0, v0, v1 > > >> > + vaaddu.vv v2, v2, v3 > > >> > + vaaddu.vv v4, v4, v5 > > >> > + vaaddu.vv v6, v6, v7 > > >> > + .endif > > >> > + vse8.v v0, (\dst) > > >> > + vse8.v v2, (t0) > > >> > + vse8.v v4, (t1) > > >> > + vse8.v v6, (t2) > > >> > + add \dst, \dst_stride, t2 > > >> > + bnez \size, 1b > > >> > + ret > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x > > >> > + sub t0, \src, \src_stride > > >> > + sub t1, t0, \src_stride > > >> > + vle8.v v2, (\src) > > >> > + vle8.v v1, (t0) > > >> > + vle8.v v0, (t1) > > >> > + add t0, \src, \src_stride > > >> > + add t1, t0, \src_stride > > >> > + add \src, t1, \src_stride > > >> > + vle8.v v3, (t0) > > >> > + vle8.v v4, (t1) > > >> > +1: > > >> > + add t0, \src_stride, \src > > >> > + add t1, \src_stride, t0 > > >> > + add t2, \src_stride, t1 > > >> > + vle8.v v5, (\src) > > >> > + vle8.v v6, (t0) > > >> > + vle8.v v7, (t1) > > >> > + vle8.v v8, (t2) > > >> > + add \src, \src_stride, t2 > > >> > + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 > > >> > + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 > > >> > + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 > > >> > + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 > > >> > + addi \size, \size, -4 > > >> > + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 > > >> > + .ifnb \src2 > > >> > + add t0, \src2_stride, \src2 > > >> > + add t1, \src2_stride, t0 > > >> > + add t2, \src2_stride, t1 > > >> > + vle8.v v9, (\src2) > > >> > + vle8.v v10, (t0) > > >> > + vle8.v v11, (t1) > > >> > + vle8.v v12, (t2) > > >> > + add \src2, \src2_stride, t2 > > >> > + vaaddu.vv v24, v24, v9 > > >> > + vaaddu.vv v26, v26, v10 > > >> > + vaaddu.vv v28, v28, v11 > > >> > + vaaddu.vv v30, v30, v12 > > >> > + .endif > > >> > + add t0, \dst_stride, \dst > > >> > + add t1, \dst_stride, t0 > > >> > + add t2, \dst_stride, t1 > > >> > + .ifc \op, avg > > >> > + vle8.v v9, (\dst) > > >> > + vle8.v v10, (t0) > > >> > + vle8.v v11, (t1) > > >> > + vle8.v v12, (t2) > > >> > + vaaddu.vv v24, v24, v9 > > >> > + vaaddu.vv v26, v26, v10 > > >> > + vaaddu.vv v28, v28, v11 > > >> > + 
vaaddu.vv v30, v30, v12 > > >> > + .endif > > >> > + vse8.v v24, (\dst) > > >> > + vse8.v v26, (t0) > > >> > + vse8.v v28, (t1) > > >> > + vse8.v v30, (t2) > > >> > + add \dst, \dst_stride, t2 > > >> > + vmv.v.v v0, v4 > > >> > + vmv.v.v v1, v5 > > >> > + vmv.v.v v2, v6 > > >> > + vmv.v.v v3, v7 > > >> > + vmv.v.v v4, v8 > > >> > > >> At this point, any vector move without rationale is an automatic -1 from > > >> me. > > > > > >There is a rationale; > > > > I can't see any rationale in the comments or description. > > > > >the vectors are reused for the next pass of the (unrolled) vertical > > >convolution. The only way to eliminate them would be to > > >make a special path for 8x8 that urolls all 8 lines to avoid this vector > > >move, > > > > Typically you only need to unroll 2x to eliminate vector copies. And it's not > > ONE vector copy, it's FOUR vector copies. Without actual numbers, I don't > > trust that the performance loss is negligible. > > How do you implement a vertical convolution without either redundant loads or > vector moves? > > > > > >but IMO the gain in performance does not justify the increase in complexity > > >and binary size. > > > > >> > > >> > + bnez \size, 1b > > >> > + ret > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x > > >> > + sub t0, \src, \src_stride > > >> > + sub t1, t0, \src_stride > > >> > + lowpass_h v4, \src, \w0, \w1 > > >> > + lowpass_h v2, t0, \w0, \w1 > > >> > + lowpass_h v0, t1, \w0, \w1 > > >> > + add t0, \src, \src_stride > > >> > + add t1, t0, \src_stride > > >> > + add \src, t1, \src_stride > > >> > + lowpass_h v6, t0, \w0, \w1 > > >> > + lowpass_h v8, t1, \w0, \w1 > > >> > +1: > > >> > + add t0, \src_stride, \src > > >> > + add t1, \src_stride, t0 > > >> > + add t2, \src_stride, t1 > > >> > + lowpass_h v10, \src, \w0, \w1 > > >> > + lowpass_h v12, t0, \w0, \w1 > > >> > + lowpass_h v14, t1, \w0, \w1 > > >> > + lowpass_h v16, t2, \w0, \w1 > > >> > + vsetvli zero, zero, e16, \lmul2, ta, ma > > >> > + addi \size, \size, -4 > > >> > + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, > > >> > signed=1 > > >> > + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, > > >> > v12, signed=1 > > >> > + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, > > >> > v12, v14, signed=1 > > >> > + vnclip.wi v0, v20, 10 > > >> > + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, > > >> > signed= > > >> > + vnclip.wi v2, v24, 10 > > >> > + vnclip.wi v4, v28, 10 > > >> > + vnclip.wi v6, v20, 10 > > >> > + vmax.vx v18, v0, zero > > >> > + vmax.vx v20, v2, zero > > >> > + vmax.vx v22, v4, zero > > >> > + vmax.vx v24, v6, zero > > >> > + vmv.v.v v0, v8 > > >> > + vmv.v.v v2, v10 > > >> > + vmv.v.v v4, v12 > > >> > + vmv.v.v v6, v14 > > >> > + vmv.v.v v8, v16 > > >> > + add \src, \src_stride, t2 > > >> > + vsetvli zero, zero, e8, \lmul, ta, ma > > >> > + vnclipu.wi v18, v18, 0 > > >> > + vnclipu.wi v20, v20, 0 > > >> > + vnclipu.wi v22, v22, 0 > > >> > + vnclipu.wi v24, v24, 0 > > >> > + .ifnb \src2 > > >> > + add t0, \src2_stride, \src2 > > >> > + add t1, \src2_stride, t0 > > >> > + add t2, \src2_stride, t1 > > >> > + vle8.v v26, (\src2) > > >> > + vle8.v v27, (t0) > > >> > + vle8.v v28, (t1) > > >> > + vle8.v v29, (t2) > > >> > + add \src2, \src2_stride, t2 > > >> > + vaaddu.vv v18, v18, v26 > > >> > + vaaddu.vv v20, v20, v27 > > >> > + vaaddu.vv v22, v22, v28 > > >> > + vaaddu.vv v24, v24, v29 > > >> > + .endif > > >> > + add t0, \dst_stride, \dst > > >> > + add t1, \dst_stride, t0 > > >> > + add t2, \dst_stride, t1 > > >> > + 
.ifc \op, avg > > >> > + vle8.v v26, (\dst) > > >> > + vle8.v v27, (t0) > > >> > + vle8.v v28, (t1) > > >> > + vle8.v v29, (t2) > > >> > + vaaddu.vv v18, v18, v26 > > >> > + vaaddu.vv v20, v20, v27 > > >> > + vaaddu.vv v22, v22, v28 > > >> > + vaaddu.vv v24, v24, v29 > > >> > + .endif > > >> > + vse8.v v18, (\dst) > > >> > + vse8.v v20, (t0) > > >> > + vse8.v v22, (t1) > > >> > + vse8.v v24, (t2) > > >> > + add \dst, \dst_stride, t2 > > >> > + bnez \size, 1b > > >> > + ret > > >> > +endfunc > > >> > +.endm > > >> > + > > >> > +/* Note: We could possibly specialize for the width 8 / width 4 cases by > > >> > + loading 32 bit integers, but this makes the convolutions more > > >> > complicated + to implement, so it's not necessarily any faster. */ > > >> > + > > >> > +.macro h264_qpel lmul, lmul2 > > >> > + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, > > t5, > > >> > t6 > > >> > + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, > > >> > t5, t6, a5, a6 > > >> > + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, > > >> > a2, a3, a4, t5, t6 > > >> > + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, > > >> > a1, a2, a3, a4, t5, t6, a5, a6 > > >> > +.endm > > >> > + > > >> > + h264_qpel m1, m2 > > >> > + h264_qpel mf2, m1 > > >> > + h264_qpel mf4, mf2 > > >> > + h264_qpel mf8, mf4 > > >> > + > > >> > +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, > > >> > src_stride, size, w0, w1, src2, src2_stride, tmp > > >> > +func > > >> > ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, > > >> > + j ff_\op\()_h264_qpel_pixels > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + mv \src2, \src > > >> > + mv \src2_stride, \src_stride > > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + addi \src2, \src, 1 > > >> > + mv \src2_stride, \src_stride > > >> > + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + mv \src2, \src > > >> > + mv \src2_stride, \src_stride > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + add \src2, \src, \src_stride > > >> > + mv \src2_stride, \src_stride > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > > >> It's all but impossible to tell if 
spilling is actually necessary when you > > >> alias registers like this. > > >> > > >> > + mv \tmp, ra > > >> > > >> Use t0 for subprocedure return. See specs. > > > > > >The subprocedure is sometimes the main procedure. > > > > Sure does not seem that way, but again, the code is so damn hard to follow. > > > > >And in any case, we use t0 > > >inside the subprocedure. > > > > Then fix it. > > > > > > > >> > > >> > + mv \src_stride, \dst_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > > >> You can use jal here > > > > > >Shouldn't the assembler be responsible for inserting the correct procedure > > >call instruction? > > > > Doesn't work here (GNU as 2.43.1). > > > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + addi \src, \src, 1 > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + add \src, \src, \src_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + add \src, \src, \src_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + addi \src, \src, 1 > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + mv \src_stride, \dst_stride > > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li 
\dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + add \src, \src, \src_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_h_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x > > >> > + lowpass_init \lmul, \sizei, \size, \w0, \w1 > > >> > + push \dst, \src > > >> > + mv \tmp, ra > > >> > + mv \src_stride, \dst_stride > > >> > + addi \dst, sp, -(\sizei * \sizei) > > >> > + li \dst_stride, \sizei > > >> > + call ff_put_h264_qpel_v_lowpass_\lmul > > >> > + addi \src2, sp, -(\sizei * \sizei) > > >> > + mv \src2_stride, \dst_stride > > >> > + pop \dst, \src > > >> > + mv \dst_stride, \src_stride > > >> > + li \size, \sizei > > >> > + mv ra, \tmp > > >> > + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 > > >> > +endfunc > > >> > + > > >> > +func ff_\op\()_h264_qpel\sizei\()_mc > > > > > > > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > To unsubscribe, visit link above, or email > > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
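Before the diff itself, a scalar model of the 2-D (mc22-family) path may help when reading the hv_lowpass functions below: the horizontal six-tap runs first without clipping and produces 16-bit signed intermediates, so the vertical pass needs signed widening MACs, and the result is rounded with +512, shifted right by 10 and clamped to [0,255], which is how I read the vnclip.wi 10 / vmax / vnclipu.wi 0 sequence. A sketch in C assuming block sizes up to 16x16 and the usual 2-pixel/2-row margins around src; all names are mine:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Unclipped horizontal 6-tap on unsigned pixels; the result fits in int16_t. */
static int filter6_h(const uint8_t *p)
{
    return p[-2] + p[3] + 20 * (p[0] + p[1]) - 5 * (p[-1] + p[2]);
}

static void put_hv_lowpass(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int w, int h)
{
    int16_t tmp[(16 + 5) * 16];   /* h + 5 rows of horizontal results, w, h <= 16 */

    for (int y = -2; y < h + 3; y++)
        for (int x = 0; x < w; x++)
            tmp[(y + 2) * w + x] = (int16_t)filter6_h(src + y * src_stride + x);

    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            const int16_t *t = &tmp[(y + 2) * w + x];
            int acc = t[-2 * w] + t[3 * w]
                    + 20 * (t[0] + t[w])
                    -  5 * (t[-w] + t[2 * w]);
            dst[y * dst_stride + x] = clip_uint8((acc + 512) >> 10);
        }
}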
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c index 65fef03304..faca1e8953 100644 --- a/libavcodec/h264qpel.c +++ b/libavcodec/h264qpel.c @@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth) ff_h264qpel_init_arm(c, bit_depth); #elif ARCH_PPC ff_h264qpel_init_ppc(c, bit_depth); +#elif ARCH_RISCV + ff_h264qpel_init_riscv(c, bit_depth); #elif ARCH_X86 ff_h264qpel_init_x86(c, bit_depth); #elif ARCH_MIPS diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h index 0259e8de23..24baf826f9 100644 --- a/libavcodec/h264qpel.h +++ b/libavcodec/h264qpel.h @@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth); +void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth); void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth); diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index b3a6b588c9..d4276521f3 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -33,6 +33,8 @@ RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ riscv/h264idct_rvv.o +OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o +RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o diff --git a/libavcodec/riscv/h264qpel_init.c b/libavcodec/riscv/h264qpel_init.c new file mode 100644 index 0000000000..69a1345447 --- /dev/null +++ b/libavcodec/riscv/h264qpel_init.c @@ -0,0 +1,113 @@ +/* + * RISC-V optimised DSP functions + * Copyright (c) 2024 Niklas Haas + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/riscv/cpu.h" +#include "libavcodec/h264qpel.h" + +#define DECL_QPEL_OPS(OP, SIZE, EXT) \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + +DECL_QPEL_OPS(put, 16, rvv256) +DECL_QPEL_OPS(put, 8, rvv256) +DECL_QPEL_OPS(put, 4, rvv256) + +DECL_QPEL_OPS(avg, 16, rvv256) +DECL_QPEL_OPS(avg, 8, rvv256) +DECL_QPEL_OPS(avg, 4, rvv256) + +DECL_QPEL_OPS(put, 16, rvv) +DECL_QPEL_OPS(put, 8, rvv) +DECL_QPEL_OPS(put, 4, rvv) + +DECL_QPEL_OPS(avg, 16, rvv) +DECL_QPEL_OPS(avg, 8, rvv) +DECL_QPEL_OPS(avg, 4, rvv) + +#define SET_QPEL_FNS(OP, IDX, SIZE, EXT) \ +do { \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 0] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 1] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 2] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 3] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 4] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 5] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 6] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 7] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 8] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 9] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT; \ 
+ c->OP ## _h264_qpel_pixels_tab[IDX][10] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][11] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][12] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][13] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][14] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][15] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT; \ +} while (0) + +av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth) +{ +#if HAVE_RVV + int flags = av_get_cpu_flags(); + if (flags & AV_CPU_FLAG_RVV_I32) { + const int vlen = 8 * ff_get_rv_vlenb(); + + switch (bit_depth) { + case 8: + if (vlen >= 256) { + SET_QPEL_FNS(put, 0, 16, rvv256); + SET_QPEL_FNS(put, 1, 8, rvv256); + SET_QPEL_FNS(put, 2, 4, rvv256); + + SET_QPEL_FNS(avg, 0, 16, rvv256); + SET_QPEL_FNS(avg, 1, 8, rvv256); + SET_QPEL_FNS(avg, 2, 4, rvv256); + } else if (vlen >= 128) { + SET_QPEL_FNS(put, 0, 16, rvv); + SET_QPEL_FNS(put, 1, 8, rvv); + SET_QPEL_FNS(put, 2, 4, rvv); + + SET_QPEL_FNS(avg, 0, 16, rvv); + SET_QPEL_FNS(avg, 1, 8, rvv); + SET_QPEL_FNS(avg, 2, 4, rvv); + } + break; + } + } +#endif +} diff --git a/libavcodec/riscv/h264qpel_rvv.S b/libavcodec/riscv/h264qpel_rvv.S new file mode 100644 index 0000000000..7713372f23 --- /dev/null +++ b/libavcodec/riscv/h264qpel_rvv.S @@ -0,0 +1,554 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Niklas Haas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "libavutil/riscv/asm.S" + +.macro vnclipsu.wi shifti, lmul, lmul2, vregs:vararg + vsetvli zero, zero, e16, \lmul2, ta, ma + .irp x, \vregs + vmax.vx \x, \x, zero + .endr + vsetvli zero, zero, e8, \lmul, ta, ma + .irp x, \vregs + vnclipu.wi \x, \x, \shifti + .endr +.endm + +.macro lowpass_init lmul, sizei, size, w0, w1, backup + vsetivli zero, \sizei, e8, \lmul, ta, ma + csrwi vxrm, 0 + li \size, \sizei + .ifnb \w0 + li \w0, 20 + li \w1, -5 + .endif +.endm + + /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */ +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4 + addi \tmp, \src, 3 + lbu \tmp2, 2(\src) + vle8.v v31, (\tmp) + lbu \tmp, 1(\src) + vslide1up.vx v30, v31, \tmp2 + lbu \tmp2, 0(\src) + vslide1up.vx v29, v30, \tmp + lbu \tmp, -1(\src) + vslide1up.vx v28, v29, \tmp2 + lbu \tmp2, -2(\src) + vslide1up.vx v27, v28, \tmp + vslide1up.vx v26, v27, \tmp2 + vwaddu.vv \vdst, v26, v31 + vwmaccu.vx \vdst, \w0, v28 + vwmaccu.vx \vdst, \w0, v29 + vwmaccsu.vx \vdst, \w1, v27 + vwmaccsu.vx \vdst, \w1, v30 +.endm + + /* output is unclipped */ +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, signed=0 + .if \signed + vwadd.vv \vdst, \vsrc0, \vsrc5 + vwmacc.vx \vdst, \w0, \vsrc2 + vwmacc.vx \vdst, \w0, \vsrc3 + vwmacc.vx \vdst, \w1, \vsrc1 + vwmacc.vx \vdst, \w1, \vsrc4 + .else + vwaddu.vv \vdst, \vsrc0, \vsrc5 + vwmaccu.vx \vdst, \w0, \vsrc2 + vwmaccu.vx \vdst, \w0, \vsrc3 + vwmaccsu.vx \vdst, \w1, \vsrc1 + vwmaccsu.vx \vdst, \w1, \vsrc4 + .endif +.endm + +.macro qpel_mc00 op, dst, src, stride, size +func ff_\op\()_h264_qpel_pixels, zve32x +1: + add t0, \stride, \src + add t1, \stride, t0 + add t2, \stride, t1 + vle8.v v0, (\src) + vle8.v v1, (t0) + vle8.v v2, (t1) + vle8.v v3, (t2) + addi \size, \size, -4 + add \src, \stride, t2 + add t0, \stride, \dst + add t1, \stride, t0 + add t2, \stride, t1 + .ifc \op, avg + vle8.v v4, (\dst) + vle8.v v5, (t0) + vle8.v v6, (t1) + vle8.v v7, (t2) + vaaddu.vv v0, v0, v4 + vaaddu.vv v1, v1, v5 + vaaddu.vv v2, v2, v6 + vaaddu.vv v3, v3, v7 + .endif + vse8.v v0, (\dst) + vse8.v v1, (t0) + vse8.v v2, (t1) + vse8.v v3, (t2) + add \dst, \stride, t2 + bnez \size, 1b + ret +endfunc +.endm + + qpel_mc00 put, a0, a1, a2, a4 + qpel_mc00 avg, a0, a1, a2, a4 + +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride +func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x +1: + add t0, \src_stride, \src + add t1, \src_stride, t0 + add t2, \src_stride, t1 + lowpass_h v0, \src, \w0, \w1 + lowpass_h v2, t0, \w0, \w1 + lowpass_h v4, t1, \w0, \w1 + lowpass_h v6, t2, \w0, \w1 + add \src, \src_stride, t2 + addi \size, \size, -4 + vnclipsu.wi 5, \lmul, \lmul2, v0, v2, v4, v6 + .ifnb \src2 + add t0, \src2_stride, \src2 + add t1, \src2_stride, t0 + add t2, \src2_stride, t1 + vle8.v v8, (\src2) + vle8.v v10, (t0) + vle8.v v12, (t1) + vle8.v v14, (t2) + add \src2, \dst_stride, t2 + vaaddu.vv v0, v0, v8 + vaaddu.vv v2, v2, v10 + vaaddu.vv v4, v4, v12 + vaaddu.vv v6, v6, v14 + .endif + add t0, \dst_stride, \dst + add t1, \dst_stride, t0 + add t2, \dst_stride, t1 + .ifc \op, avg + vle8.v v1, (\dst) + vle8.v v3, (t0) + vle8.v v5, (t1) + vle8.v v7, (t2) + vaaddu.vv v0, v0, v1 + vaaddu.vv v2, v2, v3 + vaaddu.vv v4, v4, v5 + vaaddu.vv v6, v6, v7 + .endif + vse8.v v0, (\dst) + vse8.v v2, (t0) + vse8.v v4, (t1) + vse8.v v6, (t2) + add \dst, \dst_stride, t2 + bnez \size, 1b + ret +endfunc + +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x + sub t0, \src, \src_stride + sub t1, t0, 
\src_stride + vle8.v v2, (\src) + vle8.v v1, (t0) + vle8.v v0, (t1) + add t0, \src, \src_stride + add t1, t0, \src_stride + add \src, t1, \src_stride + vle8.v v3, (t0) + vle8.v v4, (t1) +1: + add t0, \src_stride, \src + add t1, \src_stride, t0 + add t2, \src_stride, t1 + vle8.v v5, (\src) + vle8.v v6, (t0) + vle8.v v7, (t1) + vle8.v v8, (t2) + add \src, \src_stride, t2 + lowpass_v \w0, \w1, v24, v0, v1, v2, v3, v4, v5 + lowpass_v \w0, \w1, v26, v1, v2, v3, v4, v5, v6 + lowpass_v \w0, \w1, v28, v2, v3, v4, v5, v6, v7 + lowpass_v \w0, \w1, v30, v3, v4, v5, v6, v7, v8 + addi \size, \size, -4 + vnclipsu.wi 5, \lmul, \lmul2, v24, v26, v28, v30 + .ifnb \src2 + add t0, \src2_stride, \src2 + add t1, \src2_stride, t0 + add t2, \src2_stride, t1 + vle8.v v9, (\src2) + vle8.v v10, (t0) + vle8.v v11, (t1) + vle8.v v12, (t2) + add \src2, \src2_stride, t2 + vaaddu.vv v24, v24, v9 + vaaddu.vv v26, v26, v10 + vaaddu.vv v28, v28, v11 + vaaddu.vv v30, v30, v12 + .endif + add t0, \dst_stride, \dst + add t1, \dst_stride, t0 + add t2, \dst_stride, t1 + .ifc \op, avg + vle8.v v9, (\dst) + vle8.v v10, (t0) + vle8.v v11, (t1) + vle8.v v12, (t2) + vaaddu.vv v24, v24, v9 + vaaddu.vv v26, v26, v10 + vaaddu.vv v28, v28, v11 + vaaddu.vv v30, v30, v12 + .endif + vse8.v v24, (\dst) + vse8.v v26, (t0) + vse8.v v28, (t1) + vse8.v v30, (t2) + add \dst, \dst_stride, t2 + vmv.v.v v0, v4 + vmv.v.v v1, v5 + vmv.v.v v2, v6 + vmv.v.v v3, v7 + vmv.v.v v4, v8 + bnez \size, 1b + ret +endfunc + +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x + sub t0, \src, \src_stride + sub t1, t0, \src_stride + lowpass_h v4, \src, \w0, \w1 + lowpass_h v2, t0, \w0, \w1 + lowpass_h v0, t1, \w0, \w1 + add t0, \src, \src_stride + add t1, t0, \src_stride + add \src, t1, \src_stride + lowpass_h v6, t0, \w0, \w1 + lowpass_h v8, t1, \w0, \w1 +1: + add t0, \src_stride, \src + add t1, \src_stride, t0 + add t2, \src_stride, t1 + lowpass_h v10, \src, \w0, \w1 + lowpass_h v12, t0, \w0, \w1 + lowpass_h v14, t1, \w0, \w1 + lowpass_h v16, t2, \w0, \w1 + vsetvli zero, zero, e16, \lmul2, ta, ma + addi \size, \size, -4 + lowpass_v \w0, \w1, v20, v0, v2, v4, v6, v8, v10, signed=1 + lowpass_v \w0, \w1, v24, v2, v4, v6, v8, v10, v12, signed=1 + lowpass_v \w0, \w1, v28, v4, v6, v8, v10, v12, v14, signed=1 + vnclip.wi v0, v20, 10 + lowpass_v \w0, \w1, v20, v6, v8, v10, v12, v14, v16, signed=1 + vnclip.wi v2, v24, 10 + vnclip.wi v4, v28, 10 + vnclip.wi v6, v20, 10 + vmax.vx v18, v0, zero + vmax.vx v20, v2, zero + vmax.vx v22, v4, zero + vmax.vx v24, v6, zero + vmv.v.v v0, v8 + vmv.v.v v2, v10 + vmv.v.v v4, v12 + vmv.v.v v6, v14 + vmv.v.v v8, v16 + add \src, \src_stride, t2 + vsetvli zero, zero, e8, \lmul, ta, ma + vnclipu.wi v18, v18, 0 + vnclipu.wi v20, v20, 0 + vnclipu.wi v22, v22, 0 + vnclipu.wi v24, v24, 0 + .ifnb \src2 + add t0, \src2_stride, \src2 + add t1, \src2_stride, t0 + add t2, \src2_stride, t1 + vle8.v v26, (\src2) + vle8.v v27, (t0) + vle8.v v28, (t1) + vle8.v v29, (t2) + add \src2, \src2_stride, t2 + vaaddu.vv v18, v18, v26 + vaaddu.vv v20, v20, v27 + vaaddu.vv v22, v22, v28 + vaaddu.vv v24, v24, v29 + .endif + add t0, \dst_stride, \dst + add t1, \dst_stride, t0 + add t2, \dst_stride, t1 + .ifc \op, avg + vle8.v v26, (\dst) + vle8.v v27, (t0) + vle8.v v28, (t1) + vle8.v v29, (t2) + vaaddu.vv v18, v18, v26 + vaaddu.vv v20, v20, v27 + vaaddu.vv v22, v22, v28 + vaaddu.vv v24, v24, v29 + .endif + vse8.v v18, (\dst) + vse8.v v20, (t0) + vse8.v v22, (t1) + vse8.v v24, (t2) + add \dst, \dst_stride, t2 + bnez \size, 1b + ret +endfunc +.endm + +/* Note: We 
could possibly specialize for the width 8 / width 4 cases by + loading 32 bit integers, but this makes the convolutions more complicated + to implement, so it's not necessarily any faster. */ + +.macro h264_qpel lmul, lmul2 + qpel_lowpass put, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6 + qpel_lowpass put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6 + qpel_lowpass avg, , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6 + qpel_lowpass avg, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6 +.endm + + h264_qpel m1, m2 + h264_qpel mf2, m1 + h264_qpel mf4, mf2 + h264_qpel mf8, mf4 + +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride, tmp +func ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x + lowpass_init \lmul, \sizei, \size, + j ff_\op\()_h264_qpel_pixels +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + mv \src2, \src + mv \src2_stride, \src_stride + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + j ff_\op\()_h264_qpel_h_lowpass_\lmul\() +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + addi \src2, \src, 1 + mv \src2_stride, \src_stride + j ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + mv \src2, \src + mv \src2_stride, \src_stride + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + j ff_\op\()_h264_qpel_v_lowpass_\lmul +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + add \src2, \src, \src_stride + mv \src2_stride, \src_stride + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + addi \src, \src, 1 + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + add \src, \src, \src_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv 
ra, \tmp + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + add \src, \src, \src_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + addi \src, \src, 1 + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + mv \src_stride, \dst_stride + j ff_\op\()_h264_qpel_hv_lowpass_\lmul +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + add \src, \src, \src_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_h_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + mv \src_stride, \dst_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_v_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 +endfunc + +func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x + lowpass_init \lmul, \sizei, \size, \w0, \w1 + push \dst, \src + mv \tmp, ra + addi \src, \src, 1 + mv \src_stride, \dst_stride + addi \dst, sp, -(\sizei * \sizei) + li \dst_stride, \sizei + call ff_put_h264_qpel_v_lowpass_\lmul + addi \src2, sp, -(\sizei * \sizei) + mv \src2_stride, \dst_stride + pop \dst, \src + mv \dst_stride, \src_stride + li \size, \sizei + mv ra, \tmp + j ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2 +endfunc +.endm + + ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns put, mf4, 8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns put, mf8, 4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + + ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns avg, mf4, 8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns avg, mf8, 4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + + ff_h264_qpel_fns put, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns put, mf2, 8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns put, mf4, 4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + + ff_h264_qpel_fns avg, m1, 16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7 + ff_h264_qpel_fns avg, mf2, 8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, 
a7 + ff_h264_qpel_fns avg, mf4, 4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
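
For reference, the filter that the lowpass_h and lowpass_v macros implement is the standard H.264 six-tap (1, -5, 20, 20, -5, 1) luma interpolation filter: w0 = 20, w1 = -5, the widened sum is narrowed with a rounding right shift by 5 (or by 10 in the hv path, which runs the vertical pass over the un-normalized 16-bit horizontal results) and clipped to 8 bits. Below is a minimal scalar C sketch of one horizontal output row under those assumptions; the function and variable names are illustrative only and not part of the patch or of FFmpeg's API.

#include <stdint.h>

/* Illustrative scalar reference for one row of the horizontal six-tap
 * lowpass filter (half-pel position): weight, round with +16, shift by 5,
 * clip to 8 bits. This mirrors lowpass_h followed by the vnclipsu.wi 5
 * step above; it is a sketch, not the patch's implementation. */
static void lowpass_h_ref(uint8_t *dst, const uint8_t *src, int width)
{
    for (int x = 0; x < width; x++) {
        int sum = src[x - 2] + src[x + 3]
                + 20 * (src[x]     + src[x + 1])
                -  5 * (src[x - 1] + src[x + 2]);
        sum = (sum + 16) >> 5;
        dst[x] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
    }
}

The quarter-pel cases (mc11, mc31, mc13, ...) are then composed from these passes: one lowpass pass is written to a stack temporary, and the second pass averages against it through the _l2 variants, which is what the push/pop sequences and the \src2 / \src2_stride plumbing in the mc functions above set up.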