Message ID | 20231216085056.4939-1-remi@remlab.net |
---|---|
State | Accepted |
Commit | 419145c11bb3310539eb975751291bcf023e9170 |
Headers | show |
Series | [FFmpeg-devel] lavc/vc1dsp: fix R-V V vector lengths | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Sat, 16 Dec 2023, Rémi Denis-Courmont wrote:

> The 8x4 and 4x4 use a needlessly large multiplier (unless/until we care
> about embedded 64-bit-vector hardware). This is merely suboptimal.
>
> The 8x4 case also uses an incorrect vector length, which leads to incorrect
> behaviour on future/hypothetical hardware with 256-bit or larger vectors.
>
> Pointed-out-by: Martin Storsjö <martin@martin.st>
> ---
>  libavcodec/riscv/vc1dsp_rvv.S | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 1a503ecc87..4a00945ead 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -68,7 +68,7 @@ endfunc
>
>  func ff_vc1_inv_trans_8x4_dc_rvv, zve64x
>          lh            t2, (a2)
> -        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsetivli      zero, 4, e8, mf4, ta, ma
>          vlse64.v      v0, (a0), a1
>          sh1add        t2, t2, t2
>          addi          t2, t2, 1
> @@ -84,14 +84,14 @@ func ff_vc1_inv_trans_8x4_dc_rvv, zve64x
>          vmax.vx       v4, v4, zero
>          vsetvli       zero, zero, e8, m2, ta, ma
>          vnclipu.wi    v0, v4, 0
> -        vsetivli      zero, 8, e8, mf2, ta, ma
> +        vsetivli      zero, 4, e8, mf4, ta, ma
>          vsse64.v      v0, (a0), a1
>          ret
>  endfunc
>
>  func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          lh            t2, (a2)
> -        vsetivli      zero, 4, e8, mf2, ta, ma
> +        vsetivli      zero, 4, e8, mf4, ta, ma
>          vlse32.v      v0, (a0), a1
>          slli          t1, t2, 4
>          add           t2, t2, t1
> @@ -107,7 +107,7 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>          vmax.vx       v2, v2, zero
>          vsetvli       zero, zero, e8, m1, ta, ma
>          vnclipu.wi    v0, v2, 0
> -        vsetivli      zero, 4, e8, mf2, ta, ma
> +        vsetivli      zero, 4, e8, mf4, ta, ma
>          vsse32.v      v0, (a0), a1
>          ret
>  endfunc
> --
> 2.43.0

The fix sounds reasonable (although I can't say I've followed the code). Anyway, I've tested that it does indeed fix the checkasm test, so feel free to push - thanks!

// Martin
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 1a503ecc87..4a00945ead 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -68,7 +68,7 @@ endfunc
 
 func ff_vc1_inv_trans_8x4_dc_rvv, zve64x
         lh            t2, (a2)
-        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsetivli      zero, 4, e8, mf4, ta, ma
         vlse64.v      v0, (a0), a1
         sh1add        t2, t2, t2
         addi          t2, t2, 1
@@ -84,14 +84,14 @@ func ff_vc1_inv_trans_8x4_dc_rvv, zve64x
         vmax.vx       v4, v4, zero
         vsetvli       zero, zero, e8, m2, ta, ma
         vnclipu.wi    v0, v4, 0
-        vsetivli      zero, 8, e8, mf2, ta, ma
+        vsetivli      zero, 4, e8, mf4, ta, ma
         vsse64.v      v0, (a0), a1
         ret
 endfunc
 
 func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         lh            t2, (a2)
-        vsetivli      zero, 4, e8, mf2, ta, ma
+        vsetivli      zero, 4, e8, mf4, ta, ma
         vlse32.v      v0, (a0), a1
         slli          t1, t2, 4
         add           t2, t2, t1
@@ -107,7 +107,7 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
         vmax.vx       v2, v2, zero
         vsetvli       zero, zero, e8, m1, ta, ma
         vnclipu.wi    v0, v2, 0
-        vsetivli      zero, 4, e8, mf2, ta, ma
+        vsetivli      zero, 4, e8, mf4, ta, ma
         vsse32.v      v0, (a0), a1
         ret
 endfunc