diff mbox series

[FFmpeg-devel,1/2] lavc/vc1dsp: match C block content in inv_trans_8x4_rvv

Message ID 20240611145505.14934-1-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,1/2] lavc/vc1dsp: match C block content in inv_trans_8x4_rvv | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Rémi Denis-Courmont June 11, 2024, 2:55 p.m. UTC
This shifts the mid-point (after horizontal, before vertical) block
state of the transform to match the C code. This forces shifting 8
vectors of 4 elements instead of 4 vectors of 8 elements and is thus
slightly slower.
---
 libavcodec/riscv/vc1dsp_rvv.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
index 4b7ab33307..7e1fb84b0c 100644
--- a/libavcodec/riscv/vc1dsp_rvv.S
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -257,6 +257,9 @@  func ff_vc1_inv_trans_8x4_rvv, zve32x
         vsetivli    zero, 4, e16, mf2, ta, ma
         vlseg8e16.v v0, (a2)
         jal         t0, ff_vc1_inv_trans_8_rvv
+        .irp    n,0,1,2,3,4,5,6,7
+        vssra.vi    v\n, v\n, 3
+        .endr
         vsseg8e16.v v0, (a2)
         addi        a3, a2, 1 * 8 * 2
         vsetivli    zero, 8, e16, m1, ta, ma
@@ -266,10 +269,6 @@  func ff_vc1_inv_trans_8x4_rvv, zve32x
         addi        a5, a2, 3 * 8 * 2
         vle16.v     v2, (a4)
         vle16.v     v3, (a5)
-        .irp    n,0,1,2,3
-        # shift 4 vectors of 8 elems after transpose instead of 8 of 4
-        vssra.vi    v\n, v\n, 3
-        .endr
         li          t1, 7
         jal         t0, ff_vc1_inv_trans_4_rvv
         add         a3, a1, a0