diff mbox series

[FFmpeg-devel,3/5] lavc/h264dsp: correct VL and LMUL in idct_dc_add

Message ID 20240718193546.18939-3-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,1/5] lavc/h264dsp: factor some mostly identical R-V V code | expand

Commit Message

Rémi Denis-Courmont July 18, 2024, 7:35 p.m. UTC
T-Head C908 (cycles):
h264_idct4_dc_add_8bpp_c:        94.7
h264_idct4_dc_add_8bpp_rvv_i32:  55.0 (before)
h264_idct4_dc_add_8bpp_rvv_i32:  34.5 (after)
h264_idct4_dc_add_9bpp_c:        94.7
h264_idct4_dc_add_9bpp_rvv_i32:  43.5 (before)
h264_idct4_dc_add_9bpp_rvv_i32:  38.2 (after)
h264_idct4_dc_add_10bpp_c:       94.7
h264_idct4_dc_add_10bpp_rvv_i32: 43.5 (before)
h264_idct4_dc_add_10bpp_rvv_i32: 38.2 (after)
h264_idct4_dc_add_12bpp_c:       94.7
h264_idct4_dc_add_12bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_12bpp_rvv_i32: 38.5 (after)
h264_idct4_dc_add_14bpp_c:       94.7
h264_idct4_dc_add_14bpp_rvv_i32: 43.7 (before)
h264_idct4_dc_add_14bpp_rvv_i32: 38.5 (after)
---
 libavcodec/riscv/h264idct_rvv.S | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 37b27fc92a..2648e06aeb 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -416,22 +416,23 @@  endfunc
 .endr
 
 .macro idct_dc_add8 width
-func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
+func ff_h264_idct\width\()_dc_add_8_rvv, zve64x
 .if \width == 8
-        vsetivli        zero, \width, e16, m1, ta, ma
+        vsetivli        zero, \width, e8, mf2, ta, ma
 .else
-        vsetivli        zero, \width, e16, mf2, ta, ma
+        vsetivli        zero, \width, e8, mf4, ta, ma
 .endif
         lh              a3, 0(a1)
         addi            a3, a3, 32
         srai            a3, a3, 6
         sh              zero, 0(a1)
 .if \width == 8
+        li              a6, \width * \width
         vlse64.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m8, ta, ma
+        vsetvli         zero, a6, e16, m8, ta, ma
 .else
         vlse32.v        v24, (a0), a2
-        vsetvli         t0, zero, e16, m4, ta, ma
+        vsetivli        zero, \width * \width, e16, m2, ta, ma
 .endif
         vzext.vf2       v0, v24
         vadd.vx         v0, v0, a3
@@ -439,13 +440,14 @@  func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
 .if \width == 8
         vsetvli         zero, zero, e8, m4, ta, ma
 .else
-        vsetvli         zero, zero, e8, m2, ta, ma
+        vsetvli         zero, zero, e8, m1, ta, ma
 .endif
         vnclipu.wi      v24, v0, 0
-        vsetivli        zero, \width, e8, m1, ta, ma
 .if \width == 8
+        vsetivli        zero, \width, e8, mf2, ta, ma
         vsse64.v        v24, (a0), a2
 .else
+        vsetivli        zero, \width, e8, mf4, ta, ma
         vsse32.v        v24, (a0), a2
 .endif
         ret
@@ -457,7 +459,11 @@  idct_dc_add8 8
 
 .macro idct_dc_add width
 func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
+.if \width == 8
         vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
         lw              a3, 0(a1)
         addi            a3, a3, 32
         srai            a3, a3, 6
@@ -487,7 +493,11 @@  func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
         vadd.vx         v0, v0, a3
         vmax.vx         v0, v0, zero
         vmin.vx         v0, v0, a5
+.if \width == 8
         vsetivli        zero, \width, e16, m1, ta, ma
+.else
+        vsetivli        zero, \width, e16, mf2, ta, ma
+.endif
         vse16.v         v0, (a0)
         vse16.v         v1, (t4)
         vse16.v         v2, (t5)