diff mbox series

[FFmpeg-devel,4/4] lavc/h264dsp: update R-V V intra luma loop filter

Message ID 20240701170807.107018-4-remi@remlab.net
State New
Headers show
Series [FFmpeg-devel,RFC,1/4] lavc/h264_loopfilter: expose tc0_table (for checkasm) | expand

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/configure_x86 warning Failed to apply patch

Commit Message

Rémi Denis-Courmont July 1, 2024, 5:08 p.m. UTC
Note that the performance reported by checkasm is slightly worse.
This is expected since the assembly code is now doing more work: it loads
the bS values itself and uses them as indices to gather the tc0 values.
---
 libavcodec/riscv/h264dsp_init.c | 3 ++-
 libavcodec/riscv/h264dsp_rvv.S  | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index ab412a9924..9650cae66b 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -30,7 +30,8 @@ 
 void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
-                                      int alpha, int beta, int8_t *tc0);
+                                      int alpha, int beta, const int8_t *tc0,
+                                      const int16_t *bS);
 void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta, int8_t *tc0);
 
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 96a8a0a8a3..6bc5406ba3 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -126,9 +126,11 @@  func ff_h264_v_loop_filter_luma_8_rvv, zve32x
 endfunc
 
 func ff_h264_h_loop_filter_luma_8_rvv, zve32x
-        vsetivli    zero, 4, e32, m1, ta, ma
-        vle8.v      v4, (a4)
+        vsetivli    zero, 4, e8, mf4, ta, ma
+        vle16.v     v8, (a5)
         li          t0, 0x01010101
+        vluxei16.v  v4, (a4), v8
+        vsetivli    zero, 4, e32, m1, ta, ma
         vzext.vf4   v6, v4
         addi        a0, a0, -3
         vmul.vx     v6, v6, t0