@@ -21,12 +21,15 @@
#include "config.h"
#include <stdint.h>
+#include <string.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/riscv/cpu.h"
#include "libavcodec/h264dsp.h"
+extern const h264_weight_func ff_h264_weight_funcs_8_rvv[]; /* weight function table; its definition is in assembly, not in this C file */
+
void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@@ -60,6 +63,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
if (bit_depth == 8 && ff_rv_vlen_least(128)) {
+ memcpy(dsp->weight_h264_pixels_tab,
+ ff_h264_weight_funcs_8_rvv,
+ sizeof (dsp->weight_h264_pixels_tab));
+
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =
@@ -26,6 +26,83 @@
#include "libavutil/riscv/asm.S"
+func ff_h264_weight_pixels_simple_8_rvv, zve32x  # in-place pixel weighting, one row per pass; a6 = row width in bytes
+ csrwi vxrm, 0  # fixed-point rounding mode 0 (round-to-nearest-up)
+ sll a5, a5, a3  # pre-scale the addend by the final right-shift amount
+1:  # per-row loop; NOTE(review): a2..a5 presumably height, log2_denom, weight, offset - confirm against h264_weight_func
+ vsetvli zero, a6, e32, m4, ta, ma  # VL = a6; granted VL is discarded, so a whole row must fit in one pass
+ vle8.v v8, (a0)  # load one row of bytes
+ addi a2, a2, -1  # one fewer row left
+ vmv.v.x v16, a5  # seed the 32-bit accumulators with the scaled addend
+ vsetvli zero, zero, e16, m2, ta, ma  # same VL, 16-bit elements
+ vzext.vf2 v24, v8  # zero-extend pixels from 8 to 16 bits
+ vwmaccsu.vx v16, a4, v24  # acc += signed a4 * unsigned pixel (widening to 32 bits)
+ vnclip.wi v16, v16, 0  # narrow 32 -> 16 bits with signed saturation, no shift
+ vmax.vx v16, v16, zero  # clamp negative results to 0
+ vsetvli zero, zero, e8, m1, ta, ma  # same VL, 8-bit elements
+ vnclipu.wx v8, v16, a3  # rounding right shift by a3, narrow to 8 bits with unsigned saturation
+ vse8.v v8, (a0)  # write the row back over the input
+ add a0, a0, a1  # step to the next row (a1 = byte stride)
+ bnez a2, 1b  # repeat until the row count reaches zero
+
+ ret
+endfunc
+
+func ff_h264_weight_pixels_8_rvv, zve32x  # in-place pixel weighting, one column of rows per pass; a6 = width in bytes
+ csrwi vxrm, 0  # fixed-point rounding mode 0 (round-to-nearest-up)
+ sll a5, a5, a3  # pre-scale the addend by the final right-shift amount
+1:  # outer loop: one chunk of up to VLMAX rows
+ mv t0, a0  # t0 = cursor over the columns of this chunk
+ mv t6, a6  # t6 = columns left in this chunk
+2:  # inner loop: one column per pass
+ vsetvli t2, a2, e32, m8, ta, ma  # t2 = VL = rows handled this chunk (at most the a2 rows left)
+ vlse8.v v8, (t0), a1  # strided load: one byte from each of VL rows, a1 bytes apart
+ addi t6, t6, -1  # one fewer column left
+ vmv.v.x v16, a5  # seed the 32-bit accumulators with the scaled addend
+ vsetvli zero, zero, e16, m4, ta, ma  # same VL, 16-bit elements
+ vzext.vf2 v24, v8  # zero-extend pixels from 8 to 16 bits
+ vwmaccsu.vx v16, a4, v24  # acc += signed a4 * unsigned pixel (widening to 32 bits)
+ vnclip.wi v16, v16, 0  # narrow 32 -> 16 bits with signed saturation, no shift
+ vmax.vx v16, v16, zero  # clamp negative results to 0
+ vsetvli zero, zero, e8, m2, ta, ma  # same VL, 8-bit elements
+ vnclipu.wx v8, v16, a3  # rounding right shift by a3, narrow to 8 bits with unsigned saturation
+ vsse8.v v8, (t0), a1  # strided store back over the same column
+ addi t0, t0, 1  # move one byte right to the next column
+ bnez t6, 2b  # repeat for every column
+
+ mul t3, a1, t2  # t3 = byte distance covered by the rows just processed
+ sub a2, a2, t2  # rows still to do
+ add a0, a0, t3  # advance the base pointer past the processed rows
+ bnez a2, 1b  # repeat until no rows remain
+
+ ret
+endfunc
+
+.irp w, 16, 8, 4, 2  # emit one entry point per width
+func ff_h264_weight_pixels\w\()_8_rvv, zve32x
+ li a6, \w  # hand the width to the shared body in a6
+ .if \w == 16
+ j ff_h264_weight_pixels_simple_8_rvv  # width 16: row-at-a-time body; plain jump, so the body returns to our caller
+ .else
+ j ff_h264_weight_pixels_8_rvv  # widths 8, 4, 2: column-at-a-time body; plain jump, so the body returns to our caller
+ .endif
+endfunc
+.endr
+
+ .global ff_h264_weight_funcs_8_rvv  # make the table symbol visible to other object files
+ .hidden ff_h264_weight_funcs_8_rvv  # hidden ELF visibility
+const ff_h264_weight_funcs_8_rvv  # table of entry-point addresses, ordered by width 16, 8, 4, 2
+ .irp w, 16, 8, 4, 2  # NOTE(review): entries are absolute code addresses; confirm the section chosen by const suits PIC builds
+#if __riscv_xlen == 32
+ .word ff_h264_weight_pixels\w\()_8_rvv  # 32-bit pointer
+#elif __riscv_xlen == 64
+ .dword ff_h264_weight_pixels\w\()_8_rvv  # 64-bit pointer
+#else
+ .qword ff_h264_weight_pixels\w\()_8_rvv  # 128-bit pointer
+#endif
+ .endr
+endconst
+
.variant_cc ff_h264_loop_filter_luma_8_rvv
func ff_h264_loop_filter_luma_8_rvv, zve32x
# p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13