@@ -23,6 +23,8 @@
#include "libavutil/cpu.h"
#include "libavcodec/mpegvideoencdsp.h"
+/* RVV try_8x8basis: the assembly reads 64 coefficients from each of rem,
+ * weight and basis, so all three array bounds are 64. */
+int ff_try_8x8basis_rvv(const int16_t rem[64], const int16_t weight[64],
+ const int16_t basis[64], int scale);
int ff_pix_sum_rvv(const uint8_t *pix, int line_size);
int ff_pix_norm1_rvv(const uint8_t *pix, int line_size);
@@ -32,10 +34,15 @@ av_cold void ff_mpegvideoencdsp_init_riscv(MpegvideoEncDSPContext *c,
#if HAVE_RVV
int flags = av_get_cpu_flags();
- if (flags & AV_CPU_FLAG_RVV_I64) {
- if ((flags & AV_CPU_FLAG_RVB) && ff_rv_vlen_least(128))
- c->pix_sum = ff_pix_sum_rvv;
- c->pix_norm1 = ff_pix_norm1_rvv;
+ /* Every RVV routine below is gated on AV_CPU_FLAG_RVV_I32. */
+ if (flags & AV_CPU_FLAG_RVV_I32) {
+ /* try_8x8basis additionally needs AV_CPU_FLAG_RVB. */
+ if (flags & AV_CPU_FLAG_RVB)
+ c->try_8x8basis = ff_try_8x8basis_rvv;
+
+ /* The pix_* routines are further gated on AV_CPU_FLAG_RVV_I64. */
+ if (flags & AV_CPU_FLAG_RVV_I64) {
+ /* pix_sum only: needs RVB and ff_rv_vlen_least(128)
+ * (presumably a vector length of at least 128 bits - confirm). */
+ if ((flags & AV_CPU_FLAG_RVB) && ff_rv_vlen_least(128))
+ c->pix_sum = ff_pix_sum_rvv;
+ /* Not covered by the unbraced `if` above: pix_norm1 is installed
+ * whenever AV_CPU_FLAG_RVV_I64 is set. */
+ c->pix_norm1 = ff_pix_norm1_rvv;
+ }
}
#endif
}
@@ -20,6 +20,41 @@
#include "libavutil/riscv/asm.S"
+/* Fixed-point shift amounts used by ff_try_8x8basis_rvv. */
+/* NOTE(review): presumably these mirror the C encoder's BASIS_SHIFT and
+ * RECON_SHIFT - confirm they are kept in sync. */
+.equ BASIS_SHIFT, 16
+.equ RECON_SHIFT, 6
+
+/*
+ * int ff_try_8x8basis_rvv(const int16_t rem[64], const int16_t weight[64],
+ *                         const int16_t basis[64], int scale)
+ *
+ * a0 = rem, a1 = weight, a2 = basis, a3 = scale; result returned in a0.
+ *
+ * For each of the 64 coefficients i:
+ *   t    = sat16(round((basis[i] * scale) >> (BASIS_SHIFT - RECON_SHIFT)))
+ *   b    = (int16)(rem[i] + t) >> RECON_SHIFT
+ *   acc += ((weight[i] * b) * (weight[i] * b)) >> 4      (32-bit lanes)
+ * and the function returns (sum of all acc lanes) >> 2.
+ *
+ * vl is chosen once (AVL = 64 at e32/m8) and reused for every iteration; the
+ * in-loop vsetvli only switches between e16/m4 and e32/m8, which share the
+ * same SEW/LMUL ratio. The loop therefore relies on vl dividing 64.
+ *
+ * NOTE(review): vwmul.vx at SEW=16 uses only the low 16 bits of scale.
+ */
+func ff_try_8x8basis_rvv, zve32x, b
+ /* Landing pad: this function is installed into c->try_8x8basis and so is
+ * reached through an indirect call, like the other entry points in this
+ * file that start with lpad. */
+ lpad 0
+ li t1, 64
+ /* vxrm = 0: round-to-nearest-up, applied by vnclip below. */
+ csrwi vxrm, 0
+ vsetvli t0, t1, e32, m8, ta, ma
+ /* v24..v31: per-lane 32-bit accumulators; v1[0]: reduction seed. */
+ vmv.v.x v24, zero
+ vmv.s.x v1, zero
+1:
+ vsetvli zero, zero, e16, m4, ta, ma
+ vle16.v v4, (a2)
+ /* t1 = coefficients still to process after this iteration. */
+ sub t1, t1, t0
+ /* v16 (32-bit) = basis * scale */
+ vwmul.vx v16, v4, a3
+ /* Advance each pointer by t0 int16 elements (2 * t0 bytes). */
+ sh1add a2, t0, a2
+ vle16.v v8, (a0)
+ sh1add a0, t0, a0
+ /* Rounding shift right, then saturate back to 16 bits. */
+ vnclip.wi v4, v16, BASIS_SHIFT - RECON_SHIFT
+ vle16.v v12, (a1)
+ sh1add a1, t0, a1
+ /* b = (rem + t) >> RECON_SHIFT, in 16-bit arithmetic */
+ vadd.vv v4, v8, v4
+ vsra.vi v4, v4, RECON_SHIFT
+ /* v16 (32-bit) = weight * b */
+ vwmul.vv v16, v12, v4
+ vsetvli zero, zero, e32, m8, ta, ma
+ /* Square, scale down by 16 and accumulate per lane. */
+ vmul.vv v16, v16, v16
+ vsra.vi v16, v16, 4
+ vadd.vv v24, v24, v16
+ bnez t1, 1b
+
+ /* Horizontal sum of the accumulators into v1[0]. */
+ vredsum.vs v1, v24, v1
+ vmv.x.s a0, v1
+ /* NOTE(review): vmv.x.s sign-extends the 32-bit sum and srai is an
+ * arithmetic shift, so a sum with bit 31 set yields a negative result.
+ * Confirm this matches the reference implementation's final shift. */
+ srai a0, a0, 2
+ ret
+endfunc
+
func ff_pix_sum_rvv, zve64x, b
lpad 0
vsetivli t0, 16, e16, m1, ta, ma