@@ -34,6 +34,9 @@ void ff_ps_hybrid_analysis_ileave_rvv(float (*out)[32][2], float L[2][38][64],
int i, int len);
void ff_ps_hybrid_synthesis_deint_rvv(float out[2][38][64], float (*in)[32][2],
int i, int len);
+void ff_ps_stereo_interpolate_rvv(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4], int len);
+
av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
{
#if HAVE_RVV
@@ -47,6 +50,7 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
c->add_squares = ff_ps_add_squares_rvv;
c->mul_pair_single = ff_ps_mul_pair_single_rvv;
c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
+ c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
}
}
#endif
@@ -219,3 +219,68 @@ func ff_ps_hybrid_synthesis_deint_rvv, zve32x
3:
ret
endfunc
+
+func ff_ps_stereo_interpolate_rvv, zve32f // AAC PS stereo interpolation; installed as c->stereo_interpolate[0]
+ vsetvli t0, zero, e32, m1, ta, ma // t0 = VLMAX (e32 elements per vector register)
+ vid.v v24
+ flw ft0, (a2) // h[0][0]
+ vadd.vi v24, v24, 1 // v24[i] = i + 1
+ flw ft1, 4(a2) // h[0][1]
+ vfcvt.f.xu.v v24, v24 // v24[i] = (float)(i + 1)
+ flw ft2, 8(a2) // h[0][2]
+ vfmv.v.f v16, ft0 // v16 = h0 (broadcast)
+ flw ft3, 12(a2) // h[0][3]
+ vfmv.v.f v17, ft1 // v17 = h1
+ flw ft0, (a3) // h_step[0][0]
+ vfmv.v.f v18, ft2 // v18 = h2
+ flw ft1, 4(a3) // h_step[0][1]
+ vfmv.v.f v19, ft3 // v19 = h3
+ flw ft2, 8(a3) // h_step[0][2]
+ vfmv.v.f v20, ft0 // v20 = h0_step
+ flw ft3, 12(a3) // h_step[0][3]
+ vfmv.v.f v21, ft1 // v21 = h1_step
+ fcvt.s.wu ft4, t0 // (float)(vlenb / sizeof (float))
+ vfmv.v.f v22, ft2 // v22 = h2_step
+ li t1, 8 // stride: sizeof (float[2]) between consecutive re (or im) samples
+ vfmv.v.f v23, ft3 // v23 = h3_step
+ addi a6, a0, 4 // l[*][1]
+ vfmacc.vv v16, v24, v20 // h0 += (i + 1) * h0_step
+ addi a7, a1, 4 // r[*][1]
+ vfmacc.vv v17, v24, v21 // h1 += (i + 1) * h1_step
+ fmul.s ft0, ft0, ft4 // per-iteration scalar advance: VLMAX * h0_step
+ vfmacc.vv v18, v24, v22 // h2 += (i + 1) * h2_step
+ fmul.s ft1, ft1, ft4 // VLMAX * h1_step
+ vfmacc.vv v19, v24, v23 // h3 += (i + 1) * h3_step
+ fmul.s ft2, ft2, ft4 // VLMAX * h2_step
+ fmul.s ft3, ft3, ft4 // VLMAX * h3_step
+1: // NOTE(review): assumes vl == VLMAX on every pass but the last; the RVV spec allows vl < VLMAX when VLMAX < AVL < 2*VLMAX, which would desync the scalar h advance below — confirm
+ vsetvli t0, a4, e32, m1, ta, ma // vl = min(len, VLMAX)
+ vlse32.v v8, (a0), t1 // l_re
+ sub a4, a4, t0 // len -= vl
+ vlse32.v v9, (a6), t1 // l_im
+ vlse32.v v10, (a1), t1 // r_re
+ vlse32.v v11, (a7), t1 // r_im
+ vfmul.vv v12, v8, v16 // l_re * h0
+ vfmul.vv v13, v9, v16 // l_im * h0
+ vfmul.vv v14, v8, v17 // l_re * h1
+ vfmul.vv v15, v9, v17 // l_im * h1
+ vfmacc.vv v12, v10, v18 // + r_re * h2 -> new l_re
+ vfmacc.vv v13, v11, v18 // + r_im * h2 -> new l_im
+ vfmacc.vv v14, v10, v19 // + r_re * h3 -> new r_re
+ vfmacc.vv v15, v11, v19 // + r_im * h3 -> new r_im
+ vsse32.v v12, (a0), t1 // store new l_re
+ sh3add a0, t0, a0 // l += vl complex samples (8 bytes each)
+ vsse32.v v13, (a6), t1 // store new l_im
+ sh3add a6, t0, a6
+ vsse32.v v14, (a1), t1 // store new r_re
+ sh3add a1, t0, a1 // r += vl complex samples
+ vsse32.v v15, (a7), t1 // store new r_im
+ sh3add a7, t0, a7
+ vfadd.vf v16, v16, ft0 // h0 += (vlenb / sizeof (float)) * h0_step
+ vfadd.vf v17, v17, ft1 // likewise h1
+ vfadd.vf v18, v18, ft2 // likewise h2
+ vfadd.vf v19, v19, ft3 // likewise h3
+ bnez a4, 1b
+
+ ret
+endfunc
From: Rémi Denis-Courmont <remi@remlab.net>
---
 libavcodec/riscv/aacpsdsp_init.c |  4 ++
 libavcodec/riscv/aacpsdsp_rvv.S  | 65 ++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)