@@ -29,16 +29,22 @@ void ff_ps_mul_pair_single_rvv(float (*dst)[2], float (*src0)[2], float *src1,
int n);
void ff_ps_hybrid_analysis_rvv(float (*out)[2], float (*in)[2],
const float (*filter)[8][2], ptrdiff_t, int n);
+void ff_ps_hybrid_analysis_ileave_rvv(float (*out)[32][2], float L[2][38][64],
+ int i, int len); /* RVV assembly: out[i][j][k] = L[k][j][i] for columns i..63 and rows j in [0, len) */
av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
{
#if HAVE_RVV
int flags = av_get_cpu_flags();
- if (flags & AV_CPU_FLAG_RV_ZVE32F) {
- c->add_squares = ff_ps_add_squares_rvv;
- c->mul_pair_single = ff_ps_mul_pair_single_rvv;
- c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
+ if (flags & AV_CPU_FLAG_RV_ZVE32X) { /* outer gate: the interleave routine is installed as soon as ZVE32X is reported */
+ c->hybrid_analysis_ileave = ff_ps_hybrid_analysis_ileave_rvv;
+
+ if (flags & AV_CPU_FLAG_RV_ZVE32F) { /* inner gate: these three routines are only installed when ZVE32F is reported as well */
+ c->add_squares = ff_ps_add_squares_rvv;
+ c->mul_pair_single = ff_ps_mul_pair_single_rvv;
+ c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
+ }
}
#endif
}
@@ -153,3 +153,40 @@ func ff_ps_hybrid_analysis_rvv, zve32f
.purgem input
.purgem filter
endfunc
+
+func ff_ps_hybrid_analysis_ileave_rvv, zve32x /* a0 = out, a1 = L, a2 = i, a3 = len; out[i][j][k] = L[k][j][i]. Only 32-bit vector loads/stores are used, so there is no need for zve32f here */
+ slli t0, a2, 5 + 1 + 2 // t0 = i * (32 * 2 * 4), byte offset of out[i]; shift amount is ctz(32 * 2 * 4)
+ slli t1, a2, 2 // t1 = i * 4, byte offset of column i inside a 64-element row of L
+ add a0, a0, t0 // a0 = &out[i]
+ add a1, a1, t1 // a1 = &L[0][0][i]
+ addi a2, a2, -64 // a2 = i - 64, counted up towards zero (one step per column)
+ li t1, 38 * 64 * 4 // byte size of one L[x] plane
+ li t6, 64 * 4 // row stride in bytes: (uint8_t *)L[x][j+1][i] - L[x][j][i]
+ add a4, a1, t1 // a4 = &L[1][0][i]
+ beqz a2, 3f // i == 64: no column to process
+1: // outer loop: one iteration per column i
+ mv t0, a0 // t0 = store cursor, starts at out[i][0]
+ mv t1, a1 // t1 = load cursor in L[0]
+ mv t3, a3 // t3 = rows (j) still to process, starts at len
+ mv t4, a4 // t4 = load cursor in L[1]
+ addi a2, a2, 1 // bump the column counter now; it is tested at the bottom of the outer loop
+2: // inner loop: t5 rows per pass
+ vsetvli t5, t3, e32, m1, ta, ma // t5 = number of 32-bit elements handled in this pass
+ vlse32.v v16, (t1), t6 // v16 = column i of L[0], one element per row (strided by t6)
+ sub t3, t3, t5 // rows left after this pass
+ vlse32.v v17, (t4), t6 // v17 = column i of L[1], same rows
+ mul t2, t5, t6 // t2 = bytes to advance in each L plane: rows * row stride
+ vsseg2e32.v v16, (t0) // 2-field segment store of v16/v17: writes out[i][j][0], out[i][j][1] pairs
+ add t1, t1, t2 // advance L[0] cursor
+ add t4, t4, t2 // advance L[1] cursor
+ slli t2, t5, 1 + 2 // t2 = bytes written: rows * 2 elements * 4 bytes
+ add t0, t0, t2 // advance store cursor
+ bnez t3, 2b // more rows in this column
+
+ add a0, a0, 32 * 2 * 4 // a0 = &out[i + 1]
+ add a1, a1, 4 // next column in L[0]
+ add a4, a4, 4 // next column in L[1]
+ bnez a2, 1b // repeat until the counter reaches zero (i == 64)
+3:
+ ret
+endfunc
From: Rémi Denis-Courmont <remi@remlab.net> --- libavcodec/riscv/aacpsdsp_init.c | 14 ++++++++---- libavcodec/riscv/aacpsdsp_rvv.S | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 4 deletions(-)