diff mbox series

[FFmpeg-devel,1/2] lavc/ac3dsp: R-V V sum_square_butterfly_int32

Message ID 20240429192144.84571-1-remi@remlab.net
State Accepted
Commit 6459966bebc07b3c26338cbecf72f3607feb961f
Headers show
Series [FFmpeg-devel,1/2] lavc/ac3dsp: R-V V sum_square_butterfly_int32 | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Rémi Denis-Courmont April 29, 2024, 7:21 p.m. UTC
ac3_sum_square_bufferfly_int32_c:       61.0
ac3_sum_square_bufferfly_int32_rvv_i64: 14.7
---
 libavcodec/riscv/ac3dsp_init.c |  6 +++++
 libavcodec/riscv/ac3dsp_rvv.S  | 41 ++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
diff mbox series

Patch

diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index b9e14d56ca..be5e153fac 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -28,6 +28,8 @@ 
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
+void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
+                                       const int32_t *, int);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -39,6 +41,10 @@  av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
             c->extract_exponents = ff_extract_exponents_rvb;
         if (flags & AV_CPU_FLAG_RVV_F32)
             c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+# if __riscv_xlen >= 64
+        if (flags & AV_CPU_FLAG_RVV_I64)
+            c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;
+# endif
     }
 #endif
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
index b8d32c4677..dd0b4cd797 100644
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -37,3 +37,44 @@  func ff_float_to_fixed24_rvv, zve32f
 
         ret
 endfunc
+
+#if __riscv_xlen >= 64
+func ff_sum_square_butterfly_int32_rvv, zve64x  // a0: int64_t out[4]; a1, a2: const int32_t inputs; a3: element count (per the C prototype)
+        vsetvli    t0, zero, e64, m8, ta, ma  // VLMAX 64-bit elements, 8-register groups
+        vmv.v.x    v0, zero  // clear v0-v7 (accumulators for a1^2 and a2^2)
+        vmv.v.x    v8, zero  // clear v8-v15 (accumulators for sum^2 and difference^2)
+1:
+        vsetvli    t0, a3, e32, m2, tu, ma  // t0 = elements handled this pass; tu leaves lanes past t0 untouched
+        vle32.v    v16, (a1)  // v16 = next t0 words from a1
+        sub        a3, a3, t0  // a3 = elements still to process
+        vle32.v    v20, (a2)  // v20 = next t0 words from a2
+        sh2add     a1, t0, a1  // a1 += 4 * t0; NOTE(review): sh2add is a Zba instruction but only zve64x is requested above - confirm Zba is guaranteed by the caller
+        vadd.vv    v24, v16, v20  // v24 = v16 + v20 (32-bit lanes)
+        sh2add     a2, t0, a2  // a2 += 4 * t0
+        vsub.vv    v28, v16, v20  // v28 = v16 - v20 (32-bit lanes)
+        vwmacc.vv  v0, v16, v16  // v0-v3 += v16 * v16, widened to 64-bit lanes
+        vwmacc.vv  v4, v20, v20  // v4-v7 += v20 * v20, widened to 64-bit lanes
+        vwmacc.vv  v8, v24, v24  // v8-v11 += v24 * v24, widened to 64-bit lanes
+        vwmacc.vv  v12, v28, v28  // v12-v15 += v28 * v28, widened to 64-bit lanes
+        bnez       a3, 1b  // loop until no elements remain
+
+        vsetvli    t0, zero, e64, m4, ta, ma  // VLMAX 64-bit elements, 4-register groups, to reduce each accumulator
+        vmv.s.x    v16, zero  // zero seed for the first reduction
+        vmv.s.x    v17, zero  // zero seed for the second reduction
+        vredsum.vs v16, v0, v16  // v16[0] = sum of all lanes of v0-v3
+        vmv.s.x    v18, zero  // zero seed for the third reduction
+        vredsum.vs v17, v4, v17  // v17[0] = sum of all lanes of v4-v7
+        vmv.s.x    v19, zero  // zero seed for the fourth reduction
+        vredsum.vs v18, v8, v18  // v18[0] = sum of all lanes of v8-v11
+        vmv.x.s    t0, v16  // t0 = total of a1[i]^2
+        vredsum.vs v19, v12, v19  // v19[0] = sum of all lanes of v12-v15
+        vmv.x.s    t1, v17  // t1 = total of a2[i]^2
+        sd         t0,   (a0)  // out[0] = sum of squares of the a1 input
+        vmv.x.s    t2, v18  // t2 = total of (a1[i] + a2[i])^2
+        sd         t1,  8(a0)  // out[1] = sum of squares of the a2 input
+        vmv.x.s    t3, v19  // t3 = total of (a1[i] - a2[i])^2
+        sd         t2, 16(a0)  // out[2] = sum of squares of the element-wise sums
+        sd         t3, 24(a0)  // out[3] = sum of squares of the element-wise differences
+        ret
+endfunc
+#endif