diff mbox series

[FFmpeg-devel,v3,5/5] avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

Message ID 18a55da1-45a2-43ee-b918-9eb67c5bf741@geoffhill.org
State New
Headers show
Series avcodec/ac3: Add aarch64 NEON DSP | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Geoff Hill April 3, 2024, 6:43 a.m. UTC
Signed-off-by: Geoff Hill <geoff@geoffhill.org>
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 ++++
 libavcodec/aarch64/ac3dsp_neon.S         | 35 ++++++++++++++++++++++++
 tests/checkasm/ac3dsp.c                  | 26 ++++++++++++++++++
 3 files changed, 66 insertions(+)

Comments

Martin Storsjö April 4, 2024, 1:01 p.m. UTC | #1
On Tue, 2 Apr 2024, Geoff Hill wrote:

> Signed-off-by: Geoff Hill <geoff@geoffhill.org>
> ---
> libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 ++++
> libavcodec/aarch64/ac3dsp_neon.S         | 35 ++++++++++++++++++++++++
> tests/checkasm/ac3dsp.c                  | 26 ++++++++++++++++++
> 3 files changed, 66 insertions(+)
>
> diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
> index fa8fcf2e47..4a78ec0b2a 100644
> --- a/libavcodec/aarch64/ac3dsp_neon.S
> +++ b/libavcodec/aarch64/ac3dsp_neon.S
> @@ -88,3 +88,38 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
>         st1         {v0.1d-v3.1d}, [x0]
> 1:      ret
> endfunc
> +
> +function ff_ac3_sum_square_butterfly_float_neon, export=1
> +        cbz         w3, 1f
> +        movi        v0.4s, #0
> +        movi        v1.4s, #0
> +        movi        v2.4s, #0
> +        movi        v3.4s, #0
> +0:      ld1         {v30.4s}, [x1], #16
> +        ld1         {v31.4s}, [x2], #16
> +        fadd        v16.4s, v30.4s, v31.4s
> +        fsub        v17.4s, v30.4s, v31.4s
> +        fmul        v30.4s, v30.4s, v30.4s
> +        fadd        v0.4s, v0.4s, v30.4s

The arm version here used vmla instead of separate vmul+vadd - is there 
any reason why we can't use fmla here?

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index e95436c651..e367353e11 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -32,6 +32,10 @@  void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
                                             const int32_t *coef0,
                                             const int32_t *coef1,
                                             int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+                                            const float *coef0,
+                                            const float *coef1,
+                                            int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -42,4 +46,5 @@  av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
     c->extract_exponents = ff_ac3_extract_exponents_neon;
     c->float_to_fixed24 = ff_float_to_fixed24_neon;
     c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+    c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index fa8fcf2e47..4a78ec0b2a 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -88,3 +88,38 @@  function ff_ac3_sum_square_butterfly_int32_neon, export=1
         st1         {v0.1d-v3.1d}, [x0]
 1:      ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+        cbz         w3, 1f                      // w3 = len (4th argument of the C prototype); if 0, return without writing sum[]
+        movi        v0.4s, #0                   // v0: per-lane running sum of coef0^2
+        movi        v1.4s, #0                   // v1: per-lane running sum of coef1^2
+        movi        v2.4s, #0                   // v2: per-lane running sum of (coef0 + coef1)^2
+        movi        v3.4s, #0                   // v3: per-lane running sum of (coef0 - coef1)^2
+0:      ld1         {v30.4s}, [x1], #16         // load 4 floats from x1 (coef0), post-increment by 16 bytes
+        ld1         {v31.4s}, [x2], #16         // load 4 floats from x2 (coef1), post-increment by 16 bytes
+        fadd        v16.4s, v30.4s, v31.4s      // v16 = coef0 + coef1
+        fsub        v17.4s, v30.4s, v31.4s      // v17 = coef0 - coef1
+        fmul        v30.4s, v30.4s, v30.4s      // each square is a separate multiply followed by an add (not a fused fmla)
+        fadd        v0.4s, v0.4s, v30.4s        // v0 += coef0^2
+        fmul        v31.4s, v31.4s, v31.4s
+        fadd        v1.4s, v1.4s, v31.4s        // v1 += coef1^2
+        fmul        v16.4s, v16.4s, v16.4s
+        fadd        v2.4s, v2.4s, v16.4s        // v2 += (coef0 + coef1)^2
+        fmul        v17.4s, v17.4s, v17.4s
+        fadd        v3.4s, v3.4s, v17.4s        // v3 += (coef0 - coef1)^2
+        subs        w3, w3, #4                  // 4 elements consumed per iteration
+        b.gt        0b                          // loop while the remaining count is > 0; a len that is not a multiple of 4 still runs a full 4-float iteration
+        faddp       v0.4s, v0.4s, v0.4s         // pairwise add: lanes 0 and 1 now hold the two half sums
+        faddp       v0.2s, v0.2s, v0.2s         // lane 0 = sum of all four accumulator lanes
+        st1         {v0.s}[0], [x0], #4         // store lane 0 to sum[0] (x0 = sum), advance x0 by 4 bytes
+        faddp       v1.4s, v1.4s, v1.4s         // same two-step horizontal reduction for v1
+        faddp       v1.2s, v1.2s, v1.2s
+        st1         {v1.s}[0], [x0], #4         // sum[1]
+        faddp       v2.4s, v2.4s, v2.4s         // same two-step horizontal reduction for v2
+        faddp       v2.2s, v2.2s, v2.2s
+        st1         {v2.s}[0], [x0], #4         // sum[2]
+        faddp       v3.4s, v3.4s, v3.4s         // same two-step horizontal reduction for v3
+        faddp       v3.2s, v3.2s, v3.2s
+        st1         {v3.s}[0], [x0]             // sum[3]; no post-increment needed on the last store
+1:      ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index c920dc9eb0..ef5186cfc1 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -162,6 +162,31 @@  static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
     report("ac3_sum_square_butterfly_int32");
 }
 
+static void check_ac3_sum_square_butterfly_float(AC3DSPContext *c) {
+    LOCAL_ALIGNED_32(float, lt, [ELEMS]); /* first input array */
+    LOCAL_ALIGNED_32(float, rt, [ELEMS]); /* second input array */
+    LOCAL_ALIGNED_16(float, v1, [4]);     /* four sums written by call_ref */
+    LOCAL_ALIGNED_16(float, v2, [4]);     /* four sums written by call_new */
+
+    declare_func(void, float[4], const float *, const float *, int);
+
+    randomize_float(lt, ELEMS);
+    randomize_float(rt, ELEMS);
+    /* The test name uses the same "butterfly" spelling as the function and the report() string. */
+    if (check_func(c->sum_square_butterfly_float,
+                   "ac3_sum_square_butterfly_float")) {
+        call_ref(v1, lt, rt, ELEMS);
+        call_new(v2, lt, rt, ELEMS);
+
+        if (!float_near_ulp_array(v1, v2, 10, 4)) /* presumably a 10-ULP tolerance over the 4 sums - confirm against checkasm.h */
+            fail();
+
+        bench_new(v2, lt, rt, ELEMS);
+    }
+
+    report("ac3_sum_square_butterfly_float");
+}
+
 void checkasm_check_ac3dsp(void)
 {
     AC3DSPContext c;
@@ -171,4 +196,5 @@  void checkasm_check_ac3dsp(void)
     check_ac3_extract_exponents(&c);
     check_float_to_fixed24(&c);
     check_ac3_sum_square_butterfly_int32(&c);
+    check_ac3_sum_square_butterfly_float(&c);
 }