diff mbox series

[FFmpeg-devel,v3,4/5] avcodec/ac3: Implement sum_square_butterfly_int32 for aarch64 NEON

Message ID 17305bcf-364b-4f60-8e60-633466da4918@geoffhill.org
State New
Headers show
Series avcodec/ac3: Add aarch64 NEON DSP | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Geoff Hill April 3, 2024, 6:43 a.m. UTC
Signed-off-by: Geoff Hill <geoff@geoffhill.org>
---
 libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 +++++
 libavcodec/aarch64/ac3dsp_neon.S         | 24 +++++++++++++++++++++
 tests/checkasm/ac3dsp.c                  | 27 ++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

Comments

Martin Storsjö April 4, 2024, 12:58 p.m. UTC | #1
On Tue, 2 Apr 2024, Geoff Hill wrote:

> Signed-off-by: Geoff Hill <geoff@geoffhill.org>
> ---
> libavcodec/aarch64/ac3dsp_init_aarch64.c |  5 +++++
> libavcodec/aarch64/ac3dsp_neon.S         | 24 +++++++++++++++++++++
> tests/checkasm/ac3dsp.c                  | 27 ++++++++++++++++++++++++
> 3 files changed, 56 insertions(+)
>
> diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
> index 1bdc215b51..e95436c651 100644
> --- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
> +++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
> @@ -28,6 +28,10 @@
> void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
> void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
> void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
> +void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
> +                                            const int32_t *coef0,
> +                                            const int32_t *coef1,
> +                                            int len);
>
> av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
> {
> @@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
>     c->ac3_exponent_min = ff_ac3_exponent_min_neon;
>     c->extract_exponents = ff_ac3_extract_exponents_neon;
>     c->float_to_fixed24 = ff_float_to_fixed24_neon;
> +    c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
> }
> diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
> index b26f71a3f6..fa8fcf2e47 100644
> --- a/libavcodec/aarch64/ac3dsp_neon.S
> +++ b/libavcodec/aarch64/ac3dsp_neon.S
> @@ -64,3 +64,27 @@ function ff_float_to_fixed24_neon, export=1
>         b.ne        0b
>         ret
> endfunc
> +
> +function ff_ac3_sum_square_butterfly_int32_neon, export=1
> +        cbz         w3, 1f

The arm version of this patch doesn't have any corresponding check for 
whether this parameter is zero, and the checkasm test doesn't test that 
behaviour either. Is that never feasible (and we could leave it out here) 
or should we test that and fix it in other assembly versions? In the 
latter case, it's of course ok to defer that to a separate later patch, 
not holding up this one.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c
index 1bdc215b51..e95436c651 100644
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@@ -28,6 +28,10 @@ 
 void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+                                            const int32_t *coef0,
+                                            const int32_t *coef1,
+                                            int len);
 
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@@ -37,4 +41,5 @@  av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
     c->ac3_exponent_min = ff_ac3_exponent_min_neon;
     c->extract_exponents = ff_ac3_extract_exponents_neon;
     c->float_to_fixed24 = ff_float_to_fixed24_neon;
+    c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
 }
diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S
index b26f71a3f6..fa8fcf2e47 100644
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@@ -64,3 +64,27 @@  function ff_float_to_fixed24_neon, export=1
         b.ne        0b
         ret
 endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1    // args per the C prototype: x0 = sum, x1 = coef0, x2 = coef1, w3 = len
+        cbz         w3, 1f                      // len == 0: return without writing sum[]; NOTE(review): confirm callers never rely on sum[] being zeroed
+        movi        v0.2d, #0                   // v0 accumulates coef0 * coef0
+        movi        v1.2d, #0                   // v1 accumulates coef1 * coef1
+        movi        v2.2d, #0                   // v2 accumulates (coef0 + coef1)^2
+        movi        v3.2d, #0                   // v3 accumulates (coef0 - coef1)^2
+0:      ld1         {v4.2s}, [x1], #8           // load two int32 from x1, then advance x1 by 8 bytes
+        ld1         {v5.2s}, [x2], #8           // load two int32 from x2, then advance x2 by 8 bytes
+        add         v6.2s, v4.2s, v5.2s         // 32-bit lane add; NOTE(review): presumably inputs are small enough not to wrap - confirm
+        sub         v7.2s, v4.2s, v5.2s         // 32-bit lane subtract, same range assumption
+        smlal       v0.2d, v4.2s, v4.2s         // signed widening multiply-accumulate into 64-bit lanes
+        smlal       v1.2d, v5.2s, v5.2s
+        smlal       v2.2d, v6.2s, v6.2s
+        smlal       v3.2d, v7.2s, v7.2s
+        subs        w3, w3, #2                  // two elements consumed per pass; NOTE(review): an odd len would process one extra element - confirm len is always even
+        b.gt        0b                          // loop while the remaining count is > 0
+        addp        d0, v0.2d                   // add the two 64-bit lanes of each accumulator into one scalar
+        addp        d1, v1.2d
+        addp        d2, v2.2d
+        addp        d3, v3.2d
+        st1         {v0.1d-v3.1d}, [x0]         // store the four 64-bit sums to consecutive slots at x0
+1:      ret
+endfunc
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
index a8a20349f9..c920dc9eb0 100644
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@@ -136,6 +136,32 @@  static void check_float_to_fixed24(AC3DSPContext *c) {
     report("float_to_fixed24");
 }
 
+static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
+#define ELEMS 240 /* number of coefficients in each input buffer */
+    LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]);
+    LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]);
+    LOCAL_ALIGNED_16(int64_t, v1, [4]); /* sums written through call_ref */
+    LOCAL_ALIGNED_16(int64_t, v2, [4]); /* sums written through call_new */
+    /* Output buffers are int64_t to match the int64_t[4] parameter below. */
+    declare_func(void, int64_t[4], const int32_t *, const int32_t *, int);
+
+    randomize_i24(lt, ELEMS);
+    randomize_i24(rt, ELEMS);
+    /* Run both versions on the same random input; the sums must be identical. */
+    if (check_func(c->sum_square_butterfly_int32,
+                   "ac3_sum_square_butterfly_int32")) {
+        call_ref(v1, lt, rt, ELEMS);
+        call_new(v2, lt, rt, ELEMS);
+
+        if (memcmp(v1, v2, sizeof(int64_t[4])) != 0)
+            fail();
+
+        bench_new(v2, lt, rt, ELEMS);
+    }
+
+    report("ac3_sum_square_butterfly_int32");
+}
+
 void checkasm_check_ac3dsp(void)
 {
     AC3DSPContext c;
@@ -144,4 +170,5 @@  void checkasm_check_ac3dsp(void)
     check_ac3_exponent_min(&c);
     check_ac3_extract_exponents(&c);
     check_float_to_fixed24(&c);
+    check_ac3_sum_square_butterfly_int32(&c);
 }