Message ID | 17305bcf-364b-4f60-8e60-633466da4918@geoffhill.org |
---|---|
State | New |
Headers | show |
Series | avcodec/ac3: Add aarch64 NEON DSP | expand |
Context | Check | Description |
---|---|---|
yinshiyou/make_loongarch64 | success | Make finished |
yinshiyou/make_fate_loongarch64 | success | Make fate finished |
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
On Tue, 2 Apr 2024, Geoff Hill wrote: > Signed-off-by: Geoff Hill <geoff@geoffhill.org> > --- > libavcodec/aarch64/ac3dsp_init_aarch64.c | 5 +++++ > libavcodec/aarch64/ac3dsp_neon.S | 24 +++++++++++++++++++++ > tests/checkasm/ac3dsp.c | 27 ++++++++++++++++++++++++ > 3 files changed, 56 insertions(+) > > diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c > index 1bdc215b51..e95436c651 100644 > --- a/libavcodec/aarch64/ac3dsp_init_aarch64.c > +++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c > @@ -28,6 +28,10 @@ > void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs); > void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs); > void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len); > +void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4], > + const int32_t *coef0, > + const int32_t *coef1, > + int len); > > av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c) > { > @@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c) > c->ac3_exponent_min = ff_ac3_exponent_min_neon; > c->extract_exponents = ff_ac3_extract_exponents_neon; > c->float_to_fixed24 = ff_float_to_fixed24_neon; > + c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon; > } > diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S > index b26f71a3f6..fa8fcf2e47 100644 > --- a/libavcodec/aarch64/ac3dsp_neon.S > +++ b/libavcodec/aarch64/ac3dsp_neon.S > @@ -64,3 +64,27 @@ function ff_float_to_fixed24_neon, export=1 > b.ne 0b > ret > endfunc > + > +function ff_ac3_sum_square_butterfly_int32_neon, export=1 > + cbz w3, 1f The arm version of this patch doesn't have any corresponding check for whether this parameter is zero, and the checkasm test doesn't test that behaviour either. Is that never feasible (and we could leave it out here) or should we test that and fix it in other assembly versions?
In the latter case, it's of course ok to defer that to a separate later patch, not holding up this one. // Martin
diff --git a/libavcodec/aarch64/ac3dsp_init_aarch64.c b/libavcodec/aarch64/ac3dsp_init_aarch64.c index 1bdc215b51..e95436c651 100644 --- a/libavcodec/aarch64/ac3dsp_init_aarch64.c +++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c @@ -28,6 +28,10 @@ void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs); void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs); void ff_float_to_fixed24_neon(int32_t *dst, const float *src, size_t len); +void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4], + const int32_t *coef0, + const int32_t *coef1, + int len); av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c) { @@ -37,4 +41,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c) c->ac3_exponent_min = ff_ac3_exponent_min_neon; c->extract_exponents = ff_ac3_extract_exponents_neon; c->float_to_fixed24 = ff_float_to_fixed24_neon; + c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon; } diff --git a/libavcodec/aarch64/ac3dsp_neon.S b/libavcodec/aarch64/ac3dsp_neon.S index b26f71a3f6..fa8fcf2e47 100644 --- a/libavcodec/aarch64/ac3dsp_neon.S +++ b/libavcodec/aarch64/ac3dsp_neon.S @@ -64,3 +64,27 @@ function ff_float_to_fixed24_neon, export=1 b.ne 0b ret endfunc + +function ff_ac3_sum_square_butterfly_int32_neon, export=1 + cbz w3, 1f + movi v0.2d, #0 + movi v1.2d, #0 + movi v2.2d, #0 + movi v3.2d, #0 +0: ld1 {v4.2s}, [x1], #8 + ld1 {v5.2s}, [x2], #8 + add v6.2s, v4.2s, v5.2s + sub v7.2s, v4.2s, v5.2s + smlal v0.2d, v4.2s, v4.2s + smlal v1.2d, v5.2s, v5.2s + smlal v2.2d, v6.2s, v6.2s + smlal v3.2d, v7.2s, v7.2s + subs w3, w3, #2 + b.gt 0b + addp d0, v0.2d + addp d1, v1.2d + addp d2, v2.2d + addp d3, v3.2d + st1 {v0.1d-v3.1d}, [x0] +1: ret +endfunc diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c index a8a20349f9..c920dc9eb0 100644 --- a/tests/checkasm/ac3dsp.c +++ b/tests/checkasm/ac3dsp.c @@ -136,6 +136,32 @@ static void check_float_to_fixed24(AC3DSPContext *c) { report("float_to_fixed24"); } 
+static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) { +#define ELEMS 240 + LOCAL_ALIGNED_16(int32_t, lt, [ELEMS]); + LOCAL_ALIGNED_16(int32_t, rt, [ELEMS]); + LOCAL_ALIGNED_16(uint64_t, v1, [4]); + LOCAL_ALIGNED_16(uint64_t, v2, [4]); + + declare_func(void, int64_t[4], const int32_t *, const int32_t *, int); + + randomize_i24(lt, ELEMS); + randomize_i24(rt, ELEMS); + + if (check_func(c->sum_square_butterfly_int32, + "ac3_sum_square_bufferfly_int32")) { + call_ref(v1, lt, rt, ELEMS); + call_new(v2, lt, rt, ELEMS); + + if (memcmp(v1, v2, sizeof(int64_t[4])) != 0) + fail(); + + bench_new(v2, lt, rt, ELEMS); + } + + report("ac3_sum_square_butterfly_int32"); +} + void checkasm_check_ac3dsp(void) { AC3DSPContext c; @@ -144,4 +170,5 @@ void checkasm_check_ac3dsp(void) check_ac3_exponent_min(&c); check_ac3_extract_exponents(&c); check_float_to_fixed24(&c); + check_ac3_sum_square_butterfly_int32(&c); }
Signed-off-by: Geoff Hill <geoff@geoffhill.org> --- libavcodec/aarch64/ac3dsp_init_aarch64.c | 5 +++++ libavcodec/aarch64/ac3dsp_neon.S | 24 +++++++++++++++++++++ tests/checkasm/ac3dsp.c | 27 ++++++++++++++++++++++++ 3 files changed, 56 insertions(+)