From 302346449ce53b92f2a40f6033704b72d90d47ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20B=C5=93sch?= <u@pkh.me>
Date: Thu, 25 May 2017 17:50:52 +0200
Subject: [PATCH] lavc/aarch64: add a few SIMD functions for AAC PS
---
For the record, this is the C equivalent for ff_ps_hybrid_analysis_neon:
/*
 * Scalar model of ff_ps_hybrid_analysis_neon. Each 4-element array stands
 * for the NEON register named next to it, and each statement inside a lane
 * loop stands for one vector instruction applied to all four lanes.
 */
static void ps_hybrid_analysis_c(INTFLOAT (*out)[2], INTFLOAT (*in)[2],
                                 const INTFLOAT (*filter)[8][2],
                                 int stride, int n)
{
    INT64FLOAT re0123pCBA9[4];     // v16.4S
    INT64FLOAT im0123pCBA9[4];     // v17.4S
    INT64FLOAT im0123mCBA9[4];     // v18.4S
    INT64FLOAT re0123mCBA9[4];     // v19.4S
    INT64FLOAT re45p87_im87m45[4]; // v20.4S
    INT64FLOAT im45p87_re45m87[4]; // v21.4S
    int i, k;

    /* The 13 complex inputs are folded around in[6]: in[k] pairs with in[12 - k]. */
    for (k = 0; k < 4; k++) {
        re0123pCBA9[k] = in[k][0] + in[12 - k][0];
        im0123pCBA9[k] = in[k][1] + in[12 - k][1];
        im0123mCBA9[k] = in[k][1] - in[12 - k][1];
        re0123mCBA9[k] = in[k][0] - in[12 - k][0];
    }

    /* Pairs (4,8) and (5,7) share two registers: lanes 0-1 and lanes 2-3. */
    for (k = 0; k < 2; k++) {
        const int lo = 4 + k;
        const int hi = 8 - k;

        re45p87_im87m45[k]     = in[lo][0] + in[hi][0];
        re45p87_im87m45[k + 2] = in[hi][1] - in[lo][1];
        im45p87_re45m87[k]     = in[lo][1] + in[hi][1];
        im45p87_re45m87[k + 2] = in[lo][0] - in[hi][0];
    }

    for (i = 0; i < n; i++) {
        const INTFLOAT (*taps)[2] = filter[i];
        INT64FLOAT filter_re0123[4];    // v2.4S
        INT64FLOAT filter_im0123[4];    // v3.4S
        INT64FLOAT filter_re45_im45[4]; // v4.4S
        INT64FLOAT sum_re4[4];
        INT64FLOAT sum_im4[4];
        INT64FLOAT sum_re, sum_im;

        for (k = 0; k < 4; k++) {
            filter_re0123[k] = taps[k][0];
            filter_im0123[k] = taps[k][1];
        }
        filter_re45_im45[0] = taps[4][0];
        filter_re45_im45[1] = taps[5][0];
        filter_re45_im45[2] = taps[4][1];
        filter_re45_im45[3] = taps[5][1];

        /* Per lane: one multiply followed by two multiply-accumulates for
         * each of the real and imaginary partial sums. */
        for (k = 0; k < 4; k++) {
            sum_re4[k]  = filter_re0123[k]    * re0123pCBA9[k];
            sum_im4[k]  = filter_re0123[k]    * im0123pCBA9[k];
            sum_re4[k] -= filter_im0123[k]    * im0123mCBA9[k];
            sum_im4[k] += filter_im0123[k]    * re0123mCBA9[k];
            sum_re4[k] += filter_re45_im45[k] * re45p87_im87m45[k];
            sum_im4[k] += filter_re45_im45[k] * im45p87_re45m87[k];
        }

        /* Horizontal add of the four lanes, then the centre tap; only the
         * real coefficient taps[6][0] is applied, to both components. */
        sum_re = sum_re4[0] + sum_re4[1] + sum_re4[2] + sum_re4[3]
               + (INT64FLOAT)taps[6][0] * in[6][0];
        sum_im = sum_im4[0] + sum_im4[1] + sum_im4[2] + sum_im4[3]
               + (INT64FLOAT)taps[6][0] * in[6][1];

#if USE_FIXED
        /* Round to nearest before dropping the 31 fractional bits. */
        out[i * stride][0] = (int)((sum_re + 0x40000000) >> 31);
        out[i * stride][1] = (int)((sum_im + 0x40000000) >> 31);
#else
        out[i * stride][0] = sum_re;
        out[i * stride][1] = sum_im;
#endif /* USE_FIXED */
    }
}
---
libavcodec/aacpsdsp.h | 1 +
libavcodec/aacpsdsp_template.c | 2 +
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/aacpsdsp_init_aarch64.c | 48 ++++++++++
libavcodec/aarch64/aacpsdsp_neon.S | 145 +++++++++++++++++++++++++++++
5 files changed, 198 insertions(+)
create mode 100644 libavcodec/aarch64/aacpsdsp_init_aarch64.c
create mode 100644 libavcodec/aarch64/aacpsdsp_neon.S
@@ -51,6 +51,7 @@ typedef struct PSDSPContext {
void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s);
void ff_psdsp_init_arm(PSDSPContext *s);
+void ff_psdsp_init_aarch64(PSDSPContext *s);
void ff_psdsp_init_mips(PSDSPContext *s);
void ff_psdsp_init_x86(PSDSPContext *s);
@@ -222,6 +222,8 @@ av_cold void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s)
#if !USE_FIXED
if (ARCH_ARM)
ff_psdsp_init_arm(s);
+ if (ARCH_AARCH64)
+ ff_psdsp_init_aarch64(s);
if (ARCH_MIPS)
ff_psdsp_init_mips(s);
if (ARCH_X86)
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
# decoders/encoders
+OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_init_aarch64.o
OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_init.o
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_init_aarch64.o
@@ -42,6 +43,7 @@ NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
# decoders/encoders
+NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/aacpsdsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/synth_filter_neon.o
NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o
NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
new file mode 100644
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n); // dst[i] += src[i][0]^2 + src[i][1]^2
+void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
+ float *src1, int n); // dst[i] = src0[i] * src1[i], both components scaled
+void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n); // n complex outputs, written stride elements apart
+void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len); // in-place l/r mix; reads only h[0] and h_step[0]
+void ff_ps_stereo_interpolate_ipdopd_neon(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len); // in-place l/r mix; reads both rows of h and h_step
+
+/* Point the PS DSP hooks in s at the NEON routines when the CPU flags report NEON. */
+av_cold void ff_psdsp_init_aarch64(PSDSPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+    if (!have_neon(flags))
+        return;
+    s->add_squares           = ff_ps_add_squares_neon;
+    s->mul_pair_single       = ff_ps_mul_pair_single_neon;
+    s->hybrid_analysis       = ff_ps_hybrid_analysis_neon;
+    s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
+    s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_neon;
+}
new file mode 100644
@@ -0,0 +1,145 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_ps_add_squares_neon, export=1 // x0 = dst, x1 = src (re,im pairs), w2 = n: dst[i] += re^2 + im^2
+1: ld1 {v0.4S,v1.4S}, [x1], #32 // load 4 complex values (8 floats), advance src
+ fmul v0.4S, v0.4S, v0.4S // square every component
+ fmul v1.4S, v1.4S, v1.4S
+ faddp v2.4S, v0.4S, v1.4S // pairwise add: re^2 + im^2 for each of the 4 values
+ ld1 {v3.4S}, [x0] // current dst[0..3]
+ fadd v3.4S, v3.4S, v2.4S // accumulate into them
+ st1 {v3.4S}, [x0], #16 // store back, advance dst
+ subs w2, w2, #4 // 4 elements consumed per iteration
+ b.gt 1b // loop while the count is > 0; the body always runs at least once
+ ret
+endfunc
+
+function ff_ps_mul_pair_single_neon, export=1 // x0 = dst, x1 = src0 (re,im pairs), x2 = src1 (scalars), w3 = n
+1: ld1 {v0.4S,v1.4S}, [x1], #32 // 4 complex values from src0
+ ld1 {v2.4S}, [x2], #16 // 4 scalars s0..s3 from src1
+ zip1 v3.4S, v2.4S, v2.4S // {s0, s0, s1, s1}
+ zip2 v4.4S, v2.4S, v2.4S // {s2, s2, s3, s3}
+ fmul v0.4S, v0.4S, v3.4S // scale re and im of values 0-1
+ fmul v1.4S, v1.4S, v4.4S // scale re and im of values 2-3
+ st1 {v0.4S,v1.4S}, [x0], #32 // store 4 complex results, advance dst
+ subs w3, w3, #4 // 4 elements consumed per iteration
+ b.gt 1b // loop while the count is > 0; the body always runs at least once
+ ret
+endfunc
+
+function ff_ps_stereo_interpolate_neon, export=1 // x0 = l, x1 = r, x2 = h, x3 = h_step, w4 = len
+ ld1 {v0.4S}, [x2] // v0 = {h0, h1, h2, h3} = h[0][0..3]
+ ld1 {v1.4S}, [x3] // v1 = h_step[0][0..3]
+1: ld1 {v2.2S}, [x0] // {l_re, l_im}
+ ld1 {v3.2S}, [x1] // {r_re, r_im}
+ dup v2.2D, v2.D[0] // {l_re, l_im, l_re, l_im}
+ dup v3.2D, v3.D[0] // {r_re, r_im, r_re, r_im}
+ fadd v0.4S, v0.4S, v1.4S // step the coefficients before they are used
+ zip1 v4.4S, v0.4S, v0.4S // {h0, h0, h1, h1}
+ zip2 v5.4S, v0.4S, v0.4S // {h2, h2, h3, h3}
+ fmul v2.4S, v2.4S, v4.4S // l * {h0, h0, h1, h1}
+ fmla v2.4S, v3.4S, v5.4S // ... + r * {h2, h2, h3, h3}
+ st1 {v2.D}[0], [x0], #8 // low half -> new l sample, advance l
+ st1 {v2.D}[1], [x1], #8 // high half -> new r sample, advance r
+ subs w4, w4, #1 // one sample pair per iteration
+ b.gt 1b // loop while the count is > 0; the body always runs at least once
+ ret
+endfunc
+
+const ipdopd_factors, align=4
+ .float -1.0, 1.0, -1.0, 1.0 // per-lane signs used by ff_ps_stereo_interpolate_ipdopd_neon: subtract in re lanes, add in im lanes
+endconst
+
+function ff_ps_stereo_interpolate_ipdopd_neon, export=1 // x0 = l, x1 = r, x2 = h, x3 = h_step, w4 = len
+ movrel x5, ipdopd_factors
+ ld1 {v20.4S}, [x5] // v20 = {-1, 1, -1, 1}
+ ld1 {v0.4S,v1.4S}, [x2] // v0 = h[0][0..3], v1 = h[1][0..3]
+ ld1 {v6.4S,v7.4S}, [x3] // v6 = h_step[0][0..3], v7 = h_step[1][0..3]
+1: ld1 {v2.2S}, [x0] // {l_re, l_im}
+ ld1 {v3.2S}, [x1] // {r_re, r_im}
+ dup v2.2D, v2.D[0] // {l_re, l_im, l_re, l_im}
+ dup v3.2D, v3.D[0] // {r_re, r_im, r_re, r_im}
+ fadd v0.4S, v0.4S, v6.4S // step both coefficient rows before use
+ fadd v1.4S, v1.4S, v7.4S
+ zip1 v16.4S, v0.4S, v0.4S // {h00, h00, h01, h01}
+ zip2 v17.4S, v0.4S, v0.4S // {h02, h02, h03, h03}
+ zip1 v18.4S, v1.4S, v1.4S // {h10, h10, h11, h11}
+ zip2 v19.4S, v1.4S, v1.4S // {h12, h12, h13, h13}
+ fmul v4.4S, v2.4S, v16.4S // l * h[0] terms
+ fmla v4.4S, v3.4S, v17.4S // ... + r * h[0] terms
+ ext v2.16B, v2.16B, v2.16B, #4 // rotate one float: {l_im, l_re, l_im, l_re}
+ ext v3.16B, v3.16B, v3.16B, #4 // rotate one float: {r_im, r_re, r_im, r_re}
+ fmul v5.4S, v2.4S, v18.4S // swapped l * h[1] terms
+ fmla v5.4S, v3.4S, v19.4S // ... + swapped r * h[1] terms
+ fmla v4.4S, v5.4S, v20.4S // subtract them in re lanes, add them in im lanes
+ st1 {v4.D}[0], [x0], #8 // low half -> new l sample, advance l
+ st1 {v4.D}[1], [x1], #8 // high half -> new r sample, advance r
+ subs w4, w4, #1 // one sample pair per iteration
+ b.gt 1b // loop while the count is > 0; the body always runs at least once
+ ret
+endfunc
+
+function ff_ps_hybrid_analysis_neon, export=1 // x0 = out, x1 = in (13 re,im pairs), x2 = filter, w3 = stride, w4 = n
+ sxtw x3, w3 // stride is a signed int: widen it
+ lsl x3, x3, #3 // ... and scale to bytes (one complex float = 8 bytes)
+ ld2 {v0.4S,v1.4S}, [x1], #32 // v0 = re0..re3, v1 = im0..im3
+ ld2 {v2.2S,v3.2S}, [x1], #16 // v2 = {re4, re5}, v3 = {im4, im5}
+ ld1 {v24.2S}, [x1], #8 // v24 = {re6, im6}; caller-saved on purpose, v8-v15 must be preserved
+ ld2 {v4.2S,v5.2S}, [x1], #16 // v4 = {re7, re8}, v5 = {im7, im8}
+ ld2 {v6.4S,v7.4S}, [x1] // v6 = re9..re12, v7 = im9..im12
+ rev64 v6.4S, v6.4S // rev64 + ext #8 reverse all four lanes:
+ rev64 v7.4S, v7.4S
+ ext v6.16B, v6.16B, v6.16B, #8 // v6 = {re12, re11, re10, re9}
+ ext v7.16B, v7.16B, v7.16B, #8 // v7 = {im12, im11, im10, im9}
+ rev64 v4.2S, v4.2S // v4 = {re8, re7}
+ rev64 v5.2S, v5.2S // v5 = {im8, im7}
+ mov v2.D[1], v3.D[0] // v2 = {re4, re5, im4, im5}
+ mov v4.D[1], v5.D[0] // v4 = {re8, re7, im8, im7}
+ mov v5.D[1], v2.D[0] // v5 = {im8, im7, re4, re5}
+ mov v3.D[1], v4.D[0] // v3 = {im4, im5, re8, re7}
+ fadd v16.4S, v0.4S, v6.4S // re[0..3] + re[12..9]
+ fadd v17.4S, v1.4S, v7.4S // im[0..3] + im[12..9]
+ fsub v18.4S, v1.4S, v7.4S // im[0..3] - im[12..9]
+ fsub v19.4S, v0.4S, v6.4S // re[0..3] - re[12..9]
+ fadd v22.4S, v2.4S, v4.4S // {re4+re8, re5+re7, im4+im8, im5+im7}
+ fsub v23.4S, v5.4S, v3.4S // {im8-im4, im7-im5, re4-re8, re5-re7}
+ trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5}
+ trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7}
+1: ld2 {v2.4S,v3.4S}, [x2], #32 // v2 = filter re0..re3, v3 = filter im0..im3
+ ld2 {v4.2S,v5.2S}, [x2], #16 // v4 = filter {re4, re5}, v5 = filter {im4, im5}
+ ld1 {v6.2S}, [x2], #8 // v6 = filter {re6, im6}
+ add x2, x2, #8 // skip filter[i][7], which is never read
+ mov v4.D[1], v5.D[0] // v4 = filter {re4, re5, im4, im5}
+ mov v6.S[1], v6.S[0] // v6 = filter {re6, re6}
+ fmul v6.2S, v6.2S, v24.2S // centre tap: {re6*in_re6, re6*in_im6}
+ fmul v0.4S, v2.4S, v16.4S // real partial sums
+ fmul v1.4S, v2.4S, v17.4S // imaginary partial sums
+ fmls v0.4S, v3.4S, v18.4S
+ fmla v1.4S, v3.4S, v19.4S
+ fmla v0.4S, v4.4S, v20.4S
+ fmla v1.4S, v4.4S, v21.4S
+ faddp v0.4S, v0.4S, v1.4S // two pairwise adds fold the four lanes:
+ faddp v0.4S, v0.4S, v0.4S // v0 = {sum_re, sum_im, ...}
+ fadd v0.2S, v0.2S, v6.2S // add the centre tap
+ st1 {v0.2S}, [x0], x3 // store one complex output, advance by stride
+ subs w4, w4, #1 // one filter per iteration
+ b.gt 1b // loop while the count is > 0; the body always runs at least once
+ ret
+endfunc
--
2.13.0