From 5e25b89454a674d93b4565df02fb9ba834795835 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 20 Apr 2019 02:05:23 +0100
Subject: [PATCH v2 2/2] aarch64/mdct15: add a NEON implementation of fft15
NEON:
4823 UNITS in fft15, 65536 runs, 0 skips
C:
7889 UNITS in fft15, 65535 runs, 1 skips
Total speedup: around 3% on a Raspberry Pi 3.
---
libavcodec/aarch64/Makefile | 2 +
libavcodec/aarch64/mdct15_init.c | 82 ++++++++++++++++++
libavcodec/aarch64/mdct15_neon.S | 137 +++++++++++++++++++++++++++++++
libavcodec/mdct15.c | 3 +
libavcodec/mdct15.h | 1 +
5 files changed, 225 insertions(+)
create mode 100644 libavcodec/aarch64/mdct15_init.c
create mode 100644 libavcodec/aarch64/mdct15_neon.S
@@ -6,6 +6,7 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
+OBJS-$(CONFIG_MDCT15) += aarch64/mdct15_init.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
@@ -43,6 +44,7 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o \
aarch64/simple_idct_neon.o
+NEON-OBJS-$(CONFIG_MDCT15) += aarch64/mdct15_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
new file mode 100644
@@ -0,0 +1,82 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mdct15.h"
+
+void ff_fft15_neon(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride); /* implemented in NEON assembly */
+
+static void perm_twiddles(MDCT15Context *s)
+{
+    /*
+     * Rewrites the first 34 entries of s->exptab in place:
+     *   [0..3]   four 5-point FFT constants derived from entries 19 and 20
+     *   [4..33]  three groups of ten 15-point twiddles
+     *
+     * Every source value is copied out before the first write, because the
+     * destination overlaps the entries being read.
+     */
+
+    /* tmp[] index of each twiddle in group 0; group g adds 2*g to all ten */
+    static const int group_idx[10] = { 0, 1, 6, 12, 7, 13, 18, 24, 19, 25 };
+    const FFTComplex w19 = s->exptab[19];
+    const FFTComplex w20 = s->exptab[20];
+    FFTComplex *dst = s->exptab;
+    FFTComplex tmp[30];
+    int g, i;
+
+    /* Gather six entries per row from the still-unmodified table */
+    for (i = 0; i < 5; i++) {
+        FFTComplex *row = &tmp[6*i];
+
+        row[0] = s->exptab[i];
+        row[1] = s->exptab[2*i];
+        row[2] = s->exptab[i + 5];
+        row[3] = s->exptab[2*i + 10];
+        row[4] = s->exptab[i + 10];
+        row[5] = s->exptab[2*i + 5];
+    }
+
+    /* 5-point FFT twiddles */
+    dst[0]    =  w19;
+    dst[1].re = -w20.re;
+    dst[1].im =  w20.im;
+    dst[2].re =  w20.im;
+    dst[2].im =  w20.im;
+    dst[3].re = -w20.im;
+    dst[3].im = -w20.im;
+    dst += 4;
+
+    /* 15-point FFT twiddles */
+    for (g = 0; g < 3; g++) {
+        for (i = 0; i < 10; i++)
+            *dst++ = tmp[group_idx[i] + 2*g];
+    }
+}
+
+av_cold void ff_mdct15_init_aarch64(MDCT15Context *s)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!have_neon(flags)) /* no NEON at runtime: leave the context untouched */
+        return;
+    s->fft15 = ff_fft15_neon; /* install the NEON 15-point FFT ... */
+    perm_twiddles(s);         /* ... and rewrite s->exptab via perm_twiddles() */
+}
new file mode 100644
@@ -0,0 +1,137 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const sign_adjust_5, align=4
+ .word 0x00000000, 0x80000000, 0x80000000, 0x00000000 // float sign bit set in the 2nd and 3rd words only; loaded into v7 and XORed in by fft5
+endconst
+
+.macro fft5 dc, ac1, ac2 // 5-point FFT of five inputs gathered from x1 in steps of x5 bytes: dc = DC term, ac1 = out[1,2], ac2 = out[3,4]
+ ld1r {v0.2d}, [x1], x5 // in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
+ ld1 {v1.d}[0], [x1], x5 // in[ 3].re, in[ 3].im, (upper half is filled by the next load)
+ ld1 {v1.d}[1], [x1], x5 // in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
+ ld1 {v2.d}[1], [x1], x5 // (lower half is filled by the next load), in[ 9].re, in[ 9].im
+ ld1 {v2.d}[0], [x1] // in[12].re, in[12].im, in[ 9].re, in[ 9].im
+ sub x1, x1, #88 // four post-increments of x5 minus 88: with x5 == 24 this leaves x1 8 bytes past its start
+
+ fsub v3.4s, v1.4s, v2.4s // t[1].im, t[1].re, t[3].im, t[3].re
+ fadd v1.4s, v1.4s, v2.4s // t[0].re, t[0].im, t[2].re, t[2].im
+
+ mov v4.d[0], v3.d[1] // v4 = v3 with its 64-bit halves swapped
+ mov v4.d[1], v3.d[0] // t[3].im, t[3].re, t[1].im, t[1].re
+
+ mov v2.d[0], v1.d[1] // v2 = v1 with its 64-bit halves swapped
+ mov v2.d[1], v1.d[0] // t[2].re, t[2].im, t[0].re, t[0].im
+
+ fadd \dc\().2s, v0.2s, v1.2s // in[0] + t[0]
+ fadd \dc\().2s, \dc\().2s, v2.2s // DC[0].re, DC[0].im (in[0] + t[0] + t[2])
+
+ fmul v1.4s, v1.4s, v5.s[0] // exptab[0].re * t[0,2].re,im
+ fmul v3.4s, v3.4s, v5.s[1] // exptab[0].im * t[1,3].im,re
+
+ fmla v1.4s, v2.4s, v5.s[2] // t[0].re, t[0].im, t[4].re, t[4].im
+ fmla v3.4s, v4.4s, v6.4s // t[1].im, t[1].re, t[5].im, t[5].re
+
+ fadd v1.4s, v1.4s, v0.4s // in[0,0].re,im + t[0,4].re,im
+ rev64 v3.4s, v3.4s // t[1].re, t[1].im, t[5].re, t[5].im
+
+ eor v3.16b, v3.16b, v7.16b // (+--+)t[1,5].re,im; v7 must hold the sign-bit mask set up by the caller
+
+ fadd \ac1\().4s, v1.4s, v3.4s // out[1,2].re,im
+ fsub v2.4s, v1.4s, v3.4s // out[4,3].re,im
+
+ mov \ac2\().d[0], v2.d[1] // swap the 64-bit halves of v2 into ac2
+ mov \ac2\().d[1], v2.d[0] // out[3,4].re,im
+.endm
+
+.macro butterflies_dc // one output: v16 + v19 * twiddle0 + v22 * twiddle1 (complex), stored at x0; takes no arguments
+ ld1 {v0.4s}, [x2], #16 // exptab: two complex twiddles, x2 advances past them
+
+ uzp1 v2.4s, v19.4s, v22.4s // t[0].re, t[0].re, t[1].re, t[1].re
+ fneg v1.2d, v0.2d // flips bit 63 of each 64-bit pair, i.e. negates only the .im float of each twiddle
+ uzp2 v3.4s, v19.4s, v22.4s // t[0].im, t[0].im, t[1].im, t[1].im
+ rev64 v1.4s, v1.4s // -exptab.im, exptab.re for both twiddles
+
+ fmul v2.4s, v2.4s, v0.4s // t[0,1].rere * exptab[k,2*k].reim
+ fmla v2.4s, v3.4s, v1.4s // t[0].re, t[0].im, t[1].re, t[1].im
+
+ mov v3.d[0], v2.d[1] // t[1].re, t[1].im
+ fadd v2.2s, v2.2s, v3.2s // t[0] + t[1]
+ fadd v2.2s, v2.2s, v16.2s // plus the low half of v16
+
+ st1 {v2.2s}, [x0], x3 // store one complex value, then advance x0 by x3 bytes
+.endm
+
+.macro butterflies_ac dc, ac1, ac2 // two outputs: dc + ac1 * twiddles + ac2 * twiddles (complex), stored at x0 and x0 + x3; uses v0-v7 as scratch
+ ld1 {v0.4s, v1.4s}, [x2], #32 // exptab
+
+ trn1 v4.4s, \ac1\().4s, \ac1\().4s // t[1].re, t[1].re, t[2].re, t[2].re
+ fneg v2.2d, v0.2d // flips bit 63 of each 64-bit pair, i.e. negates only the .im float of each twiddle
+ trn1 v6.4s, \ac2\().4s, \ac2\().4s // same .re duplication for ac2
+ fneg v3.2d, v1.2d
+ trn2 v5.4s, \ac1\().4s, \ac1\().4s // t[1].im, t[1].im, t[2].im, t[2].im
+ rev64 v2.4s, v2.4s // -exptab.im, exptab.re
+ trn2 v7.4s, \ac2\().4s, \ac2\().4s // same .im duplication for ac2; note v5-v7 no longer hold the fft5 constants
+ rev64 v3.4s, v3.4s
+
+ fmul v0.4s, v4.4s, v0.4s // tmp2[1,1].re * exptab[1,2].reim
+ fmul v1.4s, v6.4s, v1.4s
+
+ fmla v0.4s, v5.4s, v2.4s // complex products ac1 * exptab: re, im, re, im
+ fmla v1.4s, v7.4s, v3.4s // complex products ac2 * exptab: re, im, re, im
+
+ fadd v0.4s, v0.4s, v1.4s // sum of the ac1 and ac2 products
+
+ fadd v0.4s, v0.4s, \dc\().4s // out[stride*1].reim, out[stride*2].reim
+
+ st1 {v0.d}[0], [x0], x3 // first complex result, then x0 += x3
+ st1 {v0.d}[1], [x0], x3 // second complex result, then x0 += x3
+.endm
+
+function ff_fft15_neon, export=1 // 15-point FFT: x0 = output pointer, x1 = input pointer, x2 = twiddle table, x3 = output stride (scaled by 8 below)
+ ld1 {v5.4s, v6.4s}, [x2], #32 // fft5 exptab
+ movrel x4, sign_adjust_5
+ ld1 {v7.16b}, [x4] // v7 = sign-bit mask that fft5 XORs into its results
+ mov x5, #24 // byte step between the inputs gathered by one fft5
+ lsl x3, x3, #3 // stride *= 8, used as the byte post-increment of every store
+
+ prfm pldl1keep, [x1, #120] // preload and keep coeffs for gathering
+ prfm pldl1strm, [x2, #240] // preload for streaming exptab
+
+ fft5 v16, v17, v18 // DC in v16, AC terms in v17 / v18; x1 ends 8 bytes further on
+ fft5 v19, v20, v21 // same for the next interleaved input set
+ fft5 v22, v23, v24 // and the third one
+
+ mov v19.d[1], v19.d[0] // duplicate both DC terms into the upper halves:
+ mov v22.d[1], v22.d[0] // butterflies_dc unzips the full v19 / v22 vectors
+ // first group of five stores (x0 advances by x3 after each)
+ butterflies_dc
+ butterflies_ac v17, v20, v23
+ butterflies_ac v18, v21, v24
+ // second group of five stores, next 10 twiddles from x2
+ butterflies_dc
+ butterflies_ac v17, v20, v23
+ butterflies_ac v18, v21, v24
+ // third group of five stores, last 10 twiddles from x2
+ butterflies_dc
+ butterflies_ac v17, v20, v23
+ butterflies_ac v18, v21, v24
+
+ ret
+endfunc
@@ -319,6 +319,9 @@ av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
if (ARCH_X86)
ff_mdct15_init_x86(s);
+ if (ARCH_AARCH64)
+ ff_mdct15_init_aarch64(s);
+
*ps = s;
return 0;
@@ -58,5 +58,6 @@ int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
void ff_mdct15_uninit(MDCT15Context **ps);
void ff_mdct15_init_x86(MDCT15Context *s);
+void ff_mdct15_init_aarch64(MDCT15Context *s);
#endif /* AVCODEC_MDCT15_H */
--
2.20.1