[FFmpeg-devel,2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation

Submitted by Lynne on April 20, 2019, 1:12 a.m.

Details

Message ID LcsIFXU--3-1@lynne.ee
State New
Headers show

Commit Message

Lynne April 20, 2019, 1:12 a.m.
NEON:
5225 UNITS in fft15,   65536 runs,      0 skips

C:
7889 UNITS in fft15,   65535 runs,      1 skips

Posting this to maybe get some advice; it's too slow for any noticeable
decoding speed increase.

Patch hide | download patch | download mbox

From 534dd20b4961c2f0c4131fbbadac5f1940bad3a0 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Sat, 20 Apr 2019 02:05:23 +0100
Subject: [PATCH 2/2] aarch64/mdct15: implement a WIP fft15 NEON implementation

NEON:
5225 UNITS in fft15,   65536 runs,      0 skips

C:
7889 UNITS in fft15,   65535 runs,      1 skips

Posting this to maybe get some advice; it's too slow for any noticeable
decoding speed increase.
---
 libavcodec/aarch64/Makefile      |   2 +
 libavcodec/aarch64/mdct15_init.c |  86 +++++++++++++++++++
 libavcodec/aarch64/mdct15_neon.S | 136 +++++++++++++++++++++++++++++++
 libavcodec/mdct15.c              |   3 +
 libavcodec/mdct15.h              |   1 +
 5 files changed, 228 insertions(+)
 create mode 100644 libavcodec/aarch64/mdct15_init.c
 create mode 100644 libavcodec/aarch64/mdct15_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 00f93bf59f..8093fc17cb 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -6,6 +6,7 @@  OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
 OBJS-$(CONFIG_H264PRED)                 += aarch64/h264pred_init.o
 OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
+OBJS-$(CONFIG_MDCT15)                   += aarch64/mdct15_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
@@ -43,6 +44,7 @@  NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
 NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
 NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_init_aarch64.o      \
                                            aarch64/simple_idct_neon.o
+NEON-OBJS-$(CONFIG_MDCT15)              += aarch64/mdct15_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/mdct15_init.c b/libavcodec/aarch64/mdct15_init.c
new file mode 100644
index 0000000000..a4e24c9c13
--- /dev/null
+++ b/libavcodec/aarch64/mdct15_init.c
@@ -0,0 +1,86 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mdct15.h"
+
+void ff_fft15_neon(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride); /* implemented in aarch64/mdct15_neon.S; stride is scaled by 8 inside the routine */
+
+static void perm_twiddles(MDCT15Context *s) /* rewrites s->exptab[0..63] in place into the order ff_fft15_neon indexes */
+{
+    int k;
+    FFTComplex tmp[30]; /* gathered copy of the source entries, so the in-place writes below never read a slot that was already overwritten */
+
+    /* 5-point FFT twiddles: built from entries 19 and 20; ff_fft15_neon loads entries 60..63 into v5/v6 */
+    s->exptab[60].re =  s->exptab[19].re;
+    s->exptab[60].im =  s->exptab[19].im;
+    s->exptab[61].re = -s->exptab[20].re;
+    s->exptab[61].im =  s->exptab[20].im;
+    s->exptab[62].re =  s->exptab[20].im; /* entries 62..63 hold (im, im, -im, -im) of entry 20: the fft5 macro uses them as one 4-lane vector (v6) */
+    s->exptab[62].im =  s->exptab[20].im;
+    s->exptab[63].re = -s->exptab[20].im;
+    s->exptab[63].im = -s->exptab[20].im;
+
+    /* 15-point FFT twiddles: gather 5 rows (one per k) of 6 entries each into tmp */
+    for (k = 0; k < 5; k++) {
+        tmp[6*k + 0] = s->exptab[k +  0]; /* even columns: entries k, k + 5, k + 10 */
+        tmp[6*k + 2] = s->exptab[k +  5];
+        tmp[6*k + 4] = s->exptab[k + 10];
+
+        tmp[6*k + 1] = s->exptab[2 * (k + 0)]; /* odd columns: entries 2k, 2k + 10, 2k + 5 */
+        tmp[6*k + 3] = s->exptab[2 * (k + 5)];
+        tmp[6*k + 5] = s->exptab[2 *  k + 5 ];
+    }
+    /* Rows 1..4 of tmp: for each column k, emit 8 entries into exptab[8k .. 8k + 7] (fills 0..47) */
+    for (k = 0; k < 6; k++) {
+        FFTComplex ac_exp[] = {
+            {  tmp[6*1 + k].re, tmp[6*1 + k].im }, /* rows 1 and 2 as (re, im) */
+            {  tmp[6*2 + k].re, tmp[6*2 + k].im },
+            { -tmp[6*1 + k].im, tmp[6*1 + k].re }, /* rows 1 and 2 again as (-im, re) */
+            { -tmp[6*2 + k].im, tmp[6*2 + k].re },
+            {  tmp[6*3 + k].re, tmp[6*3 + k].im }, /* rows 3 and 4 as (re, im) */
+            {  tmp[6*4 + k].re, tmp[6*4 + k].im },
+            { -tmp[6*3 + k].im, tmp[6*3 + k].re }, /* rows 3 and 4 again as (-im, re) */
+            { -tmp[6*4 + k].im, tmp[6*4 + k].re },
+        };
+        memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex)); /* NOTE(review): no <string.h> include is visible in this file for memcpy -- confirm it arrives transitively */
+    }
+
+    /* Special case when k = 0: row 0 of tmp (entries 0..5), taken two at a time */
+    for (k = 0; k < 3; k++) {
+        FFTComplex dc_exp[] = {
+            {  tmp[2*k + 0].re, tmp[2*k + 0].im }, /* the pair as (re, im) */
+            {  tmp[2*k + 1].re, tmp[2*k + 1].im },
+            { -tmp[2*k + 0].im, tmp[2*k + 0].re }, /* the same pair as (-im, re) */
+            { -tmp[2*k + 1].im, tmp[2*k + 1].re },
+        };
+        memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex)); /* fills exptab[48 .. 59], 4 entries per k */
+    }
+}
+
+av_cold void ff_mdct15_init_aarch64(MDCT15Context *s) /* aarch64 hook; ff_mdct15_init() calls it when ARCH_AARCH64 is set */
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (0 && have_neon(cpu_flags)) { /* the leading 0 makes this condition always false, so the WIP NEON fft15 is never installed */
+        s->fft15 = ff_fft15_neon;
+        perm_twiddles(s); /* the table is reordered only together with installing the NEON routine */
+    }
+}
diff --git a/libavcodec/aarch64/mdct15_neon.S b/libavcodec/aarch64/mdct15_neon.S
new file mode 100644
index 0000000000..aee12b25b8
--- /dev/null
+++ b/libavcodec/aarch64/mdct15_neon.S
@@ -0,0 +1,136 @@ 
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const sign_adj_pnnp, align=4
+        .float 1, -1, -1, 1                    // per-lane signs (+, -, -, +); ff_fft15_neon loads this into v7 and fft5 multiplies by it
+endconst
+
+.macro fft5 dc, ac1, ac2                       // 5-point FFT of 5 inputs spaced 24 bytes apart starting at x1; needs v5/v6/v7 preloaded; clobbers v0-v4 and x4
+        ld1r {v0.2d}, [x1], #8                 // in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im; x1 then advances by 8 bytes
+        add x4, x1, #(2*2*4)                   // offsets below are relative to the already advanced x1
+        ld1 {v1.d}[0], [x4]                    // in[ 3].re, in[ 3].im, (upper half not loaded yet)
+        add x4, x1, #(5*2*4)
+        ld1 {v1.d}[1], [x4]                    // in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
+        add x4, x1, #(8*2*4)
+        ld1 {v2.d}[1], [x4]                    // (lower half not loaded yet), in[ 9].re, in[ 9].im
+        add x4, x1, #(11*2*4)
+        ld1 {v2.d}[0], [x4]                    // in[12].re, in[12].im, in[ 9].re, in[ 9].im
+
+        fsub v3.4s, v1.4s, v2.4s               // t[1].im, t[1].re, t[3].im, t[3].re
+        fadd v1.4s, v1.4s, v2.4s               // t[0].re, t[0].im, t[2].re, t[2].im
+
+        mov v4.d[0], v3.d[1]                   // v4 = v3 with its two 64-bit halves swapped
+        mov v4.d[1], v3.d[0]                   // t[3].im, t[3].re, t[1].im, t[1].re
+
+        mov v2.d[0], v1.d[1]                   // v2 = v1 with its two 64-bit halves swapped
+        mov v2.d[1], v1.d[0]                   // t[2].re, t[2].im, t[0].re, t[0].im
+
+        fadd \dc\().2s, v0.2s, v1.2s
+        fadd \dc\().2s, \dc\().2s, v2.2s       // DC[0].re, DC[0].im (a .2s write leaves the upper half of the register zero)
+
+        fmul v1.4s, v1.4s, v5.s[0]             // exptab[0].re * t[0,2].re,im (v5.s[0] is table entry 60 .re)
+        fmul v3.4s, v3.4s, v5.s[1]             // exptab[0].im * t[1,3].im,re (v5.s[1] is table entry 60 .im)
+
+        fmla v1.4s, v2.4s, v5.s[2]             // t[0].re, t[0].im, t[4].re, t[4].im (v5.s[2] is table entry 61 .re)
+        fmla v3.4s, v4.4s, v6.4s               // t[1].im, t[1].re, t[5].im, t[5].re (v6 is table entries 62..63, one factor per lane)
+
+        fadd v1.4s, v1.4s, v0.4s               // in[0,0].re,im + t[0,4].re,im
+        rev64 v3.4s, v3.4s                     // t[1].re, t[1].im, t[5].re, t[5].im
+
+        fmul v3.4s, v3.4s, v7.4s               // (+--+)t[1,5].re,im
+
+        fadd \ac1\().4s, v1.4s, v3.4s          // out[1,2].re,im
+        fsub v2.4s, v1.4s, v3.4s               // out[4,3].re,im
+
+        mov \ac2\().d[0], v2.d[1]              // swap the halves of v2 while copying into the output register
+        mov \ac2\().d[1], v2.d[0]              // out[3,4].re,im
+.endm
+
+.macro butterflies_dc offset                   // combines the fft5 dc results in v16, v19, v22 with the table at x2 + offset; stores one complex at x0, then x0 += x3; clobbers v0-v3 and x4
+        add x4, x2, #(\offset)
+        ld1 {v0.4s, v1.4s}, [x4]               // 32 bytes of table: v0 scales the .re lanes, v1 scales the .im lanes
+
+        uzp1 v2.4s, v19.4s, v22.4s             // t[0].re, t[0].re, t[1].re, t[1].re (relies on v19/v22 holding their value in both halves)
+        uzp2 v3.4s, v19.4s, v22.4s             // t[0].im, t[0].im, t[1].im, t[1].im
+
+        fmul v2.4s, v2.4s, v0.4s               // t[0,1].rere * exptab[k,2*k].reim
+        fmla v2.4s, v3.4s, v1.4s               // t[0].re, t[0].im, t[1].re, t[1].im
+
+        mov v3.d[0], v2.d[1]                   // t[1].re, t[1].im
+        fadd v2.2s, v2.2s, v3.2s               // sum of the two products
+        fadd v2.2s, v2.2s, v16.2s              // plus the untwiddled dc value from the first fft5
+
+        st1 {v2.2s}, [x0], x3                  // store 8 bytes, then advance x0 by x3 bytes
+.endm
+
+.macro butterflies_ac offset, dc, ac1, ac2     // stores two complex outputs at x0 (x0 += x3 after each); clobbers v0-v7 and x4, so the v5-v7 constants of fft5 are gone afterwards
+        add x4, x2, #(\offset)
+        ld1 {v0.4s, v1.4s}, [x4]               // exptab[01].reim, exptab[23].reim
+        add x4, x4, #64
+        ld1 {v2.4s, v3.4s}, [x4]               // same layout 64 bytes further on: the factors applied to ac2
+
+        trn1 v4.4s, \ac1\().4s, \ac1\().4s     // t[1].re, t[1].re, t[2].re, t[2].re
+        trn2 v5.4s, \ac1\().4s, \ac1\().4s     // t[1].im, t[1].im, t[2].im, t[2].im
+        trn1 v6.4s, \ac2\().4s, \ac2\().4s     // the same split for ac2: even (.re) lanes duplicated
+        trn2 v7.4s, \ac2\().4s, \ac2\().4s     // odd (.im) lanes duplicated
+
+        fmul v0.4s, v4.4s, v0.4s               // tmp2[1,1].re * exptab[1,2].reim
+        fmul v2.4s, v6.4s, v2.4s               // .re lanes of ac2 times its first table vector
+
+        fmla v0.4s, v5.4s, v1.4s               // adds the .im lanes of ac1 times v1, finishing the two products for ac1
+        fmla v2.4s, v7.4s, v3.4s               // adds the .im lanes of ac2 times v3, finishing the two products for ac2
+
+        fadd v0.4s, v0.4s, v2.4s               // sum of the ac1 and ac2 products
+
+        fadd v0.4s, v0.4s, \dc\().4s           // out[stride*1].reim, out[stride*2].reim
+
+        st1 {v0.d}[0], [x0], x3                // low 8 bytes, then advance x0 by x3 bytes
+        st1 {v0.d}[1], [x0], x3                // high 8 bytes, then advance x0 by x3 bytes
+.endm
+
+function ff_fft15_neon, export=1               // used here as: x0 = output pointer, x1 = input pointer, x2 = table base, x3 = stride (matches the C prototype's argument order)
+        movrel x4, sign_adj_pnnp
+        ld1 {v7.4s}, [x4]                      // v7 = (1, -1, -1, 1), the sign vector fft5 multiplies by
+
+        add x4, x2, #(60*2*4)
+        ld1 {v5.4s, v6.4s}, [x4]               // 32 bytes starting 480 bytes into the table: the factors used by fft5
+        // three fft5 passes; the ld1r post-increment in each moves x1 on by 8 bytes, so they start at consecutive inputs
+        fft5 v16, v17, v18
+        fft5 v19, v20, v21
+        fft5 v22, v23, v24
+
+        mov v19.d[1], v19.d[0]                 // copy the low half up: butterflies_dc de-interleaves all four lanes of v19 and v22
+        mov v22.d[1], v22.d[0]
+        lsl x3, x3, #3                         // stride * 8: x3 becomes the byte step applied after every store
+        // each group below stores 5 values: one from butterflies_dc, then two from each butterflies_ac
+        butterflies_dc (8*6 + 4*0)*2*4
+        butterflies_ac (8*0 + 0*0)*2*4, v17, v20, v23
+        butterflies_ac (8*0 + 1*4)*2*4, v18, v21, v24
+
+        butterflies_dc (8*6 + 4*1)*2*4
+        butterflies_ac (8*2 + 0*0)*2*4, v17, v20, v23
+        butterflies_ac (8*2 + 1*4)*2*4, v18, v21, v24
+
+        butterflies_dc (8*6 + 4*2)*2*4
+        butterflies_ac (8*4 + 0*0)*2*4, v17, v20, v23
+        butterflies_ac (8*4 + 1*4)*2*4, v18, v21, v24
+
+        ret
+endfunc
diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
index 6f35059bfe..9e2003b1af 100644
--- a/libavcodec/mdct15.c
+++ b/libavcodec/mdct15.c
@@ -319,6 +319,9 @@  av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
     if (ARCH_X86)
         ff_mdct15_init_x86(s);
 
+    if (ARCH_AARCH64)
+        ff_mdct15_init_aarch64(s);
+
     *ps = s;
 
     return 0;
diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
index 42e60f3e10..58a1946db0 100644
--- a/libavcodec/mdct15.h
+++ b/libavcodec/mdct15.h
@@ -58,5 +58,6 @@  int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
 void ff_mdct15_uninit(MDCT15Context **ps);
 
 void ff_mdct15_init_x86(MDCT15Context *s);
+void ff_mdct15_init_aarch64(MDCT15Context *s);
 
 #endif /* AVCODEC_MDCT15_H */
-- 
2.20.1