diff mbox

[FFmpeg-devel,v3] mdct15: add assembly optimizations for the 15-point FFT

Message ID 20170622224435.9684-1-atomnuker@gmail.com
State Superseded
Headers show

Commit Message

Rostislav Pehlivanov June 22, 2017, 10:44 p.m. UTC
c:    1802 decicycles in fft15,16774635 runs,   2581 skips
fma3:  935 decicycles in fft15,16775893 runs,   1323 skips

Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
---
 libavcodec/mdct15.c          | 182 +++++++++++++++++++++----------------------
 libavcodec/mdct15.h          |  26 +++----
 libavcodec/x86/Makefile      |   2 +
 libavcodec/x86/mdct15.asm    | 146 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/mdct15_init.c |  95 ++++++++++++++++++++++
 5 files changed, 342 insertions(+), 109 deletions(-)
 create mode 100644 libavcodec/x86/mdct15.asm
 create mode 100644 libavcodec/x86/mdct15_init.c

Comments

Henrik Gramner June 23, 2017, 12:44 a.m. UTC | #1
On Fri, Jun 23, 2017 at 12:44 AM, Rostislav Pehlivanov
<atomnuker@gmail.com> wrote:
> +%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
> +    movddup xm0, [inq + 0*16 +  0 + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
> +    movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
> +    movsd   xm2, [inq + 2*16 + 16 + %1] ; in[ 5].re, in[ 5].im, in[ 6].re, in[ 6].im
> +    movsd   xm3, [inq + 4*16 +  8 + %1] ; in[ 8].re, in[ 8].im, in[ 9].re, in[ 9].im
> +    movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
> +
> +    vinsertf128  m0,  xm0, 1
> +
> +    shufps      xm1,  xm2, q1010        ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
> +    shufps      xm4,  xm3, q1010        ; in[12].re, in[12].im, in[ 9].re, in[ 9].im

vbroadcastsd instead of movddup + vinsertf128.
movhps instead of movsd+shufps.

> +%macro BUTTERFLIES_DC 2 ; %1 - exptab_offset, %2 - out
> +    movaps m0, [exptabq + %1]
> +    vextractf128 xm1, m0, 1
> +
> +    mulps   xm1, xm10
> +    mulps   xm0, xm9

mulps xm0, xm9, [exptabq + %1]
mulps xm1, xm10, [exptabq + %1 + 16]

(cross-lane shuffles are slow, avoid them when possible)

> +%macro BUTTERFLIES_AC 2 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m3)
> +    mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
> +    mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
> +    mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
> +    mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
> +
> +    shufps m1, m1, q2301
> +    shufps m3, m3, q2301
> +
> +    addps m0, m1
> +    addps m2, m3
> +    addps m0, m2

Adding m1 and m3 before shuffling should allow you to remove one
shufps. Might also be beneficial to reorder the multiplies so that m1
and m3 are calculated before m0 and m2.

> +cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
> +%define out0q inq
> +    shl strideq, 3
> +
> +    movaps m5, [exptabq + 480]
> +    vextractf128 xm6, m5, 1

Use two loads instead of a cross-lane shuffle.
diff mbox

Patch

diff --git a/libavcodec/mdct15.c b/libavcodec/mdct15.c
index 8c42ece483..f93881fbed 100644
--- a/libavcodec/mdct15.c
+++ b/libavcodec/mdct15.c
@@ -57,11 +57,6 @@  av_cold void ff_mdct15_uninit(MDCT15Context **ps)
     av_freep(ps);
 }
 
-static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
-
-static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
-                         ptrdiff_t stride, float scale);
-
 static inline int init_pfa_reindex_tabs(MDCT15Context *s)
 {
     int i, j;
@@ -93,88 +88,8 @@  static inline int init_pfa_reindex_tabs(MDCT15Context *s)
     return 0;
 }
 
-av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
-{
-    MDCT15Context *s;
-    double alpha, theta;
-    int len2 = 15 * (1 << N);
-    int len  = 2 * len2;
-    int i;
-
-    /* Tested and verified to work on everything in between */
-    if ((N < 2) || (N > 13))
-        return AVERROR(EINVAL);
-
-    s = av_mallocz(sizeof(*s));
-    if (!s)
-        return AVERROR(ENOMEM);
-
-    s->fft_n = N - 1;
-    s->len4 = len2 / 2;
-    s->len2 = len2;
-    s->inverse = inverse;
-    s->mdct = mdct15;
-    s->imdct_half = imdct15_half;
-
-    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
-        goto fail;
-
-    if (init_pfa_reindex_tabs(s))
-        goto fail;
-
-    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
-    if (!s->tmp)
-        goto fail;
-
-    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
-    if (!s->twiddle_exptab)
-        goto fail;
-
-    theta = 0.125f + (scale < 0 ? s->len4 : 0);
-    scale = sqrt(fabs(scale));
-    for (i = 0; i < s->len4; i++) {
-        alpha = 2 * M_PI * (i + theta) / len;
-        s->twiddle_exptab[i].re = cos(alpha) * scale;
-        s->twiddle_exptab[i].im = sin(alpha) * scale;
-    }
-
-    /* 15-point FFT exptab */
-    for (i = 0; i < 19; i++) {
-        if (i < 15) {
-            double theta = (2.0f * M_PI * i) / 15.0f;
-            if (!s->inverse)
-                theta *= -1;
-            s->exptab[i].re = cos(theta);
-            s->exptab[i].im = sin(theta);
-        } else { /* Wrap around to simplify fft15 */
-            s->exptab[i] = s->exptab[i - 15];
-        }
-    }
-
-    /* 5-point FFT exptab */
-    s->exptab[19].re = cos(2.0f * M_PI / 5.0f);
-    s->exptab[19].im = sin(2.0f * M_PI / 5.0f);
-    s->exptab[20].re = cos(1.0f * M_PI / 5.0f);
-    s->exptab[20].im = sin(1.0f * M_PI / 5.0f);
-
-    /* Invert the phase for an inverse transform, do nothing for a forward transform */
-    if (s->inverse) {
-        s->exptab[19].im *= -1;
-        s->exptab[20].im *= -1;
-    }
-
-    *ps = s;
-
-    return 0;
-
-fail:
-    ff_mdct15_uninit(&s);
-    return AVERROR(ENOMEM);
-}
-
 /* Stride is hardcoded to 3 */
-static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
-                        const FFTComplex *in)
+static inline void fft5(FFTComplex *out, FFTComplex *in, FFTComplex exptab[2])
 {
     FFTComplex z0[4], t[6];
 
@@ -219,14 +134,14 @@  static inline void fft5(const FFTComplex exptab[2], FFTComplex *out,
     out[4].im = in[0].im + z0[3].im;
 }
 
-static void fft15(const FFTComplex exptab[22], FFTComplex *out, const FFTComplex *in, size_t stride)
+static void fft15_c(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride)
 {
     int k;
     FFTComplex tmp1[5], tmp2[5], tmp3[5];
 
-    fft5(exptab + 19, tmp1, in + 0);
-    fft5(exptab + 19, tmp2, in + 1);
-    fft5(exptab + 19, tmp3, in + 2);
+    fft5(tmp1, in + 0, exptab + 19);
+    fft5(tmp2, in + 1, exptab + 19);
+    fft5(tmp3, in + 2, exptab + 19);
 
     for (k = 0; k < 5; k++) {
         FFTComplex t[2];
@@ -269,7 +184,7 @@  static void mdct15(MDCT15Context *s, float *dst, const float *src, ptrdiff_t str
             }
             CMUL(fft15in[j].re, fft15in[j].im, re, im, s->twiddle_exptab[k].re, -s->twiddle_exptab[k].im);
         }
-        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
+        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
     }
 
     /* Then a 15xN FFT (where N is a power of two) */
@@ -306,7 +221,7 @@  static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
             FFTComplex tmp = { *(in2 - 2*k*stride), *(in1 + 2*k*stride) };
             CMUL3(fft15in[j], tmp, s->twiddle_exptab[k]);
         }
-        fft15(s->exptab, s->tmp + s->ptwo_fft.revtab[i], fft15in, l_ptwo);
+        s->fft15(s->tmp + s->ptwo_fft.revtab[i], fft15in, s->exptab, l_ptwo);
     }
 
     /* Then a 15xN FFT (where N is a power of two) */
@@ -327,3 +242,86 @@  static void imdct15_half(MDCT15Context *s, float *dst, const float *src,
         z[i0].im = scale * im1;
     }
 }
+
+av_cold int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale)
+{
+    MDCT15Context *s;
+    double alpha, theta;
+    int len2 = 15 * (1 << N);
+    int len  = 2 * len2;
+    int i;
+
+    /* Tested and verified to work on everything in between */
+    if ((N < 2) || (N > 13))
+        return AVERROR(EINVAL);
+
+    s = av_mallocz(sizeof(*s));
+    if (!s)
+        return AVERROR(ENOMEM);
+
+    s->fft_n      = N - 1;
+    s->len4       = len2 / 2;
+    s->len2       = len2;
+    s->inverse    = inverse;
+    s->fft15      = fft15_c;
+    s->mdct       = mdct15;
+    s->imdct_half = imdct15_half;
+
+    if (ff_fft_init(&s->ptwo_fft, N - 1, s->inverse) < 0)
+        goto fail;
+
+    if (init_pfa_reindex_tabs(s))
+        goto fail;
+
+    s->tmp  = av_malloc_array(len, 2 * sizeof(*s->tmp));
+    if (!s->tmp)
+        goto fail;
+
+    s->twiddle_exptab  = av_malloc_array(s->len4, sizeof(*s->twiddle_exptab));
+    if (!s->twiddle_exptab)
+        goto fail;
+
+    theta = 0.125f + (scale < 0 ? s->len4 : 0);
+    scale = sqrt(fabs(scale));
+    for (i = 0; i < s->len4; i++) {
+        alpha = 2 * M_PI * (i + theta) / len;
+        s->twiddle_exptab[i].re = cosf(alpha) * scale;
+        s->twiddle_exptab[i].im = sinf(alpha) * scale;
+    }
+
+    /* 15-point FFT exptab */
+    for (i = 0; i < 19; i++) {
+        if (i < 15) {
+            double theta = (2.0f * M_PI * i) / 15.0f;
+            if (!s->inverse)
+                theta *= -1;
+            s->exptab[i].re = cosf(theta);
+            s->exptab[i].im = sinf(theta);
+        } else { /* Wrap around to simplify fft15 */
+            s->exptab[i] = s->exptab[i - 15];
+        }
+    }
+
+    /* 5-point FFT exptab */
+    s->exptab[19].re = cosf(2.0f * M_PI / 5.0f);
+    s->exptab[19].im = sinf(2.0f * M_PI / 5.0f);
+    s->exptab[20].re = cosf(1.0f * M_PI / 5.0f);
+    s->exptab[20].im = sinf(1.0f * M_PI / 5.0f);
+
+    /* Invert the phase for an inverse transform, do nothing for a forward transform */
+    if (s->inverse) {
+        s->exptab[19].im *= -1;
+        s->exptab[20].im *= -1;
+    }
+
+    if (ARCH_X86)
+        ff_mdct15_init_x86(s);
+
+    *ps = s;
+
+    return 0;
+
+fail:
+    ff_mdct15_uninit(&s);
+    return AVERROR(ENOMEM);
+}
diff --git a/libavcodec/mdct15.h b/libavcodec/mdct15.h
index ef94edff6c..7d83f3ebdf 100644
--- a/libavcodec/mdct15.h
+++ b/libavcodec/mdct15.h
@@ -34,34 +34,26 @@  typedef struct MDCT15Context {
     int *pfa_postreindex;
 
     FFTContext ptwo_fft;
-
     FFTComplex *tmp;
-
     FFTComplex *twiddle_exptab;
 
-    /* 0 - 18: fft15 twiddles, 19 - 20: fft5 twiddles */
-    FFTComplex exptab[21];
+    DECLARE_ALIGNED(32, FFTComplex, exptab)[64];
 
-    /**
-     * Calculate a full 2N -> N MDCT
-     */
+    /* 15-point FFT */
+    void (*fft15)(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+
+    /* Calculate a full 2N -> N MDCT */
     void (*mdct)(struct MDCT15Context *s, float *dst, const float *src, ptrdiff_t stride);
 
-    /**
-     * Calculate the middle half of the iMDCT
-     */
+    /* Calculate the middle half of the iMDCT */
     void (*imdct_half)(struct MDCT15Context *s, float *dst, const float *src,
                        ptrdiff_t src_stride, float scale);
 } MDCT15Context;
 
-/**
- * Init an (i)MDCT of the length 2 * 15 * (2^N)
- */
+/* Init an (i)MDCT of the length 2 * 15 * (2^N) */
 int ff_mdct15_init(MDCT15Context **ps, int inverse, int N, double scale);
-
-/**
- * Frees a context
- */
 void ff_mdct15_uninit(MDCT15Context **ps);
 
+void ff_mdct15_init_x86(MDCT15Context *s);
+
 #endif /* AVCODEC_MDCT15_H */
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index e65118d134..b86700b675 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@  OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
 OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
+OBJS-$(CONFIG_MDCT15)                  += x86/mdct15_init.o
 OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodsp.o
 OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o              \
@@ -117,6 +118,7 @@  X86ASM-OBJS-$(CONFIG_IDCTDSP)          += x86/idctdsp.o
 X86ASM-OBJS-$(CONFIG_LLAUDDSP)         += x86/lossless_audiodsp.o
 X86ASM-OBJS-$(CONFIG_LLVIDDSP)         += x86/lossless_videodsp.o
 X86ASM-OBJS-$(CONFIG_LLVIDENCDSP)      += x86/lossless_videoencdsp.o
+X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
 X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
 X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
 X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
diff --git a/libavcodec/x86/mdct15.asm b/libavcodec/x86/mdct15.asm
new file mode 100644
index 0000000000..ec167fbaec
--- /dev/null
+++ b/libavcodec/x86/mdct15.asm
@@ -0,0 +1,146 @@ 
+;******************************************************************************
+;* SIMD optimized non-power-of-two MDCT functions
+;*
+;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
+
+SECTION .text
+
+%macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
+    movddup xm0, [inq + 0*16 +  0 + %1] ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
+    movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
+    movsd   xm2, [inq + 2*16 + 16 + %1] ; in[ 5].re, in[ 5].im, in[ 6].re, in[ 6].im
+    movsd   xm3, [inq + 4*16 +  8 + %1] ; in[ 8].re, in[ 8].im, in[ 9].re, in[ 9].im
+    movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
+
+    vinsertf128  m0,  xm0, 1
+
+    shufps      xm1,  xm2, q1010        ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
+    shufps      xm4,  xm3, q1010        ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
+
+    subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
+    addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
+
+    movhlps     %2,   xm1               ; t[0].re, t[1].re, t[0].im, t[1].im
+    addps       %2,   xm1
+    addps       %2,   xm0               ; DC[0].re, DC[0].im, junk...
+    movlhps     %2,   %2                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
+
+    shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
+    shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
+
+    mulps       xm%3, xm1, xm5
+    mulps       xm4,  xm3, xm6
+    mulps       xm1,  xm6
+
+    xorps       xm1,  xm7
+    mulps       xm3,  xm5
+    addsubps    xm3,  xm1               ; t[0].re, t[0].im, t[2].re, t[2].im
+    subps       xm%3, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
+
+    movhlps     xm2, xm%3, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
+    movlhps     xm3, xm%3               ; t[0].re, t[0].im, t[4].re, t[4].im
+
+    xorps       xm2,  xm7
+    addps       xm%3, xm2, xm3
+    subps       xm3,  xm2
+
+    shufps      xm3,  xm3, q1032
+    vinsertf128 m%3,  xm3, 1            ; All ACs (tmp[1] through to tmp[4])
+    addps       m%3,  m0                ; Finally offset with DCs
+%endmacro
+
+%macro BUTTERFLIES_DC 2 ; %1 - exptab_offset, %2 - out
+    movaps m0, [exptabq + %1]
+    vextractf128 xm1, m0, 1
+
+    mulps   xm1, xm10
+    mulps   xm0, xm9
+
+    haddps  xm0,  xm1
+    movhlps xm1,  xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im
+
+    addps   xm0,  xm1
+    addps   xm0,  xm8
+
+    movsd [%2q], xm0
+%endmacro
+
+%macro BUTTERFLIES_AC 2 ; exptab, exptab_offset, src1, src2, src3, out (uses m0-m3)
+    mulps m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
+    mulps m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
+    mulps m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
+    mulps m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
+
+    shufps m1, m1, q2301
+    shufps m3, m3, q2301
+
+    addps m0, m1
+    addps m2, m3
+    addps m0, m2
+    addps m0, m11
+
+    vextractf128 xm1, m0, 1
+
+    movlps [%2q + strideq*1], xm0
+    movhps [%2q + strideq*2], xm0
+    movlps [%2q +  stride3q], xm1
+    movhps [%2q + strideq*4], xm1
+%endmacro
+
+;*****************************************************************************************
+;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+;*****************************************************************************************
+INIT_YMM avx
+cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
+%define out0q inq
+    shl strideq, 3
+
+    movaps m5, [exptabq + 480]
+    vextractf128 xm6, m5, 1
+    movaps xm7, [sign_adjust_5]
+
+    FFT5  0,  xm8, 11
+    FFT5  8,  xm9, 12
+    FFT5 16, xm10, 13
+
+    lea stride3q, [strideq  + strideq*2]
+    lea stride5q, [stride3q + strideq*2]
+
+    mov out0q, outq
+
+    BUTTERFLIES_DC (8*6 + 4*0)*2*4, out0
+    lea outq, [out0q + stride5q*1]
+    BUTTERFLIES_DC (8*6 + 4*1)*2*4, out
+    lea outq, [out0q + stride5q*2]
+    BUTTERFLIES_DC (8*6 + 4*2)*2*4, out
+
+    BUTTERFLIES_AC (8*0)*2*4, out0
+    lea outq, [out0q + stride5q*1]
+    BUTTERFLIES_AC (8*2)*2*4, out
+    lea outq, [out0q + stride5q*2]
+    BUTTERFLIES_AC (8*4)*2*4, out
+
+    RET
diff --git a/libavcodec/x86/mdct15_init.c b/libavcodec/x86/mdct15_init.c
new file mode 100644
index 0000000000..ba3d94c2ec
--- /dev/null
+++ b/libavcodec/x86/mdct15_init.c
@@ -0,0 +1,95 @@ 
+/*
+ * SIMD optimized non-power-of-two MDCT functions
+ *
+ * Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/mdct15.h"
+
+void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
+
+static void perm_twiddles(MDCT15Context *s)
+{
+    int k;
+    FFTComplex exp_5point[4];
+
+    FFTComplex tmp[21], tmp2[30];
+    memcpy(tmp, s->exptab, sizeof(FFTComplex)*21);
+
+    /* 15-point FFT twiddles */
+    for (k = 0; k < 5; k++) {
+        tmp2[6*k + 0] = tmp[k +  0];
+        tmp2[6*k + 2] = tmp[k +  5];
+        tmp2[6*k + 4] = tmp[k + 10];
+
+        tmp2[6*k + 1] = tmp[2 * (k + 0)];
+        tmp2[6*k + 3] = tmp[2 * (k + 5)];
+        tmp2[6*k + 5] = tmp[2 *  k + 5 ];
+    }
+
+    for (k = 0; k < 6; k++) {
+        FFTComplex ac_exp[] = {
+            { tmp2[6*1 + k].re,  tmp2[6*1 + k].re },
+            { tmp2[6*2 + k].re,  tmp2[6*2 + k].re },
+            { tmp2[6*3 + k].re,  tmp2[6*3 + k].re },
+            { tmp2[6*4 + k].re,  tmp2[6*4 + k].re },
+            { tmp2[6*1 + k].im, -tmp2[6*1 + k].im },
+            { tmp2[6*2 + k].im, -tmp2[6*2 + k].im },
+            { tmp2[6*3 + k].im, -tmp2[6*3 + k].im },
+            { tmp2[6*4 + k].im, -tmp2[6*4 + k].im },
+        };
+        memcpy(s->exptab + 8*k, ac_exp, 8*sizeof(FFTComplex));
+    }
+
+    /* Specialcase when k = 0 */
+    for (k = 0; k < 3; k++) {
+        FFTComplex dc_exp[] = {
+            { tmp2[2*k + 0].re, -tmp2[2*k + 0].im },
+            { tmp2[2*k + 0].im,  tmp2[2*k + 0].re },
+            { tmp2[2*k + 1].re, -tmp2[2*k + 1].im },
+            { tmp2[2*k + 1].im,  tmp2[2*k + 1].re },
+        };
+        memcpy(s->exptab + 8*6 + 4*k, dc_exp, 4*sizeof(FFTComplex));
+    }
+
+    /* 5-point FFT twiddles */
+    exp_5point[0].re = exp_5point[0].im = tmp[19].re;
+    exp_5point[1].re = exp_5point[1].im = tmp[19].im;
+    exp_5point[2].re = exp_5point[2].im = tmp[20].re;
+    exp_5point[3].re = exp_5point[3].im = tmp[20].im;
+
+    memcpy(s->exptab + 8*6 + 4*3, exp_5point, 4*sizeof(FFTComplex));
+}
+
+av_cold void ff_mdct15_init_x86(MDCT15Context *s)
+{
+    int adjust_twiddles = 0;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (ARCH_X86_64 && EXTERNAL_AVX(cpu_flags)) {
+        s->fft15 = ff_fft15_avx;
+        adjust_twiddles = 1;
+    }
+
+    if (adjust_twiddles)
+        perm_twiddles(s);
+}