[FFmpeg-devel] x86/tx_float: add 15xN PFA FFT AVX SIMD

Message ID NCItbYi--3-2@lynne.ee
State New
Series [FFmpeg-devel] x86/tx_float: add 15xN PFA FFT AVX SIMD

Checks

Context Check Description
yinshiyou/configure_loongarch64 warning Failed to apply patch
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Lynne Sept. 19, 2022, 3:59 a.m. UTC
~4x faster than the C version.

Patch attached.

Patch

From 00112ce7895f48861dd5f4bbfe272874f95c428a Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Mon, 19 Sep 2022 05:53:01 +0200
Subject: [PATCH] x86/tx_float: add 15xN PFA FFT AVX SIMD

~4x faster than the C version.
---
 doc/transforms.md             | 323 ++++++++++++++++++++++++++++++++++
 libavutil/tx_template.c       |  53 +++---
 libavutil/x86/tx_float.asm    | 211 ++++++++++++++++++++++
 libavutil/x86/tx_float_init.c |  58 ++++++
 4 files changed, 622 insertions(+), 23 deletions(-)

diff --git a/doc/transforms.md b/doc/transforms.md
index 78f3f68d65..04a7c408f1 100644
--- a/doc/transforms.md
+++ b/doc/transforms.md
@@ -704,3 +704,326 @@  Also note that this function requires a special iteration way, due to coefficien
 beginning to overlap, particularly `[o1]` with `[0]` after the second iteration.
 To iterate further, set `z = &z[16]` via `z += 8` for the second iteration. After
 the 4th iteration, the layout resets, so repeat the same.
+
+
+# 15-point AVX FFT transform
+The 15-point transform is based on the fully unrolled function listed further
+below. The input must first be permuted via the following loop:
+
+``` C
+    for (int k = 0; k < s->sub[0].len; k++) {
+        int cnt = 0;
+        int tmp[15];
+        memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
+        for (int i = 1; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 2; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 0; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
+        memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
+        s->map[k*15 + 1] = tmp[2];
+        s->map[k*15 + 2] = tmp[0];
+    }
+
+```
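+
+For an identity mapping (`map[i] = i`), one iteration of this loop produces
+the order `1 2 0 4 7 10 13 5 8 11 14 3 6 9 12`: the three inputs of the DC
+3-point transform come first, followed by the twelve ACs grouped into three
+SIMD loads of four. A minimal standalone sketch (hypothetical, not part of
+lavu) that reproduces this:
+
+``` C
+#include <stdio.h>
+#include <string.h>
+
+int main(void)
+{
+    int map[15], tmp[15], cnt = 0;
+    for (int i = 0; i < 15; i++)
+        map[i] = i;                           /* identity mapping */
+    memcpy(tmp, map, sizeof(tmp));
+    for (int i = 1; i < 15; i += 3)           /* 2nd element of each 3pt */
+        map[cnt++] = tmp[i];
+    for (int i = 2; i < 15; i += 3)           /* 3rd element of each 3pt */
+        map[cnt++] = tmp[i];
+    for (int i = 0; i < 15; i += 3)           /* 1st element of each 3pt */
+        map[cnt++] = tmp[i];
+    memmove(&map[7], &map[6], 4*sizeof(int)); /* make room for the DC inputs */
+    memmove(&map[3], &map[1], 4*sizeof(int));
+    map[1] = tmp[2];
+    map[2] = tmp[0];
+    for (int i = 0; i < 15; i++)              /* 1 2 0 4 7 10 13 5 8 11 14 3 6 9 12 */
+        printf("%d ", map[i]);
+    printf("\n");
+    return 0;
+}
+```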
+
+This separates the ACs from the DCs and flips the SIMD direction, so that
+five 3-point transforms are performed at once, followed by three 5-point
+transforms. The factors are read from the new 12-entry `ff_tx_tab_53` layout:
+`tab[0..7]` holds the 5-point coefficients (each doubled to eliminate AVX
+lane shuffles) and `tab[8..11]` holds the 3-point coefficients.
+
+``` C
+static av_always_inline void fft15(TXComplex *out, TXComplex *in,
+                                   ptrdiff_t stride)
+{
+    const TXSample *tab = TX_TAB(ff_tx_tab_53);
+    TXComplex q[8];
+    TXComplex dc[3], pc[2];
+    TXComplex y[4], k[8];
+    TXComplex t[12];
+    TXComplex r[12];
+    TXComplex z0[12];
+
+    /* DC */
+    pc[0].re = in[ 1].im - in[ 0].im;
+    pc[0].im = in[ 1].re - in[ 0].re;
+    pc[1].re = in[ 1].re + in[ 0].re;
+    pc[1].im = in[ 1].im + in[ 0].im;
+
+    dc[0].re = in[2].re + pc[1].re;
+    dc[0].im = in[2].im + pc[1].im;
+
+    pc[0].re = tab[ 8] * pc[0].re;
+    pc[0].im = tab[ 9] * pc[0].im;
+    pc[1].re = tab[10] * pc[1].re;
+    pc[1].im = tab[11] * pc[1].im;
+
+    dc[1].re = pc[0].re + pc[1].re;
+    dc[1].im = pc[0].im + pc[1].im;
+    dc[2].re = pc[1].re - pc[0].re;
+    dc[2].im = pc[1].im - pc[0].im;
+
+    dc[1].re = in[2].re - dc[1].re;
+    dc[1].im = in[2].im + dc[1].im;
+    dc[2].re = in[2].re - dc[2].re;
+    dc[2].im = in[2].im + dc[2].im;
+
+    /* ACs */
+    q[0].im = in[ 3].re - in[ 7].re; // NOTE: .re and .im are deliberately swapped
+    q[0].re = in[ 3].im - in[ 7].im;
+    q[1].im = in[ 4].re - in[ 8].re;
+    q[1].re = in[ 4].im - in[ 8].im;
+    q[2].im = in[ 5].re - in[ 9].re;
+    q[2].re = in[ 5].im - in[ 9].im;
+    q[3].re = in[ 6].im - in[10].im;
+    q[3].im = in[ 6].re - in[10].re;
+
+    q[4].re = in[ 3].re + in[ 7].re;
+    q[4].im = in[ 3].im + in[ 7].im;
+    q[5].re = in[ 4].re + in[ 8].re;
+    q[5].im = in[ 4].im + in[ 8].im;
+    q[6].re = in[ 5].re + in[ 9].re;
+    q[6].im = in[ 5].im + in[ 9].im;
+    q[7].re = in[ 6].re + in[10].re;
+    q[7].im = in[ 6].im + in[10].im;
+
+    y[0].re = in[11].re + q[4].re;
+    y[0].im = in[11].im + q[4].im;
+    y[1].re = in[12].re + q[5].re;
+    y[1].im = in[12].im + q[5].im;
+    y[2].re = in[13].re + q[6].re;
+    y[2].im = in[13].im + q[6].im;
+    y[3].re = in[14].re + q[7].re;
+    y[3].im = in[14].im + q[7].im;
+
+    q[0].re = tab[ 8] * q[0].re;
+    q[0].im = tab[ 9] * q[0].im;
+    q[1].re = tab[ 8] * q[1].re;
+    q[1].im = tab[ 9] * q[1].im;
+    q[2].re = tab[ 8] * q[2].re;
+    q[2].im = tab[ 9] * q[2].im;
+    q[3].re = tab[ 8] * q[3].re;
+    q[3].im = tab[ 9] * q[3].im;
+
+    q[4].re = tab[10] * q[4].re;
+    q[4].im = tab[11] * q[4].im;
+    q[5].re = tab[10] * q[5].re;
+    q[5].im = tab[11] * q[5].im;
+    q[6].re = tab[10] * q[6].re;
+    q[6].im = tab[11] * q[6].im;
+    q[7].re = tab[10] * q[7].re;
+    q[7].im = tab[11] * q[7].im;
+
+    k[0].re = q[4].re - q[0].re;
+    k[0].im = q[4].im - q[0].im;
+    k[1].re = q[5].re - q[1].re;
+    k[1].im = q[5].im - q[1].im;
+    k[2].re = q[6].re - q[2].re;
+    k[2].im = q[6].im - q[2].im;
+    k[3].re = q[7].re - q[3].re;
+    k[3].im = q[7].im - q[3].im;
+
+    k[4].re = q[4].re + q[0].re;
+    k[4].im = q[4].im + q[0].im;
+    k[5].re = q[5].re + q[1].re;
+    k[5].im = q[5].im + q[1].im;
+    k[6].re = q[6].re + q[2].re;
+    k[6].im = q[6].im + q[2].im;
+    k[7].re = q[7].re + q[3].re;
+    k[7].im = q[7].im + q[3].im;
+
+    k[0].re = in[11].re - k[0].re;
+    k[0].im = in[11].im + k[0].im;
+    k[1].re = in[12].re - k[1].re;
+    k[1].im = in[12].im + k[1].im;
+    k[2].re = in[13].re - k[2].re;
+    k[2].im = in[13].im + k[2].im;
+    k[3].re = in[14].re - k[3].re;
+    k[3].im = in[14].im + k[3].im;
+
+    k[4].re = in[11].re - k[4].re;
+    k[4].im = in[11].im + k[4].im;
+    k[5].re = in[12].re - k[5].re;
+    k[5].im = in[12].im + k[5].im;
+    k[6].re = in[13].re - k[6].re;
+    k[6].im = in[13].im + k[6].im;
+    k[7].re = in[14].re - k[7].re;
+    k[7].im = in[14].im + k[7].im;
+
+    /* 15pt starts here */
+    t[0].re = y[3].re + y[0].re;
+    t[0].im = y[3].im + y[0].im;
+    t[1].re = y[2].re + y[1].re;
+    t[1].im = y[2].im + y[1].im;
+    t[2].re = y[1].re - y[2].re;
+    t[2].im = y[1].im - y[2].im;
+    t[3].re = y[0].re - y[3].re;
+    t[3].im = y[0].im - y[3].im;
+
+    t[4].re = k[3].re + k[0].re;
+    t[4].im = k[3].im + k[0].im;
+    t[5].re = k[2].re + k[1].re;
+    t[5].im = k[2].im + k[1].im;
+    t[6].re = k[1].re - k[2].re;
+    t[6].im = k[1].im - k[2].im;
+    t[7].re = k[0].re - k[3].re;
+    t[7].im = k[0].im - k[3].im;
+
+    t[ 8].re = k[7].re + k[4].re;
+    t[ 8].im = k[7].im + k[4].im;
+    t[ 9].re = k[6].re + k[5].re;
+    t[ 9].im = k[6].im + k[5].im;
+    t[10].re = k[5].re - k[6].re;
+    t[10].im = k[5].im - k[6].im;
+    t[11].re = k[4].re - k[7].re;
+    t[11].im = k[4].im - k[7].im;
+
+    out[ 0*stride].re = dc[0].re + t[0].re + t[ 1].re;
+    out[ 0*stride].im = dc[0].im + t[0].im + t[ 1].im;
+    out[10*stride].re = dc[1].re + t[4].re + t[ 5].re;
+    out[10*stride].im = dc[1].im + t[4].im + t[ 5].im;
+    out[ 5*stride].re = dc[2].re + t[8].re + t[ 9].re;
+    out[ 5*stride].im = dc[2].im + t[8].im + t[ 9].im;
+
+    r[0].re = t[0].re * tab[0];
+    r[0].im = t[0].im * tab[1];
+    r[1].re = t[1].re * tab[0];
+    r[1].im = t[1].im * tab[1];
+    r[2].re = t[2].re * tab[4];
+    r[2].im = t[2].im * tab[5];
+    r[3].re = t[3].re * tab[4];
+    r[3].im = t[3].im * tab[5];
+
+    r[4].re = t[4].re * tab[0];
+    r[4].im = t[4].im * tab[1];
+    r[5].re = t[5].re * tab[0];
+    r[5].im = t[5].im * tab[1];
+    r[6].re = t[6].re * tab[4];
+    r[6].im = t[6].im * tab[5];
+    r[7].re = t[7].re * tab[4];
+    r[7].im = t[7].im * tab[5];
+
+    r[ 8].re = t[ 8].re * tab[0];
+    r[ 8].im = t[ 8].im * tab[1];
+    r[ 9].re = t[ 9].re * tab[0];
+    r[ 9].im = t[ 9].im * tab[1];
+    r[10].re = t[10].re * tab[4];
+    r[10].im = t[10].im * tab[5];
+    r[11].re = t[11].re * tab[4];
+    r[11].im = t[11].im * tab[5];
+
+    t[0].re = t[0].re * tab[2];
+    t[0].im = t[0].im * tab[3];
+    t[1].re = t[1].re * tab[2];
+    t[1].im = t[1].im * tab[3];
+    t[2].re = t[2].re * tab[6];
+    t[2].im = t[2].im * tab[7];
+    t[3].re = t[3].re * tab[6];
+    t[3].im = t[3].im * tab[7];
+
+    t[4].re = t[4].re * tab[2];
+    t[4].im = t[4].im * tab[3];
+    t[5].re = t[5].re * tab[2];
+    t[5].im = t[5].im * tab[3];
+    t[6].re = t[6].re * tab[6];
+    t[6].im = t[6].im * tab[7];
+    t[7].re = t[7].re * tab[6];
+    t[7].im = t[7].im * tab[7];
+
+    t[ 8].re = t[ 8].re * tab[2];
+    t[ 8].im = t[ 8].im * tab[3];
+    t[ 9].re = t[ 9].re * tab[2];
+    t[ 9].im = t[ 9].im * tab[3];
+    t[10].re = t[10].re * tab[6];
+    t[10].im = t[10].im * tab[7];
+    t[11].re = t[11].re * tab[6];
+    t[11].im = t[11].im * tab[7];
+
+    r[0].re = r[0].re - t[1].re;
+    r[0].im = r[0].im - t[1].im;
+    r[1].re = r[1].re - t[0].re;
+    r[1].im = r[1].im - t[0].im;
+    r[2].re = r[2].re - t[3].re;
+    r[2].im = r[2].im - t[3].im;
+    r[3].re = r[3].re + t[2].re;
+    r[3].im = r[3].im + t[2].im;
+
+    r[4].re = r[4].re - t[5].re;
+    r[4].im = r[4].im - t[5].im;
+    r[5].re = r[5].re - t[4].re;
+    r[5].im = r[5].im - t[4].im;
+    r[6].re = r[6].re - t[7].re;
+    r[6].im = r[6].im - t[7].im;
+    r[7].re = r[7].re + t[6].re;
+    r[7].im = r[7].im + t[6].im;
+
+    r[ 8].re = r[ 8].re - t[ 9].re;
+    r[ 8].im = r[ 8].im - t[ 9].im;
+    r[ 9].re = r[ 9].re - t[ 8].re;
+    r[ 9].im = r[ 9].im - t[ 8].im;
+    r[10].re = r[10].re - t[11].re;
+    r[10].im = r[10].im - t[11].im;
+    r[11].re = r[11].re + t[10].re;
+    r[11].im = r[11].im + t[10].im;
+
+    z0[ 0].re = r[ 3].im + r[ 0].re;
+    z0[ 0].im = r[ 3].re + r[ 0].im;
+    z0[ 1].re = r[ 2].im + r[ 1].re;
+    z0[ 1].im = r[ 2].re + r[ 1].im;
+    z0[ 2].re = r[ 1].im - r[ 2].re;
+    z0[ 2].im = r[ 1].re - r[ 2].im;
+    z0[ 3].re = r[ 0].im - r[ 3].re;
+    z0[ 3].im = r[ 0].re - r[ 3].im;
+
+    z0[ 4].re = r[ 7].im + r[ 4].re;
+    z0[ 4].im = r[ 7].re + r[ 4].im;
+    z0[ 5].re = r[ 6].im + r[ 5].re;
+    z0[ 5].im = r[ 6].re + r[ 5].im;
+    z0[ 6].re = r[ 5].im - r[ 6].re;
+    z0[ 6].im = r[ 5].re - r[ 6].im;
+    z0[ 7].re = r[ 4].im - r[ 7].re;
+    z0[ 7].im = r[ 4].re - r[ 7].im;
+
+    z0[ 8].re = r[11].im + r[ 8].re;
+    z0[ 8].im = r[11].re + r[ 8].im;
+    z0[ 9].re = r[10].im + r[ 9].re;
+    z0[ 9].im = r[10].re + r[ 9].im;
+    z0[10].re = r[ 9].im - r[10].re;
+    z0[10].im = r[ 9].re - r[10].im;
+    z0[11].re = r[ 8].im - r[11].re;
+    z0[11].im = r[ 8].re - r[11].im;
+
+    out[ 6*stride].re = dc[0].re + z0[0].re;
+    out[ 6*stride].im = dc[0].im + z0[3].re;
+    out[12*stride].re = dc[0].re + z0[2].im;
+    out[12*stride].im = dc[0].im + z0[1].im;
+    out[ 3*stride].re = dc[0].re + z0[1].re;
+    out[ 3*stride].im = dc[0].im + z0[2].re;
+    out[ 9*stride].re = dc[0].re + z0[3].im;
+    out[ 9*stride].im = dc[0].im + z0[0].im;
+
+    out[ 1*stride].re = dc[1].re + z0[4].re;
+    out[ 1*stride].im = dc[1].im + z0[7].re;
+    out[ 7*stride].re = dc[1].re + z0[6].im;
+    out[ 7*stride].im = dc[1].im + z0[5].im;
+    out[13*stride].re = dc[1].re + z0[5].re;
+    out[13*stride].im = dc[1].im + z0[6].re;
+    out[ 4*stride].re = dc[1].re + z0[7].im;
+    out[ 4*stride].im = dc[1].im + z0[4].im;
+
+    out[11*stride].re = dc[2].re + z0[8].re;
+    out[11*stride].im = dc[2].im + z0[11].re;
+    out[ 2*stride].re = dc[2].re + z0[10].im;
+    out[ 2*stride].im = dc[2].im + z0[9].im;
+    out[ 8*stride].re = dc[2].re + z0[9].re;
+    out[ 8*stride].im = dc[2].im + z0[10].re;
+    out[14*stride].re = dc[2].re + z0[11].im;
+    out[14*stride].im = dc[2].im + z0[8].im;
+}
+```
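+
+The output index pattern above is the standard PFA/CRT recombination for
+15 = 3 x 5: the output of 3-point transform `a` (0..2) and 5-point transform
+`b` (0..4) lands at `out[(10*a + 6*b) % 15]`, which yields the bases
+`0, 10, 5` for the DC outputs and the offsets `6, 12, 3, 9` within each
+group of ACs. A small sketch to verify the index pattern:
+
+``` C
+#include <stdio.h>
+
+int main(void)
+{
+    for (int a = 0; a < 3; a++)     /* 3-point transform index */
+        for (int b = 0; b < 5; b++) /* 5-point transform index */
+            printf("out[%2d] <- (3pt %d, 5pt %d)\n", (10*a + 6*b) % 15, a, b);
+    return 0;
+}
+```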
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index 5e7159bd87..d139d142a7 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -48,9 +48,9 @@  SR_TABLE(65536);
 SR_TABLE(131072);
 
 /* Other factors' tables */
-TABLE_DEF(53, 8);
-TABLE_DEF( 7, 6);
-TABLE_DEF( 9, 8);
+TABLE_DEF(53, 12);
+TABLE_DEF( 7,  6);
+TABLE_DEF( 9,  8);
 
 typedef struct FFSRTabsInitOnce {
     void (*func)(void);
@@ -106,14 +106,21 @@  static FFSRTabsInitOnce sr_tabs_init_once[] = {
 
 static void TX_TAB(ff_tx_init_tab_53)(void)
 {
-    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12));
-    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12));
-    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI /  6));
-    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI /  6));
-    TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI /  5));
-    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI /  5));
-    TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10));
-    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI /  5));
+    /* 5pt, doubled to eliminate AVX lane shuffles */
+    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
+    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
+    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
+    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
+    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
+    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
+    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
+    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
+
+    /* 3pt */
+    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
+    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
+    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
+    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
 }
 
 static void TX_TAB(ff_tx_init_tab_7)(void)
@@ -189,19 +196,19 @@  static av_always_inline void fft3(TXComplex *out, TXComplex *in,
     out[0*stride].im = in[0].im + tmp[1].im;
 
 #ifdef TX_INT32
-    mtmp[0] = (int64_t)tab[0] * tmp[0].re;
-    mtmp[1] = (int64_t)tab[1] * tmp[0].im;
-    mtmp[2] = (int64_t)tab[2] * tmp[1].re;
-    mtmp[3] = (int64_t)tab[2] * tmp[1].im;
+    mtmp[0] = (int64_t)tab[ 8] * tmp[0].re;
+    mtmp[1] = (int64_t)tab[ 9] * tmp[0].im;
+    mtmp[2] = (int64_t)tab[10] * tmp[1].re;
+    mtmp[3] = (int64_t)tab[10] * tmp[1].im;
     out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
     out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
     out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
     out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
 #else
-    tmp[0].re = tab[0] * tmp[0].re;
-    tmp[0].im = tab[1] * tmp[0].im;
-    tmp[1].re = tab[2] * tmp[1].re;
-    tmp[1].im = tab[2] * tmp[1].im;
+    tmp[0].re = tab[ 8] * tmp[0].re;
+    tmp[0].im = tab[ 9] * tmp[0].im;
+    tmp[1].re = tab[10] * tmp[1].re;
+    tmp[1].im = tab[10] * tmp[1].im;
     out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
     out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
     out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re;
@@ -224,10 +231,10 @@  static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
     out[D0*stride].re = in[0].re + t[0].re + t[2].re;               \
     out[D0*stride].im = in[0].im + t[0].im + t[2].im;               \
                                                                     \
-    SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re);       \
-    SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im);       \
-    CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re);     \
-    CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im);     \
+    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
+    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
+    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
+    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
                                                                     \
     BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
     BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index b3a85a7cb9..a74fe2a460 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -52,6 +52,8 @@  cextern tab_ %+ i %+ _float ; ff_tab_i_float...
 %assign i (i << 1)
 %endrep
 
+cextern tab_53_float
+
 struc AVTXContext
     .len:          resd 1 ; Length
     .inv           resd 1 ; Inverse flag
@@ -88,6 +90,9 @@  s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  CO
 s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
 s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2
 
+s15_perm:      dd 0, 6, 5, 3, 2, 4, 7, 1
+
+mask_mmmmmmpp: dd NEG, NEG, NEG, NEG, NEG, NEG, POS, POS
 mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
 mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
 mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
@@ -1577,3 +1582,209 @@  cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
 %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 IMDCT_FN avx2
 %endif
+
+%if ARCH_X86_64
+INIT_YMM avx
+cglobal fft_pfa_15xM_float, 4, 14, 16, 272, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, tgt5, stride3, stride5, btmp
+    mov btmpq, outq                  ; backup original out
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+    mov outq, [ctxq + AVTXContext.tmp]
+    mov lutq, [ctxq + AVTXContext.map]
+
+    ; Load stride and 2nd transform LUT
+    mov tmpq, [ctxq + AVTXContext.sub]
+    movsxd strideq, dword [tmpq + AVTXContext.len]
+    mov mapq, [tmpq + AVTXContext.map]
+
+    shl strideq, 3
+    imul stride3q, strideq, 3
+    imul stride5q, strideq, 5
+
+    movaps m13, [mask_mmmmmmpp]      ; mmmmmmpp
+    vpermpd m12, m13, q0033          ; ppppmmmm
+    vextractf128 xm11, m13, 1        ; mmpp
+    movaps m10, [ff_tx_tab_53_float] ; tab5
+    movaps xm9, [ff_tx_tab_53_float + 32] ; tab3
+    movaps m8, [s15_perm]
+
+.dim1:
+    mov tmpd, [mapq]
+    lea tgtq, [outq + tmpq*8]
+
+    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim
+
+    shufps xm1, xm0, xm0, q3223      ; in[1].imrereim
+    shufps xm0, xm0, xm0, q1001      ; in[0].imrereim
+
+    xorps xm1, xm11
+    addps xm1, xm0                   ; pc[0,1].imre
+
+    mov tmpd, [lutq + 8]
+    movddup xm14, [inq + tmpq*8]     ; in[2].reimreim
+    shufps xm0, xm1, xm1, q3232      ; pc[1].reimreim
+    addps xm0, xm14                  ; dc[0].reimreim
+
+    mulps xm1, xm9                   ; tab[0123]*pc[01]
+
+    shufpd xm5, xm1, xm1, 01b        ; pc[1,0].reim
+    xorps xm1, xm11
+    addps xm1, xm1, xm5
+    addsubps xm1, xm14, xm1          ; dc[1,2].reim
+
+    LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m14, m15
+    LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
+
+    subps m7, m2, m3                 ; q[0-3].imre
+    addps m6, m2, m3                 ; q[4-7]
+    shufps m7, m7, m7, q2301         ; q[0-3].reim
+
+    LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m14, m15
+
+    addps m5, m4, m6                 ; y[0-3]
+
+    vpermpd m14, m9, q1111           ; tab[23232323]
+    vbroadcastsd m15, xm9            ; tab[01010101]
+
+    mulps m6, m14
+    mulps m7, m15
+
+    subps m2, m6, m7                 ; k[0-3]
+    addps m3, m6, m7                 ; k[4-7]
+
+    addsubps m6, m4, m2              ; k[0-3]
+    addsubps m7, m4, m3              ; k[4-7]
+
+    ; 15pt from here on
+    vpermpd m2, m5, q0123            ; y[3-0]
+    vpermpd m3, m6, q0123            ; k[3-0]
+    vpermpd m4, m7, q0123            ; k[7-4]
+
+    xorps m5, m12
+    xorps m6, m12
+    xorps m7, m12
+
+    addps m2, m5                     ; t[0-3]
+    addps m3, m6                     ; t[4-7]
+    addps m4, m7                     ; t[8-11]
+
+    movlhps xm14, xm2                ; out[0]
+    unpcklpd xm7, xm3, xm4           ; t[4],t[8] (for out[10,5])
+    unpckhpd xm5, xm3, xm4           ; t[5],t[9] (for out[10,5])
+
+    addps xm14, xm2                  ; out[0]
+    addps xm7, xm5                   ; out[10,5]
+    addps xm14, xm0                  ; out[0]
+    addps xm7, xm1                   ; out[10,5]
+
+    movhps [tgtq], xm14              ; out[0]
+    movhps [tgtq + stride5q*1], xm7  ; out[5]
+    movlps [tgtq + stride5q*2], xm7  ; out[10]
+
+    shufps m14, m10, m10, q3232      ; tab5 2 3 2 3 6 7 6 7
+    shufps m15, m10, m10, q1010      ; tab5 0 1 0 1 4 5 4 5
+
+    mulps m5, m2, m14                ; t[0-3]
+    mulps m6, m3, m14                ; t[4-7]
+    mulps m7, m4, m14                ; t[8-11]
+
+    mulps m2, m15                    ; r[0-3]
+    mulps m3, m15                    ; r[4-7]
+    mulps m4, m15                    ; r[8-11]
+
+    shufps m5, m5, m5, q1032         ; t[1,0,3,2].reim
+    shufps m6, m6, m6, q1032         ; t[5,4,7,6].reim
+    shufps m7, m7, m7, q1032         ; t[9,8,11,10].reim
+
+    lea tgt5q, [tgtq + stride5q]
+    lea tmpq,  [tgtq + stride5q*2]
+
+    xorps m5, m13
+    xorps m6, m13
+    xorps m7, m13
+
+    addps m2, m5                     ; r[0,1,2,3]
+    addps m3, m6                     ; r[4,5,6,7]
+    addps m4, m7                     ; r[8,9,10,11]
+
+    shufps m5, m2, m2, q2301
+    shufps m6, m3, m3, q2301
+    shufps m7, m4, m4, q2301
+
+    xorps m2, m12
+    xorps m3, m12
+    xorps m4, m12
+
+    vpermpd m5, m5, q0123
+    vpermpd m6, m6, q0123
+    vpermpd m7, m7, q0123
+
+    addps m5, m2
+    addps m6, m3
+    addps m7, m4
+
+    vpermps m5, m8, m5
+    vpermps m6, m8, m6
+    vpermps m7, m8, m7
+
+    vbroadcastsd m0, xm0             ; dc[0]
+    vpermpd m2, m1, q1111            ; dc[2]
+    vbroadcastsd m1, xm1             ; dc[1]
+
+    addps m0, m5
+    addps m1, m6
+    addps m2, m7
+
+    vextractf128 xm3, m0, 1
+    vextractf128 xm4, m1, 1
+    vextractf128 xm5, m2, 1
+
+    movlps [tgtq  + strideq*1],  xm1
+    movhps [tgtq  + strideq*2],  xm2
+    movlps [tgtq  + stride3q*1], xm3
+    movhps [tgtq  + strideq*4],  xm4
+    movlps [tgtq  + stride3q*2], xm0
+    movlps [tgtq  + strideq*8],  xm5
+    movhps [tgtq  + stride3q*4], xm0
+    movhps [tgt5q + strideq*2],  xm1
+    movhps [tgt5q + strideq*4],  xm3
+    movlps [tmpq  + strideq*1],  xm2
+    movlps [tmpq  + stride3q*1], xm4
+    movhps [tmpq  + strideq*4],  xm5
+
+    add lutq, (mmsize/2)*3 + 12
+    add mapq, 4
+    sub lenq, 15
+    jg .dim1
+
+    mov inq, outq
+
+    movsxd stride3q, dword [ctxq + AVTXContext.len]
+
+    mov stride5q, ctxq                 ; backup original context
+    mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point
+    mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context
+    mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map
+    movsxd lenq, dword [ctxq + AVTXContext.len]
+
+.dim2:
+    call tgt5q                         ; call the FFT
+    sub stride3q, lenq
+    jg .dim2
+
+    mov outq, btmpq
+    mov lutq, [stride5q + AVTXContext.map]
+    mov inq,  [stride5q + AVTXContext.tmp]
+    mov lenq, [stride5q + AVTXContext.len]
+    lea lutq, [lutq + lenq*4]
+
+.post:
+    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9
+    movaps [outq], m0
+
+    add outq, mmsize
+    add lutq, mmsize/2
+    sub lenq, mmsize/8
+    jg .post
+
+    RET
+%endif
diff --git a/libavutil/x86/tx_float_init.c b/libavutil/x86/tx_float_init.c
index 06df749fa9..46319f737e 100644
--- a/libavutil/x86/tx_float_init.c
+++ b/libavutil/x86/tx_float_init.c
@@ -43,6 +43,8 @@  TX_DECL_FN(fft_sr_ns, fma3)
 TX_DECL_FN(fft_sr,    avx2)
 TX_DECL_FN(fft_sr_ns, avx2)
 
+TX_DECL_FN(fft_pfa_15xM, avx)
+
 TX_DECL_FN(mdct_sr_inv, avx2)
 
 TX_DECL_FN(fft2_asm, sse3)
@@ -102,6 +104,59 @@  static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
     return 0;
 }
 
+static av_cold int fft_pfa_init(AVTXContext *s,
+                                const FFTXCodelet *cd,
+                                uint64_t flags,
+                                FFTXCodeletOptions *opts,
+                                int len, int inv,
+                                const void *scale)
+{
+    int ret;
+    int sub_len = len / cd->factors[0];
+    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };
+
+    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
+    flags |=  AV_TX_INPLACE;      /* in-place */
+    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
+    flags |=  FF_TX_ASM_CALL;     /* We want an assembly function, not C */
+
+    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
+                                sub_len, inv, scale)))
+        return ret;
+
+    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
+        return ret;
+
+    for (int k = 0; k < s->sub[0].len; k++) {
+        int cnt = 0;
+        int tmp[15];
+        memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
+        for (int i = 1; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 2; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        for (int i = 0; i < 15; i += 3) {
+            s->map[k*15 + cnt] = tmp[i];
+            cnt++;
+        }
+        memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
+        memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
+        s->map[k*15 + 1] = tmp[2];
+        s->map[k*15 + 2] = tmp[0];
+    }
+
+    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
+        return AVERROR(ENOMEM);
+
+    TX_TAB(ff_tx_init_tabs)(len / sub_len);
+
+    return 0;
+}
+
 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
     TX_DEF(fft2,     FFT,  2,  2, 2, 0, 128, NULL,  sse3, SSE3, AV_TX_INPLACE, 0),
     TX_DEF(fft2_asm, FFT,  2,  2, 2, 0, 192, b8_i0, sse3, SSE3,
@@ -160,6 +215,9 @@  const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
 
     TX_DEF(mdct_sr_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
            FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
+
+    TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx, AVX,
+           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
 #endif
 #endif
 
-- 
2.37.2.609.g9ff673ca1a