@@ -169,6 +169,9 @@ av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
case AV_TX_FLOAT_MDCT:
if ((err = ff_tx_init_mdct_fft_float(s, tx, type, inv, len, scale, flags)))
goto fail;
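+ /* ARCH_X86 is always defined to 0 or 1, so other arches dead-code eliminate this call */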
+ if (ARCH_X86)
+ ff_tx_init_float_x86(s, tx);
break;
case AV_TX_DOUBLE_FFT:
case AV_TX_DOUBLE_MDCT:
@@ -158,4 +158,6 @@ typedef struct CosTabsInitOnce {
AVOnce control;
} CosTabsInitOnce;
+void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx);
+
#endif /* AVUTIL_TX_PRIV_H */
@@ -3,6 +3,7 @@ OBJS += x86/cpu.o \
x86/float_dsp_init.o \
x86/imgutils_init.o \
x86/lls_init.o \
+ x86/tx_float_init.o \
OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \
@@ -14,5 +15,6 @@ X86ASM-OBJS += x86/cpuid.o \
x86/float_dsp.o \
x86/imgutils.o \
x86/lls.o \
+ x86/tx_float.o \
X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o \
new file mode 100644
@@ -0,0 +1,179 @@
+;******************************************************************************
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+
+struc AVTXContext
+ .n: resd 1 ; Non-power-of-two part
+ .m: resd 1 ; Power-of-two part
+ .inv: resd 1 ; Is inverse
+ .type: resd 1 ; Type
+ .flags: resq 1 ; Flags
+ .scale: resq 1 ; Scale
+
+ .exptab: pointer 1 ; MDCT exptab
+ .tmp: pointer 1 ; Temporary buffer needed for all compound transforms
+ .pfatab: pointer 1 ; Input/Output mapping for compound transforms
+ .revtab: pointer 1 ; Input mapping for power of two transforms
+ .inplace_idx: pointer 1 ; Required indices to revtab for in-place transforms
+endstruc
+
+SECTION_RODATA 32
+
+%define POS 0x00000000
+%define NEG 0x80000000
+%define M_SQRT1_2 0.707106781186547524401
+s8_mult_odd: dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
+
+s8_perm_even: dd 1, 3, 0, 2, 1, 3, 2, 0
+
+s8_perm_odd1: dd 3, 3, 1, 1, 1, 1, 3, 3
+s8_perm_odd2: dd 1, 2, 0, 3, 1, 0, 0, 1
+
+mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
+mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
+
+SECTION .text
+
+; 4-point in-place complex FFT (single transform with XMM registers, two transforms at once in AVX mode)
+; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
+; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
+; %3 - temporary
+; %4 - temporary
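+; For reference, the forward 4-point DFT computed per transform is:
+;   X0 = x0+x1+x2+x3            X2 = x0-x1+x2-x3
+;   X1 = (x0-x2) - i*(x1-x3)    X3 = (x0-x2) + i*(x1-x3)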
+%macro DUET_FFT4 4
+ subps %3, %1, %2 ; r1, r2, r3, r4, (r5, r6, r7, r8)
+ addps %1, %2 ; t1, t2, t3, t4, (t5, t6, t7, t8)
+ shufps %2, %3, %3, q1100 ; r1, r1, r2, r2, (r5, r5, r6, r6)
+ shufps %4, %1, %1, q3322 ; t3, t3, t4, t4, (t7, t7, t8, t8)
+ shufps %3, %3, q2233 ; r4, r4, r3, r3, (r8, r8, r7, r7)
+ shufps %1, %1, q1100 ; t1, t1, t2, t2, (t5, t5, t6, t6)
+ addsubps %2, %3 ; b3, b1, b4, b2, (b7, b5, b8, b6)
+ addsubps %1, %4 ; a3, a1, a4, a2, (a7, a5, a8, a6)
+ shufps %1, %1, q2031 ; a1, a2, a3, a4, (a5, a6, a7, a8)
+ shufps %2, %2, q3021 ; b1, b2, b3, b4, (b5, b6, b7, b8)
+%endmacro
+
+INIT_XMM sse3
+cglobal fft4, 4, 4, 4, ctx, out, in, stride
+ mova m0, [inq + 0*mmsize]
+ mova m1, [inq + 1*mmsize]
+
+ cmp dword [ctxq + AVTXContext.inv], 1
+ jl .s
+
+ shufps m2, m1, m0, q3210
+ shufps m0, m1, q3210
+ mova m1, m2
+
+.s: DUET_FFT4 m0, m1, m2, m3
+
+ unpcklpd m2, m0, m1
+ unpckhpd m0, m1
+
+ mova [outq + 0*mmsize], m2
+ mova [outq + 1*mmsize], m0
+
+ RET
+
+; Single 8-point in-place complex FFT
+; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
+; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
+; %3 - temporary
+; %4 - temporary
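+; For reference, this follows the usual split-radix step: a 4-point FFT over
+; the even inputs combined with two twiddled 2-point FFTs over the odd inputs;
+; the sqrt(1/2) factors in s8_mult_odd are the e^(+-i*pi/4) twiddles.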
+%macro SINGLET_FFT8_AVX 4
+ subps %3, %1, %2 ; r1, r2, r3, r4, r5, r6, r7, r8
+ addps %1, %2 ; q1, q2, q3, q4, q5, q6, q7, q8
+ vpermilps %2, %3, [s8_perm_odd1] ; r4, r4, r2, r2, r6, r6, r8, r8
+ shufps %4, %1, %1, q3322 ; q1, q1, q2, q2, q5, q5, q6, q6
+ movsldup %3, %3 ; r1, r1, r3, r3, r5, r5, r7, r7
+ shufps %1, %1, q1100 ; q3, q3, q4, q4, q7, q7, q8, q8
+ addsubps %3, %2 ; z1, z2, z3, z4, z5, z6, z7, z8
+ addsubps %1, %4 ; s3, s1, s4, s2, s7, s5, s8, s6
+ mulps %3, [s8_mult_odd] ; z * s8_mult_odd
+ vpermilps %1, [s8_perm_even] ; s1, s2, s3, s4, s5, s6, s8, s7
+ shufps %2, %3, %3, q2332 ; c, r, a, p, z7, z8, z8, z7
+ xorps %4, %1, [mask_mmmmpppm] ; e1, e2, e3, e4, e5, e6, e8, e7
+ vpermilps %3, %3, [s8_perm_odd2] ; z2, z3, z1, z4, z6, z5, z5, z6
+ vperm2f128 %1, %4, q0003 ; e5, e6, e8, e7, s1, s2, s3, s4
+ addsubps %2, %3 ; c, r, a, p, t5, t6, t7, t8
+ subps %1, %4 ; w1, w2, w3, w4, w5, w6, w7, w8
+ vperm2f128 %2, %2, q0101 ; t5, t6, t7, t8, t5, t6, t7, t8
+ vperm2f128 %3, %3, q0000 ; z2, z3, z1, z4, z2, z3, z1, z4
+ xorps %2, [mask_ppmpmmpm] ; t5, t6, -t7, t8, -t5, -t6, t7, -t8
+ addps %2, %3, %2 ; u1, u2, u3, u4, u5, u6, u7, u8
+%endmacro
+
+; Load complex values (64 bits) via a lookup table
+; %1 - output register
+; %2 - GPR of base input memory address
+; %3 - GPR of LUT (int32_t indices) address
+; %4 - temporary GPR or vector register
+; %5 - temporary xmm register number (only needed when mmsize == 32)
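+; e.g. the call "LOADZ_LUT 0, inq, strideq, tmp, 3" below loads in[lut[0..1]]
+; into xm0 and, when mmsize == 32, in[lut[2..3]] into the upper lane of m0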
+%macro LOADZ_LUT 4-5
+%if 0 ; disabled vgatherdpd path
+ mova xm%5, [%3]                    ; indices must be loaded before the gather
+ pcmpeqb m%4, m%4                   ; all-ones mask (vgatherdpd zeroes it)
+ vgatherdpd m%1, [%2 + xm%5*8], m%4
+%else
+ mov %4d, [%3 + 0]
+ movsd xm%1, [%2 + %4q*8]
+ mov %4d, [%3 + 4]
+ movhps xm%1, [%2 + %4q*8]
+%if mmsize == 32
+ mov %4d, [%3 + 8]
+ movsd xm%5, [%2 + %4q*8]
+ mov %4d, [%3 + 12]
+ movhps xm%5, [%2 + %4q*8]
+ vinsertf128 m%1, m%1, xm%5, 1
+%endif ; mmsize
+%endif ; vgather
+%endmacro
+
+INIT_YMM avx
+cglobal fft8, 4, 5, 4, ctx, out, in, stride, tmp
+ mov strideq, [ctxq + AVTXContext.revtab]
+
+ LOADZ_LUT 0, inq, strideq + 0, tmp, 3
+ LOADZ_LUT 1, inq, strideq + 16, tmp, 3
+
+ SINGLET_FFT8_AVX m0, m1, m2, m3
+
+ unpcklpd m2, m0, m1
+ unpckhpd m0, m1
+
+ vperm2f128 m1, m2, m0, q0301
+ vperm2f128 m0, m2, q0002
+
+ mova [outq + 0*mmsize], m0
+ mova [outq + 1*mmsize], m1
+
+ RET
new file mode 100644
@@ -0,0 +1,69 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define TX_FLOAT
+#include "libavutil/tx_priv.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_fft4_sse3(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+void ff_fft8_avx(AVTXContext *s, void *out, void *in, ptrdiff_t stride);
+
+/* Reorders the coefficients of the bottommost transforms such that even
+ * coefficients appear first in the buffer while odd ones appear last.
+ * Saves on a lot of intra-lane shuffles.
+ * If the 16-point transform is rewritten to be monolithic (instead of 8x4x4)
+ * update the len check to cover it. */
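+/* For example, a single 8-point transform (n == 1, m == 8) hits the base case
+ * immediately: even-indexed coefficients are assigned buffer slots 0..3 and
+ * odd-indexed ones slots 4..7, which is the layout ff_fft8_avx expects. */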
+static void revtab_avx(int *revtab, int n, int inv, int offs, int len)
+{
+ len >>= 1;
+ if (len <= (8 >> 1)) {
+ for (int j = 0; j < len; j++) {
+ int k1 = -split_radix_permutation(offs + j*2 + 0, n, inv) & (n - 1);
+ int k2 = -split_radix_permutation(offs + j*2 + 1, n, inv) & (n - 1);
+ revtab[k1] = offs + j;
+ revtab[k2] = offs + j + len;
+ }
+ return;
+ }
+ revtab_avx(revtab, n, inv, offs , len >> 0);
+ revtab_avx(revtab, n, inv, offs + (len >> 0), len >> 1);
+ revtab_avx(revtab, n, inv, offs + (len >> 0) + (len >> 1), len >> 1);
+}
+
+av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_AVX_FAST(cpu_flags)) {
+ if (s->n == 1 && s->m == 8) {
+ revtab_avx(s->revtab, s->m, s->inv, 0, s->m);
+ *tx = ff_fft8_avx;
+ return;
+ }
+ }
+
+ if (EXTERNAL_SSE3(cpu_flags)) {
+ if (s->n == 1 && s->m == 4) {
+ *tx = ff_fft4_sse3;
+ return;
+ }
+ }
+}
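
---

For local testing, here is a minimal caller sketch (not part of the patch; it
assumes only the public libavutil/tx.h API). Buffers come from av_malloc() so
they satisfy the 32-byte alignment the aligned mova stores require; len == 8
with inv == 0 takes the new AVX path on capable CPUs, len == 4 the SSE3 one:

    #include <stdio.h>
    #include "libavutil/mem.h"
    #include "libavutil/tx.h"

    int main(void)
    {
        AVTXContext *ctx = NULL;
        av_tx_fn tx;
        float scale = 1.0f;
        AVComplexFloat *in  = av_malloc(8 * sizeof(*in));
        AVComplexFloat *out = av_malloc(8 * sizeof(*out));

        if (!in || !out ||
            av_tx_init(&ctx, &tx, AV_TX_FLOAT_FFT, 0, 8, &scale, 0) < 0)
            return 1;

        for (int i = 0; i < 8; i++)
            in[i] = (AVComplexFloat){ i + 1.0f, 0.0f };

        tx(ctx, out, in, sizeof(AVComplexFloat));

        for (int i = 0; i < 8; i++)
            printf("%9.5f %+9.5fi\n", out[i].re, out[i].im);

        av_tx_uninit(&ctx);
        av_free(in);
        av_free(out);
        return 0;
    }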