diff mbox series

[FFmpeg-devel] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation

Message ID 20210929011805.98907-1-mindmark@gmail.com
State New
Headers show
Series [FFmpeg-devel] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Mark Reid Sept. 29, 2021, 1:18 a.m. UTC
From: Mark Reid <mindmark@gmail.com>

Only supports float and 16-bit planar formats at the moment.
Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some
speed gains.

f32 1920x1080 1 thread with prelut
c impl
1389936500 UNITS in lut3d->interp,       1 runs,      0 skips
1425800240 UNITS in lut3d->interp,       2 runs,      0 skips
1433312777 UNITS in lut3d->interp,       4 runs,      0 skips
1443346798 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
948662320 UNITS in lut3d->interp,       1 runs,      0 skips
1101247540 UNITS in lut3d->interp,       2 runs,      0 skips
1050645695 UNITS in lut3d->interp,       4 runs,      0 skips
1041102937 UNITS in lut3d->interp,       8 runs,      0 skips

avx
633837000 UNITS in lut3d->interp,       1 runs,      0 skips
669452850 UNITS in lut3d->interp,       2 runs,      0 skips
650716580 UNITS in lut3d->interp,       4 runs,      0 skips
644698550 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
354940020 UNITS in lut3d->interp,       1 runs,      0 skips
362384340 UNITS in lut3d->interp,       2 runs,      0 skips
356799020 UNITS in lut3d->interp,       4 runs,      0 skips
357276815 UNITS in lut3d->interp,       8 runs,      0 skips

gbrap16 1920x1080 1 thread with prelut
c impl
1445071160 UNITS in lut3d->interp,       1 runs,      0 skips
1477959120 UNITS in lut3d->interp,       2 runs,      0 skips
1472102670 UNITS in lut3d->interp,       4 runs,      0 skips
1462579330 UNITS in lut3d->interp,       8 runs,      0 skips

sse2
1035437580 UNITS in lut3d->interp,       1 runs,      0 skips
1050139710 UNITS in lut3d->interp,       2 runs,      0 skips
1070147205 UNITS in lut3d->interp,       4 runs,      0 skips
1064583037 UNITS in lut3d->interp,       8 runs,      0 skips

avx
678089880 UNITS in lut3d->interp,       1 runs,      0 skips
679112485 UNITS in lut3d->interp,       2 runs,      0 skips
695527212 UNITS in lut3d->interp,       4 runs,      0 skips
691300053 UNITS in lut3d->interp,       8 runs,      0 skips

avx2
372671340 UNITS in lut3d->interp,       1 runs,      0 skips
373449870 UNITS in lut3d->interp,       2 runs,      0 skips
383725625 UNITS in lut3d->interp,       4 runs,      0 skips
382860848 UNITS in lut3d->interp,       8 runs,      0 skips

---
 libavfilter/lut3d.h             |  83 ++++
 libavfilter/vf_lut3d.c          |  61 +--
 libavfilter/x86/Makefile        |   2 +
 libavfilter/x86/vf_lut3d.asm    | 757 ++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_lut3d_init.c |  88 ++++
 5 files changed, 935 insertions(+), 56 deletions(-)
 create mode 100644 libavfilter/lut3d.h
 create mode 100644 libavfilter/x86/vf_lut3d.asm
 create mode 100644 libavfilter/x86/vf_lut3d_init.c

--
2.31.1.windows.1

Comments

chen Sept. 29, 2021, 1:37 a.m. UTC | #1
Hello,


Excuse me, how about using FMA (fused multiply-add) instructions on AVX2 platforms?


For example
+    mulps m7, m7, m14
+    addps m0, m0, m7

==>


fmadd231ps m0,m7,m14


Regards,
Min Chen


 2021-09-29 09:18:05,mindmark@gmail.com 
>From: Mark Reid <mindmark@gmail.com>
>
>Only supports float and 16-bit planar formats at the moment.
>Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some
>speed gains.
>
>f32 1920x1080 1 thread with prelut
>c impl
>1389936500 UNITS in lut3d->interp,       1 runs,      0 skips
>1425800240 UNITS in lut3d->interp,       2 runs,      0 skips
>1433312777 UNITS in lut3d->interp,       4 runs,      0 skips
>1443346798 UNITS in lut3d->interp,       8 runs,      0 skips
>
>sse2
>948662320 UNITS in lut3d->interp,       1 runs,      0 skips
>1101247540 UNITS in lut3d->interp,       2 runs,      0 skips
>1050645695 UNITS in lut3d->interp,       4 runs,      0 skips
>1041102937 UNITS in lut3d->interp,       8 runs,      0 skips
>
>avx
>633837000 UNITS in lut3d->interp,       1 runs,      0 skips
>669452850 UNITS in lut3d->interp,       2 runs,      0 skips
>650716580 UNITS in lut3d->interp,       4 runs,      0 skips
>644698550 UNITS in lut3d->interp,       8 runs,      0 skips
>
>avx2
>354940020 UNITS in lut3d->interp,       1 runs,      0 skips
>362384340 UNITS in lut3d->interp,       2 runs,      0 skips
>356799020 UNITS in lut3d->interp,       4 runs,      0 skips
>357276815 UNITS in lut3d->interp,       8 runs,      0 skips
>
>gbrap16 1920x1080 1 thread with prelut
>c impl
>1445071160 UNITS in lut3d->interp,       1 runs,      0 skips
>1477959120 UNITS in lut3d->interp,       2 runs,      0 skips
>1472102670 UNITS in lut3d->interp,       4 runs,      0 skips
>1462579330 UNITS in lut3d->interp,       8 runs,      0 skips
>
>sse2
>1035437580 UNITS in lut3d->interp,       1 runs,      0 skips
>1050139710 UNITS in lut3d->interp,       2 runs,      0 skips
>1070147205 UNITS in lut3d->interp,       4 runs,      0 skips
>1064583037 UNITS in lut3d->interp,       8 runs,      0 skips
>
>avx
>678089880 UNITS in lut3d->interp,       1 runs,      0 skips
>679112485 UNITS in lut3d->interp,       2 runs,      0 skips
>695527212 UNITS in lut3d->interp,       4 runs,      0 skips
>691300053 UNITS in lut3d->interp,       8 runs,      0 skips
>
>avx2
>372671340 UNITS in lut3d->interp,       1 runs,      0 skips
>373449870 UNITS in lut3d->interp,       2 runs,      0 skips
>383725625 UNITS in lut3d->interp,       4 runs,      0 skips
>382860848 UNITS in lut3d->interp,       8 runs,      0 skips
>
>---
> libavfilter/lut3d.h             |  83 ++++
> libavfilter/vf_lut3d.c          |  61 +--
> libavfilter/x86/Makefile        |   2 +
> libavfilter/x86/vf_lut3d.asm    | 757 ++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_lut3d_init.c |  88 ++++
> 5 files changed, 935 insertions(+), 56 deletions(-)
> create mode 100644 libavfilter/lut3d.h
> create mode 100644 libavfilter/x86/vf_lut3d.asm
> create mode 100644 libavfilter/x86/vf_lut3d_init.c
>
>diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
>new file mode 100644
>index 0000000000..ded2a036a5
>--- /dev/null
>+++ b/libavfilter/lut3d.h
>@@ -0,0 +1,83 @@
>+/*
>+ * Copyright (c) 2013 Clément Bœsch
>+ * Copyright (c) 2018 Paul B Mahol
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+#ifndef AVFILTER_LUT3D_H
>+#define AVFILTER_LUT3D_H
>+
>+#include "libavutil/pixdesc.h"
>+#include "framesync.h"
>+#include "avfilter.h"
>+
>+enum interp_mode { /* interpolation methods; the selected one is stored in LUT3DContext.interpolation */
>+    INTERPOLATE_NEAREST,
>+    INTERPOLATE_TRILINEAR,
>+    INTERPOLATE_TETRAHEDRAL,
>+    INTERPOLATE_PYRAMID,
>+    INTERPOLATE_PRISM,
>+    NB_INTERP_MODE /* count of the modes above, not a mode itself */
>+};
>+
>+struct rgbvec { /* one RGB triple stored as three consecutive floats */
>+    float r, g, b;
>+};
>+
>+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
>+ * of 512x512 (64x64x64) */
>+#define MAX_LEVEL 256
>+#define PRELUT_SIZE 65536
>+
>+typedef struct Lut3DPreLut { /* per-channel 1D pre-LUT; layout is mirrored by "struc Lut3DPreLut" in x86/vf_lut3d.asm, keep both in sync */
>+    int size;       /* the x86 asm uses size - 1 as the highest valid table index */
>+    float min[3];   /* per channel: subtracted from the input value before scaling */
>+    float max[3];
>+    float scale[3]; /* per channel: multiplies (value - min) to obtain a table position */
>+    float* lut[3];  /* one table of floats per channel */
>+} Lut3DPreLut;
>+
>+typedef struct LUT3DContext { /* filter state; class..scale are read by x86/vf_lut3d.asm through "struc LUT3DContext", so their order must not change */
>+    const AVClass *class;
>+    struct rgbvec *lut;         /* 3D LUT entries, one rgbvec (3 floats) each */
>+    int lutsize;
>+    int lutsize2;               /* NOTE(review): presumably lutsize * lutsize -- confirm where it is assigned */
>+    struct rgbvec scale;
>+    int interpolation;          ///<interp_mode
>+    char *file;
>+    uint8_t rgba_map[4];
>+    int step;
>+    avfilter_action_func *interp; /* per-slice worker; NOTE(review): presumably overridden by ff_lut3d_init_x86() -- confirm in vf_lut3d_init.c */
>+    Lut3DPreLut prelut;
>+#if CONFIG_HALDCLUT_FILTER
>+    uint8_t clut_rgba_map[4];
>+    int clut_step;
>+    int clut_bits;
>+    int clut_planar;
>+    int clut_float;
>+    int clut_width;
>+    FFFrameSync fs;
>+#endif
>+} LUT3DContext;
>+
>+typedef struct ThreadData { /* pair of frames: source (in) and destination (out) */
>+    AVFrame *in, *out;
>+} ThreadData;
>+
>+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
>+
>+#endif /* AVFILTER_LUT3D_H */
>\ No newline at end of file
>diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
>index 9fbda833b9..1fd0af06db 100644
>--- a/libavfilter/vf_lut3d.c
>+++ b/libavfilter/vf_lut3d.c
>@@ -31,73 +31,18 @@
> #include "libavutil/intreadwrite.h"
> #include "libavutil/intfloat.h"
> #include "libavutil/avassert.h"
>-#include "libavutil/pixdesc.h"
> #include "libavutil/avstring.h"
>-#include "avfilter.h"
> #include "drawutils.h"
> #include "formats.h"
>-#include "framesync.h"
> #include "internal.h"
> #include "video.h"
>+#include "lut3d.h"
>
> #define R 0
> #define G 1
> #define B 2
> #define A 3
>
>-enum interp_mode {
>-    INTERPOLATE_NEAREST,
>-    INTERPOLATE_TRILINEAR,
>-    INTERPOLATE_TETRAHEDRAL,
>-    INTERPOLATE_PYRAMID,
>-    INTERPOLATE_PRISM,
>-    NB_INTERP_MODE
>-};
>-
>-struct rgbvec {
>-    float r, g, b;
>-};
>-
>-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
>- * of 512x512 (64x64x64) */
>-#define MAX_LEVEL 256
>-#define PRELUT_SIZE 65536
>-
>-typedef struct Lut3DPreLut {
>-    int size;
>-    float min[3];
>-    float max[3];
>-    float scale[3];
>-    float* lut[3];
>-} Lut3DPreLut;
>-
>-typedef struct LUT3DContext {
>-    const AVClass *class;
>-    int interpolation;          ///<interp_mode
>-    char *file;
>-    uint8_t rgba_map[4];
>-    int step;
>-    avfilter_action_func *interp;
>-    struct rgbvec scale;
>-    struct rgbvec *lut;
>-    int lutsize;
>-    int lutsize2;
>-    Lut3DPreLut prelut;
>-#if CONFIG_HALDCLUT_FILTER
>-    uint8_t clut_rgba_map[4];
>-    int clut_step;
>-    int clut_bits;
>-    int clut_planar;
>-    int clut_float;
>-    int clut_width;
>-    FFFrameSync fs;
>-#endif
>-} LUT3DContext;
>-
>-typedef struct ThreadData {
>-    AVFrame *in, *out;
>-} ThreadData;
>-
> #define OFFSET(x) offsetof(LUT3DContext, x)
> #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
> #define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
>@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink)
>         av_assert0(0);
>     }
>
>+    if (ARCH_X86) {
>+        ff_lut3d_init_x86(lut3d, desc);
>+    }
>+
>     return 0;
> }
>
>diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>index 016a5b3511..a29941eaeb 100644
>--- a/libavfilter/x86/Makefile
>+++ b/libavfilter/x86/Makefile
>@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
> OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
> OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
> OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
>+OBJS-$(CONFIG_LUT3D_FILTER)                  += x86/vf_lut3d_init.o
> OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += x86/vf_maskedclamp_init.o
> OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
> OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
>@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
> X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
> X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
> X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
>+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER)           += x86/vf_lut3d.o
> X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o
> X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
> X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
>diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
>new file mode 100644
>index 0000000000..b3d7c3962b
>--- /dev/null
>+++ b/libavfilter/x86/vf_lut3d.asm
>@@ -0,0 +1,757 @@
>+;*****************************************************************************
>+;* x86-optimized functions for lut3d filter
>+;*
>+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
>+;*
>+;* This file is part of FFmpeg.
>+;*
>+;* FFmpeg is free software; you can redistribute it and/or
>+;* modify it under the terms of the GNU Lesser General Public
>+;* License as published by the Free Software Foundation; either
>+;* version 2.1 of the License, or (at your option) any later version.
>+;*
>+;* FFmpeg is distributed in the hope that it will be useful,
>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+;* Lesser General Public License for more details.
>+;*
>+;* You should have received a copy of the GNU Lesser General Public
>+;* License along with FFmpeg; if not, write to the Free Software
>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+;******************************************************************************
>+
>+%include "libavutil/x86/x86util.asm"
>+
>+SECTION_RODATA
>+pd_1f:  times 8 dd 1.0
>+pd_3f:  times 8 dd 3.0
>+
>+; used to limit rshifts as they are more expensive in avx1
>+pd_001: times 8 dd 001b
>+pd_010: times 8 dd 010b
>+pd_100: times 8 dd 100b
>+
>+pd_65535f:     times 8 dd 65535.0
>+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
>+
>+pb_shuffle16:         db    0,    1, 0x80, 0x80, \
>+                            2,    3, 0x80, 0x80, \
>+                            4,    5, 0x80, 0x80, \
>+                            6,    7, 0x80, 0x80
>+
>+pb_lo_pack_shuffle16: db    0,    1,    4,    5, \
>+                            8,    9,   12,   13, \
>+                         0x80, 0x80, 0x80, 0x80, \
>+                         0x80, 0x80, 0x80, 0x80
>+
>+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
>+                         0x80, 0x80, 0x80, 0x80, \
>+                            0,    1,    4,    5, \
>+                            8,    9,   12,   13
>+
>+; tetrahedral table --------------------------------------------
>+; name:          x2|        x1|        x0|       cxxb|     cxxa
>+; values:      r 00|     r  00|     r  00|   c011 011| c001 001
>+;              g 01|     g  01|     g  01|   c101 101| c010 010
>+;              b 10|     b  10|     b  10|   c110 110| c100 100
>+
>+; g>b                                 b |          g |          r |        c110 | c100
>+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b
>+; r>b                                 g |          b |          r |        c101 | c100
>+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b
>+; else                                g |          r |          b |        c101 | c001
>+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b
>+; b>g                                 r |          g |          b |        c011 | c001
>+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b
>+; b>r                                 r |          b |          g |        c011 | c010
>+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b
>+; else                                b |          r |          g |        c110 | c010
>+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b
>+
>+SECTION .text
>+
>+struc Lut3DPreLut ; asm-side layout of the C Lut3DPreLut struct (lut3d.h); order and sizes must stay in sync
>+    .size:    resd 1 ; int size
>+    .min:     resd 3 ; float min[3]
>+    .max:     resd 3 ; float max[3]
>+    .scale:   resd 3 ; float scale[3]
>+    .lut:     resq 3 ; float *lut[3], 8-byte pointers
>+endstruc
>+
>+struc LUT3DContext ; only the leading fields of the C LUT3DContext (lut3d.h); order must match the C declaration
>+    .class:        resq 1 ; const AVClass *class
>+    .lut:          resq 1 ; struct rgbvec *lut
>+    .lutsize:      resd 1 ; int lutsize
>+    .lutsize2:     resd 1 ; int lutsize2
>+    .scale:        resd 3 ; struct rgbvec scale (r, g, b floats)
>+endstruc
>+
>+%define AV_NUM_DATA_POINTERS 8
>+
>+struc AVFrame ; NOTE(review): presumably mirrors the first fields of libavutil's AVFrame -- verify offsets against frame.h
>+    .data:          resq AV_NUM_DATA_POINTERS ; plane pointers, 8 bytes each
>+    .linesize:      resd AV_NUM_DATA_POINTERS ; per-plane line sizes, 4 bytes each
>+    .extended_data: resq 1
>+    .width:         resd 1
>+    .height:        resd 1
>+endstruc
>+
>+%define rm   rsp
>+%define gm   rsp+mmsize
>+%define bm   rsp+(mmsize*2)
>+
>+%define lut3dsizem  [rsp+mmsize*3]
>+%define lut3dsize2m [rsp+mmsize*4]
>+%define lut3dmaxm   [rsp+mmsize*5]
>+%define prelutmaxm  [rsp+mmsize*6]
>+
>+%define scalerm [rsp+mmsize*7]
>+%define scalegm [rsp+mmsize*8]
>+%define scalebm [rsp+mmsize*9]
>+
>+%define prelutminrm [rsp+mmsize*10]
>+%define prelutmingm [rsp+mmsize*11]
>+%define prelutminbm [rsp+mmsize*12]
>+
>+%define prelutscalerm [rsp+mmsize*13]
>+%define prelutscalegm [rsp+mmsize*14]
>+%define prelutscalebm [rsp+mmsize*15]
>+
>+; data pointers
>+%define srcrm [rsp+mmsize*16 +  0]
>+%define srcgm [rsp+mmsize*16 +  8]
>+%define srcbm [rsp+mmsize*16 + 16]
>+%define srcam [rsp+mmsize*16 + 24]
>+
>+%define dstrm [rsp+mmsize*16 + 32]
>+%define dstgm [rsp+mmsize*16 + 40]
>+%define dstbm [rsp+mmsize*16 + 48]
>+%define dstam [rsp+mmsize*16 + 56]
>+
>+%macro FETCH_PRELUT_PN 3 ; scalar table lookup for one lane: %1/%2 = scratch xmm numbers, %3 = byte offset of the lane
>+    mov tmp2d, [rm + %3] ; index previously spilled to the rm stack slot
>+    mov tmp3d, [gm + %3] ; index previously spilled to the gm stack slot
>+    movss xm%1, [tmpq + tmp2q*4] ; tmpq = table base, 4 bytes per entry
>+    movss xm%2, [tmpq + tmp3q*4]
>+    movss [rm + %3], xm%1 ; overwrite the index with the value that was fetched
>+    movss [gm + %3], xm%2
>+%endmacro
>+
>+; 1 - p (output: table values at the p indices)
>+; 2 - n (output: table values at the n indices)
>+; 3 - p indices
>+; 4 - n indices
>+%macro GATHER_PRELUT 4 ; table base is taken from tmpq
>+    %if cpuflag(avx2)
>+        vpcmpeqb m7, m7 ; all-ones mask = gather every lane; the gather zeroes its mask, so a fresh one is built each time
>+        vgatherdps m%1, [tmpq + m%3*4], m7 ; p
>+        vpcmpeqb m9, m9
>+        vgatherdps m%2, [tmpq + m%4*4], m9 ; n
>+    %else
>+        mova [rm], m%3 ; no gather instruction: spill the indices to the rm/gm stack slots
>+        mova [gm], m%4
>+        FETCH_PRELUT_PN %1, %2, 0 ; one scalar fetch per lane, each replaces its index in place
>+        FETCH_PRELUT_PN %1, %2, 4
>+        FETCH_PRELUT_PN %1, %2, 8
>+        FETCH_PRELUT_PN %1, %2, 12
>+    %if mmsize > 16
>+        FETCH_PRELUT_PN %1, %2, 16
>+        FETCH_PRELUT_PN %1, %2, 20
>+        FETCH_PRELUT_PN %1, %2, 24
>+        FETCH_PRELUT_PN %1, %2, 28
>+    %endif
>+        movu m%1, [rm] ; reload the slots, which now hold the fetched values
>+        movu m%2, [gm]
>+    %endif
>+%endmacro
>+
>+%macro FLOORPS 2 ; %1 = floor(%2) per float lane
>+    %if mmsize > 16
>+        vroundps %1, %2, 0x01 ; rounding mode 01b = toward negative infinity
>+    %else
>+        cvttps2dq %1, %2 ; truncates toward zero: equals floor only for non-negative input (callers clamp to >= 0 first)
>+        cvtdq2ps  %1, %1
>+    %endif
>+%endmacro
>+
>+; 1 - dst (register number; holds the input value on entry, the pre-LUT result on exit)
>+; 2 - index (channel number, selects Lut3DPreLut.lut[index])
>+; 3 - min
>+; 4 - scale
>+; assumes lut max m13, m14 1.0f, zero m15; clobbers m3-m10, tmp, tmp2, tmp3
>+%macro APPLY_PRELUT 4
>+    ; scale
>+    subps m5, m%1, %3 ; v - min
>+    mulps m5, m5, %4  ; v * scale
>+    ; clamp
>+    maxps m5, m5, m15 ; max zero
>+    minps m5, m5, m13 ; min lut max
>+
>+    FLOORPS m3, m5    ; prev index
>+    subps m5, m5, m3  ; d
>+    addps m4, m3, m14 ; p+1 = n index
>+    minps m4, m4, m13 ; clamp n index
>+
>+    mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] ; table pointer for this channel
>+    cvttps2dq m6, m3 ; p indices as integers
>+    cvttps2dq m10, m4 ; n indices as integers
>+    GATHER_PRELUT 3, 4, 6, 10
>+
>+    ; lerp
>+    subps m8, m4, m3 ; n - p
>+    mulps m8, m8, m5 ; (n - p) * d
>+    addps m%1, m8, m3 ; p + (n - p) * d
>+%endmacro
>+
>+; 1 - dst (register number, scaled and clamped in place)
>+; 2 - scale
>+; assumes lut max m13, zero m15
>+%macro APPLY_SCALE 2
>+   mulps m%1, m%1, %2
>+   maxps m%1, m%1, m15 ; clamp to >= 0
>+   minps m%1, m%1, m13 ; clamp to <= lut max
>+%endmacro
>+
>+%macro BLEND 4 ; %1 = per lane: %3 where mask %4 is set, else %2
>+%if mmsize > 16
>+    vblendvps %1, %2, %3, %4 ; selection uses the sign bit of each mask lane
>+%else
>+    %ifidni %1,%2
>+        %error operand 1 must not equal operand 2
>+    %endif
>+    %ifidni %1,%3
>+        %error operand 1 must not equal operand 3
>+    %endif
>+    mova  %1, %2 ; bitwise select ((%2 ^ %3) & %4) ^ %2: needs all-ones/all-zeros mask lanes
>+    xorps %1, %3
>+    andps %1, %4
>+    xorps %1, %2
>+%endif
>+%endmacro
>+
>+; sets nans to zero, +inf -inf handled later by min/max clamps; assumes zero m15, clobbers m5 (and m6 when mmsize <= 16)
>+%macro SANITIZE_F 1
>+    cmpps m5, %1, %1, 0x0 ; nan == nan = False
>+    %if mmsize <= 16
>+        mova m6, %1 ; copy first: the non-AVX BLEND does not allow dst to equal a source
>+        BLEND %1, m15, m6, m5
>+    %else
>+        BLEND %1, m15, %1, m5 ; keep the value where the mask is set (not NaN), else take zero
>+    %endif
>+%endmacro
>+
>+%macro ADD3 4 ; %1 = %2 + %3 + %4 (float lanes)
>+    addps %1, %2, %3
>+    addps %1, %1, %4
>+%endmacro
>+
>+%macro CMP_EQUAL 3 ; %1 = per-lane mask, all ones where %2 == %3
>+%if cpuflag(avx2)
>+    vpcmpeqd %1, %2, %3 ; 32-bit integer compare
>+%elif cpuflag(avx)
>+    cmpps %1, %2, %3, 0x0 ; float EQ compare on the same bits; NOTE(review): callers pass small integer bit patterns, which read as denormal floats -- confirm this is safe when DAZ is enabled
>+%else
>+    pcmpeqd %1, %2, %3 ; 32-bit integer compare
>+%endif
>+%endmacro
>+
>+%macro SHIFT_RIGHT 2 ; logical right shift of the 32-bit lanes of register number %1 by %2 bits, in place
>+%if mmsize <= 16
>+    psrld xm%1, %2
>+%elif cpuflag(avx2)
>+    vpsrld m%1, m%1, %2
>+%else
>+    vextractf128 xm15, m%1, 1 ; ymm without AVX2: shift each 128-bit half separately; clobbers m15
>+    psrld xm%1, %2
>+    psrld xm15, %2
>+    vinsertf128 m%1, m%1, xm15, 1
>+%endif
>+%endmacro
>+
>+%macro FETCH_LUT3D_RGB 4 ; scalar 3D LUT lookup for one lane: %1-%3 = scratch xmm numbers, %4 = byte offset of the lane
>+    mov tmp2d, [rm + %4] ; index previously spilled to the rm stack slot
>+    movss xm%1, [tmpq + tmp2q*4 + 0] ; three consecutive floats at tmpq + index*4
>+    movss xm%2, [tmpq + tmp2q*4 + 4]
>+    movss xm%3, [tmpq + tmp2q*4 + 8]
>+    movss [rm + %4], xm%1 ; results go to the rm/gm/bm slots; rm's index is overwritten
>+    movss [gm + %4], xm%2
>+    movss [bm + %4], xm%3
>+%endmacro
>+
>+; 1 - dstr
>+; 2 - dstg
>+; 3 - dstb
>+; 4 - indices (integer offsets in floats from the table base in tmpq)
>+%macro GATHER_LUT3D_INDICES 4 ; avx2 path clobbers m3, m14, m15; fallback clobbers tmp2 and the rm/gm/bm stack slots
>+%if cpuflag(avx2)
>+    vpcmpeqb m3, m3 ; all-ones mask = gather every lane; the gather zeroes its mask, so a fresh one is built each time
>+    vgatherdps m%1, [tmpq + m%4*4 + 0], m3
>+    vpcmpeqb m14, m14
>+    vgatherdps m%2, [tmpq + m%4*4 + 4], m14
>+    vpcmpeqb m15, m15
>+    vgatherdps m%3, [tmpq + m%4*4 + 8], m15
>+%else
>+    movu [rm], m%4 ; no gather instruction: spill the indices and fetch lane by lane
>+    FETCH_LUT3D_RGB %1, %2, %3, 0
>+    FETCH_LUT3D_RGB %1, %2, %3, 4
>+    FETCH_LUT3D_RGB %1, %2, %3, 8
>+    FETCH_LUT3D_RGB %1, %2, %3, 12
>+%if mmsize > 16
>+    FETCH_LUT3D_RGB %1, %2, %3, 16
>+    FETCH_LUT3D_RGB %1, %2, %3, 20
>+    FETCH_LUT3D_RGB %1, %2, %3, 24
>+    FETCH_LUT3D_RGB %1, %2, %3, 28
>+%endif
>+    movu m%1, [rm] ; reload the slots, which now hold the fetched values
>+    movu m%2, [gm]
>+    movu m%3, [bm]
>+%endif
>+%endmacro
>+
>+%macro interp_tetrahedral 0 ; in: m0-m2 = scaled r/g/b, m13 = lut max, m14 = 1.0f; out: m0-m2 = interpolated r/g/b; clobbers m3-m15, tmp, tmp2
>+    %define d_r m0
>+    %define d_g m1
>+    %define d_b m2
>+
>+    %define prev_r m3
>+    %define prev_g m4
>+    %define prev_b m5
>+
>+    %define next_r m6
>+    %define next_g m7
>+    %define next_b m8
>+
>+    %define x0 m4
>+    %define x1 m5
>+    %define x2 m6
>+
>+    ; setup prev index
>+    FLOORPS prev_r, m0
>+    FLOORPS prev_g, m1
>+    FLOORPS prev_b, m2
>+
>+    ; setup deltas
>+    subps d_r, m0, prev_r
>+    subps d_g, m1, prev_g
>+    subps d_b, m2, prev_b
>+
>+    ; calculate select mask m9
>+    movu m6, [pd_tetra_table2]
>+    cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
>+    BLEND m10, m6, [pd_tetra_table1], m7
>+    cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
>+    BLEND m6, m10, [pd_tetra_table0], m7
>+
>+    movu m10, [pd_tetra_table5]
>+    cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
>+    BLEND m9, m10, [pd_tetra_table4], m7
>+    cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
>+    BLEND m10, m9, [pd_tetra_table3], m7
>+
>+    cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
>+    BLEND m9, m10, m6, m7
>+
>+    ; setup next index
>+    addps next_r, prev_r, m14 ; +1
>+    minps next_r, next_r, m13 ; clamp lutmax
>+
>+    addps next_g, prev_g, m14 ; +1
>+    minps next_g, next_g, m13 ; clamp lutmax
>+
>+    addps next_b, prev_b, m14 ; +1
>+    minps next_b, next_b, m13 ; clamp lutmax
>+
>+    ; prescale indices
>+    mulps prev_r, prev_r, lut3dsize2m ; slot holds lutsize2 * 3: each LUT entry is 3 floats
>+    mulps next_r, next_r, lut3dsize2m
>+
>+    mulps prev_g, prev_g, lut3dsizem ; slot holds lutsize * 3
>+    mulps next_g, next_g, lut3dsizem
>+
>+    mulps prev_b, prev_b, [pd_3f]
>+    mulps next_b, next_b, [pd_3f]
>+
>+    movu m14, [pd_001] ; m14 no longer holds 1.0f from here on
>+
>+    ; cxxa m10
>+    ; b
>+    andps m15, m9, m14
>+    CMP_EQUAL m15, m15, m14
>+    BLEND m10, prev_b, next_b, m15
>+
>+    ; g
>+    andps m15, m9, [pd_010]
>+    CMP_EQUAL m15, m15, [pd_010]
>+    BLEND m12, prev_g, next_g, m15
>+
>+    ; r
>+    andps m15, m9, [pd_100]
>+    CMP_EQUAL m15, m15, [pd_100]
>+    BLEND m13, prev_r, next_r, m15
>+
>+    ADD3 m10, m10, m12, m13
>+
>+    SHIFT_RIGHT 9, 3 ; 3
>+
>+    ; cxxb m11;
>+    ; b
>+    andps m15, m9, m14
>+    CMP_EQUAL m15, m15, m14
>+    BLEND m11, prev_b, next_b, m15
>+
>+    ; g
>+    andps m15, m9, [pd_010]
>+    CMP_EQUAL m15, m15, [pd_010]
>+    BLEND m12, prev_g, next_g, m15
>+
>+    ; r
>+    andps m15, m9, [pd_100]
>+    CMP_EQUAL m15, m15, [pd_100]
>+    BLEND m13, prev_r, next_r, m15
>+
>+    ADD3 m11, m11, m12, m13
>+
>+    ; c000 m12;
>+    ADD3 m12, prev_r, prev_g, prev_b
>+
>+    ; c111 m13;
>+    ADD3 m13, next_r, next_g, next_b
>+
>+    SHIFT_RIGHT 9, 3 ; 6
>+
>+    ; x0, m4
>+    andps m15, m9, m14
>+    CMP_EQUAL m15, m15, m14
>+    BLEND m7, d_r, d_g, m15 ; r,g
>+
>+    andps m15, m9, [pd_010]
>+    CMP_EQUAL m15, m15, [pd_010]
>+    BLEND x0, m7, d_b, m15 ; b
>+
>+    ; x1, m5
>+    andps m15, m9, [pd_100]
>+    CMP_EQUAL m15, m15, [pd_100]
>+    BLEND m7, d_r, d_g, m15 ; r,g
>+
>+    SHIFT_RIGHT 9, 3 ; 9
>+
>+    andps m15, m9, m14
>+    CMP_EQUAL m15, m15, m14
>+    BLEND x1, m7, d_b, m15 ; b
>+
>+    ; x2, m6
>+    andps m15, m9, [pd_010]
>+    CMP_EQUAL m15, m15, [pd_010]
>+    BLEND m7, d_r, d_g, m15 ; r,g
>+
>+    andps m15, m9, [pd_100]
>+    CMP_EQUAL m15, m15, [pd_100]
>+    BLEND x2, m7, d_b, m15 ; b
>+
>+    ; convert indices to integer
>+    cvttps2dq m12, m12
>+    cvttps2dq m10, m10
>+    cvttps2dq m11, m11
>+    cvttps2dq m13, m13
>+
>+    ; now the gathering festival
>+    mov tmpq, [ctxq + LUT3DContext.lut]
>+
>+    GATHER_LUT3D_INDICES 0, 1, 2, 12
>+    movu m14, [pd_1f]
>+    subps m14, m14, x0; 1 - x0
>+
>+    mulps m0, m0, m14
>+    mulps m1, m1, m14
>+    mulps m2, m2, m14
>+
>+    GATHER_LUT3D_INDICES 7, 8, 9, 10
>+    subps m14, x0, x1; x0 - x1
>+    mulps m7, m7, m14
>+    addps m0, m0, m7
>+
>+    mulps m8, m8, m14
>+    addps m1, m1, m8
>+
>+    mulps m9, m9, m14
>+    addps m2, m2, m9
>+
>+    GATHER_LUT3D_INDICES 7, 8, 9, 11
>+    subps m14, x1, x2; x1 - x2
>+
>+    mulps m7, m7, m14
>+    addps m0, m0, m7
>+
>+    mulps m8, m8, m14
>+    addps m1, m1, m8
>+
>+    mulps m9, m9, m14
>+    addps m2, m2, m9
>+
>+    GATHER_LUT3D_INDICES 7, 8, 9, 13
>+    mulps m7, m7, x2
>+    addps m0, m0, m7
>+
>+    mulps m8, m8, x2
>+    addps m1, m1, m8
>+
>+    mulps m9, m9, x2
>+    addps m2, m2, m9
>+%endmacro
>+
>+%macro INIT_DATA_PTR 3 ; %1 = stack slot to fill, %2 = AVFrame pointer, %3 = plane index; stores data[%3] + linesize[%3] * slice_start
>+    mov ptrq, [%2 + AVFrame.data     + %3 * 8]
>+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4] ; NOTE(review): 32-bit load zero-extends into tmpq, so a negative linesize would not be sign-extended -- confirm
>+    imul tmpd, slice_startd ; 32-bit multiply
>+    add ptrq, tmpq
>+    mov %1, ptrq
>+%endmacro
>+
>+%macro INC_DATA_PTR 3 ; %1 = stack slot holding a row pointer, %2 = AVFrame pointer, %3 = plane index; advances the slot by linesize[%3]
>+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4] ; NOTE(review): zero-extended 32-bit load, see the negative-linesize caveat -- confirm
>+    mov ptrq, %1
>+    add ptrq, tmpq
>+    mov %1, ptrq
>+%endmacro
>+
>+%macro LOAD16 2 ; %1 = dst register number, %2 = stack slot with the row pointer; loads 16-bit samples at x as floats scaled by m7
>+    mov ptrq, %2
>+    %if mmsize > 16
>+        movu xm%1, [ptrq + xq*2] ; 8 samples
>+    %else
>+        movsd xm%1, [ptrq + xq*2] ; 4 samples, upper half of the register zeroed
>+    %endif
>+    %if cpuflag(avx2)
>+        vpmovzxwd m%1, xm%1 ; zero-extend words to dwords
>+    %else
>+        %if mmsize > 16
>+            pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) ; swap 64-bit halves: upper four samples into xm4 (clobbered)
>+            pshufb xm%1, xm6 ; pb_shuffle16
>+            pshufb xm4,  xm6 ; pb_shuffle16
>+            vinsertf128 m%1, m%1, xm4, 1
>+        %else
>+            pshufd  xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) ; these three shuffles spread the four words into zero-extended dwords
>+            pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+            pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+        %endif
>+    %endif
>+    cvtdq2ps m%1, m%1
>+    mulps m%1, m%1, m7 ; pd_65535_invf
>+%endmacro
>+
>+%macro STORE16 2 ; %1 = stack slot with the row pointer, %2 = src register number; stores float lanes as 16-bit samples at x
>+    mulps m%2, m%2, m5  ; [pd_65535f]
>+    minps m%2, m%2, m5  ; [pd_65535f]
>+    maxps m%2, m%2, m15 ; zero
>+    cvttps2dq m%2, m%2 ; truncating float -> int32 conversion
>+    %if mmsize > 16
>+        vextractf128 xm4, m%2, 1 ; upper four lanes into xm4 (clobbered)
>+        pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
>+        pshufb xm4,  xm7 ; [pb_hi_pack_shuffle16]
>+        por xm%2, xm4 ; merge both halves: 8 packed words
>+    %else
>+        pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; these three shuffles move the low word of each dword into the low 8 bytes
>+        pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+        pshufd  xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
>+    %endif
>+    mov ptrq, %1
>+    %if mmsize > 16
>+        movu [ptrq + xq*2], xm%2 ; 8 samples
>+    %else
>+        movsd [ptrq + xq*2], xm%2 ; 4 samples
>+    %endif
>+%endmacro
>+
>+; 1 - interp method (name suffix of the interp_* macro expanded in the inner loop)
>+; 2 - format_name (only used in the generated function name)
>+; 3 - depth
>+; 4 - is float format (1: rows are 32-bit floats, 0: rows are 16-bit samples)
>+%macro DEFINE_INTERP_FUNC 4
>+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
>+    ; store lut max and lutsize
>+    mov tmpd, dword [ctxq + LUT3DContext.lutsize]
>+    cvtsi2ss xm0, tmpd
>+    mulss xm0, xm0, [pd_3f] ; lutsize * 3
>+    VBROADCASTSS m0, xm0
>+    mova lut3dsizem, m0
>+    sub tmpd, 1
>+    cvtsi2ss xm0, tmpd ; lutsize - 1 = highest index per axis
>+    VBROADCASTSS m0, xm0
>+    mova lut3dmaxm, m0
>+
>+    ; scale_r
>+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] ; (lutsize - 1) * scale.r
>+    VBROADCASTSS m1, xm1
>+    mova scalerm, m1
>+
>+    ; scale_g
>+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
>+    VBROADCASTSS m1, xm1
>+    mova scalegm, m1
>+
>+    ; scale_b
>+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
>+    VBROADCASTSS m1, xm1
>+    mova scalebm, m1
>+
>+    ; store lutsize2
>+    cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
>+    mulss xm0, xm0, [pd_3f] ; lutsize2 * 3
>+    VBROADCASTSS m0, xm0
>+    mova lut3dsize2m, m0
>+
>+    ; init prelut values
>+    cmp prelutq, 0 ; a NULL prelut pointer disables the pre-LUT stage
>+    je %%skip_init_prelut
>+        mov tmpd, dword [prelutq + Lut3DPreLut.size]
>+        sub tmpd, 1
>+        cvtsi2ss xm0, tmpd ; size - 1 = highest pre-LUT index
>+        VBROADCASTSS m0, xm0
>+        mova prelutmaxm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
>+        mova prelutminrm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
>+        mova prelutmingm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
>+        mova prelutminbm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
>+        mova prelutscalerm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
>+        mova prelutscalegm, m0
>+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
>+        mova prelutscalebm, m0
>+    %%skip_init_prelut:
>+
>+    mov widthd,  [src_imageq + AVFrame.width]
>+
>+    ; gbra pixel order
>+    INIT_DATA_PTR srcrm, src_imageq, 2
>+    INIT_DATA_PTR srcgm, src_imageq, 0
>+    INIT_DATA_PTR srcbm, src_imageq, 1
>+    INIT_DATA_PTR srcam, src_imageq, 3
>+
>+    INIT_DATA_PTR dstrm, dst_imageq, 2
>+    INIT_DATA_PTR dstgm, dst_imageq, 0
>+    INIT_DATA_PTR dstbm, dst_imageq, 1
>+    INIT_DATA_PTR dstam, dst_imageq, 3
>+
>+    %%loop_y:
>+        xor xq, xq
>+        %%loop_x:
>+            movu m14, [pd_1f] ; reloaded every iteration: the interp macro overwrites m14/m15
>+            xorps m15, m15, m15
>+            %if %4 ; float
>+                mov ptrq, srcrm
>+                movu m0, [ptrq + xq*4]
>+                mov ptrq, srcgm
>+                movu m1, [ptrq + xq*4]
>+                mov ptrq, srcbm
>+                movu m2, [ptrq + xq*4]
>+                SANITIZE_F m0
>+                SANITIZE_F m1
>+                SANITIZE_F m2
>+            %else
>+                ; constants for LOAD16
>+                movu m7, [pd_65535_invf]
>+                %if notcpuflag(avx2) && mmsize >= 32
>+                    movu xm6, [pb_shuffle16]
>+                %endif
>+                LOAD16 0, srcrm
>+                LOAD16 1, srcgm
>+                LOAD16 2, srcbm
>+            %endif
>+
>+            cmp prelutq, 0
>+            je %%skip_prelut
>+                mova m13, prelutmaxm
>+                APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
>+                APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
>+                APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
>+            %%skip_prelut:
>+
>+            mova m13, lut3dmaxm
>+            APPLY_SCALE 0, scalerm
>+            APPLY_SCALE 1, scalegm
>+            APPLY_SCALE 2, scalebm
>+
>+            interp_%1
>+
>+            %if %4 ; float
>+                mov ptrq, dstrm
>+                movu [ptrq + xq*4], m0
>+                mov ptrq, dstgm
>+                movu [ptrq + xq*4], m1
>+                mov ptrq, dstbm
>+                movu [ptrq + xq*4], m2
>+                cmp has_alphad, 0
>+                je %%skip_alphaf
>+                    mov ptrq, srcam ; alpha is copied through unchanged
>+                    movu m0, [ptrq + xq*4]
>+                    mov ptrq, dstam
>+                    movu [ptrq + xq*4], m0
>+                %%skip_alphaf:
>+            %else
>+                ; constants for STORE16
>+                movu m5,  [pd_65535f]
>+                %if mmsize > 16
>+                    movu xm6, [pb_lo_pack_shuffle16]
>+                    movu xm7, [pb_hi_pack_shuffle16]
>+                %endif
>+
>+                xorps m15, m15, m15
>+                STORE16 dstrm, 0
>+                STORE16 dstgm, 1
>+                STORE16 dstbm, 2
>+
>+                cmp has_alphad, 0
>+                je %%skip_alpha
>+                    %if mmsize > 16
>+                        mov ptrq, srcam ; alpha is copied through unchanged
>+                        movu xm0, [ptrq + xq*2]
>+                        mov ptrq, dstam
>+                        movu [ptrq + xq*2], xm0
>+                    %else
>+                        mov ptrq, srcam
>+                        movsd xm0, [ptrq + xq*2]
>+                        mov ptrq, dstam
>+                        movsd [ptrq + xq*2], xm0
>+                    %endif
>+
>+                %%skip_alpha:
>+            %endif
>+
>+            add xq, mmsize/4 ; pixels per iteration; NOTE(review): whole vectors are always loaded/stored, so the last iteration can run past width -- confirm rows are padded
>+            cmp xd, widthd
>+            jl %%loop_x
>+
>+        INC_DATA_PTR srcrm, src_imageq, 2
>+        INC_DATA_PTR srcgm, src_imageq, 0
>+        INC_DATA_PTR srcbm, src_imageq, 1
>+        INC_DATA_PTR srcam, src_imageq, 3
>+
>+        INC_DATA_PTR dstrm, dst_imageq, 2
>+        INC_DATA_PTR dstgm, dst_imageq, 0
>+        INC_DATA_PTR dstbm, dst_imageq, 1
>+        INC_DATA_PTR dstam, dst_imageq, 3
>+
>+        inc slice_startd ; slice_start doubles as the row counter
>+        cmp slice_startd, slice_endd
>+        jl %%loop_y
>+
>+    RET
>+%endmacro
>+%if ARCH_X86_64
>+    %if HAVE_AVX2_EXTERNAL
>+        INIT_YMM avx2
>+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+    %endif
>+    %if HAVE_AVX_EXTERNAL
>+        INIT_YMM avx
>+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+    %endif
>+    INIT_XMM sse2
>+    DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+    DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+%endif
>\ No newline at end of file
>diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c
>new file mode 100644
>index 0000000000..9b9b36e4af
>--- /dev/null
>+++ b/libavfilter/x86/vf_lut3d_init.c
>@@ -0,0 +1,88 @@
>+/*
>+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/attributes.h"
>+#include "libavutil/cpu.h"
>+#include "libavutil/x86/cpu.h"
>+#include "libavfilter/lut3d.h"
>+
>+#define DEFINE_INTERP_FUNC(name, format, opt)                                                                                                       \
>+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int has_alpha); \
>+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)                                                \
>+{                                                                                                                                                   \
>+    LUT3DContext *lut3d = ctx->priv;                                                                                                                \
>+    Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL;                                                                             \
>+    ThreadData *td = arg;                                                                                                                           \
>+    AVFrame *in  = td->in;                                                                                                                          \
>+    AVFrame *out = td->out;                                                                                                                         \
>+    int has_alpha = in->linesize[3] && out != in;                                                                                                   \
>+    int slice_start = (in->height *  jobnr   ) / nb_jobs;                                                                                           \
>+    int slice_end   = (in->height * (jobnr+1)) / nb_jobs;                                                                                           \
>+    ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, slice_end, has_alpha);                                                 \
>+    return 0;                                                                                                                                       \
>+}
>+
>+#if ARCH_X86_64
>+#if HAVE_AVX2_EXTERNAL
>+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
>+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx2)
>+#endif
>+#if HAVE_AVX_EXTERNAL
>+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
>+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx)
>+#endif
>+    DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
>+    DEFINE_INTERP_FUNC(tetrahedral, p16,  sse2)
>+#endif
>+
>+
>+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc)
>+{
>+    int cpu_flags = av_get_cpu_flags();
>+    int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
>+    int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
>+    int depth = desc->comp[0].depth;
>+
>+#if ARCH_X86_64
>+    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+#if HAVE_AVX2_EXTERNAL
>+        if (isfloat && planar) {
>+            s->interp = interp_tetrahedral_pf32_avx2;
>+        } else if (depth == 16) {
>+            s->interp = interp_tetrahedral_p16_avx2;
>+        }
>+#endif
>+    } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+#if HAVE_AVX_EXTERNAL
>+        if (isfloat) {
>+            s->interp = interp_tetrahedral_pf32_avx;
>+        } else if (depth == 16) {
>+            s->interp = interp_tetrahedral_p16_avx;
>+        }
>+#endif
>+    } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+        if (isfloat) {
>+            s->interp = interp_tetrahedral_pf32_sse2;
>+        } else if (depth == 16) {
>+            s->interp = interp_tetrahedral_p16_sse2;
>+        }
>+    }
>+#endif
>+}
>--
>2.31.1.windows.1
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel@ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
Mark Reid Sept. 29, 2021, 5:27 p.m. UTC | #2
On Tue, Sep 28, 2021 at 6:38 PM chen <chenm003@163.com> wrote:

> Hello,
>
>
> Excuse me, how about FMADD on AVX2 platform?
>
>
> For example
> +    mulps m7, m7, m14
> +    addps m0, m0, m7
>
> ==>
>
>
> fmadd231ps m0,m7,m14
>
>
Interesting, does having AVX2 guarantee having FMA instructions?


>
> Regards,
> Min Chen
>
>
> At 2021-09-29 09:18:05, mindmark@gmail.com wrote:
> >From: Mark Reid <mindmark@gmail.com>
> >
> >Only supports float and 16-bit planar formats at the moment.
> >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some
> >speed gains.
> >
> >f32 1920x1080 1 thread with prelut
> >c impl
> >1389936500 UNITS in lut3d->interp,       1 runs,      0 skips
> >1425800240 UNITS in lut3d->interp,       2 runs,      0 skips
> >1433312777 UNITS in lut3d->interp,       4 runs,      0 skips
> >1443346798 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >sse2
> >948662320 UNITS in lut3d->interp,       1 runs,      0 skips
> >1101247540 UNITS in lut3d->interp,       2 runs,      0 skips
> >1050645695 UNITS in lut3d->interp,       4 runs,      0 skips
> >1041102937 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >avx
> >633837000 UNITS in lut3d->interp,       1 runs,      0 skips
> >669452850 UNITS in lut3d->interp,       2 runs,      0 skips
> >650716580 UNITS in lut3d->interp,       4 runs,      0 skips
> >644698550 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >avx2
> >354940020 UNITS in lut3d->interp,       1 runs,      0 skips
> >362384340 UNITS in lut3d->interp,       2 runs,      0 skips
> >356799020 UNITS in lut3d->interp,       4 runs,      0 skips
> >357276815 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >gbrap16 1920x1080 1 thread with prelut
> >c impl
> >1445071160 UNITS in lut3d->interp,       1 runs,      0 skips
> >1477959120 UNITS in lut3d->interp,       2 runs,      0 skips
> >1472102670 UNITS in lut3d->interp,       4 runs,      0 skips
> >1462579330 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >sse2
> >1035437580 UNITS in lut3d->interp,       1 runs,      0 skips
> >1050139710 UNITS in lut3d->interp,       2 runs,      0 skips
> >1070147205 UNITS in lut3d->interp,       4 runs,      0 skips
> >1064583037 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >avx
> >678089880 UNITS in lut3d->interp,       1 runs,      0 skips
> >679112485 UNITS in lut3d->interp,       2 runs,      0 skips
> >695527212 UNITS in lut3d->interp,       4 runs,      0 skips
> >691300053 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >avx2
> >372671340 UNITS in lut3d->interp,       1 runs,      0 skips
> >373449870 UNITS in lut3d->interp,       2 runs,      0 skips
> >383725625 UNITS in lut3d->interp,       4 runs,      0 skips
> >382860848 UNITS in lut3d->interp,       8 runs,      0 skips
> >
> >---
> > libavfilter/lut3d.h             |  83 ++++
> > libavfilter/vf_lut3d.c          |  61 +--
> > libavfilter/x86/Makefile        |   2 +
> > libavfilter/x86/vf_lut3d.asm    | 757 ++++++++++++++++++++++++++++++++
> > libavfilter/x86/vf_lut3d_init.c |  88 ++++
> > 5 files changed, 935 insertions(+), 56 deletions(-)
> > create mode 100644 libavfilter/lut3d.h
> > create mode 100644 libavfilter/x86/vf_lut3d.asm
> > create mode 100644 libavfilter/x86/vf_lut3d_init.c
> >
> >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
> >new file mode 100644
> >index 0000000000..ded2a036a5
> >--- /dev/null
> >+++ b/libavfilter/lut3d.h
> >@@ -0,0 +1,83 @@
> >+/*
> >+ * Copyright (c) 2013 Clément Bœsch
> >+ * Copyright (c) 2018 Paul B Mahol
> >+ *
> >+ * This file is part of FFmpeg.
> >+ *
> >+ * FFmpeg is free software; you can redistribute it and/or
> >+ * modify it under the terms of the GNU Lesser General Public
> >+ * License as published by the Free Software Foundation; either
> >+ * version 2.1 of the License, or (at your option) any later version.
> >+ *
> >+ * FFmpeg is distributed in the hope that it will be useful,
> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >+ * Lesser General Public License for more details.
> >+ *
> >+ * You should have received a copy of the GNU Lesser General Public
> >+ * License along with FFmpeg; if not, write to the Free Software
> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> >+ */
> >+#ifndef AVFILTER_LUT3D_H
> >+#define AVFILTER_LUT3D_H
> >+
> >+#include "libavutil/pixdesc.h"
> >+#include "framesync.h"
> >+#include "avfilter.h"
> >+
> >+enum interp_mode {
> >+    INTERPOLATE_NEAREST,
> >+    INTERPOLATE_TRILINEAR,
> >+    INTERPOLATE_TETRAHEDRAL,
> >+    INTERPOLATE_PYRAMID,
> >+    INTERPOLATE_PRISM,
> >+    NB_INTERP_MODE
> >+};
> >+
> >+struct rgbvec {
> >+    float r, g, b;
> >+};
> >+
> >+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
> >+ * of 512x512 (64x64x64) */
> >+#define MAX_LEVEL 256
> >+#define PRELUT_SIZE 65536
> >+
> >+typedef struct Lut3DPreLut {
> >+    int size;
> >+    float min[3];
> >+    float max[3];
> >+    float scale[3];
> >+    float* lut[3];
> >+} Lut3DPreLut;
> >+
> >+typedef struct LUT3DContext {
> >+    const AVClass *class;
> >+    struct rgbvec *lut;
> >+    int lutsize;
> >+    int lutsize2;
> >+    struct rgbvec scale;
> >+    int interpolation;          ///<interp_mode
> >+    char *file;
> >+    uint8_t rgba_map[4];
> >+    int step;
> >+    avfilter_action_func *interp;
> >+    Lut3DPreLut prelut;
> >+#if CONFIG_HALDCLUT_FILTER
> >+    uint8_t clut_rgba_map[4];
> >+    int clut_step;
> >+    int clut_bits;
> >+    int clut_planar;
> >+    int clut_float;
> >+    int clut_width;
> >+    FFFrameSync fs;
> >+#endif
> >+} LUT3DContext;
> >+
> >+typedef struct ThreadData {
> >+    AVFrame *in, *out;
> >+} ThreadData;
> >+
> >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
> >+
> >+#endif /* AVFILTER_LUT3D_H */
> >\ No newline at end of file
> >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
> >index 9fbda833b9..1fd0af06db 100644
> >--- a/libavfilter/vf_lut3d.c
> >+++ b/libavfilter/vf_lut3d.c
> >@@ -31,73 +31,18 @@
> > #include "libavutil/intreadwrite.h"
> > #include "libavutil/intfloat.h"
> > #include "libavutil/avassert.h"
> >-#include "libavutil/pixdesc.h"
> > #include "libavutil/avstring.h"
> >-#include "avfilter.h"
> > #include "drawutils.h"
> > #include "formats.h"
> >-#include "framesync.h"
> > #include "internal.h"
> > #include "video.h"
> >+#include "lut3d.h"
> >
> > #define R 0
> > #define G 1
> > #define B 2
> > #define A 3
> >
> >-enum interp_mode {
> >-    INTERPOLATE_NEAREST,
> >-    INTERPOLATE_TRILINEAR,
> >-    INTERPOLATE_TETRAHEDRAL,
> >-    INTERPOLATE_PYRAMID,
> >-    INTERPOLATE_PRISM,
> >-    NB_INTERP_MODE
> >-};
> >-
> >-struct rgbvec {
> >-    float r, g, b;
> >-};
> >-
> >-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
> >- * of 512x512 (64x64x64) */
> >-#define MAX_LEVEL 256
> >-#define PRELUT_SIZE 65536
> >-
> >-typedef struct Lut3DPreLut {
> >-    int size;
> >-    float min[3];
> >-    float max[3];
> >-    float scale[3];
> >-    float* lut[3];
> >-} Lut3DPreLut;
> >-
> >-typedef struct LUT3DContext {
> >-    const AVClass *class;
> >-    int interpolation;          ///<interp_mode
> >-    char *file;
> >-    uint8_t rgba_map[4];
> >-    int step;
> >-    avfilter_action_func *interp;
> >-    struct rgbvec scale;
> >-    struct rgbvec *lut;
> >-    int lutsize;
> >-    int lutsize2;
> >-    Lut3DPreLut prelut;
> >-#if CONFIG_HALDCLUT_FILTER
> >-    uint8_t clut_rgba_map[4];
> >-    int clut_step;
> >-    int clut_bits;
> >-    int clut_planar;
> >-    int clut_float;
> >-    int clut_width;
> >-    FFFrameSync fs;
> >-#endif
> >-} LUT3DContext;
> >-
> >-typedef struct ThreadData {
> >-    AVFrame *in, *out;
> >-} ThreadData;
> >-
> > #define OFFSET(x) offsetof(LUT3DContext, x)
> > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
> > #define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
> >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink)
> >         av_assert0(0);
> >     }
> >
> >+    if (ARCH_X86) {
> >+        ff_lut3d_init_x86(lut3d, desc);
> >+    }
> >+
> >     return 0;
> > }
> >
> >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> >index 016a5b3511..a29941eaeb 100644
> >--- a/libavfilter/x86/Makefile
> >+++ b/libavfilter/x86/Makefile
> >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
> > OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
> > OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
> > OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
> >+OBJS-$(CONFIG_LUT3D_FILTER)                  += x86/vf_lut3d_init.o
> > OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += x86/vf_maskedclamp_init.o
> > OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
> > OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
> >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
> > X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
> > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
> > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
> >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER)           += x86/vf_lut3d.o
> > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o
> > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
> > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
> >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
> >new file mode 100644
> >index 0000000000..b3d7c3962b
> >--- /dev/null
> >+++ b/libavfilter/x86/vf_lut3d.asm
> >@@ -0,0 +1,757 @@
>
> >+;*****************************************************************************
> >+;* x86-optimized functions for lut3d filter
> >+;*
> >+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
> >+;*
> >+;* This file is part of FFmpeg.
> >+;*
> >+;* FFmpeg is free software; you can redistribute it and/or
> >+;* modify it under the terms of the GNU Lesser General Public
> >+;* License as published by the Free Software Foundation; either
> >+;* version 2.1 of the License, or (at your option) any later version.
> >+;*
> >+;* FFmpeg is distributed in the hope that it will be useful,
> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >+;* Lesser General Public License for more details.
> >+;*
> >+;* You should have received a copy of the GNU Lesser General Public
> >+;* License along with FFmpeg; if not, write to the Free Software
> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>
> >+;******************************************************************************
> >+
> >+%include "libavutil/x86/x86util.asm"
> >+
> >+SECTION_RODATA
> >+pd_1f:  times 8 dd 1.0
> >+pd_3f:  times 8 dd 3.0
> >+
> >+; used to limit rshifts as they are more expensive in avx1
> >+pd_001: times 8 dd 001b
> >+pd_010: times 8 dd 010b
> >+pd_100: times 8 dd 100b
> >+
> >+pd_65535f:     times 8 dd 65535.0
> >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
> >+
> >+pb_shuffle16:         db    0,    1, 0x80, 0x80, \
> >+                            2,    3, 0x80, 0x80, \
> >+                            4,    5, 0x80, 0x80, \
> >+                            6,    7, 0x80, 0x80
> >+
> >+pb_lo_pack_shuffle16: db    0,    1,    4,    5, \
> >+                            8,    9,   12,   13, \
> >+                         0x80, 0x80, 0x80, 0x80, \
> >+                         0x80, 0x80, 0x80, 0x80
> >+
> >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
> >+                         0x80, 0x80, 0x80, 0x80, \
> >+                            0,    1,    4,    5, \
> >+                            8,    9,   12,   13
> >+
> >+; tetrahedral table --------------------------------------------
> >+; name:          x2|        x1|        x0|       cxxb|     cxxa
> >+; values:      r 00|     r  00|     r  00|   c011 011| c001 001
> >+;              g 01|     g  01|     g  01|   c101 101| c010 010
> >+;              b 10|     b  10|     b  10|   c110 110| c100 100
> >+
> >+; g>b                                 b |          g |          r |        c110 | c100
> >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b
> >+; r>b                                 g |          b |          r |        c101 | c100
> >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b
> >+; else                                g |          r |          b |        c101 | c001
> >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b
> >+; b>g                                 r |          g |          b |        c011 | c001
> >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b
> >+; b>r                                 r |          b |          g |        c011 | c010
> >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b
> >+; else                                b |          r |          g |        c110 | c010
> >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b
> >+
> >+SECTION .text
> >+
> >+struc Lut3DPreLut
> >+    .size:    resd 1
> >+    .min:     resd 3
> >+    .max:     resd 3
> >+    .scale:   resd 3
> >+    .lut:     resq 3
> >+endstruc
> >+
> >+struc LUT3DContext
> >+    .class:        resq 1
> >+    .lut:          resq 1
> >+    .lutsize:      resd 1
> >+    .lutsize2:     resd 1
> >+    .scale:        resd 3
> >+endstruc
> >+
> >+%define AV_NUM_DATA_POINTERS 8
> >+
> >+struc AVFrame
> >+    .data:          resq AV_NUM_DATA_POINTERS
> >+    .linesize:      resd AV_NUM_DATA_POINTERS
> >+    .extended_data: resq 1
> >+    .width:         resd 1
> >+    .height:        resd 1
> >+endstruc
> >+
> >+%define rm   rsp
> >+%define gm   rsp+mmsize
> >+%define bm   rsp+(mmsize*2)
> >+
> >+%define lut3dsizem  [rsp+mmsize*3]
> >+%define lut3dsize2m [rsp+mmsize*4]
> >+%define lut3dmaxm   [rsp+mmsize*5]
> >+%define prelutmaxm  [rsp+mmsize*6]
> >+
> >+%define scalerm [rsp+mmsize*7]
> >+%define scalegm [rsp+mmsize*8]
> >+%define scalebm [rsp+mmsize*9]
> >+
> >+%define prelutminrm [rsp+mmsize*10]
> >+%define prelutmingm [rsp+mmsize*11]
> >+%define prelutminbm [rsp+mmsize*12]
> >+
> >+%define prelutscalerm [rsp+mmsize*13]
> >+%define prelutscalegm [rsp+mmsize*14]
> >+%define prelutscalebm [rsp+mmsize*15]
> >+
> >+; data pointers
> >+%define srcrm [rsp+mmsize*16 +  0]
> >+%define srcgm [rsp+mmsize*16 +  8]
> >+%define srcbm [rsp+mmsize*16 + 16]
> >+%define srcam [rsp+mmsize*16 + 24]
> >+
> >+%define dstrm [rsp+mmsize*16 + 32]
> >+%define dstgm [rsp+mmsize*16 + 40]
> >+%define dstbm [rsp+mmsize*16 + 48]
> >+%define dstam [rsp+mmsize*16 + 56]
> >+
> >+%macro FETCH_PRELUT_PN 3
> >+    mov tmp2d, [rm + %3]
> >+    mov tmp3d, [gm + %3]
> >+    movss xm%1, [tmpq + tmp2q*4]
> >+    movss xm%2, [tmpq + tmp3q*4]
> >+    movss [rm + %3], xm%1
> >+    movss [gm + %3], xm%2
> >+%endmacro
> >+
> >+; 1 - p
> >+; 2 - n
> >+; 3 - p indices
> >+; 4 - n indices
> >+%macro GATHER_PRELUT 4
> >+    %if cpuflag(avx2)
> >+        vpcmpeqb m7, m7
> >+        vgatherdps m%1, [tmpq + m%3*4], m7 ; p
> >+        vpcmpeqb m9, m9
> >+        vgatherdps m%2, [tmpq + m%4*4], m9 ; n
> >+    %else
> >+        mova [rm], m%3
> >+        mova [gm], m%4
> >+        FETCH_PRELUT_PN %1, %2, 0
> >+        FETCH_PRELUT_PN %1, %2, 4
> >+        FETCH_PRELUT_PN %1, %2, 8
> >+        FETCH_PRELUT_PN %1, %2, 12
> >+    %if mmsize > 16
> >+        FETCH_PRELUT_PN %1, %2, 16
> >+        FETCH_PRELUT_PN %1, %2, 20
> >+        FETCH_PRELUT_PN %1, %2, 24
> >+        FETCH_PRELUT_PN %1, %2, 28
> >+    %endif
> >+        movu m%1, [rm]
> >+        movu m%2, [gm]
> >+    %endif
> >+%endmacro
> >+
> >+%macro FLOORPS 2
> >+    %if mmsize > 16
> >+        vroundps %1, %2, 0x01
> >+    %else
> >+        cvttps2dq %1, %2
> >+        cvtdq2ps  %1, %1
> >+    %endif
> >+%endmacro
> >+
> >+; 1 - dst
> >+; 2 - index
> >+; 3 - min
> >+; 4 - scale
> >+; assumes lut max m13, m14 1.0f, zero m15
> >+%macro APPLY_PRELUT 4
> >+    ; scale
> >+    subps m5, m%1, %3 ; v - min
> >+    mulps m5, m5, %4  ; v * scale
> >+    ; clamp
> >+    maxps m5, m5, m15 ; max zero
> >+    minps m5, m5, m13 ; min lut max
> >+
> >+    FLOORPS m3, m5    ; prev index
> >+    subps m5, m5, m3  ; d
> >+    addps m4, m3, m14 ; p+1 = n index
> >+    minps m4, m4, m13 ; clamp n index
> >+
> >+    mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
> >+    cvttps2dq m6, m3
> >+    cvttps2dq m10, m4
> >+    GATHER_PRELUT 3, 4, 6, 10
> >+
> >+    ; lerp
> >+    subps m8, m4, m3
> >+    mulps m8, m8, m5
> >+    addps m%1, m8, m3
> >+%endmacro
> >+
> >+; 1 - dst
> >+; 2 - scale
> >+; assumes lut max m13, zero m15
> >+%macro APPLY_SCALE 2
> >+   mulps m%1, m%1, %2
> >+   maxps m%1, m%1, m15
> >+   minps m%1, m%1, m13
> >+%endmacro
> >+
> >+%macro BLEND 4
> >+%if mmsize > 16
> >+    vblendvps %1, %2, %3, %4
> >+%else
> >+    %ifidni %1,%2
> >+        %error operand 1 must not equal operand 2
> >+    %endif
> >+    %ifidni %1,%3
> >+        %error operand 1 must not equal operand 3
> >+    %endif
> >+    mova  %1, %2
> >+    xorps %1, %3
> >+    andps %1, %4
> >+    xorps %1, %2
> >+%endif
> >+%endmacro
> >+
> >+; sets NaNs to zero; +inf/-inf are handled later by the min/max clamps
> >+%macro SANITIZE_F 1
> >+    cmpps m5, %1, %1, 0x0 ; nan == nan = False
> >+    %if mmsize <= 16
> >+        mova m6, %1
> >+        BLEND %1, m15, m6, m5
> >+    %else
> >+        BLEND %1, m15, %1, m5
> >+    %endif
> >+%endmacro
> >+
> >+%macro ADD3 4
> >+    addps %1, %2, %3
> >+    addps %1, %1, %4
> >+%endmacro
> >+
> >+%macro CMP_EQUAL 3
> >+%if cpuflag(avx2)
> >+    vpcmpeqd %1, %2, %3
> >+%elif cpuflag(avx)
> >+    cmpps %1, %2, %3, 0x0
> >+%else
> >+    pcmpeqd %1, %2, %3
> >+%endif
> >+%endmacro
> >+
> >+%macro SHIFT_RIGHT 2
> >+%if mmsize <= 16
> >+    psrld xm%1, %2
> >+%elif cpuflag(avx2)
> >+    vpsrld m%1, m%1, %2
> >+%else
> >+    vextractf128 xm15, m%1, 1
> >+    psrld xm%1, %2
> >+    psrld xm15, %2
> >+    vinsertf128 m%1, m%1, xm15, 1
> >+%endif
> >+%endmacro
> >+
> >+%macro FETCH_LUT3D_RGB 4
> >+    mov tmp2d, [rm + %4]
> >+    movss xm%1, [tmpq + tmp2q*4 + 0]
> >+    movss xm%2, [tmpq + tmp2q*4 + 4]
> >+    movss xm%3, [tmpq + tmp2q*4 + 8]
> >+    movss [rm + %4], xm%1
> >+    movss [gm + %4], xm%2
> >+    movss [bm + %4], xm%3
> >+%endmacro
> >+
> >+; 1 - dstr
> >+; 2 - dstg
> >+; 3 - dstb
> >+; 4 - indices
> >+%macro GATHER_LUT3D_INDICES 4
> >+%if cpuflag(avx2)
> >+    vpcmpeqb m3, m3
> >+    vgatherdps m%1, [tmpq + m%4*4 + 0], m3
> >+    vpcmpeqb m14, m14
> >+    vgatherdps m%2, [tmpq + m%4*4 + 4], m14
> >+    vpcmpeqb m15, m15
> >+    vgatherdps m%3, [tmpq + m%4*4 + 8], m15
> >+%else
> >+    movu [rm], m%4
> >+    FETCH_LUT3D_RGB %1, %2, %3, 0
> >+    FETCH_LUT3D_RGB %1, %2, %3, 4
> >+    FETCH_LUT3D_RGB %1, %2, %3, 8
> >+    FETCH_LUT3D_RGB %1, %2, %3, 12
> >+%if mmsize > 16
> >+    FETCH_LUT3D_RGB %1, %2, %3, 16
> >+    FETCH_LUT3D_RGB %1, %2, %3, 20
> >+    FETCH_LUT3D_RGB %1, %2, %3, 24
> >+    FETCH_LUT3D_RGB %1, %2, %3, 28
> >+%endif
> >+    movu m%1, [rm]
> >+    movu m%2, [gm]
> >+    movu m%3, [bm]
> >+%endif
> >+%endmacro
> >+
> >+%macro interp_tetrahedral 0
> >+    %define d_r m0
> >+    %define d_g m1
> >+    %define d_b m2
> >+
> >+    %define prev_r m3
> >+    %define prev_g m4
> >+    %define prev_b m5
> >+
> >+    %define next_r m6
> >+    %define next_g m7
> >+    %define next_b m8
> >+
> >+    %define x0 m4
> >+    %define x1 m5
> >+    %define x2 m6
> >+
> >+    ; setup prev index
> >+    FLOORPS prev_r, m0
> >+    FLOORPS prev_g, m1
> >+    FLOORPS prev_b, m2
> >+
> >+    ; setup deltas
> >+    subps d_r, m0, prev_r
> >+    subps d_g, m1, prev_g
> >+    subps d_b, m2, prev_b
> >+
> >+    ; calculate select mask m9
> >+    movu m6, [pd_tetra_table2]
> >+    cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
> >+    BLEND m10, m6, [pd_tetra_table1], m7
> >+    cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
> >+    BLEND m6, m10, [pd_tetra_table0], m7
> >+
> >+    movu m10, [pd_tetra_table5]
> >+    cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
> >+    BLEND m9, m10, [pd_tetra_table4], m7
> >+    cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
> >+    BLEND m10, m9, [pd_tetra_table3], m7
> >+
> >+    cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
> >+    BLEND m9, m10, m6, m7
> >+
> >+    ; setup next index
> >+    addps next_r, prev_r, m14 ; +1
> >+    minps next_r, next_r, m13 ; clamp lutmax
> >+
> >+    addps next_g, prev_g, m14 ; +1
> >+    minps next_g, next_g, m13 ; clamp lutmax
> >+
> >+    addps next_b, prev_b, m14 ; +1
> >+    minps next_b, next_b, m13 ; clamp lutmax
> >+
> >+    ; prescale indices
> >+    mulps prev_r, prev_r, lut3dsize2m
> >+    mulps next_r, next_r, lut3dsize2m
> >+
> >+    mulps prev_g, prev_g, lut3dsizem
> >+    mulps next_g, next_g, lut3dsizem
> >+
> >+    mulps prev_b, prev_b, [pd_3f]
> >+    mulps next_b, next_b, [pd_3f]
> >+
> >+    movu m14, [pd_001]
> >+
> >+    ; cxxa m10
> >+    ; b
> >+    andps m15, m9, m14
> >+    CMP_EQUAL m15, m15, m14
> >+    BLEND m10, prev_b, next_b, m15
> >+
> >+    ; g
> >+    andps m15, m9, [pd_010]
> >+    CMP_EQUAL m15, m15, [pd_010]
> >+    BLEND m12, prev_g, next_g, m15
> >+
> >+    ; r
> >+    andps m15, m9, [pd_100]
> >+    CMP_EQUAL m15, m15, [pd_100]
> >+    BLEND m13, prev_r, next_r, m15
> >+
> >+    ADD3 m10, m10, m12, m13
> >+
> >+    SHIFT_RIGHT 9, 3 ; 3
> >+
> >+    ; cxxb m11;
> >+    ; b
> >+    andps m15, m9, m14
> >+    CMP_EQUAL m15, m15, m14
> >+    BLEND m11, prev_b, next_b, m15
> >+
> >+    ; g
> >+    andps m15, m9, [pd_010]
> >+    CMP_EQUAL m15, m15, [pd_010]
> >+    BLEND m12, prev_g, next_g, m15
> >+
> >+    ; r
> >+    andps m15, m9, [pd_100]
> >+    CMP_EQUAL m15, m15, [pd_100]
> >+    BLEND m13, prev_r, next_r, m15
> >+
> >+    ADD3 m11, m11, m12, m13
> >+
> >+    ; c000 m12;
> >+    ADD3 m12, prev_r, prev_g, prev_b
> >+
> >+    ; c111 m13;
> >+    ADD3 m13, next_r, next_g, next_b
> >+
> >+    SHIFT_RIGHT 9, 3 ; 6
> >+
> >+    ; x0, m4
> >+    andps m15, m9, m14
> >+    CMP_EQUAL m15, m15, m14
> >+    BLEND m7, d_r, d_g, m15 ; r,g
> >+
> >+    andps m15, m9, [pd_010]
> >+    CMP_EQUAL m15, m15, [pd_010]
> >+    BLEND x0, m7, d_b, m15 ; b
> >+
> >+    ; x1, m5
> >+    andps m15, m9, [pd_100]
> >+    CMP_EQUAL m15, m15, [pd_100]
> >+    BLEND m7, d_r, d_g, m15 ; r,g
> >+
> >+    SHIFT_RIGHT 9, 3 ; 9
> >+
> >+    andps m15, m9, m14
> >+    CMP_EQUAL m15, m15, m14
> >+    BLEND x1, m7, d_b, m15 ; b
> >+
> >+    ; x2, m6
> >+    andps m15, m9, [pd_010]
> >+    CMP_EQUAL m15, m15, [pd_010]
> >+    BLEND m7, d_r, d_g, m15 ; r,g
> >+
> >+    andps m15, m9, [pd_100]
> >+    CMP_EQUAL m15, m15, [pd_100]
> >+    BLEND x2, m7, d_b, m15 ; b
> >+
> >+    ; convert indices to integer
> >+    cvttps2dq m12, m12
> >+    cvttps2dq m10, m10
> >+    cvttps2dq m11, m11
> >+    cvttps2dq m13, m13
> >+
> >+    ; now the gathering festival
> >+    mov tmpq, [ctxq + LUT3DContext.lut]
> >+
> >+    GATHER_LUT3D_INDICES 0, 1, 2, 12
> >+    movu m14, [pd_1f]
> >+    subps m14, m14, x0; 1 - x0
> >+
> >+    mulps m0, m0, m14
> >+    mulps m1, m1, m14
> >+    mulps m2, m2, m14
> >+
> >+    GATHER_LUT3D_INDICES 7, 8, 9, 10
> >+    subps m14, x0, x1; x0 - x1
> >+    mulps m7, m7, m14
> >+    addps m0, m0, m7
> >+
> >+    mulps m8, m8, m14
> >+    addps m1, m1, m8
> >+
> >+    mulps m9, m9, m14
> >+    addps m2, m2, m9
> >+
> >+    GATHER_LUT3D_INDICES 7, 8, 9, 11
> >+    subps m14, x1, x2; x1 - x2
> >+
> >+    mulps m7, m7, m14
> >+    addps m0, m0, m7
> >+
> >+    mulps m8, m8, m14
> >+    addps m1, m1, m8
> >+
> >+    mulps m9, m9, m14
> >+    addps m2, m2, m9
> >+
> >+    GATHER_LUT3D_INDICES 7, 8, 9, 13
> >+    mulps m7, m7, x2
> >+    addps m0, m0, m7
> >+
> >+    mulps m8, m8, x2
> >+    addps m1, m1, m8
> >+
> >+    mulps m9, m9, x2
> >+    addps m2, m2, m9
> >+%endmacro
> >+
> >+%macro INIT_DATA_PTR 3
> >+    mov ptrq, [%2 + AVFrame.data     + %3 * 8]
> >+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
> >+    imul tmpd, slice_startd
> >+    add ptrq, tmpq
> >+    mov %1, ptrq
> >+%endmacro
> >+
> >+%macro INC_DATA_PTR 3
> >+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
> >+    mov ptrq, %1
> >+    add ptrq, tmpq
> >+    mov %1, ptrq
> >+%endmacro
> >+
> >+%macro LOAD16 2
> >+    mov ptrq, %2
> >+    %if mmsize > 16
> >+        movu xm%1, [ptrq + xq*2]
> >+    %else
> >+        movsd xm%1, [ptrq + xq*2]
> >+    %endif
> >+    %if cpuflag(avx2)
> >+        vpmovzxwd m%1, xm%1
> >+    %else
> >+        %if mmsize > 16
> >+            pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
> >+            pshufb xm%1, xm6 ; pb_shuffle16
> >+            pshufb xm4,  xm6 ; pb_shuffle16
> >+            vinsertf128 m%1, m%1, xm4, 1
> >+        %else
> >+            pshufd  xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
> >+            pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
> >+            pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
> >+        %endif
> >+    %endif
> >+    cvtdq2ps m%1, m%1
> >+    mulps m%1, m%1, m7 ; pd_65535_invf
> >+%endmacro
> >+
> >+%macro STORE16 2
> >+    mulps m%2, m%2, m5  ; [pd_65535f]
> >+    minps m%2, m%2, m5  ; [pd_65535f]
> >+    maxps m%2, m%2, m15 ; zero
> >+    cvttps2dq m%2, m%2
> >+    %if mmsize > 16
> >+        vextractf128 xm4, m%2, 1
> >+        pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
> >+        pshufb xm4,  xm7 ; [pb_hi_pack_shuffle16]
> >+        por xm%2, xm4
> >+    %else
> >+        pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
> >+        pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
> >+        pshufd  xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
> >+    %endif
> >+    mov ptrq, %1
> >+    %if mmsize > 16
> >+        movu [ptrq + xq*2], xm%2
> >+    %else
> >+        movsd [ptrq + xq*2], xm%2
> >+    %endif
> >+%endmacro
> >+
> >+; 1 - interp method
> >+; 2 - format_name
> >+; 3 - depth
> >+; 4 - is float format
> >+%macro DEFINE_INTERP_FUNC 4
> >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut,
> src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr,
> tmp, tmp2, tmp3
> >+    ; store lut max and lutsize
> >+    mov tmpd, dword [ctxq + LUT3DContext.lutsize]
> >+    cvtsi2ss xm0, tmpd
> >+    mulss xm0, xm0, [pd_3f]
> >+    VBROADCASTSS m0, xm0
> >+    mova lut3dsizem, m0
> >+    sub tmpd, 1
> >+    cvtsi2ss xm0, tmpd
> >+    VBROADCASTSS m0, xm0
> >+    mova lut3dmaxm, m0
> >+
> >+    ; scale_r
> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
> >+    VBROADCASTSS m1, xm1
> >+    mova scalerm, m1
> >+
> >+    ; scale_g
> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
> >+    VBROADCASTSS m1, xm1
> >+    mova scalegm, m1
> >+
> >+    ; scale_b
> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
> >+    VBROADCASTSS m1, xm1
> >+    mova scalebm, m1
> >+
> >+    ; store lutsize2
> >+    cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
> >+    mulss xm0, xm0, [pd_3f]
> >+    VBROADCASTSS m0, xm0
> >+    mova lut3dsize2m, m0
> >+
> >+    ; init prelut values
> >+    cmp prelutq, 0
> >+    je %%skip_init_prelut
> >+        mov tmpd, dword [prelutq + Lut3DPreLut.size]
> >+        sub tmpd, 1
> >+        cvtsi2ss xm0, tmpd
> >+        VBROADCASTSS m0, xm0
> >+        mova prelutmaxm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
> >+        mova prelutminrm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
> >+        mova prelutmingm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
> >+        mova prelutminbm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
> >+        mova prelutscalerm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
> >+        mova prelutscalegm, m0
> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
> >+        mova prelutscalebm, m0
> >+    %%skip_init_prelut:
> >+
> >+    mov widthd,  [src_imageq + AVFrame.width]
> >+
> >+    ; gbra pixel order
> >+    INIT_DATA_PTR srcrm, src_imageq, 2
> >+    INIT_DATA_PTR srcgm, src_imageq, 0
> >+    INIT_DATA_PTR srcbm, src_imageq, 1
> >+    INIT_DATA_PTR srcam, src_imageq, 3
> >+
> >+    INIT_DATA_PTR dstrm, dst_imageq, 2
> >+    INIT_DATA_PTR dstgm, dst_imageq, 0
> >+    INIT_DATA_PTR dstbm, dst_imageq, 1
> >+    INIT_DATA_PTR dstam, dst_imageq, 3
> >+
> >+    %%loop_y:
> >+        xor xq, xq
> >+        %%loop_x:
> >+            movu m14, [pd_1f]
> >+            xorps m15, m15, m15
> >+            %if %4 ; float
> >+                mov ptrq, srcrm
> >+                movu m0, [ptrq + xq*4]
> >+                mov ptrq, srcgm
> >+                movu m1, [ptrq + xq*4]
> >+                mov ptrq, srcbm
> >+                movu m2, [ptrq + xq*4]
> >+                SANITIZE_F m0
> >+                SANITIZE_F m1
> >+                SANITIZE_F m2
> >+            %else
> >+                ; constants for LOAD16
> >+                movu m7, [pd_65535_invf]
> >+                %if notcpuflag(avx2) && mmsize >= 32
> >+                    movu xm6, [pb_shuffle16]
> >+                %endif
> >+                LOAD16 0, srcrm
> >+                LOAD16 1, srcgm
> >+                LOAD16 2, srcbm
> >+            %endif
> >+
> >+            cmp prelutq, 0
> >+            je %%skip_prelut
> >+                mova m13, prelutmaxm
> >+                APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
> >+                APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
> >+                APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
> >+            %%skip_prelut:
> >+
> >+            mova m13, lut3dmaxm
> >+            APPLY_SCALE 0, scalerm
> >+            APPLY_SCALE 1, scalegm
> >+            APPLY_SCALE 2, scalebm
> >+
> >+            interp_%1
> >+
> >+            %if %4 ; float
> >+                mov ptrq, dstrm
> >+                movu [ptrq + xq*4], m0
> >+                mov ptrq, dstgm
> >+                movu [ptrq + xq*4], m1
> >+                mov ptrq, dstbm
> >+                movu [ptrq + xq*4], m2
> >+                cmp has_alphad, 0
> >+                je %%skip_alphaf
> >+                    mov ptrq, srcam
> >+                    movu m0, [ptrq + xq*4]
> >+                    mov ptrq, dstam
> >+                    movu [ptrq + xq*4], m0
> >+                %%skip_alphaf:
> >+            %else
> >+                ; constants for STORE16
> >+                movu m5,  [pd_65535f]
> >+                %if mmsize > 16
> >+                    movu xm6, [pb_lo_pack_shuffle16]
> >+                    movu xm7, [pb_hi_pack_shuffle16]
> >+                %endif
> >+
> >+                xorps m15, m15, m15
> >+                STORE16 dstrm, 0
> >+                STORE16 dstgm, 1
> >+                STORE16 dstbm, 2
> >+
> >+                cmp has_alphad, 0
> >+                je %%skip_alpha
> >+                    %if mmsize > 16
> >+                        mov ptrq, srcam
> >+                        movu xm0, [ptrq + xq*2]
> >+                        mov ptrq, dstam
> >+                        movu [ptrq + xq*2], xm0
> >+                    %else
> >+                        mov ptrq, srcam
> >+                        movsd xm0, [ptrq + xq*2]
> >+                        mov ptrq, dstam
> >+                        movsd [ptrq + xq*2], xm0
> >+                    %endif
> >+
> >+                %%skip_alpha:
> >+            %endif
> >+
> >+            add xq, mmsize/4
> >+            cmp xd, widthd
> >+            jl %%loop_x
> >+
> >+        INC_DATA_PTR srcrm, src_imageq, 2
> >+        INC_DATA_PTR srcgm, src_imageq, 0
> >+        INC_DATA_PTR srcbm, src_imageq, 1
> >+        INC_DATA_PTR srcam, src_imageq, 3
> >+
> >+        INC_DATA_PTR dstrm, dst_imageq, 2
> >+        INC_DATA_PTR dstgm, dst_imageq, 0
> >+        INC_DATA_PTR dstbm, dst_imageq, 1
> >+        INC_DATA_PTR dstam, dst_imageq, 3
> >+
> >+        inc slice_startd
> >+        cmp slice_startd, slice_endd
> >+        jl %%loop_y
> >+
> >+    RET
> >+%endmacro
> >+%if ARCH_X86_64
> >+    %if HAVE_AVX2_EXTERNAL
> >+        INIT_YMM avx2
> >+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
> >+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
> >+    %endif
> >+    %if HAVE_AVX_EXTERNAL
> >+        INIT_YMM avx
> >+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
> >+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
> >+    %endif
> >+    INIT_XMM sse2
> >+    DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
> >+    DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
> >+%endif
> >\ No newline at end of file
> >diff --git a/libavfilter/x86/vf_lut3d_init.c
> b/libavfilter/x86/vf_lut3d_init.c
> >new file mode 100644
> >index 0000000000..9b9b36e4af
> >--- /dev/null
> >+++ b/libavfilter/x86/vf_lut3d_init.c
> >@@ -0,0 +1,88 @@
> >+/*
> >+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
> >+ *
> >+ * This file is part of FFmpeg.
> >+ *
> >+ * FFmpeg is free software; you can redistribute it and/or
> >+ * modify it under the terms of the GNU Lesser General Public
> >+ * License as published by the Free Software Foundation; either
> >+ * version 2.1 of the License, or (at your option) any later version.
> >+ *
> >+ * FFmpeg is distributed in the hope that it will be useful,
> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >+ * Lesser General Public License for more details.
> >+ *
> >+ * You should have received a copy of the GNU Lesser General Public
> >+ * License along with FFmpeg; if not, write to the Free Software
> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> >+ */
> >+
> >+#include "libavutil/attributes.h"
> >+#include "libavutil/cpu.h"
> >+#include "libavutil/x86/cpu.h"
> >+#include "libavfilter/lut3d.h"
> >+
> >+#define DEFINE_INTERP_FUNC(name, format, opt)
>                                                                            \
> >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d,
> Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int
> slice_end, int has_alpha); \
> >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void
> *arg, int jobnr, int nb_jobs)
>   \
> >+{
>                                                                            \
> >+    LUT3DContext *lut3d = ctx->priv;
>
> \
> >+    Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL;
>                                                                            \
> >+    ThreadData *td = arg;
>                                                                            \
> >+    AVFrame *in  = td->in;
>
> \
> >+    AVFrame *out = td->out;
>                                                                            \
> >+    int has_alpha = in->linesize[3] && out != in;
>                                                                            \
> >+    int slice_start = (in->height *  jobnr   ) / nb_jobs;
>                                                                            \
> >+    int slice_end   = (in->height * (jobnr+1)) / nb_jobs;
>                                                                            \
> >+    ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out,
> slice_start, slice_end, has_alpha);
>          \
> >+    return 0;
>                                                                            \
> >+}
> >+
> >+#if ARCH_X86_64
> >+#if HAVE_AVX2_EXTERNAL
> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx2)
> >+#endif
> >+#if HAVE_AVX_EXTERNAL
> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx)
> >+#endif
> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  sse2)
> >+#endif
> >+
> >+
> >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor
> *desc)
> >+{
> >+    int cpu_flags = av_get_cpu_flags();
> >+    int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
> >+    int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
> >+    int depth = desc->comp[0].depth;
> >+
> >+#if ARCH_X86_64
> >+    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation ==
> INTERPOLATE_TETRAHEDRAL && planar) {
> >+#if HAVE_AVX2_EXTERNAL
> >+        if (isfloat && planar) {
> >+            s->interp = interp_tetrahedral_pf32_avx2;
> >+        } else if (depth == 16) {
> >+            s->interp = interp_tetrahedral_p16_avx2;
> >+        }
> >+#endif
> >+    } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation ==
> INTERPOLATE_TETRAHEDRAL && planar) {
> >+#if HAVE_AVX_EXTERNAL
> >+        if (isfloat) {
> >+            s->interp = interp_tetrahedral_pf32_avx;
> >+        } else if (depth == 16) {
> >+            s->interp = interp_tetrahedral_p16_avx;
> >+        }
> >+#endif
> >+    } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation ==
> INTERPOLATE_TETRAHEDRAL && planar) {
> >+        if (isfloat) {
> >+            s->interp = interp_tetrahedral_pf32_sse2;
> >+        } else if (depth == 16) {
> >+            s->interp = interp_tetrahedral_p16_sse2;
> >+        }
> >+    }
> >+#endif
> >+}
> >--
> >2.31.1.windows.1
> >
> >_______________________________________________
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>
Mark Reid Oct. 4, 2021, 3:49 a.m. UTC | #3
On Wed, Sep 29, 2021 at 10:27 AM Mark Reid <mindmark@gmail.com> wrote:

>
>
> On Tue, Sep 28, 2021 at 6:38 PM chen <chenm003@163.com> wrote:
>
>> Hello,
>>
>>
>> Excuse me, how about FMADD on AVX2 platform?
>>
>>
>> For example
>> +    mulps m7, m7, m14
>> +    addps m0, m0, m7
>>
>> ==>
>>
>>
>> fmadd231ps m0,m7,m14
>>
>>
> Interesting, does having AVX2 guarantee having FMA instructions?
>
>

I'm still not 100% certain that all AVX2 CPUs have FMA instructions, so I'll
add a cpuflags check for FMA too. I also came up with a faster way to
calculate x0, x1, x2 without the lookup table.
I will send a new patch.


>
>> Regards,
>> Min Chen
>>
>>
>>  2021-09-29 09:18:05,mindmark@gmail.com
>> >From: Mark Reid <mindmark@gmail.com>
>> >
>> >Only supports float and 16-bit planar formats at the moment.
>> >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer
>> some
>> >speed gains.
>> >
>> >f32 1920x1080 1 thread with prelut
>> >c impl
>> >1389936500 UNITS in lut3d->interp,       1 runs,      0 skips
>> >1425800240 UNITS in lut3d->interp,       2 runs,      0 skips
>> >1433312777 UNITS in lut3d->interp,       4 runs,      0 skips
>> >1443346798 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >sse2
>> >948662320 UNITS in lut3d->interp,       1 runs,      0 skips
>> >1101247540 UNITS in lut3d->interp,       2 runs,      0 skips
>> >1050645695 UNITS in lut3d->interp,       4 runs,      0 skips
>> >1041102937 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >avx
>> >633837000 UNITS in lut3d->interp,       1 runs,      0 skips
>> >669452850 UNITS in lut3d->interp,       2 runs,      0 skips
>> >650716580 UNITS in lut3d->interp,       4 runs,      0 skips
>> >644698550 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >avx2
>> >354940020 UNITS in lut3d->interp,       1 runs,      0 skips
>> >362384340 UNITS in lut3d->interp,       2 runs,      0 skips
>> >356799020 UNITS in lut3d->interp,       4 runs,      0 skips
>> >357276815 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >gbrap16 1920x1080 1 thread with prelut
>> >c impl
>> >1445071160 UNITS in lut3d->interp,       1 runs,      0 skips
>> >1477959120 UNITS in lut3d->interp,       2 runs,      0 skips
>> >1472102670 UNITS in lut3d->interp,       4 runs,      0 skips
>> >1462579330 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >sse2
>> >1035437580 UNITS in lut3d->interp,       1 runs,      0 skips
>> >1050139710 UNITS in lut3d->interp,       2 runs,      0 skips
>> >1070147205 UNITS in lut3d->interp,       4 runs,      0 skips
>> >1064583037 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >avx
>> >678089880 UNITS in lut3d->interp,       1 runs,      0 skips
>> >679112485 UNITS in lut3d->interp,       2 runs,      0 skips
>> >695527212 UNITS in lut3d->interp,       4 runs,      0 skips
>> >691300053 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >avx2
>> >372671340 UNITS in lut3d->interp,       1 runs,      0 skips
>> >373449870 UNITS in lut3d->interp,       2 runs,      0 skips
>> >383725625 UNITS in lut3d->interp,       4 runs,      0 skips
>> >382860848 UNITS in lut3d->interp,       8 runs,      0 skips
>> >
>> >---
>> > libavfilter/lut3d.h             |  83 ++++
>> > libavfilter/vf_lut3d.c          |  61 +--
>> > libavfilter/x86/Makefile        |   2 +
>> > libavfilter/x86/vf_lut3d.asm    | 757 ++++++++++++++++++++++++++++++++
>> > libavfilter/x86/vf_lut3d_init.c |  88 ++++
>> > 5 files changed, 935 insertions(+), 56 deletions(-)
>> > create mode 100644 libavfilter/lut3d.h
>> > create mode 100644 libavfilter/x86/vf_lut3d.asm
>> > create mode 100644 libavfilter/x86/vf_lut3d_init.c
>> >
>> >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
>> >new file mode 100644
>> >index 0000000000..ded2a036a5
>> >--- /dev/null
>> >+++ b/libavfilter/lut3d.h
>> >@@ -0,0 +1,83 @@
>> >+/*
>> >+ * Copyright (c) 2013 Clément Bœsch
>> >+ * Copyright (c) 2018 Paul B Mahol
>> >+ *
>> >+ * This file is part of FFmpeg.
>> >+ *
>> >+ * FFmpeg is free software; you can redistribute it and/or
>> >+ * modify it under the terms of the GNU Lesser General Public
>> >+ * License as published by the Free Software Foundation; either
>> >+ * version 2.1 of the License, or (at your option) any later version.
>> >+ *
>> >+ * FFmpeg is distributed in the hope that it will be useful,
>> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> >+ * Lesser General Public License for more details.
>> >+ *
>> >+ * You should have received a copy of the GNU Lesser General Public
>> >+ * License along with FFmpeg; if not, write to the Free Software
>> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> >+ */
>> >+#ifndef AVFILTER_LUT3D_H
>> >+#define AVFILTER_LUT3D_H
>> >+
>> >+#include "libavutil/pixdesc.h"
>> >+#include "framesync.h"
>> >+#include "avfilter.h"
>> >+
>> >+enum interp_mode {
>> >+    INTERPOLATE_NEAREST,
>> >+    INTERPOLATE_TRILINEAR,
>> >+    INTERPOLATE_TETRAHEDRAL,
>> >+    INTERPOLATE_PYRAMID,
>> >+    INTERPOLATE_PRISM,
>> >+    NB_INTERP_MODE
>> >+};
>> >+
>> >+struct rgbvec {
>> >+    float r, g, b;
>> >+};
>> >+
>> >+/* 3D LUT don't often go up to level 32, but it is common to have a
>> Hald CLUT
>> >+ * of 512x512 (64x64x64) */
>> >+#define MAX_LEVEL 256
>> >+#define PRELUT_SIZE 65536
>> >+
>> >+typedef struct Lut3DPreLut {
>> >+    int size;
>> >+    float min[3];
>> >+    float max[3];
>> >+    float scale[3];
>> >+    float* lut[3];
>> >+} Lut3DPreLut;
>> >+
>> >+typedef struct LUT3DContext {
>> >+    const AVClass *class;
>> >+    struct rgbvec *lut;
>> >+    int lutsize;
>> >+    int lutsize2;
>> >+    struct rgbvec scale;
>> >+    int interpolation;          ///<interp_mode
>> >+    char *file;
>> >+    uint8_t rgba_map[4];
>> >+    int step;
>> >+    avfilter_action_func *interp;
>> >+    Lut3DPreLut prelut;
>> >+#if CONFIG_HALDCLUT_FILTER
>> >+    uint8_t clut_rgba_map[4];
>> >+    int clut_step;
>> >+    int clut_bits;
>> >+    int clut_planar;
>> >+    int clut_float;
>> >+    int clut_width;
>> >+    FFFrameSync fs;
>> >+#endif
>> >+} LUT3DContext;
>> >+
>> >+typedef struct ThreadData {
>> >+    AVFrame *in, *out;
>> >+} ThreadData;
>> >+
>> >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
>> >+
>> >+#endif /* AVFILTER_LUT3D_H */
>> >\ No newline at end of file
>> >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
>> >index 9fbda833b9..1fd0af06db 100644
>> >--- a/libavfilter/vf_lut3d.c
>> >+++ b/libavfilter/vf_lut3d.c
>> >@@ -31,73 +31,18 @@
>> > #include "libavutil/intreadwrite.h"
>> > #include "libavutil/intfloat.h"
>> > #include "libavutil/avassert.h"
>> >-#include "libavutil/pixdesc.h"
>> > #include "libavutil/avstring.h"
>> >-#include "avfilter.h"
>> > #include "drawutils.h"
>> > #include "formats.h"
>> >-#include "framesync.h"
>> > #include "internal.h"
>> > #include "video.h"
>> >+#include "lut3d.h"
>> >
>> > #define R 0
>> > #define G 1
>> > #define B 2
>> > #define A 3
>> >
>> >-enum interp_mode {
>> >-    INTERPOLATE_NEAREST,
>> >-    INTERPOLATE_TRILINEAR,
>> >-    INTERPOLATE_TETRAHEDRAL,
>> >-    INTERPOLATE_PYRAMID,
>> >-    INTERPOLATE_PRISM,
>> >-    NB_INTERP_MODE
>> >-};
>> >-
>> >-struct rgbvec {
>> >-    float r, g, b;
>> >-};
>> >-
>> >-/* 3D LUT don't often go up to level 32, but it is common to have a
>> Hald CLUT
>> >- * of 512x512 (64x64x64) */
>> >-#define MAX_LEVEL 256
>> >-#define PRELUT_SIZE 65536
>> >-
>> >-typedef struct Lut3DPreLut {
>> >-    int size;
>> >-    float min[3];
>> >-    float max[3];
>> >-    float scale[3];
>> >-    float* lut[3];
>> >-} Lut3DPreLut;
>> >-
>> >-typedef struct LUT3DContext {
>> >-    const AVClass *class;
>> >-    int interpolation;          ///<interp_mode
>> >-    char *file;
>> >-    uint8_t rgba_map[4];
>> >-    int step;
>> >-    avfilter_action_func *interp;
>> >-    struct rgbvec scale;
>> >-    struct rgbvec *lut;
>> >-    int lutsize;
>> >-    int lutsize2;
>> >-    Lut3DPreLut prelut;
>> >-#if CONFIG_HALDCLUT_FILTER
>> >-    uint8_t clut_rgba_map[4];
>> >-    int clut_step;
>> >-    int clut_bits;
>> >-    int clut_planar;
>> >-    int clut_float;
>> >-    int clut_width;
>> >-    FFFrameSync fs;
>> >-#endif
>> >-} LUT3DContext;
>> >-
>> >-typedef struct ThreadData {
>> >-    AVFrame *in, *out;
>> >-} ThreadData;
>> >-
>> > #define OFFSET(x) offsetof(LUT3DContext, x)
>> > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
>> > #define TFLAGS
>> AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
>> >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink)
>> >         av_assert0(0);
>> >     }
>> >
>> >+    if (ARCH_X86) {
>> >+        ff_lut3d_init_x86(lut3d, desc);
>> >+    }
>> >+
>> >     return 0;
>> > }
>> >
>> >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>> >index 016a5b3511..a29941eaeb 100644
>> >--- a/libavfilter/x86/Makefile
>> >+++ b/libavfilter/x86/Makefile
>> >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER)                 +=
>> x86/vf_hqdn3d_init.o
>> > OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
>> > OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
>> > OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
>> >+OBJS-$(CONFIG_LUT3D_FILTER)                  += x86/vf_lut3d_init.o
>> > OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            +=
>> x86/vf_maskedclamp_init.o
>> > OBJS-$(CONFIG_MASKEDMERGE_FILTER)            +=
>> x86/vf_maskedmerge_init.o
>> > OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
>> >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          +=
>> x86/vf_hqdn3d.o
>> > X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
>> > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
>> > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
>> >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER)           += x86/vf_lut3d.o
>> > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o
>> > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
>> > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
>> >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
>> >new file mode 100644
>> >index 0000000000..b3d7c3962b
>> >--- /dev/null
>> >+++ b/libavfilter/x86/vf_lut3d.asm
>> >@@ -0,0 +1,757 @@
>>
>> >+;*****************************************************************************
>> >+;* x86-optimized functions for lut3d filter
>> >+;*
>> >+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
>> >+;*
>> >+;* This file is part of FFmpeg.
>> >+;*
>> >+;* FFmpeg is free software; you can redistribute it and/or
>> >+;* modify it under the terms of the GNU Lesser General Public
>> >+;* License as published by the Free Software Foundation; either
>> >+;* version 2.1 of the License, or (at your option) any later version.
>> >+;*
>> >+;* FFmpeg is distributed in the hope that it will be useful,
>> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> >+;* Lesser General Public License for more details.
>> >+;*
>> >+;* You should have received a copy of the GNU Lesser General Public
>> >+;* License along with FFmpeg; if not, write to the Free Software
>> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>>
>> >+;******************************************************************************
>> >+
>> >+%include "libavutil/x86/x86util.asm"
>> >+
>> >+SECTION_RODATA
>> >+pd_1f:  times 8 dd 1.0
>> >+pd_3f:  times 8 dd 3.0
>> >+
>> >+; used to limit rshifts as they are more expensive in avx1
>> >+pd_001: times 8 dd 001b
>> >+pd_010: times 8 dd 010b
>> >+pd_100: times 8 dd 100b
>> >+
>> >+pd_65535f:     times 8 dd 65535.0
>> >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
>> >+
>> >+pb_shuffle16:         db    0,    1, 0x80, 0x80, \
>> >+                            2,    3, 0x80, 0x80, \
>> >+                            4,    5, 0x80, 0x80, \
>> >+                            6,    7, 0x80, 0x80
>> >+
>> >+pb_lo_pack_shuffle16: db    0,    1,    4,    5, \
>> >+                            8,    9,   12,   13, \
>> >+                         0x80, 0x80, 0x80, 0x80, \
>> >+                         0x80, 0x80, 0x80, 0x80
>> >+
>> >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
>> >+                         0x80, 0x80, 0x80, 0x80, \
>> >+                            0,    1,    4,    5, \
>> >+                            8,    9,   12,   13
>> >+
>> >+; tetrahedral table --------------------------------------------
>> >+; name:          x2|        x1|        x0|       cxxb|     cxxa
>> >+; values:      r 00|     r  00|     r  00|   c011 011| c001 001
>> >+;              g 01|     g  01|     g  01|   c101 101| c010 010
>> >+;              b 10|     b  10|     b  10|   c110 110| c100 100
>> >+
>> >+; g>b                                 b |          g |          r |
>>     c110 | c100
>> >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) |
>> (110b << 3) | 100b
>> >+; r>b                                 g |          b |          r |
>>     c101 | c100
>> >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) |
>> (101b << 3) | 100b
>> >+; else                                g |          r |          b |
>>     c101 | c001
>> >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) |
>> (101b << 3) | 001b
>> >+; b>g                                 r |          g |          b |
>>     c011 | c001
>> >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) |
>> (011b << 3) | 001b
>> >+; b>r                                 r |          b |          g |
>>     c011 | c010
>> >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) |
>> (011b << 3) | 010b
>> >+; else                                b |          r |          g |
>>     c110 | c010
>> >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) |
>> (110b << 3) | 010b
>> >+
>> >+SECTION .text
>> >+
>> >+struc Lut3DPreLut
>> >+    .size:    resd 1
>> >+    .min:     resd 3
>> >+    .max:     resd 3
>> >+    .scale:   resd 3
>> >+    .lut:     resq 3
>> >+endstruc
>> >+
>> >+struc LUT3DContext
>> >+    .class:        resq 1
>> >+    .lut:          resq 1
>> >+    .lutsize:      resd 1
>> >+    .lutsize2:     resd 1
>> >+    .scale:        resd 3
>> >+endstruc
>> >+
>> >+%define AV_NUM_DATA_POINTERS 8
>> >+
>> >+struc AVFrame
>> >+    .data:          resq AV_NUM_DATA_POINTERS
>> >+    .linesize:      resd AV_NUM_DATA_POINTERS
>> >+    .extended_data: resq 1
>> >+    .width:         resd 1
>> >+    .height:        resd 1
>> >+endstruc
>> >+
>> >+%define rm   rsp
>> >+%define gm   rsp+mmsize
>> >+%define bm   rsp+(mmsize*2)
>> >+
>> >+%define lut3dsizem  [rsp+mmsize*3]
>> >+%define lut3dsize2m [rsp+mmsize*4]
>> >+%define lut3dmaxm   [rsp+mmsize*5]
>> >+%define prelutmaxm  [rsp+mmsize*6]
>> >+
>> >+%define scalerm [rsp+mmsize*7]
>> >+%define scalegm [rsp+mmsize*8]
>> >+%define scalebm [rsp+mmsize*9]
>> >+
>> >+%define prelutminrm [rsp+mmsize*10]
>> >+%define prelutmingm [rsp+mmsize*11]
>> >+%define prelutminbm [rsp+mmsize*12]
>> >+
>> >+%define prelutscalerm [rsp+mmsize*13]
>> >+%define prelutscalegm [rsp+mmsize*14]
>> >+%define prelutscalebm [rsp+mmsize*15]
>> >+
>> >+; data pointers
>> >+%define srcrm [rsp+mmsize*16 +  0]
>> >+%define srcgm [rsp+mmsize*16 +  8]
>> >+%define srcbm [rsp+mmsize*16 + 16]
>> >+%define srcam [rsp+mmsize*16 + 24]
>> >+
>> >+%define dstrm [rsp+mmsize*16 + 32]
>> >+%define dstgm [rsp+mmsize*16 + 40]
>> >+%define dstbm [rsp+mmsize*16 + 48]
>> >+%define dstam [rsp+mmsize*16 + 56]
>> >+
>> >+%macro FETCH_PRELUT_PN 3
>> >+    mov tmp2d, [rm + %3]
>> >+    mov tmp3d, [gm + %3]
>> >+    movss xm%1, [tmpq + tmp2q*4]
>> >+    movss xm%2, [tmpq + tmp3q*4]
>> >+    movss [rm + %3], xm%1
>> >+    movss [gm + %3], xm%2
>> >+%endmacro
>> >+
>> >+; 1 - p
>> >+; 2 - n
>> >+; 3 - p indices
>> >+; 4 - n indices
>> >+%macro GATHER_PRELUT 4
>> >+    %if cpuflag(avx2)
>> >+        vpcmpeqb m7, m7
>> >+        vgatherdps m%1, [tmpq + m%3*4], m7 ; p
>> >+        vpcmpeqb m9, m9
>> >+        vgatherdps m%2, [tmpq + m%4*4], m9 ; n
>> >+    %else
>> >+        mova [rm], m%3
>> >+        mova [gm], m%4
>> >+        FETCH_PRELUT_PN %1, %2, 0
>> >+        FETCH_PRELUT_PN %1, %2, 4
>> >+        FETCH_PRELUT_PN %1, %2, 8
>> >+        FETCH_PRELUT_PN %1, %2, 12
>> >+    %if mmsize > 16
>> >+        FETCH_PRELUT_PN %1, %2, 16
>> >+        FETCH_PRELUT_PN %1, %2, 20
>> >+        FETCH_PRELUT_PN %1, %2, 24
>> >+        FETCH_PRELUT_PN %1, %2, 28
>> >+    %endif
>> >+        movu m%1, [rm]
>> >+        movu m%2, [gm]
>> >+    %endif
>> >+%endmacro
>> >+
>> >+%macro FLOORPS 2
>> >+    %if mmsize > 16
>> >+        vroundps %1, %2, 0x01
>> >+    %else
>> >+        cvttps2dq %1, %2
>> >+        cvtdq2ps  %1, %1
>> >+    %endif
>> >+%endmacro
>> >+
>> >+; 1 - dst
>> >+; 2 - index
>> >+; 3 - min
>> >+; 4 - scale
>> >+; assumes lut max m13, m14 1.0f, zero m15
>> >+%macro APPLY_PRELUT 4
>> >+    ; scale
>> >+    subps m5, m%1, %3 ; v - min
>> >+    mulps m5, m5, %4  ; v * scale
>> >+    ; clamp
>> >+    maxps m5, m5, m15 ; max zero
>> >+    minps m5, m5, m13 ; min lut max
>> >+
>> >+    FLOORPS m3, m5    ; prev index
>> >+    subps m5, m5, m3  ; d
>> >+    addps m4, m3, m14 ; p+1 = n index
>> >+    minps m4, m4, m13 ; clamp n idex
>> >+
>> >+    mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
>> >+    cvttps2dq m6, m3
>> >+    cvttps2dq m10, m4
>> >+    GATHER_PRELUT 3, 4, 6, 10
>> >+
>> >+    ; lerp
>> >+    subps m8, m4, m3
>> >+    mulps m8, m8, m5
>> >+    addps m%1, m8, m3
>> >+%endmacro
>> >+
>> >+; 1 - dst
>> >+; 2 - scale
>> >+; assumes lut max m13, zero m15
>> >+%macro APPLY_SCALE 2
>> >+   mulps m%1, m%1, %2
>> >+   maxps m%1, m%1, m15
>> >+   minps m%1, m%1, m13
>> >+%endmacro
>> >+
>> >+%macro BLEND 4
>> >+%if mmsize > 16
>> >+    vblendvps %1, %2, %3, %4
>> >+%else
>> >+    %ifidni %1,%2
>> >+        %error operand 1 must not equal operand 2
>> >+    %endif
>> >+    %ifidni %1,%3
>> >+        %error operand 1 must not equal operand 3
>> >+    %endif
>> >+    mova  %1, %2
>> >+    xorps %1, %3
>> >+    andps %1, %4
>> >+    xorps %1, %2
>> >+%endif
>> >+%endmacro
>> >+
>> >+; sets NaNs to zero; +inf and -inf are handled later by the min/max clamps
>> >+%macro SANITIZE_F 1
>> >+    cmpps m5, %1, %1, 0x0 ; nan == nan = False
>> >+    %if mmsize <= 16
>> >+        mova m6, %1
>> >+        BLEND %1, m15, m6, m5
>> >+    %else
>> >+        BLEND %1, m15, %1, m5
>> >+    %endif
>> >+%endmacro
>> >+
>> >+%macro ADD3 4
>> >+    addps %1, %2, %3
>> >+    addps %1, %1, %4
>> >+%endmacro
>> >+
>> >+%macro CMP_EQUAL 3
>> >+%if cpuflag(avx2)
>> >+    vpcmpeqd %1, %2, %3
>> >+%elif cpuflag(avx)
>> >+    cmpps %1, %2, %3, 0x0
>> >+%else
>> >+    pcmpeqd %1, %2, %3
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro SHIFT_RIGHT 2
>> >+%if mmsize <= 16
>> >+    psrld xm%1, %2
>> >+%elif cpuflag(avx2)
>> >+    vpsrld m%1, m%1, %2
>> >+%else
>> >+    vextractf128 xm15, m%1, 1
>> >+    psrld xm%1, %2
>> >+    psrld xm15, %2
>> >+    vinsertf128 m%1, m%1, xm15, 1
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro FETCH_LUT3D_RGB 4
>> >+    mov tmp2d, [rm + %4]
>> >+    movss xm%1, [tmpq + tmp2q*4 + 0]
>> >+    movss xm%2, [tmpq + tmp2q*4 + 4]
>> >+    movss xm%3, [tmpq + tmp2q*4 + 8]
>> >+    movss [rm + %4], xm%1
>> >+    movss [gm + %4], xm%2
>> >+    movss [bm + %4], xm%3
>> >+%endmacro
>> >+
>> >+; 1 - dstr
>> >+; 2 - dstg
>> >+; 3 - dstb
>> >+; 4 - indices
>> >+%macro GATHER_LUT3D_INDICES 4
>> >+%if cpuflag(avx2)
>> >+    vpcmpeqb m3, m3
>> >+    vgatherdps m%1, [tmpq + m%4*4 + 0], m3
>> >+    vpcmpeqb m14, m14
>> >+    vgatherdps m%2, [tmpq + m%4*4 + 4], m14
>> >+    vpcmpeqb m15, m15
>> >+    vgatherdps m%3, [tmpq + m%4*4 + 8], m15
>> >+%else
>> >+    movu [rm], m%4
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 0
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 4
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 8
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 12
>> >+%if mmsize > 16
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 16
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 20
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 24
>> >+    FETCH_LUT3D_RGB %1, %2, %3, 28
>> >+%endif
>> >+    movu m%1, [rm]
>> >+    movu m%2, [gm]
>> >+    movu m%3, [bm]
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro interp_tetrahedral 0
>> >+    %define d_r m0
>> >+    %define d_g m1
>> >+    %define d_b m2
>> >+
>> >+    %define prev_r m3
>> >+    %define prev_g m4
>> >+    %define prev_b m5
>> >+
>> >+    %define next_r m6
>> >+    %define next_g m7
>> >+    %define next_b m8
>> >+
>> >+    %define x0 m4
>> >+    %define x1 m5
>> >+    %define x2 m6
>> >+
>> >+    ; setup prev index
>> >+    FLOORPS prev_r, m0
>> >+    FLOORPS prev_g, m1
>> >+    FLOORPS prev_b, m2
>> >+
>> >+    ; setup deltas
>> >+    subps d_r, m0, prev_r
>> >+    subps d_g, m1, prev_g
>> >+    subps d_b, m2, prev_b
>> >+
>> >+    ; calculate select mask m9
>> >+    movu m6, [pd_tetra_table2]
>> >+    cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
>> >+    BLEND m10, m6, [pd_tetra_table1], m7
>> >+    cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
>> >+    BLEND m6, m10, [pd_tetra_table0], m7
>> >+
>> >+    movu m10, [pd_tetra_table5]
>> >+    cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
>> >+    BLEND m9, m10, [pd_tetra_table4], m7
>> >+    cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
>> >+    BLEND m10, m9, [pd_tetra_table3], m7
>> >+
>> >+    cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
>> >+    BLEND m9, m10, m6, m7
>> >+
>> >+    ; setup next index
>> >+    addps next_r, prev_r, m14 ; +1
>> >+    minps next_r, next_r, m13 ; clamp lutmax
>> >+
>> >+    addps next_g, prev_g, m14 ; +1
>> >+    minps next_g, next_g, m13 ; clamp lutmax
>> >+
>> >+    addps next_b, prev_b, m14 ; +1
>> >+    minps next_b, next_b, m13 ; clamp lutmax
>> >+
>> >+    ; prescale indices
>> >+    mulps prev_r, prev_r, lut3dsize2m
>> >+    mulps next_r, next_r, lut3dsize2m
>> >+
>> >+    mulps prev_g, prev_g, lut3dsizem
>> >+    mulps next_g, next_g, lut3dsizem
>> >+
>> >+    mulps prev_b, prev_b, [pd_3f]
>> >+    mulps next_b, next_b, [pd_3f]
>> >+
>> >+    movu m14, [pd_001]
>> >+
>> >+    ; cxxa m10
>> >+    ; b
>> >+    andps m15, m9, m14
>> >+    CMP_EQUAL m15, m15, m14
>> >+    BLEND m10, prev_b, next_b, m15
>> >+
>> >+    ; g
>> >+    andps m15, m9, [pd_010]
>> >+    CMP_EQUAL m15, m15, [pd_010]
>> >+    BLEND m12, prev_g, next_g, m15
>> >+
>> >+    ; r
>> >+    andps m15, m9, [pd_100]
>> >+    CMP_EQUAL m15, m15, [pd_100]
>> >+    BLEND m13, prev_r, next_r, m15
>> >+
>> >+    ADD3 m10, m10, m12, m13
>> >+
>> >+    SHIFT_RIGHT 9, 3 ; 3
>> >+
>> >+    ; cxxb m11;
>> >+    ; b
>> >+    andps m15, m9, m14
>> >+    CMP_EQUAL m15, m15, m14
>> >+    BLEND m11, prev_b, next_b, m15
>> >+
>> >+    ; g
>> >+    andps m15, m9, [pd_010]
>> >+    CMP_EQUAL m15, m15, [pd_010]
>> >+    BLEND m12, prev_g, next_g, m15
>> >+
>> >+    ; r
>> >+    andps m15, m9, [pd_100]
>> >+    CMP_EQUAL m15, m15, [pd_100]
>> >+    BLEND m13, prev_r, next_r, m15
>> >+
>> >+    ADD3 m11, m11, m12, m13
>> >+
>> >+    ; c000 m12;
>> >+    ADD3 m12, prev_r, prev_g, prev_b
>> >+
>> >+    ; c111 m13;
>> >+    ADD3 m13, next_r, next_g, next_b
>> >+
>> >+    SHIFT_RIGHT 9, 3 ; 6
>> >+
>> >+    ; x0, m4
>> >+    andps m15, m9, m14
>> >+    CMP_EQUAL m15, m15, m14
>> >+    BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+    andps m15, m9, [pd_010]
>> >+    CMP_EQUAL m15, m15, [pd_010]
>> >+    BLEND x0, m7, d_b, m15 ; b
>> >+
>> >+    ; x1, m5
>> >+    andps m15, m9, [pd_100]
>> >+    CMP_EQUAL m15, m15, [pd_100]
>> >+    BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+    SHIFT_RIGHT 9, 3 ; 9
>> >+
>> >+    andps m15, m9, m14
>> >+    CMP_EQUAL m15, m15, m14
>> >+    BLEND x1, m7, d_b, m15 ; b
>> >+
>> >+    ; x2, m6
>> >+    andps m15, m9, [pd_010]
>> >+    CMP_EQUAL m15, m15, [pd_010]
>> >+    BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+    andps m15, m9, [pd_100]
>> >+    CMP_EQUAL m15, m15, [pd_100]
>> >+    BLEND x2, m7, d_b, m15 ; b
>> >+
>> >+    ; convert indices to integer
>> >+    cvttps2dq m12, m12
>> >+    cvttps2dq m10, m10
>> >+    cvttps2dq m11, m11
>> >+    cvttps2dq m13, m13
>> >+
>> >+    ; now the gathering festival
>> >+    mov tmpq, [ctxq + LUT3DContext.lut]
>> >+
>> >+    GATHER_LUT3D_INDICES 0, 1, 2, 12
>> >+    movu m14, [pd_1f]
>> >+    subps m14, m14, x0; 1 - x0
>> >+
>> >+    mulps m0, m0, m14
>> >+    mulps m1, m1, m14
>> >+    mulps m2, m2, m14
>> >+
>> >+    GATHER_LUT3D_INDICES 7, 8, 9, 10
>> >+    subps m14, x0, x1; x0 - x1
>> >+    mulps m7, m7, m14
>> >+    addps m0, m0, m7
>> >+
>> >+    mulps m8, m8, m14
>> >+    addps m1, m1, m8
>> >+
>> >+    mulps m9, m9, m14
>> >+    addps m2, m2, m9
>> >+
>> >+    GATHER_LUT3D_INDICES 7, 8, 9, 11
>> >+    subps m14, x1, x2; x1 - x2
>> >+
>> >+    mulps m7, m7, m14
>> >+    addps m0, m0, m7
>> >+
>> >+    mulps m8, m8, m14
>> >+    addps m1, m1, m8
>> >+
>> >+    mulps m9, m9, m14
>> >+    addps m2, m2, m9
>> >+
>> >+    GATHER_LUT3D_INDICES 7, 8, 9, 13
>> >+    mulps m7, m7, x2
>> >+    addps m0, m0, m7
>> >+
>> >+    mulps m8, m8, x2
>> >+    addps m1, m1, m8
>> >+
>> >+    mulps m9, m9, x2
>> >+    addps m2, m2, m9
>> >+%endmacro
>> >+
>> >+%macro INIT_DATA_PTR 3
>> >+    mov ptrq, [%2 + AVFrame.data     + %3 * 8]
>> >+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>> >+    imul tmpd, slice_startd
>> >+    add ptrq, tmpq
>> >+    mov %1, ptrq
>> >+%endmacro
>> >+
>> >+%macro INC_DATA_PTR 3
>> >+    mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>> >+    mov ptrq, %1
>> >+    add ptrq, tmpq
>> >+    mov %1, ptrq
>> >+%endmacro
>> >+
>> >+%macro LOAD16 2
>> >+    mov ptrq, %2
>> >+    %if mmsize > 16
>> >+        movu xm%1, [ptrq + xq*2]
>> >+    %else
>> >+        movsd xm%1, [ptrq + xq*2]
>> >+    %endif
>> >+    %if cpuflag(avx2)
>> >+        vpmovzxwd m%1, xm%1
>> >+    %else
>> >+        %if mmsize > 16
>> >+            pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
>> >+            pshufb xm%1, xm6 ; pb_shuffle16
>> >+            pshufb xm4,  xm6 ; pb_shuffle16
>> >+            vinsertf128 m%1, m%1, xm4, 1
>> >+        %else
>> >+            pshufd  xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
>> >+            pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+            pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+        %endif
>> >+    %endif
>> >+    cvtdq2ps m%1, m%1
>> >+    mulps m%1, m%1, m7 ; pd_65535_invf
>> >+%endmacro
>> >+
>> >+%macro STORE16 2
>> >+    mulps m%2, m%2, m5  ; [pd_65535f]
>> >+    minps m%2, m%2, m5  ; [pd_65535f]
>> >+    maxps m%2, m%2, m15 ; zero
>> >+    cvttps2dq m%2, m%2
>> >+    %if mmsize > 16
>> >+        vextractf128 xm4, m%2, 1
>> >+        pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
>> >+        pshufb xm4,  xm7 ; [pb_hi_pack_shuffle16]
>> >+        por xm%2, xm4
>> >+    %else
>> >+        pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+        pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+        pshufd  xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
>> >+    %endif
>> >+    mov ptrq, %1
>> >+    %if mmsize > 16
>> >+        movu [ptrq + xq*2], xm%2
>> >+    %else
>> >+        movsd [ptrq + xq*2], xm%2
>> >+    %endif
>> >+%endmacro
>> >+
>> >+; 1 - interp method
>> >+; 2 - format_name
>> >+; 3 - depth
>> >+; 4 - is float format
>> >+%macro DEFINE_INTERP_FUNC 4
>> >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut,
>> src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr,
>> tmp, tmp2, tmp3
>> >+    ; store lut max and lutsize
>> >+    mov tmpd, dword [ctxq + LUT3DContext.lutsize]
>> >+    cvtsi2ss xm0, tmpd
>> >+    mulss xm0, xm0, [pd_3f]
>> >+    VBROADCASTSS m0, xm0
>> >+    mova lut3dsizem, m0
>> >+    sub tmpd, 1
>> >+    cvtsi2ss xm0, tmpd
>> >+    VBROADCASTSS m0, xm0
>> >+    mova lut3dmaxm, m0
>> >+
>> >+    ; scale_r
>> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
>> >+    VBROADCASTSS m1, xm1
>> >+    mova scalerm, m1
>> >+
>> >+    ; scale_g
>> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
>> >+    VBROADCASTSS m1, xm1
>> >+    mova scalegm, m1
>> >+
>> >+    ; scale_b
>> >+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
>> >+    VBROADCASTSS m1, xm1
>> >+    mova scalebm, m1
>> >+
>> >+    ; store lutsize2
>> >+    cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
>> >+    mulss xm0, xm0, [pd_3f]
>> >+    VBROADCASTSS m0, xm0
>> >+    mova lut3dsize2m, m0
>> >+
>> >+    ; init prelut values
>> >+    cmp prelutq, 0
>> >+    je %%skip_init_prelut
>> >+        mov tmpd, dword [prelutq + Lut3DPreLut.size]
>> >+        sub tmpd, 1
>> >+        cvtsi2ss xm0, tmpd
>> >+        VBROADCASTSS m0, xm0
>> >+        mova prelutmaxm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
>> >+        mova prelutminrm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
>> >+        mova prelutmingm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
>> >+        mova prelutminbm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
>> >+        mova prelutscalerm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
>> >+        mova prelutscalegm, m0
>> >+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
>> >+        mova prelutscalebm, m0
>> >+    %%skip_init_prelut:
>> >+
>> >+    mov widthd,  [src_imageq + AVFrame.width]
>> >+
>> >+    ; gbra pixel order
>> >+    INIT_DATA_PTR srcrm, src_imageq, 2
>> >+    INIT_DATA_PTR srcgm, src_imageq, 0
>> >+    INIT_DATA_PTR srcbm, src_imageq, 1
>> >+    INIT_DATA_PTR srcam, src_imageq, 3
>> >+
>> >+    INIT_DATA_PTR dstrm, dst_imageq, 2
>> >+    INIT_DATA_PTR dstgm, dst_imageq, 0
>> >+    INIT_DATA_PTR dstbm, dst_imageq, 1
>> >+    INIT_DATA_PTR dstam, dst_imageq, 3
>> >+
>> >+    %%loop_y:
>> >+        xor xq, xq
>> >+        %%loop_x:
>> >+            movu m14, [pd_1f]
>> >+            xorps m15, m15, m15
>> >+            %if %4 ; float
>> >+                mov ptrq, srcrm
>> >+                movu m0, [ptrq + xq*4]
>> >+                mov ptrq, srcgm
>> >+                movu m1, [ptrq + xq*4]
>> >+                mov ptrq, srcbm
>> >+                movu m2, [ptrq + xq*4]
>> >+                SANITIZE_F m0
>> >+                SANITIZE_F m1
>> >+                SANITIZE_F m2
>> >+            %else
>> >+                ; constants for LOAD16
>> >+                movu m7, [pd_65535_invf]
>> >+                %if notcpuflag(avx2) && mmsize >= 32
>> >+                    movu xm6, [pb_shuffle16]
>> >+                %endif
>> >+                LOAD16 0, srcrm
>> >+                LOAD16 1, srcgm
>> >+                LOAD16 2, srcbm
>> >+            %endif
>> >+
>> >+            cmp prelutq, 0
>> >+            je %%skip_prelut
>> >+                mova m13, prelutmaxm
>> >+                APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
>> >+                APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
>> >+                APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
>> >+            %%skip_prelut:
>> >+
>> >+            mova m13, lut3dmaxm
>> >+            APPLY_SCALE 0, scalerm
>> >+            APPLY_SCALE 1, scalegm
>> >+            APPLY_SCALE 2, scalebm
>> >+
>> >+            interp_%1
>> >+
>> >+            %if %4 ; float
>> >+                mov ptrq, dstrm
>> >+                movu [ptrq + xq*4], m0
>> >+                mov ptrq, dstgm
>> >+                movu [ptrq + xq*4], m1
>> >+                mov ptrq, dstbm
>> >+                movu [ptrq + xq*4], m2
>> >+                cmp has_alphad, 0
>> >+                je %%skip_alphaf
>> >+                    mov ptrq, srcam
>> >+                    movu m0, [ptrq + xq*4]
>> >+                    mov ptrq, dstam
>> >+                    movu [ptrq + xq*4], m0
>> >+                %%skip_alphaf:
>> >+            %else
>> >+                ; constants for STORE16
>> >+                movu m5,  [pd_65535f]
>> >+                %if mmsize > 16
>> >+                    movu xm6, [pb_lo_pack_shuffle16]
>> >+                    movu xm7, [pb_hi_pack_shuffle16]
>> >+                %endif
>> >+
>> >+                xorps m15, m15, m15
>> >+                STORE16 dstrm, 0
>> >+                STORE16 dstgm, 1
>> >+                STORE16 dstbm, 2
>> >+
>> >+                cmp has_alphad, 0
>> >+                je %%skip_alpha
>> >+                    %if mmsize > 16
>> >+                        mov ptrq, srcam
>> >+                        movu xm0, [ptrq + xq*2]
>> >+                        mov ptrq, dstam
>> >+                        movu [ptrq + xq*2], xm0
>> >+                    %else
>> >+                        mov ptrq, srcam
>> >+                        movsd xm0, [ptrq + xq*2]
>> >+                        mov ptrq, dstam
>> >+                        movsd [ptrq + xq*2], xm0
>> >+                    %endif
>> >+
>> >+                %%skip_alpha:
>> >+            %endif
>> >+
>> >+            add xq, mmsize/4
>> >+            cmp xd, widthd
>> >+            jl %%loop_x
>> >+
>> >+        INC_DATA_PTR srcrm, src_imageq, 2
>> >+        INC_DATA_PTR srcgm, src_imageq, 0
>> >+        INC_DATA_PTR srcbm, src_imageq, 1
>> >+        INC_DATA_PTR srcam, src_imageq, 3
>> >+
>> >+        INC_DATA_PTR dstrm, dst_imageq, 2
>> >+        INC_DATA_PTR dstgm, dst_imageq, 0
>> >+        INC_DATA_PTR dstbm, dst_imageq, 1
>> >+        INC_DATA_PTR dstam, dst_imageq, 3
>> >+
>> >+        inc slice_startd
>> >+        cmp slice_startd, slice_endd
>> >+        jl %%loop_y
>> >+
>> >+    RET
>> >+%endmacro
>> >+%if ARCH_X86_64
>> >+    %if HAVE_AVX2_EXTERNAL
>> >+        INIT_YMM avx2
>> >+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+    %endif
>> >+    %if HAVE_AVX_EXTERNAL
>> >+        INIT_YMM avx
>> >+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+    %endif
>> >+    INIT_XMM sse2
>> >+    DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+    DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+%endif
>> >\ No newline at end of file
>> >diff --git a/libavfilter/x86/vf_lut3d_init.c
>> b/libavfilter/x86/vf_lut3d_init.c
>> >new file mode 100644
>> >index 0000000000..9b9b36e4af
>> >--- /dev/null
>> >+++ b/libavfilter/x86/vf_lut3d_init.c
>> >@@ -0,0 +1,88 @@
>> >+/*
>> >+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
>> >+ *
>> >+ * This file is part of FFmpeg.
>> >+ *
>> >+ * FFmpeg is free software; you can redistribute it and/or
>> >+ * modify it under the terms of the GNU Lesser General Public
>> >+ * License as published by the Free Software Foundation; either
>> >+ * version 2.1 of the License, or (at your option) any later version.
>> >+ *
>> >+ * FFmpeg is distributed in the hope that it will be useful,
>> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> >+ * Lesser General Public License for more details.
>> >+ *
>> >+ * You should have received a copy of the GNU Lesser General Public
>> >+ * License along with FFmpeg; if not, write to the Free Software
>> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> >+ */
>> >+
>> >+#include "libavutil/attributes.h"
>> >+#include "libavutil/cpu.h"
>> >+#include "libavutil/x86/cpu.h"
>> >+#include "libavfilter/lut3d.h"
>> >+
>> >+#define DEFINE_INTERP_FUNC(name, format, opt)
>>
>>  \
>> >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d,
>> Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int
>> slice_end, int has_alpha); \
>> >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void
>> *arg, int jobnr, int nb_jobs)
>>   \
>> >+{
>>
>>  \
>> >+    LUT3DContext *lut3d = ctx->priv;
>>
>> \
>> >+    Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut:
>> NULL;
>>        \
>> >+    ThreadData *td = arg;
>>
>>  \
>> >+    AVFrame *in  = td->in;
>>
>> \
>> >+    AVFrame *out = td->out;
>>
>>  \
>> >+    int has_alpha = in->linesize[3] && out != in;
>>
>>  \
>> >+    int slice_start = (in->height *  jobnr   ) / nb_jobs;
>>
>>  \
>> >+    int slice_end   = (in->height * (jobnr+1)) / nb_jobs;
>>
>>  \
>> >+    ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out,
>> slice_start, slice_end, has_alpha);
>>          \
>> >+    return 0;
>>
>>  \
>> >+}
>> >+
>> >+#if ARCH_X86_64
>> >+#if HAVE_AVX2_EXTERNAL
>> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
>> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx2)
>> >+#endif
>> >+#if HAVE_AVX_EXTERNAL
>> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
>> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx)
>> >+#endif
>> >+    DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
>> >+    DEFINE_INTERP_FUNC(tetrahedral, p16,  sse2)
>> >+#endif
>> >+
>> >+
>> >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const
>> AVPixFmtDescriptor *desc)
>> >+{
>> >+    int cpu_flags = av_get_cpu_flags();
>> >+    int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
>> >+    int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
>> >+    int depth = desc->comp[0].depth;
>> >+
>> >+#if ARCH_X86_64
>> >+    if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+#if HAVE_AVX2_EXTERNAL
>> >+        if (isfloat && planar) {
>> >+            s->interp = interp_tetrahedral_pf32_avx2;
>> >+        } else if (depth == 16) {
>> >+            s->interp = interp_tetrahedral_p16_avx2;
>> >+        }
>> >+#endif
>> >+    } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+#if HAVE_AVX_EXTERNAL
>> >+        if (isfloat) {
>> >+            s->interp = interp_tetrahedral_pf32_avx;
>> >+        } else if (depth == 16) {
>> >+            s->interp = interp_tetrahedral_p16_avx;
>> >+        }
>> >+#endif
>> >+    } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+        if (isfloat) {
>> >+            s->interp = interp_tetrahedral_pf32_sse2;
>> >+        } else if (depth == 16) {
>> >+            s->interp = interp_tetrahedral_p16_sse2;
>> >+        }
>> >+    }
>> >+#endif
>> >+}
>> >--
>> >2.31.1.windows.1
>> >
>> >_______________________________________________
>> >ffmpeg-devel mailing list
>> >ffmpeg-devel@ffmpeg.org
>> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> >
>> >To unsubscribe, visit link above, or email
>> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
>>
>
diff mbox series

Patch

diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
new file mode 100644
index 0000000000..ded2a036a5
--- /dev/null
+++ b/libavfilter/lut3d.h
@@ -0,0 +1,83 @@ 
+/*
+ * Copyright (c) 2013 Clément Bœsch
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef AVFILTER_LUT3D_H
+#define AVFILTER_LUT3D_H
+
+#include "libavutil/pixdesc.h"
+#include "framesync.h"
+#include "avfilter.h"
+
+enum interp_mode {
+    INTERPOLATE_NEAREST,
+    INTERPOLATE_TRILINEAR,
+    INTERPOLATE_TETRAHEDRAL,
+    INTERPOLATE_PYRAMID,
+    INTERPOLATE_PRISM,
+    NB_INTERP_MODE
+};
+
+struct rgbvec {
+    float r, g, b;
+};
+
+/* 3D LUTs don't often go up to level 32, but it is common to have a Hald CLUT
+ * of 512x512 (64x64x64) */
+#define MAX_LEVEL 256
+#define PRELUT_SIZE 65536
+
+typedef struct Lut3DPreLut {
+    int size;
+    float min[3];
+    float max[3];
+    float scale[3];
+    float* lut[3];
+} Lut3DPreLut;
+
+typedef struct LUT3DContext {
+    const AVClass *class;
+    struct rgbvec *lut;
+    int lutsize;
+    int lutsize2;
+    struct rgbvec scale;
+    int interpolation;          ///<interp_mode
+    char *file;
+    uint8_t rgba_map[4];
+    int step;
+    avfilter_action_func *interp;
+    Lut3DPreLut prelut;
+#if CONFIG_HALDCLUT_FILTER
+    uint8_t clut_rgba_map[4];
+    int clut_step;
+    int clut_bits;
+    int clut_planar;
+    int clut_float;
+    int clut_width;
+    FFFrameSync fs;
+#endif
+} LUT3DContext;
+
+typedef struct ThreadData {
+    AVFrame *in, *out;
+} ThreadData;
+
+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
+
+#endif /* AVFILTER_LUT3D_H */
\ No newline at end of file
diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
index 9fbda833b9..1fd0af06db 100644
--- a/libavfilter/vf_lut3d.c
+++ b/libavfilter/vf_lut3d.c
@@ -31,73 +31,18 @@ 
 #include "libavutil/intreadwrite.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/avassert.h"
-#include "libavutil/pixdesc.h"
 #include "libavutil/avstring.h"
-#include "avfilter.h"
 #include "drawutils.h"
 #include "formats.h"
-#include "framesync.h"
 #include "internal.h"
 #include "video.h"
+#include "lut3d.h"

 #define R 0
 #define G 1
 #define B 2
 #define A 3

-enum interp_mode {
-    INTERPOLATE_NEAREST,
-    INTERPOLATE_TRILINEAR,
-    INTERPOLATE_TETRAHEDRAL,
-    INTERPOLATE_PYRAMID,
-    INTERPOLATE_PRISM,
-    NB_INTERP_MODE
-};
-
-struct rgbvec {
-    float r, g, b;
-};
-
-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
- * of 512x512 (64x64x64) */
-#define MAX_LEVEL 256
-#define PRELUT_SIZE 65536
-
-typedef struct Lut3DPreLut {
-    int size;
-    float min[3];
-    float max[3];
-    float scale[3];
-    float* lut[3];
-} Lut3DPreLut;
-
-typedef struct LUT3DContext {
-    const AVClass *class;
-    int interpolation;          ///<interp_mode
-    char *file;
-    uint8_t rgba_map[4];
-    int step;
-    avfilter_action_func *interp;
-    struct rgbvec scale;
-    struct rgbvec *lut;
-    int lutsize;
-    int lutsize2;
-    Lut3DPreLut prelut;
-#if CONFIG_HALDCLUT_FILTER
-    uint8_t clut_rgba_map[4];
-    int clut_step;
-    int clut_bits;
-    int clut_planar;
-    int clut_float;
-    int clut_width;
-    FFFrameSync fs;
-#endif
-} LUT3DContext;
-
-typedef struct ThreadData {
-    AVFrame *in, *out;
-} ThreadData;
-
 #define OFFSET(x) offsetof(LUT3DContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
 #define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
@@ -1207,6 +1152,10 @@  static int config_input(AVFilterLink *inlink)
         av_assert0(0);
     }

+    if (ARCH_X86) {
+        ff_lut3d_init_x86(lut3d, desc);
+    }
+
     return 0;
 }

diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 016a5b3511..a29941eaeb 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -17,6 +17,7 @@  OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
 OBJS-$(CONFIG_INTERLACE_FILTER)              += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_LIMITER_FILTER)                += x86/vf_limiter_init.o
+OBJS-$(CONFIG_LUT3D_FILTER)                  += x86/vf_lut3d_init.o
 OBJS-$(CONFIG_MASKEDCLAMP_FILTER)            += x86/vf_maskedclamp_init.o
 OBJS-$(CONFIG_MASKEDMERGE_FILTER)            += x86/vf_maskedmerge_init.o
 OBJS-$(CONFIG_NOISE_FILTER)                  += x86/vf_noise.o
@@ -57,6 +58,7 @@  X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER)          += x86/vf_hqdn3d.o
 X86ASM-OBJS-$(CONFIG_IDET_FILTER)            += x86/vf_idet.o
 X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER)       += x86/vf_interlace.o
 X86ASM-OBJS-$(CONFIG_LIMITER_FILTER)         += x86/vf_limiter.o
+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER)           += x86/vf_lut3d.o
 X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER)     += x86/vf_maskedclamp.o
 X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER)     += x86/vf_maskedmerge.o
 X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER)         += x86/vf_overlay.o
diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
new file mode 100644
index 0000000000..b3d7c3962b
--- /dev/null
+++ b/libavfilter/x86/vf_lut3d.asm
@@ -0,0 +1,757 @@ 
+;*****************************************************************************
+;* x86-optimized functions for lut3d filter
+;*
+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pd_1f:  times 8 dd 1.0
+pd_3f:  times 8 dd 3.0
+
+; used to limit rshifts as they are more expensive in avx1
+pd_001: times 8 dd 001b
+pd_010: times 8 dd 010b
+pd_100: times 8 dd 100b
+
+pd_65535f:     times 8 dd 65535.0
+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
+
+pb_shuffle16:         db    0,    1, 0x80, 0x80, \
+                            2,    3, 0x80, 0x80, \
+                            4,    5, 0x80, 0x80, \
+                            6,    7, 0x80, 0x80
+
+pb_lo_pack_shuffle16: db    0,    1,    4,    5, \
+                            8,    9,   12,   13, \
+                         0x80, 0x80, 0x80, 0x80, \
+                         0x80, 0x80, 0x80, 0x80
+
+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
+                         0x80, 0x80, 0x80, 0x80, \
+                            0,    1,    4,    5, \
+                            8,    9,   12,   13
+
+; tetrahedral table --------------------------------------------
+; name:          x2|        x1|        x0|       cxxb|     cxxa
+; values:      r 00|     r  00|     r  00|   c011 011| c001 001
+;              g 01|     g  01|     g  01|   c101 101| c010 010
+;              b 10|     b  10|     b  10|   c110 110| c100 100
+
+; g>b                                 b |          g |          r |        c110 | c100
+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b
+; r>b                                 g |          b |          r |        c101 | c100
+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b
+; else                                g |          r |          b |        c101 | c001
+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b
+; b>g                                 r |          g |          b |        c011 | c001
+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b
+; b>r                                 r |          b |          g |        c011 | c010
+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b
+; else                                b |          r |          g |        c110 | c010
+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b
+
+SECTION .text
+
+struc Lut3DPreLut ; offsets into the C Lut3DPreLut reached through prelutq; NOTE(review): must stay in sync with the C declaration
+    .size:    resd 1 ; entry count, read as int; size-1 is broadcast as the upper clamp for prelut indices
+    .min:     resd 3 ; per-channel value subtracted from the input before scaling
+    .max:     resd 3 ; not read by this file; reserved so the following offsets come out right
+    .scale:   resd 3 ; per-channel multiplier applied after subtracting .min
+    .lut:     resq 3 ; per-channel table pointers, each indexed as an array of floats
+endstruc
+
+struc LUT3DContext ; leading fields of the C LUT3DContext reached through ctxq; NOTE(review): must stay in sync with the C declaration
+    .class:        resq 1 ; pointer-sized first member, never accessed in this file
+    .lut:          resq 1 ; base pointer of the 3D table; each entry is fetched as 3 consecutive floats
+    .lutsize:      resd 1 ; read as int; x3 gives the g-index multiplier, lutsize-1 the index clamp
+    .lutsize2:     resd 1 ; read as int; x3 gives the r-index multiplier
+    .scale:        resd 3 ; per-channel float scale, multiplied by lutsize-1 at function entry
+endstruc
+
+%define AV_NUM_DATA_POINTERS 8
+
+struc AVFrame ; only the leading fields this file needs; NOTE(review): hard-codes the libavutil AVFrame layout, verify on ABI changes
+    .data:          resq AV_NUM_DATA_POINTERS ; plane base pointers
+    .linesize:      resd AV_NUM_DATA_POINTERS ; per-plane row stride in bytes, stored as 32-bit ints
+    .extended_data: resq 1 ; not read here; keeps .width at the right offset
+    .width:         resd 1 ; pixels per row, used as the x loop bound
+    .height:        resd 1 ; not read by this file
+endstruc
+
+%define rm   rsp
+%define gm   rsp+mmsize
+%define bm   rsp+(mmsize*2)
+
+%define lut3dsizem  [rsp+mmsize*3]
+%define lut3dsize2m [rsp+mmsize*4]
+%define lut3dmaxm   [rsp+mmsize*5]
+%define prelutmaxm  [rsp+mmsize*6]
+
+%define scalerm [rsp+mmsize*7]
+%define scalegm [rsp+mmsize*8]
+%define scalebm [rsp+mmsize*9]
+
+%define prelutminrm [rsp+mmsize*10]
+%define prelutmingm [rsp+mmsize*11]
+%define prelutminbm [rsp+mmsize*12]
+
+%define prelutscalerm [rsp+mmsize*13]
+%define prelutscalegm [rsp+mmsize*14]
+%define prelutscalebm [rsp+mmsize*15]
+
+; data pointers
+%define srcrm [rsp+mmsize*16 +  0]
+%define srcgm [rsp+mmsize*16 +  8]
+%define srcbm [rsp+mmsize*16 + 16]
+%define srcam [rsp+mmsize*16 + 24]
+
+%define dstrm [rsp+mmsize*16 + 32]
+%define dstgm [rsp+mmsize*16 + 40]
+%define dstbm [rsp+mmsize*16 + 48]
+%define dstam [rsp+mmsize*16 + 56]
+
+%macro FETCH_PRELUT_PN 3 ; scalar gather of one lane: %1/%2 = xmm register numbers used as temporaries, %3 = byte offset of the lane
+    mov tmp2d, [rm + %3] ; "prev" index of this lane, spilled to the rm scratch row by the caller
+    mov tmp3d, [gm + %3] ; "next" index of this lane, spilled to the gm scratch row
+    movss xm%1, [tmpq + tmp2q*4] ; tmpq = base of the float table
+    movss xm%2, [tmpq + tmp3q*4]
+    movss [rm + %3], xm%1 ; the fetched values replace the indices in place
+    movss [gm + %3], xm%2
+%endmacro
+
+; 1 - p: register number receiving the table values at the "prev" indices
+; 2 - n: register number receiving the table values at the "next" indices
+; 3 - p indices (register number, 32-bit integer lanes)
+; 4 - n indices (register number, 32-bit integer lanes)
+%macro GATHER_PRELUT 4 ; table base is tmpq; avx2 build clobbers m7/m9, other builds clobber tmp2/tmp3 and the rm/gm scratch rows
+    %if cpuflag(avx2)
+        vpcmpeqb m7, m7 ; all-ones mask: gather every lane
+        vgatherdps m%1, [tmpq + m%3*4], m7 ; p
+        vpcmpeqb m9, m9 ; a fresh all-ones mask is built for the second gather
+        vgatherdps m%2, [tmpq + m%4*4], m9 ; n
+    %else
+        mova [rm], m%3 ; no hardware gather: spill the indices and fetch lane by lane
+        mova [gm], m%4
+        FETCH_PRELUT_PN %1, %2, 0
+        FETCH_PRELUT_PN %1, %2, 4
+        FETCH_PRELUT_PN %1, %2, 8
+        FETCH_PRELUT_PN %1, %2, 12
+    %if mmsize > 16
+        FETCH_PRELUT_PN %1, %2, 16
+        FETCH_PRELUT_PN %1, %2, 20
+        FETCH_PRELUT_PN %1, %2, 24
+        FETCH_PRELUT_PN %1, %2, 28
+    %endif
+        movu m%1, [rm] ; reload the rows, which now hold the fetched floats
+        movu m%2, [gm]
+    %endif
+%endmacro
+
+%macro FLOORPS 2 ; %1 = dst, %2 = src; both call sites pass values already clamped to >= 0
+    %if mmsize > 16
+        vroundps %1, %2, 0x01 ; immediate 1 selects rounding toward -inf
+    %else
+        cvttps2dq %1, %2 ; xmm fallback: truncate to int32 ...
+        cvtdq2ps  %1, %1 ; ... and convert back; equals floor only for non-negative values that fit in int32
+    %endif
+%endmacro
+
+; 1 - dst: register number holding the channel value, replaced by the prelut result
+; 2 - index: channel number selecting the Lut3DPreLut.lut pointer
+; 3 - min: per-channel minimum (broadcast operand)
+; 4 - scale: per-channel scale (broadcast operand)
+; assumes lut max m13, m14 1.0f, zero m15; clobbers m3-m6, m8, m10 and tmp (plus whatever GATHER_PRELUT clobbers)
+%macro APPLY_PRELUT 4
+    ; scale
+    subps m5, m%1, %3 ; v - min
+    mulps m5, m5, %4  ; v * scale
+    ; clamp
+    maxps m5, m5, m15 ; max zero
+    minps m5, m5, m13 ; min lut max
+
+    FLOORPS m3, m5    ; prev index
+    subps m5, m5, m3  ; d = fractional position between prev and next
+    addps m4, m3, m14 ; p+1 = n index
+    minps m4, m4, m13 ; clamp n index
+
+    mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] ; table pointer for this channel
+    cvttps2dq m6, m3 ; integer prev indices
+    cvttps2dq m10, m4 ; integer next indices
+    GATHER_PRELUT 3, 4, 6, 10
+
+    ; lerp: p + (n - p) * d
+    subps m8, m4, m3
+    mulps m8, m8, m5
+    addps m%1, m8, m3
+%endmacro
+
+; 1 - dst: register number, scaled and clamped in place
+; 2 - scale: broadcast multiplier operand
+; assumes lut max m13, zero m15
+%macro APPLY_SCALE 2
+   mulps m%1, m%1, %2 ; map the value to 3D LUT index space
+   maxps m%1, m%1, m15 ; clamp below at 0
+   minps m%1, m%1, m13 ; clamp above at the lut max held in m13
+%endmacro
+
+%macro BLEND 4 ; %1 = dst, %2 = value where the mask is clear, %3 = value where the mask is set, %4 = mask
+%if mmsize > 16
+    vblendvps %1, %2, %3, %4
+%else
+    %ifidni %1,%2
+        %error operand 1 must not equal operand 2
+    %endif
+    %ifidni %1,%3
+        %error operand 1 must not equal operand 3
+    %endif
+    mova  %1, %2 ; xmm fallback computes ((%2 ^ %3) & %4) ^ %2 bit by bit
+    xorps %1, %3
+    andps %1, %4 ; NOTE(review): needs every mask lane to be all-ones or all-zeros; %1 == %4 would also break and is not checked
+    xorps %1, %2
+%endif
+%endmacro
+
+; sets NaN lanes to zero; +inf and -inf are handled later by the min/max clamps
+%macro SANITIZE_F 1 ; %1 = register cleaned in place; expects m15 = 0, clobbers m5 (and m6 in the xmm build)
+    cmpps m5, %1, %1, 0x0 ; EQ with itself: nan == nan = False, so NaN lanes get a zero mask
+    %if mmsize <= 16
+        mova m6, %1 ; the xmm BLEND rejects dst == source, so blend from a copy
+        BLEND %1, m15, m6, m5
+    %else
+        BLEND %1, m15, %1, m5
+    %endif
+%endmacro
+
+%macro ADD3 4 ; %1 = %2 + %3 + %4 (packed single precision)
+    addps %1, %2, %3
+    addps %1, %1, %4 ; %1 may alias %2 (as the callers do), but must not alias %4
+%endmacro
+
+%macro CMP_EQUAL 3 ; %1 = resulting lane mask, %2/%3 = operands holding integer bit patterns
+%if cpuflag(avx2)
+    vpcmpeqd %1, %2, %3
+%elif cpuflag(avx)
+    cmpps %1, %2, %3, 0x0 ; float EQ compare on integer bit patterns; NOTE(review): callers pass 0 or 001b/010b/100b, which are denormals as floats - confirm behaviour if MXCSR.DAZ is ever set
+%else
+    pcmpeqd %1, %2, %3
+%endif
+%endmacro
+
+%macro SHIFT_RIGHT 2 ; %1 = register NUMBER, shifted in place (logical, 32-bit lanes); %2 = shift count
+%if mmsize <= 16
+    psrld xm%1, %2
+%elif cpuflag(avx2)
+    vpsrld m%1, m%1, %2
+%else
+    vextractf128 xm15, m%1, 1 ; AVX1 ymm build: shift the two 128-bit halves separately; this overwrites m15
+    psrld xm%1, %2
+    psrld xm15, %2
+    vinsertf128 m%1, m%1, xm15, 1
+%endif
+%endmacro
+
+%macro FETCH_LUT3D_RGB 4 ; scalar gather of one lane: %1-%3 = xmm register numbers used as temporaries, %4 = byte offset of the lane
+    mov tmp2d, [rm + %4] ; table index of this lane, spilled to the rm scratch row by the caller
+    movss xm%1, [tmpq + tmp2q*4 + 0] ; three consecutive floats of the table entry
+    movss xm%2, [tmpq + tmp2q*4 + 4]
+    movss xm%3, [tmpq + tmp2q*4 + 8]
+    movss [rm + %4], xm%1 ; the index is consumed, so its slot is reused for the first component
+    movss [gm + %4], xm%2
+    movss [bm + %4], xm%3
+%endmacro
+
+; 1 - dstr: register number receiving the float at offset 0 of each entry
+; 2 - dstg: register number receiving the float at offset 4 of each entry
+; 3 - dstb: register number receiving the float at offset 8 of each entry
+; 4 - indices: register number with 32-bit integer indices, in units of floats
+%macro GATHER_LUT3D_INDICES 4 ; table base is tmpq; avx2 build clobbers m3/m14/m15, other builds clobber tmp2 and the rm/gm/bm scratch rows
+%if cpuflag(avx2)
+    vpcmpeqb m3, m3 ; all-ones mask: gather every lane
+    vgatherdps m%1, [tmpq + m%4*4 + 0], m3
+    vpcmpeqb m14, m14 ; a fresh all-ones mask is built for each gather
+    vgatherdps m%2, [tmpq + m%4*4 + 4], m14
+    vpcmpeqb m15, m15
+    vgatherdps m%3, [tmpq + m%4*4 + 8], m15
+%else
+    movu [rm], m%4 ; no hardware gather: spill the indices and fetch lane by lane
+    FETCH_LUT3D_RGB %1, %2, %3, 0
+    FETCH_LUT3D_RGB %1, %2, %3, 4
+    FETCH_LUT3D_RGB %1, %2, %3, 8
+    FETCH_LUT3D_RGB %1, %2, %3, 12
+%if mmsize > 16
+    FETCH_LUT3D_RGB %1, %2, %3, 16
+    FETCH_LUT3D_RGB %1, %2, %3, 20
+    FETCH_LUT3D_RGB %1, %2, %3, 24
+    FETCH_LUT3D_RGB %1, %2, %3, 28
+%endif
+    movu m%1, [rm] ; reload the rows, which now hold the fetched floats
+    movu m%2, [gm]
+    movu m%3, [bm]
+%endif
+%endmacro
+
+%macro interp_tetrahedral 0
+    %define d_r m0
+    %define d_g m1
+    %define d_b m2
+
+    %define prev_r m3
+    %define prev_g m4
+    %define prev_b m5
+
+    %define next_r m6
+    %define next_g m7
+    %define next_b m8
+
+    %define x0 m4
+    %define x1 m5
+    %define x2 m6
+
+    ; setup prev index
+    FLOORPS prev_r, m0
+    FLOORPS prev_g, m1
+    FLOORPS prev_b, m2
+
+    ; setup deltas
+    subps d_r, m0, prev_r
+    subps d_g, m1, prev_g
+    subps d_b, m2, prev_b
+
+    ; calculate select mask m9
+    movu m6, [pd_tetra_table2]
+    cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
+    BLEND m10, m6, [pd_tetra_table1], m7
+    cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
+    BLEND m6, m10, [pd_tetra_table0], m7
+
+    movu m10, [pd_tetra_table5]
+    cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
+    BLEND m9, m10, [pd_tetra_table4], m7
+    cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
+    BLEND m10, m9, [pd_tetra_table3], m7
+
+    cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
+    BLEND m9, m10, m6, m7
+
+    ; setup next index
+    addps next_r, prev_r, m14 ; +1
+    minps next_r, next_r, m13 ; clamp lutmax
+
+    addps next_g, prev_g, m14 ; +1
+    minps next_g, next_g, m13 ; clamp lutmax
+
+    addps next_b, prev_b, m14 ; +1
+    minps next_b, next_b, m13 ; clamp lutmax
+
+    ; prescale indices
+    mulps prev_r, prev_r, lut3dsize2m
+    mulps next_r, next_r, lut3dsize2m
+
+    mulps prev_g, prev_g, lut3dsizem
+    mulps next_g, next_g, lut3dsizem
+
+    mulps prev_b, prev_b, [pd_3f]
+    mulps next_b, next_b, [pd_3f]
+
+    movu m14, [pd_001]
+
+    ; cxxa m10
+    ; b
+    andps m15, m9, m14
+    CMP_EQUAL m15, m15, m14
+    BLEND m10, prev_b, next_b, m15
+
+    ; g
+    andps m15, m9, [pd_010]
+    CMP_EQUAL m15, m15, [pd_010]
+    BLEND m12, prev_g, next_g, m15
+
+    ; r
+    andps m15, m9, [pd_100]
+    CMP_EQUAL m15, m15, [pd_100]
+    BLEND m13, prev_r, next_r, m15
+
+    ADD3 m10, m10, m12, m13
+
+    SHIFT_RIGHT 9, 3 ; 3
+
+    ; cxxb m11;
+    ; b
+    andps m15, m9, m14
+    CMP_EQUAL m15, m15, m14
+    BLEND m11, prev_b, next_b, m15
+
+    ; g
+    andps m15, m9, [pd_010]
+    CMP_EQUAL m15, m15, [pd_010]
+    BLEND m12, prev_g, next_g, m15
+
+    ; r
+    andps m15, m9, [pd_100]
+    CMP_EQUAL m15, m15, [pd_100]
+    BLEND m13, prev_r, next_r, m15
+
+    ADD3 m11, m11, m12, m13
+
+    ; c000 m12;
+    ADD3 m12, prev_r, prev_g, prev_b
+
+    ; c111 m13;
+    ADD3 m13, next_r, next_g, next_b
+
+    SHIFT_RIGHT 9, 3 ; 6
+
+    ; x0, m4
+    andps m15, m9, m14
+    CMP_EQUAL m15, m15, m14
+    BLEND m7, d_r, d_g, m15 ; r,g
+
+    andps m15, m9, [pd_010]
+    CMP_EQUAL m15, m15, [pd_010]
+    BLEND x0, m7, d_b, m15 ; b
+
+    ; x1, m5
+    andps m15, m9, [pd_100]
+    CMP_EQUAL m15, m15, [pd_100]
+    BLEND m7, d_r, d_g, m15 ; r,g
+
+    SHIFT_RIGHT 9, 3 ; 9
+
+    andps m15, m9, m14
+    CMP_EQUAL m15, m15, m14
+    BLEND x1, m7, d_b, m15 ; b
+
+    ; x2, m6
+    andps m15, m9, [pd_010]
+    CMP_EQUAL m15, m15, [pd_010]
+    BLEND m7, d_r, d_g, m15 ; r,g
+
+    andps m15, m9, [pd_100]
+    CMP_EQUAL m15, m15, [pd_100]
+    BLEND x2, m7, d_b, m15 ; b
+
+    ; convert indices to integer
+    cvttps2dq m12, m12
+    cvttps2dq m10, m10
+    cvttps2dq m11, m11
+    cvttps2dq m13, m13
+
+    ; now the gathering festival
+    mov tmpq, [ctxq + LUT3DContext.lut]
+
+    GATHER_LUT3D_INDICES 0, 1, 2, 12
+    movu m14, [pd_1f]
+    subps m14, m14, x0; 1 - x0
+
+    mulps m0, m0, m14
+    mulps m1, m1, m14
+    mulps m2, m2, m14
+
+    GATHER_LUT3D_INDICES 7, 8, 9, 10
+    subps m14, x0, x1; x0 - x1
+    mulps m7, m7, m14
+    addps m0, m0, m7
+
+    mulps m8, m8, m14
+    addps m1, m1, m8
+
+    mulps m9, m9, m14
+    addps m2, m2, m9
+
+    GATHER_LUT3D_INDICES 7, 8, 9, 11
+    subps m14, x1, x2; x1 - x2
+
+    mulps m7, m7, m14
+    addps m0, m0, m7
+
+    mulps m8, m8, m14
+    addps m1, m1, m8
+
+    mulps m9, m9, m14
+    addps m2, m2, m9
+
+    GATHER_LUT3D_INDICES 7, 8, 9, 13
+    mulps m7, m7, x2
+    addps m0, m0, m7
+
+    mulps m8, m8, x2
+    addps m1, m1, m8
+
+    mulps m9, m9, x2
+    addps m2, m2, m9
+%endmacro
+
+%macro INIT_DATA_PTR 3 ; %1 = stack slot receiving the row pointer, %2 = AVFrame pointer, %3 = plane index; clobbers ptr and tmp
+    movsxd ptrq, slice_startd                            ; int argument: sign-extend rather than trusting the upper register half
+    movsxd tmpq, dword [%2 + AVFrame.linesize + %3 * 4]  ; linesize is a signed int and is negative for bottom-up frames
+    imul tmpq, ptrq                                      ; 64-bit byte offset of row slice_start
+    add tmpq, [%2 + AVFrame.data     + %3 * 8]           ; plane base + offset
+    mov %1, tmpq
+%endmacro
+
+%macro INC_DATA_PTR 3 ; advance the row pointer in stack slot %1 by one line of plane %3 of AVFrame %2; clobbers ptr and tmp
+    movsxd tmpq, dword [%2 + AVFrame.linesize + %3 * 4] ; sign-extend: a negative stride must move the pointer backwards
+    mov ptrq, %1
+    add ptrq, tmpq
+    mov %1, ptrq
+%endmacro
+
+%macro LOAD16 2 ; %1 = destination register number, %2 = stack slot holding the plane row pointer; expects m7 = pd_65535_invf
+    mov ptrq, %2
+    %if mmsize > 16
+        movu xm%1, [ptrq + xq*2] ; 8 x 16-bit samples
+    %else
+        movsd xm%1, [ptrq + xq*2] ; 4 x 16-bit samples in the low 64 bits
+    %endif
+    %if cpuflag(avx2)
+        vpmovzxwd m%1, xm%1 ; zero-extend words to dwords
+    %else
+        %if mmsize > 16
+            pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) ; xm4 = input with its 64-bit halves swapped (clobbers m4)
+            pshufb xm%1, xm6 ; pb_shuffle16
+            pshufb xm4,  xm6 ; pb_shuffle16
+            vinsertf128 m%1, m%1, xm4, 1 ; upper four samples go to the high lane
+        %else
+            pshufd  xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) ; spread the two sample pairs across both 64-bit halves
+            pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; then place each sample in the low word of its own dword
+            pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+        %endif
+    %endif
+    cvtdq2ps m%1, m%1
+    mulps m%1, m%1, m7 ; pd_65535_invf
+%endmacro
+
+%macro STORE16 2 ; %1 = stack slot holding the plane row pointer, %2 = source register number (destroyed); ymm builds clobber m4
+    mulps m%2, m%2, m5  ; [pd_65535f]
+    minps m%2, m%2, m5  ; [pd_65535f]
+    maxps m%2, m%2, m15 ; zero
+    cvttps2dq m%2, m%2 ; truncating float -> int32 conversion
+    %if mmsize > 16
+        vextractf128 xm4, m%2, 1
+        pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
+        pshufb xm4,  xm7 ; [pb_hi_pack_shuffle16]
+        por xm%2, xm4 ; low lane's words in bytes 0-7, high lane's words in bytes 8-15
+    %else
+        pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; move the low word of each dword pair next to each other
+        pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
+        pshufd  xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) ; collect the four packed words in the low 64 bits
+    %endif
+    mov ptrq, %1
+    %if mmsize > 16
+        movu [ptrq + xq*2], xm%2 ; 8 x 16-bit samples
+    %else
+        movsd [ptrq + xq*2], xm%2 ; 4 x 16-bit samples
+    %endif
+%endmacro
+
+; 1 - interp method
+; 2 - format_name
+; 3 - depth
+; 4 - is float format
+%macro DEFINE_INTERP_FUNC 4
+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
+    ; store lut max and lutsize
+    mov tmpd, dword [ctxq + LUT3DContext.lutsize]
+    cvtsi2ss xm0, tmpd
+    mulss xm0, xm0, [pd_3f]
+    VBROADCASTSS m0, xm0
+    mova lut3dsizem, m0
+    sub tmpd, 1
+    cvtsi2ss xm0, tmpd
+    VBROADCASTSS m0, xm0
+    mova lut3dmaxm, m0
+
+    ; scale_r
+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
+    VBROADCASTSS m1, xm1
+    mova scalerm, m1
+
+    ; scale_g
+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
+    VBROADCASTSS m1, xm1
+    mova scalegm, m1
+
+    ; scale_b
+    mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
+    VBROADCASTSS m1, xm1
+    mova scalebm, m1
+
+    ; store lutsize2
+    cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
+    mulss xm0, xm0, [pd_3f]
+    VBROADCASTSS m0, xm0
+    mova lut3dsize2m, m0
+
+    ; init prelut values
+    cmp prelutq, 0
+    je %%skip_init_prelut
+        mov tmpd, dword [prelutq + Lut3DPreLut.size]
+        sub tmpd, 1
+        cvtsi2ss xm0, tmpd
+        VBROADCASTSS m0, xm0
+        mova prelutmaxm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
+        mova prelutminrm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
+        mova prelutmingm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
+        mova prelutminbm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
+        mova prelutscalerm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
+        mova prelutscalegm, m0
+        VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
+        mova prelutscalebm, m0
+    %%skip_init_prelut:
+
+    mov widthd,  [src_imageq + AVFrame.width]
+
+    ; gbra pixel order
+    INIT_DATA_PTR srcrm, src_imageq, 2
+    INIT_DATA_PTR srcgm, src_imageq, 0
+    INIT_DATA_PTR srcbm, src_imageq, 1
+    INIT_DATA_PTR srcam, src_imageq, 3
+
+    INIT_DATA_PTR dstrm, dst_imageq, 2
+    INIT_DATA_PTR dstgm, dst_imageq, 0
+    INIT_DATA_PTR dstbm, dst_imageq, 1
+    INIT_DATA_PTR dstam, dst_imageq, 3
+
+    %%loop_y:
+        xor xq, xq
+        %%loop_x:
+            movu m14, [pd_1f]
+            xorps m15, m15, m15
+            %if %4 ; float
+                mov ptrq, srcrm
+                movu m0, [ptrq + xq*4]
+                mov ptrq, srcgm
+                movu m1, [ptrq + xq*4]
+                mov ptrq, srcbm
+                movu m2, [ptrq + xq*4]
+                SANITIZE_F m0
+                SANITIZE_F m1
+                SANITIZE_F m2
+            %else
+                ; constants for LOAD16
+                movu m7, [pd_65535_invf]
+                %if notcpuflag(avx2) && mmsize >= 32
+                    movu xm6, [pb_shuffle16]
+                %endif
+                LOAD16 0, srcrm
+                LOAD16 1, srcgm
+                LOAD16 2, srcbm
+            %endif
+
+            cmp prelutq, 0
+            je %%skip_prelut
+                mova m13, prelutmaxm
+                APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
+                APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
+                APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
+            %%skip_prelut:
+
+            mova m13, lut3dmaxm
+            APPLY_SCALE 0, scalerm
+            APPLY_SCALE 1, scalegm
+            APPLY_SCALE 2, scalebm
+
+            interp_%1
+
+            %if %4 ; float
+                mov ptrq, dstrm
+                movu [ptrq + xq*4], m0
+                mov ptrq, dstgm
+                movu [ptrq + xq*4], m1
+                mov ptrq, dstbm
+                movu [ptrq + xq*4], m2
+                cmp has_alphad, 0
+                je %%skip_alphaf
+                    mov ptrq, srcam
+                    movu m0, [ptrq + xq*4]
+                    mov ptrq, dstam
+                    movu [ptrq + xq*4], m0
+                %%skip_alphaf:
+            %else
+                ; constants for STORE16
+                movu m5,  [pd_65535f]
+                %if mmsize > 16
+                    movu xm6, [pb_lo_pack_shuffle16]
+                    movu xm7, [pb_hi_pack_shuffle16]
+                %endif
+
+                xorps m15, m15, m15
+                STORE16 dstrm, 0
+                STORE16 dstgm, 1
+                STORE16 dstbm, 2
+
+                cmp has_alphad, 0
+                je %%skip_alpha
+                    %if mmsize > 16
+                        mov ptrq, srcam
+                        movu xm0, [ptrq + xq*2]
+                        mov ptrq, dstam
+                        movu [ptrq + xq*2], xm0
+                    %else
+                        mov ptrq, srcam
+                        movsd xm0, [ptrq + xq*2]
+                        mov ptrq, dstam
+                        movsd [ptrq + xq*2], xm0
+                    %endif
+
+                %%skip_alpha:
+            %endif
+
+            add xq, mmsize/4
+            cmp xd, widthd
+            jl %%loop_x
+
+        INC_DATA_PTR srcrm, src_imageq, 2
+        INC_DATA_PTR srcgm, src_imageq, 0
+        INC_DATA_PTR srcbm, src_imageq, 1
+        INC_DATA_PTR srcam, src_imageq, 3
+
+        INC_DATA_PTR dstrm, dst_imageq, 2
+        INC_DATA_PTR dstgm, dst_imageq, 0
+        INC_DATA_PTR dstbm, dst_imageq, 1
+        INC_DATA_PTR dstam, dst_imageq, 3
+
+        inc slice_startd
+        cmp slice_startd, slice_endd
+        jl %%loop_y
+
+    RET
+%endmacro
+%if ARCH_X86_64
+    %if HAVE_AVX2_EXTERNAL
+        INIT_YMM avx2
+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+    %endif
+    %if HAVE_AVX_EXTERNAL
+        INIT_YMM avx
+        DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+        DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+    %endif
+    INIT_XMM sse2
+    DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
+    DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
+%endif
\ No newline at end of file
diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c
new file mode 100644
index 0000000000..9b9b36e4af
--- /dev/null
+++ b/libavfilter/x86/vf_lut3d_init.c
@@ -0,0 +1,88 @@ 
+/*
+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/lut3d.h"
+
+#define DEFINE_INTERP_FUNC(name, format, opt) /* declares the asm kernel ff_interp_<name>_<format>_<opt> and defines a per-job wrapper for it */ \
+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int has_alpha); \
+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)                                                \
+{                                                                                                                                                   \
+    LUT3DContext *lut3d = ctx->priv;                                                                                                                \
+    Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL; /* NULL makes the kernel skip the prelut stage */                           \
+    ThreadData *td = arg;                                                                                                                           \
+    AVFrame *in  = td->in;                                                                                                                          \
+    AVFrame *out = td->out;                                                                                                                         \
+    int has_alpha = in->linesize[3] && out != in; /* copy plane 3 only when it exists and the filter is not running in place */                     \
+    int slice_start = (in->height *  jobnr   ) / nb_jobs; /* rows [slice_start, slice_end) belong to this job */                                    \
+    int slice_end   = (in->height * (jobnr+1)) / nb_jobs;                                                                                           \
+    ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, slice_end, has_alpha);                                                 \
+    return 0;                                                                                                                                       \
+}
+
+#if ARCH_X86_64
+#if HAVE_AVX2_EXTERNAL
+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx2)
+#endif
+#if HAVE_AVX_EXTERNAL
+    DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
+    DEFINE_INTERP_FUNC(tetrahedral, p16,  avx)
+#endif
+    DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
+    DEFINE_INTERP_FUNC(tetrahedral, p16,  sse2)
+#endif
+
+
+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc)
+{
+    /* Installs an x86 SIMD kernel in s->interp when one fits the configured
+     * interpolation mode and pixel format; otherwise s->interp is left as is.
+     * The kernels are x86-64 only, so the locals live inside the guard to
+     * keep 32-bit builds free of unused variables. */
+#if ARCH_X86_64
+    int cpu_flags = av_get_cpu_flags();
+    int planar    = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
+    int isfloat   = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
+    int depth     = desc->comp[0].depth;
+
+    /* Every kernel below is tetrahedral, planar, and either float or 16-bit. */
+    if (s->interpolation != INTERPOLATE_TETRAHEDRAL || !planar || (!isfloat && depth != 16))
+        return;
+
+    /* Exactly one tier is chosen, best first; tiers never fall through. */
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+#if HAVE_AVX2_EXTERNAL
+        s->interp = isfloat ? interp_tetrahedral_pf32_avx2
+                            : interp_tetrahedral_p16_avx2;
+#endif
+    } else if (EXTERNAL_AVX_FAST(cpu_flags)) {
+#if HAVE_AVX_EXTERNAL
+        s->interp = isfloat ? interp_tetrahedral_pf32_avx
+                            : interp_tetrahedral_p16_avx;
+#endif
+    } else if (EXTERNAL_SSE2(cpu_flags)) {
+        s->interp = isfloat ? interp_tetrahedral_pf32_sse2
+                            : interp_tetrahedral_p16_sse2;
+    }
+#endif
+}