Message ID | 20210929011805.98907-1-mindmark@gmail.com |
---|---|
State | New |
Headers | show |
Series | [FFmpeg-devel] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation | expand |
Context | Check | Description |
---|---|---|
andriy/make_x86 | success | Make finished |
andriy/make_fate_x86 | success | Make fate finished |
andriy/make_ppc | success | Make finished |
andriy/make_fate_ppc | success | Make fate finished |
Hello, Excuse me, how about FMADD on AVX2 platform? For example + mulps m7, m7, m14 + addps m0, m0, m7 ==> fmadd231ps m0,m7,m14 Regards, Min Chen 2021-09-29 09:18:05,mindmark@gmail.com >From: Mark Reid <mindmark@gmail.com> > >Only supports float and 16bit planar formats at the moment. >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some >speed gains. > >f32 1920x1080 1 thread with prelut >c impl >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips > >sse2 >948662320 UNITS in lut3d->interp, 1 runs, 0 skips >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips > >avx >633837000 UNITS in lut3d->interp, 1 runs, 0 skips >669452850 UNITS in lut3d->interp, 2 runs, 0 skips >650716580 UNITS in lut3d->interp, 4 runs, 0 skips >644698550 UNITS in lut3d->interp, 8 runs, 0 skips > >avx2 >354940020 UNITS in lut3d->interp, 1 runs, 0 skips >362384340 UNITS in lut3d->interp, 2 runs, 0 skips >356799020 UNITS in lut3d->interp, 4 runs, 0 skips >357276815 UNITS in lut3d->interp, 8 runs, 0 skips > >gbrap16 1920x1080 1 thread with prelut >c impl >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips >1477959120 UNITS in lut3d->interp, 2 runs, 0 skips >1472102670 UNITS in lut3d->interp, 4 runs, 0 skips >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips > >sse2 >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips >1064583037 UNITS in lut3d->interp, 8 runs, 0 skips > >avx >678089880 UNITS in lut3d->interp, 1 runs, 0 skips >679112485 UNITS in lut3d->interp, 2 runs, 0 skips >695527212 UNITS in lut3d->interp, 4 runs, 0 skips >691300053 UNITS in lut3d->interp, 8 runs, 0 skips > >avx2 >372671340 UNITS in lut3d->interp, 
1 runs, 0 skips >373449870 UNITS in lut3d->interp, 2 runs, 0 skips >383725625 UNITS in lut3d->interp, 4 runs, 0 skips >382860848 UNITS in lut3d->interp, 8 runs, 0 skips > >--- > libavfilter/lut3d.h | 83 ++++ > libavfilter/vf_lut3d.c | 61 +-- > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_lut3d_init.c | 88 ++++ > 5 files changed, 935 insertions(+), 56 deletions(-) > create mode 100644 libavfilter/lut3d.h > create mode 100644 libavfilter/x86/vf_lut3d.asm > create mode 100644 libavfilter/x86/vf_lut3d_init.c > >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h >new file mode 100644 >index 0000000000..ded2a036a5 >--- /dev/null >+++ b/libavfilter/lut3d.h >@@ -0,0 +1,83 @@ >+/* >+ * Copyright (c) 2013 Clément Bœsch >+ * Copyright (c) 2018 Paul B Mahol >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. >+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. 
>+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >+ */ >+#ifndef AVFILTER_LUT3D_H >+#define AVFILTER_LUT3D_H >+ >+#include "libavutil/pixdesc.h" >+#include "framesync.h" >+#include "avfilter.h" >+ >+enum interp_mode { >+ INTERPOLATE_NEAREST, >+ INTERPOLATE_TRILINEAR, >+ INTERPOLATE_TETRAHEDRAL, >+ INTERPOLATE_PYRAMID, >+ INTERPOLATE_PRISM, >+ NB_INTERP_MODE >+}; >+ >+struct rgbvec { >+ float r, g, b; >+}; >+ >+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT >+ * of 512x512 (64x64x64) */ >+#define MAX_LEVEL 256 >+#define PRELUT_SIZE 65536 >+ >+typedef struct Lut3DPreLut { >+ int size; >+ float min[3]; >+ float max[3]; >+ float scale[3]; >+ float* lut[3]; >+} Lut3DPreLut; >+ >+typedef struct LUT3DContext { >+ const AVClass *class; >+ struct rgbvec *lut; >+ int lutsize; >+ int lutsize2; >+ struct rgbvec scale; >+ int interpolation; ///<interp_mode >+ char *file; >+ uint8_t rgba_map[4]; >+ int step; >+ avfilter_action_func *interp; >+ Lut3DPreLut prelut; >+#if CONFIG_HALDCLUT_FILTER >+ uint8_t clut_rgba_map[4]; >+ int clut_step; >+ int clut_bits; >+ int clut_planar; >+ int clut_float; >+ int clut_width; >+ FFFrameSync fs; >+#endif >+} LUT3DContext; >+ >+typedef struct ThreadData { >+ AVFrame *in, *out; >+} ThreadData; >+ >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); >+ >+#endif /* AVFILTER_LUT3D_H */ >\ No newline at end of file >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c >index 9fbda833b9..1fd0af06db 100644 >--- a/libavfilter/vf_lut3d.c >+++ b/libavfilter/vf_lut3d.c >@@ -31,73 +31,18 @@ > #include "libavutil/intreadwrite.h" > #include "libavutil/intfloat.h" > #include "libavutil/avassert.h" >-#include "libavutil/pixdesc.h" > #include "libavutil/avstring.h" >-#include "avfilter.h" > #include "drawutils.h" > #include 
"formats.h" >-#include "framesync.h" > #include "internal.h" > #include "video.h" >+#include "lut3d.h" > > #define R 0 > #define G 1 > #define B 2 > #define A 3 > >-enum interp_mode { >- INTERPOLATE_NEAREST, >- INTERPOLATE_TRILINEAR, >- INTERPOLATE_TETRAHEDRAL, >- INTERPOLATE_PYRAMID, >- INTERPOLATE_PRISM, >- NB_INTERP_MODE >-}; >- >-struct rgbvec { >- float r, g, b; >-}; >- >-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT >- * of 512x512 (64x64x64) */ >-#define MAX_LEVEL 256 >-#define PRELUT_SIZE 65536 >- >-typedef struct Lut3DPreLut { >- int size; >- float min[3]; >- float max[3]; >- float scale[3]; >- float* lut[3]; >-} Lut3DPreLut; >- >-typedef struct LUT3DContext { >- const AVClass *class; >- int interpolation; ///<interp_mode >- char *file; >- uint8_t rgba_map[4]; >- int step; >- avfilter_action_func *interp; >- struct rgbvec scale; >- struct rgbvec *lut; >- int lutsize; >- int lutsize2; >- Lut3DPreLut prelut; >-#if CONFIG_HALDCLUT_FILTER >- uint8_t clut_rgba_map[4]; >- int clut_step; >- int clut_bits; >- int clut_planar; >- int clut_float; >- int clut_width; >- FFFrameSync fs; >-#endif >-} LUT3DContext; >- >-typedef struct ThreadData { >- AVFrame *in, *out; >-} ThreadData; >- > #define OFFSET(x) offsetof(LUT3DContext, x) > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM > #define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) > av_assert0(0); > } > >+ if (ARCH_X86) { >+ ff_lut3d_init_x86(lut3d, desc); >+ } >+ > return 0; > } > >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >index 016a5b3511..a29941eaeb 100644 >--- a/libavfilter/x86/Makefile >+++ b/libavfilter/x86/Makefile >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o > OBJS-$(CONFIG_LIMITER_FILTER) 
+= x86/vf_limiter_init.o >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o > OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm >new file mode 100644 >index 0000000000..b3d7c3962b >--- /dev/null >+++ b/libavfilter/x86/vf_lut3d.asm >@@ -0,0 +1,757 @@ >+;***************************************************************************** >+;* x86-optimized functions for lut3d filter >+;* >+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com> >+;* >+;* This file is part of FFmpeg. >+;* >+;* FFmpeg is free software; you can redistribute it and/or >+;* modify it under the terms of the GNU Lesser General Public >+;* License as published by the Free Software Foundation; either >+;* version 2.1 of the License, or (at your option) any later version. >+;* >+;* FFmpeg is distributed in the hope that it will be useful, >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+;* Lesser General Public License for more details. 
>+;* >+;* You should have received a copy of the GNU Lesser General Public >+;* License along with FFmpeg; if not, write to the Free Software >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >+;****************************************************************************** >+ >+%include "libavutil/x86/x86util.asm" >+ >+SECTION_RODATA >+pd_1f: times 8 dd 1.0 >+pd_3f: times 8 dd 3.0 >+ >+; used to limit rshifts as they are more expensive in avx1 >+pd_001: times 8 dd 001b >+pd_010: times 8 dd 010b >+pd_100: times 8 dd 100b >+ >+pd_65535f: times 8 dd 65535.0 >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 >+ >+pb_shuffle16: db 0, 1, 0x80, 0x80, \ >+ 2, 3, 0x80, 0x80, \ >+ 4, 5, 0x80, 0x80, \ >+ 6, 7, 0x80, 0x80 >+ >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ >+ 8, 9, 12, 13, \ >+ 0x80, 0x80, 0x80, 0x80, \ >+ 0x80, 0x80, 0x80, 0x80 >+ >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ >+ 0x80, 0x80, 0x80, 0x80, \ >+ 0, 1, 4, 5, \ >+ 8, 9, 12, 13 >+ >+; tetrahedral table -------------------------------------------- >+; name: x2| x1| x0| cxxb| cxxa >+; values: r 00| r 00| r 00| c011 011| c001 001 >+; g 01| g 01| g 01| c101 101| c010 010 >+; b 10| b 10| b 10| c110 110| c100 100 >+ >+; g>b b | g | r | c110 | c100 >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b >+; r>b g | b | r | c101 | c100 >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b >+; else g | r | b | c101 | c001 >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b >+; b>g r | g | b | c011 | c001 >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b >+; b>r r | b | g | c011 | c010 >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b >+; else b | r | g | c110 | c010 >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b >+ >+SECTION .text >+ >+struc Lut3DPreLut >+ 
.size: resd 1 >+ .min: resd 3 >+ .max: resd 3 >+ .scale: resd 3 >+ .lut: resq 3 >+endstruc >+ >+struc LUT3DContext >+ .class: resq 1 >+ .lut: resq 1 >+ .lutsize: resd 1 >+ .lutsize2: resd 1 >+ .scale: resd 3 >+endstruc >+ >+%define AV_NUM_DATA_POINTERS 8 >+ >+struc AVFrame >+ .data: resq AV_NUM_DATA_POINTERS >+ .linesize: resd AV_NUM_DATA_POINTERS >+ .extended_data: resq 1 >+ .width: resd 1 >+ .height: resd 1 >+endstruc >+ >+%define rm rsp >+%define gm rsp+mmsize >+%define bm rsp+(mmsize*2) >+ >+%define lut3dsizem [rsp+mmsize*3] >+%define lut3dsize2m [rsp+mmsize*4] >+%define lut3dmaxm [rsp+mmsize*5] >+%define prelutmaxm [rsp+mmsize*6] >+ >+%define scalerm [rsp+mmsize*7] >+%define scalegm [rsp+mmsize*8] >+%define scalebm [rsp+mmsize*9] >+ >+%define prelutminrm [rsp+mmsize*10] >+%define prelutmingm [rsp+mmsize*11] >+%define prelutminbm [rsp+mmsize*12] >+ >+%define prelutscalerm [rsp+mmsize*13] >+%define prelutscalegm [rsp+mmsize*14] >+%define prelutscalebm [rsp+mmsize*15] >+ >+; data pointers >+%define srcrm [rsp+mmsize*16 + 0] >+%define srcgm [rsp+mmsize*16 + 8] >+%define srcbm [rsp+mmsize*16 + 16] >+%define srcam [rsp+mmsize*16 + 24] >+ >+%define dstrm [rsp+mmsize*16 + 32] >+%define dstgm [rsp+mmsize*16 + 40] >+%define dstbm [rsp+mmsize*16 + 48] >+%define dstam [rsp+mmsize*16 + 56] >+ >+%macro FETCH_PRELUT_PN 3 >+ mov tmp2d, [rm + %3] >+ mov tmp3d, [gm + %3] >+ movss xm%1, [tmpq + tmp2q*4] >+ movss xm%2, [tmpq + tmp3q*4] >+ movss [rm + %3], xm%1 >+ movss [gm + %3], xm%2 >+%endmacro >+ >+; 1 - p >+; 2 - n >+; 3 - p indices >+; 4 - n indices >+%macro GATHER_PRELUT 4 >+ %if cpuflag(avx2) >+ vpcmpeqb m7, m7 >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p >+ vpcmpeqb m9, m9 >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n >+ %else >+ mova [rm], m%3 >+ mova [gm], m%4 >+ FETCH_PRELUT_PN %1, %2, 0 >+ FETCH_PRELUT_PN %1, %2, 4 >+ FETCH_PRELUT_PN %1, %2, 8 >+ FETCH_PRELUT_PN %1, %2, 12 >+ %if mmsize > 16 >+ FETCH_PRELUT_PN %1, %2, 16 >+ FETCH_PRELUT_PN %1, %2, 20 >+ FETCH_PRELUT_PN %1, 
%2, 24 >+ FETCH_PRELUT_PN %1, %2, 28 >+ %endif >+ movu m%1, [rm] >+ movu m%2, [gm] >+ %endif >+%endmacro >+ >+%macro FLOORPS 2 >+ %if mmsize > 16 >+ vroundps %1, %2, 0x01 >+ %else >+ cvttps2dq %1, %2 >+ cvtdq2ps %1, %1 >+ %endif >+%endmacro >+ >+; 1 - dst >+; 2 - index >+; 3 - min >+; 4 - scale >+; assumes lut max m13, m14 1.0f, zero m15 >+%macro APPLY_PRELUT 4 >+ ; scale >+ subps m5, m%1, %3 ; v - min >+ mulps m5, m5, %4 ; v * scale >+ ; clamp >+ maxps m5, m5, m15 ; max zero >+ minps m5, m5, m13 ; min lut max >+ >+ FLOORPS m3, m5 ; prev index >+ subps m5, m5, m3 ; d >+ addps m4, m3, m14 ; p+1 = n index >+ minps m4, m4, m13 ; clamp n idex >+ >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] >+ cvttps2dq m6, m3 >+ cvttps2dq m10, m4 >+ GATHER_PRELUT 3, 4, 6, 10 >+ >+ ; lerp >+ subps m8, m4, m3 >+ mulps m8, m8, m5 >+ addps m%1, m8, m3 >+%endmacro >+ >+; 1 - dst >+; 2 - scale >+; assumes lut max m13, zero m15 >+%macro APPLY_SCALE 2 >+ mulps m%1, m%1, %2 >+ maxps m%1, m%1, m15 >+ minps m%1, m%1, m13 >+%endmacro >+ >+%macro BLEND 4 >+%if mmsize > 16 >+ vblendvps %1, %2, %3, %4 >+%else >+ %ifidni %1,%2 >+ %error operand 1 must not equal operand 2 >+ %endif >+ %ifidni %1,%3 >+ %error operand 1 must not equal operand 3 >+ %endif >+ mova %1, %2 >+ xorps %1, %3 >+ andps %1, %4 >+ xorps %1, %2 >+%endif >+%endmacro >+ >+; sets nans to zere, +inf -inf handled later by min/max clamps >+%macro SANITIZE_F 1 >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False >+ %if mmsize <= 16 >+ mova m6, %1 >+ BLEND %1, m15, m6, m5 >+ %else >+ BLEND %1, m15, %1, m5 >+ %endif >+%endmacro >+ >+%macro ADD3 4 >+ addps %1, %2, %3 >+ addps %1, %1, %4 >+%endmacro >+ >+%macro CMP_EQUAL 3 >+%if cpuflag(avx2) >+ vpcmpeqd %1, %2, %3 >+%elif cpuflag(avx) >+ cmpps %1, %2, %3, 0x0 >+%else >+ pcmpeqd %1, %2, %3 >+%endif >+%endmacro >+ >+%macro SHIFT_RIGHT 2 >+%if mmsize <= 16 >+ psrld xm%1, %2 >+%elif cpuflag(avx2) >+ vpsrld m%1, m%1, %2 >+%else >+ vextractf128 xm15, m%1, 1 >+ psrld xm%1, %2 >+ psrld xm15, %2 >+ 
vinsertf128 m%1, m%1, xm15, 1 >+%endif >+%endmacro >+ >+%macro FETCH_LUT3D_RGB 4 >+ mov tmp2d, [rm + %4] >+ movss xm%1, [tmpq + tmp2q*4 + 0] >+ movss xm%2, [tmpq + tmp2q*4 + 4] >+ movss xm%3, [tmpq + tmp2q*4 + 8] >+ movss [rm + %4], xm%1 >+ movss [gm + %4], xm%2 >+ movss [bm + %4], xm%3 >+%endmacro >+ >+; 1 - dstr >+; 2 - dstg >+; 3 - dstb >+; 4 - indices >+%macro GATHER_LUT3D_INDICES 4 >+%if cpuflag(avx2) >+ vpcmpeqb m3, m3 >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3 >+ vpcmpeqb m14, m14 >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14 >+ vpcmpeqb m15, m15 >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15 >+%else >+ movu [rm], m%4 >+ FETCH_LUT3D_RGB %1, %2, %3, 0 >+ FETCH_LUT3D_RGB %1, %2, %3, 4 >+ FETCH_LUT3D_RGB %1, %2, %3, 8 >+ FETCH_LUT3D_RGB %1, %2, %3, 12 >+%if mmsize > 16 >+ FETCH_LUT3D_RGB %1, %2, %3, 16 >+ FETCH_LUT3D_RGB %1, %2, %3, 20 >+ FETCH_LUT3D_RGB %1, %2, %3, 24 >+ FETCH_LUT3D_RGB %1, %2, %3, 28 >+%endif >+ movu m%1, [rm] >+ movu m%2, [gm] >+ movu m%3, [bm] >+%endif >+%endmacro >+ >+%macro interp_tetrahedral 0 >+ %define d_r m0 >+ %define d_g m1 >+ %define d_b m2 >+ >+ %define prev_r m3 >+ %define prev_g m4 >+ %define prev_b m5 >+ >+ %define next_r m6 >+ %define next_g m7 >+ %define next_b m8 >+ >+ %define x0 m4 >+ %define x1 m5 >+ %define x2 m6 >+ >+ ; setup prev index >+ FLOORPS prev_r, m0 >+ FLOORPS prev_g, m1 >+ FLOORPS prev_b, m2 >+ >+ ; setup deltas >+ subps d_r, m0, prev_r >+ subps d_g, m1, prev_g >+ subps d_b, m2, prev_b >+ >+ ; calculate select mask m9 >+ movu m6, [pd_tetra_table2] >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ >+ BLEND m10, m6, [pd_tetra_table1], m7 >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ >+ BLEND m6, m10, [pd_tetra_table0], m7 >+ >+ movu m10, [pd_tetra_table5] >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ >+ BLEND m9, m10, [pd_tetra_table4], m7 >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ >+ BLEND m10, m9, [pd_tetra_table3], m7 >+ >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ >+ BLEND m9, m10, m6, m7 >+ >+ ; setup next index >+ 
addps next_r, prev_r, m14 ; +1 >+ minps next_r, next_r, m13 ; clamp lutmax >+ >+ addps next_g, prev_g, m14 ; +1 >+ minps next_g, next_g, m13 ; clamp lutmax >+ >+ addps next_b, prev_b, m14 ; +1 >+ minps next_b, next_b, m13 ; clamp lutmax >+ >+ ; prescale indices >+ mulps prev_r, prev_r, lut3dsize2m >+ mulps next_r, next_r, lut3dsize2m >+ >+ mulps prev_g, prev_g, lut3dsizem >+ mulps next_g, next_g, lut3dsizem >+ >+ mulps prev_b, prev_b, [pd_3f] >+ mulps next_b, next_b, [pd_3f] >+ >+ movu m14, [pd_001] >+ >+ ; cxxa m10 >+ ; b >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m10, prev_b, next_b, m15 >+ >+ ; g >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m12, prev_g, next_g, m15 >+ >+ ; r >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m13, prev_r, next_r, m15 >+ >+ ADD3 m10, m10, m12, m13 >+ >+ SHIFT_RIGHT 9, 3 ; 3 >+ >+ ; cxxb m11; >+ ; b >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m11, prev_b, next_b, m15 >+ >+ ; g >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m12, prev_g, next_g, m15 >+ >+ ; r >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m13, prev_r, next_r, m15 >+ >+ ADD3 m11, m11, m12, m13 >+ >+ ; c000 m12; >+ ADD3 m12, prev_r, prev_g, prev_b >+ >+ ; c111 m13; >+ ADD3 m13, next_r, next_g, next_b >+ >+ SHIFT_RIGHT 9, 3 ; 6 >+ >+ ; x0, m4 >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND x0, m7, d_b, m15 ; b >+ >+ ; x1, m5 >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ SHIFT_RIGHT 9, 3 ; 9 >+ >+ andps m15, m9, m14 >+ CMP_EQUAL m15, m15, m14 >+ BLEND x1, m7, d_b, m15 ; b >+ >+ ; x2, m6 >+ andps m15, m9, [pd_010] >+ CMP_EQUAL m15, m15, [pd_010] >+ BLEND m7, d_r, d_g, m15 ; r,g >+ >+ andps m15, m9, [pd_100] >+ CMP_EQUAL m15, m15, [pd_100] >+ BLEND x2, m7, d_b, m15 ; b >+ >+ ; convert indices to integer >+ 
cvttps2dq m12, m12 >+ cvttps2dq m10, m10 >+ cvttps2dq m11, m11 >+ cvttps2dq m13, m13 >+ >+ ; now the gathering festival >+ mov tmpq, [ctxq + LUT3DContext.lut] >+ >+ GATHER_LUT3D_INDICES 0, 1, 2, 12 >+ movu m14, [pd_1f] >+ subps m14, m14, x0; 1 - x0 >+ >+ mulps m0, m0, m14 >+ mulps m1, m1, m14 >+ mulps m2, m2, m14 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 10 >+ subps m14, x0, x1; x0 - x1 >+ mulps m7, m7, m14 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, m14 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, m14 >+ addps m2, m2, m9 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 11 >+ subps m14, x1, x2; x1 - x2 >+ >+ mulps m7, m7, m14 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, m14 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, m14 >+ addps m2, m2, m9 >+ >+ GATHER_LUT3D_INDICES 7, 8, 9, 13 >+ mulps m7, m7, x2 >+ addps m0, m0, m7 >+ >+ mulps m8, m8, x2 >+ addps m1, m1, m8 >+ >+ mulps m9, m9, x2 >+ addps m2, m2, m9 >+%endmacro >+ >+%macro INIT_DATA_PTR 3 >+ mov ptrq, [%2 + AVFrame.data + %3 * 8] >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >+ imul tmpd, slice_startd >+ add ptrq, tmpq >+ mov %1, ptrq >+%endmacro >+ >+%macro INC_DATA_PTR 3 >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >+ mov ptrq, %1 >+ add ptrq, tmpq >+ mov %1, ptrq >+%endmacro >+ >+%macro LOAD16 2 >+ mov ptrq, %2 >+ %if mmsize > 16 >+ movu xm%1, [ptrq + xq*2] >+ %else >+ movsd xm%1, [ptrq + xq*2] >+ %endif >+ %if cpuflag(avx2) >+ vpmovzxwd m%1, xm%1 >+ %else >+ %if mmsize > 16 >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) >+ pshufb xm%1, xm6 ; pb_shuffle16 >+ pshufb xm4, xm6 ; pb_shuffle16 >+ vinsertf128 m%1, m%1, xm4, 1 >+ %else >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ %endif >+ %endif >+ cvtdq2ps m%1, m%1 >+ mulps m%1, m%1, m7 ; pd_65535_invf >+%endmacro >+ >+%macro STORE16 2 >+ mulps m%2, m%2, m5 ; [pd_65535f] >+ minps m%2, m%2, m5 ; [pd_65535f] >+ maxps m%2, m%2, m15 ; zero >+ cvttps2dq m%2, m%2 
>+ %if mmsize > 16 >+ vextractf128 xm4, m%2, 1 >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] >+ por xm%2, xm4 >+ %else >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) >+ %endif >+ mov ptrq, %1 >+ %if mmsize > 16 >+ movu [ptrq + xq*2], xm%2 >+ %else >+ movsd [ptrq + xq*2], xm%2 >+ %endif >+%endmacro >+ >+; 1 - interp method >+; 2 - format_name >+; 3 - depth >+; 4 - is float format >+%macro DEFINE_INTERP_FUNC 4 >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3 >+ ; store lut max and lutsize >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize] >+ cvtsi2ss xm0, tmpd >+ mulss xm0, xm0, [pd_3f] >+ VBROADCASTSS m0, xm0 >+ mova lut3dsizem, m0 >+ sub tmpd, 1 >+ cvtsi2ss xm0, tmpd >+ VBROADCASTSS m0, xm0 >+ mova lut3dmaxm, m0 >+ >+ ; scale_r >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] >+ VBROADCASTSS m1, xm1 >+ mova scalerm, m1 >+ >+ ; scale_g >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] >+ VBROADCASTSS m1, xm1 >+ mova scalegm, m1 >+ >+ ; scale_b >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] >+ VBROADCASTSS m1, xm1 >+ mova scalebm, m1 >+ >+ ; store lutsize2 >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] >+ mulss xm0, xm0, [pd_3f] >+ VBROADCASTSS m0, xm0 >+ mova lut3dsize2m, m0 >+ >+ ; init prelut values >+ cmp prelutq, 0 >+ je %%skip_init_prelut >+ mov tmpd, dword [prelutq + Lut3DPreLut.size] >+ sub tmpd, 1 >+ cvtsi2ss xm0, tmpd >+ VBROADCASTSS m0, xm0 >+ mova prelutmaxm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] >+ mova prelutminrm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] >+ mova prelutmingm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] >+ mova prelutminbm, m0 >+ VBROADCASTSS m0, dword [prelutq + 
Lut3DPreLut.scale + 0*4] >+ mova prelutscalerm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4] >+ mova prelutscalegm, m0 >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] >+ mova prelutscalebm, m0 >+ %%skip_init_prelut: >+ >+ mov widthd, [src_imageq + AVFrame.width] >+ >+ ; gbra pixel order >+ INIT_DATA_PTR srcrm, src_imageq, 2 >+ INIT_DATA_PTR srcgm, src_imageq, 0 >+ INIT_DATA_PTR srcbm, src_imageq, 1 >+ INIT_DATA_PTR srcam, src_imageq, 3 >+ >+ INIT_DATA_PTR dstrm, dst_imageq, 2 >+ INIT_DATA_PTR dstgm, dst_imageq, 0 >+ INIT_DATA_PTR dstbm, dst_imageq, 1 >+ INIT_DATA_PTR dstam, dst_imageq, 3 >+ >+ %%loop_y: >+ xor xq, xq >+ %%loop_x: >+ movu m14, [pd_1f] >+ xorps m15, m15, m15 >+ %if %4 ; float >+ mov ptrq, srcrm >+ movu m0, [ptrq + xq*4] >+ mov ptrq, srcgm >+ movu m1, [ptrq + xq*4] >+ mov ptrq, srcbm >+ movu m2, [ptrq + xq*4] >+ SANITIZE_F m0 >+ SANITIZE_F m1 >+ SANITIZE_F m2 >+ %else >+ ; constants for LOAD16 >+ movu m7, [pd_65535_invf] >+ %if notcpuflag(avx2) && mmsize >= 32 >+ movu xm6, [pb_shuffle16] >+ %endif >+ LOAD16 0, srcrm >+ LOAD16 1, srcgm >+ LOAD16 2, srcbm >+ %endif >+ >+ cmp prelutq, 0 >+ je %%skip_prelut >+ mova m13, prelutmaxm >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm >+ %%skip_prelut: >+ >+ mova m13, lut3dmaxm >+ APPLY_SCALE 0, scalerm >+ APPLY_SCALE 1, scalegm >+ APPLY_SCALE 2, scalebm >+ >+ interp_%1 >+ >+ %if %4 ; float >+ mov ptrq, dstrm >+ movu [ptrq + xq*4], m0 >+ mov ptrq, dstgm >+ movu [ptrq + xq*4], m1 >+ mov ptrq, dstbm >+ movu [ptrq + xq*4], m2 >+ cmp has_alphad, 0 >+ je %%skip_alphaf >+ mov ptrq, srcam >+ movu m0, [ptrq + xq*4] >+ mov ptrq, dstam >+ movu [ptrq + xq*4], m0 >+ %%skip_alphaf: >+ %else >+ ; constants for STORE16 >+ movu m5, [pd_65535f] >+ %if mmsize > 16 >+ movu xm6, [pb_lo_pack_shuffle16] >+ movu xm7, [pb_hi_pack_shuffle16] >+ %endif >+ >+ xorps m15, m15, m15 >+ STORE16 dstrm, 0 >+ 
STORE16 dstgm, 1 >+ STORE16 dstbm, 2 >+ >+ cmp has_alphad, 0 >+ je %%skip_alpha >+ %if mmsize > 16 >+ mov ptrq, srcam >+ movu xm0, [ptrq + xq*2] >+ mov ptrq, dstam >+ movu [ptrq + xq*2], xm0 >+ %else >+ mov ptrq, srcam >+ movsd xm0, [ptrq + xq*2] >+ mov ptrq, dstam >+ movsd [ptrq + xq*2], xm0 >+ %endif >+ >+ %%skip_alpha: >+ %endif >+ >+ add xq, mmsize/4 >+ cmp xd, widthd >+ jl %%loop_x >+ >+ INC_DATA_PTR srcrm, src_imageq, 2 >+ INC_DATA_PTR srcgm, src_imageq, 0 >+ INC_DATA_PTR srcbm, src_imageq, 1 >+ INC_DATA_PTR srcam, src_imageq, 3 >+ >+ INC_DATA_PTR dstrm, dst_imageq, 2 >+ INC_DATA_PTR dstgm, dst_imageq, 0 >+ INC_DATA_PTR dstbm, dst_imageq, 1 >+ INC_DATA_PTR dstam, dst_imageq, 3 >+ >+ inc slice_startd >+ cmp slice_startd, slice_endd >+ jl %%loop_y >+ >+ RET >+%endmacro >+%if ARCH_X86_64 >+ %if HAVE_AVX2_EXTERNAL >+ INIT_YMM avx2 >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+ %endif >+ %if HAVE_AVX_EXTERNAL >+ INIT_YMM avx >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+ %endif >+ INIT_XMM sse2 >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >+%endif >\ No newline at end of file >diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c >new file mode 100644 >index 0000000000..9b9b36e4af >--- /dev/null >+++ b/libavfilter/x86/vf_lut3d_init.c >@@ -0,0 +1,88 @@ >+/* >+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com> >+ * >+ * This file is part of FFmpeg. >+ * >+ * FFmpeg is free software; you can redistribute it and/or >+ * modify it under the terms of the GNU Lesser General Public >+ * License as published by the Free Software Foundation; either >+ * version 2.1 of the License, or (at your option) any later version. 
>+ * >+ * FFmpeg is distributed in the hope that it will be useful, >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >+ * Lesser General Public License for more details. >+ * >+ * You should have received a copy of the GNU Lesser General Public >+ * License along with FFmpeg; if not, write to the Free Software >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA >+ */ >+ >+#include "libavutil/attributes.h" >+#include "libavutil/cpu.h" >+#include "libavutil/x86/cpu.h" >+#include "libavfilter/lut3d.h" >+ >+#define DEFINE_INTERP_FUNC(name, format, opt) \ >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int has_alpha); \ >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \ >+{ \ >+ LUT3DContext *lut3d = ctx->priv; \ >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? 
&lut3d->prelut: NULL; \ >+ ThreadData *td = arg; \ >+ AVFrame *in = td->in; \ >+ AVFrame *out = td->out; \ >+ int has_alpha = in->linesize[3] && out != in; \ >+ int slice_start = (in->height * jobnr ) / nb_jobs; \ >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; \ >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, slice_end, has_alpha); \ >+ return 0; \ >+} >+ >+#if ARCH_X86_64 >+#if HAVE_AVX2_EXTERNAL >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) >+#endif >+#if HAVE_AVX_EXTERNAL >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx) >+#endif >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) >+#endif >+ >+ >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc) >+{ >+ int cpu_flags = av_get_cpu_flags(); >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; >+ int depth = desc->comp[0].depth; >+ >+#if ARCH_X86_64 >+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { >+#if HAVE_AVX2_EXTERNAL >+ if (isfloat && planar) { >+ s->interp = interp_tetrahedral_pf32_avx2; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_avx2; >+ } >+#endif >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { >+#if HAVE_AVX_EXTERNAL >+ if (isfloat) { >+ s->interp = interp_tetrahedral_pf32_avx; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_avx; >+ } >+#endif >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { >+ if (isfloat) { >+ s->interp = interp_tetrahedral_pf32_sse2; >+ } else if (depth == 16) { >+ s->interp = interp_tetrahedral_p16_sse2; >+ } >+ } >+#endif >+} >-- >2.31.1.windows.1 > >_______________________________________________ >ffmpeg-devel mailing list 
>ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".
On Tue, Sep 28, 2021 at 6:38 PM chen <chenm003@163.com> wrote: > Hello, > > > Excuse me, how about FMADD on AVX2 platform? > > > For example > + mulps m7, m7, m14 > + addps m0, m0, m7 > > ==> > > > fmadd231ps m0,m7,m14 > > Interesting, does having AVX2 guarantee having FMA instructions? > > Regards, > Min Chen > > > 2021-09-29 09:18:05,mindmark@gmail.com > >From: Mark Reid <mindmark@gmail.com> > > > >Only supports float and 16bit planar formats at the moment. > >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer > some > >speed gains. > > > >f32 1920x1080 1 thread with prelut > >c impl > >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips > >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips > >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips > >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips > > > >sse2 > >948662320 UNITS in lut3d->interp, 1 runs, 0 skips > >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips > >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips > >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips > > > >avx > >633837000 UNITS in lut3d->interp, 1 runs, 0 skips > >669452850 UNITS in lut3d->interp, 2 runs, 0 skips > >650716580 UNITS in lut3d->interp, 4 runs, 0 skips > >644698550 UNITS in lut3d->interp, 8 runs, 0 skips > > > >avx2 > >354940020 UNITS in lut3d->interp, 1 runs, 0 skips > >362384340 UNITS in lut3d->interp, 2 runs, 0 skips > >356799020 UNITS in lut3d->interp, 4 runs, 0 skips > >357276815 UNITS in lut3d->interp, 8 runs, 0 skips > > > >gbrap16 1920x1080 1 thread with prelut > >c impl > >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips > >1477959120 UNITS in lut3d->interp, 2 runs, 0 skips > >1472102670 UNITS in lut3d->interp, 4 runs, 0 skips > >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips > > > >sse2 > >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips > >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips > >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips > >1064583037 UNITS in lut3d->interp, 8 runs, 0 
skips > > > >avx > >678089880 UNITS in lut3d->interp, 1 runs, 0 skips > >679112485 UNITS in lut3d->interp, 2 runs, 0 skips > >695527212 UNITS in lut3d->interp, 4 runs, 0 skips > >691300053 UNITS in lut3d->interp, 8 runs, 0 skips > > > >avx2 > >372671340 UNITS in lut3d->interp, 1 runs, 0 skips > >373449870 UNITS in lut3d->interp, 2 runs, 0 skips > >383725625 UNITS in lut3d->interp, 4 runs, 0 skips > >382860848 UNITS in lut3d->interp, 8 runs, 0 skips > > > >--- > > libavfilter/lut3d.h | 83 ++++ > > libavfilter/vf_lut3d.c | 61 +-- > > libavfilter/x86/Makefile | 2 + > > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ > > libavfilter/x86/vf_lut3d_init.c | 88 ++++ > > 5 files changed, 935 insertions(+), 56 deletions(-) > > create mode 100644 libavfilter/lut3d.h > > create mode 100644 libavfilter/x86/vf_lut3d.asm > > create mode 100644 libavfilter/x86/vf_lut3d_init.c > > > >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h > >new file mode 100644 > >index 0000000000..ded2a036a5 > >--- /dev/null > >+++ b/libavfilter/lut3d.h > >@@ -0,0 +1,83 @@ > >+/* > >+ * Copyright (c) 2013 Clément Bœsch > >+ * Copyright (c) 2018 Paul B Mahol > >+ * > >+ * This file is part of FFmpeg. > >+ * > >+ * FFmpeg is free software; you can redistribute it and/or > >+ * modify it under the terms of the GNU Lesser General Public > >+ * License as published by the Free Software Foundation; either > >+ * version 2.1 of the License, or (at your option) any later version. > >+ * > >+ * FFmpeg is distributed in the hope that it will be useful, > >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of > >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >+ * Lesser General Public License for more details. 
> >+ * > >+ * You should have received a copy of the GNU Lesser General Public > >+ * License along with FFmpeg; if not, write to the Free Software > >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > >+ */ > >+#ifndef AVFILTER_LUT3D_H > >+#define AVFILTER_LUT3D_H > >+ > >+#include "libavutil/pixdesc.h" > >+#include "framesync.h" > >+#include "avfilter.h" > >+ > >+enum interp_mode { > >+ INTERPOLATE_NEAREST, > >+ INTERPOLATE_TRILINEAR, > >+ INTERPOLATE_TETRAHEDRAL, > >+ INTERPOLATE_PYRAMID, > >+ INTERPOLATE_PRISM, > >+ NB_INTERP_MODE > >+}; > >+ > >+struct rgbvec { > >+ float r, g, b; > >+}; > >+ > >+/* 3D LUT don't often go up to level 32, but it is common to have a Hald > CLUT > >+ * of 512x512 (64x64x64) */ > >+#define MAX_LEVEL 256 > >+#define PRELUT_SIZE 65536 > >+ > >+typedef struct Lut3DPreLut { > >+ int size; > >+ float min[3]; > >+ float max[3]; > >+ float scale[3]; > >+ float* lut[3]; > >+} Lut3DPreLut; > >+ > >+typedef struct LUT3DContext { > >+ const AVClass *class; > >+ struct rgbvec *lut; > >+ int lutsize; > >+ int lutsize2; > >+ struct rgbvec scale; > >+ int interpolation; ///<interp_mode > >+ char *file; > >+ uint8_t rgba_map[4]; > >+ int step; > >+ avfilter_action_func *interp; > >+ Lut3DPreLut prelut; > >+#if CONFIG_HALDCLUT_FILTER > >+ uint8_t clut_rgba_map[4]; > >+ int clut_step; > >+ int clut_bits; > >+ int clut_planar; > >+ int clut_float; > >+ int clut_width; > >+ FFFrameSync fs; > >+#endif > >+} LUT3DContext; > >+ > >+typedef struct ThreadData { > >+ AVFrame *in, *out; > >+} ThreadData; > >+ > >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); > >+ > >+#endif /* AVFILTER_LUT3D_H */ > >\ No newline at end of file > >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c > >index 9fbda833b9..1fd0af06db 100644 > >--- a/libavfilter/vf_lut3d.c > >+++ b/libavfilter/vf_lut3d.c > >@@ -31,73 +31,18 @@ > > #include "libavutil/intreadwrite.h" > > #include "libavutil/intfloat.h" > > 
#include "libavutil/avassert.h" > >-#include "libavutil/pixdesc.h" > > #include "libavutil/avstring.h" > >-#include "avfilter.h" > > #include "drawutils.h" > > #include "formats.h" > >-#include "framesync.h" > > #include "internal.h" > > #include "video.h" > >+#include "lut3d.h" > > > > #define R 0 > > #define G 1 > > #define B 2 > > #define A 3 > > > >-enum interp_mode { > >- INTERPOLATE_NEAREST, > >- INTERPOLATE_TRILINEAR, > >- INTERPOLATE_TETRAHEDRAL, > >- INTERPOLATE_PYRAMID, > >- INTERPOLATE_PRISM, > >- NB_INTERP_MODE > >-}; > >- > >-struct rgbvec { > >- float r, g, b; > >-}; > >- > >-/* 3D LUT don't often go up to level 32, but it is common to have a Hald > CLUT > >- * of 512x512 (64x64x64) */ > >-#define MAX_LEVEL 256 > >-#define PRELUT_SIZE 65536 > >- > >-typedef struct Lut3DPreLut { > >- int size; > >- float min[3]; > >- float max[3]; > >- float scale[3]; > >- float* lut[3]; > >-} Lut3DPreLut; > >- > >-typedef struct LUT3DContext { > >- const AVClass *class; > >- int interpolation; ///<interp_mode > >- char *file; > >- uint8_t rgba_map[4]; > >- int step; > >- avfilter_action_func *interp; > >- struct rgbvec scale; > >- struct rgbvec *lut; > >- int lutsize; > >- int lutsize2; > >- Lut3DPreLut prelut; > >-#if CONFIG_HALDCLUT_FILTER > >- uint8_t clut_rgba_map[4]; > >- int clut_step; > >- int clut_bits; > >- int clut_planar; > >- int clut_float; > >- int clut_width; > >- FFFrameSync fs; > >-#endif > >-} LUT3DContext; > >- > >-typedef struct ThreadData { > >- AVFrame *in, *out; > >-} ThreadData; > >- > > #define OFFSET(x) offsetof(LUT3DContext, x) > > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM > > #define TFLAGS > AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM > >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) > > av_assert0(0); > > } > > > >+ if (ARCH_X86) { > >+ ff_lut3d_init_x86(lut3d, desc); > >+ } > >+ > > return 0; > > } > > > >diff --git a/libavfilter/x86/Makefile 
b/libavfilter/x86/Makefile > >index 016a5b3511..a29941eaeb 100644 > >--- a/libavfilter/x86/Makefile > >+++ b/libavfilter/x86/Makefile > >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += > x86/vf_hqdn3d_init.o > > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o > > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o > > OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o > >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o > > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o > > OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o > > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o > >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += > x86/vf_hqdn3d.o > > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o > > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o > > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o > >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o > > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o > > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o > > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o > >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm > >new file mode 100644 > >index 0000000000..b3d7c3962b > >--- /dev/null > >+++ b/libavfilter/x86/vf_lut3d.asm > >@@ -0,0 +1,757 @@ > > >+;***************************************************************************** > >+;* x86-optimized functions for lut3d filter > >+;* > >+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com> > >+;* > >+;* This file is part of FFmpeg. > >+;* > >+;* FFmpeg is free software; you can redistribute it and/or > >+;* modify it under the terms of the GNU Lesser General Public > >+;* License as published by the Free Software Foundation; either > >+;* version 2.1 of the License, or (at your option) any later version. 
> >+;* > >+;* FFmpeg is distributed in the hope that it will be useful, > >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of > >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >+;* Lesser General Public License for more details. > >+;* > >+;* You should have received a copy of the GNU Lesser General Public > >+;* License along with FFmpeg; if not, write to the Free Software > >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > >+;****************************************************************************** > >+ > >+%include "libavutil/x86/x86util.asm" > >+ > >+SECTION_RODATA > >+pd_1f: times 8 dd 1.0 > >+pd_3f: times 8 dd 3.0 > >+ > >+; used to limit rshifts as they are more expensive in avx1 > >+pd_001: times 8 dd 001b > >+pd_010: times 8 dd 010b > >+pd_100: times 8 dd 100b > >+ > >+pd_65535f: times 8 dd 65535.0 > >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 > >+ > >+pb_shuffle16: db 0, 1, 0x80, 0x80, \ > >+ 2, 3, 0x80, 0x80, \ > >+ 4, 5, 0x80, 0x80, \ > >+ 6, 7, 0x80, 0x80 > >+ > >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ > >+ 8, 9, 12, 13, \ > >+ 0x80, 0x80, 0x80, 0x80, \ > >+ 0x80, 0x80, 0x80, 0x80 > >+ > >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ > >+ 0x80, 0x80, 0x80, 0x80, \ > >+ 0, 1, 4, 5, \ > >+ 8, 9, 12, 13 > >+ > >+; tetrahedral table -------------------------------------------- > >+; name: x2| x1| x0| cxxb| cxxa > >+; values: r 00| r 00| r 00| c011 011| c001 001 > >+; g 01| g 01| g 01| c101 101| c010 010 > >+; b 10| b 10| b 10| c110 110| c100 100 > >+ > >+; g>b b | g | r | > c110 | c100 > >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | > (110b << 3) | 100b > >+; r>b g | b | r | > c101 | c100 > >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | > (101b << 3) | 100b > >+; else g | r | b | > c101 | c001 > >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | > (101b << 3) | 001b > >+; b>g r | g | b | > c011 | 
c001 > >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | > (011b << 3) | 001b > >+; b>r r | b | g | > c011 | c010 > >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | > (011b << 3) | 010b > >+; else b | r | g | > c110 | c010 > >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | > (110b << 3) | 010b > >+ > >+SECTION .text > >+ > >+struc Lut3DPreLut > >+ .size: resd 1 > >+ .min: resd 3 > >+ .max: resd 3 > >+ .scale: resd 3 > >+ .lut: resq 3 > >+endstruc > >+ > >+struc LUT3DContext > >+ .class: resq 1 > >+ .lut: resq 1 > >+ .lutsize: resd 1 > >+ .lutsize2: resd 1 > >+ .scale: resd 3 > >+endstruc > >+ > >+%define AV_NUM_DATA_POINTERS 8 > >+ > >+struc AVFrame > >+ .data: resq AV_NUM_DATA_POINTERS > >+ .linesize: resd AV_NUM_DATA_POINTERS > >+ .extended_data: resq 1 > >+ .width: resd 1 > >+ .height: resd 1 > >+endstruc > >+ > >+%define rm rsp > >+%define gm rsp+mmsize > >+%define bm rsp+(mmsize*2) > >+ > >+%define lut3dsizem [rsp+mmsize*3] > >+%define lut3dsize2m [rsp+mmsize*4] > >+%define lut3dmaxm [rsp+mmsize*5] > >+%define prelutmaxm [rsp+mmsize*6] > >+ > >+%define scalerm [rsp+mmsize*7] > >+%define scalegm [rsp+mmsize*8] > >+%define scalebm [rsp+mmsize*9] > >+ > >+%define prelutminrm [rsp+mmsize*10] > >+%define prelutmingm [rsp+mmsize*11] > >+%define prelutminbm [rsp+mmsize*12] > >+ > >+%define prelutscalerm [rsp+mmsize*13] > >+%define prelutscalegm [rsp+mmsize*14] > >+%define prelutscalebm [rsp+mmsize*15] > >+ > >+; data pointers > >+%define srcrm [rsp+mmsize*16 + 0] > >+%define srcgm [rsp+mmsize*16 + 8] > >+%define srcbm [rsp+mmsize*16 + 16] > >+%define srcam [rsp+mmsize*16 + 24] > >+ > >+%define dstrm [rsp+mmsize*16 + 32] > >+%define dstgm [rsp+mmsize*16 + 40] > >+%define dstbm [rsp+mmsize*16 + 48] > >+%define dstam [rsp+mmsize*16 + 56] > >+ > >+%macro FETCH_PRELUT_PN 3 > >+ mov tmp2d, [rm + %3] > >+ mov tmp3d, [gm + %3] > >+ movss xm%1, [tmpq + tmp2q*4] > >+ movss xm%2, [tmpq + tmp3q*4] > >+ movss [rm + 
%3], xm%1 > >+ movss [gm + %3], xm%2 > >+%endmacro > >+ > >+; 1 - p > >+; 2 - n > >+; 3 - p indices > >+; 4 - n indices > >+%macro GATHER_PRELUT 4 > >+ %if cpuflag(avx2) > >+ vpcmpeqb m7, m7 > >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p > >+ vpcmpeqb m9, m9 > >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n > >+ %else > >+ mova [rm], m%3 > >+ mova [gm], m%4 > >+ FETCH_PRELUT_PN %1, %2, 0 > >+ FETCH_PRELUT_PN %1, %2, 4 > >+ FETCH_PRELUT_PN %1, %2, 8 > >+ FETCH_PRELUT_PN %1, %2, 12 > >+ %if mmsize > 16 > >+ FETCH_PRELUT_PN %1, %2, 16 > >+ FETCH_PRELUT_PN %1, %2, 20 > >+ FETCH_PRELUT_PN %1, %2, 24 > >+ FETCH_PRELUT_PN %1, %2, 28 > >+ %endif > >+ movu m%1, [rm] > >+ movu m%2, [gm] > >+ %endif > >+%endmacro > >+ > >+%macro FLOORPS 2 > >+ %if mmsize > 16 > >+ vroundps %1, %2, 0x01 > >+ %else > >+ cvttps2dq %1, %2 > >+ cvtdq2ps %1, %1 > >+ %endif > >+%endmacro > >+ > >+; 1 - dst > >+; 2 - index > >+; 3 - min > >+; 4 - scale > >+; assumes lut max m13, m14 1.0f, zero m15 > >+%macro APPLY_PRELUT 4 > >+ ; scale > >+ subps m5, m%1, %3 ; v - min > >+ mulps m5, m5, %4 ; v * scale > >+ ; clamp > >+ maxps m5, m5, m15 ; max zero > >+ minps m5, m5, m13 ; min lut max > >+ > >+ FLOORPS m3, m5 ; prev index > >+ subps m5, m5, m3 ; d > >+ addps m4, m3, m14 ; p+1 = n index > >+ minps m4, m4, m13 ; clamp n idex > >+ > >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] > >+ cvttps2dq m6, m3 > >+ cvttps2dq m10, m4 > >+ GATHER_PRELUT 3, 4, 6, 10 > >+ > >+ ; lerp > >+ subps m8, m4, m3 > >+ mulps m8, m8, m5 > >+ addps m%1, m8, m3 > >+%endmacro > >+ > >+; 1 - dst > >+; 2 - scale > >+; assumes lut max m13, zero m15 > >+%macro APPLY_SCALE 2 > >+ mulps m%1, m%1, %2 > >+ maxps m%1, m%1, m15 > >+ minps m%1, m%1, m13 > >+%endmacro > >+ > >+%macro BLEND 4 > >+%if mmsize > 16 > >+ vblendvps %1, %2, %3, %4 > >+%else > >+ %ifidni %1,%2 > >+ %error operand 1 must not equal operand 2 > >+ %endif > >+ %ifidni %1,%3 > >+ %error operand 1 must not equal operand 3 > >+ %endif > >+ mova %1, %2 > >+ xorps %1, %3 > >+ andps 
%1, %4 > >+ xorps %1, %2 > >+%endif > >+%endmacro > >+ > >+; sets nans to zere, +inf -inf handled later by min/max clamps > >+%macro SANITIZE_F 1 > >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False > >+ %if mmsize <= 16 > >+ mova m6, %1 > >+ BLEND %1, m15, m6, m5 > >+ %else > >+ BLEND %1, m15, %1, m5 > >+ %endif > >+%endmacro > >+ > >+%macro ADD3 4 > >+ addps %1, %2, %3 > >+ addps %1, %1, %4 > >+%endmacro > >+ > >+%macro CMP_EQUAL 3 > >+%if cpuflag(avx2) > >+ vpcmpeqd %1, %2, %3 > >+%elif cpuflag(avx) > >+ cmpps %1, %2, %3, 0x0 > >+%else > >+ pcmpeqd %1, %2, %3 > >+%endif > >+%endmacro > >+ > >+%macro SHIFT_RIGHT 2 > >+%if mmsize <= 16 > >+ psrld xm%1, %2 > >+%elif cpuflag(avx2) > >+ vpsrld m%1, m%1, %2 > >+%else > >+ vextractf128 xm15, m%1, 1 > >+ psrld xm%1, %2 > >+ psrld xm15, %2 > >+ vinsertf128 m%1, m%1, xm15, 1 > >+%endif > >+%endmacro > >+ > >+%macro FETCH_LUT3D_RGB 4 > >+ mov tmp2d, [rm + %4] > >+ movss xm%1, [tmpq + tmp2q*4 + 0] > >+ movss xm%2, [tmpq + tmp2q*4 + 4] > >+ movss xm%3, [tmpq + tmp2q*4 + 8] > >+ movss [rm + %4], xm%1 > >+ movss [gm + %4], xm%2 > >+ movss [bm + %4], xm%3 > >+%endmacro > >+ > >+; 1 - dstr > >+; 2 - dstg > >+; 3 - dstb > >+; 4 - indices > >+%macro GATHER_LUT3D_INDICES 4 > >+%if cpuflag(avx2) > >+ vpcmpeqb m3, m3 > >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3 > >+ vpcmpeqb m14, m14 > >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14 > >+ vpcmpeqb m15, m15 > >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15 > >+%else > >+ movu [rm], m%4 > >+ FETCH_LUT3D_RGB %1, %2, %3, 0 > >+ FETCH_LUT3D_RGB %1, %2, %3, 4 > >+ FETCH_LUT3D_RGB %1, %2, %3, 8 > >+ FETCH_LUT3D_RGB %1, %2, %3, 12 > >+%if mmsize > 16 > >+ FETCH_LUT3D_RGB %1, %2, %3, 16 > >+ FETCH_LUT3D_RGB %1, %2, %3, 20 > >+ FETCH_LUT3D_RGB %1, %2, %3, 24 > >+ FETCH_LUT3D_RGB %1, %2, %3, 28 > >+%endif > >+ movu m%1, [rm] > >+ movu m%2, [gm] > >+ movu m%3, [bm] > >+%endif > >+%endmacro > >+ > >+%macro interp_tetrahedral 0 > >+ %define d_r m0 > >+ %define d_g m1 > >+ %define d_b m2 > >+ > >+ %define prev_r m3 
> >+ %define prev_g m4 > >+ %define prev_b m5 > >+ > >+ %define next_r m6 > >+ %define next_g m7 > >+ %define next_b m8 > >+ > >+ %define x0 m4 > >+ %define x1 m5 > >+ %define x2 m6 > >+ > >+ ; setup prev index > >+ FLOORPS prev_r, m0 > >+ FLOORPS prev_g, m1 > >+ FLOORPS prev_b, m2 > >+ > >+ ; setup deltas > >+ subps d_r, m0, prev_r > >+ subps d_g, m1, prev_g > >+ subps d_b, m2, prev_b > >+ > >+ ; calculate select mask m9 > >+ movu m6, [pd_tetra_table2] > >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ > >+ BLEND m10, m6, [pd_tetra_table1], m7 > >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ > >+ BLEND m6, m10, [pd_tetra_table0], m7 > >+ > >+ movu m10, [pd_tetra_table5] > >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ > >+ BLEND m9, m10, [pd_tetra_table4], m7 > >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ > >+ BLEND m10, m9, [pd_tetra_table3], m7 > >+ > >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ > >+ BLEND m9, m10, m6, m7 > >+ > >+ ; setup next index > >+ addps next_r, prev_r, m14 ; +1 > >+ minps next_r, next_r, m13 ; clamp lutmax > >+ > >+ addps next_g, prev_g, m14 ; +1 > >+ minps next_g, next_g, m13 ; clamp lutmax > >+ > >+ addps next_b, prev_b, m14 ; +1 > >+ minps next_b, next_b, m13 ; clamp lutmax > >+ > >+ ; prescale indices > >+ mulps prev_r, prev_r, lut3dsize2m > >+ mulps next_r, next_r, lut3dsize2m > >+ > >+ mulps prev_g, prev_g, lut3dsizem > >+ mulps next_g, next_g, lut3dsizem > >+ > >+ mulps prev_b, prev_b, [pd_3f] > >+ mulps next_b, next_b, [pd_3f] > >+ > >+ movu m14, [pd_001] > >+ > >+ ; cxxa m10 > >+ ; b > >+ andps m15, m9, m14 > >+ CMP_EQUAL m15, m15, m14 > >+ BLEND m10, prev_b, next_b, m15 > >+ > >+ ; g > >+ andps m15, m9, [pd_010] > >+ CMP_EQUAL m15, m15, [pd_010] > >+ BLEND m12, prev_g, next_g, m15 > >+ > >+ ; r > >+ andps m15, m9, [pd_100] > >+ CMP_EQUAL m15, m15, [pd_100] > >+ BLEND m13, prev_r, next_r, m15 > >+ > >+ ADD3 m10, m10, m12, m13 > >+ > >+ SHIFT_RIGHT 9, 3 ; 3 > >+ > >+ ; cxxb m11; > >+ ; b > >+ andps m15, m9, m14 > >+ CMP_EQUAL m15, 
m15, m14 > >+ BLEND m11, prev_b, next_b, m15 > >+ > >+ ; g > >+ andps m15, m9, [pd_010] > >+ CMP_EQUAL m15, m15, [pd_010] > >+ BLEND m12, prev_g, next_g, m15 > >+ > >+ ; r > >+ andps m15, m9, [pd_100] > >+ CMP_EQUAL m15, m15, [pd_100] > >+ BLEND m13, prev_r, next_r, m15 > >+ > >+ ADD3 m11, m11, m12, m13 > >+ > >+ ; c000 m12; > >+ ADD3 m12, prev_r, prev_g, prev_b > >+ > >+ ; c111 m13; > >+ ADD3 m13, next_r, next_g, next_b > >+ > >+ SHIFT_RIGHT 9, 3 ; 6 > >+ > >+ ; x0, m4 > >+ andps m15, m9, m14 > >+ CMP_EQUAL m15, m15, m14 > >+ BLEND m7, d_r, d_g, m15 ; r,g > >+ > >+ andps m15, m9, [pd_010] > >+ CMP_EQUAL m15, m15, [pd_010] > >+ BLEND x0, m7, d_b, m15 ; b > >+ > >+ ; x1, m5 > >+ andps m15, m9, [pd_100] > >+ CMP_EQUAL m15, m15, [pd_100] > >+ BLEND m7, d_r, d_g, m15 ; r,g > >+ > >+ SHIFT_RIGHT 9, 3 ; 9 > >+ > >+ andps m15, m9, m14 > >+ CMP_EQUAL m15, m15, m14 > >+ BLEND x1, m7, d_b, m15 ; b > >+ > >+ ; x2, m6 > >+ andps m15, m9, [pd_010] > >+ CMP_EQUAL m15, m15, [pd_010] > >+ BLEND m7, d_r, d_g, m15 ; r,g > >+ > >+ andps m15, m9, [pd_100] > >+ CMP_EQUAL m15, m15, [pd_100] > >+ BLEND x2, m7, d_b, m15 ; b > >+ > >+ ; convert indices to integer > >+ cvttps2dq m12, m12 > >+ cvttps2dq m10, m10 > >+ cvttps2dq m11, m11 > >+ cvttps2dq m13, m13 > >+ > >+ ; now the gathering festival > >+ mov tmpq, [ctxq + LUT3DContext.lut] > >+ > >+ GATHER_LUT3D_INDICES 0, 1, 2, 12 > >+ movu m14, [pd_1f] > >+ subps m14, m14, x0; 1 - x0 > >+ > >+ mulps m0, m0, m14 > >+ mulps m1, m1, m14 > >+ mulps m2, m2, m14 > >+ > >+ GATHER_LUT3D_INDICES 7, 8, 9, 10 > >+ subps m14, x0, x1; x0 - x1 > >+ mulps m7, m7, m14 > >+ addps m0, m0, m7 > >+ > >+ mulps m8, m8, m14 > >+ addps m1, m1, m8 > >+ > >+ mulps m9, m9, m14 > >+ addps m2, m2, m9 > >+ > >+ GATHER_LUT3D_INDICES 7, 8, 9, 11 > >+ subps m14, x1, x2; x1 - x2 > >+ > >+ mulps m7, m7, m14 > >+ addps m0, m0, m7 > >+ > >+ mulps m8, m8, m14 > >+ addps m1, m1, m8 > >+ > >+ mulps m9, m9, m14 > >+ addps m2, m2, m9 > >+ > >+ GATHER_LUT3D_INDICES 7, 8, 9, 13 > >+ 
mulps m7, m7, x2 > >+ addps m0, m0, m7 > >+ > >+ mulps m8, m8, x2 > >+ addps m1, m1, m8 > >+ > >+ mulps m9, m9, x2 > >+ addps m2, m2, m9 > >+%endmacro > >+ > >+%macro INIT_DATA_PTR 3 > >+ mov ptrq, [%2 + AVFrame.data + %3 * 8] > >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] > >+ imul tmpd, slice_startd > >+ add ptrq, tmpq > >+ mov %1, ptrq > >+%endmacro > >+ > >+%macro INC_DATA_PTR 3 > >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] > >+ mov ptrq, %1 > >+ add ptrq, tmpq > >+ mov %1, ptrq > >+%endmacro > >+ > >+%macro LOAD16 2 > >+ mov ptrq, %2 > >+ %if mmsize > 16 > >+ movu xm%1, [ptrq + xq*2] > >+ %else > >+ movsd xm%1, [ptrq + xq*2] > >+ %endif > >+ %if cpuflag(avx2) > >+ vpmovzxwd m%1, xm%1 > >+ %else > >+ %if mmsize > 16 > >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) > >+ pshufb xm%1, xm6 ; pb_shuffle16 > >+ pshufb xm4, xm6 ; pb_shuffle16 > >+ vinsertf128 m%1, m%1, xm4, 1 > >+ %else > >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) > >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) > >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) > >+ %endif > >+ %endif > >+ cvtdq2ps m%1, m%1 > >+ mulps m%1, m%1, m7 ; pd_65535_invf > >+%endmacro > >+ > >+%macro STORE16 2 > >+ mulps m%2, m%2, m5 ; [pd_65535f] > >+ minps m%2, m%2, m5 ; [pd_65535f] > >+ maxps m%2, m%2, m15 ; zero > >+ cvttps2dq m%2, m%2 > >+ %if mmsize > 16 > >+ vextractf128 xm4, m%2, 1 > >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] > >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] > >+ por xm%2, xm4 > >+ %else > >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) > >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) > >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) > >+ %endif > >+ mov ptrq, %1 > >+ %if mmsize > 16 > >+ movu [ptrq + xq*2], xm%2 > >+ %else > >+ movsd [ptrq + xq*2], xm%2 > >+ %endif > >+%endmacro > >+ > >+; 1 - interp method > >+; 2 - format_name > >+; 3 - depth > >+; 4 - is float format > >+%macro DEFINE_INTERP_FUNC 4 > >+cglobal 
interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, > src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, > tmp, tmp2, tmp3 > >+ ; store lut max and lutsize > >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize] > >+ cvtsi2ss xm0, tmpd > >+ mulss xm0, xm0, [pd_3f] > >+ VBROADCASTSS m0, xm0 > >+ mova lut3dsizem, m0 > >+ sub tmpd, 1 > >+ cvtsi2ss xm0, tmpd > >+ VBROADCASTSS m0, xm0 > >+ mova lut3dmaxm, m0 > >+ > >+ ; scale_r > >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] > >+ VBROADCASTSS m1, xm1 > >+ mova scalerm, m1 > >+ > >+ ; scale_g > >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] > >+ VBROADCASTSS m1, xm1 > >+ mova scalegm, m1 > >+ > >+ ; scale_b > >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] > >+ VBROADCASTSS m1, xm1 > >+ mova scalebm, m1 > >+ > >+ ; store lutsize2 > >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] > >+ mulss xm0, xm0, [pd_3f] > >+ VBROADCASTSS m0, xm0 > >+ mova lut3dsize2m, m0 > >+ > >+ ; init prelut values > >+ cmp prelutq, 0 > >+ je %%skip_init_prelut > >+ mov tmpd, dword [prelutq + Lut3DPreLut.size] > >+ sub tmpd, 1 > >+ cvtsi2ss xm0, tmpd > >+ VBROADCASTSS m0, xm0 > >+ mova prelutmaxm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] > >+ mova prelutminrm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] > >+ mova prelutmingm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] > >+ mova prelutminbm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4] > >+ mova prelutscalerm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4] > >+ mova prelutscalegm, m0 > >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] > >+ mova prelutscalebm, m0 > >+ %%skip_init_prelut: > >+ > >+ mov widthd, [src_imageq + AVFrame.width] > >+ > >+ ; gbra pixel order > >+ INIT_DATA_PTR srcrm, src_imageq, 2 > >+ INIT_DATA_PTR srcgm, src_imageq, 0 > >+ INIT_DATA_PTR srcbm, src_imageq, 1 > >+ INIT_DATA_PTR srcam, src_imageq, 
3 > >+ > >+ INIT_DATA_PTR dstrm, dst_imageq, 2 > >+ INIT_DATA_PTR dstgm, dst_imageq, 0 > >+ INIT_DATA_PTR dstbm, dst_imageq, 1 > >+ INIT_DATA_PTR dstam, dst_imageq, 3 > >+ > >+ %%loop_y: > >+ xor xq, xq > >+ %%loop_x: > >+ movu m14, [pd_1f] > >+ xorps m15, m15, m15 > >+ %if %4 ; float > >+ mov ptrq, srcrm > >+ movu m0, [ptrq + xq*4] > >+ mov ptrq, srcgm > >+ movu m1, [ptrq + xq*4] > >+ mov ptrq, srcbm > >+ movu m2, [ptrq + xq*4] > >+ SANITIZE_F m0 > >+ SANITIZE_F m1 > >+ SANITIZE_F m2 > >+ %else > >+ ; constants for LOAD16 > >+ movu m7, [pd_65535_invf] > >+ %if notcpuflag(avx2) && mmsize >= 32 > >+ movu xm6, [pb_shuffle16] > >+ %endif > >+ LOAD16 0, srcrm > >+ LOAD16 1, srcgm > >+ LOAD16 2, srcbm > >+ %endif > >+ > >+ cmp prelutq, 0 > >+ je %%skip_prelut > >+ mova m13, prelutmaxm > >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm > >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm > >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm > >+ %%skip_prelut: > >+ > >+ mova m13, lut3dmaxm > >+ APPLY_SCALE 0, scalerm > >+ APPLY_SCALE 1, scalegm > >+ APPLY_SCALE 2, scalebm > >+ > >+ interp_%1 > >+ > >+ %if %4 ; float > >+ mov ptrq, dstrm > >+ movu [ptrq + xq*4], m0 > >+ mov ptrq, dstgm > >+ movu [ptrq + xq*4], m1 > >+ mov ptrq, dstbm > >+ movu [ptrq + xq*4], m2 > >+ cmp has_alphad, 0 > >+ je %%skip_alphaf > >+ mov ptrq, srcam > >+ movu m0, [ptrq + xq*4] > >+ mov ptrq, dstam > >+ movu [ptrq + xq*4], m0 > >+ %%skip_alphaf: > >+ %else > >+ ; constants for STORE16 > >+ movu m5, [pd_65535f] > >+ %if mmsize > 16 > >+ movu xm6, [pb_lo_pack_shuffle16] > >+ movu xm7, [pb_hi_pack_shuffle16] > >+ %endif > >+ > >+ xorps m15, m15, m15 > >+ STORE16 dstrm, 0 > >+ STORE16 dstgm, 1 > >+ STORE16 dstbm, 2 > >+ > >+ cmp has_alphad, 0 > >+ je %%skip_alpha > >+ %if mmsize > 16 > >+ mov ptrq, srcam > >+ movu xm0, [ptrq + xq*2] > >+ mov ptrq, dstam > >+ movu [ptrq + xq*2], xm0 > >+ %else > >+ mov ptrq, srcam > >+ movsd xm0, [ptrq + xq*2] > >+ mov ptrq, dstam > >+ movsd [ptrq + xq*2], xm0 > >+ %endif 
> >+ > >+ %%skip_alpha: > >+ %endif > >+ > >+ add xq, mmsize/4 > >+ cmp xd, widthd > >+ jl %%loop_x > >+ > >+ INC_DATA_PTR srcrm, src_imageq, 2 > >+ INC_DATA_PTR srcgm, src_imageq, 0 > >+ INC_DATA_PTR srcbm, src_imageq, 1 > >+ INC_DATA_PTR srcam, src_imageq, 3 > >+ > >+ INC_DATA_PTR dstrm, dst_imageq, 2 > >+ INC_DATA_PTR dstgm, dst_imageq, 0 > >+ INC_DATA_PTR dstbm, dst_imageq, 1 > >+ INC_DATA_PTR dstam, dst_imageq, 3 > >+ > >+ inc slice_startd > >+ cmp slice_startd, slice_endd > >+ jl %%loop_y > >+ > >+ RET > >+%endmacro > >+%if ARCH_X86_64 > >+ %if HAVE_AVX2_EXTERNAL > >+ INIT_YMM avx2 > >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 > >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 > >+ %endif > >+ %if HAVE_AVX_EXTERNAL > >+ INIT_YMM avx > >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 > >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 > >+ %endif > >+ INIT_XMM sse2 > >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 > >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 > >+%endif > >\ No newline at end of file > >diff --git a/libavfilter/x86/vf_lut3d_init.c > b/libavfilter/x86/vf_lut3d_init.c > >new file mode 100644 > >index 0000000000..9b9b36e4af > >--- /dev/null > >+++ b/libavfilter/x86/vf_lut3d_init.c > >@@ -0,0 +1,88 @@ > >+/* > >+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com> > >+ * > >+ * This file is part of FFmpeg. > >+ * > >+ * FFmpeg is free software; you can redistribute it and/or > >+ * modify it under the terms of the GNU Lesser General Public > >+ * License as published by the Free Software Foundation; either > >+ * version 2.1 of the License, or (at your option) any later version. > >+ * > >+ * FFmpeg is distributed in the hope that it will be useful, > >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of > >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >+ * Lesser General Public License for more details. 
> >+ * > >+ * You should have received a copy of the GNU Lesser General Public > >+ * License along with FFmpeg; if not, write to the Free Software > >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > >+ */ > >+ > >+#include "libavutil/attributes.h" > >+#include "libavutil/cpu.h" > >+#include "libavutil/x86/cpu.h" > >+#include "libavfilter/lut3d.h" > >+ > >+#define DEFINE_INTERP_FUNC(name, format, opt) > \ > >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, > Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int > slice_end, int has_alpha); \ > >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void > *arg, int jobnr, int nb_jobs) > \ > >+{ > \ > >+ LUT3DContext *lut3d = ctx->priv; > > \ > >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL; > \ > >+ ThreadData *td = arg; > \ > >+ AVFrame *in = td->in; > > \ > >+ AVFrame *out = td->out; > \ > >+ int has_alpha = in->linesize[3] && out != in; > \ > >+ int slice_start = (in->height * jobnr ) / nb_jobs; > \ > >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; > \ > >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, > slice_start, slice_end, has_alpha); > \ > >+ return 0; > \ > >+} > >+ > >+#if ARCH_X86_64 > >+#if HAVE_AVX2_EXTERNAL > >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) > >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) > >+#endif > >+#if HAVE_AVX_EXTERNAL > >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) > >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx) > >+#endif > >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) > >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) > >+#endif > >+ > >+ > >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor > *desc) > >+{ > >+ int cpu_flags = av_get_cpu_flags(); > >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; > >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; > >+ int depth = desc->comp[0].depth; > >+ > >+#if ARCH_X86_64 > >+ if 
(EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == > INTERPOLATE_TETRAHEDRAL && planar) { > >+#if HAVE_AVX2_EXTERNAL > >+ if (isfloat && planar) { > >+ s->interp = interp_tetrahedral_pf32_avx2; > >+ } else if (depth == 16) { > >+ s->interp = interp_tetrahedral_p16_avx2; > >+ } > >+#endif > >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == > INTERPOLATE_TETRAHEDRAL && planar) { > >+#if HAVE_AVX_EXTERNAL > >+ if (isfloat) { > >+ s->interp = interp_tetrahedral_pf32_avx; > >+ } else if (depth == 16) { > >+ s->interp = interp_tetrahedral_p16_avx; > >+ } > >+#endif > >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == > INTERPOLATE_TETRAHEDRAL && planar) { > >+ if (isfloat) { > >+ s->interp = interp_tetrahedral_pf32_sse2; > >+ } else if (depth == 16) { > >+ s->interp = interp_tetrahedral_p16_sse2; > >+ } > >+ } > >+#endif > >+} > >-- > >2.31.1.windows.1 > > > >_______________________________________________ > >ffmpeg-devel mailing list > >ffmpeg-devel@ffmpeg.org > >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > >To unsubscribe, visit link above, or email > >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >
On Wed, Sep 29, 2021 at 10:27 AM Mark Reid <mindmark@gmail.com> wrote: > > > On Tue, Sep 28, 2021 at 6:38 PM chen <chenm003@163.com> wrote: > >> Hello, >> >> >> Excuse me, how about FMADD on AVX2 platform? >> >> >> For example >> + mulps m7, m7, m14 >> + addps m0, m0, m7 >> >> ==> >> >> >> fmadd231ps m0,m7,m14 >> >> > Interesting, does having AVX2 guarantee having FMA instructions? > > I'm still not 100% certain that all AVX2 CPUs have FMA instructions, so I'll add a cpuflags check for FMA too. I also came up with a faster way to calculate x0, x1, x2 without the lookup table. I will send a new patch. > >> Regards, >> Min Chen >> >> >> 2021-09-29 09:18:05,mindmark@gmail.com >> >From: Mark Reid <mindmark@gmail.com> >> > >> >Only supports float and 16bit planer formats at the momoment. >> >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem offer >> some >> >speed gains. >> > >> >f32 1920x1080 1 thread with prelut >> >c impl >> >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips >> >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips >> >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips >> >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >sse2 >> >948662320 UNITS in lut3d->interp, 1 runs, 0 skips >> >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips >> >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips >> >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx >> >633837000 UNITS in lut3d->interp, 1 runs, 0 skips >> >669452850 UNITS in lut3d->interp, 2 runs, 0 skips >> >650716580 UNITS in lut3d->interp, 4 runs, 0 skips >> >644698550 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx2 >> >354940020 UNITS in lut3d->interp, 1 runs, 0 skips >> >362384340 UNITS in lut3d->interp, 2 runs, 0 skips >> >356799020 UNITS in lut3d->interp, 4 runs, 0 skips >> >357276815 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >gbrap16 1920x1080 1 thread with prelut >> >c impl >> >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips >> >1477959120 UNITS in 
lut3d->interp, 2 runs, 0 skips >> >1472102670 UNITS in lut3d->interp, 4 runs, 0 skips >> >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >sse2 >> >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips >> >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips >> >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips >> >1064583037 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx >> >678089880 UNITS in lut3d->interp, 1 runs, 0 skips >> >679112485 UNITS in lut3d->interp, 2 runs, 0 skips >> >695527212 UNITS in lut3d->interp, 4 runs, 0 skips >> >691300053 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >avx2 >> >372671340 UNITS in lut3d->interp, 1 runs, 0 skips >> >373449870 UNITS in lut3d->interp, 2 runs, 0 skips >> >383725625 UNITS in lut3d->interp, 4 runs, 0 skips >> >382860848 UNITS in lut3d->interp, 8 runs, 0 skips >> > >> >--- >> > libavfilter/lut3d.h | 83 ++++ >> > libavfilter/vf_lut3d.c | 61 +-- >> > libavfilter/x86/Makefile | 2 + >> > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ >> > libavfilter/x86/vf_lut3d_init.c | 88 ++++ >> > 5 files changed, 935 insertions(+), 56 deletions(-) >> > create mode 100644 libavfilter/lut3d.h >> > create mode 100644 libavfilter/x86/vf_lut3d.asm >> > create mode 100644 libavfilter/x86/vf_lut3d_init.c >> > >> >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h >> >new file mode 100644 >> >index 0000000000..ded2a036a5 >> >--- /dev/null >> >+++ b/libavfilter/lut3d.h >> >@@ -0,0 +1,83 @@ >> >+/* >> >+ * Copyright (c) 2013 Clément Bœsch >> >+ * Copyright (c) 2018 Paul B Mahol >> >+ * >> >+ * This file is part of FFmpeg. >> >+ * >> >+ * FFmpeg is free software; you can redistribute it and/or >> >+ * modify it under the terms of the GNU Lesser General Public >> >+ * License as published by the Free Software Foundation; either >> >+ * version 2.1 of the License, or (at your option) any later version. 
>> >+ * >> >+ * FFmpeg is distributed in the hope that it will be useful, >> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+ * Lesser General Public License for more details. >> >+ * >> >+ * You should have received a copy of the GNU Lesser General Public >> >+ * License along with FFmpeg; if not, write to the Free Software >> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >+ */ >> >+#ifndef AVFILTER_LUT3D_H >> >+#define AVFILTER_LUT3D_H >> >+ >> >+#include "libavutil/pixdesc.h" >> >+#include "framesync.h" >> >+#include "avfilter.h" >> >+ >> >+enum interp_mode { >> >+ INTERPOLATE_NEAREST, >> >+ INTERPOLATE_TRILINEAR, >> >+ INTERPOLATE_TETRAHEDRAL, >> >+ INTERPOLATE_PYRAMID, >> >+ INTERPOLATE_PRISM, >> >+ NB_INTERP_MODE >> >+}; >> >+ >> >+struct rgbvec { >> >+ float r, g, b; >> >+}; >> >+ >> >+/* 3D LUT don't often go up to level 32, but it is common to have a >> Hald CLUT >> >+ * of 512x512 (64x64x64) */ >> >+#define MAX_LEVEL 256 >> >+#define PRELUT_SIZE 65536 >> >+ >> >+typedef struct Lut3DPreLut { >> >+ int size; >> >+ float min[3]; >> >+ float max[3]; >> >+ float scale[3]; >> >+ float* lut[3]; >> >+} Lut3DPreLut; >> >+ >> >+typedef struct LUT3DContext { >> >+ const AVClass *class; >> >+ struct rgbvec *lut; >> >+ int lutsize; >> >+ int lutsize2; >> >+ struct rgbvec scale; >> >+ int interpolation; ///<interp_mode >> >+ char *file; >> >+ uint8_t rgba_map[4]; >> >+ int step; >> >+ avfilter_action_func *interp; >> >+ Lut3DPreLut prelut; >> >+#if CONFIG_HALDCLUT_FILTER >> >+ uint8_t clut_rgba_map[4]; >> >+ int clut_step; >> >+ int clut_bits; >> >+ int clut_planar; >> >+ int clut_float; >> >+ int clut_width; >> >+ FFFrameSync fs; >> >+#endif >> >+} LUT3DContext; >> >+ >> >+typedef struct ThreadData { >> >+ AVFrame *in, *out; >> >+} ThreadData; >> >+ >> >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); >> 
>+ >> >+#endif /* AVFILTER_LUT3D_H */ >> >\ No newline at end of file >> >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c >> >index 9fbda833b9..1fd0af06db 100644 >> >--- a/libavfilter/vf_lut3d.c >> >+++ b/libavfilter/vf_lut3d.c >> >@@ -31,73 +31,18 @@ >> > #include "libavutil/intreadwrite.h" >> > #include "libavutil/intfloat.h" >> > #include "libavutil/avassert.h" >> >-#include "libavutil/pixdesc.h" >> > #include "libavutil/avstring.h" >> >-#include "avfilter.h" >> > #include "drawutils.h" >> > #include "formats.h" >> >-#include "framesync.h" >> > #include "internal.h" >> > #include "video.h" >> >+#include "lut3d.h" >> > >> > #define R 0 >> > #define G 1 >> > #define B 2 >> > #define A 3 >> > >> >-enum interp_mode { >> >- INTERPOLATE_NEAREST, >> >- INTERPOLATE_TRILINEAR, >> >- INTERPOLATE_TETRAHEDRAL, >> >- INTERPOLATE_PYRAMID, >> >- INTERPOLATE_PRISM, >> >- NB_INTERP_MODE >> >-}; >> >- >> >-struct rgbvec { >> >- float r, g, b; >> >-}; >> >- >> >-/* 3D LUT don't often go up to level 32, but it is common to have a >> Hald CLUT >> >- * of 512x512 (64x64x64) */ >> >-#define MAX_LEVEL 256 >> >-#define PRELUT_SIZE 65536 >> >- >> >-typedef struct Lut3DPreLut { >> >- int size; >> >- float min[3]; >> >- float max[3]; >> >- float scale[3]; >> >- float* lut[3]; >> >-} Lut3DPreLut; >> >- >> >-typedef struct LUT3DContext { >> >- const AVClass *class; >> >- int interpolation; ///<interp_mode >> >- char *file; >> >- uint8_t rgba_map[4]; >> >- int step; >> >- avfilter_action_func *interp; >> >- struct rgbvec scale; >> >- struct rgbvec *lut; >> >- int lutsize; >> >- int lutsize2; >> >- Lut3DPreLut prelut; >> >-#if CONFIG_HALDCLUT_FILTER >> >- uint8_t clut_rgba_map[4]; >> >- int clut_step; >> >- int clut_bits; >> >- int clut_planar; >> >- int clut_float; >> >- int clut_width; >> >- FFFrameSync fs; >> >-#endif >> >-} LUT3DContext; >> >- >> >-typedef struct ThreadData { >> >- AVFrame *in, *out; >> >-} ThreadData; >> >- >> > #define OFFSET(x) offsetof(LUT3DContext, x) >> 
> #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM >> > #define TFLAGS >> AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM >> >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) >> > av_assert0(0); >> > } >> > >> >+ if (ARCH_X86) { >> >+ ff_lut3d_init_x86(lut3d, desc); >> >+ } >> >+ >> > return 0; >> > } >> > >> >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile >> >index 016a5b3511..a29941eaeb 100644 >> >--- a/libavfilter/x86/Makefile >> >+++ b/libavfilter/x86/Makefile >> >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += >> x86/vf_hqdn3d_init.o >> > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o >> > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o >> > OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o >> >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o >> > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += >> x86/vf_maskedclamp_init.o >> > OBJS-$(CONFIG_MASKEDMERGE_FILTER) += >> x86/vf_maskedmerge_init.o >> > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o >> >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += >> x86/vf_hqdn3d.o >> > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o >> > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o >> > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o >> >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o >> > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o >> > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o >> > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o >> >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm >> >new file mode 100644 >> >index 0000000000..b3d7c3962b >> >--- /dev/null >> >+++ b/libavfilter/x86/vf_lut3d.asm >> >@@ -0,0 +1,757 @@ >> >> >+;***************************************************************************** >> >+;* x86-optimized functions for lut3d filter >> >+;* >> >+;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com> 
>> >+;* >> >+;* This file is part of FFmpeg. >> >+;* >> >+;* FFmpeg is free software; you can redistribute it and/or >> >+;* modify it under the terms of the GNU Lesser General Public >> >+;* License as published by the Free Software Foundation; either >> >+;* version 2.1 of the License, or (at your option) any later version. >> >+;* >> >+;* FFmpeg is distributed in the hope that it will be useful, >> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+;* Lesser General Public License for more details. >> >+;* >> >+;* You should have received a copy of the GNU Lesser General Public >> >+;* License along with FFmpeg; if not, write to the Free Software >> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >> >+;****************************************************************************** >> >+ >> >+%include "libavutil/x86/x86util.asm" >> >+ >> >+SECTION_RODATA >> >+pd_1f: times 8 dd 1.0 >> >+pd_3f: times 8 dd 3.0 >> >+ >> >+; used to limit rshifts as they are more expensive in avx1 >> >+pd_001: times 8 dd 001b >> >+pd_010: times 8 dd 010b >> >+pd_100: times 8 dd 100b >> >+ >> >+pd_65535f: times 8 dd 65535.0 >> >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 >> >+ >> >+pb_shuffle16: db 0, 1, 0x80, 0x80, \ >> >+ 2, 3, 0x80, 0x80, \ >> >+ 4, 5, 0x80, 0x80, \ >> >+ 6, 7, 0x80, 0x80 >> >+ >> >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ >> >+ 8, 9, 12, 13, \ >> >+ 0x80, 0x80, 0x80, 0x80, \ >> >+ 0x80, 0x80, 0x80, 0x80 >> >+ >> >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ >> >+ 0x80, 0x80, 0x80, 0x80, \ >> >+ 0, 1, 4, 5, \ >> >+ 8, 9, 12, 13 >> >+ >> >+; tetrahedral table -------------------------------------------- >> >+; name: x2| x1| x0| cxxb| cxxa >> >+; values: r 00| r 00| r 00| c011 011| c001 001 >> >+; g 01| g 01| g 01| c101 101| c010 010 >> >+; b 10| b 10| b 10| c110 110| c100 100 >> >+ >> >+; g>b b | g | r | >> c110 | c100 >> 
>+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | >> (110b << 3) | 100b >> >+; r>b g | b | r | >> c101 | c100 >> >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | >> (101b << 3) | 100b >> >+; else g | r | b | >> c101 | c001 >> >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | >> (101b << 3) | 001b >> >+; b>g r | g | b | >> c011 | c001 >> >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | >> (011b << 3) | 001b >> >+; b>r r | b | g | >> c011 | c010 >> >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | >> (011b << 3) | 010b >> >+; else b | r | g | >> c110 | c010 >> >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | >> (110b << 3) | 010b >> >+ >> >+SECTION .text >> >+ >> >+struc Lut3DPreLut >> >+ .size: resd 1 >> >+ .min: resd 3 >> >+ .max: resd 3 >> >+ .scale: resd 3 >> >+ .lut: resq 3 >> >+endstruc >> >+ >> >+struc LUT3DContext >> >+ .class: resq 1 >> >+ .lut: resq 1 >> >+ .lutsize: resd 1 >> >+ .lutsize2: resd 1 >> >+ .scale: resd 3 >> >+endstruc >> >+ >> >+%define AV_NUM_DATA_POINTERS 8 >> >+ >> >+struc AVFrame >> >+ .data: resq AV_NUM_DATA_POINTERS >> >+ .linesize: resd AV_NUM_DATA_POINTERS >> >+ .extended_data: resq 1 >> >+ .width: resd 1 >> >+ .height: resd 1 >> >+endstruc >> >+ >> >+%define rm rsp >> >+%define gm rsp+mmsize >> >+%define bm rsp+(mmsize*2) >> >+ >> >+%define lut3dsizem [rsp+mmsize*3] >> >+%define lut3dsize2m [rsp+mmsize*4] >> >+%define lut3dmaxm [rsp+mmsize*5] >> >+%define prelutmaxm [rsp+mmsize*6] >> >+ >> >+%define scalerm [rsp+mmsize*7] >> >+%define scalegm [rsp+mmsize*8] >> >+%define scalebm [rsp+mmsize*9] >> >+ >> >+%define prelutminrm [rsp+mmsize*10] >> >+%define prelutmingm [rsp+mmsize*11] >> >+%define prelutminbm [rsp+mmsize*12] >> >+ >> >+%define prelutscalerm [rsp+mmsize*13] >> >+%define prelutscalegm [rsp+mmsize*14] >> >+%define prelutscalebm [rsp+mmsize*15] >> >+ >> >+; data pointers >> >+%define srcrm [rsp+mmsize*16 + 
0] >> >+%define srcgm [rsp+mmsize*16 + 8] >> >+%define srcbm [rsp+mmsize*16 + 16] >> >+%define srcam [rsp+mmsize*16 + 24] >> >+ >> >+%define dstrm [rsp+mmsize*16 + 32] >> >+%define dstgm [rsp+mmsize*16 + 40] >> >+%define dstbm [rsp+mmsize*16 + 48] >> >+%define dstam [rsp+mmsize*16 + 56] >> >+ >> >+%macro FETCH_PRELUT_PN 3 >> >+ mov tmp2d, [rm + %3] >> >+ mov tmp3d, [gm + %3] >> >+ movss xm%1, [tmpq + tmp2q*4] >> >+ movss xm%2, [tmpq + tmp3q*4] >> >+ movss [rm + %3], xm%1 >> >+ movss [gm + %3], xm%2 >> >+%endmacro >> >+ >> >+; 1 - p >> >+; 2 - n >> >+; 3 - p indices >> >+; 4 - n indices >> >+%macro GATHER_PRELUT 4 >> >+ %if cpuflag(avx2) >> >+ vpcmpeqb m7, m7 >> >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p >> >+ vpcmpeqb m9, m9 >> >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n >> >+ %else >> >+ mova [rm], m%3 >> >+ mova [gm], m%4 >> >+ FETCH_PRELUT_PN %1, %2, 0 >> >+ FETCH_PRELUT_PN %1, %2, 4 >> >+ FETCH_PRELUT_PN %1, %2, 8 >> >+ FETCH_PRELUT_PN %1, %2, 12 >> >+ %if mmsize > 16 >> >+ FETCH_PRELUT_PN %1, %2, 16 >> >+ FETCH_PRELUT_PN %1, %2, 20 >> >+ FETCH_PRELUT_PN %1, %2, 24 >> >+ FETCH_PRELUT_PN %1, %2, 28 >> >+ %endif >> >+ movu m%1, [rm] >> >+ movu m%2, [gm] >> >+ %endif >> >+%endmacro >> >+ >> >+%macro FLOORPS 2 >> >+ %if mmsize > 16 >> >+ vroundps %1, %2, 0x01 >> >+ %else >> >+ cvttps2dq %1, %2 >> >+ cvtdq2ps %1, %1 >> >+ %endif >> >+%endmacro >> >+ >> >+; 1 - dst >> >+; 2 - index >> >+; 3 - min >> >+; 4 - scale >> >+; assumes lut max m13, m14 1.0f, zero m15 >> >+%macro APPLY_PRELUT 4 >> >+ ; scale >> >+ subps m5, m%1, %3 ; v - min >> >+ mulps m5, m5, %4 ; v * scale >> >+ ; clamp >> >+ maxps m5, m5, m15 ; max zero >> >+ minps m5, m5, m13 ; min lut max >> >+ >> >+ FLOORPS m3, m5 ; prev index >> >+ subps m5, m5, m3 ; d >> >+ addps m4, m3, m14 ; p+1 = n index >> >+ minps m4, m4, m13 ; clamp n idex >> >+ >> >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] >> >+ cvttps2dq m6, m3 >> >+ cvttps2dq m10, m4 >> >+ GATHER_PRELUT 3, 4, 6, 10 >> >+ >> >+ ; lerp >> >+ subps m8, m4, m3 
>> >+ mulps m8, m8, m5 >> >+ addps m%1, m8, m3 >> >+%endmacro >> >+ >> >+; 1 - dst >> >+; 2 - scale >> >+; assumes lut max m13, zero m15 >> >+%macro APPLY_SCALE 2 >> >+ mulps m%1, m%1, %2 >> >+ maxps m%1, m%1, m15 >> >+ minps m%1, m%1, m13 >> >+%endmacro >> >+ >> >+%macro BLEND 4 >> >+%if mmsize > 16 >> >+ vblendvps %1, %2, %3, %4 >> >+%else >> >+ %ifidni %1,%2 >> >+ %error operand 1 must not equal operand 2 >> >+ %endif >> >+ %ifidni %1,%3 >> >+ %error operand 1 must not equal operand 3 >> >+ %endif >> >+ mova %1, %2 >> >+ xorps %1, %3 >> >+ andps %1, %4 >> >+ xorps %1, %2 >> >+%endif >> >+%endmacro >> >+ >> >+; sets nans to zere, +inf -inf handled later by min/max clamps >> >+%macro SANITIZE_F 1 >> >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False >> >+ %if mmsize <= 16 >> >+ mova m6, %1 >> >+ BLEND %1, m15, m6, m5 >> >+ %else >> >+ BLEND %1, m15, %1, m5 >> >+ %endif >> >+%endmacro >> >+ >> >+%macro ADD3 4 >> >+ addps %1, %2, %3 >> >+ addps %1, %1, %4 >> >+%endmacro >> >+ >> >+%macro CMP_EQUAL 3 >> >+%if cpuflag(avx2) >> >+ vpcmpeqd %1, %2, %3 >> >+%elif cpuflag(avx) >> >+ cmpps %1, %2, %3, 0x0 >> >+%else >> >+ pcmpeqd %1, %2, %3 >> >+%endif >> >+%endmacro >> >+ >> >+%macro SHIFT_RIGHT 2 >> >+%if mmsize <= 16 >> >+ psrld xm%1, %2 >> >+%elif cpuflag(avx2) >> >+ vpsrld m%1, m%1, %2 >> >+%else >> >+ vextractf128 xm15, m%1, 1 >> >+ psrld xm%1, %2 >> >+ psrld xm15, %2 >> >+ vinsertf128 m%1, m%1, xm15, 1 >> >+%endif >> >+%endmacro >> >+ >> >+%macro FETCH_LUT3D_RGB 4 >> >+ mov tmp2d, [rm + %4] >> >+ movss xm%1, [tmpq + tmp2q*4 + 0] >> >+ movss xm%2, [tmpq + tmp2q*4 + 4] >> >+ movss xm%3, [tmpq + tmp2q*4 + 8] >> >+ movss [rm + %4], xm%1 >> >+ movss [gm + %4], xm%2 >> >+ movss [bm + %4], xm%3 >> >+%endmacro >> >+ >> >+; 1 - dstr >> >+; 2 - dstg >> >+; 3 - dstb >> >+; 4 - indices >> >+%macro GATHER_LUT3D_INDICES 4 >> >+%if cpuflag(avx2) >> >+ vpcmpeqb m3, m3 >> >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3 >> >+ vpcmpeqb m14, m14 >> >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14 >> >+ 
vpcmpeqb m15, m15 >> >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15 >> >+%else >> >+ movu [rm], m%4 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 0 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 4 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 8 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 12 >> >+%if mmsize > 16 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 16 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 20 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 24 >> >+ FETCH_LUT3D_RGB %1, %2, %3, 28 >> >+%endif >> >+ movu m%1, [rm] >> >+ movu m%2, [gm] >> >+ movu m%3, [bm] >> >+%endif >> >+%endmacro >> >+ >> >+%macro interp_tetrahedral 0 >> >+ %define d_r m0 >> >+ %define d_g m1 >> >+ %define d_b m2 >> >+ >> >+ %define prev_r m3 >> >+ %define prev_g m4 >> >+ %define prev_b m5 >> >+ >> >+ %define next_r m6 >> >+ %define next_g m7 >> >+ %define next_b m8 >> >+ >> >+ %define x0 m4 >> >+ %define x1 m5 >> >+ %define x2 m6 >> >+ >> >+ ; setup prev index >> >+ FLOORPS prev_r, m0 >> >+ FLOORPS prev_g, m1 >> >+ FLOORPS prev_b, m2 >> >+ >> >+ ; setup deltas >> >+ subps d_r, m0, prev_r >> >+ subps d_g, m1, prev_g >> >+ subps d_b, m2, prev_b >> >+ >> >+ ; calculate select mask m9 >> >+ movu m6, [pd_tetra_table2] >> >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ >> >+ BLEND m10, m6, [pd_tetra_table1], m7 >> >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ >> >+ BLEND m6, m10, [pd_tetra_table0], m7 >> >+ >> >+ movu m10, [pd_tetra_table5] >> >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ >> >+ BLEND m9, m10, [pd_tetra_table4], m7 >> >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ >> >+ BLEND m10, m9, [pd_tetra_table3], m7 >> >+ >> >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ >> >+ BLEND m9, m10, m6, m7 >> >+ >> >+ ; setup next index >> >+ addps next_r, prev_r, m14 ; +1 >> >+ minps next_r, next_r, m13 ; clamp lutmax >> >+ >> >+ addps next_g, prev_g, m14 ; +1 >> >+ minps next_g, next_g, m13 ; clamp lutmax >> >+ >> >+ addps next_b, prev_b, m14 ; +1 >> >+ minps next_b, next_b, m13 ; clamp lutmax >> >+ >> >+ ; prescale indices >> >+ mulps prev_r, prev_r, lut3dsize2m >> >+ mulps 
next_r, next_r, lut3dsize2m >> >+ >> >+ mulps prev_g, prev_g, lut3dsizem >> >+ mulps next_g, next_g, lut3dsizem >> >+ >> >+ mulps prev_b, prev_b, [pd_3f] >> >+ mulps next_b, next_b, [pd_3f] >> >+ >> >+ movu m14, [pd_001] >> >+ >> >+ ; cxxa m10 >> >+ ; b >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m10, prev_b, next_b, m15 >> >+ >> >+ ; g >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m12, prev_g, next_g, m15 >> >+ >> >+ ; r >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m13, prev_r, next_r, m15 >> >+ >> >+ ADD3 m10, m10, m12, m13 >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 3 >> >+ >> >+ ; cxxb m11; >> >+ ; b >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m11, prev_b, next_b, m15 >> >+ >> >+ ; g >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m12, prev_g, next_g, m15 >> >+ >> >+ ; r >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m13, prev_r, next_r, m15 >> >+ >> >+ ADD3 m11, m11, m12, m13 >> >+ >> >+ ; c000 m12; >> >+ ADD3 m12, prev_r, prev_g, prev_b >> >+ >> >+ ; c111 m13; >> >+ ADD3 m13, next_r, next_g, next_b >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 6 >> >+ >> >+ ; x0, m4 >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND x0, m7, d_b, m15 ; b >> >+ >> >+ ; x1, m5 >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ SHIFT_RIGHT 9, 3 ; 9 >> >+ >> >+ andps m15, m9, m14 >> >+ CMP_EQUAL m15, m15, m14 >> >+ BLEND x1, m7, d_b, m15 ; b >> >+ >> >+ ; x2, m6 >> >+ andps m15, m9, [pd_010] >> >+ CMP_EQUAL m15, m15, [pd_010] >> >+ BLEND m7, d_r, d_g, m15 ; r,g >> >+ >> >+ andps m15, m9, [pd_100] >> >+ CMP_EQUAL m15, m15, [pd_100] >> >+ BLEND x2, m7, d_b, m15 ; b >> >+ >> >+ ; convert indices to integer >> >+ cvttps2dq m12, m12 >> >+ cvttps2dq m10, m10 >> >+ 
cvttps2dq m11, m11 >> >+ cvttps2dq m13, m13 >> >+ >> >+ ; now the gathering festival >> >+ mov tmpq, [ctxq + LUT3DContext.lut] >> >+ >> >+ GATHER_LUT3D_INDICES 0, 1, 2, 12 >> >+ movu m14, [pd_1f] >> >+ subps m14, m14, x0; 1 - x0 >> >+ >> >+ mulps m0, m0, m14 >> >+ mulps m1, m1, m14 >> >+ mulps m2, m2, m14 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 10 >> >+ subps m14, x0, x1; x0 - x1 >> >+ mulps m7, m7, m14 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, m14 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, m14 >> >+ addps m2, m2, m9 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 11 >> >+ subps m14, x1, x2; x1 - x2 >> >+ >> >+ mulps m7, m7, m14 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, m14 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, m14 >> >+ addps m2, m2, m9 >> >+ >> >+ GATHER_LUT3D_INDICES 7, 8, 9, 13 >> >+ mulps m7, m7, x2 >> >+ addps m0, m0, m7 >> >+ >> >+ mulps m8, m8, x2 >> >+ addps m1, m1, m8 >> >+ >> >+ mulps m9, m9, x2 >> >+ addps m2, m2, m9 >> >+%endmacro >> >+ >> >+%macro INIT_DATA_PTR 3 >> >+ mov ptrq, [%2 + AVFrame.data + %3 * 8] >> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >> >+ imul tmpd, slice_startd >> >+ add ptrq, tmpq >> >+ mov %1, ptrq >> >+%endmacro >> >+ >> >+%macro INC_DATA_PTR 3 >> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4] >> >+ mov ptrq, %1 >> >+ add ptrq, tmpq >> >+ mov %1, ptrq >> >+%endmacro >> >+ >> >+%macro LOAD16 2 >> >+ mov ptrq, %2 >> >+ %if mmsize > 16 >> >+ movu xm%1, [ptrq + xq*2] >> >+ %else >> >+ movsd xm%1, [ptrq + xq*2] >> >+ %endif >> >+ %if cpuflag(avx2) >> >+ vpmovzxwd m%1, xm%1 >> >+ %else >> >+ %if mmsize > 16 >> >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) >> >+ pshufb xm%1, xm6 ; pb_shuffle16 >> >+ pshufb xm4, xm6 ; pb_shuffle16 >> >+ vinsertf128 m%1, m%1, xm4, 1 >> >+ %else >> >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) >> >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ %endif >> >+ %endif >> >+ 
cvtdq2ps m%1, m%1 >> >+ mulps m%1, m%1, m7 ; pd_65535_invf >> >+%endmacro >> >+ >> >+%macro STORE16 2 >> >+ mulps m%2, m%2, m5 ; [pd_65535f] >> >+ minps m%2, m%2, m5 ; [pd_65535f] >> >+ maxps m%2, m%2, m15 ; zero >> >+ cvttps2dq m%2, m%2 >> >+ %if mmsize > 16 >> >+ vextractf128 xm4, m%2, 1 >> >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] >> >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] >> >+ por xm%2, xm4 >> >+ %else >> >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) >> >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) >> >+ %endif >> >+ mov ptrq, %1 >> >+ %if mmsize > 16 >> >+ movu [ptrq + xq*2], xm%2 >> >+ %else >> >+ movsd [ptrq + xq*2], xm%2 >> >+ %endif >> >+%endmacro >> >+ >> >+; 1 - interp method >> >+; 2 - format_name >> >+; 3 - depth >> >+; 4 - is float format >> >+%macro DEFINE_INTERP_FUNC 4 >> >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, >> src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, >> tmp, tmp2, tmp3 >> >+ ; store lut max and lutsize >> >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize] >> >+ cvtsi2ss xm0, tmpd >> >+ mulss xm0, xm0, [pd_3f] >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dsizem, m0 >> >+ sub tmpd, 1 >> >+ cvtsi2ss xm0, tmpd >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dmaxm, m0 >> >+ >> >+ ; scale_r >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalerm, m1 >> >+ >> >+ ; scale_g >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalegm, m1 >> >+ >> >+ ; scale_b >> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] >> >+ VBROADCASTSS m1, xm1 >> >+ mova scalebm, m1 >> >+ >> >+ ; store lutsize2 >> >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] >> >+ mulss xm0, xm0, [pd_3f] >> >+ VBROADCASTSS m0, xm0 >> >+ mova lut3dsize2m, m0 >> >+ >> >+ ; init prelut values >> >+ cmp prelutq, 0 >> >+ je %%skip_init_prelut >> 
>+ mov tmpd, dword [prelutq + Lut3DPreLut.size] >> >+ sub tmpd, 1 >> >+ cvtsi2ss xm0, tmpd >> >+ VBROADCASTSS m0, xm0 >> >+ mova prelutmaxm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] >> >+ mova prelutminrm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] >> >+ mova prelutmingm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] >> >+ mova prelutminbm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4] >> >+ mova prelutscalerm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4] >> >+ mova prelutscalegm, m0 >> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] >> >+ mova prelutscalebm, m0 >> >+ %%skip_init_prelut: >> >+ >> >+ mov widthd, [src_imageq + AVFrame.width] >> >+ >> >+ ; gbra pixel order >> >+ INIT_DATA_PTR srcrm, src_imageq, 2 >> >+ INIT_DATA_PTR srcgm, src_imageq, 0 >> >+ INIT_DATA_PTR srcbm, src_imageq, 1 >> >+ INIT_DATA_PTR srcam, src_imageq, 3 >> >+ >> >+ INIT_DATA_PTR dstrm, dst_imageq, 2 >> >+ INIT_DATA_PTR dstgm, dst_imageq, 0 >> >+ INIT_DATA_PTR dstbm, dst_imageq, 1 >> >+ INIT_DATA_PTR dstam, dst_imageq, 3 >> >+ >> >+ %%loop_y: >> >+ xor xq, xq >> >+ %%loop_x: >> >+ movu m14, [pd_1f] >> >+ xorps m15, m15, m15 >> >+ %if %4 ; float >> >+ mov ptrq, srcrm >> >+ movu m0, [ptrq + xq*4] >> >+ mov ptrq, srcgm >> >+ movu m1, [ptrq + xq*4] >> >+ mov ptrq, srcbm >> >+ movu m2, [ptrq + xq*4] >> >+ SANITIZE_F m0 >> >+ SANITIZE_F m1 >> >+ SANITIZE_F m2 >> >+ %else >> >+ ; constants for LOAD16 >> >+ movu m7, [pd_65535_invf] >> >+ %if notcpuflag(avx2) && mmsize >= 32 >> >+ movu xm6, [pb_shuffle16] >> >+ %endif >> >+ LOAD16 0, srcrm >> >+ LOAD16 1, srcgm >> >+ LOAD16 2, srcbm >> >+ %endif >> >+ >> >+ cmp prelutq, 0 >> >+ je %%skip_prelut >> >+ mova m13, prelutmaxm >> >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm >> >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm >> >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm >> >+ %%skip_prelut: >> >+ >> >+ mova m13, 
lut3dmaxm >> >+ APPLY_SCALE 0, scalerm >> >+ APPLY_SCALE 1, scalegm >> >+ APPLY_SCALE 2, scalebm >> >+ >> >+ interp_%1 >> >+ >> >+ %if %4 ; float >> >+ mov ptrq, dstrm >> >+ movu [ptrq + xq*4], m0 >> >+ mov ptrq, dstgm >> >+ movu [ptrq + xq*4], m1 >> >+ mov ptrq, dstbm >> >+ movu [ptrq + xq*4], m2 >> >+ cmp has_alphad, 0 >> >+ je %%skip_alphaf >> >+ mov ptrq, srcam >> >+ movu m0, [ptrq + xq*4] >> >+ mov ptrq, dstam >> >+ movu [ptrq + xq*4], m0 >> >+ %%skip_alphaf: >> >+ %else >> >+ ; constants for STORE16 >> >+ movu m5, [pd_65535f] >> >+ %if mmsize > 16 >> >+ movu xm6, [pb_lo_pack_shuffle16] >> >+ movu xm7, [pb_hi_pack_shuffle16] >> >+ %endif >> >+ >> >+ xorps m15, m15, m15 >> >+ STORE16 dstrm, 0 >> >+ STORE16 dstgm, 1 >> >+ STORE16 dstbm, 2 >> >+ >> >+ cmp has_alphad, 0 >> >+ je %%skip_alpha >> >+ %if mmsize > 16 >> >+ mov ptrq, srcam >> >+ movu xm0, [ptrq + xq*2] >> >+ mov ptrq, dstam >> >+ movu [ptrq + xq*2], xm0 >> >+ %else >> >+ mov ptrq, srcam >> >+ movsd xm0, [ptrq + xq*2] >> >+ mov ptrq, dstam >> >+ movsd [ptrq + xq*2], xm0 >> >+ %endif >> >+ >> >+ %%skip_alpha: >> >+ %endif >> >+ >> >+ add xq, mmsize/4 >> >+ cmp xd, widthd >> >+ jl %%loop_x >> >+ >> >+ INC_DATA_PTR srcrm, src_imageq, 2 >> >+ INC_DATA_PTR srcgm, src_imageq, 0 >> >+ INC_DATA_PTR srcbm, src_imageq, 1 >> >+ INC_DATA_PTR srcam, src_imageq, 3 >> >+ >> >+ INC_DATA_PTR dstrm, dst_imageq, 2 >> >+ INC_DATA_PTR dstgm, dst_imageq, 0 >> >+ INC_DATA_PTR dstbm, dst_imageq, 1 >> >+ INC_DATA_PTR dstam, dst_imageq, 3 >> >+ >> >+ inc slice_startd >> >+ cmp slice_startd, slice_endd >> >+ jl %%loop_y >> >+ >> >+ RET >> >+%endmacro >> >+%if ARCH_X86_64 >> >+ %if HAVE_AVX2_EXTERNAL >> >+ INIT_YMM avx2 >> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+ %endif >> >+ %if HAVE_AVX_EXTERNAL >> >+ INIT_YMM avx >> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+ %endif >> >+ INIT_XMM sse2 >> >+ DEFINE_INTERP_FUNC 
tetrahedral, pf32, 32, 1 >> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 >> >+%endif >> >\ No newline at end of file >> >diff --git a/libavfilter/x86/vf_lut3d_init.c >> b/libavfilter/x86/vf_lut3d_init.c >> >new file mode 100644 >> >index 0000000000..9b9b36e4af >> >--- /dev/null >> >+++ b/libavfilter/x86/vf_lut3d_init.c >> >@@ -0,0 +1,88 @@ >> >+/* >> >+ * Copyright (c) 2021 Mark Reid <mindmark@gmail.com> >> >+ * >> >+ * This file is part of FFmpeg. >> >+ * >> >+ * FFmpeg is free software; you can redistribute it and/or >> >+ * modify it under the terms of the GNU Lesser General Public >> >+ * License as published by the Free Software Foundation; either >> >+ * version 2.1 of the License, or (at your option) any later version. >> >+ * >> >+ * FFmpeg is distributed in the hope that it will be useful, >> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of >> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >+ * Lesser General Public License for more details. >> >+ * >> >+ * You should have received a copy of the GNU Lesser General Public >> >+ * License along with FFmpeg; if not, write to the Free Software >> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> >+ */ >> >+ >> >+#include "libavutil/attributes.h" >> >+#include "libavutil/cpu.h" >> >+#include "libavutil/x86/cpu.h" >> >+#include "libavfilter/lut3d.h" >> >+ >> >+#define DEFINE_INTERP_FUNC(name, format, opt) >> >> \ >> >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, >> Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int >> slice_end, int has_alpha); \ >> >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void >> *arg, int jobnr, int nb_jobs) >> \ >> >+{ >> >> \ >> >+ LUT3DContext *lut3d = ctx->priv; >> >> \ >> >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? 
&lut3d->prelut: >> NULL; >> \ >> >+ ThreadData *td = arg; >> >> \ >> >+ AVFrame *in = td->in; >> >> \ >> >+ AVFrame *out = td->out; >> >> \ >> >+ int has_alpha = in->linesize[3] && out != in; >> >> \ >> >+ int slice_start = (in->height * jobnr ) / nb_jobs; >> >> \ >> >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; >> >> \ >> >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, >> slice_start, slice_end, has_alpha); >> \ >> >+ return 0; >> >> \ >> >+} >> >+ >> >+#if ARCH_X86_64 >> >+#if HAVE_AVX2_EXTERNAL >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) >> >+#endif >> >+#if HAVE_AVX_EXTERNAL >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx) >> >+#endif >> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) >> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) >> >+#endif >> >+ >> >+ >> >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const >> AVPixFmtDescriptor *desc) >> >+{ >> >+ int cpu_flags = av_get_cpu_flags(); >> >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; >> >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; >> >+ int depth = desc->comp[0].depth; >> >+ >> >+#if ARCH_X86_64 >> >+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+#if HAVE_AVX2_EXTERNAL >> >+ if (isfloat && planar) { >> >+ s->interp = interp_tetrahedral_pf32_avx2; >> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_avx2; >> >+ } >> >+#endif >> >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+#if HAVE_AVX_EXTERNAL >> >+ if (isfloat) { >> >+ s->interp = interp_tetrahedral_pf32_avx; >> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_avx; >> >+ } >> >+#endif >> >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == >> INTERPOLATE_TETRAHEDRAL && planar) { >> >+ if (isfloat) { >> >+ s->interp = interp_tetrahedral_pf32_sse2; 
>> >+ } else if (depth == 16) { >> >+ s->interp = interp_tetrahedral_p16_sse2; >> >+ } >> >+ } >> >+#endif >> >+} >> >-- >> >2.31.1.windows.1 >> > >> >_______________________________________________ >> >ffmpeg-devel mailing list >> >ffmpeg-devel@ffmpeg.org >> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> > >> >To unsubscribe, visit link above, or email >> >ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe". >> >
diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h new file mode 100644 index 0000000000..ded2a036a5 --- /dev/null +++ b/libavfilter/lut3d.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2013 Clément Bœsch + * Copyright (c) 2018 Paul B Mahol + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVFILTER_LUT3D_H +#define AVFILTER_LUT3D_H + +#include "libavutil/pixdesc.h" +#include "framesync.h" +#include "avfilter.h" + +enum interp_mode { + INTERPOLATE_NEAREST, + INTERPOLATE_TRILINEAR, + INTERPOLATE_TETRAHEDRAL, + INTERPOLATE_PYRAMID, + INTERPOLATE_PRISM, + NB_INTERP_MODE +}; + +struct rgbvec { + float r, g, b; +}; + +/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT + * of 512x512 (64x64x64) */ +#define MAX_LEVEL 256 +#define PRELUT_SIZE 65536 + +typedef struct Lut3DPreLut { + int size; + float min[3]; + float max[3]; + float scale[3]; + float* lut[3]; +} Lut3DPreLut; + +typedef struct LUT3DContext { + const AVClass *class; + struct rgbvec *lut; + int lutsize; + int lutsize2; + struct rgbvec scale; + int interpolation; ///<interp_mode + char *file; + uint8_t rgba_map[4]; + int step; + avfilter_action_func *interp; + Lut3DPreLut prelut; +#if CONFIG_HALDCLUT_FILTER + uint8_t clut_rgba_map[4]; + int clut_step; + int 
clut_bits; + int clut_planar; + int clut_float; + int clut_width; + FFFrameSync fs; +#endif +} LUT3DContext; + +typedef struct ThreadData { + AVFrame *in, *out; +} ThreadData; + +void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc); + +#endif /* AVFILTER_LUT3D_H */ \ No newline at end of file diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c index 9fbda833b9..1fd0af06db 100644 --- a/libavfilter/vf_lut3d.c +++ b/libavfilter/vf_lut3d.c @@ -31,73 +31,18 @@ #include "libavutil/intreadwrite.h" #include "libavutil/intfloat.h" #include "libavutil/avassert.h" -#include "libavutil/pixdesc.h" #include "libavutil/avstring.h" -#include "avfilter.h" #include "drawutils.h" #include "formats.h" -#include "framesync.h" #include "internal.h" #include "video.h" +#include "lut3d.h" #define R 0 #define G 1 #define B 2 #define A 3 -enum interp_mode { - INTERPOLATE_NEAREST, - INTERPOLATE_TRILINEAR, - INTERPOLATE_TETRAHEDRAL, - INTERPOLATE_PYRAMID, - INTERPOLATE_PRISM, - NB_INTERP_MODE -}; - -struct rgbvec { - float r, g, b; -}; - -/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT - * of 512x512 (64x64x64) */ -#define MAX_LEVEL 256 -#define PRELUT_SIZE 65536 - -typedef struct Lut3DPreLut { - int size; - float min[3]; - float max[3]; - float scale[3]; - float* lut[3]; -} Lut3DPreLut; - -typedef struct LUT3DContext { - const AVClass *class; - int interpolation; ///<interp_mode - char *file; - uint8_t rgba_map[4]; - int step; - avfilter_action_func *interp; - struct rgbvec scale; - struct rgbvec *lut; - int lutsize; - int lutsize2; - Lut3DPreLut prelut; -#if CONFIG_HALDCLUT_FILTER - uint8_t clut_rgba_map[4]; - int clut_step; - int clut_bits; - int clut_planar; - int clut_float; - int clut_width; - FFFrameSync fs; -#endif -} LUT3DContext; - -typedef struct ThreadData { - AVFrame *in, *out; -} ThreadData; - #define OFFSET(x) offsetof(LUT3DContext, x) #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM #define TFLAGS 
AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM @@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink) av_assert0(0); } + if (ARCH_X86) { + ff_lut3d_init_x86(lut3d, desc); + } + return 0; } diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 016a5b3511..a29941eaeb 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o +OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o @@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o +X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm new file mode 100644 index 0000000000..b3d7c3962b --- /dev/null +++ b/libavfilter/x86/vf_lut3d.asm @@ -0,0 +1,757 @@ +;***************************************************************************** +;* x86-optimized functions for lut3d filter +;* +;* Copyright (c) 2021 Mark Reid <mindmark@gmail.com> +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA +pd_1f: times 8 dd 1.0 +pd_3f: times 8 dd 3.0 + +; used to limit rshifts as they are more expensive in avx1 +pd_001: times 8 dd 001b +pd_010: times 8 dd 010b +pd_100: times 8 dd 100b + +pd_65535f: times 8 dd 65535.0 +pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0 + +pb_shuffle16: db 0, 1, 0x80, 0x80, \ + 2, 3, 0x80, 0x80, \ + 4, 5, 0x80, 0x80, \ + 6, 7, 0x80, 0x80 + +pb_lo_pack_shuffle16: db 0, 1, 4, 5, \ + 8, 9, 12, 13, \ + 0x80, 0x80, 0x80, 0x80, \ + 0x80, 0x80, 0x80, 0x80 + +pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \ + 0x80, 0x80, 0x80, 0x80, \ + 0, 1, 4, 5, \ + 8, 9, 12, 13 + +; tetrahedral table -------------------------------------------- +; name: x2| x1| x0| cxxb| cxxa +; values: r 00| r 00| r 00| c011 011| c001 001 +; g 01| g 01| g 01| c101 101| c010 010 +; b 10| b 10| b 10| c110 110| c100 100 + +; g>b b | g | r | c110 | c100 +pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b +; r>b g | b | r | c101 | c100 +pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b +; else g | r | b | c101 | c001 +pd_tetra_table2: times 8 
dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b +; b>g r | g | b | c011 | c001 +pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b +; b>r r | b | g | c011 | c010 +pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b +; else b | r | g | c110 | c010 +pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b + +SECTION .text + +struc Lut3DPreLut + .size: resd 1 + .min: resd 3 + .max: resd 3 + .scale: resd 3 + .lut: resq 3 +endstruc + +struc LUT3DContext + .class: resq 1 + .lut: resq 1 + .lutsize: resd 1 + .lutsize2: resd 1 + .scale: resd 3 +endstruc + +%define AV_NUM_DATA_POINTERS 8 + +struc AVFrame + .data: resq AV_NUM_DATA_POINTERS + .linesize: resd AV_NUM_DATA_POINTERS + .extended_data: resq 1 + .width: resd 1 + .height: resd 1 +endstruc + +%define rm rsp +%define gm rsp+mmsize +%define bm rsp+(mmsize*2) + +%define lut3dsizem [rsp+mmsize*3] +%define lut3dsize2m [rsp+mmsize*4] +%define lut3dmaxm [rsp+mmsize*5] +%define prelutmaxm [rsp+mmsize*6] + +%define scalerm [rsp+mmsize*7] +%define scalegm [rsp+mmsize*8] +%define scalebm [rsp+mmsize*9] + +%define prelutminrm [rsp+mmsize*10] +%define prelutmingm [rsp+mmsize*11] +%define prelutminbm [rsp+mmsize*12] + +%define prelutscalerm [rsp+mmsize*13] +%define prelutscalegm [rsp+mmsize*14] +%define prelutscalebm [rsp+mmsize*15] + +; data pointers +%define srcrm [rsp+mmsize*16 + 0] +%define srcgm [rsp+mmsize*16 + 8] +%define srcbm [rsp+mmsize*16 + 16] +%define srcam [rsp+mmsize*16 + 24] + +%define dstrm [rsp+mmsize*16 + 32] +%define dstgm [rsp+mmsize*16 + 40] +%define dstbm [rsp+mmsize*16 + 48] +%define dstam [rsp+mmsize*16 + 56] + +%macro FETCH_PRELUT_PN 3 + mov tmp2d, [rm + %3] + mov tmp3d, [gm + %3] + movss xm%1, [tmpq + tmp2q*4] + movss xm%2, [tmpq + tmp3q*4] + movss [rm + %3], xm%1 + movss [gm + %3], xm%2 +%endmacro + +; 1 - p +; 2 - n +; 3 - p indices +; 4 - n indices +%macro GATHER_PRELUT 4 + %if 
cpuflag(avx2) + vpcmpeqb m7, m7 + vgatherdps m%1, [tmpq + m%3*4], m7 ; p + vpcmpeqb m9, m9 + vgatherdps m%2, [tmpq + m%4*4], m9 ; n + %else + mova [rm], m%3 + mova [gm], m%4 + FETCH_PRELUT_PN %1, %2, 0 + FETCH_PRELUT_PN %1, %2, 4 + FETCH_PRELUT_PN %1, %2, 8 + FETCH_PRELUT_PN %1, %2, 12 + %if mmsize > 16 + FETCH_PRELUT_PN %1, %2, 16 + FETCH_PRELUT_PN %1, %2, 20 + FETCH_PRELUT_PN %1, %2, 24 + FETCH_PRELUT_PN %1, %2, 28 + %endif + movu m%1, [rm] + movu m%2, [gm] + %endif +%endmacro + +%macro FLOORPS 2 + %if mmsize > 16 + vroundps %1, %2, 0x01 + %else + cvttps2dq %1, %2 + cvtdq2ps %1, %1 + %endif +%endmacro + +; 1 - dst +; 2 - index +; 3 - min +; 4 - scale +; assumes lut max m13, m14 1.0f, zero m15 +%macro APPLY_PRELUT 4 + ; scale + subps m5, m%1, %3 ; v - min + mulps m5, m5, %4 ; v * scale + ; clamp + maxps m5, m5, m15 ; max zero + minps m5, m5, m13 ; min lut max + + FLOORPS m3, m5 ; prev index + subps m5, m5, m3 ; d + addps m4, m3, m14 ; p+1 = n index + minps m4, m4, m13 ; clamp n index + + mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8] + cvttps2dq m6, m3 + cvttps2dq m10, m4 + GATHER_PRELUT 3, 4, 6, 10 + + ; lerp + subps m8, m4, m3 + mulps m8, m8, m5 + addps m%1, m8, m3 +%endmacro + +; 1 - dst +; 2 - scale +; assumes lut max m13, zero m15 +%macro APPLY_SCALE 2 + mulps m%1, m%1, %2 + maxps m%1, m%1, m15 + minps m%1, m%1, m13 +%endmacro + +%macro BLEND 4 +%if mmsize > 16 + vblendvps %1, %2, %3, %4 +%else + %ifidni %1,%2 + %error operand 1 must not equal operand 2 + %endif + %ifidni %1,%3 + %error operand 1 must not equal operand 3 + %endif + mova %1, %2 + xorps %1, %3 + andps %1, %4 + xorps %1, %2 +%endif +%endmacro + +; sets nans to zero, +inf -inf handled later by min/max clamps +%macro SANITIZE_F 1 + cmpps m5, %1, %1, 0x0 ; nan == nan = False + %if mmsize <= 16 + mova m6, %1 + BLEND %1, m15, m6, m5 + %else + BLEND %1, m15, %1, m5 + %endif +%endmacro + +%macro ADD3 4 + addps %1, %2, %3 + addps %1, %1, %4 +%endmacro + +%macro CMP_EQUAL 3 +%if cpuflag(avx2) + vpcmpeqd %1, 
%2, %3 +%elif cpuflag(avx) + cmpps %1, %2, %3, 0x0 +%else + pcmpeqd %1, %2, %3 +%endif +%endmacro + +%macro SHIFT_RIGHT 2 +%if mmsize <= 16 + psrld xm%1, %2 +%elif cpuflag(avx2) + vpsrld m%1, m%1, %2 +%else + vextractf128 xm15, m%1, 1 + psrld xm%1, %2 + psrld xm15, %2 + vinsertf128 m%1, m%1, xm15, 1 +%endif +%endmacro + +%macro FETCH_LUT3D_RGB 4 + mov tmp2d, [rm + %4] + movss xm%1, [tmpq + tmp2q*4 + 0] + movss xm%2, [tmpq + tmp2q*4 + 4] + movss xm%3, [tmpq + tmp2q*4 + 8] + movss [rm + %4], xm%1 + movss [gm + %4], xm%2 + movss [bm + %4], xm%3 +%endmacro + +; 1 - dstr +; 2 - dstg +; 3 - dstb +; 4 - indices +%macro GATHER_LUT3D_INDICES 4 +%if cpuflag(avx2) + vpcmpeqb m3, m3 + vgatherdps m%1, [tmpq + m%4*4 + 0], m3 + vpcmpeqb m14, m14 + vgatherdps m%2, [tmpq + m%4*4 + 4], m14 + vpcmpeqb m15, m15 + vgatherdps m%3, [tmpq + m%4*4 + 8], m15 +%else + movu [rm], m%4 + FETCH_LUT3D_RGB %1, %2, %3, 0 + FETCH_LUT3D_RGB %1, %2, %3, 4 + FETCH_LUT3D_RGB %1, %2, %3, 8 + FETCH_LUT3D_RGB %1, %2, %3, 12 +%if mmsize > 16 + FETCH_LUT3D_RGB %1, %2, %3, 16 + FETCH_LUT3D_RGB %1, %2, %3, 20 + FETCH_LUT3D_RGB %1, %2, %3, 24 + FETCH_LUT3D_RGB %1, %2, %3, 28 +%endif + movu m%1, [rm] + movu m%2, [gm] + movu m%3, [bm] +%endif +%endmacro + +%macro interp_tetrahedral 0 + %define d_r m0 + %define d_g m1 + %define d_b m2 + + %define prev_r m3 + %define prev_g m4 + %define prev_b m5 + + %define next_r m6 + %define next_g m7 + %define next_b m8 + + %define x0 m4 + %define x1 m5 + %define x2 m6 + + ; setup prev index + FLOORPS prev_r, m0 + FLOORPS prev_g, m1 + FLOORPS prev_b, m2 + + ; setup deltas + subps d_r, m0, prev_r + subps d_g, m1, prev_g + subps d_b, m2, prev_b + + ; calculate select mask m9 + movu m6, [pd_tetra_table2] + cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ + BLEND m10, m6, [pd_tetra_table1], m7 + cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ + BLEND m6, m10, [pd_tetra_table0], m7 + + movu m10, [pd_tetra_table5] + cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ + BLEND m9, m10, [pd_tetra_table4], 
m7 + cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ + BLEND m10, m9, [pd_tetra_table3], m7 + + cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ + BLEND m9, m10, m6, m7 + + ; setup next index + addps next_r, prev_r, m14 ; +1 + minps next_r, next_r, m13 ; clamp lutmax + + addps next_g, prev_g, m14 ; +1 + minps next_g, next_g, m13 ; clamp lutmax + + addps next_b, prev_b, m14 ; +1 + minps next_b, next_b, m13 ; clamp lutmax + + ; prescale indices + mulps prev_r, prev_r, lut3dsize2m + mulps next_r, next_r, lut3dsize2m + + mulps prev_g, prev_g, lut3dsizem + mulps next_g, next_g, lut3dsizem + + mulps prev_b, prev_b, [pd_3f] + mulps next_b, next_b, [pd_3f] + + movu m14, [pd_001] + + ; cxxa m10 + ; b + andps m15, m9, m14 + CMP_EQUAL m15, m15, m14 + BLEND m10, prev_b, next_b, m15 + + ; g + andps m15, m9, [pd_010] + CMP_EQUAL m15, m15, [pd_010] + BLEND m12, prev_g, next_g, m15 + + ; r + andps m15, m9, [pd_100] + CMP_EQUAL m15, m15, [pd_100] + BLEND m13, prev_r, next_r, m15 + + ADD3 m10, m10, m12, m13 + + SHIFT_RIGHT 9, 3 ; 3 + + ; cxxb m11; + ; b + andps m15, m9, m14 + CMP_EQUAL m15, m15, m14 + BLEND m11, prev_b, next_b, m15 + + ; g + andps m15, m9, [pd_010] + CMP_EQUAL m15, m15, [pd_010] + BLEND m12, prev_g, next_g, m15 + + ; r + andps m15, m9, [pd_100] + CMP_EQUAL m15, m15, [pd_100] + BLEND m13, prev_r, next_r, m15 + + ADD3 m11, m11, m12, m13 + + ; c000 m12; + ADD3 m12, prev_r, prev_g, prev_b + + ; c111 m13; + ADD3 m13, next_r, next_g, next_b + + SHIFT_RIGHT 9, 3 ; 6 + + ; x0, m4 + andps m15, m9, m14 + CMP_EQUAL m15, m15, m14 + BLEND m7, d_r, d_g, m15 ; r,g + + andps m15, m9, [pd_010] + CMP_EQUAL m15, m15, [pd_010] + BLEND x0, m7, d_b, m15 ; b + + ; x1, m5 + andps m15, m9, [pd_100] + CMP_EQUAL m15, m15, [pd_100] + BLEND m7, d_r, d_g, m15 ; r,g + + SHIFT_RIGHT 9, 3 ; 9 + + andps m15, m9, m14 + CMP_EQUAL m15, m15, m14 + BLEND x1, m7, d_b, m15 ; b + + ; x2, m6 + andps m15, m9, [pd_010] + CMP_EQUAL m15, m15, [pd_010] + BLEND m7, d_r, d_g, m15 ; r,g + + andps m15, m9, [pd_100] + CMP_EQUAL 
m15, m15, [pd_100] + BLEND x2, m7, d_b, m15 ; b + + ; convert indices to integer + cvttps2dq m12, m12 + cvttps2dq m10, m10 + cvttps2dq m11, m11 + cvttps2dq m13, m13 + + ; now the gathering festival + mov tmpq, [ctxq + LUT3DContext.lut] + + GATHER_LUT3D_INDICES 0, 1, 2, 12 + movu m14, [pd_1f] + subps m14, m14, x0; 1 - x0 + + mulps m0, m0, m14 + mulps m1, m1, m14 + mulps m2, m2, m14 + + GATHER_LUT3D_INDICES 7, 8, 9, 10 + subps m14, x0, x1; x0 - x1 + mulps m7, m7, m14 + addps m0, m0, m7 + + mulps m8, m8, m14 + addps m1, m1, m8 + + mulps m9, m9, m14 + addps m2, m2, m9 + + GATHER_LUT3D_INDICES 7, 8, 9, 11 + subps m14, x1, x2; x1 - x2 + + mulps m7, m7, m14 + addps m0, m0, m7 + + mulps m8, m8, m14 + addps m1, m1, m8 + + mulps m9, m9, m14 + addps m2, m2, m9 + + GATHER_LUT3D_INDICES 7, 8, 9, 13 + mulps m7, m7, x2 + addps m0, m0, m7 + + mulps m8, m8, x2 + addps m1, m1, m8 + + mulps m9, m9, x2 + addps m2, m2, m9 +%endmacro + +%macro INIT_DATA_PTR 3 + mov ptrq, [%2 + AVFrame.data + %3 * 8] + mov tmpd, [%2 + AVFrame.linesize + %3 * 4] + imul tmpd, slice_startd + add ptrq, tmpq + mov %1, ptrq +%endmacro + +%macro INC_DATA_PTR 3 + mov tmpd, [%2 + AVFrame.linesize + %3 * 4] + mov ptrq, %1 + add ptrq, tmpq + mov %1, ptrq +%endmacro + +%macro LOAD16 2 + mov ptrq, %2 + %if mmsize > 16 + movu xm%1, [ptrq + xq*2] + %else + movsd xm%1, [ptrq + xq*2] + %endif + %if cpuflag(avx2) + vpmovzxwd m%1, xm%1 + %else + %if mmsize > 16 + pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0) + pshufb xm%1, xm6 ; pb_shuffle16 + pshufb xm4, xm6 ; pb_shuffle16 + vinsertf128 m%1, m%1, xm4, 1 + %else + pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0) + pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) + pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0) + %endif + %endif + cvtdq2ps m%1, m%1 + mulps m%1, m%1, m7 ; pd_65535_invf +%endmacro + +%macro STORE16 2 + mulps m%2, m%2, m5 ; [pd_65535f] + minps m%2, m%2, m5 ; [pd_65535f] + maxps m%2, m%2, m15 ; zero + cvttps2dq m%2, m%2 + %if 
mmsize > 16 + vextractf128 xm4, m%2, 1 + pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16] + pshufb xm4, xm7 ; [pb_hi_pack_shuffle16] + por xm%2, xm4 + %else + pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) + pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0) + pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0) + %endif + mov ptrq, %1 + %if mmsize > 16 + movu [ptrq + xq*2], xm%2 + %else + movsd [ptrq + xq*2], xm%2 + %endif +%endmacro + +; 1 - interp method +; 2 - format_name +; 3 - depth +; 4 - is float format +%macro DEFINE_INTERP_FUNC 4 +cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3 + ; store lut max and lutsize + mov tmpd, dword [ctxq + LUT3DContext.lutsize] + cvtsi2ss xm0, tmpd + mulss xm0, xm0, [pd_3f] + VBROADCASTSS m0, xm0 + mova lut3dsizem, m0 + sub tmpd, 1 + cvtsi2ss xm0, tmpd + VBROADCASTSS m0, xm0 + mova lut3dmaxm, m0 + + ; scale_r + mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4] + VBROADCASTSS m1, xm1 + mova scalerm, m1 + + ; scale_g + mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4] + VBROADCASTSS m1, xm1 + mova scalegm, m1 + + ; scale_b + mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4] + VBROADCASTSS m1, xm1 + mova scalebm, m1 + + ; store lutsize2 + cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2] + mulss xm0, xm0, [pd_3f] + VBROADCASTSS m0, xm0 + mova lut3dsize2m, m0 + + ; init prelut values + cmp prelutq, 0 + je %%skip_init_prelut + mov tmpd, dword [prelutq + Lut3DPreLut.size] + sub tmpd, 1 + cvtsi2ss xm0, tmpd + VBROADCASTSS m0, xm0 + mova prelutmaxm, m0 + VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4] + mova prelutminrm, m0 + VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4] + mova prelutmingm, m0 + VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4] + mova prelutminbm, m0 + VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4] + mova prelutscalerm, m0 + VBROADCASTSS m0, dword 
[prelutq + Lut3DPreLut.scale + 1*4] + mova prelutscalegm, m0 + VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4] + mova prelutscalebm, m0 + %%skip_init_prelut: + + mov widthd, [src_imageq + AVFrame.width] + + ; gbra pixel order + INIT_DATA_PTR srcrm, src_imageq, 2 + INIT_DATA_PTR srcgm, src_imageq, 0 + INIT_DATA_PTR srcbm, src_imageq, 1 + INIT_DATA_PTR srcam, src_imageq, 3 + + INIT_DATA_PTR dstrm, dst_imageq, 2 + INIT_DATA_PTR dstgm, dst_imageq, 0 + INIT_DATA_PTR dstbm, dst_imageq, 1 + INIT_DATA_PTR dstam, dst_imageq, 3 + + %%loop_y: + xor xq, xq + %%loop_x: + movu m14, [pd_1f] + xorps m15, m15, m15 + %if %4 ; float + mov ptrq, srcrm + movu m0, [ptrq + xq*4] + mov ptrq, srcgm + movu m1, [ptrq + xq*4] + mov ptrq, srcbm + movu m2, [ptrq + xq*4] + SANITIZE_F m0 + SANITIZE_F m1 + SANITIZE_F m2 + %else + ; constants for LOAD16 + movu m7, [pd_65535_invf] + %if notcpuflag(avx2) && mmsize >= 32 + movu xm6, [pb_shuffle16] + %endif + LOAD16 0, srcrm + LOAD16 1, srcgm + LOAD16 2, srcbm + %endif + + cmp prelutq, 0 + je %%skip_prelut + mova m13, prelutmaxm + APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm + APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm + APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm + %%skip_prelut: + + mova m13, lut3dmaxm + APPLY_SCALE 0, scalerm + APPLY_SCALE 1, scalegm + APPLY_SCALE 2, scalebm + + interp_%1 + + %if %4 ; float + mov ptrq, dstrm + movu [ptrq + xq*4], m0 + mov ptrq, dstgm + movu [ptrq + xq*4], m1 + mov ptrq, dstbm + movu [ptrq + xq*4], m2 + cmp has_alphad, 0 + je %%skip_alphaf + mov ptrq, srcam + movu m0, [ptrq + xq*4] + mov ptrq, dstam + movu [ptrq + xq*4], m0 + %%skip_alphaf: + %else + ; constants for STORE16 + movu m5, [pd_65535f] + %if mmsize > 16 + movu xm6, [pb_lo_pack_shuffle16] + movu xm7, [pb_hi_pack_shuffle16] + %endif + + xorps m15, m15, m15 + STORE16 dstrm, 0 + STORE16 dstgm, 1 + STORE16 dstbm, 2 + + cmp has_alphad, 0 + je %%skip_alpha + %if mmsize > 16 + mov ptrq, srcam + movu xm0, [ptrq + xq*2] + mov ptrq, dstam + movu 
[ptrq + xq*2], xm0 + %else + mov ptrq, srcam + movsd xm0, [ptrq + xq*2] + mov ptrq, dstam + movsd [ptrq + xq*2], xm0 + %endif + + %%skip_alpha: + %endif + + add xq, mmsize/4 + cmp xd, widthd + jl %%loop_x + + INC_DATA_PTR srcrm, src_imageq, 2 + INC_DATA_PTR srcgm, src_imageq, 0 + INC_DATA_PTR srcbm, src_imageq, 1 + INC_DATA_PTR srcam, src_imageq, 3 + + INC_DATA_PTR dstrm, dst_imageq, 2 + INC_DATA_PTR dstgm, dst_imageq, 0 + INC_DATA_PTR dstbm, dst_imageq, 1 + INC_DATA_PTR dstam, dst_imageq, 3 + + inc slice_startd + cmp slice_startd, slice_endd + jl %%loop_y + + RET +%endmacro +%if ARCH_X86_64 + %if HAVE_AVX2_EXTERNAL + INIT_YMM avx2 + DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 + DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 + %endif + %if HAVE_AVX_EXTERNAL + INIT_YMM avx + DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 + DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 + %endif + INIT_XMM sse2 + DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1 + DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0 +%endif \ No newline at end of file diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c new file mode 100644 index 0000000000..9b9b36e4af --- /dev/null +++ b/libavfilter/x86/vf_lut3d_init.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021 Mark Reid <mindmark@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/lut3d.h" + +#define DEFINE_INTERP_FUNC(name, format, opt) \ +void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int has_alpha); \ +static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \ +{ \ + LUT3DContext *lut3d = ctx->priv; \ + Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL; \ + ThreadData *td = arg; \ + AVFrame *in = td->in; \ + AVFrame *out = td->out; \ + int has_alpha = in->linesize[3] && out != in; \ + int slice_start = (in->height * jobnr ) / nb_jobs; \ + int slice_end = (in->height * (jobnr+1)) / nb_jobs; \ + ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, slice_end, has_alpha); \ + return 0; \ +} + +#if ARCH_X86_64 +#if HAVE_AVX2_EXTERNAL + DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2) + DEFINE_INTERP_FUNC(tetrahedral, p16, avx2) +#endif +#if HAVE_AVX_EXTERNAL + DEFINE_INTERP_FUNC(tetrahedral, pf32, avx) + DEFINE_INTERP_FUNC(tetrahedral, p16, avx) +#endif + DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2) + DEFINE_INTERP_FUNC(tetrahedral, p16, sse2) +#endif + + +av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc) +{ + int cpu_flags = av_get_cpu_flags(); + int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR; + int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT; + int depth = desc->comp[0].depth; + +#if ARCH_X86_64 + if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { +#if HAVE_AVX2_EXTERNAL + if (isfloat && planar) { + s->interp = interp_tetrahedral_pf32_avx2; + } else if (depth 
== 16) { + s->interp = interp_tetrahedral_p16_avx2; + } +#endif + } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { +#if HAVE_AVX_EXTERNAL + if (isfloat) { + s->interp = interp_tetrahedral_pf32_avx; + } else if (depth == 16) { + s->interp = interp_tetrahedral_p16_avx; + } +#endif + } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) { + if (isfloat) { + s->interp = interp_tetrahedral_pf32_sse2; + } else if (depth == 16) { + s->interp = interp_tetrahedral_p16_sse2; + } + } +#endif +}
From: Mark Reid <mindmark@gmail.com> Only supports float and 16-bit planar formats at the moment. Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some speed gains. f32 1920x1080 1 thread with prelut c impl 1389936500 UNITS in lut3d->interp, 1 runs, 0 skips 1425800240 UNITS in lut3d->interp, 2 runs, 0 skips 1433312777 UNITS in lut3d->interp, 4 runs, 0 skips 1443346798 UNITS in lut3d->interp, 8 runs, 0 skips sse2 948662320 UNITS in lut3d->interp, 1 runs, 0 skips 1101247540 UNITS in lut3d->interp, 2 runs, 0 skips 1050645695 UNITS in lut3d->interp, 4 runs, 0 skips 1041102937 UNITS in lut3d->interp, 8 runs, 0 skips avx 633837000 UNITS in lut3d->interp, 1 runs, 0 skips 669452850 UNITS in lut3d->interp, 2 runs, 0 skips 650716580 UNITS in lut3d->interp, 4 runs, 0 skips 644698550 UNITS in lut3d->interp, 8 runs, 0 skips avx2 354940020 UNITS in lut3d->interp, 1 runs, 0 skips 362384340 UNITS in lut3d->interp, 2 runs, 0 skips 356799020 UNITS in lut3d->interp, 4 runs, 0 skips 357276815 UNITS in lut3d->interp, 8 runs, 0 skips gbrap16 1920x1080 1 thread with prelut c impl 1445071160 UNITS in lut3d->interp, 1 runs, 0 skips 1477959120 UNITS in lut3d->interp, 2 runs, 0 skips 1472102670 UNITS in lut3d->interp, 4 runs, 0 skips 1462579330 UNITS in lut3d->interp, 8 runs, 0 skips sse2 1035437580 UNITS in lut3d->interp, 1 runs, 0 skips 1050139710 UNITS in lut3d->interp, 2 runs, 0 skips 1070147205 UNITS in lut3d->interp, 4 runs, 0 skips 1064583037 UNITS in lut3d->interp, 8 runs, 0 skips avx 678089880 UNITS in lut3d->interp, 1 runs, 0 skips 679112485 UNITS in lut3d->interp, 2 runs, 0 skips 695527212 UNITS in lut3d->interp, 4 runs, 0 skips 691300053 UNITS in lut3d->interp, 8 runs, 0 skips avx2 372671340 UNITS in lut3d->interp, 1 runs, 0 skips 373449870 UNITS in lut3d->interp, 2 runs, 0 skips 383725625 UNITS in lut3d->interp, 4 runs, 0 skips 382860848 UNITS in lut3d->interp, 8 runs, 0 skips --- libavfilter/lut3d.h | 83 ++++ libavfilter/vf_lut3d.c | 61 +-- 
libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++ libavfilter/x86/vf_lut3d_init.c | 88 ++++ 5 files changed, 935 insertions(+), 56 deletions(-) create mode 100644 libavfilter/lut3d.h create mode 100644 libavfilter/x86/vf_lut3d.asm create mode 100644 libavfilter/x86/vf_lut3d_init.c -- 2.31.1.windows.1