diff mbox series

[FFmpeg-devel,1/2] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512

Message ID 20230906142442.1634320-1-alankelly@google.com
State New
Headers show
Series [FFmpeg-devel,1/2] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512 | expand

Commit Message

Alan Kelly Sept. 6, 2023, 2:24 p.m. UTC
---
 libswscale/x86/swscale.c    |  7 +++++++
 libswscale/x86/yuv2yuvX.asm | 19 ++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

Comments

Kieran Kunhya Sept. 6, 2023, 3:30 p.m. UTC | #1
On Wed, 6 Sept 2023 at 15:24, Alan Kelly via ffmpeg-devel <
ffmpeg-devel@ffmpeg.org> wrote:

> ---
>  libswscale/x86/swscale.c    |  7 +++++++
>  libswscale/x86/yuv2yuvX.asm | 19 ++++++++++++++++++-
>  2 files changed, 25 insertions(+), 1 deletion(-)
>

Could you include benchmarks below the main commit message please?
Otherwise ok.

Kieran
diff mbox series

Patch

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index ff16398988..00e42b4bec 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -225,6 +225,9 @@  YUV2YUVX_FUNC(sse3, 32)
 #if HAVE_AVX2_EXTERNAL
 YUV2YUVX_FUNC(avx2, 64)
 #endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+YUV2YUVX_FUNC(avx512, 128)
+#endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
@@ -467,6 +470,10 @@  av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 #if HAVE_AVX2_EXTERNAL
         if (EXTERNAL_AVX2_FAST(cpu_flags))
             c->yuv2planeX = yuv2yuvX_avx2;
+#endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+        if (EXTERNAL_AVX512ICL(cpu_flags))
+            c->yuv2planeX = yuv2yuvX_avx512;
 #endif
     }
 #if ARCH_X86_32 && !HAVE_ALIGNED_STACK
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 369c850674..57bfa09d66 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -22,6 +22,10 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA 64
+
+permutation: dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -50,6 +54,10 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %else
     movq                 xm3, [ditherq]
 %endif ; avx2
+
+%if cpuflag(avx512)
+    mova                 m15, [permutation]
+%endif
     cmp                  offsetd, 0
     jz                   .offset
 
@@ -109,7 +117,10 @@  cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
     packuswb             m6, m6, m1
 %endif
     mov                  srcq, [filterq]
-%if cpuflag(avx2)
+%if cpuflag(avx512)
+    vpermt2q             m3, m15, m3
+    vpermt2q             m6, m15, m6
+%elif cpuflag(avx2)
     vpermq               m3, m3, 216
     vpermq               m6, m6, 216
 %endif
@@ -131,4 +142,10 @@  YUV2YUVX_FUNC
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 YUV2YUVX_FUNC
+%if HAVE_AVX512_EXTERNAL
+%if ARCH_X86_64
+INIT_ZMM avx512
+YUV2YUVX_FUNC
+%endif
+%endif
 %endif