diff mbox series

[FFmpeg-devel,v4,2/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx512

Message ID 20210930085023.58812-2-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v4,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2 | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Sept. 30, 2021, 8:50 a.m. UTC
Performance data (lower is better):
    shuffle_bytes_avx2      0.94288
    shuffle_bytes_avx512    0.60049

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libswscale/x86/rgb2rgb.c     | 13 +++++++++++++
 libswscale/x86/rgb_2_rgb.asm |  8 ++++++++
 2 files changed, 21 insertions(+)

Comments

Wu Jianhua Nov. 19, 2021, 5:53 a.m. UTC | #1
Ping for patches 2-4 of this series.
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Thursday, September 30, 2021 4:50 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH v4 2/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx512
> 
> Performance data(Less is better):
>     shuffle_bytes_avx2      0.94288
>     shuffle_bytes_avx512    0.60049
> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 13 +++++++++++++
>  libswscale/x86/rgb_2_rgb.asm |  8 ++++++++
>  2 files changed, 21 insertions(+)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 0ab139aca4..c9ff33ab77 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -152,6 +152,12 @@ void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> 
> +void ff_shuffle_bytes_2103_avx512(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_avx512(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_avx512(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_avx512(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_avx512(const uint8_t *src, uint8_t *dst, int src_size);
> +
>  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                            const uint8_t *src, int width, int height,
>                            int lumStride, int chromStride, int srcStride);
> @@ -200,6 +206,13 @@ av_cold void rgb2rgb_init_x86(void)
>          shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
>          shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
>      }
> +    if (EXTERNAL_AVX512(cpu_flags)) {
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512;
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512;
> +    }
>      if (EXTERNAL_AVX(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx;
>      }
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index c695c61d5c..3380a1272c 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -168,6 +168,14 @@ SHUFFLE_BYTES 1, 2, 3, 0
>  SHUFFLE_BYTES 3, 0, 1, 2
>  SHUFFLE_BYTES 3, 2, 1, 0
>  %endif
> +%if HAVE_AVX512_EXTERNAL
> +INIT_ZMM avx512
> +SHUFFLE_BYTES 2, 1, 0, 3
> +SHUFFLE_BYTES 0, 3, 2, 1
> +SHUFFLE_BYTES 1, 2, 3, 0
> +SHUFFLE_BYTES 3, 0, 1, 2
> +SHUFFLE_BYTES 3, 2, 1, 0
> +%endif
>  %endif
> 
>  ;--------------------------------------------------------------------------------------------
> ---
> --
> 2.17.1
diff mbox series

Patch

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 0ab139aca4..c9ff33ab77 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -152,6 +152,12 @@  void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
 
+void ff_shuffle_bytes_2103_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_avx512(const uint8_t *src, uint8_t *dst, int src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
@@ -200,6 +206,13 @@  av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
         shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
     }
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512;
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512;
+    }
     if (EXTERNAL_AVX(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx;
     }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index c695c61d5c..3380a1272c 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -168,6 +168,14 @@  SHUFFLE_BYTES 1, 2, 3, 0
 SHUFFLE_BYTES 3, 0, 1, 2
 SHUFFLE_BYTES 3, 2, 1, 0
 %endif
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+%endif
 %endif
 
 ;-----------------------------------------------------------------------------------------------