diff mbox series

[FFmpeg-devel,v3,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2

Message ID 20210930084355.76628-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v3,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2 | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Sept. 30, 2021, 8:43 a.m. UTC
Performance data (lower is better):
    shuffle_bytes_ssse3   3.64654
    shuffle_bytes_avx2    0.94288

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libswscale/x86/rgb2rgb.c     | 17 +++++++++++++++--
 libswscale/x86/rgb_2_rgb.asm | 11 +++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

Comments

Wu Jianhua Oct. 14, 2021, 7:19 a.m. UTC | #1
Ping.

> -----Original Message-----
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Thursday, September 30, 2021 4:44 PM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH v3 1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2
> 
> Performance data(Less is better):
>     shuffle_bytes_ssse3   3.64654
>     shuffle_bytes_avx2    0.94288
> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 17 +++++++++++++++--
>  libswscale/x86/rgb_2_rgb.asm | 11 +++++++++++
>  2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index c38a953277..0ab139aca4 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -146,6 +146,12 @@ void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
>  void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> 
>  #if ARCH_X86_64
> +void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
> +
>  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                            const uint8_t *src, int width, int height,
>                            int lumStride, int chromStride, int srcStride);
> @@ -186,9 +192,16 @@ av_cold void rgb2rgb_init_x86(void)
>          shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
>          shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
>      }
> -    if (EXTERNAL_AVX(cpu_flags)) {
>  #if ARCH_X86_64
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
> +    }
> +    if (EXTERNAL_AVX(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx;
> -#endif
>      }
> +#endif
>  }
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 29b856e281..c695c61d5c 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -159,6 +159,17 @@ SHUFFLE_BYTES 1, 2, 3, 0
>  SHUFFLE_BYTES 3, 0, 1, 2
>  SHUFFLE_BYTES 3, 2, 1, 0
> 
> +%if ARCH_X86_64
> +%if HAVE_AVX2_EXTERNAL
> +INIT_YMM avx2
> +SHUFFLE_BYTES 2, 1, 0, 3
> +SHUFFLE_BYTES 0, 3, 2, 1
> +SHUFFLE_BYTES 1, 2, 3, 0
> +SHUFFLE_BYTES 3, 0, 1, 2
> +SHUFFLE_BYTES 3, 2, 1, 0
> +%endif
> +%endif
> +
>  ;--------------------------------------------------------------------------------------------
> ---
>  ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  ;              const uint8_t *src, int width, int height,
> --
> 2.17.1

Hi there,

Any update?

Thanks,
Jianhua
Michael Niedermayer Oct. 15, 2021, 1:18 p.m. UTC | #2
On Thu, Oct 14, 2021 at 07:19:37AM +0000, Wu, Jianhua wrote:
> Ping.
> 
> > -----Original Message-----
> > From: Wu, Jianhua <jianhua.wu@intel.com>
> > Sent: Thursday, September 30, 2021 4:44 PM
> > To: ffmpeg-devel@ffmpeg.org
> > Cc: Wu, Jianhua <jianhua.wu@intel.com>
> > Subject: [PATCH v3 1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2
> > 
> > Performance data(Less is better):
> >     shuffle_bytes_ssse3   3.64654
> >     shuffle_bytes_avx2    0.94288
> > 
> > Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> > ---
> >  libswscale/x86/rgb2rgb.c     | 17 +++++++++++++++--
> >  libswscale/x86/rgb_2_rgb.asm | 11 +++++++++++
> >  2 files changed, 26 insertions(+), 2 deletions(-)
> > 
> > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index
> > c38a953277..0ab139aca4 100644
> > --- a/libswscale/x86/rgb2rgb.c
> > +++ b/libswscale/x86/rgb2rgb.c
> > @@ -146,6 +146,12 @@ void ff_shuffle_bytes_3012_ssse3(const uint8_t *src,
> > uint8_t *dst, int src_size)  void ff_shuffle_bytes_3210_ssse3(const uint8_t
> > *src, uint8_t *dst, int src_size);
> > 
> >  #if ARCH_X86_64
> > +void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int
> > +src_size); void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t
> > +*dst, int src_size); void ff_shuffle_bytes_1230_avx2(const uint8_t
> > +*src, uint8_t *dst, int src_size); void
> > +ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int
> > +src_size); void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t
> > +*dst, int src_size);
> > +
> >  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> >                            const uint8_t *src, int width, int height,
> >                            int lumStride, int chromStride, int srcStride); @@ -186,9 +192,16
> > @@ av_cold void rgb2rgb_init_x86(void)
> >          shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
> >          shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
> >      }
> > -    if (EXTERNAL_AVX(cpu_flags)) {
> >  #if ARCH_X86_64
> > +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> > +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
> > +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
> > +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
> > +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
> > +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
> > +    }
> > +    if (EXTERNAL_AVX(cpu_flags)) {
> >          uyvytoyuv422 = ff_uyvytoyuv422_avx; -#endif
> >      }
> > +#endif
> >  }
> > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> > index 29b856e281..c695c61d5c 100644
> > --- a/libswscale/x86/rgb_2_rgb.asm
> > +++ b/libswscale/x86/rgb_2_rgb.asm
> > @@ -159,6 +159,17 @@ SHUFFLE_BYTES 1, 2, 3, 0  SHUFFLE_BYTES 3, 0, 1, 2
> > SHUFFLE_BYTES 3, 2, 1, 0
> > 
> > +%if ARCH_X86_64
> > +%if HAVE_AVX2_EXTERNAL
> > +INIT_YMM avx2
> > +SHUFFLE_BYTES 2, 1, 0, 3
> > +SHUFFLE_BYTES 0, 3, 2, 1
> > +SHUFFLE_BYTES 1, 2, 3, 0
> > +SHUFFLE_BYTES 3, 0, 1, 2
> > +SHUFFLE_BYTES 3, 2, 1, 0
> > +%endif
> > +%endif
> > +
> >  ;--------------------------------------------------------------------------------------------
> > ---
> >  ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> >  ;              const uint8_t *src, int width, int height,
> > --
> > 2.17.1
> 
> Hi there,
> 
> Any update?

You posted a v4 but "ping" v3?
Is that intended?

[...]
diff mbox series

Patch

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index c38a953277..0ab139aca4 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -146,6 +146,12 @@  void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 
 #if ARCH_X86_64
+void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
@@ -186,9 +192,16 @@  av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
         shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
     }
-    if (EXTERNAL_AVX(cpu_flags)) {
 #if ARCH_X86_64
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx;
-#endif
     }
+#endif
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 29b856e281..c695c61d5c 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -159,6 +159,17 @@  SHUFFLE_BYTES 1, 2, 3, 0
 SHUFFLE_BYTES 3, 0, 1, 2
 SHUFFLE_BYTES 3, 2, 1, 0
 
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+%endif
+%endif
+
 ;-----------------------------------------------------------------------------------------------
 ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 ;              const uint8_t *src, int width, int height,