diff mbox series

[FFmpeg-devel,v2,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2

Message ID 20210930015612.29608-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Sept. 30, 2021, 1:56 a.m. UTC
Performance data (lower is better):
    shuffle_bytes_ssse3   3.64654
    shuffle_bytes_avx2    0.94288

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libswscale/x86/rgb2rgb.c     | 17 +++++++++++++++--
 libswscale/x86/rgb_2_rgb.asm | 11 +++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

Comments

Paul B Mahol Oct. 14, 2021, 7:55 a.m. UTC | #1
Looks very trivial, so will apply shortly.
diff mbox series

Patch

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index c38a953277..0ab139aca4 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -146,6 +146,12 @@  void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size)
 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 
 #if ARCH_X86_64
+void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
@@ -186,9 +192,16 @@  av_cold void rgb2rgb_init_x86(void)
         shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
         shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
     }
-    if (EXTERNAL_AVX(cpu_flags)) {
 #if ARCH_X86_64
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx;
-#endif
     }
+#endif
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 29b856e281..c695c61d5c 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -159,6 +159,17 @@  SHUFFLE_BYTES 1, 2, 3, 0
 SHUFFLE_BYTES 3, 0, 1, 2
 SHUFFLE_BYTES 3, 2, 1, 0
 
+%if ARCH_X86_64
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+%endif
+%endif
+
 ;-----------------------------------------------------------------------------------------------
 ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 ;              const uint8_t *src, int width, int height,