diff mbox series

[FFmpeg-devel,v4,4/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx512

Message ID 20210930085023.58812-4-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v4,1/4] libswscale/x86/rgb2rgb: add shuffle_bytes avx2 | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Sept. 30, 2021, 8:50 a.m. UTC
Accelerating uyvytoyuv422 with AVX512 makes it faster than the existing AVX2 version.

Performance data (less is better):
    uyvytoyuv422_avx2      0.27309
    uyvytoyuv422_avx512    0.16229

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libswscale/x86/rgb2rgb.c     |  6 ++++++
 libswscale/x86/rgb_2_rgb.asm | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)
diff mbox series

Patch

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index a965a1755c..c59136a352 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -167,6 +167,9 @@  void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                            const uint8_t *src, int width, int height,
+                            int lumStride, int chromStride, int srcStride);
 #endif
 
 av_cold void rgb2rgb_init_x86(void)
@@ -222,5 +225,8 @@  av_cold void rgb2rgb_init_x86(void)
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx2;
     }
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        uyvytoyuv422 = ff_uyvytoyuv422_avx512;
+    }
 #endif
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 1777a99faf..d55d2ca07f 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -31,8 +31,10 @@  pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
 pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
 pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+pd_permd512_uv: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
 pb_shuffle_low: db 1, 3, 5, 7, 9, 11, 13, 15
 pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
+pq_permq512_yy: dq 0, 2, 4, 6, 1, 3, 5, 7
 
 SECTION .text
 
@@ -194,7 +196,11 @@  SHUFFLE_BYTES 3, 2, 1, 0
 %macro UYVY_TO_YUV422 0
 cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
     pxor         m0, m0
+%if mmsize == 64
+    vpternlogd   m1, m1, m1, 0xff
+%else
     pcmpeqw      m1, m1
+%endif
     psrlw        m1, 8
 
     movsxdifnidn            wq, wd
@@ -213,7 +219,12 @@  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
 
 %if mmsize > 16
     vpbroadcastq    m13, [pb_shuffle_low]
+%if mmsize == 32
     movu            m15, [pd_permd256_uv]
+%else
+    movu            m14, [pq_permq512_yy]
+    movu            m15, [pd_permd512_uv]
+%endif
 %endif
 
 .loop_line:
@@ -271,6 +282,7 @@  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         pshufb         m7, m3, m13
         punpcklqdq     m6, m6, m7
         VPERM   q, 32, m6, m6, 0xd8
+        VPERM   q, 64, m6, m14, m6
 %endif
         movu [ydstq + wq], m6
 
@@ -287,6 +299,7 @@  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         pshufb         m7, m5, m13
         punpcklqdq     m6, m6, m7
         VPERM   q, 32, m6, m6, 0xd8
+        VPERM   q, 64, m6, m14, m6
 %endif
         movu [ydstq + wq + mmsize], m6
 
@@ -305,6 +318,7 @@  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb     m6, m7 ; UUUU
 
         VPERM d, 32, m6, m15, m6
+        VPERM d, 64, m6, m15, m6
 
         movu   [udstq + whalfq], m6
 
@@ -314,6 +328,7 @@  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb     m2, m4 ; VVVV
 
         VPERM d, 32, m2, m15, m2
+        VPERM d, 64, m2, m15, m2
 
         movu   [vdstq + whalfq], m2
 
@@ -350,4 +365,9 @@  UYVY_TO_YUV422
 INIT_YMM avx2
 UYVY_TO_YUV422
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+UYVY_TO_YUV422
+%endif
 %endif