diff mbox series

[FFmpeg-devel,v2,1/3] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()

Message ID 20211008023101.4100-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu, Jianhua Oct. 8, 2021, 2:30 a.m. UTC
Performance (lower is better):
8bit:
    ff_hflip_byte_ssse3   0.61
    ff_hflip_byte_avx2    0.37
    ff_hflip_byte_avx512  0.19
16bit:
    ff_hflip_short_ssse3  1.27
    ff_hflip_short_avx2   0.76
    ff_hflip_short_avx512 0.40

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
 libavfilter/x86/vf_hflip_init.c |  8 ++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 285618954f..c2237217f7 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -26,12 +26,16 @@  SECTION_RODATA
 
 pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
 
 SECTION .text
 
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+%if mmsize == 64
+    movu              m3, [pd_flip_indicies]
+%endif
     VBROADCASTI128    m0, [pb_flip_%1]
     xor               xq, xq
 %if %3 == 1
@@ -47,12 +51,15 @@  cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
-%if mmsize == 32
-        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
-        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%if   mmsize == 64
+        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
+        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
+%elif mmsize == 32
+        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
+        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
 %else
-        movu    m1, [srcq + xq -     mmsize + %3]
-        movu    m2, [srcq + xq - 2 * mmsize + %3]
+        movu        m1, [srcq + xq -     mmsize + %3]
+        movu        m2, [srcq + xq - 2 * mmsize + %3]
 %endif
         pshufb  m1, m0
         pshufb  m2, m0
@@ -88,3 +95,9 @@  INIT_YMM avx2
 HFLIP byte, b, 1
 HFLIP short, w, 2
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 0ac399b0d4..25fc40f7b0 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -25,8 +25,10 @@ 
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -41,6 +43,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_byte_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx512;
+            }
         } else if (step[i] == 2) {
             if (EXTERNAL_SSSE3(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_ssse3;
@@ -48,6 +53,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx512;
+            }
         }
     }
 }