diff mbox series

[FFmpeg-devel,v2,1/3] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()

Message ID 20211008023101.4100-1-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] libavfilter/x86/vf_hflip: add ff_hflip_byte/short_avx512()
Related show

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Oct. 8, 2021, 2:30 a.m. UTC
Performance (lower is better):
8bit:
    ff_hflip_byte_ssse3   0.61
    ff_hflip_byte_avx2    0.37
    ff_hflip_byte_avx512  0.19
16bit:
    ff_hflip_short_ssse3  1.27
    ff_hflip_short_avx2   0.76
    ff_hflip_short_avx512 0.40

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
 libavfilter/x86/vf_hflip_init.c |  8 ++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

Comments

Wu Jianhua Nov. 19, 2021, 5:50 a.m. UTC | #1
Ping.
> From: Wu, Jianhua <jianhua.wu@intel.com>
> Sent: Friday, October 8, 2021 10:31 AM
> To: ffmpeg-devel@ffmpeg.org
> Cc: Wu, Jianhua <jianhua.wu@intel.com>
> Subject: [PATCH v2 1/3] libavfilter/x86/vf_hflip: add
> ff_flip_byte/short_avx512()
> 
> Performance(Less is better):
> 8bit:
>     ff_hflip_byte_ssse3   0.61
>     ff_hflip_byte_avx2    0.37
>     ff_hflip_byte_avx512  0.19
> 16bit:
>     ff_hflip_short_ssse3  1.27
>     ff_hflip_short_avx2   0.76
>     ff_hflip_short_avx512 0.40
> 
> Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
> ---
>  libavfilter/x86/vf_hflip.asm    | 23 ++++++++++++++++++-----
>  libavfilter/x86/vf_hflip_init.c |  8 ++++++++
>  2 files changed, 26 insertions(+), 5 deletions(-)
> 

Hi there,

Could anyone review these AVX-512 patches?
In my tests with raw YUV video on an i7-11700K processor, this commit
increases throughput from 800 fps to 1600 fps.

Thanks,
Jianhua
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 285618954f..c2237217f7 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -26,12 +26,16 @@  SECTION_RODATA
 
 pb_flip_byte:  db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
+pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3
 
 SECTION .text
 
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
+%if mmsize == 64
+    movu              m3, [pd_flip_indicies]
+%endif
     VBROADCASTI128    m0, [pb_flip_%1]
     xor               xq, xq
 %if %3 == 1
@@ -47,12 +51,15 @@  cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
-%if mmsize == 32
-        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
-        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%if   mmsize == 64
+        vpermd  m1, m3, [srcq + xq -     mmsize + %3]
+        vpermd  m2, m3, [srcq + xq - 2 * mmsize + %3]
+%elif mmsize == 32
+        vpermq      m1, [srcq + xq -     mmsize + %3], 0x4e; flip each lane at load
+        vpermq      m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
 %else
-        movu    m1, [srcq + xq -     mmsize + %3]
-        movu    m2, [srcq + xq - 2 * mmsize + %3]
+        movu        m1, [srcq + xq -     mmsize + %3]
+        movu        m2, [srcq + xq - 2 * mmsize + %3]
 %endif
         pshufb  m1, m0
         pshufb  m2, m0
@@ -88,3 +95,9 @@  INIT_YMM avx2
 HFLIP byte, b, 1
 HFLIP short, w, 2
 %endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 0ac399b0d4..25fc40f7b0 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -25,8 +25,10 @@ 
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -41,6 +43,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_byte_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx512;
+            }
         } else if (step[i] == 2) {
             if (EXTERNAL_SSSE3(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_ssse3;
@@ -48,6 +53,9 @@  av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
             if (EXTERNAL_AVX2_FAST(cpu_flags)) {
                 s->flip_line[i] = ff_hflip_short_avx2;
             }
+            if (EXTERNAL_AVX512(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx512;
+            }
         }
     }
 }