diff mbox series

[FFmpeg-devel,v2,2/3] libavfilter/x86/vf_threshold: add ff_threshold8/16_avx512

Message ID 20211008023101.4100-2-jianhua.wu@intel.com
State New
Headers show
Series [FFmpeg-devel,v2,1/3] libavfilter/x86/vf_hflip: add ff_flip_byte/short_avx512() | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

Wu Jianhua Oct. 8, 2021, 2:31 a.m. UTC
Performance (less is better):
8bit:
    ff_threshold8_sse4    32.7555351
    ff_threshold8_avx2    32.1713562
    ff_threshold8_avx512  32.0103531
16bit:
    ff_threshold16_sse4   37.7713432
    ff_threshold16_avx2   35.3348312
    ff_threshold16_avx512 32.6976166

Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_threshold.asm    | 44 +++++++++++++++++++++--------
 libavfilter/x86/vf_threshold_init.c |  8 ++++++
 2 files changed, 41 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm
index 098069b083..dc4126c7af 100644
--- a/libavfilter/x86/vf_threshold.asm
+++ b/libavfilter/x86/vf_threshold.asm
@@ -29,6 +29,15 @@  pb_128_0 : times 8 db 0, 128
 
 SECTION .text
 
+%macro DECL_MASK 2 ; DECL_MASK name, idx: define <name> as an alias for the register that holds a compare mask
+%if mmsize < 64 ; vectors narrower than 64 bytes: <name> aliases vector register m<idx>
+    %xdefine %1 m%2
+%else ; 64-byte vectors: <name> aliases k<idx+1>; the +1 presumably keeps idx 0 from selecting k0 - TODO confirm
+    %assign  %%i %2 + 1
+    %xdefine  %1 k %+ %%i
+%endif
+%endmacro
+
 ;%1 depth (8 or 16) ; %2 b or w ; %3 constant
 %macro THRESHOLD 3
 %if ARCH_X86_64
@@ -58,17 +67,24 @@  cglobal threshold%1, 5, 7, 5, in, threshold, min, max, out, w, x
 .nextrow:
     mov         xq, wq
 
-    .loop:
-        movu            m1, [inq + xq]
-        movu            m0, [thresholdq + xq]
-        movu            m2, [minq + xq]
-        movu            m3, [maxq + xq]
-        pxor            m0, m4
-        pxor            m1, m4
-        pcmpgt%2        m0, m1
-        PBLENDVB        m3, m2, m0
-        movu   [outq + xq], m3
-        add             xq, mmsize
+.loop:
+    movu              m1, [inq + xq]
+    movu              m0, [thresholdq + xq]
+    movu              m2, [minq + xq]
+    movu              m3, [maxq + xq]
+    pxor              m0, m4
+    pxor              m1, m4
+    DECL_MASK       mask, 0
+    pcmpgt%2        mask, m0, m1
+
+%if mmsize == 64
+    vpblendm%2  m3{mask}, m3, m2
+%else
+    PBLENDVB          m3, m2, mask
+%endif
+
+    movu     [outq + xq], m3
+    add               xq, mmsize
     jl .loop
 
     add          inq, ilinesizeq
@@ -90,3 +106,9 @@  INIT_YMM avx2
 THRESHOLD 8, b, pb_128
 THRESHOLD 16, w, pb_128_0
 %endif
+
+%if HAVE_AVX512_EXTERNAL ; emit the AVX-512 variants only when this build-time flag is non-zero
+INIT_ZMM avx512 ; presumably selects 64-byte ZMM vectors (mmsize == 64) and the _avx512 name suffix - TODO confirm against x86inc
+THRESHOLD 8, b, pb_128
+THRESHOLD 16, w, pb_128_0
+%endif ; HAVE_AVX512_EXTERNAL
diff --git a/libavfilter/x86/vf_threshold_init.c b/libavfilter/x86/vf_threshold_init.c
index 8e42296791..0c75ea2870 100644
--- a/libavfilter/x86/vf_threshold_init.c
+++ b/libavfilter/x86/vf_threshold_init.c
@@ -34,8 +34,10 @@  void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\
 
 THRESHOLD_FUNC(8, sse4)
 THRESHOLD_FUNC(8, avx2)
+THRESHOLD_FUNC(8, avx512)
 THRESHOLD_FUNC(16, sse4)
 THRESHOLD_FUNC(16, avx2)
+THRESHOLD_FUNC(16, avx512)
 
 av_cold void ff_threshold_init_x86(ThresholdContext *s)
 {
@@ -48,6 +50,9 @@  av_cold void ff_threshold_init_x86(ThresholdContext *s)
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             s->threshold = ff_threshold8_avx2;
         }
+        if (EXTERNAL_AVX512(cpu_flags)) {
+            s->threshold = ff_threshold8_avx512;
+        }
     } else if (s->depth == 16) {
         if (EXTERNAL_SSE4(cpu_flags)) {
             s->threshold = ff_threshold16_sse4;
@@ -55,5 +60,8 @@  av_cold void ff_threshold_init_x86(ThresholdContext *s)
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             s->threshold = ff_threshold16_avx2;
         }
+        if (EXTERNAL_AVX512(cpu_flags)) {
+            s->threshold = ff_threshold16_avx512;
+        }
     }
 }