@@ -29,6 +29,15 @@ pb_128_0 : times 8 db 0, 128
SECTION .text
+%macro DECL_MASK 2 ; %1 alias name to define ; %2 register index
+%if mmsize < 64 ; vectors narrower than 64 bytes: the alias names vector register m<%2>
+ %xdefine %1 m%2
+%else ; mmsize >= 64: the alias names an opmask register k<%2 + 1> instead
+ %assign %%i %2 + 1 ; index shifted by one so %2 = 0 yields k1 (presumably because k0 cannot act as a writemask - confirm)
+ %xdefine %1 k %+ %%i
+%endif
+%endmacro
+
;%1 depth (8 or 16) ; %2 b or w ; %3 constant
%macro THRESHOLD 3
%if ARCH_X86_64
@@ -58,17 +67,24 @@ cglobal threshold%1, 5, 7, 5, in, threshold, min, max, out, w, x
.nextrow:
mov xq, wq
- .loop:
- movu m1, [inq + xq]
- movu m0, [thresholdq + xq]
- movu m2, [minq + xq]
- movu m3, [maxq + xq]
- pxor m0, m4
- pxor m1, m4
- pcmpgt%2 m0, m1
- PBLENDVB m3, m2, m0
- movu [outq + xq], m3
- add xq, mmsize
+.loop:
+ movu m1, [inq + xq]
+ movu m0, [thresholdq + xq]
+ movu m2, [minq + xq]
+ movu m3, [maxq + xq]
+ pxor m0, m4
+ pxor m1, m4
+ DECL_MASK mask, 0
+ pcmpgt%2 mask, m0, m1
+
+%if mmsize == 64
+ vpblendm%2 m3{mask}, m3, m2
+%else
+ PBLENDVB m3, m2, mask
+%endif
+
+ movu [outq + xq], m3
+ add xq, mmsize
jl .loop
add inq, ilinesizeq
@@ -90,3 +106,9 @@ INIT_YMM avx2
THRESHOLD 8, b, pb_128
THRESHOLD 16, w, pb_128_0
%endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+THRESHOLD 8, b, pb_128
+THRESHOLD 16, w, pb_128_0
+%endif
@@ -34,8 +34,10 @@ void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\
THRESHOLD_FUNC(8, sse4)
THRESHOLD_FUNC(8, avx2)
+THRESHOLD_FUNC(8, avx512)
THRESHOLD_FUNC(16, sse4)
THRESHOLD_FUNC(16, avx2)
+THRESHOLD_FUNC(16, avx512)
av_cold void ff_threshold_init_x86(ThresholdContext *s)
{
@@ -48,6 +50,9 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->threshold = ff_threshold8_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ s->threshold = ff_threshold8_avx512;
+ }
} else {
if (EXTERNAL_SSE4(cpu_flags)) {
s->threshold = ff_threshold16_sse4;
@@ -55,5 +60,8 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
s->threshold = ff_threshold16_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ s->threshold = ff_threshold16_avx512;
+ }
}
}
Performance (lower is better):
8-bit:  ff_threshold8_sse4 32.7555351, ff_threshold8_avx2 32.1713562, ff_threshold8_avx512 32.0103531
16-bit: ff_threshold16_sse4 37.7713432, ff_threshold16_avx2 35.3348312, ff_threshold16_avx512 32.6976166
Signed-off-by: Wu Jianhua <jianhua.wu@intel.com>
---
 libavfilter/x86/vf_threshold.asm    | 44 +++++++++++++++++++++--------
 libavfilter/x86/vf_threshold_init.c |  8 ++++++
 2 files changed, 41 insertions(+), 11 deletions(-)