diff mbox series

[FFmpeg-devel,v2,5/5] avfilter/bwdif: add avx2 filter_line function

Message ID 20230320164925.299207-5-jdarnley@obe.tv
State Accepted
Commit 073ec3b9da01f80fe5e9853fb2df9ce088fd5d0a
Headers show
Series [FFmpeg-devel,v2,1/5] avfilter/bwdif: move filter_line init to a dedicated function | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 fail Make fate failed

Commit Message

James Darnley March 20, 2023, 4:49 p.m. UTC
8-bit:
2.24x faster than ssse3 (ssse3: 1925±1.3 vs. avx2: 859±2.2 decicycles)
10-bit:
2.00x faster than ssse3 (ssse3: 1703±1.7 vs. avx2: 853±2.0 decicycles)
---
Changes in v2: fixed the word broadcast.

 libavfilter/x86/vf_bwdif.asm    | 29 ++++++++++++++++++++++++-----
 libavfilter/x86/vf_bwdif_init.c | 12 ++++++++++++
 2 files changed, 36 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
index 0b453da53b..c93b41ec48 100644
--- a/libavfilter/x86/vf_bwdif.asm
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -26,18 +26,22 @@ 
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pw_coefhf:  times 4 dw  1016, 5570
-pw_coefhf1: times 8 dw -3801
-pw_coefsp:  times 4 dw  5077, -981
-pw_splfdif: times 4 dw  -768,  768
+pw_coefhf:  times 8 dw  1016, 5570
+pw_coefhf1: times 16 dw -3801
+pw_coefsp:  times 8 dw  5077, -981
+pw_splfdif: times 8 dw  -768,  768
 
 SECTION .text
 
 %macro LOAD8 2
+    %if mmsize == 32
+        pmovzxbw %1, %2
+    %else
     movh         %1, %2
     punpcklbw    %1, m7
+    %endif
 %endmacro
 
 %macro LOAD12 2
@@ -45,8 +49,14 @@  SECTION .text
 %endmacro
 
 %macro DISP8 0
+    %if mmsize == 32
+        vextracti128  xm1,    m2, 1
+        packuswb      xm2,   xm1
+        movu         [dstq], xm2
+    %else
     packuswb     m2, m2
     movh     [dstq], m2
+    %endif
 %endmacro
 
 %macro DISP12 0
@@ -244,8 +254,12 @@  cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
                                               prefs, mrefs, prefs2, mrefs2, \
                                               prefs3, mrefs3, prefs4, \
                                               mrefs4, parity, clip_max
+    %if mmsize == 32
+        vpbroadcastw m12, WORD clip_maxm
+    %else
     movd        m12, DWORD clip_maxm
     SPLATW      m12, m12, 0
+    %endif
 %else
 cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                               prefs, mrefs, prefs2, mrefs2, \
@@ -264,3 +278,8 @@  INIT_XMM ssse3
 BWDIF
 INIT_XMM sse2
 BWDIF
+
+%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
+INIT_YMM avx2
+BWDIF
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
index ba7bc40c3d..f833318c10 100644
--- a/libavfilter/x86/vf_bwdif_init.c
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -32,6 +32,10 @@  void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
                                 int w, int prefs, int mrefs, int prefs2,
                                 int mrefs2, int prefs3, int mrefs3, int prefs4,
                                 int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+                               int w, int prefs, int mrefs, int prefs2,
+                               int mrefs2, int prefs3, int mrefs3, int prefs4,
+                               int mrefs4, int parity, int clip_max);
 
 void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
                                      int w, int prefs, int mrefs, int prefs2,
@@ -41,6 +45,10 @@  void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
                                       int w, int prefs, int mrefs, int prefs2,
                                       int mrefs2, int prefs3, int mrefs3, int prefs4,
                                       int mrefs4, int parity, int clip_max);
+void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
+                                     int w, int prefs, int mrefs, int prefs2,
+                                     int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                     int mrefs4, int parity, int clip_max);
 
 av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
 {
@@ -51,10 +59,14 @@  av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
             bwdif->filter_line = ff_bwdif_filter_line_sse2;
         if (EXTERNAL_SSSE3(cpu_flags))
             bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+        if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_avx2;
     } else if (bit_depth <= 12) {
         if (EXTERNAL_SSE2(cpu_flags))
             bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
         if (EXTERNAL_SSSE3(cpu_flags))
             bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+        if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
     }
 }