diff mbox series

[FFmpeg-devel,3/3] avfilter/yadif: add avx2 filter_line function

Message ID 20230210130657.455866-3-jdarnley@obe.tv
State New
Headers show
Series [FFmpeg-devel,1/3] avfilter: move yadif's filter_line init into a dedicated function | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Darnley Feb. 10, 2023, 1:06 p.m. UTC
Zen 2 (Ryzen 7 3700X):
1.73x faster (3603±586.3 vs. 2082±317.1 decicycles) compared with ssse3

Using an SD y4m file, speed increases from ~3600 fps to ~4700 fps.
---
 libavfilter/x86/vf_yadif.asm    | 83 +++++++++++++++++++++++----------
 libavfilter/x86/vf_yadif_init.c |  4 ++
 2 files changed, 62 insertions(+), 25 deletions(-)

Comments

James Darnley Feb. 20, 2023, 12:55 p.m. UTC | #1
On 2/10/23 14:06, James Darnley wrote:
> snip
This patch set is broken.  The checkasm test is incomplete.  This avx2
function has a bug that only manifests when the strides (prefs and mrefs)
have opposite signs (one positive and one negative), which is what happens
in real usage.  I have fixed my checkasm test, and it now shows the bug too.

Consider this patch set retracted until I can fix it.
diff mbox series

Patch

diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 809cebdd3f..571febfca3 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -25,11 +25,30 @@ 
 
 SECTION_RODATA
 
-pb_1: times 16 db 1
-pw_1: times  8 dw 1
+pb_1: times 32 db 1
+pw_1: times 16 dw 1
 
 SECTION .text
 
+%unmacro RSHIFT 2
+
+%macro RSHIFT 2 ; reg, byte count: shift reg right by that many bytes
+%if mmsize == 32
+    vextracti128 xm7, %1, 1 ; xm7 = upper 128 bits of reg (m7 is used as scratch here)
+    palignr xmm %+ %1, xm7, xmm %+ %1, %2 ; shift upper:lower right by the requested count; result lands in the xmm view of reg
+%else
+    psrldq %1, %2
+%endif
+%endmacro
+
+%macro UNPACK 1 ; reg: widen the low packed bytes of reg to words, in place
+%if mmsize == 32
+    pmovzxbw %1, xmm %+ %1 ; zero-extend bytes taken from the xmm view of reg; no zero register needed
+%else
+    punpcklbw %1, m7 ; interleave with m7, which FILTER clears with pxor on this path
+%endif
+%endmacro
+
 %macro CHECK 2
     movu      m2, [curq+t1+%1]
     movu      m3, [curq+t0+%2]
@@ -40,7 +59,7 @@  SECTION .text
     pand      m4, [pb_1]
     psubusb   m5, m4
     RSHIFT    m5, 1
-    punpcklbw m5, m7
+    UNPACK    m5
     mova      m4, m2
     psubusb   m2, m3
     psubusb   m3, m4
@@ -49,9 +68,9 @@  SECTION .text
     mova      m4, m2
     RSHIFT    m3, 1
     RSHIFT    m4, 2
-    punpcklbw m2, m7
-    punpcklbw m3, m7
-    punpcklbw m4, m7
+    UNPACK    m2
+    UNPACK    m3
+    UNPACK    m4
     paddw     m2, m3
     paddw     m2, m4
 %endmacro
@@ -81,13 +100,19 @@  SECTION .text
 %endmacro
 
 %macro LOAD 2
-    movh      %1, %2
-    punpcklbw %1, m7
+    %if mmsize == 32
+        pmovzxbw %1, %2 ; load packed bytes and zero-extend them to words in one instruction
+    %else
+        movh      %1, %2
+        punpcklbw %1, m7 ; widen bytes to words; relies on m7 holding zero
+    %endif
 %endmacro
 
 %macro FILTER 3
 .loop%1:
-    pxor         m7, m7
+    %if mmsize != 32
+        pxor         m7, m7
+    %endif
     LOAD         m0, [curq+t1]
     LOAD         m1, [curq+t0]
     LOAD         m2, [%2]
@@ -95,9 +120,9 @@  SECTION .text
     mova         m4, m3
     paddw        m3, m2
     psraw        m3, 1
-    mova   [rsp+ 0], m0
-    mova   [rsp+16], m3
-    mova   [rsp+32], m1
+    mova   [rsp+0*mmsize], m0
+    mova   [rsp+1*mmsize], m3
+    mova   [rsp+2*mmsize], m1
     psubw        m2, m4
     ABS1         m2, m4
     LOAD         m3, [prevq+t1]
@@ -119,7 +144,7 @@  SECTION .text
     paddw        m3, m4
     psrlw        m3, 1
     pmaxsw       m2, m3
-    mova   [rsp+48], m2
+    mova   [rsp+3*mmsize], m2
 
     paddw        m1, m0
     paddw        m0, m0
@@ -134,9 +159,9 @@  SECTION .text
     psubusb      m3, m4
     pmaxub       m2, m3
     mova         m3, m2
-    psrldq       m3, 2
-    punpcklbw    m2, m7
-    punpcklbw    m3, m7
+    RSHIFT       m3, 2
+    UNPACK       m2
+    UNPACK       m3
     paddw        m0, m2
     paddw        m0, m3
     psubw        m0, [pw_1]
@@ -150,7 +175,7 @@  SECTION .text
     CHECK 1, -3
     CHECK2
 
-    mova         m6, [rsp+48]
+    mova         m6, [rsp+3*mmsize]
     cmp   DWORD r8m, 2
     jge .end%1
     LOAD         m2, [%2+t1*2]
@@ -161,9 +186,9 @@  SECTION .text
     paddw        m3, m5
     psrlw        m2, 1
     psrlw        m3, 1
-    mova         m4, [rsp+ 0]
-    mova         m5, [rsp+16]
-    mova         m7, [rsp+32]
+    mova         m4, [rsp+0*mmsize]
+    mova         m5, [rsp+1*mmsize]
+    mova         m7, [rsp+2*mmsize]
     psubw        m2, m4
     psubw        m3, m7
     mova         m0, m5
@@ -182,15 +207,21 @@  SECTION .text
     pmaxsw       m6, m4
 
 .end%1:
-    mova         m2, [rsp+16]
+    mova         m2, [rsp+1*mmsize]
     mova         m3, m2
     psubw        m2, m6
     paddw        m3, m6
     pmaxsw       m1, m2
     pminsw       m1, m3
-    packuswb     m1, m1
 
-    movh     [dstq], m1
+    %if mmsize == 32
+        vextracti128 xm4, ym1, 1
+        packuswb xm1, xm4
+        movu [dstq], xm1
+    %else
+        packuswb     m1, m1
+        movh     [dstq], m1
+    %endif
     add        dstq, mmsize/2
     add       prevq, mmsize/2
     add        curq, mmsize/2
@@ -201,10 +232,10 @@  SECTION .text
 
 %macro YADIF 0
 %if ARCH_X86_32
-cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
                                         mrefs, parity, mode
 %else
-cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
+cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \
                                         mrefs, parity, mode
 %endif
 %if ARCH_X86_32
@@ -233,3 +264,5 @@  INIT_XMM ssse3
 YADIF
 INIT_XMM sse2
 YADIF
+INIT_YMM avx2
+YADIF
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index d648f0f835..48858dc295 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -29,6 +29,8 @@  void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur,
 void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
                                 void *next, int w, int prefs,
                                 int mrefs, int parity, int mode);
+void ff_yadif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
+        int w, int prefs, int mrefs, int parity, int mode);
 
 void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
                                      void *next, int w, int prefs,
@@ -68,5 +70,7 @@  av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth)
             yadif->filter_line = ff_yadif_filter_line_sse2;
         if (EXTERNAL_SSSE3(cpu_flags))
             yadif->filter_line = ff_yadif_filter_line_ssse3;
+        if (EXTERNAL_AVX2(cpu_flags))
+            yadif->filter_line = ff_yadif_filter_line_avx2;
     }
 }