diff mbox series

[FFmpeg-devel,3/5] x86/vvc_sad: add sse4 versions of all functions

Message ID 20240523122716.2158-3-jamrial@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/5] avcodec/vvc_mc: split the SAD dsp prototype into one function per blocksize width | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer May 23, 2024, 12:27 p.m. UTC
Add SSE4 versions of all the SAD functions, and remove sad_8x8_avx2, as it's not faster than sad_8x8_sse4.

sad_8x8_c: 54.8
sad_8x8_sse4: 14.3
sad_16x16_c: 200.8
sad_16x16_sse4: 34.8
sad_16x16_avx2: 29.8
sad_32x32_c: 826.3
sad_32x32_sse4: 113.8
sad_32x32_avx2: 69.3
sad_64x64_c: 3679.8
sad_64x64_sse4: 392.8
sad_64x64_avx2: 257.3
sad_128x128_c: 12581.3
sad_128x128_sse4: 1560.8
sad_128x128_avx2: 1151.8

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/vvc/vvc_sad.asm   | 53 +++++++++++++++++++++-----------
 libavcodec/x86/vvc/vvcdsp_init.c | 42 +++++++++++++++++--------
 2 files changed, 65 insertions(+), 30 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/x86/vvc/vvc_sad.asm b/libavcodec/x86/vvc/vvc_sad.asm
index 829dbce489..26df25ec66 100644
--- a/libavcodec/x86/vvc/vvc_sad.asm
+++ b/libavcodec/x86/vvc/vvc_sad.asm
@@ -26,7 +26,7 @@ 
 
 SECTION_RODATA
 
-pw_1: times 2 dw 1
+cextern pw_1
 
 ; DMVR SAD is only calculated on even rows to reduce complexity
 SECTION .text
@@ -38,20 +38,21 @@  SECTION .text
 %endmacro
 
 %macro HORIZ_ADD 3  ; xm0, xm1, m1
+%if mmsize == 32
     vextracti128     %1, %3, q0001  ;        3        2      1          0
-    paddd            %1, %2         ; xm0 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
-    pshufd           %2, %1, q0032  ; xm1    -      -     (7 + 3)   (6 + 2)
+    paddd            %2, %1         ; xm1 (7 + 3) (6 + 2) (5 + 1)   (4 + 0)
+%endif
+    pshufd           %1, %2, q0032  ; xm0    -      -     (7 + 3)   (6 + 2)
     paddd            %1, %1, %2     ; xm0    _      _     (5 1 7 3) (4 0 6 2)
     pshufd           %2, %1, q0001  ; xm1    _      _     (5 1 7 3) (5 1 7 3)
     paddd            %1, %1, %2     ;                               (01234567)
 %endmacro
 
-%if ARCH_X86_64
-%if HAVE_AVX2_EXTERNAL
-
-INIT_YMM avx2
-
-cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, row_idx
+%macro VVC_SAD 1
+cglobal vvc_sad_%1, 4, 7, 5, src1, src2, dx, dy, off1, block_h, off2
+%if UNIX64 == 0
+    mov             block_hd, dword r5m
+%endif
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -74,29 +75,32 @@  cglobal vvc_sad_8, 6, 9, 5, src1, src2, dx, dy, block_w, block_h, off1, off2, ro
     lea             src2q, [src2q + off2q * 2 + 2 * 2]
 
     pxor               m3, m3
+%if mmsize == 32
     vpbroadcastd       m4, [pw_1]
+%else
+    mova               m4, [pw_1]
+%endif
 
         .loop_height:
-        movu              xm0, [src1q]
-        vinserti128        m0, m0, [src1q + MAX_PB_SIZE * ROWS * 2], 1
-        movu              xm1, [src2q]
-        vinserti128        m1, m1, [src2q + MAX_PB_SIZE * ROWS * 2], 1
-
+        movu               m0, [src1q]
+        movu               m1, [src2q]
         MIN_MAX_SAD        m1, m0, m2
         pmaddwd            m1, m4
         paddd              m3, m1
 
-        add         src1q, 2 * MAX_PB_SIZE * ROWS * 2
-        add         src2q, 2 * MAX_PB_SIZE * ROWS * 2
+        add         src1q, ROWS * MAX_PB_SIZE * 2
+        add         src2q, ROWS * MAX_PB_SIZE * 2
 
-        sub      block_hd, 4
+        sub      block_hd, 2
         jg   .loop_height
 
         HORIZ_ADD     xm0, xm3, m3
         movd          eax, xm0
     RET
+%endmacro
 
-cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
+%macro VVC_SAD_LOOP 1
+cglobal vvc_sad_%1, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
     movsxdifnidn    dxq, dxd
     movsxdifnidn    dyq, dyd
 
@@ -119,7 +123,11 @@  cglobal vvc_sad_16, 6, 8, 5, src1, src2, dx, dy, block_w, block_h, off1, off2
     lea             src2q, [src2q + off2q * 2 + 2 * 2]
 
     pxor               m3, m3
+%if mmsize == 32
     vpbroadcastd       m4, [pw_1]
+%else
+    mova               m4, [pw_1]
+%endif
 
         shl      block_wd, 1
         add         src1q, block_wq
@@ -149,6 +157,15 @@  DEFINE_ARGS src1, src2, dx, dy, block_w, block_h, row_idx
         HORIZ_ADD     xm0, xm3, m3
         movd          eax, xm0
     RET
+%endmacro
 
+%if ARCH_X86_64
+INIT_XMM sse4
+VVC_SAD 8
+VVC_SAD_LOOP 16
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+VVC_SAD 16
+VVC_SAD_LOOP 32
 %endif
 %endif
diff --git a/libavcodec/x86/vvc/vvcdsp_init.c b/libavcodec/x86/vvc/vvcdsp_init.c
index bd60963432..cdf0e36b62 100644
--- a/libavcodec/x86/vvc/vvcdsp_init.c
+++ b/libavcodec/x86/vvc/vvcdsp_init.c
@@ -316,16 +316,10 @@  ALF_FUNCS(16, 12, avx2)
 int bf(ff_vvc_sad, w, opt)(const int16_t *src0, const int16_t *src1, \
                            int dx, int dy, int block_w, int block_h) \
 
-SAD_PROTOTYPE(8,   avx2);
+SAD_PROTOTYPE(8,   sse4);
+SAD_PROTOTYPE(16,  sse4);
 SAD_PROTOTYPE(16,  avx2);
-
-#define SAD_INIT(opt) do {                   \
-    c->inter.sad[0] = ff_vvc_sad_8_##opt;    \
-    c->inter.sad[1] =                        \
-    c->inter.sad[2] =                        \
-    c->inter.sad[3] =                        \
-    c->inter.sad[4] = ff_vvc_sad_16_##opt;   \
-} while (0)
+SAD_PROTOTYPE(32,  avx2);
 #endif
 
 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
@@ -337,36 +331,60 @@  void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
     case 8:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(8);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(8);
             AVG_INIT(8, avx2);
             MC_LINKS_AVX2(8);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     case 10:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(10);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(10);
             AVG_INIT(10, avx2);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     case 12:
         if (EXTERNAL_SSE4(cpu_flags)) {
             MC_LINK_SSE4(12);
+            c->inter.sad[0] = ff_vvc_sad_8_sse4;
+            c->inter.sad[1] =
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_16_sse4;
         }
         if (EXTERNAL_AVX2_FAST(cpu_flags)) {
             ALF_INIT(12);
             AVG_INIT(12, avx2);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
-            SAD_INIT(avx2);
+            c->inter.sad[1] = ff_vvc_sad_16_avx2;
+            c->inter.sad[2] =
+            c->inter.sad[3] =
+            c->inter.sad[4] = ff_vvc_sad_32_avx2;
         }
         break;
     default: