[FFmpeg-devel] lavc/x86/videodsp: Drop MMX usage

Message ID 20241112191344.132426-1-post@frankplowman.com
State New
Series [FFmpeg-devel] lavc/x86/videodsp: Drop MMX usage

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished

Commit Message

Frank Plowman Nov. 12, 2024, 7:10 p.m. UTC
Remove the MMX versions of these functions and modify the SSE
implementations to avoid using MMX registers.

Signed-off-by: Frank Plowman <post@frankplowman.com>
---
This wasn't wholly straightforward, as the existing SSE implementation did
not use SSE alone but rather a blend of SSE and MMX.  I would appreciate
some review, as I am not particularly familiar with x86 assembly.  Of
particular interest are the changes to {READ,WRITE}_NUM_BYTES.  The new
implementation does not use the XMM registers as economically as the
old one did, but it still does not exhaust them -- was this more of a
concern when this was originally written?
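
For reviewers less familiar with these macros, the copy strategy they
implement is roughly the following C sketch (copy_n_bytes is a
hypothetical name for illustration, not code from this patch; the real
macros are expanded per byte count at assembly time, so every branch
below disappears in the generated code):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative C analogue of {READ,WRITE}_NUM_BYTES: copy a
     * compile-time-constant number of bytes n using the widest loads
     * available, handling remainders with smaller -- and, where
     * possible, overlapping -- accesses instead of a byte loop. */
    static void copy_n_bytes(uint8_t *dst, const uint8_t *src, int n)
    {
        int off = 0;

        /* whole 16-byte vectors (movu xmmN in the macro) */
        for (; off + 16 <= n; off += 16)
            memcpy(dst + off, src + off, 16);

        if (n - off >= 8) {
            if (n > 16 && n - off > 8) {
                /* 9-15 bytes left: one overlapping 16-byte access
                 * ending exactly at n, instead of movq+movd+mov */
                memcpy(dst + n - 16, src + n - 16, 16);
                off = n;
            } else {
                memcpy(dst + off, src + off, 8);  /* movq xmmN */
                off += 8;
            }
        }

        if (n - off >= 4) {
            if (n > 8 && n - off > 4) {
                /* 5-7 bytes left: overlapping 8-byte access at n-8 */
                memcpy(dst + n - 8, src + n - 8, 8);
                off = n;
            } else {
                memcpy(dst + off, src + off, 4);  /* movd xmmN */
                off += 4;
            }
        }

        if (n - off >= 1) {
            if (n >= 4) {
                /* overlapping 4-byte access ending at n */
                memcpy(dst + n - 4, src + n - 4, 4);
            } else {
                /* n < 4: last 1-3 bytes go through a GPR
                 * (valb/valw/vald in the macro) */
                memcpy(dst + off, src + off, n - off);
            }
        }
    }

Since each expansion is for one fixed n, every ff_emu_edge_{v,h}fix%d
variant is straight-line code, which is why videodsp_init.c carries
tables of 22 (vertical) and 11 (horizontal) entry points.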
---
 libavcodec/x86/videodsp.asm    | 63 ++++++----------------
 libavcodec/x86/videodsp_init.c | 99 +++++++++++++++++-----------------
 2 files changed, 65 insertions(+), 97 deletions(-)
  

Comments

Ronald S. Bultje Nov. 12, 2024, 7:27 p.m. UTC | #1
Hi,

On Tue, Nov 12, 2024 at 2:14 PM Frank Plowman <post@frankplowman.com> wrote:

> Remove the MMX versions of these functions and modify the SSE
> implementations to avoid using MMX registers.
>
> Signed-off-by: Frank Plowman <post@frankplowman.com>
> ---
> This wasn't wholly straightforward, as the existing SSE implementation did
> not use SSE alone but rather a blend of SSE and MMX.  I would appreciate
> some review, as I am not particularly familiar with x86 assembly.  Of
> particular interest are the changes to {READ,WRITE}_NUM_BYTES.  The new
> implementation does not use the XMM registers as economically as the
> old one did, but it still does not exhaust them -- was this more of a
> concern when this was originally written?
>

I wrote the original SIMD code (IIRC), and back then MMX usage was
acceptable. Nowadays, it carries performance penalties and is
deprecated, so the baseline assumptions are just very different.

Your changes look reasonable, thanks for working on this. OK from me.

Ronald
  

Patch

diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 3cc07878d3..20dcd3e2b9 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -123,54 +123,43 @@  hvar_fn
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
 ;         - if (%2 & 4)  fills 4 bytes into xmm$next
 ;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
-; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
-;         - if (%2 & 4)  fills 4 bytes into mm$next
-;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
 ; writing data out is in the same way
 %macro READ_NUM_BYTES 2
 %assign %%off 0     ; offset in source buffer
-%assign %%mmx_idx 0 ; mmx register index
 %assign %%xmm_idx 0 ; xmm register index
 
 %rep %2/mmsize
-%if mmsize == 16
     movu   xmm %+ %%xmm_idx, [srcq+%%off]
 %assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
-    movu    mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
-%endif
 %assign %%off %%off+mmsize
 %endrep ; %2/mmsize
 
-%if mmsize == 16
 %if (%2-%%off) >= 8
 %if %2 > 16 && (%2-%%off) > 8
     movu   xmm %+ %%xmm_idx, [srcq+%2-16]
 %assign %%xmm_idx %%xmm_idx+1
 %assign %%off %2
 %else
-    movq    mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
+    movq   xmm %+ %%xmm_idx, [srcq+%%off]
+%assign %%xmm_idx %%xmm_idx+1
 %assign %%off %%off+8
 %endif
 %endif ; (%2-%%off) >= 8
-%endif
 
 %if (%2-%%off) >= 4
 %if %2 > 8 && (%2-%%off) > 4
-    movq    mm %+ %%mmx_idx, [srcq+%2-8]
+    movq   xmm %+ %%xmm_idx, [srcq+%2-8]
 %assign %%off %2
 %else
-    movd    mm %+ %%mmx_idx, [srcq+%%off]
+    movd   xmm %+ %%xmm_idx, [srcq+%%off]
 %assign %%off %%off+4
 %endif
-%assign %%mmx_idx %%mmx_idx+1
+%assign %%xmm_idx %%xmm_idx+1
 %endif ; (%2-%%off) >= 4
 
 %if (%2-%%off) >= 1
 %if %2 >= 4
-    movd mm %+ %%mmx_idx, [srcq+%2-4]
+    movd xmm %+ %%xmm_idx, [srcq+%2-4]
 %elif (%2-%%off) == 1
     mov            valb, [srcq+%2-1]
 %elif (%2-%%off) == 2
@@ -185,48 +174,40 @@  hvar_fn
 
 %macro WRITE_NUM_BYTES 2
 %assign %%off 0     ; offset in destination buffer
-%assign %%mmx_idx 0 ; mmx register index
 %assign %%xmm_idx 0 ; xmm register index
 
 %rep %2/mmsize
-%if mmsize == 16
     movu   [dstq+%%off], xmm %+ %%xmm_idx
 %assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
-    movu   [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
-%endif
 %assign %%off %%off+mmsize
 %endrep ; %2/mmsize
 
-%if mmsize == 16
 %if (%2-%%off) >= 8
 %if %2 > 16 && (%2-%%off) > 8
     movu   [dstq+%2-16], xmm %+ %%xmm_idx
 %assign %%xmm_idx %%xmm_idx+1
 %assign %%off %2
 %else
-    movq   [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
+    movq   [dstq+%%off], xmm %+ %%xmm_idx
+%assign %%xmm_idx %%xmm_idx+1
 %assign %%off %%off+8
 %endif
 %endif ; (%2-%%off) >= 8
-%endif
 
 %if (%2-%%off) >= 4
 %if %2 > 8 && (%2-%%off) > 4
-    movq    [dstq+%2-8], mm %+ %%mmx_idx
+    movq    [dstq+%2-8], xmm %+ %%xmm_idx
 %assign %%off %2
 %else
-    movd   [dstq+%%off], mm %+ %%mmx_idx
+    movd   [dstq+%%off], xmm %+ %%xmm_idx
 %assign %%off %%off+4
 %endif
-%assign %%mmx_idx %%mmx_idx+1
+%assign %%xmm_idx %%xmm_idx+1
 %endif ; (%2-%%off) >= 4
 
 %if (%2-%%off) >= 1
 %if %2 >= 4
-    movd    [dstq+%2-4], mm %+ %%mmx_idx
+    movd    [dstq+%2-4], xmm %+ %%xmm_idx
 %elif (%2-%%off) == 1
     mov     [dstq+%2-1], valb
 %elif (%2-%%off) == 2
@@ -318,11 +299,8 @@  cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
 %endrep ; 1+%2-%1
 %endmacro ; VERTICAL_EXTEND
 
-INIT_MMX mmx
-VERTICAL_EXTEND 1, 15
-
-INIT_XMM sse
-VERTICAL_EXTEND 16, 22
+INIT_XMM sse2
+VERTICAL_EXTEND 1, 22
 
 ; left/right (horizontal) fast extend functions
 ; these are essentially identical to the vertical extend ones above,
@@ -337,11 +315,7 @@  VERTICAL_EXTEND 16, 22
     imul           vald, 0x01010101
 %if %1 >= 8
     movd             m0, vald
-%if mmsize == 16
     pshufd           m0, m0, q0000
-%else
-    punpckldq        m0, m0
-%endif ; mmsize == 16
 %endif ; %1 > 16
 %endif ; avx2
 %endmacro ; READ_V_PIXEL
@@ -356,7 +330,6 @@  VERTICAL_EXTEND 16, 22
 %assign %%off %%off+mmsize
 %endrep ; %1/mmsize
 
-%if mmsize == 16
 %if %1-%%off >= 8
 %if %1 > 16 && %1-%%off > 8
     movu     [%2+%1-16], m0
@@ -366,7 +339,6 @@  VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
@@ -415,18 +387,15 @@  cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
 %endrep ; 1+(%2-%1)/2
 %endmacro ; H_EXTEND
 
-INIT_MMX mmx
-H_EXTEND 2, 14
-
 INIT_XMM sse2
-H_EXTEND 16, 22
+H_EXTEND 2, 22
 
 %if HAVE_AVX2_EXTERNAL
 INIT_XMM avx2
 H_EXTEND 8, 22
 %endif
 
-INIT_MMX mmxext
+INIT_XMM sse2
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
     prefetcht0 [bufq]
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index ae9db95624..376eec9c23 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -37,37 +37,37 @@  typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
-extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func * const vfixtbl_sse[22] = {
-    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
-    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
-    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
-    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
-    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
-    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
-    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
-    ff_emu_edge_vfix22_sse
+extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2;
+static emu_edge_vfix_func * const vfixtbl_sse2[22] = {
+    ff_emu_edge_vfix1_sse2,  ff_emu_edge_vfix2_sse2,  ff_emu_edge_vfix3_sse2,
+    ff_emu_edge_vfix4_sse2,  ff_emu_edge_vfix5_sse2,  ff_emu_edge_vfix6_sse2,
+    ff_emu_edge_vfix7_sse2,  ff_emu_edge_vfix8_sse2,  ff_emu_edge_vfix9_sse2,
+    ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, ff_emu_edge_vfix12_sse2,
+    ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, ff_emu_edge_vfix15_sse2,
+    ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, ff_emu_edge_vfix18_sse2,
+    ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, ff_emu_edge_vfix21_sse2,
+    ff_emu_edge_vfix22_sse2
 };
 extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
 
@@ -76,21 +76,21 @@  typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
 typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                 x86_reg start_x, x86_reg n_words, x86_reg bh);
 
-extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
 static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
-    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
-    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
-    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
+    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
+    ff_emu_edge_hfix8_sse2,  ff_emu_edge_hfix10_sse2, ff_emu_edge_hfix12_sse2,
+    ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
@@ -104,7 +104,7 @@  extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
 static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
-    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
+    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
     ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
     ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
     ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
@@ -196,7 +196,7 @@  static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int h)
 {
     emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
-                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
 
@@ -209,24 +209,23 @@  static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
                                               int h)
 {
     emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
-                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
                      hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
 }
 #endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_X86ASM */
 
-void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h);
+void ff_prefetch_sse2(const uint8_t *buf, ptrdiff_t stride, int h);
 
 av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
 {
 #if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        ctx->prefetch = ff_prefetch_mmxext;
-    }
-    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
-        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        ctx->prefetch = ff_prefetch_sse2;
+        if (bpc <= 8)
+            ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
 #if HAVE_AVX2_EXTERNAL
     if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {