diff mbox series

[FFmpeg-devel,1/3] x86/ac3dsp: reduce instruction count inside the float_to_fixed24 loop

Message ID 20231122194913.9856-1-jamrial@gmail.com
State Accepted
Commit d8b1a34433ecf0c2c9fb50754e98954f5ab67d4a
Headers show
Series [FFmpeg-devel,1/3] x86/ac3dsp: reduce instruction count inside the float_to_fixed24 loop | expand

Checks

Context Check Description
yinshiyou/make_loongarch64 success Make finished
yinshiyou/make_fate_loongarch64 success Make fate finished
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

James Almer Nov. 22, 2023, 7:49 p.m. UTC
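[Editor's note, not part of the original message: before the loop, the patch converts len to a byte count (shl by 2), advances src and dst by that amount, and negates len. Inside the loop, both buffers are then addressed as [ptr+lenq+offset], so each iteration needs a single "add lenq, N" in place of the former two pointer adds and one "sub lenq", and the loop branch changes from ja to jl accordingly.]

Signed-off-by: James Almer <jamrial@gmail.com>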
---
 libavcodec/x86/ac3dsp.asm | 46 +++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 23 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index a95d359d95..42c8310462 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -77,16 +77,20 @@  AC3_EXPONENT_MIN
 INIT_XMM sse2
 cglobal float_to_fixed24, 3, 3, 9, dst, src, len
     movaps     m0, [pf_1_24]
+    shl      lenq, 2
+    add      srcq, lenq
+    add      dstq, lenq
+    neg      lenq
 .loop:
-    movaps     m1, [srcq    ]
-    movaps     m2, [srcq+16 ]
-    movaps     m3, [srcq+32 ]
-    movaps     m4, [srcq+48 ]
+    movaps     m1, [srcq+lenq    ]
+    movaps     m2, [srcq+lenq+16 ]
+    movaps     m3, [srcq+lenq+32 ]
+    movaps     m4, [srcq+lenq+48 ]
 %ifdef m8
-    movaps     m5, [srcq+64 ]
-    movaps     m6, [srcq+80 ]
-    movaps     m7, [srcq+96 ]
-    movaps     m8, [srcq+112]
+    movaps     m5, [srcq+lenq+64 ]
+    movaps     m6, [srcq+lenq+80 ]
+    movaps     m7, [srcq+lenq+96 ]
+    movaps     m8, [srcq+lenq+112]
 %endif
     mulps      m1, m0
     mulps      m2, m0
@@ -108,24 +112,20 @@  cglobal float_to_fixed24, 3, 3, 9, dst, src, len
     cvtps2dq   m7, m7
     cvtps2dq   m8, m8
 %endif
-    movdqa  [dstq    ], m1
-    movdqa  [dstq+16 ], m2
-    movdqa  [dstq+32 ], m3
-    movdqa  [dstq+48 ], m4
+    movdqa  [dstq+lenq    ], m1
+    movdqa  [dstq+lenq+16 ], m2
+    movdqa  [dstq+lenq+32 ], m3
+    movdqa  [dstq+lenq+48 ], m4
 %ifdef m8
-    movdqa  [dstq+64 ], m5
-    movdqa  [dstq+80 ], m6
-    movdqa  [dstq+96 ], m7
-    movdqa  [dstq+112], m8
-    add      srcq, 128
-    add      dstq, 128
-    sub      lenq, 32
+    movdqa  [dstq+lenq+64 ], m5
+    movdqa  [dstq+lenq+80 ], m6
+    movdqa  [dstq+lenq+96 ], m7
+    movdqa  [dstq+lenq+112], m8
+    add      lenq, 128
 %else
-    add      srcq, 64
-    add      dstq, 64
-    sub      lenq, 16
+    add      lenq, 64
 %endif
-    ja .loop
+    jl .loop
     RET
 
 ;------------------------------------------------------------------------------