diff mbox series

[FFmpeg-devel,v2,7/7] aarch64: me_cmp: Improve scheduling in vsse_intra8

Message ID 20221003141020.3564715-8-gjb@semihalf.com
State New
Headers show
Series arm64 neon implementation for 8bits functions | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Grzegorz Bernacki Oct. 3, 2022, 2:10 p.m. UTC
From: Martin Storsjö <martin@martin.st>

Before:       Cortex A53    A72    A73
vsse_5_neon:        74.7   31.5   26.0
After:
vsse_5_neon:        62.7   32.5   25.7
---
 libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 61e4f68335..d8a18cd4b8 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@  function vsse_intra8_neon, export=1
         // x3           ptrdiff_t stride
         // w4           int h
 
+        sub             w4, w4, #1 // we need to make h-1 iterations
         ld1             {v0.8b}, [x1], x3
+        cmp             w4, #3
         movi            v16.4s, #0
 
-        sub             w4, w4, #1 // we need to make h-1 iterations
-        cmp             w4, #3
         b.lt            2f
 
 1:
@@ -1127,13 +1127,13 @@  function vsse_intra8_neon, export=1
         ld1             {v2.8b}, [x1], x3
         uabd            v30.8b, v0.8b, v1.8b
         ld1             {v3.8b}, [x1], x3
-        umull           v29.8h, v30.8b, v30.8b
         uabd            v27.8b, v1.8b, v2.8b
-        uadalp          v16.4s, v29.8h
-        umull           v26.8h, v27.8b, v27.8b
+        umull           v29.8h, v30.8b, v30.8b
         uabd            v25.8b, v2.8b, v3.8b
-        uadalp          v16.4s, v26.8h
+        umull           v26.8h, v27.8b, v27.8b
+        uadalp          v16.4s, v29.8h
         umull           v24.8h, v25.8b, v25.8b
+        uadalp          v16.4s, v26.8h
         sub             w4, w4, #3
         uadalp          v16.4s, v24.8h
         cmp             w4, #3