diff mbox series

[FFmpeg-devel,v2,2/7] aarch64: me_cmp: Improve scheduling in ff_pix_abs8_y2_neon

Message ID 20221003141020.3564715-3-gjb@semihalf.com
State New
Headers show
Series arm64 neon implementation for 8bits functions | expand

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished

Commit Message

Grzegorz Bernacki Oct. 3, 2022, 2:10 p.m. UTC
From: Martin Storsjö <martin@martin.st>

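Issue each load from x1 next to the matching load from x2 instead of
ahead of time, load the last row from x2 directly into v1 instead of
copying it from v23 with a separate mov, and move the third uabal
further away from the urhadd that produces its input.

                    Cortex A53    A72    A73
Before:
pix_abs_1_2_neon:         73.7   31.0   25.7
After:
pix_abs_1_2_neon:         61.7   30.2   24.7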

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)
diff mbox series

Patch

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 43e068bb7f..3662419edf 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@  function ff_pix_abs8_y2_neon, export=1
 1:
         ld1             {v2.8b}, [x2], x3
         ld1             {v0.8b}, [x1], x3
-        ld1             {v6.8b}, [x1], x3
         urhadd          v30.8b, v1.8b, v2.8b
         ld1             {v5.8b}, [x2], x3
-        ld1             {v21.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
         uabal           v26.8h, v0.8b, v30.8b
         urhadd          v29.8b, v2.8b, v5.8b
         ld1             {v20.8b}, [x2], x3
-        ld1             {v24.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
         uabal           v26.8h, v6.8b, v29.8b
         urhadd          v28.8b, v5.8b, v20.8b
-        uabal           v26.8h, v21.8b, v28.8b
-        ld1             {v23.8b}, [x2], x3
-        mov             v1.8b, v23.8b
+        ld1             {v1.8b},  [x2], x3
+        ld1             {v24.8b}, [x1], x3
+        urhadd          v27.8b, v20.8b, v1.8b
         sub             w4, w4, #4
-        urhadd          v27.8b, v20.8b, v23.8b
+        uabal           v26.8h, v21.8b, v28.8b
         cmp             w4, #4
         uabal           v26.8h, v24.8b, v27.8b