diff mbox

[FFmpeg-devel,23/34] aarch64: vp9lpf: Interleave the start of flat8in into the calculation above

Message ID 1488967274-8143-23-git-send-email-martin@martin.st
State Accepted
Commit b0806088d3b27044145b20421da8d39089ae0c6a
Headers show

Commit Message

Martin Storsjö March 8, 2017, 10:01 a.m. UTC
This adds lots of extra .ifs, but speeds it up by a couple cycles,
by avoiding stalls.

This is cherrypicked from libav commit
 libavcodec/aarch64/vp9lpf_neon.S | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)
diff mbox


diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 7fe2c88..cd3e26c 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -338,20 +338,28 @@ 
         uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
         uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
+.if \wd >= 8
+        mov             x5,  v6.d[0]
+.ifc \sz, .16b
+        mov             x6,  v6.d[1]
         saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
         ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
         sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
         sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
+.if \wd >= 8
+.ifc \sz, .16b
+        adds            x5,  x5,  x6
         bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
         bit             v25\sz, v2\sz,  v5\sz
         // If no pixels need flat8in, jump to flat8out
         // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-        mov             x5,  v6.d[0]
 .ifc \sz, .16b
-        mov             x6,  v6.d[1]
-        adds            x5,  x5,  x6
         b.eq            6f
         cbz             x5,  6f