diff mbox

[FFmpeg-devel,27/34] aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1

Message ID 1488967274-8143-27-git-send-email-martin@martin.st
State Accepted
Commit 3bf9c48320f25f3d5557485b0202f22ae60748b0
Headers show

Commit Message

Martin Storsjö March 8, 2017, 10:01 a.m. UTC
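This is one cycle faster in total, and uses three fewer instructions.

The new sequence replicates the low 16 bits of each parameter into every
halfword (dup), swaps the two bytes within each halfword into a second
register (rev16), and then gathers the even-numbered bytes of both
registers (uzp1). This gives the same result as before: the low byte of
the parameter in the lower 8 lanes and the next byte in the upper 8 lanes.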

Before:
vp9_loop_filter_mix2_v_44_16_neon: 123.2
After:
vp9_loop_filter_mix2_v_44_16_neon: 122.2

This is cherry-picked from libav commit
3bf9c48320f25f3d5557485b0202f22ae60748b0.
---
 libavcodec/aarch64/vp9lpf_neon.S | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)
diff mbox

Patch

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index a9eea7f..0878763 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -162,18 +162,15 @@ 
         dup             v2\sz,  w3        // I
         dup             v3\sz,  w4        // H
 .else
-        dup             v0.8b,  w2        // E
-        dup             v2.8b,  w3        // I
-        dup             v3.8b,  w4        // H
-        lsr             w5,     w2,  #8
-        lsr             w6,     w3,  #8
-        lsr             w7,     w4,  #8
-        dup             v1.8b,  w5        // E
-        dup             v4.8b,  w6        // I
-        dup             v5.8b,  w7        // H
-        trn1            v0.2d,  v0.2d,  v1.2d
-        trn1            v2.2d,  v2.2d,  v4.2d
-        trn1            v3.2d,  v3.2d,  v5.2d
+        dup             v0.8h,  w2        // E
+        dup             v2.8h,  w3        // I
+        dup             v3.8h,  w4        // H
+        rev16           v1.16b, v0.16b    // E
+        rev16           v4.16b, v2.16b    // I
+        rev16           v5.16b, v3.16b    // H
+        uzp1            v0.16b, v0.16b, v1.16b
+        uzp1            v2.16b, v2.16b, v4.16b
+        uzp1            v3.16b, v3.16b, v5.16b
 .endif
 
         uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)