diff mbox

[FFmpeg-devel,01/13] aarch64: vp9: use alternative returns in the core loop filter function

Message ID 1484000119-4959-1-git-send-email-martin@martin.st
State Accepted
Commit d7595de0b25e7064fd9e06dea5d0425536cef6dc
Headers show

Commit Message

Martin Storsjö Jan. 9, 2017, 10:15 p.m. UTC
From: Janne Grunau <janne-libav@jannau.net>

Since aarch64 has enough free general-purpose registers, use them to
branch to the appropriate storage code. 1-2 cycles faster for the
functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
(up to 2 cycles faster/slower) on a cortex-a57.

This is cherrypicked from libav commit
d7595de0b25e7064fd9e06dea5d0425536cef6dc.
---
 libavcodec/aarch64/vp9lpf_neon.S | 48 +++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

Comments

Michael Niedermayer Jan. 11, 2017, 3:52 a.m. UTC | #1
On Tue, Jan 10, 2017 at 12:15:07AM +0200, Martin Storsjö wrote:

fate on qemu-arm (32bit) passes fine with the whole patchset

[...]
Michael Niedermayer Jan. 14, 2017, 8:36 p.m. UTC | #2
On Tue, Jan 10, 2017 at 12:15:07AM +0200, Martin Storsjö wrote:
> From: Janne Grunau <janne-libav@jannau.net>
> 
> Since aarch64 has enough free general-purpose registers, use them to
> branch to the appropriate storage code. 1-2 cycles faster for the
> functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
> (up to 2 cycles faster/slower) on a cortex-a57.
> 
> This is cherrypicked from libav commit
> d7595de0b25e7064fd9e06dea5d0425536cef6dc.
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 48 +++++++++++++++-------------------------
>  1 file changed, 18 insertions(+), 30 deletions(-)

patchset applied

[...]
diff mbox

Patch

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e727a4d..78aae61 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@ 
 .endif
         // If no pixels needed flat8in nor flat8out, jump to a
         // writeout of the inner 4 pixels
-        cbz             x5,  7f
+        cbnz            x5,  1f
+        br              x14
+1:
         mov             x5,  v7.d[0]
 .ifc \sz, .16b
         mov             x6,  v7.d[1]
         orr             x5,  x5,  x6
 .endif
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
-        cbz             x5,  8f
+        cbnz            x5,  1f
+        br              x15
 
+1:
         // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@  endfunc
 
 function vp9_loop_filter_8
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_8_16b_mix
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
-        mov             x5,  #0
         ret
 6:
-        mov             x5,  #6
-        ret
+        br              x13
 9:
         br              x10
 endfunc
 
 function vp9_loop_filter_16
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -589,13 +582,6 @@  endfunc
 
 function vp9_loop_filter_16_16b
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
-        mov             x5,  #0
-        ret
-7:
-        mov             x5,  #7
-        ret
-8:
-        mov             x5,  #8
         ret
 9:
         ldp             d8,  d9,  [sp], 0x10
@@ -614,11 +600,14 @@  endfunc
 .endm
 
 .macro loop_filter_8
+        // calculate alternative 'return' targets
+        adr             x13, 6f
         bl              vp9_loop_filter_8
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_8_16b_mix mix
+        // calculate alternative 'return' targets
+        adr             x13, 6f
 .if \mix == 48
         mov             x11, #0xffffffff00000000
 .elseif \mix == 84
@@ -627,21 +616,20 @@  endfunc
         mov             x11, #0xffffffffffffffff
 .endif
         bl              vp9_loop_filter_8_16b_mix
-        cbnz            x5,  6f
 .endm
 
 .macro loop_filter_16
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm
 
 .macro loop_filter_16_16b
+        // calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
         bl              vp9_loop_filter_16_16b
-        cmp             x5,  7
-        b.gt            8f
-        b.eq            7f
 .endm