diff mbox

[FFmpeg-devel,01/14] arm: vp9itxfm: Template the quarter/half idct32 function

Message ID 1489702219-12643-1-git-send-email-martin@martin.st
State Accepted
Commit 98ee855ae0cc118bd1d20921d6bdb14731832462
Headers show

Commit Message

Martin Storsjö March 16, 2017, 10:10 p.m. UTC
This reduces the number of lines and the amount of duplication.

Also simplify the eob check for the half case.

If we are in the half case, we know we will need to do at least the
first three slices; we only need to check eob for the fourth one,
so we can hardcode the value to check against instead of loading it
from the min_eob array.

Since at most one slice can be skipped in the first pass, we can
completely unroll the loop for filling zeros, as was done for
the quarter case before.

This allows skipping loading the min_eob pointer when using the
quarter/half cases.

This is cherry-picked from libav commit
98ee855ae0cc118bd1d20921d6bdb14731832462.
---
 libavcodec/arm/vp9itxfm_neon.S | 57 +++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 37 deletions(-)

Comments

Michael Niedermayer March 18, 2017, 11:25 p.m. UTC | #1
On Fri, Mar 17, 2017 at 12:10:06AM +0200, Martin Storsjö wrote:
> This reduces the number of lines and reduces the duplication.
> 
> Also simplify the eob check for the half case.
> 
> If we are in the half case, we know we at least will need to do the
> first three slices, we only need to check eob for the fourth one,
> so we can hardcode the value to check against instead of loading
> from the min_eob array.
> 
> Since at most one slice can be skipped in the first pass, we can
> unroll the loop for filling zeros completely, as it was done for
> the quarter case before.
> 
> This allows skipping loading the min_eob pointer when using the
> quarter/half cases.
> 
> This is cherrypicked from libav commit
> 98ee855ae0cc118bd1d20921d6bdb14731832462.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 57 +++++++++++++++---------------------------
>  1 file changed, 20 insertions(+), 37 deletions(-)

The patchset seems to pass FATE under QEMU (ARM).

thx

[...]
diff mbox

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index ebbbda9..adc9896 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -1575,7 +1575,6 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
         beq             idct32x32_dc_add_neon
         push            {r4-r8,lr}
         vpush           {q4-q6}
-        movrel          r8,  min_eob_idct_idct_32 + 2
 
         @ Align the stack, allocate a temp buffer
 T       mov             r7,  sp
@@ -1597,6 +1596,8 @@  A       and             r7,  sp,  #15
         cmp             r3,  #135
         ble             idct32x32_half_add_neon
 
+        movrel          r8,  min_eob_idct_idct_32 + 2
+
 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
         add             r0,  sp,  #(\i*64)
 .if \i > 0
@@ -1634,72 +1635,54 @@  A       and             r7,  sp,  #15
         pop             {r4-r8,pc}
 endfunc
 
-function idct32x32_quarter_add_neon
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
 .irp i, 0, 4
         add             r0,  sp,  #(\i*64)
+.ifc \size,quarter
 .if \i == 4
         cmp             r3,  #9
         ble             1f
 .endif
+.endif
         add             r2,  r6,  #(\i*2)
-        bl              idct32_1d_4x32_pass1_quarter_neon
-.endr
-        b               3f
-
-1:
-        @ Write zeros to the temp buffer for pass 2
-        vmov.i16        q14, #0
-        vmov.i16        q15, #0
-.rept 8
-        vst1.16         {q14-q15}, [r0,:128]!
-.endr
-3:
-.irp i, 0, 4, 8, 12, 16, 20, 24, 28
-        add             r0,  r4,  #(\i)
-        mov             r1,  r5
-        add             r2,  sp,  #(\i*2)
-        bl              idct32_1d_4x32_pass2_quarter_neon
+        bl              idct32_1d_4x32_pass1_\size\()_neon
 .endr
 
-        add             sp,  sp,  r7
-        vpop            {q4-q6}
-        pop             {r4-r8,pc}
-endfunc
-
-function idct32x32_half_add_neon
-.irp i, 0, 4, 8, 12
+.ifc \size,half
+.irp i, 8, 12
         add             r0,  sp,  #(\i*64)
-.if \i > 0
-        ldrh_post       r1,  r8,  #2
-        cmp             r3,  r1
-        it              le
-        movle           r1,  #(16 - \i)/2
+.if \i == 12
+        cmp             r3,  #70
         ble             1f
 .endif
         add             r2,  r6,  #(\i*2)
-        bl              idct32_1d_4x32_pass1_half_neon
+        bl              idct32_1d_4x32_pass1_\size\()_neon
 .endr
+.endif
         b               3f
 
 1:
         @ Write zeros to the temp buffer for pass 2
         vmov.i16        q14, #0
         vmov.i16        q15, #0
-2:
-        subs            r1,  r1,  #1
-.rept 4
+.rept 8
         vst1.16         {q14-q15}, [r0,:128]!
 .endr
-        bne             2b
+
 3:
 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
         add             r0,  r4,  #(\i)
         mov             r1,  r5
         add             r2,  sp,  #(\i*2)
-        bl              idct32_1d_4x32_pass2_half_neon
+        bl              idct32_1d_4x32_pass2_\size\()_neon
 .endr
 
         add             sp,  sp,  r7
         vpop            {q4-q6}
         pop             {r4-r8,pc}
 endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half