
[FFmpeg-devel,v2,6/6] lavc/aarch64: clean-up sao band 8x8 function formatting

Message ID 20211117045614.55251-6-jdek@itanimul.li
State New
Series [FFmpeg-devel,v2,1/6] lavc/arm: dont assign hevc_qpel functions for non-multiple of 8 widths

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_ppc success Make finished
andriy/make_fate_ppc success Make fate finished

Commit Message

J. Dekker Nov. 17, 2021, 4:56 a.m. UTC
Signed-off-by: J. Dekker <jdek@itanimul.li>
---
 libavcodec/aarch64/hevcdsp_sao_neon.S | 195 ++++++++++++--------------
 1 file changed, 90 insertions(+), 105 deletions(-)

 Now matches the 9,25 indentation like other ASM.

Comments

Martin Storsjö Nov. 18, 2021, 8:51 a.m. UTC | #1
On Wed, 17 Nov 2021, J. Dekker wrote:

> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
> libavcodec/aarch64/hevcdsp_sao_neon.S | 195 ++++++++++++--------------
> 1 file changed, 90 insertions(+), 105 deletions(-)
>
> Now matches the 9,25 indentation like other ASM.

No it doesn't.

This changes the indentation of the preexisting function, which was almost 
right, into an even worse form. It still doesn't left-adjust the operand 
columns as agreed upon before.

It doesn't fix the function added in patch 3/6 (and it shouldn't - you 
should fix that issue in that patch). It does fix the code added in 
patch 4/6, but that fix should be squashed into patch 4/6.

Ideally you'd reorder this patch to be first in the set, so you first fix the 
indentation of the preexisting code to be more consistent (keeping the 
instruction column alignment, left-adjusting the operand columns), then 
add more functions that all have consistent alignment from that point on.

// Martin
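
[Editor's note: to make the requested layout concrete, here is a sketch of
two lines from this patch in the form being asked for, assuming "9,25" means
the mnemonic starting in column 9 and the first operand in column 25 (as in
the preexisting edge filter further down), with the operand columns
left-adjusted rather than padded:

        add             w10, w8, w5                // k + sao_left_class
        and             w10, w10, #0x1F

The hunk as posted indents these by seven columns and keeps the right-aligned
operand padding ("w10,  w8,  w5"), which is the inconsistency the review
points at.]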

Patch

diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index 82b234aa47..3ca34705db 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -3,7 +3,7 @@ 
  *
  * AArch64 NEON optimised SAO functions for HEVC decoding
  *
- * Copyright (c) 2020 Josh Dekker <josh@itanimul.li>
+ * Copyright (c) 2020-2021  J. Dekker <jdek@itanimul.li>
  *
  * This file is part of FFmpeg.
  *
@@ -29,64 +29,49 @@ 
 //                      int16_t *sao_offset_val, int sao_left_class,
 //                      int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
-        sub             sp,  sp, #64
-        stp            xzr, xzr, [sp]
-        stp            xzr, xzr, [sp, #16]
-        stp            xzr, xzr, [sp, #32]
-        stp            xzr, xzr, [sp, #48]
-        mov             w8,  #4
-        sxtw            x6,  w6
-0:
-        ldrsh           x9, [x4,  x8, lsl #1] // x9 = sao_offset_val[k+1]
-        subs            w8,  w8,  #1
-        add            w10,  w8,  w5 // x10 = k + sao_left_class
-        and            w10, w10, #0x1F
-        strh            w9, [sp, x10, lsl #1]
-        bne             0b
-        ld1            {v16.16b-v19.16b}, [sp], #64
-        movi           v20.8h,   #1
-        sub             x2,  x2, x6 // stride_dst - width
-        sub             x3,  x3, x6 // stride_src - width
-1:      // beginning of line
-        mov             x8,  x6
-2:
-        // Simple layout for accessing 16bit values
-        // with 8bit LUT.
-        //
-        //   00  01  02  03  04  05  06  07
-        // +----------------------------------->
-        // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
-        // +----------------------------------->
-        //    i-0     i-1     i-2     i-3
-        // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-        ld1            {v2.8b}, [x1], #8
-        // load src[x]
-        uxtl            v0.8h,  v2.8b
-        // >> shift
-        ushr            v2.8h,  v0.8h, #3 // BIT_DEPTH - 3
-        // x2 (access lower short)
-        shl             v1.8h,  v2.8h, #1 // low (x2, accessing short)
-        // +1 access upper short
-        add             v3.8h,  v1.8h, v20.8h
-        // shift insert index to upper byte
-        sli             v1.8h,  v3.8h, #8
-        // table
-        tbx            v2.16b, {v16.16b-v19.16b}, v1.16b
-        // src[x] + table
-        add             v1.8h,  v0.8h, v2.8h
-        // clip + narrow
-        sqxtun          v4.8b,  v1.8h
-        // store
-        st1            {v4.8b}, [x0], #8
-        // done 8 pixels
-        subs            w8, w8,  #8
-        bne             2b
-        // finished line
-        subs            w7, w7,  #1
-        add             x0, x0,  x2 // dst += stride_dst
-        add             x1, x1,  x3 // src += stride_src
-        bne             1b
-        ret
+       sub             sp,  sp, #64
+       stp            xzr, xzr, [sp]
+       stp            xzr, xzr, [sp, #16]
+       stp            xzr, xzr, [sp, #32]
+       stp            xzr, xzr, [sp, #48]
+       mov             w8,  #4
+       sxtw            x6,  w6
+0:     ldrsh           x9, [x4,  x8, lsl #1]      // sao_offset_val[k+1]
+       subs            w8,  w8,  #1
+       add            w10,  w8,  w5               // k + sao_left_class
+       and            w10, w10, #0x1F
+       strh            w9, [sp, x10, lsl #1]
+       bne             0b
+       ld1            {v16.16b-v19.16b}, [sp], #64
+       movi           v20.8h,   #1
+       sub             x2,  x2, x6                // stride_dst - width
+       sub             x3,  x3, x6                // stride_src - width
+1:     mov             x8,  x6                    // beginning of line
+2:     // Simple layout for accessing 16bit values
+       // with 8bit LUT.
+       //
+       //   00  01  02  03  04  05  06  07
+       // +----------------------------------->
+       // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
+       // +----------------------------------->
+       //    i-0     i-1     i-2     i-3
+       ld1            {v2.8b}, [x1], #8           // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+       uxtl            v0.8h,  v2.8b              // load src[x]
+       ushr            v2.8h,  v0.8h, #3          // >> BIT_DEPTH - 3
+       shl             v1.8h,  v2.8h, #1          // low (x2, accessing short)
+       add             v3.8h,  v1.8h, v20.8h      // +1 access upper short
+       sli             v1.8h,  v3.8h, #8          // shift insert index to upper byte
+       tbx            v2.16b, {v16.16b-v19.16b}, v1.16b // table
+       add             v1.8h,  v0.8h, v2.8h       // src[x] + table
+       sqxtun          v4.8b,  v1.8h              // clip + narrow
+       st1            {v4.8b}, [x0], #8           // store
+       subs            w8, w8,  #8                // done 8 pixels
+       bne             2b
+       subs            w7, w7,  #1                // finished line, prep. new
+       add             x0, x0,  x2                // dst += stride_dst
+       add             x1, x1,  x3                // src += stride_src
+       bne             1b
+       ret
 endfunc
 
 // ASSUMES STRIDE_SRC = 192
@@ -157,50 +142,50 @@  endfunc
 // ff_hevc_sao_edge_filter_8x8_8_neon(char *dst, char *src, ptrdiff stride_dst,
 //                                    int16 *sao_offset_val, int eo, int width, int height)
 function ff_hevc_sao_edge_filter_8x8_8_neon, export=1
-       adr             x7, .Lsao_edge_pos
-       ldr             w4, [x7, w4, uxtw #2]
-       ld1             {v3.8h}, [x3]
-       mov             v3.h[7], v3.h[0]
-       mov             v3.h[0], v3.h[1]
-       mov             v3.h[1], v3.h[2]
-       mov             v3.h[2], v3.h[7]
-       uzp2            v1.16b, v3.16b, v3.16b
-       uzp1            v0.16b, v3.16b, v3.16b
-       movi            v2.16b, #2
-       add             x16, x0, x2
-       lsl             x2, x2, #1
-       mov             x15, #192
-       mov              x8, x1
-       sub              x9, x1, x4
-       add             x10, x1, x4
-       mov             x17, #4
-1:     ld1             {v3.d}[0], [ x8], x15
-       ld1             {v4.d}[0], [ x9], x15
-       ld1             {v5.d}[0], [x10], x15
-       ld1             {v3.d}[1], [ x8], x15
-       ld1             {v4.d}[1], [ x9], x15
-       ld1             {v5.d}[1], [x10], x15
-       cmhi            v16.16b, v4.16b, v3.16b
-       cmhi            v17.16b, v3.16b, v4.16b
-       cmhi            v18.16b, v5.16b, v3.16b
-       cmhi            v19.16b, v3.16b, v5.16b
-       sub             v20.16b, v16.16b, v17.16b
-       sub             v21.16b, v18.16b, v19.16b
-       add             v20.16b, v20.16b, v21.16b
-       add             v20.16b, v20.16b, v2.16b
-       tbl             v16.16b, {v0.16b}, v20.16b
-       tbl             v17.16b, {v1.16b}, v20.16b
-       uxtl            v20.8h, v3.8b
-       uxtl2           v21.8h, v3.16b
-       zip1            v18.16b, v16.16b, v17.16b
-       zip2            v19.16b, v16.16b, v17.16b
-       sqadd           v20.8h, v18.8h, v20.8h
-       sqadd           v21.8h, v19.8h, v21.8h
-       sqxtun          v6.8b, v20.8h
-       sqxtun          v7.8b, v21.8h
-       st1             {v6.8b}, [ x0], x2
-       st1             {v7.8b}, [x16], x2
-       subs            x17, x17, #1
-       b.ne            1b
-       ret
+        adr             x7, .Lsao_edge_pos
+        ldr             w4, [x7, w4, uxtw #2]
+        ld1             {v3.8h}, [x3]
+        mov             v3.h[7], v3.h[0]
+        mov             v3.h[0], v3.h[1]
+        mov             v3.h[1], v3.h[2]
+        mov             v3.h[2], v3.h[7]
+        uzp2            v1.16b, v3.16b, v3.16b
+        uzp1            v0.16b, v3.16b, v3.16b
+        movi            v2.16b, #2
+        add             x16, x0, x2
+        lsl             x2,  x2, #1
+        mov             x15, #192
+        mov             x8,  x1
+        sub             x9,  x1, x4
+        add             x10, x1, x4
+        mov             x17, #4
+1:      ld1             {v3.d}[0], [ x8], x15
+        ld1             {v4.d}[0], [ x9], x15
+        ld1             {v5.d}[0], [x10], x15
+        ld1             {v3.d}[1], [ x8], x15
+        ld1             {v4.d}[1], [ x9], x15
+        ld1             {v5.d}[1], [x10], x15
+        cmhi            v16.16b, v4.16b, v3.16b
+        cmhi            v17.16b, v3.16b, v4.16b
+        cmhi            v18.16b, v5.16b, v3.16b
+        cmhi            v19.16b, v3.16b, v5.16b
+        sub             v20.16b, v16.16b, v17.16b
+        sub             v21.16b, v18.16b, v19.16b
+        add             v20.16b, v20.16b, v21.16b
+        add             v20.16b, v20.16b, v2.16b
+        tbl             v16.16b, {v0.16b}, v20.16b
+        tbl             v17.16b, {v1.16b}, v20.16b
+        uxtl            v20.8h, v3.8b
+        uxtl2           v21.8h, v3.16b
+        zip1            v18.16b, v16.16b, v17.16b
+        zip2            v19.16b, v16.16b, v17.16b
+        sqadd           v20.8h, v18.8h, v20.8h
+        sqadd           v21.8h, v19.8h, v21.8h
+        sqxtun          v6.8b, v20.8h
+        sqxtun          v7.8b, v21.8h
+        st1             {v6.8b}, [ x0], x2
+        st1             {v7.8b}, [x16], x2
+        subs            x17, x17, #1
+        b.ne            1b
+        ret
 endfunc
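
[Editor's note: for readers following the algorithm rather than the
formatting, a minimal scalar sketch of what the band filter computes,
reconstructed from the comments in the assembly. This is an illustration,
not FFmpeg's template code; clip_uint8() is a stand-in for av_clip_pixel()
at 8-bit depth:

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for FFmpeg's av_clip_pixel() in the 8-bit case. */
    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    static void sao_band_filter_8x8_8_c(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                        const int16_t *sao_offset_val,
                                        int sao_left_class, int width, int height)
    {
        int16_t offset_table[32] = { 0 };

        /* The four signalled offsets land in four consecutive bands,
         * starting at sao_left_class and wrapping modulo 32
         * (the ldrsh/strh loop with "and w10, w10, #0x1F" above). */
        for (int k = 0; k < 4; k++)
            offset_table[(k + sao_left_class) & 0x1F] = sao_offset_val[k + 1];

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                /* Band index is the sample's top 5 bits (>> 3 at 8-bit). */
                dst[x] = clip_uint8(src[x] + offset_table[src[x] >> 3]);
            dst += stride_dst;
            src += stride_src;
        }
    }

The NEON version builds the same 32-entry table on the stack, does the
lookup with tbx, and gets the clip for free from sqxtun.]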
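
[Editor's note: similarly, a hedged sketch of the 8x8 edge filter, under
these assumptions: eo_pos is the neighbour offset already resolved from the
.Lsao_edge_pos table (the assembly receives the direction index eo and looks
it up), the source stride is the 192 bytes the comment above the function
requires, and edge_idx is FFmpeg's usual {1, 2, 0, 3, 4} remap, which the
NEON code folds into the v3.h[...] shuffle at the top. clip_uint8() is as
in the band-filter sketch:

    /* -1, 0 or +1 depending on how a compares to b
     * (the cmhi/cmhi/sub triple in the assembly). */
    static int sign3(int a, int b)
    {
        return (a > b) - (a < b);
    }

    static void sao_edge_filter_8x8_8_c(uint8_t *dst, uint8_t *src,
                                        ptrdiff_t stride_dst,
                                        const int16_t *sao_offset_val,
                                        ptrdiff_t eo_pos)
    {
        /* Offset remap; the NEON code bakes this into the shuffle of v3
         * before splitting the halfwords into bytes with uzp1/uzp2. */
        static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };

        /* 8x8 is hardcoded: the loop runs 4 times, 2 rows per iteration. */
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int d = 2 + sign3(src[x], src[x - eo_pos])
                          + sign3(src[x], src[x + eo_pos]);
                dst[x] = clip_uint8(src[x] + sao_offset_val[edge_idx[d]]);
            }
            dst += stride_dst;
            src += 192; /* ASSUMES STRIDE_SRC = 192 */
        }
    }

The "2 + sign + sign" value in the sketch is exactly what the assembly
accumulates in v20 before the tbl lookups.]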