
[FFmpeg-devel] lavc/aarch64: unify formatting

Message ID 20220614110957.10546-1-jdek@itanimul.li
State New
Series [FFmpeg-devel] lavc/aarch64: unify formatting

Checks

Context Check Description
andriy/make_x86 success Make finished
andriy/make_fate_x86 success Make fate finished
andriy/make_armv7_RPi4 success Make finished
andriy/make_fate_armv7_RPi4 success Make fate finished

Commit Message

J. Dekker June 14, 2022, 11:09 a.m. UTC
Performs the following changes:
- Lower case format specifiers
- Lower case sxtw/uxtw
- Numeric labels on same line as next instruction
- Indentation to 9/25

Signed-off-by: J. Dekker <jdek@itanimul.li>
---

 I initially started writing a full lexer to do this but ended up
 wasting too much time on it, so now I'm mostly using text editor macros.
 I still need to skim through again and make a few manual changes here
 and there, but I'm wondering if there are any other major automatable
 changes I missed.
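
 To illustrate, with the above applied (9/25 meaning the mnemonic starts
 at column 9 and the operands at column 25), the main loop of
 ff_ps_add_squares_neon ends up looking roughly like this, with the b.gt
 operand here also moved to the common column:

1:      ld1             {v0.4s,v1.4s}, [x1], #32
        fmul            v0.4s, v0.4s, v0.4s
        fmul            v1.4s, v1.4s, v1.4s
        faddp           v2.4s, v0.4s, v1.4s
        ld1             {v3.4s}, [x0]
        fadd            v3.4s, v3.4s, v2.4s
        st1             {v3.4s}, [x0], #16
        subs            w2, w2, #4
        b.gt            1b
        ret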

 libavcodec/aarch64/aacpsdsp_neon.S       | 208 +++----
 libavcodec/aarch64/fft_neon.S            |  89 ++-
 libavcodec/aarch64/fmtconvert_neon.S     |  12 +-
 libavcodec/aarch64/h264cmc_neon.S        | 534 +++++++++---------
 libavcodec/aarch64/h264dsp_neon.S        | 664 +++++++++++------------
 libavcodec/aarch64/h264idct_neon.S       | 418 +++++++-------
 libavcodec/aarch64/h264pred_neon.S       |  16 +-
 libavcodec/aarch64/h264qpel_neon.S       | 644 +++++++++++-----------
 libavcodec/aarch64/hevcdsp_idct_neon.S   | 323 ++++++-----
 libavcodec/aarch64/hevcdsp_sao_neon.S    |   8 +-
 libavcodec/aarch64/hpeldsp_neon.S        | 504 ++++++++---------
 libavcodec/aarch64/mdct_neon.S           |  27 +-
 libavcodec/aarch64/mpegaudiodsp_neon.S   |  26 +-
 libavcodec/aarch64/neon.S                | 246 ++++-----
 libavcodec/aarch64/opusdsp_neon.S        | 106 ++--
 libavcodec/aarch64/pixblockdsp_neon.S    |   6 +-
 libavcodec/aarch64/sbrdsp_neon.S         | 308 +++++------
 libavcodec/aarch64/simple_idct_neon.S    | 410 +++++++-------
 libavcodec/aarch64/synth_filter_neon.S   |  15 +-
 libavcodec/aarch64/vc1dsp_neon.S         | 206 +++----
 libavcodec/aarch64/videodsp.S            |   3 +-
 libavcodec/aarch64/vp8dsp_neon.S         | 485 ++++++++---------
 libavcodec/aarch64/vp9itxfm_16bpp_neon.S |  64 +--
 libavcodec/aarch64/vp9itxfm_neon.S       |  50 +-
 libavcodec/aarch64/vp9lpf_16bpp_neon.S   |  24 +-
 libavcodec/aarch64/vp9lpf_neon.S         |  78 +--
 libavcodec/aarch64/vp9mc_16bpp_neon.S    |  30 +-
 libavcodec/aarch64/vp9mc_aarch64.S       |   9 +-
 libavcodec/aarch64/vp9mc_neon.S          |  39 +-
 29 files changed, 2688 insertions(+), 2864 deletions(-)

Comments

Martin Storsjö June 20, 2022, 8:29 p.m. UTC | #1
On Tue, 14 Jun 2022, J. Dekker wrote:

> Performs the following changes:
> - Lower case format specifiers
> - Lower case sxtw/uxtw
> - Numeric labels on same line as next instruction

Why do you feel the need to do that? I like the occasional extra spacing 
that an empty line for such labels adds - at least I wouldn't go changing 
how that's done in other places. Where there are empty lines and where 
there aren't, is up to the author IMO, based on how they see their code.

> - Indentation to 9/25
>
> Signed-off-by: J. Dekker <jdek@itanimul.li>
> ---
>
> I initially started writing a full lexer to do this but ended up
> wasting too much time on it, so now I'm mostly using text editor macros.
> I still need to skim through again and make a few manual changes here
> and there, but I'm wondering if there are any other major automatable
> changes I missed.
>
> libavcodec/aarch64/aacpsdsp_neon.S       | 208 +++----
> libavcodec/aarch64/fft_neon.S            |  89 ++-
> libavcodec/aarch64/fmtconvert_neon.S     |  12 +-
> libavcodec/aarch64/h264cmc_neon.S        | 534 +++++++++---------
> libavcodec/aarch64/h264dsp_neon.S        | 664 +++++++++++------------
> libavcodec/aarch64/h264idct_neon.S       | 418 +++++++-------
> libavcodec/aarch64/h264pred_neon.S       |  16 +-
> libavcodec/aarch64/h264qpel_neon.S       | 644 +++++++++++-----------
> libavcodec/aarch64/hevcdsp_idct_neon.S   | 323 ++++++-----
> libavcodec/aarch64/hevcdsp_sao_neon.S    |   8 +-
> libavcodec/aarch64/hpeldsp_neon.S        | 504 ++++++++---------
> libavcodec/aarch64/mdct_neon.S           |  27 +-
> libavcodec/aarch64/mpegaudiodsp_neon.S   |  26 +-
> libavcodec/aarch64/neon.S                | 246 ++++-----
> libavcodec/aarch64/opusdsp_neon.S        | 106 ++--
> libavcodec/aarch64/pixblockdsp_neon.S    |   6 +-
> libavcodec/aarch64/sbrdsp_neon.S         | 308 +++++------
> libavcodec/aarch64/simple_idct_neon.S    | 410 +++++++-------
> libavcodec/aarch64/synth_filter_neon.S   |  15 +-
> libavcodec/aarch64/vc1dsp_neon.S         | 206 +++----
> libavcodec/aarch64/videodsp.S            |   3 +-
> libavcodec/aarch64/vp8dsp_neon.S         | 485 ++++++++---------
> libavcodec/aarch64/vp9itxfm_16bpp_neon.S |  64 +--
> libavcodec/aarch64/vp9itxfm_neon.S       |  50 +-
> libavcodec/aarch64/vp9lpf_16bpp_neon.S   |  24 +-
> libavcodec/aarch64/vp9lpf_neon.S         |  78 +--
> libavcodec/aarch64/vp9mc_16bpp_neon.S    |  30 +-
> libavcodec/aarch64/vp9mc_aarch64.S       |   9 +-
> libavcodec/aarch64/vp9mc_neon.S          |  39 +-
> 29 files changed, 2688 insertions(+), 2864 deletions(-)
>
> diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
> index ff4e6e244a..dfa6a9dc33 100644
> --- a/libavcodec/aarch64/aacpsdsp_neon.S
> +++ b/libavcodec/aarch64/aacpsdsp_neon.S
> @@ -19,130 +19,130 @@
> #include "libavutil/aarch64/asm.S"
>
> function ff_ps_add_squares_neon, export=1
> -1:      ld1         {v0.4S,v1.4S}, [x1], #32
> -        fmul        v0.4S, v0.4S, v0.4S
> -        fmul        v1.4S, v1.4S, v1.4S
> -        faddp       v2.4S, v0.4S, v1.4S
> -        ld1         {v3.4S}, [x0]
> -        fadd        v3.4S, v3.4S, v2.4S
> -        st1         {v3.4S}, [x0], #16
> -        subs        w2, w2, #4
> +1:      ld1             {v0.4s,v1.4s}, [x1], #32
> +        fmul            v0.4s, v0.4s, v0.4s
> +        fmul            v1.4s, v1.4s, v1.4s
> +        faddp           v2.4s, v0.4s, v1.4s
> +        ld1             {v3.4s}, [x0]
> +        fadd            v3.4s, v3.4s, v2.4s
> +        st1             {v3.4s}, [x0], #16
> +        subs            w2, w2, #4
>         b.gt        1b

This leaves the b.gt parameter misaligned with the rest - same thing in 
the whole rest of this file.
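
I.e. to keep the operand column consistent, this would presumably end up
as something like:

        subs            w2, w2, #4
        b.gt            1b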

> @@ -295,8 +295,7 @@ function fft_pass_neon
>         st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
>         st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
>         st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
> -1:
> -        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
> +1:      ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
>         ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}

I really don't think getting rid of the empty line here or anywhere else 
adds anything - I think it worsens readability.

> @@ -359,18 +358,18 @@ function fft\n\()_neon, align=6
> endfunc
> .endm
>
> -        def_fft    32,    16,     8
> -        def_fft    64,    32,    16
> -        def_fft   128,    64,    32
> -        def_fft   256,   128,    64
> -        def_fft   512,   256,   128
> -        def_fft  1024,   512,   256
> -        def_fft  2048,  1024,   512
> -        def_fft  4096,  2048,  1024
> -        def_fft  8192,  4096,  2048
> -        def_fft 16384,  8192,  4096
> -        def_fft 32768, 16384,  8192
> -        def_fft 65536, 32768, 16384
> +        def_fft         32,    16,     8
> +        def_fft         64,    32,    16
> +        def_fft         128,    64,    32
> +        def_fft         256,   128,    64
> +        def_fft         512,   256,   128
> +        def_fft         1024,   512,   256
> +        def_fft         2048,  1024,   512
> +        def_fft         4096,  2048,  1024
> +        def_fft         8192,  4096,  2048
> +        def_fft         16384,  8192,  4096
> +        def_fft         32768, 16384,  8192
> +        def_fft         65536, 32768, 16384

Previously, the columns were right-aligned here. I don't think we gain 
anything from changing that.

> -const   pmmp, align=4
> +const pmmp, align=4

Changes like this are good indeed.

> diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
> index f8e9407854..0add73ffec 100644
> --- a/libavcodec/aarch64/h264cmc_neon.S
> +++ b/libavcodec/aarch64/h264cmc_neon.S
> @@ -26,24 +26,24 @@
> /* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
> .macro  h264_chroma_mc8 type, codec=h264
> function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
> -  .ifc \type,avg
> +.ifc \type,avg
>         mov             x8,  x0
> -  .endif
> +.endif

I didn't mind the indented .if/.endif, but I won't oppose making it 
consistent with the rest either, i.e. making it unindented.

> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index ea221e6862..926c6e8362 100644
> .macro  h264_loop_filter_luma
> -        dup             v22.16B, w2                     // alpha
> -        uxtl            v24.8H,  v24.8B
> -        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
> -        uxtl            v24.4S,  v24.4H
> -        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
> -        sli             v24.8H,  v24.8H,  #8
> -        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
> -        sli             v24.4S,  v24.4S,  #16
> -        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
> -        dup             v22.16B, w3                     // beta
> -        cmlt            v23.16B, v24.16B, #0
> -        cmhi            v28.16B, v22.16B, v28.16B       // < beta
> -        cmhi            v30.16B, v22.16B, v30.16B       // < beta
> -        bic             v21.16B, v21.16B, v23.16B
> -        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
> -        and             v21.16B, v21.16B, v28.16B
> -        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
> -        and             v21.16B, v21.16B, v30.16B      // < beta

Here's a misaligned comment that could be fixed while touching this code
anyway.
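
I.e. presumably ending up as:

        uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // < beta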

> +        dup             v22.16b, w2                     // alpha
> +        uxtl            v24.8h,  v24.8b
> +        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
> +        uxtl            v24.4s,  v24.4h
> +        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
> +        sli             v24.8h,  v24.8h,  #8
> +        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
> +        sli             v24.4s,  v24.4s,  #16
> +        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
> +        dup             v22.16b, w3                     // beta
> +        cmlt            v23.16b, v24.16b, #0
> +        cmhi            v28.16b, v22.16b, v28.16b       // < beta
> +        cmhi            v30.16b, v22.16b, v30.16b       // < beta
> +        bic             v21.16b, v21.16b, v23.16b
> +        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
> +        and             v21.16b, v21.16b, v28.16b
> +        uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
> +        and             v21.16b, v21.16b, v30.16b      // < beta
>         shrn            v30.8b,  v21.8h,  #4
>         mov             x7, v30.d[0]
> -        cmhi            v17.16B, v22.16B, v17.16B       // < beta
> -        cmhi            v19.16B, v22.16B, v19.16B       // < beta
> +        cmhi            v17.16b, v22.16b, v17.16b       // < beta
> +        cmhi            v19.16b, v22.16b, v19.16b       // < beta
>         cbz             x7,  9f
> -        and             v17.16B, v17.16B, v21.16B
> -        and             v19.16B, v19.16B, v21.16B
> -        and             v24.16B, v24.16B, v21.16B
> -        urhadd          v28.16B, v16.16B,  v0.16B
> -        sub             v21.16B, v24.16B, v17.16B
> -        uqadd           v23.16B, v18.16B, v24.16B
> -        uhadd           v20.16B, v20.16B, v28.16B
> -        sub             v21.16B, v21.16B, v19.16B
> -        uhadd           v28.16B,  v4.16B, v28.16B
> -        umin            v23.16B, v23.16B, v20.16B
> -        uqsub           v22.16B, v18.16B, v24.16B
> -        uqadd           v4.16B,   v2.16B, v24.16B
> -        umax            v23.16B, v23.16B, v22.16B
> -        uqsub           v22.16B,  v2.16B, v24.16B
> -        umin            v28.16B,  v4.16B, v28.16B
> -        uxtl            v4.8H,    v0.8B
> -        umax            v28.16B, v28.16B, v22.16B
> -        uxtl2           v20.8H,   v0.16B
> -        usubw           v4.8H,    v4.8H,  v16.8B
> -        usubw2          v20.8H,  v20.8H,  v16.16B
> -        shl             v4.8H,    v4.8H,  #2
> -        shl             v20.8H,  v20.8H,  #2
> -        uaddw           v4.8H,    v4.8H,  v18.8B
> -        uaddw2          v20.8H,  v20.8H,  v18.16B
> -        usubw           v4.8H,    v4.8H,   v2.8B
> -        usubw2          v20.8H,  v20.8H,   v2.16B
> -        rshrn           v4.8B,    v4.8H,  #3
> -        rshrn2          v4.16B,  v20.8H,  #3
> -        bsl             v17.16B, v23.16B, v18.16B
> -        bsl             v19.16B, v28.16B,  v2.16B
> -        neg             v23.16B, v21.16B
> -        uxtl            v28.8H,  v16.8B
> -        smin            v4.16B,   v4.16B, v21.16B
> -        uxtl2           v21.8H,  v16.16B
> -        smax            v4.16B,   v4.16B, v23.16B
> -        uxtl            v22.8H,   v0.8B
> -        uxtl2           v24.8H,   v0.16B
> -        saddw           v28.8H,  v28.8H,  v4.8B
> -        saddw2          v21.8H,  v21.8H,  v4.16B
> -        ssubw           v22.8H,  v22.8H,  v4.8B
> -        ssubw2          v24.8H,  v24.8H,  v4.16B
> -        sqxtun          v16.8B,  v28.8H
> -        sqxtun2         v16.16B, v21.8H
> -        sqxtun          v0.8B,   v22.8H
> -        sqxtun2         v0.16B,  v24.8H
> +        and             v17.16b, v17.16b, v21.16b
> +        and             v19.16b, v19.16b, v21.16b
> +        and             v24.16b, v24.16b, v21.16b
> +        urhadd          v28.16b, v16.16b,  v0.16b
> +        sub             v21.16b, v24.16b, v17.16b
> +        uqadd           v23.16b, v18.16b, v24.16b
> +        uhadd           v20.16b, v20.16b, v28.16b
> +        sub             v21.16b, v21.16b, v19.16b
> +        uhadd           v28.16b,  v4.16b, v28.16b
> +        umin            v23.16b, v23.16b, v20.16b
> +        uqsub           v22.16b, v18.16b, v24.16b
> +        uqadd           v4.16b,   v2.16b, v24.16b
> +        umax            v23.16b, v23.16b, v22.16b
> +        uqsub           v22.16b,  v2.16b, v24.16b
> +        umin            v28.16b,  v4.16b, v28.16b
> +        uxtl            v4.8h,    v0.8b
> +        umax            v28.16b, v28.16b, v22.16b
> +        uxtl2           v20.8h,   v0.16b
> +        usubw           v4.8h,    v4.8h,  v16.8b
> +        usubw2          v20.8h,  v20.8h,  v16.16b
> +        shl             v4.8h,    v4.8h,  #2
> +        shl             v20.8h,  v20.8h,  #2
> +        uaddw           v4.8h,    v4.8h,  v18.8b
> +        uaddw2          v20.8h,  v20.8h,  v18.16b
> +        usubw           v4.8h,    v4.8h,   v2.8b
> +        usubw2          v20.8h,  v20.8h,   v2.16b
> +        rshrn           v4.8b,    v4.8h,  #3
> +        rshrn2          v4.16b,  v20.8h,  #3
> +        bsl             v17.16b, v23.16b, v18.16b
> +        bsl             v19.16b, v28.16b,  v2.16b
> +        neg             v23.16b, v21.16b
> +        uxtl            v28.8h,  v16.8b
> +        smin            v4.16b,   v4.16b, v21.16b
> +        uxtl2           v21.8h,  v16.16b
> +        smax            v4.16b,   v4.16b, v23.16b
> +        uxtl            v22.8h,   v0.8b
> +        uxtl2           v24.8h,   v0.16b
> +        saddw           v28.8h,  v28.8h,  v4.8b
> +        saddw2          v21.8h,  v21.8h,  v4.16b
> +        ssubw           v22.8h,  v22.8h,  v4.8b
> +        ssubw2          v24.8h,  v24.8h,  v4.16b
> +        sqxtun          v16.8b,  v28.8h
> +        sqxtun2         v16.16b, v21.8h
> +        sqxtun          v0.8b,   v22.8h
> +        sqxtun2         v0.16b,  v24.8h
> .endm
>
> function ff_h264_v_loop_filter_luma_neon, export=1
>         h264_loop_filter_start
>
> -        ld1             {v0.16B},  [x0], x1
> -        ld1             {v2.16B},  [x0], x1
> -        ld1             {v4.16B},  [x0], x1
> +        ld1             {v0.16b},  [x0], x1
> +        ld1             {v2.16b},  [x0], x1
> +        ld1             {v4.16b},  [x0], x1
>         sub             x0,  x0,  x1, lsl #2
>         sub             x0,  x0,  x1, lsl #1
> -        ld1             {v20.16B},  [x0], x1
> -        ld1             {v18.16B},  [x0], x1
> -        ld1             {v16.16B},  [x0], x1
> +        ld1             {v20.16b},  [x0], x1
> +        ld1             {v18.16b},  [x0], x1
> +        ld1             {v16.16b},  [x0], x1
>
>         h264_loop_filter_luma
>
>         sub             x0,  x0,  x1, lsl #1
> -        st1             {v17.16B},  [x0], x1
> -        st1             {v16.16B}, [x0], x1
> -        st1             {v0.16B},  [x0], x1
> -        st1             {v19.16B}, [x0]
> -9:
> -        ret
> +        st1             {v17.16b},  [x0], x1
> +        st1             {v16.16b}, [x0], x1
> +        st1             {v0.16b},  [x0], x1
> +        st1             {v19.16b}, [x0]
> +9:      ret
> endfunc
>
> function ff_h264_h_loop_filter_luma_neon, export=1
>         h264_loop_filter_start
>
>         sub             x0,  x0,  #4
> -        ld1             {v6.8B},  [x0], x1
> -        ld1             {v20.8B}, [x0], x1
> -        ld1             {v18.8B}, [x0], x1
> -        ld1             {v16.8B}, [x0], x1
> -        ld1             {v0.8B},  [x0], x1
> -        ld1             {v2.8B},  [x0], x1
> -        ld1             {v4.8B},  [x0], x1
> -        ld1             {v26.8B}, [x0], x1
> -        ld1             {v6.D}[1],  [x0], x1
> -        ld1             {v20.D}[1], [x0], x1
> -        ld1             {v18.D}[1], [x0], x1
> -        ld1             {v16.D}[1], [x0], x1
> -        ld1             {v0.D}[1],  [x0], x1
> -        ld1             {v2.D}[1],  [x0], x1
> -        ld1             {v4.D}[1],  [x0], x1
> -        ld1             {v26.D}[1], [x0], x1
> +        ld1             {v6.8b},  [x0], x1
> +        ld1             {v20.8b}, [x0], x1
> +        ld1             {v18.8b}, [x0], x1
> +        ld1             {v16.8b}, [x0], x1
> +        ld1             {v0.8b},  [x0], x1
> +        ld1             {v2.8b},  [x0], x1
> +        ld1             {v4.8b},  [x0], x1
> +        ld1             {v26.8b}, [x0], x1
> +        ld1             {v6.d}[1],  [x0], x1
> +        ld1             {v20.d}[1], [x0], x1
> +        ld1             {v18.d}[1], [x0], x1
> +        ld1             {v16.d}[1], [x0], x1
> +        ld1             {v0.d}[1],  [x0], x1
> +        ld1             {v2.d}[1],  [x0], x1
> +        ld1             {v4.d}[1],  [x0], x1
> +        ld1             {v26.d}[1], [x0], x1
>
>         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
>
> @@ -160,24 +158,23 @@ function ff_h264_h_loop_filter_luma_neon, export=1
>
>         sub             x0,  x0,  x1, lsl #4
>         add             x0,  x0,  #2
> -        st1             {v17.S}[0],  [x0], x1
> -        st1             {v16.S}[0], [x0], x1
> -        st1             {v0.S}[0],  [x0], x1
> -        st1             {v19.S}[0], [x0], x1
> -        st1             {v17.S}[1],  [x0], x1
> -        st1             {v16.S}[1], [x0], x1
> -        st1             {v0.S}[1],  [x0], x1
> -        st1             {v19.S}[1], [x0], x1
> -        st1             {v17.S}[2],  [x0], x1
> -        st1             {v16.S}[2], [x0], x1
> -        st1             {v0.S}[2],  [x0], x1
> -        st1             {v19.S}[2], [x0], x1
> -        st1             {v17.S}[3],  [x0], x1
> -        st1             {v16.S}[3], [x0], x1
> -        st1             {v0.S}[3],  [x0], x1
> -        st1             {v19.S}[3], [x0], x1
> -9:
> -        ret
> +        st1             {v17.s}[0],  [x0], x1
> +        st1             {v16.s}[0], [x0], x1
> +        st1             {v0.s}[0],  [x0], x1
> +        st1             {v19.s}[0], [x0], x1
> +        st1             {v17.s}[1],  [x0], x1
> +        st1             {v16.s}[1], [x0], x1
> +        st1             {v0.s}[1],  [x0], x1
> +        st1             {v19.s}[1], [x0], x1
> +        st1             {v17.s}[2],  [x0], x1
> +        st1             {v16.s}[2], [x0], x1
> +        st1             {v0.s}[2],  [x0], x1
> +        st1             {v19.s}[2], [x0], x1
> +        st1             {v17.s}[3],  [x0], x1
> +        st1             {v16.s}[3], [x0], x1
> +        st1             {v0.s}[3],  [x0], x1
> +        st1             {v19.s}[3], [x0], x1

Here, I'd love to align the right-hand columns that are uneven (although I 
guess such changes are a bit out of scope for what you do here).
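
I.e. something like:

        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1

and similarly for the remaining lanes.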



> @@ -193,139 +193,139 @@ function ff_h264_idct_add8_neon, export=1
> endfunc
>
> .macro  idct8x8_cols    pass
> -  .if \pass == 0
> -        va      .req    v18
> -        vb      .req    v30

I think the custom formatting of these lines is better to keep as is, 
rather than forcing it into the formatting of the rest.

> diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
> index 451fd8af24..3829f17bd1 100644
> --- a/libavcodec/aarch64/h264qpel_neon.S
> +++ b/libavcodec/aarch64/h264qpel_neon.S
> @@ -24,130 +24,130 @@
>
>         /* H.264 qpel MC */
>
> -.macro  lowpass_const   r
> +.macro  lowpass_const r

Here you could get rid of the double spaces after .macro too

> @@ -580,12 +580,12 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
> endfunc
> .endm
>
> -        h264_qpel16_hv put
> -        h264_qpel16_hv avg
> +        h264_qpel16_hv  put
> +        h264_qpel16_hv  avg
>
> .macro  h264_qpel8      type
> function ff_\type\()_h264_qpel8_mc10_neon, export=1
> -        lowpass_const   w3
> +        lowpass_const w3

The parameter to lowpass_const was properly aligned before; now it no 
longer is.


> .macro  pixfunc         pfx, name, suf, rnd=1, avg=0
> -  .if \rnd
> -    .macro avg  rd, rn, rm
> +.if \rnd
> +.macro avg  rd, rn, rm
>         urhadd          \rd, \rn, \rm

I think this looks much more like a mess than it was before, TBH

> diff --git a/libavcodec/aarch64/opusdsp_neon.S b/libavcodec/aarch64/opusdsp_neon.S
> index 46c2be0874..3b2b89d068 100644
> --- a/libavcodec/aarch64/opusdsp_neon.S
> +++ b/libavcodec/aarch64/opusdsp_neon.S
> @@ -20,93 +20,93 @@
>
>            // 0.85..^1    0.85..^2    0.85..^3    0.85..^4
> const tab_st, align=4
> -        .word 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
> +        .word           0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f

These used to be aligned with the comments above; now they are no longer 
aligned.
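
If the .word operands move to the common column, the comment above would 
presumably need to move along with them to stay lined up, e.g.:

                     // 0.85..^1    0.85..^2    0.85..^3    0.85..^4
const tab_st, align=4
        .word           0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f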

> diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
> index 9a96c2523c..7df48ea000 100644
> --- a/libavcodec/aarch64/vc1dsp_neon.S
> +++ b/libavcodec/aarch64/vc1dsp_neon.S
> @@ -1391,35 +1391,35 @@ function ff_vc1_unescape_buffer_helper_neon, export=1
>         tst             w1, #32
>         b.ne            1f
>
> -          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
> -          ext             v25.16b, v0.16b, v1.16b, #1

This function intentionally used a different indentation style, to 
visualize how multiple iterations of the same algorithm are unrolled and 
interleaved with each other. It was discussed during the review of this 
patch. So for that, I'd kinda be inclined to keep it as is.


There are lots of changes that I agree with here, but also lots of things 
I disagree with. I think it'd be better to split it up over individual 
patches, fixing one issue at a time (starting with the least controversial 
ones), because right now it's too big and there are too many style 
regressions intermixed with the good changes.

// Martin

Patch

diff --git a/libavcodec/aarch64/aacpsdsp_neon.S b/libavcodec/aarch64/aacpsdsp_neon.S
index ff4e6e244a..dfa6a9dc33 100644
--- a/libavcodec/aarch64/aacpsdsp_neon.S
+++ b/libavcodec/aarch64/aacpsdsp_neon.S
@@ -19,130 +19,130 @@ 
 #include "libavutil/aarch64/asm.S"
 
 function ff_ps_add_squares_neon, export=1
-1:      ld1         {v0.4S,v1.4S}, [x1], #32
-        fmul        v0.4S, v0.4S, v0.4S
-        fmul        v1.4S, v1.4S, v1.4S
-        faddp       v2.4S, v0.4S, v1.4S
-        ld1         {v3.4S}, [x0]
-        fadd        v3.4S, v3.4S, v2.4S
-        st1         {v3.4S}, [x0], #16
-        subs        w2, w2, #4
+1:      ld1             {v0.4s,v1.4s}, [x1], #32
+        fmul            v0.4s, v0.4s, v0.4s
+        fmul            v1.4s, v1.4s, v1.4s
+        faddp           v2.4s, v0.4s, v1.4s
+        ld1             {v3.4s}, [x0]
+        fadd            v3.4s, v3.4s, v2.4s
+        st1             {v3.4s}, [x0], #16
+        subs            w2, w2, #4
         b.gt        1b
         ret
 endfunc
 
 function ff_ps_mul_pair_single_neon, export=1
-1:      ld1         {v0.4S,v1.4S}, [x1], #32
-        ld1         {v2.4S},       [x2], #16
-        zip1        v3.4S, v2.4S, v2.4S
-        zip2        v4.4S, v2.4S, v2.4S
-        fmul        v0.4S, v0.4S, v3.4S
-        fmul        v1.4S, v1.4S, v4.4S
-        st1         {v0.4S,v1.4S}, [x0], #32
-        subs        w3, w3, #4
+1:      ld1             {v0.4s,v1.4s}, [x1], #32
+        ld1             {v2.4s},       [x2], #16
+        zip1            v3.4s, v2.4s, v2.4s
+        zip2            v4.4s, v2.4s, v2.4s
+        fmul            v0.4s, v0.4s, v3.4s
+        fmul            v1.4s, v1.4s, v4.4s
+        st1             {v0.4s,v1.4s}, [x0], #32
+        subs            w3, w3, #4
         b.gt        1b
         ret
 endfunc
 
 function ff_ps_stereo_interpolate_neon, export=1
-        ld1         {v0.4S}, [x2]
-        ld1         {v1.4S}, [x3]
-        zip1        v4.4S, v0.4S, v0.4S
-        zip2        v5.4S, v0.4S, v0.4S
-        zip1        v6.4S, v1.4S, v1.4S
-        zip2        v7.4S, v1.4S, v1.4S
-1:      ld1         {v2.2S}, [x0]
-        ld1         {v3.2S}, [x1]
-        fadd        v4.4S, v4.4S, v6.4S
-        fadd        v5.4S, v5.4S, v7.4S
-        mov         v2.D[1], v2.D[0]
-        mov         v3.D[1], v3.D[0]
-        fmul        v2.4S, v2.4S, v4.4S
-        fmla        v2.4S, v3.4S, v5.4S
-        st1         {v2.D}[0], [x0], #8
-        st1         {v2.D}[1], [x1], #8
-        subs        w4, w4, #1
+        ld1             {v0.4s}, [x2]
+        ld1             {v1.4s}, [x3]
+        zip1            v4.4s, v0.4s, v0.4s
+        zip2            v5.4s, v0.4s, v0.4s
+        zip1            v6.4s, v1.4s, v1.4s
+        zip2            v7.4s, v1.4s, v1.4s
+1:      ld1             {v2.2s}, [x0]
+        ld1             {v3.2s}, [x1]
+        fadd            v4.4s, v4.4s, v6.4s
+        fadd            v5.4s, v5.4s, v7.4s
+        mov             v2.d[1], v2.d[0]
+        mov             v3.d[1], v3.d[0]
+        fmul            v2.4s, v2.4s, v4.4s
+        fmla            v2.4s, v3.4s, v5.4s
+        st1             {v2.d}[0], [x0], #8
+        st1             {v2.d}[1], [x1], #8
+        subs            w4, w4, #1
         b.gt        1b
         ret
 endfunc
 
 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
-        ld1         {v0.4S,v1.4S}, [x2]
-        ld1         {v6.4S,v7.4S}, [x3]
-        fneg        v2.4S, v1.4S
-        fneg        v3.4S, v7.4S
-        zip1        v16.4S, v0.4S, v0.4S
-        zip2        v17.4S, v0.4S, v0.4S
-        zip1        v18.4S, v2.4S, v1.4S
-        zip2        v19.4S, v2.4S, v1.4S
-        zip1        v20.4S, v6.4S, v6.4S
-        zip2        v21.4S, v6.4S, v6.4S
-        zip1        v22.4S, v3.4S, v7.4S
-        zip2        v23.4S, v3.4S, v7.4S
-1:      ld1         {v2.2S}, [x0]
-        ld1         {v3.2S}, [x1]
-        fadd        v16.4S, v16.4S, v20.4S
-        fadd        v17.4S, v17.4S, v21.4S
-        mov         v2.D[1], v2.D[0]
-        mov         v3.D[1], v3.D[0]
-        fmul        v4.4S, v2.4S, v16.4S
-        fmla        v4.4S, v3.4S, v17.4S
-        fadd        v18.4S, v18.4S, v22.4S
-        fadd        v19.4S, v19.4S, v23.4S
-        ext         v2.16B, v2.16B, v2.16B, #4
-        ext         v3.16B, v3.16B, v3.16B, #4
-        fmla        v4.4S, v2.4S, v18.4S
-        fmla        v4.4S, v3.4S, v19.4S
-        st1         {v4.D}[0], [x0], #8
-        st1         {v4.D}[1], [x1], #8
-        subs        w4, w4, #1
+        ld1             {v0.4s,v1.4s}, [x2]
+        ld1             {v6.4s,v7.4s}, [x3]
+        fneg            v2.4s, v1.4s
+        fneg            v3.4s, v7.4s
+        zip1            v16.4s, v0.4s, v0.4s
+        zip2            v17.4s, v0.4s, v0.4s
+        zip1            v18.4s, v2.4s, v1.4s
+        zip2            v19.4s, v2.4s, v1.4s
+        zip1            v20.4s, v6.4s, v6.4s
+        zip2            v21.4s, v6.4s, v6.4s
+        zip1            v22.4s, v3.4s, v7.4s
+        zip2            v23.4s, v3.4s, v7.4s
+1:      ld1             {v2.2s}, [x0]
+        ld1             {v3.2s}, [x1]
+        fadd            v16.4s, v16.4s, v20.4s
+        fadd            v17.4s, v17.4s, v21.4s
+        mov             v2.d[1], v2.d[0]
+        mov             v3.d[1], v3.d[0]
+        fmul            v4.4s, v2.4s, v16.4s
+        fmla            v4.4s, v3.4s, v17.4s
+        fadd            v18.4s, v18.4s, v22.4s
+        fadd            v19.4s, v19.4s, v23.4s
+        ext             v2.16b, v2.16b, v2.16b, #4
+        ext             v3.16b, v3.16b, v3.16b, #4
+        fmla            v4.4s, v2.4s, v18.4s
+        fmla            v4.4s, v3.4s, v19.4s
+        st1             {v4.d}[0], [x0], #8
+        st1             {v4.d}[1], [x1], #8
+        subs            w4, w4, #1
         b.gt        1b
         ret
 endfunc
 
 function ff_ps_hybrid_analysis_neon, export=1
-        lsl         x3, x3, #3
-        ld2         {v0.4S,v1.4S}, [x1], #32
-        ld2         {v2.2S,v3.2S}, [x1], #16
-        ld1         {v24.2S},      [x1], #8
-        ld2         {v4.2S,v5.2S}, [x1], #16
-        ld2         {v6.4S,v7.4S}, [x1]
-        rev64       v6.4S, v6.4S
-        rev64       v7.4S, v7.4S
-        ext         v6.16B, v6.16B, v6.16B, #8
-        ext         v7.16B, v7.16B, v7.16B, #8
-        rev64       v4.2S, v4.2S
-        rev64       v5.2S, v5.2S
-        mov         v2.D[1], v3.D[0]
-        mov         v4.D[1], v5.D[0]
-        mov         v5.D[1], v2.D[0]
-        mov         v3.D[1], v4.D[0]
-        fadd        v16.4S, v0.4S, v6.4S
-        fadd        v17.4S, v1.4S, v7.4S
-        fsub        v18.4S, v1.4S, v7.4S
-        fsub        v19.4S, v0.4S, v6.4S
-        fadd        v22.4S, v2.4S, v4.4S
-        fsub        v23.4S, v5.4S, v3.4S
-        trn1        v20.2D, v22.2D, v23.2D      // {re4+re8, re5+re7, im8-im4, im7-im5}
-        trn2        v21.2D, v22.2D, v23.2D      // {im4+im8, im5+im7, re4-re8, re5-re7}
-1:      ld2         {v2.4S,v3.4S}, [x2], #32
-        ld2         {v4.2S,v5.2S}, [x2], #16
-        ld1         {v6.2S},       [x2], #8
-        add         x2, x2, #8
-        mov         v4.D[1], v5.D[0]
-        mov         v6.S[1], v6.S[0]
-        fmul        v6.2S, v6.2S, v24.2S
-        fmul        v0.4S, v2.4S, v16.4S
-        fmul        v1.4S, v2.4S, v17.4S
-        fmls        v0.4S, v3.4S, v18.4S
-        fmla        v1.4S, v3.4S, v19.4S
-        fmla        v0.4S, v4.4S, v20.4S
-        fmla        v1.4S, v4.4S, v21.4S
-        faddp       v0.4S, v0.4S, v1.4S
-        faddp       v0.4S, v0.4S, v0.4S
-        fadd        v0.2S, v0.2S, v6.2S
-        st1         {v0.2S}, [x0], x3
-        subs        w4, w4, #1
+        lsl             x3, x3, #3
+        ld2             {v0.4s,v1.4s}, [x1], #32
+        ld2             {v2.2s,v3.2s}, [x1], #16
+        ld1             {v24.2s},      [x1], #8
+        ld2             {v4.2s,v5.2s}, [x1], #16
+        ld2             {v6.4s,v7.4s}, [x1]
+        rev64           v6.4s, v6.4s
+        rev64           v7.4s, v7.4s
+        ext             v6.16b, v6.16b, v6.16b, #8
+        ext             v7.16b, v7.16b, v7.16b, #8
+        rev64           v4.2s, v4.2s
+        rev64           v5.2s, v5.2s
+        mov             v2.d[1], v3.d[0]
+        mov             v4.d[1], v5.d[0]
+        mov             v5.d[1], v2.d[0]
+        mov             v3.d[1], v4.d[0]
+        fadd            v16.4s, v0.4s, v6.4s
+        fadd            v17.4s, v1.4s, v7.4s
+        fsub            v18.4s, v1.4s, v7.4s
+        fsub            v19.4s, v0.4s, v6.4s
+        fadd            v22.4s, v2.4s, v4.4s
+        fsub            v23.4s, v5.4s, v3.4s
+        trn1            v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5}
+        trn2            v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7}
+1:      ld2             {v2.4s,v3.4s}, [x2], #32
+        ld2             {v4.2s,v5.2s}, [x2], #16
+        ld1             {v6.2s},       [x2], #8
+        add             x2, x2, #8
+        mov             v4.d[1], v5.d[0]
+        mov             v6.s[1], v6.s[0]
+        fmul            v6.2s, v6.2s, v24.2s
+        fmul            v0.4s, v2.4s, v16.4s
+        fmul            v1.4s, v2.4s, v17.4s
+        fmls            v0.4s, v3.4s, v18.4s
+        fmla            v1.4s, v3.4s, v19.4s
+        fmla            v0.4s, v4.4s, v20.4s
+        fmla            v1.4s, v4.4s, v21.4s
+        faddp           v0.4s, v0.4s, v1.4s
+        faddp           v0.4s, v0.4s, v0.4s
+        fadd            v0.2s, v0.2s, v6.2s
+        st1             {v0.2s}, [x0], x3
+        subs            w4, w4, #1
         b.gt        1b
         ret
 endfunc
diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index 9ff3f9c526..8889596f39 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -235,18 +235,18 @@  function fft16_neon
 endfunc
 
 
-const  trans4_float, align=4
-        .byte    0,  1,  2,  3
-        .byte    8,  9, 10, 11
-        .byte    4,  5,  6,  7
-        .byte   12, 13, 14, 15
+const trans4_float, align=4
+        .byte           0,  1,  2,  3
+        .byte           8,  9, 10, 11
+        .byte           4,  5,  6,  7
+        .byte           12, 13, 14, 15
 endconst
 
-const  trans8_float, align=4
-        .byte   24, 25, 26, 27
-        .byte    0,  1,  2,  3
-        .byte   28, 29, 30, 31
-        .byte    4,  5,  6,  7
+const trans8_float, align=4
+        .byte           24, 25, 26, 27
+        .byte           0,  1,  2,  3
+        .byte           28, 29, 30, 31
+        .byte           4,  5,  6,  7
 endconst
 
 function fft_pass_neon
@@ -295,8 +295,7 @@  function fft_pass_neon
         st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
         st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
         st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
-1:
-        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
+1:      ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
         ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
         ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
         transpose       v26.2d, v27.2d, v20.2d, v22.2d
@@ -359,18 +358,18 @@  function fft\n\()_neon, align=6
 endfunc
 .endm
 
-        def_fft    32,    16,     8
-        def_fft    64,    32,    16
-        def_fft   128,    64,    32
-        def_fft   256,   128,    64
-        def_fft   512,   256,   128
-        def_fft  1024,   512,   256
-        def_fft  2048,  1024,   512
-        def_fft  4096,  2048,  1024
-        def_fft  8192,  4096,  2048
-        def_fft 16384,  8192,  4096
-        def_fft 32768, 16384,  8192
-        def_fft 65536, 32768, 16384
+        def_fft         32,    16,     8
+        def_fft         64,    32,    16
+        def_fft         128,    64,    32
+        def_fft         256,   128,    64
+        def_fft         512,   256,   128
+        def_fft         1024,   512,   256
+        def_fft         2048,  1024,   512
+        def_fft         4096,  2048,  1024
+        def_fft         8192,  4096,  2048
+        def_fft         16384,  8192,  4096
+        def_fft         32768, 16384,  8192
+        def_fft         65536, 32768, 16384
 
 function ff_fft_calc_neon, export=1
         prfm            pldl1keep, [x1]
@@ -399,8 +398,7 @@  function ff_fft_permute_neon, export=1
         ldr             x0,  [x0, #8]   // revtab
         lsl             x6,  x6, x2
         mov             x2,  x6
-1:
-        ld1             {v0.2s,v1.2s}, [x1], #16
+1:      ld1             {v0.2s,v1.2s}, [x1], #16
         ldr             w4,  [x0], #4
         uxth            w5,  w4
         lsr             w4,  w4,  #16
@@ -412,8 +410,7 @@  function ff_fft_permute_neon, export=1
         b.gt            1b
 
         sub             x1,  x1,  x2,  lsl #3
-1:
-        ld1             {v0.4s,v1.4s}, [x3], #32
+1:      ld1             {v0.4s,v1.4s}, [x3], #32
         st1             {v0.4s,v1.4s}, [x1], #32
         subs            x2,  x2,  #4
         b.gt            1b
@@ -421,28 +418,28 @@  function ff_fft_permute_neon, export=1
         ret
 endfunc
 
-const   fft_tab_neon, relocate=1
-        .quad fft4_neon
-        .quad fft8_neon
-        .quad fft16_neon
-        .quad fft32_neon
-        .quad fft64_neon
-        .quad fft128_neon
-        .quad fft256_neon
-        .quad fft512_neon
-        .quad fft1024_neon
-        .quad fft2048_neon
-        .quad fft4096_neon
-        .quad fft8192_neon
-        .quad fft16384_neon
-        .quad fft32768_neon
-        .quad fft65536_neon
+const fft_tab_neon, relocate=1
+        .quad           fft4_neon
+        .quad           fft8_neon
+        .quad           fft16_neon
+        .quad           fft32_neon
+        .quad           fft64_neon
+        .quad           fft128_neon
+        .quad           fft256_neon
+        .quad           fft512_neon
+        .quad           fft1024_neon
+        .quad           fft2048_neon
+        .quad           fft4096_neon
+        .quad           fft8192_neon
+        .quad           fft16384_neon
+        .quad           fft32768_neon
+        .quad           fft65536_neon
 endconst
 
-const   pmmp, align=4
+const pmmp, align=4
         .float          +1.0, -1.0, -1.0, +1.0
 endconst
 
-const   mppm, align=4
+const mppm, align=4
         .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
 endconst
diff --git a/libavcodec/aarch64/fmtconvert_neon.S b/libavcodec/aarch64/fmtconvert_neon.S
index 2161c3a8ae..293f94c0f9 100644
--- a/libavcodec/aarch64/fmtconvert_neon.S
+++ b/libavcodec/aarch64/fmtconvert_neon.S
@@ -27,8 +27,7 @@  function ff_int32_to_float_fmul_scalar_neon, export=1
         ld1             {v1.4s,v2.4s}, [x1], #32
         scvtf           v1.4s,  v1.4s
         scvtf           v2.4s,  v2.4s
-1:
-        subs            w2,  w2,  #8
+1:      subs            w2,  w2,  #8
         fmul            v3.4s,  v1.4s,  v0.s[0]
         fmul            v4.4s,  v2.4s,  v0.s[0]
         b.le            2f
@@ -37,8 +36,7 @@  function ff_int32_to_float_fmul_scalar_neon, export=1
         scvtf           v1.4s,  v1.4s
         scvtf           v2.4s,  v2.4s
         b               1b
-2:
-        st1             {v3.4s,v4.4s}, [x0]
+2:      st1             {v3.4s,v4.4s}, [x0]
         ret
 endfunc
 
@@ -46,8 +44,7 @@  function ff_int32_to_float_fmul_array8_neon, export=1
         lsr             w4,  w4,  #3
         subs            w5,  w4,  #1
         b.eq            1f
-2:
-        ld1             {v0.4s,v1.4s}, [x2], #32
+2:      ld1             {v0.4s,v1.4s}, [x2], #32
         ld1             {v2.4s,v3.4s}, [x2], #32
         scvtf           v0.4s,  v0.4s
         scvtf           v1.4s,  v1.4s
@@ -64,8 +61,7 @@  function ff_int32_to_float_fmul_array8_neon, export=1
         b.gt            2b
         b.eq            1f
         ret
-1:
-        ld1             {v0.4s,v1.4s}, [x2]
+1:      ld1             {v0.4s,v1.4s}, [x2]
         ld1             {v16.s}[0],  [x3]
         scvtf           v0.4s,  v0.4s
         scvtf           v1.4s,  v1.4s
diff --git a/libavcodec/aarch64/h264cmc_neon.S b/libavcodec/aarch64/h264cmc_neon.S
index f8e9407854..0add73ffec 100644
--- a/libavcodec/aarch64/h264cmc_neon.S
+++ b/libavcodec/aarch64/h264cmc_neon.S
@@ -26,24 +26,24 @@ 
 /* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
 .macro  h264_chroma_mc8 type, codec=h264
 function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
-  .ifc \type,avg
+.ifc \type,avg
         mov             x8,  x0
-  .endif
+.endif
         prfm            pldl1strm, [x1]
         prfm            pldl1strm, [x1, x2]
-  .ifc \codec,rv40
+.ifc \codec,rv40
         movrel          x6,  rv40bias
         lsr             w9,  w5,  #1
         lsr             w10, w4,  #1
         lsl             w9,  w9,  #3
         lsl             w10, w10, #1
         add             w9,  w9,  w10
-        add             x6,  x6,  w9, UXTW
-        ld1r            {v22.8H}, [x6]
-  .endif
-  .ifc \codec,vc1
-        movi            v22.8H,   #28
-  .endif
+        add             x6,  x6,  w9, uxtw
+        ld1r            {v22.8h}, [x6]
+.endif
+.ifc \codec,vc1
+        movi            v22.8h,   #28
+.endif
         mul             w7,  w4,  w5
         lsl             w14, w5,  #3
         lsl             w13, w4,  #3
@@ -55,139 +55,139 @@  function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
         add             w4,  w4,  #64
         b.eq            2f
 
-        dup             v0.8B,  w4
-        dup             v1.8B,  w12
-        ld1             {v4.8B, v5.8B}, [x1], x2
-        dup             v2.8B,  w6
-        dup             v3.8B,  w7
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-1:      ld1             {v6.8B, v7.8B}, [x1], x2
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v5.8B,  v1.8B
-        ext             v7.8B,  v6.8B,  v7.8B,  #1
-        ld1             {v4.8B, v5.8B}, [x1], x2
-        umlal           v16.8H, v6.8B,  v2.8B
+        dup             v0.8b,  w4
+        dup             v1.8b,  w12
+        ld1             {v4.8b, v5.8b}, [x1], x2
+        dup             v2.8b,  w6
+        dup             v3.8b,  w7
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+1:      ld1             {v6.8b, v7.8b}, [x1], x2
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v5.8b,  v1.8b
+        ext             v7.8b,  v6.8b,  v7.8b,  #1
+        ld1             {v4.8b, v5.8b}, [x1], x2
+        umlal           v16.8h, v6.8b,  v2.8b
         prfm            pldl1strm, [x1]
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-        umlal           v16.8H, v7.8B,  v3.8B
-        umull           v17.8H, v6.8B,  v0.8B
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+        umlal           v16.8h, v7.8b,  v3.8b
+        umull           v17.8h, v6.8b,  v0.8b
         subs            w3,  w3,  #2
-        umlal           v17.8H, v7.8B, v1.8B
-        umlal           v17.8H, v4.8B, v2.8B
-        umlal           v17.8H, v5.8B, v3.8B
+        umlal           v17.8h, v7.8b, v1.8b
+        umlal           v17.8h, v4.8b, v2.8b
+        umlal           v17.8h, v5.8b, v3.8b
         prfm            pldl1strm, [x1, x2]
-  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
-  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
-  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+.ifc \codec,h264
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
+.else
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
+.endif
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
         b.gt            1b
         ret
 
 2:      adds            w12, w12, w6
-        dup             v0.8B, w4
+        dup             v0.8b, w4
         b.eq            5f
         tst             w6,  w6
-        dup             v1.8B, w12
+        dup             v1.8b, w12
         b.eq            4f
 
-        ld1             {v4.8B}, [x1], x2
-3:      ld1             {v6.8B}, [x1], x2
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v6.8B,  v1.8B
-        ld1             {v4.8B}, [x1], x2
-        umull           v17.8H, v6.8B,  v0.8B
-        umlal           v17.8H, v4.8B,  v1.8B
+        ld1             {v4.8b}, [x1], x2
+3:      ld1             {v6.8b}, [x1], x2
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v6.8b,  v1.8b
+        ld1             {v4.8b}, [x1], x2
+        umull           v17.8h, v6.8b,  v0.8b
+        umlal           v17.8h, v4.8b,  v1.8b
         prfm            pldl1strm, [x1]
-  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
-  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
-  .endif
+.ifc \codec,h264
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
+.else
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
+.endif
         prfm            pldl1strm, [x1, x2]
-  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
-  .endif
+.ifc \type,avg
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
+.endif
         subs            w3,  w3,  #2
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
         b.gt            3b
         ret
 
-4:      ld1             {v4.8B, v5.8B}, [x1], x2
-        ld1             {v6.8B, v7.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B,  #1
-        ext             v7.8B,  v6.8B,  v7.8B,  #1
+4:      ld1             {v4.8b, v5.8b}, [x1], x2
+        ld1             {v6.8b, v7.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b,  #1
+        ext             v7.8b,  v6.8b,  v7.8b,  #1
         prfm            pldl1strm, [x1]
         subs            w3,  w3,  #2
-        umull           v16.8H, v4.8B, v0.8B
-        umlal           v16.8H, v5.8B, v1.8B
-        umull           v17.8H, v6.8B, v0.8B
-        umlal           v17.8H, v7.8B, v1.8B
+        umull           v16.8h, v4.8b, v0.8b
+        umlal           v16.8h, v5.8b, v1.8b
+        umull           v17.8h, v6.8b, v0.8b
+        umlal           v17.8h, v7.8b, v1.8b
         prfm            pldl1strm, [x1, x2]
-  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
-  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
-  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+.ifc \codec,h264
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
+.else
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
+.endif
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
         b.gt            4b
         ret
 
-5:      ld1             {v4.8B}, [x1], x2
-        ld1             {v5.8B}, [x1], x2
+5:      ld1             {v4.8b}, [x1], x2
+        ld1             {v5.8b}, [x1], x2
         prfm            pldl1strm, [x1]
         subs            w3,  w3,  #2
-        umull           v16.8H, v4.8B, v0.8B
-        umull           v17.8H, v5.8B, v0.8B
+        umull           v16.8h, v4.8b, v0.8b
+        umull           v17.8h, v5.8b, v0.8b
         prfm            pldl1strm, [x1, x2]
-  .ifc \codec,h264
-        rshrn           v16.8B, v16.8H, #6
-        rshrn           v17.8B, v17.8H, #6
-  .else
-        add             v16.8H, v16.8H, v22.8H
-        add             v17.8H, v17.8H, v22.8H
-        shrn            v16.8B, v16.8H, #6
-        shrn            v17.8B, v17.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.8B}, [x8], x2
-        ld1             {v21.8B}, [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-        urhadd          v17.8B, v17.8B, v21.8B
-  .endif
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
+.ifc \codec,h264
+        rshrn           v16.8b, v16.8h, #6
+        rshrn           v17.8b, v17.8h, #6
+.else
+        add             v16.8h, v16.8h, v22.8h
+        add             v17.8h, v17.8h, v22.8h
+        shrn            v16.8b, v16.8h, #6
+        shrn            v17.8b, v17.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.8b}, [x8], x2
+        ld1             {v21.8b}, [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+        urhadd          v17.8b, v17.8b, v21.8b
+.endif
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
         b.gt            5b
         ret
 endfunc
@@ -196,24 +196,24 @@  endfunc
 /* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
 .macro  h264_chroma_mc4 type, codec=h264
 function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
-  .ifc \type,avg
+.ifc \type,avg
         mov             x8,  x0
-  .endif
+.endif
         prfm            pldl1strm, [x1]
         prfm            pldl1strm, [x1, x2]
-  .ifc \codec,rv40
+.ifc \codec,rv40
         movrel          x6,  rv40bias
         lsr             w9,  w5,  #1
         lsr             w10, w4,  #1
         lsl             w9,  w9,  #3
         lsl             w10, w10, #1
         add             w9,  w9,  w10
-        add             x6,  x6,  w9, UXTW
-        ld1r            {v22.8H}, [x6]
-  .endif
-  .ifc \codec,vc1
-        movi            v22.8H,   #28
-  .endif
+        add             x6,  x6,  w9, uxtw
+        ld1r            {v22.8h}, [x6]
+.endif
+.ifc \codec,vc1
+        movi            v22.8h,   #28
+.endif
         mul             w7,  w4,  w5
         lsl             w14, w5,  #3
         lsl             w13, w4,  #3
@@ -225,133 +225,133 @@  function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
         add             w4,  w4,  #64
         b.eq            2f
 
-        dup             v24.8B,  w4
-        dup             v25.8B,  w12
-        ld1             {v4.8B}, [x1], x2
-        dup             v26.8B,  w6
-        dup             v27.8B,  w7
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        trn1            v0.2S,  v24.2S, v25.2S
-        trn1            v2.2S,  v26.2S, v27.2S
-        trn1            v4.2S,  v4.2S,  v5.2S
-1:      ld1             {v6.8B}, [x1], x2
-        ext             v7.8B,  v6.8B,  v7.8B, #1
-        trn1            v6.2S,  v6.2S,  v7.2S
-        umull           v18.8H, v4.8B,  v0.8B
-        umlal           v18.8H, v6.8B,  v2.8B
-        ld1             {v4.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        trn1            v4.2S,  v4.2S,  v5.2S
+        dup             v24.8b,  w4
+        dup             v25.8b,  w12
+        ld1             {v4.8b}, [x1], x2
+        dup             v26.8b,  w6
+        dup             v27.8b,  w7
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        trn1            v0.2s,  v24.2s, v25.2s
+        trn1            v2.2s,  v26.2s, v27.2s
+        trn1            v4.2s,  v4.2s,  v5.2s
+1:      ld1             {v6.8b}, [x1], x2
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        trn1            v6.2s,  v6.2s,  v7.2s
+        umull           v18.8h, v4.8b,  v0.8b
+        umlal           v18.8h, v6.8b,  v2.8b
+        ld1             {v4.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        trn1            v4.2s,  v4.2s,  v5.2s
         prfm            pldl1strm, [x1]
-        umull           v19.8H, v6.8B,  v0.8B
-        umlal           v19.8H, v4.8B,  v2.8B
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
-  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
-  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
-  .endif
+        umull           v19.8h, v6.8b,  v0.8b
+        umlal           v19.8h, v4.8b,  v2.8b
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
+.ifc \codec,h264
+        rshrn           v16.8b, v18.8h, #6
+.else
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
+.endif
         subs            w3,  w3,  #2
         prfm            pldl1strm, [x1, x2]
-  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-  .endif
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+.ifc \type,avg
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+.endif
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
         b.gt            1b
         ret
 
 2:      adds            w12, w12, w6
-        dup             v30.8B, w4
+        dup             v30.8b, w4
         b.eq            5f
         tst             w6,  w6
-        dup             v31.8B, w12
-        trn1            v0.2S,  v30.2S, v31.2S
-        trn2            v1.2S,  v30.2S, v31.2S
+        dup             v31.8b, w12
+        trn1            v0.2s,  v30.2s, v31.2s
+        trn2            v1.2s,  v30.2s, v31.2s
         b.eq            4f
 
-        ext             v1.8B,  v0.8B,  v1.8B, #4
-        ld1             {v4.S}[0], [x1], x2
-3:      ld1             {v4.S}[1], [x1], x2
-        umull           v18.8H, v4.8B,  v0.8B
-        ld1             {v4.S}[0], [x1], x2
-        umull           v19.8H, v4.8B,  v1.8B
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
+        ext             v1.8b,  v0.8b,  v1.8b, #4
+        ld1             {v4.s}[0], [x1], x2
+3:      ld1             {v4.s}[1], [x1], x2
+        umull           v18.8h, v4.8b,  v0.8b
+        ld1             {v4.s}[0], [x1], x2
+        umull           v19.8h, v4.8b,  v1.8b
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
         prfm            pldl1strm, [x1]
-  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
-  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-  .endif
+.ifc \codec,h264
+        rshrn           v16.8b, v18.8h, #6
+.else
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+.endif
         subs            w3,  w3,  #2
         prfm            pldl1strm, [x1, x2]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
         b.gt            3b
         ret
 
-4:      ld1             {v4.8B}, [x1], x2
-        ld1             {v6.8B}, [x1], x2
-        ext             v5.8B,  v4.8B,  v5.8B, #1
-        ext             v7.8B,  v6.8B,  v7.8B, #1
-        trn1            v4.2S,  v4.2S,  v5.2S
-        trn1            v6.2S,  v6.2S,  v7.2S
-        umull           v18.8H, v4.8B,  v0.8B
-        umull           v19.8H, v6.8B,  v0.8B
+4:      ld1             {v4.8b}, [x1], x2
+        ld1             {v6.8b}, [x1], x2
+        ext             v5.8b,  v4.8b,  v5.8b, #1
+        ext             v7.8b,  v6.8b,  v7.8b, #1
+        trn1            v4.2s,  v4.2s,  v5.2s
+        trn1            v6.2s,  v6.2s,  v7.2s
+        umull           v18.8h, v4.8b,  v0.8b
+        umull           v19.8h, v6.8b,  v0.8b
         subs            w3,  w3,  #2
-        trn1            v30.2D, v18.2D, v19.2D
-        trn2            v31.2D, v18.2D, v19.2D
-        add             v18.8H, v30.8H, v31.8H
+        trn1            v30.2d, v18.2d, v19.2d
+        trn2            v31.2d, v18.2d, v19.2d
+        add             v18.8h, v30.8h, v31.8h
         prfm            pldl1strm, [x1]
-  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
-  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-  .endif
+.ifc \codec,h264
+        rshrn           v16.8b, v18.8h, #6
+.else
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+.endif
         prfm            pldl1strm, [x1]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
         b.gt            4b
         ret
 
-5:      ld1             {v4.S}[0], [x1], x2
-        ld1             {v4.S}[1], [x1], x2
-        umull           v18.8H, v4.8B,  v30.8B
+5:      ld1             {v4.s}[0], [x1], x2
+        ld1             {v4.s}[1], [x1], x2
+        umull           v18.8h, v4.8b,  v30.8b
         subs            w3,  w3,  #2
         prfm            pldl1strm, [x1]
-  .ifc \codec,h264
-        rshrn           v16.8B, v18.8H, #6
-  .else
-        add             v18.8H, v18.8H, v22.8H
-        shrn            v16.8B, v18.8H, #6
-  .endif
-  .ifc \type,avg
-        ld1             {v20.S}[0], [x8], x2
-        ld1             {v20.S}[1], [x8], x2
-        urhadd          v16.8B, v16.8B, v20.8B
-  .endif
+.ifc \codec,h264
+        rshrn           v16.8b, v18.8h, #6
+.else
+        add             v18.8h, v18.8h, v22.8h
+        shrn            v16.8b, v18.8h, #6
+.endif
+.ifc \type,avg
+        ld1             {v20.s}[0], [x8], x2
+        ld1             {v20.s}[1], [x8], x2
+        urhadd          v16.8b, v16.8b, v20.8b
+.endif
         prfm            pldl1strm, [x1]
-        st1             {v16.S}[0], [x0], x2
-        st1             {v16.S}[1], [x0], x2
+        st1             {v16.s}[0], [x0], x2
+        st1             {v16.s}[1], [x0], x2
         b.gt            5b
         ret
 endfunc
@@ -372,51 +372,49 @@  function ff_\type\()_h264_chroma_mc2_neon, export=1
         sub             w4,  w7,  w13
         sub             w4,  w4,  w14
         add             w4,  w4,  #64
-        dup             v0.8B,  w4
-        dup             v2.8B,  w12
-        dup             v1.8B,  w6
-        dup             v3.8B,  w7
-        trn1            v0.4H,  v0.4H,  v2.4H
-        trn1            v1.4H,  v1.4H,  v3.4H
-1:
-        ld1             {v4.S}[0],  [x1], x2
-        ld1             {v4.S}[1],  [x1], x2
-        rev64           v5.2S,  v4.2S
-        ld1             {v5.S}[1],  [x1]
-        ext             v6.8B,  v4.8B,  v5.8B,  #1
-        ext             v7.8B,  v5.8B,  v4.8B,  #1
-        trn1            v4.4H,  v4.4H,  v6.4H
-        trn1            v5.4H,  v5.4H,  v7.4H
-        umull           v16.8H, v4.8B,  v0.8B
-        umlal           v16.8H, v5.8B,  v1.8B
-  .ifc \type,avg
-        ld1             {v18.H}[0], [x0], x2
-        ld1             {v18.H}[2], [x0]
+        dup             v0.8b,  w4
+        dup             v2.8b,  w12
+        dup             v1.8b,  w6
+        dup             v3.8b,  w7
+        trn1            v0.4h,  v0.4h,  v2.4h
+        trn1            v1.4h,  v1.4h,  v3.4h
+1:      ld1             {v4.s}[0],  [x1], x2
+        ld1             {v4.s}[1],  [x1], x2
+        rev64           v5.2s,  v4.2s
+        ld1             {v5.s}[1],  [x1]
+        ext             v6.8b,  v4.8b,  v5.8b,  #1
+        ext             v7.8b,  v5.8b,  v4.8b,  #1
+        trn1            v4.4h,  v4.4h,  v6.4h
+        trn1            v5.4h,  v5.4h,  v7.4h
+        umull           v16.8h, v4.8b,  v0.8b
+        umlal           v16.8h, v5.8b,  v1.8b
+.ifc \type,avg
+        ld1             {v18.h}[0], [x0], x2
+        ld1             {v18.h}[2], [x0]
         sub             x0,  x0,  x2
-  .endif
-        rev64           v17.4S, v16.4S
-        add             v16.8H, v16.8H, v17.8H
-        rshrn           v16.8B, v16.8H, #6
-  .ifc \type,avg
-        urhadd          v16.8B, v16.8B, v18.8B
-  .endif
-        st1             {v16.H}[0], [x0], x2
-        st1             {v16.H}[2], [x0], x2
+.endif
+        rev64           v17.4s, v16.4s
+        add             v16.8h, v16.8h, v17.8h
+        rshrn           v16.8b, v16.8h, #6
+.ifc \type,avg
+        urhadd          v16.8b, v16.8b, v18.8b
+.endif
+        st1             {v16.h}[0], [x0], x2
+        st1             {v16.h}[2], [x0], x2
         subs            w3,  w3,  #2
         b.gt            1b
         ret
 
-2:
-        ld1             {v16.H}[0], [x1], x2
-        ld1             {v16.H}[1], [x1], x2
-  .ifc \type,avg
-        ld1             {v18.H}[0], [x0], x2
-        ld1             {v18.H}[1], [x0]
+2:      ld1             {v16.h}[0], [x1], x2
+        ld1             {v16.h}[1], [x1], x2
+.ifc \type,avg
+        ld1             {v18.h}[0], [x0], x2
+        ld1             {v18.h}[1], [x0]
         sub             x0,  x0,  x2
-        urhadd          v16.8B, v16.8B, v18.8B
-  .endif
-        st1             {v16.H}[0], [x0], x2
-        st1             {v16.H}[1], [x0], x2
+        urhadd          v16.8b, v16.8b, v18.8b
+.endif
+        st1             {v16.h}[0], [x0], x2
+        st1             {v16.h}[1], [x0], x2
         subs            w3,  w3,  #2
         b.gt            2b
         ret
@@ -431,10 +429,10 @@  endfunc
         h264_chroma_mc2 avg
 
 #if CONFIG_RV40_DECODER
-const   rv40bias
-        .short           0, 16, 32, 16
+const rv40bias
+        .short          0, 16, 32, 16
         .short          32, 28, 32, 28
-        .short           0, 32, 16, 32
+        .short          0, 32, 16, 32
         .short          32, 28, 32, 28
 endconst
 
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index ea221e6862..926c6e8362 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -27,130 +27,128 @@ 
         cmp             w2,  #0
         ldr             w6,  [x4]
         ccmp            w3,  #0, #0, ne
-        mov             v24.S[0], w6
+        mov             v24.s[0], w6
         and             w8,  w6,  w6,  lsl #16
         b.eq            1f
         ands            w8,  w8,  w8,  lsl #8
         b.ge            2f
-1:
-        ret
+1:      ret
 2:
 .endm
 
 .macro  h264_loop_filter_luma
-        dup             v22.16B, w2                     // alpha
-        uxtl            v24.8H,  v24.8B
-        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
-        uxtl            v24.4S,  v24.4H
-        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
-        sli             v24.8H,  v24.8H,  #8
-        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
-        sli             v24.4S,  v24.4S,  #16
-        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
-        dup             v22.16B, w3                     // beta
-        cmlt            v23.16B, v24.16B, #0
-        cmhi            v28.16B, v22.16B, v28.16B       // < beta
-        cmhi            v30.16B, v22.16B, v30.16B       // < beta
-        bic             v21.16B, v21.16B, v23.16B
-        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
-        and             v21.16B, v21.16B, v28.16B
-        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
-        and             v21.16B, v21.16B, v30.16B      // < beta
+        dup             v22.16b, w2                     // alpha
+        uxtl            v24.8h,  v24.8b
+        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
+        uxtl            v24.4s,  v24.4h
+        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
+        sli             v24.8h,  v24.8h,  #8
+        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
+        sli             v24.4s,  v24.4s,  #16
+        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
+        dup             v22.16b, w3                     // beta
+        cmlt            v23.16b, v24.16b, #0
+        cmhi            v28.16b, v22.16b, v28.16b       // < beta
+        cmhi            v30.16b, v22.16b, v30.16b       // < beta
+        bic             v21.16b, v21.16b, v23.16b
+        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
+        and             v21.16b, v21.16b, v28.16b
+        uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
+        and             v21.16b, v21.16b, v30.16b      // < beta
         shrn            v30.8b,  v21.8h,  #4
         mov             x7, v30.d[0]
-        cmhi            v17.16B, v22.16B, v17.16B       // < beta
-        cmhi            v19.16B, v22.16B, v19.16B       // < beta
+        cmhi            v17.16b, v22.16b, v17.16b       // < beta
+        cmhi            v19.16b, v22.16b, v19.16b       // < beta
         cbz             x7,  9f
-        and             v17.16B, v17.16B, v21.16B
-        and             v19.16B, v19.16B, v21.16B
-        and             v24.16B, v24.16B, v21.16B
-        urhadd          v28.16B, v16.16B,  v0.16B
-        sub             v21.16B, v24.16B, v17.16B
-        uqadd           v23.16B, v18.16B, v24.16B
-        uhadd           v20.16B, v20.16B, v28.16B
-        sub             v21.16B, v21.16B, v19.16B
-        uhadd           v28.16B,  v4.16B, v28.16B
-        umin            v23.16B, v23.16B, v20.16B
-        uqsub           v22.16B, v18.16B, v24.16B
-        uqadd           v4.16B,   v2.16B, v24.16B
-        umax            v23.16B, v23.16B, v22.16B
-        uqsub           v22.16B,  v2.16B, v24.16B
-        umin            v28.16B,  v4.16B, v28.16B
-        uxtl            v4.8H,    v0.8B
-        umax            v28.16B, v28.16B, v22.16B
-        uxtl2           v20.8H,   v0.16B
-        usubw           v4.8H,    v4.8H,  v16.8B
-        usubw2          v20.8H,  v20.8H,  v16.16B
-        shl             v4.8H,    v4.8H,  #2
-        shl             v20.8H,  v20.8H,  #2
-        uaddw           v4.8H,    v4.8H,  v18.8B
-        uaddw2          v20.8H,  v20.8H,  v18.16B
-        usubw           v4.8H,    v4.8H,   v2.8B
-        usubw2          v20.8H,  v20.8H,   v2.16B
-        rshrn           v4.8B,    v4.8H,  #3
-        rshrn2          v4.16B,  v20.8H,  #3
-        bsl             v17.16B, v23.16B, v18.16B
-        bsl             v19.16B, v28.16B,  v2.16B
-        neg             v23.16B, v21.16B
-        uxtl            v28.8H,  v16.8B
-        smin            v4.16B,   v4.16B, v21.16B
-        uxtl2           v21.8H,  v16.16B
-        smax            v4.16B,   v4.16B, v23.16B
-        uxtl            v22.8H,   v0.8B
-        uxtl2           v24.8H,   v0.16B
-        saddw           v28.8H,  v28.8H,  v4.8B
-        saddw2          v21.8H,  v21.8H,  v4.16B
-        ssubw           v22.8H,  v22.8H,  v4.8B
-        ssubw2          v24.8H,  v24.8H,  v4.16B
-        sqxtun          v16.8B,  v28.8H
-        sqxtun2         v16.16B, v21.8H
-        sqxtun          v0.8B,   v22.8H
-        sqxtun2         v0.16B,  v24.8H
+        and             v17.16b, v17.16b, v21.16b
+        and             v19.16b, v19.16b, v21.16b
+        and             v24.16b, v24.16b, v21.16b
+        urhadd          v28.16b, v16.16b,  v0.16b
+        sub             v21.16b, v24.16b, v17.16b
+        uqadd           v23.16b, v18.16b, v24.16b
+        uhadd           v20.16b, v20.16b, v28.16b
+        sub             v21.16b, v21.16b, v19.16b
+        uhadd           v28.16b,  v4.16b, v28.16b
+        umin            v23.16b, v23.16b, v20.16b
+        uqsub           v22.16b, v18.16b, v24.16b
+        uqadd           v4.16b,   v2.16b, v24.16b
+        umax            v23.16b, v23.16b, v22.16b
+        uqsub           v22.16b,  v2.16b, v24.16b
+        umin            v28.16b,  v4.16b, v28.16b
+        uxtl            v4.8h,    v0.8b
+        umax            v28.16b, v28.16b, v22.16b
+        uxtl2           v20.8h,   v0.16b
+        usubw           v4.8h,    v4.8h,  v16.8b
+        usubw2          v20.8h,  v20.8h,  v16.16b
+        shl             v4.8h,    v4.8h,  #2
+        shl             v20.8h,  v20.8h,  #2
+        uaddw           v4.8h,    v4.8h,  v18.8b
+        uaddw2          v20.8h,  v20.8h,  v18.16b
+        usubw           v4.8h,    v4.8h,   v2.8b
+        usubw2          v20.8h,  v20.8h,   v2.16b
+        rshrn           v4.8b,    v4.8h,  #3
+        rshrn2          v4.16b,  v20.8h,  #3
+        bsl             v17.16b, v23.16b, v18.16b
+        bsl             v19.16b, v28.16b,  v2.16b
+        neg             v23.16b, v21.16b
+        uxtl            v28.8h,  v16.8b
+        smin            v4.16b,   v4.16b, v21.16b
+        uxtl2           v21.8h,  v16.16b
+        smax            v4.16b,   v4.16b, v23.16b
+        uxtl            v22.8h,   v0.8b
+        uxtl2           v24.8h,   v0.16b
+        saddw           v28.8h,  v28.8h,  v4.8b
+        saddw2          v21.8h,  v21.8h,  v4.16b
+        ssubw           v22.8h,  v22.8h,  v4.8b
+        ssubw2          v24.8h,  v24.8h,  v4.16b
+        sqxtun          v16.8b,  v28.8h
+        sqxtun2         v16.16b, v21.8h
+        sqxtun          v0.8b,   v22.8h
+        sqxtun2         v0.16b,  v24.8h
 .endm
 
 function ff_h264_v_loop_filter_luma_neon, export=1
         h264_loop_filter_start
 
-        ld1             {v0.16B},  [x0], x1
-        ld1             {v2.16B},  [x0], x1
-        ld1             {v4.16B},  [x0], x1
+        ld1             {v0.16b},  [x0], x1
+        ld1             {v2.16b},  [x0], x1
+        ld1             {v4.16b},  [x0], x1
         sub             x0,  x0,  x1, lsl #2
         sub             x0,  x0,  x1, lsl #1
-        ld1             {v20.16B},  [x0], x1
-        ld1             {v18.16B},  [x0], x1
-        ld1             {v16.16B},  [x0], x1
+        ld1             {v20.16b},  [x0], x1
+        ld1             {v18.16b},  [x0], x1
+        ld1             {v16.16b},  [x0], x1
 
         h264_loop_filter_luma
 
         sub             x0,  x0,  x1, lsl #1
-        st1             {v17.16B},  [x0], x1
-        st1             {v16.16B}, [x0], x1
-        st1             {v0.16B},  [x0], x1
-        st1             {v19.16B}, [x0]
-9:
-        ret
+        st1             {v17.16b},  [x0], x1
+        st1             {v16.16b}, [x0], x1
+        st1             {v0.16b},  [x0], x1
+        st1             {v19.16b}, [x0]
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_luma_neon, export=1
         h264_loop_filter_start
 
         sub             x0,  x0,  #4
-        ld1             {v6.8B},  [x0], x1
-        ld1             {v20.8B}, [x0], x1
-        ld1             {v18.8B}, [x0], x1
-        ld1             {v16.8B}, [x0], x1
-        ld1             {v0.8B},  [x0], x1
-        ld1             {v2.8B},  [x0], x1
-        ld1             {v4.8B},  [x0], x1
-        ld1             {v26.8B}, [x0], x1
-        ld1             {v6.D}[1],  [x0], x1
-        ld1             {v20.D}[1], [x0], x1
-        ld1             {v18.D}[1], [x0], x1
-        ld1             {v16.D}[1], [x0], x1
-        ld1             {v0.D}[1],  [x0], x1
-        ld1             {v2.D}[1],  [x0], x1
-        ld1             {v4.D}[1],  [x0], x1
-        ld1             {v26.D}[1], [x0], x1
+        ld1             {v6.8b},  [x0], x1
+        ld1             {v20.8b}, [x0], x1
+        ld1             {v18.8b}, [x0], x1
+        ld1             {v16.8b}, [x0], x1
+        ld1             {v0.8b},  [x0], x1
+        ld1             {v2.8b},  [x0], x1
+        ld1             {v4.8b},  [x0], x1
+        ld1             {v26.8b}, [x0], x1
+        ld1             {v6.d}[1],  [x0], x1
+        ld1             {v20.d}[1], [x0], x1
+        ld1             {v18.d}[1], [x0], x1
+        ld1             {v16.d}[1], [x0], x1
+        ld1             {v0.d}[1],  [x0], x1
+        ld1             {v2.d}[1],  [x0], x1
+        ld1             {v4.d}[1],  [x0], x1
+        ld1             {v26.d}[1], [x0], x1
 
         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
 
@@ -160,24 +158,23 @@  function ff_h264_h_loop_filter_luma_neon, export=1
 
         sub             x0,  x0,  x1, lsl #4
         add             x0,  x0,  #2
-        st1             {v17.S}[0],  [x0], x1
-        st1             {v16.S}[0], [x0], x1
-        st1             {v0.S}[0],  [x0], x1
-        st1             {v19.S}[0], [x0], x1
-        st1             {v17.S}[1],  [x0], x1
-        st1             {v16.S}[1], [x0], x1
-        st1             {v0.S}[1],  [x0], x1
-        st1             {v19.S}[1], [x0], x1
-        st1             {v17.S}[2],  [x0], x1
-        st1             {v16.S}[2], [x0], x1
-        st1             {v0.S}[2],  [x0], x1
-        st1             {v19.S}[2], [x0], x1
-        st1             {v17.S}[3],  [x0], x1
-        st1             {v16.S}[3], [x0], x1
-        st1             {v0.S}[3],  [x0], x1
-        st1             {v19.S}[3], [x0], x1
-9:
-        ret
+        st1             {v17.s}[0],  [x0], x1
+        st1             {v16.s}[0], [x0], x1
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v19.s}[0], [x0], x1
+        st1             {v17.s}[1],  [x0], x1
+        st1             {v16.s}[1], [x0], x1
+        st1             {v0.s}[1],  [x0], x1
+        st1             {v19.s}[1], [x0], x1
+        st1             {v17.s}[2],  [x0], x1
+        st1             {v16.s}[2], [x0], x1
+        st1             {v0.s}[2],  [x0], x1
+        st1             {v19.s}[2], [x0], x1
+        st1             {v17.s}[3],  [x0], x1
+        st1             {v16.s}[3], [x0], x1
+        st1             {v0.s}[3],  [x0], x1
+        st1             {v19.s}[3], [x0], x1
+9:      ret
 endfunc
 
 
@@ -185,8 +182,7 @@  endfunc
         orr             w4,  w2,  w3
         cbnz            w4,  1f
         ret
-1:
-        dup             v30.16b, w2                // alpha
+1:      dup             v30.16b, w2                // alpha
         dup             v31.16b, w3                // beta
 .endm
 
@@ -324,8 +320,7 @@  function ff_h264_v_loop_filter_luma_intra_neon, export=1
         st1             {v0.16b}, [x0], x1  // q0
         st1             {v1.16b}, [x0], x1  // q1
         st1             {v2.16b}, [x0]      // q2
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_luma_intra_neon, export=1
@@ -372,59 +367,57 @@  function ff_h264_h_loop_filter_luma_intra_neon, export=1
         st1             {v1.d}[1],  [x0], x1
         st1             {v2.d}[1],  [x0], x1
         st1             {v3.d}[1],  [x0], x1
-9:
-        ret
+9:      ret
 endfunc
 
 .macro  h264_loop_filter_chroma
-        dup             v22.8B, w2              // alpha
-        dup             v23.8B, w3              // beta
-        uxtl            v24.8H, v24.8B
-        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
-        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
-        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
-        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
-        cmhi            v28.8B, v23.8B, v28.8B  // < beta
-        cmhi            v30.8B, v23.8B, v30.8B  // < beta
-        uxtl            v4.8H,  v0.8B
-        and             v26.8B, v26.8B, v28.8B
-        usubw           v4.8H,  v4.8H,  v16.8B
-        and             v26.8B, v26.8B, v30.8B
-        shl             v4.8H,  v4.8H,  #2
+        dup             v22.8b, w2              // alpha
+        dup             v23.8b, w3              // beta
+        uxtl            v24.8h, v24.8b
+        uabd            v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
+        uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
+        uabd            v30.8b, v2.8b,  v0.8b   // abs(q1 - q0)
+        cmhi            v26.8b, v22.8b, v26.8b  // < alpha
+        cmhi            v28.8b, v23.8b, v28.8b  // < beta
+        cmhi            v30.8b, v23.8b, v30.8b  // < beta
+        uxtl            v4.8h,  v0.8b
+        and             v26.8b, v26.8b, v28.8b
+        usubw           v4.8h,  v4.8h,  v16.8b
+        and             v26.8b, v26.8b, v30.8b
+        shl             v4.8h,  v4.8h,  #2
         mov             x8,  v26.d[0]
-        sli             v24.8H, v24.8H, #8
-        uaddw           v4.8H,  v4.8H,  v18.8B
+        sli             v24.8h, v24.8h, #8
+        uaddw           v4.8h,  v4.8h,  v18.8b
         cbz             x8,  9f
-        usubw           v4.8H,  v4.8H,  v2.8B
-        rshrn           v4.8B,  v4.8H,  #3
-        smin            v4.8B,  v4.8B,  v24.8B
-        neg             v25.8B, v24.8B
-        smax            v4.8B,  v4.8B,  v25.8B
-        uxtl            v22.8H, v0.8B
-        and             v4.8B,  v4.8B,  v26.8B
-        uxtl            v28.8H, v16.8B
-        saddw           v28.8H, v28.8H, v4.8B
-        ssubw           v22.8H, v22.8H, v4.8B
-        sqxtun          v16.8B, v28.8H
-        sqxtun          v0.8B,  v22.8H
+        usubw           v4.8h,  v4.8h,  v2.8b
+        rshrn           v4.8b,  v4.8h,  #3
+        smin            v4.8b,  v4.8b,  v24.8b
+        neg             v25.8b, v24.8b
+        smax            v4.8b,  v4.8b,  v25.8b
+        uxtl            v22.8h, v0.8b
+        and             v4.8b,  v4.8b,  v26.8b
+        uxtl            v28.8h, v16.8b
+        saddw           v28.8h, v28.8h, v4.8b
+        ssubw           v22.8h, v22.8h, v4.8b
+        sqxtun          v16.8b, v28.8h
+        sqxtun          v0.8b,  v22.8h
 .endm
 
 function ff_h264_v_loop_filter_chroma_neon, export=1
         h264_loop_filter_start
 
         sub             x0,  x0,  x1, lsl #1
-        ld1             {v18.8B}, [x0], x1
-        ld1             {v16.8B}, [x0], x1
-        ld1             {v0.8B},  [x0], x1
-        ld1             {v2.8B},  [x0]
+        ld1             {v18.8b}, [x0], x1
+        ld1             {v16.8b}, [x0], x1
+        ld1             {v0.8b},  [x0], x1
+        ld1             {v2.8b},  [x0]
 
         h264_loop_filter_chroma
 
         sub             x0,  x0,  x1, lsl #1
-        st1             {v16.8B}, [x0], x1
-        st1             {v0.8B},  [x0], x1
-9:
-        ret
+        st1             {v16.8b}, [x0], x1
+        st1             {v0.8b},  [x0], x1
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_neon, export=1
@@ -432,14 +425,14 @@  function ff_h264_h_loop_filter_chroma_neon, export=1
 
         sub             x0,  x0,  #2
 h_loop_filter_chroma420:
-        ld1             {v18.S}[0], [x0], x1
-        ld1             {v16.S}[0], [x0], x1
-        ld1             {v0.S}[0],  [x0], x1
-        ld1             {v2.S}[0],  [x0], x1
-        ld1             {v18.S}[1], [x0], x1
-        ld1             {v16.S}[1], [x0], x1
-        ld1             {v0.S}[1],  [x0], x1
-        ld1             {v2.S}[1],  [x0], x1
+        ld1             {v18.s}[0], [x0], x1
+        ld1             {v16.s}[0], [x0], x1
+        ld1             {v0.s}[0],  [x0], x1
+        ld1             {v2.s}[0],  [x0], x1
+        ld1             {v18.s}[1], [x0], x1
+        ld1             {v16.s}[1], [x0], x1
+        ld1             {v0.s}[1],  [x0], x1
+        ld1             {v2.s}[1],  [x0], x1
 
         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
 
@@ -448,16 +441,15 @@  h_loop_filter_chroma420:
         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
 
         sub             x0,  x0,  x1, lsl #3
-        st1             {v18.S}[0], [x0], x1
-        st1             {v16.S}[0], [x0], x1
-        st1             {v0.S}[0],  [x0], x1
-        st1             {v2.S}[0],  [x0], x1
-        st1             {v18.S}[1], [x0], x1
-        st1             {v16.S}[1], [x0], x1
-        st1             {v0.S}[1],  [x0], x1
-        st1             {v2.S}[1],  [x0], x1
-9:
-        ret
+        st1             {v18.s}[0], [x0], x1
+        st1             {v16.s}[0], [x0], x1
+        st1             {v0.s}[0],  [x0], x1
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v18.s}[1], [x0], x1
+        st1             {v16.s}[1], [x0], x1
+        st1             {v0.s}[1],  [x0], x1
+        st1             {v2.s}[1],  [x0], x1
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma422_neon, export=1
@@ -512,8 +504,7 @@  function ff_h264_v_loop_filter_chroma_intra_neon, export=1
         st1             {v16.8b}, [x0], x1
         st1             {v17.8b}, [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
@@ -526,7 +517,7 @@  function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
         ld1             {v17.8b}, [x4], x1
         ld1             {v19.8b}, [x4], x1
 
-        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
 
         h264_loop_filter_chroma_intra
 
@@ -535,8 +526,7 @@  function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
         st2             {v16.b,v17.b}[2], [x0], x1
         st2             {v16.b,v17.b}[3], [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
@@ -554,7 +544,7 @@  h_loop_filter_chroma420_intra:
         ld1             {v17.s}[1], [x4], x1
         ld1             {v19.s}[1], [x4], x1
 
-        transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
 
         h264_loop_filter_chroma_intra
 
@@ -567,8 +557,7 @@  h_loop_filter_chroma420_intra:
         st2             {v16.b,v17.b}[6], [x0], x1
         st2             {v16.b,v17.b}[7], [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
@@ -584,102 +573,102 @@  function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
 endfunc
 
 .macro  biweight_16     macs, macd
-        dup             v0.16B,  w5
-        dup             v1.16B,  w6
-        mov             v4.16B,  v16.16B
-        mov             v6.16B,  v16.16B
+        dup             v0.16b,  w5
+        dup             v1.16b,  w6
+        mov             v4.16b,  v16.16b
+        mov             v6.16b,  v16.16b
 1:      subs            w3,  w3,  #2
-        ld1             {v20.16B}, [x0], x2
-        \macd           v4.8H,   v0.8B,  v20.8B
-        \macd\()2       v6.8H,   v0.16B, v20.16B
-        ld1             {v22.16B}, [x1], x2
-        \macs           v4.8H,   v1.8B,  v22.8B
-        \macs\()2       v6.8H,   v1.16B, v22.16B
-        mov             v24.16B, v16.16B
-        ld1             {v28.16B}, [x0], x2
-        mov             v26.16B, v16.16B
-        \macd           v24.8H,  v0.8B,  v28.8B
-        \macd\()2       v26.8H,  v0.16B, v28.16B
-        ld1             {v30.16B}, [x1], x2
-        \macs           v24.8H,  v1.8B,  v30.8B
-        \macs\()2       v26.8H,  v1.16B, v30.16B
-        sshl            v4.8H,   v4.8H,  v18.8H
-        sshl            v6.8H,   v6.8H,  v18.8H
-        sqxtun          v4.8B,   v4.8H
-        sqxtun2         v4.16B,  v6.8H
-        sshl            v24.8H,  v24.8H, v18.8H
-        sshl            v26.8H,  v26.8H, v18.8H
-        sqxtun          v24.8B,  v24.8H
-        sqxtun2         v24.16B, v26.8H
-        mov             v6.16B,  v16.16B
-        st1             {v4.16B},  [x7], x2
-        mov             v4.16B,  v16.16B
-        st1             {v24.16B}, [x7], x2
+        ld1             {v20.16b}, [x0], x2
+        \macd           v4.8h,   v0.8b,  v20.8b
+        \macd\()2       v6.8h,   v0.16b, v20.16b
+        ld1             {v22.16b}, [x1], x2
+        \macs           v4.8h,   v1.8b,  v22.8b
+        \macs\()2       v6.8h,   v1.16b, v22.16b
+        mov             v24.16b, v16.16b
+        ld1             {v28.16b}, [x0], x2
+        mov             v26.16b, v16.16b
+        \macd           v24.8h,  v0.8b,  v28.8b
+        \macd\()2       v26.8h,  v0.16b, v28.16b
+        ld1             {v30.16b}, [x1], x2
+        \macs           v24.8h,  v1.8b,  v30.8b
+        \macs\()2       v26.8h,  v1.16b, v30.16b
+        sshl            v4.8h,   v4.8h,  v18.8h
+        sshl            v6.8h,   v6.8h,  v18.8h
+        sqxtun          v4.8b,   v4.8h
+        sqxtun2         v4.16b,  v6.8h
+        sshl            v24.8h,  v24.8h, v18.8h
+        sshl            v26.8h,  v26.8h, v18.8h
+        sqxtun          v24.8b,  v24.8h
+        sqxtun2         v24.16b, v26.8h
+        mov             v6.16b,  v16.16b
+        st1             {v4.16b},  [x7], x2
+        mov             v4.16b,  v16.16b
+        st1             {v24.16b}, [x7], x2
         b.ne            1b
         ret
 .endm
 
 .macro  biweight_8      macs, macd
-        dup             v0.8B,  w5
-        dup             v1.8B,  w6
-        mov             v2.16B,  v16.16B
-        mov             v20.16B, v16.16B
+        dup             v0.8b,  w5
+        dup             v1.8b,  w6
+        mov             v2.16b,  v16.16b
+        mov             v20.16b, v16.16b
 1:      subs            w3,  w3,  #2
-        ld1             {v4.8B}, [x0], x2
-        \macd           v2.8H,  v0.8B,  v4.8B
-        ld1             {v5.8B}, [x1], x2
-        \macs           v2.8H,  v1.8B,  v5.8B
-        ld1             {v6.8B}, [x0], x2
-        \macd           v20.8H, v0.8B,  v6.8B
-        ld1             {v7.8B}, [x1], x2
-        \macs           v20.8H, v1.8B,  v7.8B
-        sshl            v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        sshl            v20.8H, v20.8H, v18.8H
-        sqxtun          v4.8B,  v20.8H
-        mov             v20.16B, v16.16B
-        st1             {v2.8B}, [x7], x2
-        mov             v2.16B,  v16.16B
-        st1             {v4.8B}, [x7], x2
+        ld1             {v4.8b}, [x0], x2
+        \macd           v2.8h,  v0.8b,  v4.8b
+        ld1             {v5.8b}, [x1], x2
+        \macs           v2.8h,  v1.8b,  v5.8b
+        ld1             {v6.8b}, [x0], x2
+        \macd           v20.8h, v0.8b,  v6.8b
+        ld1             {v7.8b}, [x1], x2
+        \macs           v20.8h, v1.8b,  v7.8b
+        sshl            v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        sshl            v20.8h, v20.8h, v18.8h
+        sqxtun          v4.8b,  v20.8h
+        mov             v20.16b, v16.16b
+        st1             {v2.8b}, [x7], x2
+        mov             v2.16b,  v16.16b
+        st1             {v4.8b}, [x7], x2
         b.ne            1b
         ret
 .endm
 
 .macro  biweight_4      macs, macd
-        dup             v0.8B,  w5
-        dup             v1.8B,  w6
-        mov             v2.16B, v16.16B
-        mov             v20.16B,v16.16B
+        dup             v0.8b,  w5
+        dup             v1.8b,  w6
+        mov             v2.16b,  v16.16b
+        mov             v20.16b, v16.16b
 1:      subs            w3,  w3,  #4
-        ld1             {v4.S}[0], [x0], x2
-        ld1             {v4.S}[1], [x0], x2
-        \macd           v2.8H,  v0.8B,  v4.8B
-        ld1             {v5.S}[0], [x1], x2
-        ld1             {v5.S}[1], [x1], x2
-        \macs           v2.8H,  v1.8B,  v5.8B
+        ld1             {v4.s}[0], [x0], x2
+        ld1             {v4.s}[1], [x0], x2
+        \macd           v2.8h,  v0.8b,  v4.8b
+        ld1             {v5.s}[0], [x1], x2
+        ld1             {v5.s}[1], [x1], x2
+        \macs           v2.8h,  v1.8b,  v5.8b
         b.lt            2f
-        ld1             {v6.S}[0], [x0], x2
-        ld1             {v6.S}[1], [x0], x2
-        \macd           v20.8H, v0.8B,  v6.8B
-        ld1             {v7.S}[0], [x1], x2
-        ld1             {v7.S}[1], [x1], x2
-        \macs           v20.8H, v1.8B,  v7.8B
-        sshl            v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        sshl            v20.8H, v20.8H, v18.8H
-        sqxtun          v4.8B,  v20.8H
-        mov             v20.16B, v16.16B
-        st1             {v2.S}[0], [x7], x2
-        st1             {v2.S}[1], [x7], x2
-        mov             v2.16B,  v16.16B
-        st1             {v4.S}[0], [x7], x2
-        st1             {v4.S}[1], [x7], x2
+        ld1             {v6.s}[0], [x0], x2
+        ld1             {v6.s}[1], [x0], x2
+        \macd           v20.8h, v0.8b,  v6.8b
+        ld1             {v7.s}[0], [x1], x2
+        ld1             {v7.s}[1], [x1], x2
+        \macs           v20.8h, v1.8b,  v7.8b
+        sshl            v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        sshl            v20.8h, v20.8h, v18.8h
+        sqxtun          v4.8b,  v20.8h
+        mov             v20.16b, v16.16b
+        st1             {v2.s}[0], [x7], x2
+        st1             {v2.s}[1], [x7], x2
+        mov             v2.16b,  v16.16b
+        st1             {v4.s}[0], [x7], x2
+        st1             {v4.s}[1], [x7], x2
         b.ne            1b
         ret
-2:      sshl            v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        st1             {v2.S}[0], [x7], x2
-        st1             {v2.S}[1], [x7], x2
+2:      sshl            v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        st1             {v2.s}[0], [x7], x2
+        st1             {v2.s}[1], [x7], x2
         ret
 .endm
 
@@ -689,10 +678,10 @@  function ff_biweight_h264_pixels_\w\()_neon, export=1
         add             w7,  w7,  #1
         eor             w8,  w8,  w6,  lsr #30
         orr             w7,  w7,  #1
-        dup             v18.8H,   w4
+        dup             v18.8h,   w4
         lsl             w7,  w7,  w4
-        not             v18.16B,  v18.16B
-        dup             v16.8H,   w7
+        not             v18.16b,  v18.16b
+        dup             v16.8h,   w7
         mov             x7,  x0
         cbz             w8,  10f
         subs            w8,  w8,  #1
@@ -716,78 +705,78 @@  endfunc
         biweight_func   4
 
 .macro  weight_16       add
-        dup             v0.16B,  w4
+        dup             v0.16b,  w4
 1:      subs            w2,  w2,  #2
-        ld1             {v20.16B}, [x0], x1
-        umull           v4.8H,   v0.8B,  v20.8B
-        umull2          v6.8H,   v0.16B, v20.16B
-        ld1             {v28.16B}, [x0], x1
-        umull           v24.8H,  v0.8B,  v28.8B
-        umull2          v26.8H,  v0.16B, v28.16B
-        \add            v4.8H,   v16.8H, v4.8H
-        srshl           v4.8H,   v4.8H,  v18.8H
-        \add            v6.8H,   v16.8H, v6.8H
-        srshl           v6.8H,   v6.8H,  v18.8H
-        sqxtun          v4.8B,   v4.8H
-        sqxtun2         v4.16B,  v6.8H
-        \add            v24.8H,  v16.8H, v24.8H
-        srshl           v24.8H,  v24.8H, v18.8H
-        \add            v26.8H,  v16.8H, v26.8H
-        srshl           v26.8H,  v26.8H, v18.8H
-        sqxtun          v24.8B,  v24.8H
-        sqxtun2         v24.16B, v26.8H
-        st1             {v4.16B},  [x5], x1
-        st1             {v24.16B}, [x5], x1
+        ld1             {v20.16b}, [x0], x1
+        umull           v4.8h,   v0.8b,  v20.8b
+        umull2          v6.8h,   v0.16b, v20.16b
+        ld1             {v28.16b}, [x0], x1
+        umull           v24.8h,  v0.8b,  v28.8b
+        umull2          v26.8h,  v0.16b, v28.16b
+        \add            v4.8h,   v16.8h, v4.8h
+        srshl           v4.8h,   v4.8h,  v18.8h
+        \add            v6.8h,   v16.8h, v6.8h
+        srshl           v6.8h,   v6.8h,  v18.8h
+        sqxtun          v4.8b,   v4.8h
+        sqxtun2         v4.16b,  v6.8h
+        \add            v24.8h,  v16.8h, v24.8h
+        srshl           v24.8h,  v24.8h, v18.8h
+        \add            v26.8h,  v16.8h, v26.8h
+        srshl           v26.8h,  v26.8h, v18.8h
+        sqxtun          v24.8b,  v24.8h
+        sqxtun2         v24.16b, v26.8h
+        st1             {v4.16b},  [x5], x1
+        st1             {v24.16b}, [x5], x1
         b.ne            1b
         ret
 .endm
 
 .macro  weight_8        add
-        dup             v0.8B,  w4
+        dup             v0.8b,  w4
 1:      subs            w2,  w2,  #2
-        ld1             {v4.8B}, [x0], x1
-        umull           v2.8H,  v0.8B,  v4.8B
-        ld1             {v6.8B}, [x0], x1
-        umull           v20.8H, v0.8B,  v6.8B
-        \add            v2.8H,  v16.8H,  v2.8H
-        srshl           v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        \add            v20.8H, v16.8H,  v20.8H
-        srshl           v20.8H, v20.8H, v18.8H
-        sqxtun          v4.8B,  v20.8H
-        st1             {v2.8B}, [x5], x1
-        st1             {v4.8B}, [x5], x1
+        ld1             {v4.8b}, [x0], x1
+        umull           v2.8h,  v0.8b,  v4.8b
+        ld1             {v6.8b}, [x0], x1
+        umull           v20.8h, v0.8b,  v6.8b
+        \add            v2.8h,  v16.8h,  v2.8h
+        srshl           v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        \add            v20.8h, v16.8h,  v20.8h
+        srshl           v20.8h, v20.8h, v18.8h
+        sqxtun          v4.8b,  v20.8h
+        st1             {v2.8b}, [x5], x1
+        st1             {v4.8b}, [x5], x1
         b.ne            1b
         ret
 .endm
 
 .macro  weight_4        add
-        dup             v0.8B,  w4
+        dup             v0.8b,  w4
 1:      subs            w2,  w2,  #4
-        ld1             {v4.S}[0], [x0], x1
-        ld1             {v4.S}[1], [x0], x1
-        umull           v2.8H,  v0.8B,  v4.8B
+        ld1             {v4.s}[0], [x0], x1
+        ld1             {v4.s}[1], [x0], x1
+        umull           v2.8h,  v0.8b,  v4.8b
         b.lt            2f
-        ld1             {v6.S}[0], [x0], x1
-        ld1             {v6.S}[1], [x0], x1
-        umull           v20.8H, v0.8B,  v6.8B
-        \add            v2.8H,  v16.8H,  v2.8H
-        srshl           v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        \add            v20.8H, v16.8H,  v20.8H
-        srshl           v20.8H, v20.8h, v18.8H
-        sqxtun          v4.8B,  v20.8H
-        st1             {v2.S}[0], [x5], x1
-        st1             {v2.S}[1], [x5], x1
-        st1             {v4.S}[0], [x5], x1
-        st1             {v4.S}[1], [x5], x1
+        ld1             {v6.s}[0], [x0], x1
+        ld1             {v6.s}[1], [x0], x1
+        umull           v20.8h, v0.8b,  v6.8b
+        \add            v2.8h,  v16.8h,  v2.8h
+        srshl           v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        \add            v20.8h, v16.8h,  v20.8h
+        srshl           v20.8h, v20.8h, v18.8h
+        sqxtun          v4.8b,  v20.8h
+        st1             {v2.s}[0], [x5], x1
+        st1             {v2.s}[1], [x5], x1
+        st1             {v4.s}[0], [x5], x1
+        st1             {v4.s}[1], [x5], x1
         b.ne            1b
         ret
-2:      \add            v2.8H,  v16.8H,  v2.8H
-        srshl           v2.8H,  v2.8H,  v18.8H
-        sqxtun          v2.8B,  v2.8H
-        st1             {v2.S}[0], [x5], x1
-        st1             {v2.S}[1], [x5], x1
+2:      \add            v2.8h,  v16.8h,  v2.8h
+        srshl           v2.8h,  v2.8h,  v18.8h
+        sqxtun          v2.8b,  v2.8h
+        st1             {v2.s}[0], [x5], x1
+        st1             {v2.s}[1], [x5], x1
         ret
 .endm
 
@@ -796,18 +785,18 @@  function ff_weight_h264_pixels_\w\()_neon, export=1
         cmp             w3,  #1
         mov             w6,  #1
         lsl             w5,  w5,  w3
-        dup             v16.8H,  w5
+        dup             v16.8h,  w5
         mov             x5,  x0
         b.le            20f
         sub             w6,  w6,  w3
-        dup             v18.8H,  w6
+        dup             v18.8h,  w6
         cmp             w4, #0
         b.lt            10f
         weight_\w       shadd
 10:     neg             w4,  w4
         weight_\w       shsub
 20:     neg             w6,  w3
-        dup             v18.8H,  w6
+        dup             v18.8h,  w6
         cmp             w4,  #0
         b.lt            10f
         weight_\w       add
@@ -825,14 +814,13 @@  endfunc
         ldr             w6,  [x4]
         ccmp            w3,  #0,  #0,  ne
         lsl             w2,  w2,  #2
-        mov             v24.S[0], w6
+        mov             v24.s[0], w6
         lsl             w3,  w3,  #2
         and             w8,  w6,  w6,  lsl #16
         b.eq            1f
         ands            w8,  w8,  w8,  lsl #8
         b.ge            2f
-1:
-        ret
+1:      ret
 2:
 .endm
 
@@ -840,8 +828,7 @@  endfunc
         orr             w4,  w2,  w3
         cbnz            w4,  1f
         ret
-1:
-        lsl             w2,  w2,  #2
+1:      lsl             w2,  w2,  #2
         lsl             w3,  w3,  #2
         dup             v30.8h,   w2              // alpha
         dup             v31.8h,   w3              // beta
@@ -908,8 +895,7 @@  function ff_h264_v_loop_filter_chroma_neon_10, export=1
         sub             x0,  x10,  x1, lsl #1
         st1             {v16.8h}, [x0], x1
         st1             {v0.8h},  [x0], x1
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_neon_10, export=1
@@ -942,8 +928,7 @@  h_loop_filter_chroma420_10:
         st1             {v16.d}[1], [x0], x1
         st1             {v0.d}[1],  [x0], x1
         st1             {v2.d}[1],  [x0], x1
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma422_neon_10, export=1
@@ -1002,8 +987,7 @@  function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
         st1             {v16.8h}, [x0], x1
         st1             {v17.8h}, [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
@@ -1017,7 +1001,7 @@  function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
         ld1             {v16.8h}, [x4], x1
         ld1             {v19.8h}, [x9], x1
 
-        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29
 
         h264_loop_filter_chroma_intra_10
 
@@ -1026,8 +1010,7 @@  function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
         st2             {v16.h,v17.h}[2], [x0], x1
         st2             {v16.h,v17.h}[3], [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
@@ -1045,7 +1028,7 @@  h_loop_filter_chroma420_intra_10:
         ld1             {v19.4h},   [x4], x1
         ld1             {v19.d}[1], [x9], x1
 
-        transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
+        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29
 
         h264_loop_filter_chroma_intra_10
 
@@ -1058,8 +1041,7 @@  h_loop_filter_chroma420_intra_10:
         st2             {v16.h,v17.h}[6], [x0], x1
         st2             {v16.h,v17.h}[7], [x0], x1
 
-9:
-        ret
+9:      ret
 endfunc
 
 function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 7d2879b0ce..e4aa3cd66e 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -25,54 +25,54 @@ 
 function ff_h264_idct_add_neon, export=1
 .L_ff_h264_idct_add_neon:
         AARCH64_VALID_CALL_TARGET
-        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
+        ld1             {v0.4h, v1.4h, v2.4h, v3.4h},  [x1]
         sxtw            x2,     w2
-        movi            v30.8H, #0
+        movi            v30.8h, #0
 
-        add             v4.4H,  v0.4H,  v2.4H
-        sshr            v16.4H, v1.4H,  #1
-        st1             {v30.8H},    [x1], #16
-        sshr            v17.4H, v3.4H,  #1
-        st1             {v30.8H},    [x1], #16
-        sub             v5.4H,  v0.4H,  v2.4H
-        sub             v6.4H,  v16.4H, v3.4H
-        add             v7.4H,  v1.4H,  v17.4H
-        add             v0.4H,  v4.4H,  v7.4H
-        add             v1.4H,  v5.4H,  v6.4H
-        sub             v2.4H,  v5.4H,  v6.4H
-        sub             v3.4H,  v4.4H,  v7.4H
+        add             v4.4h,  v0.4h,  v2.4h
+        sshr            v16.4h, v1.4h,  #1
+        st1             {v30.8h},    [x1], #16
+        sshr            v17.4h, v3.4h,  #1
+        st1             {v30.8h},    [x1], #16
+        sub             v5.4h,  v0.4h,  v2.4h
+        sub             v6.4h,  v16.4h, v3.4h
+        add             v7.4h,  v1.4h,  v17.4h
+        add             v0.4h,  v4.4h,  v7.4h
+        add             v1.4h,  v5.4h,  v6.4h
+        sub             v2.4h,  v5.4h,  v6.4h
+        sub             v3.4h,  v4.4h,  v7.4h
 
         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
 
-        add             v4.4H,  v0.4H,  v2.4H
-        ld1             {v18.S}[0], [x0], x2
-        sshr            v16.4H,  v3.4H,  #1
-        sshr            v17.4H,  v1.4H,  #1
-        ld1             {v18.S}[1], [x0], x2
-        sub             v5.4H,  v0.4H,  v2.4H
-        ld1             {v19.S}[1], [x0], x2
-        add             v6.4H,  v16.4H, v1.4H
-        ins             v4.D[1],  v5.D[0]
-        sub             v7.4H,  v17.4H, v3.4H
-        ld1             {v19.S}[0], [x0], x2
-        ins             v6.D[1],  v7.D[0]
+        add             v4.4h,  v0.4h,  v2.4h
+        ld1             {v18.s}[0], [x0], x2
+        sshr            v16.4h,  v3.4h,  #1
+        sshr            v17.4h,  v1.4h,  #1
+        ld1             {v18.s}[1], [x0], x2
+        sub             v5.4h,  v0.4h,  v2.4h
+        ld1             {v19.s}[1], [x0], x2
+        add             v6.4h,  v16.4h, v1.4h
+        ins             v4.d[1],  v5.d[0]
+        sub             v7.4h,  v17.4h, v3.4h
+        ld1             {v19.s}[0], [x0], x2
+        ins             v6.d[1],  v7.d[0]
         sub             x0,  x0,  x2, lsl #2
-        add             v0.8H,  v4.8H,  v6.8H
-        sub             v1.8H,  v4.8H,  v6.8H
+        add             v0.8h,  v4.8h,  v6.8h
+        sub             v1.8h,  v4.8h,  v6.8h
 
-        srshr           v0.8H,  v0.8H,  #6
-        srshr           v1.8H,  v1.8H,  #6
+        srshr           v0.8h,  v0.8h,  #6
+        srshr           v1.8h,  v1.8h,  #6
 
-        uaddw           v0.8H,  v0.8H,  v18.8B
-        uaddw           v1.8H,  v1.8H,  v19.8B
+        uaddw           v0.8h,  v0.8h,  v18.8b
+        uaddw           v1.8h,  v1.8h,  v19.8b
 
-        sqxtun          v0.8B, v0.8H
-        sqxtun          v1.8B, v1.8H
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
 
-        st1             {v0.S}[0],  [x0], x2
-        st1             {v0.S}[1],  [x0], x2
-        st1             {v1.S}[1],  [x0], x2
-        st1             {v1.S}[0],  [x0], x2
+        st1             {v0.s}[0],  [x0], x2
+        st1             {v0.s}[1],  [x0], x2
+        st1             {v1.s}[1],  [x0], x2
+        st1             {v1.s}[0],  [x0], x2
 
         sub             x1,  x1,  #32
         ret
@@ -83,22 +83,22 @@  function ff_h264_idct_dc_add_neon, export=1
         AARCH64_VALID_CALL_TARGET
         sxtw            x2,  w2
         mov             w3,       #0
-        ld1r            {v2.8H},  [x1]
+        ld1r            {v2.8h},  [x1]
         strh            w3,       [x1]
-        srshr           v2.8H,  v2.8H,  #6
-        ld1             {v0.S}[0],  [x0], x2
-        ld1             {v0.S}[1],  [x0], x2
-        uaddw           v3.8H,  v2.8H,  v0.8B
-        ld1             {v1.S}[0],  [x0], x2
-        ld1             {v1.S}[1],  [x0], x2
-        uaddw           v4.8H,  v2.8H,  v1.8B
-        sqxtun          v0.8B,  v3.8H
-        sqxtun          v1.8B,  v4.8H
+        srshr           v2.8h,  v2.8h,  #6
+        ld1             {v0.s}[0],  [x0], x2
+        ld1             {v0.s}[1],  [x0], x2
+        uaddw           v3.8h,  v2.8h,  v0.8b
+        ld1             {v1.s}[0],  [x0], x2
+        ld1             {v1.s}[1],  [x0], x2
+        uaddw           v4.8h,  v2.8h,  v1.8b
+        sqxtun          v0.8b,  v3.8h
+        sqxtun          v1.8b,  v4.8h
         sub             x0,  x0,  x2, lsl #2
-        st1             {v0.S}[0],  [x0], x2
-        st1             {v0.S}[1],  [x0], x2
-        st1             {v1.S}[0],  [x0], x2
-        st1             {v1.S}[1],  [x0], x2
+        st1             {v0.s}[0],  [x0], x2
+        st1             {v0.s}[1],  [x0], x2
+        st1             {v1.s}[0],  [x0], x2
+        st1             {v1.s}[1],  [x0], x2
         ret
 endfunc
 
@@ -193,139 +193,139 @@  function ff_h264_idct_add8_neon, export=1
 endfunc
 
 .macro  idct8x8_cols    pass
-  .if \pass == 0
-        va      .req    v18
-        vb      .req    v30
-        sshr            v18.8H, v26.8H, #1
-        add             v16.8H, v24.8H, v28.8H
-        ld1             {v30.8H, v31.8H}, [x1]
-        st1             {v19.8H}, [x1],  #16
-        st1             {v19.8H}, [x1],  #16
-        sub             v17.8H,  v24.8H, v28.8H
-        sshr            v19.8H,  v30.8H, #1
-        sub             v18.8H,  v18.8H,  v30.8H
-        add             v19.8H,  v19.8H,  v26.8H
-  .else
-        va      .req    v30
-        vb      .req    v18
-        sshr            v30.8H, v26.8H, #1
-        sshr            v19.8H, v18.8H, #1
-        add             v16.8H, v24.8H, v28.8H
-        sub             v17.8H, v24.8H, v28.8H
-        sub             v30.8H, v30.8H, v18.8H
-        add             v19.8H, v19.8H, v26.8H
-  .endif
-        add             v26.8H, v17.8H, va.8H
-        sub             v28.8H, v17.8H, va.8H
-        add             v24.8H, v16.8H, v19.8H
-        sub             vb.8H,  v16.8H, v19.8H
-        sub             v16.8H, v29.8H, v27.8H
-        add             v17.8H, v31.8H, v25.8H
-        sub             va.8H,  v31.8H, v25.8H
-        add             v19.8H, v29.8H, v27.8H
-        sub             v16.8H, v16.8H, v31.8H
-        sub             v17.8H, v17.8H, v27.8H
-        add             va.8H,  va.8H,  v29.8H
-        add             v19.8H, v19.8H, v25.8H
-        sshr            v25.8H, v25.8H, #1
-        sshr            v27.8H, v27.8H, #1
-        sshr            v29.8H, v29.8H, #1
-        sshr            v31.8H, v31.8H, #1
-        sub             v16.8H, v16.8H, v31.8H
-        sub             v17.8H, v17.8H, v27.8H
-        add             va.8H,  va.8H,  v29.8H
-        add             v19.8H, v19.8H, v25.8H
-        sshr            v25.8H, v16.8H, #2
-        sshr            v27.8H, v17.8H, #2
-        sshr            v29.8H, va.8H,  #2
-        sshr            v31.8H, v19.8H, #2
-        sub             v19.8H, v19.8H, v25.8H
-        sub             va.8H,  v27.8H, va.8H
-        add             v17.8H, v17.8H, v29.8H
-        add             v16.8H, v16.8H, v31.8H
-  .if \pass == 0
-        sub             v31.8H, v24.8H, v19.8H
-        add             v24.8H, v24.8H, v19.8H
-        add             v25.8H, v26.8H, v18.8H
-        sub             v18.8H, v26.8H, v18.8H
-        add             v26.8H, v28.8H, v17.8H
-        add             v27.8H, v30.8H, v16.8H
-        sub             v29.8H, v28.8H, v17.8H
-        sub             v28.8H, v30.8H, v16.8H
-  .else
-        sub             v31.8H, v24.8H, v19.8H
-        add             v24.8H, v24.8H, v19.8H
-        add             v25.8H, v26.8H, v30.8H
-        sub             v30.8H, v26.8H, v30.8H
-        add             v26.8H, v28.8H, v17.8H
-        sub             v29.8H, v28.8H, v17.8H
-        add             v27.8H, v18.8H, v16.8H
-        sub             v28.8H, v18.8H, v16.8H
-  .endif
-        .unreq          va
-        .unreq          vb
+.if \pass == 0
+        va              .req    v18
+        vb              .req    v30
+        sshr            v18.8h, v26.8h, #1
+        add             v16.8h, v24.8h, v28.8h
+        ld1             {v30.8h, v31.8h}, [x1]
+        st1             {v19.8h}, [x1],  #16
+        st1             {v19.8h}, [x1],  #16
+        sub             v17.8h,  v24.8h, v28.8h
+        sshr            v19.8h,  v30.8h, #1
+        sub             v18.8h,  v18.8h,  v30.8h
+        add             v19.8h,  v19.8h,  v26.8h
+.else
+        va              .req    v30
+        vb              .req    v18
+        sshr            v30.8h, v26.8h, #1
+        sshr            v19.8h, v18.8h, #1
+        add             v16.8h, v24.8h, v28.8h
+        sub             v17.8h, v24.8h, v28.8h
+        sub             v30.8h, v30.8h, v18.8h
+        add             v19.8h, v19.8h, v26.8h
+.endif
+        add             v26.8h, v17.8h, va.8h
+        sub             v28.8h, v17.8h, va.8h
+        add             v24.8h, v16.8h, v19.8h
+        sub             vb.8h,  v16.8h, v19.8h
+        sub             v16.8h, v29.8h, v27.8h
+        add             v17.8h, v31.8h, v25.8h
+        sub             va.8h,  v31.8h, v25.8h
+        add             v19.8h, v29.8h, v27.8h
+        sub             v16.8h, v16.8h, v31.8h
+        sub             v17.8h, v17.8h, v27.8h
+        add             va.8h,  va.8h,  v29.8h
+        add             v19.8h, v19.8h, v25.8h
+        sshr            v25.8h, v25.8h, #1
+        sshr            v27.8h, v27.8h, #1
+        sshr            v29.8h, v29.8h, #1
+        sshr            v31.8h, v31.8h, #1
+        sub             v16.8h, v16.8h, v31.8h
+        sub             v17.8h, v17.8h, v27.8h
+        add             va.8h,  va.8h,  v29.8h
+        add             v19.8h, v19.8h, v25.8h
+        sshr            v25.8h, v16.8h, #2
+        sshr            v27.8h, v17.8h, #2
+        sshr            v29.8h, va.8h,  #2
+        sshr            v31.8h, v19.8h, #2
+        sub             v19.8h, v19.8h, v25.8h
+        sub             va.8h,  v27.8h, va.8h
+        add             v17.8h, v17.8h, v29.8h
+        add             v16.8h, v16.8h, v31.8h
+.if \pass == 0
+        sub             v31.8h, v24.8h, v19.8h
+        add             v24.8h, v24.8h, v19.8h
+        add             v25.8h, v26.8h, v18.8h
+        sub             v18.8h, v26.8h, v18.8h
+        add             v26.8h, v28.8h, v17.8h
+        add             v27.8h, v30.8h, v16.8h
+        sub             v29.8h, v28.8h, v17.8h
+        sub             v28.8h, v30.8h, v16.8h
+.else
+        sub             v31.8h, v24.8h, v19.8h
+        add             v24.8h, v24.8h, v19.8h
+        add             v25.8h, v26.8h, v30.8h
+        sub             v30.8h, v26.8h, v30.8h
+        add             v26.8h, v28.8h, v17.8h
+        sub             v29.8h, v28.8h, v17.8h
+        add             v27.8h, v18.8h, v16.8h
+        sub             v28.8h, v18.8h, v16.8h
+.endif
+.unreq          va
+.unreq          vb
 .endm
 
 function ff_h264_idct8_add_neon, export=1
 .L_ff_h264_idct8_add_neon:
         AARCH64_VALID_CALL_TARGET
-        movi            v19.8H,   #0
+        movi            v19.8h,   #0
         sxtw            x2,       w2
-        ld1             {v24.8H, v25.8H}, [x1]
-        st1             {v19.8H},  [x1],   #16
-        st1             {v19.8H},  [x1],   #16
-        ld1             {v26.8H, v27.8H}, [x1]
-        st1             {v19.8H},  [x1],   #16
-        st1             {v19.8H},  [x1],   #16
-        ld1             {v28.8H, v29.8H}, [x1]
-        st1             {v19.8H},  [x1],   #16
-        st1             {v19.8H},  [x1],   #16
+        ld1             {v24.8h, v25.8h}, [x1]
+        st1             {v19.8h},  [x1],   #16
+        st1             {v19.8h},  [x1],   #16
+        ld1             {v26.8h, v27.8h}, [x1]
+        st1             {v19.8h},  [x1],   #16
+        st1             {v19.8h},  [x1],   #16
+        ld1             {v28.8h, v29.8h}, [x1]
+        st1             {v19.8h},  [x1],   #16
+        st1             {v19.8h},  [x1],   #16
 
         idct8x8_cols    0
         transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
         idct8x8_cols    1
 
         mov             x3,  x0
-        srshr           v24.8H, v24.8H, #6
-        ld1             {v0.8B},     [x0], x2
-        srshr           v25.8H, v25.8H, #6
-        ld1             {v1.8B},     [x0], x2
-        srshr           v26.8H, v26.8H, #6
-        ld1             {v2.8B},     [x0], x2
-        srshr           v27.8H, v27.8H, #6
-        ld1             {v3.8B},     [x0], x2
-        srshr           v28.8H, v28.8H, #6
-        ld1             {v4.8B},     [x0], x2
-        srshr           v29.8H, v29.8H, #6
-        ld1             {v5.8B},     [x0], x2
-        srshr           v30.8H, v30.8H, #6
-        ld1             {v6.8B},     [x0], x2
-        srshr           v31.8H, v31.8H, #6
-        ld1             {v7.8B},     [x0], x2
-        uaddw           v24.8H, v24.8H, v0.8B
-        uaddw           v25.8H, v25.8H, v1.8B
-        uaddw           v26.8H, v26.8H, v2.8B
-        sqxtun          v0.8B,  v24.8H
-        uaddw           v27.8H, v27.8H, v3.8B
-        sqxtun          v1.8B,  v25.8H
-        uaddw           v28.8H, v28.8H, v4.8B
-        sqxtun          v2.8B,  v26.8H
-        st1             {v0.8B},     [x3], x2
-        uaddw           v29.8H, v29.8H, v5.8B
-        sqxtun          v3.8B,  v27.8H
-        st1             {v1.8B},     [x3], x2
-        uaddw           v30.8H, v30.8H, v6.8B
-        sqxtun          v4.8B,  v28.8H
-        st1             {v2.8B},     [x3], x2
-        uaddw           v31.8H, v31.8H, v7.8B
-        sqxtun          v5.8B,  v29.8H
-        st1             {v3.8B},     [x3], x2
-        sqxtun          v6.8B,  v30.8H
-        sqxtun          v7.8B,  v31.8H
-        st1             {v4.8B},     [x3], x2
-        st1             {v5.8B},     [x3], x2
-        st1             {v6.8B},     [x3], x2
-        st1             {v7.8B},     [x3], x2
+        srshr           v24.8h, v24.8h, #6
+        ld1             {v0.8b},     [x0], x2
+        srshr           v25.8h, v25.8h, #6
+        ld1             {v1.8b},     [x0], x2
+        srshr           v26.8h, v26.8h, #6
+        ld1             {v2.8b},     [x0], x2
+        srshr           v27.8h, v27.8h, #6
+        ld1             {v3.8b},     [x0], x2
+        srshr           v28.8h, v28.8h, #6
+        ld1             {v4.8b},     [x0], x2
+        srshr           v29.8h, v29.8h, #6
+        ld1             {v5.8b},     [x0], x2
+        srshr           v30.8h, v30.8h, #6
+        ld1             {v6.8b},     [x0], x2
+        srshr           v31.8h, v31.8h, #6
+        ld1             {v7.8b},     [x0], x2
+        uaddw           v24.8h, v24.8h, v0.8b
+        uaddw           v25.8h, v25.8h, v1.8b
+        uaddw           v26.8h, v26.8h, v2.8b
+        sqxtun          v0.8b,  v24.8h
+        uaddw           v27.8h, v27.8h, v3.8b
+        sqxtun          v1.8b,  v25.8h
+        uaddw           v28.8h, v28.8h, v4.8b
+        sqxtun          v2.8b,  v26.8h
+        st1             {v0.8b},     [x3], x2
+        uaddw           v29.8h, v29.8h, v5.8b
+        sqxtun          v3.8b,  v27.8h
+        st1             {v1.8b},     [x3], x2
+        uaddw           v30.8h, v30.8h, v6.8b
+        sqxtun          v4.8b,  v28.8h
+        st1             {v2.8b},     [x3], x2
+        uaddw           v31.8h, v31.8h, v7.8b
+        sqxtun          v5.8b,  v29.8h
+        st1             {v3.8b},     [x3], x2
+        sqxtun          v6.8b,  v30.8h
+        sqxtun          v7.8b,  v31.8h
+        st1             {v4.8b},     [x3], x2
+        st1             {v5.8b},     [x3], x2
+        st1             {v6.8b},     [x3], x2
+        st1             {v7.8b},     [x3], x2
 
         sub             x1,  x1,  #128
         ret
@@ -336,42 +336,42 @@  function ff_h264_idct8_dc_add_neon, export=1
         AARCH64_VALID_CALL_TARGET
         mov             w3,       #0
         sxtw            x2,       w2
-        ld1r            {v31.8H}, [x1]
+        ld1r            {v31.8h}, [x1]
         strh            w3,       [x1]
-        ld1             {v0.8B},  [x0], x2
-        srshr           v31.8H, v31.8H, #6
-        ld1             {v1.8B},     [x0], x2
-        ld1             {v2.8B},     [x0], x2
-        uaddw           v24.8H, v31.8H, v0.8B
-        ld1             {v3.8B},     [x0], x2
-        uaddw           v25.8H, v31.8H, v1.8B
-        ld1             {v4.8B},     [x0], x2
-        uaddw           v26.8H, v31.8H, v2.8B
-        ld1             {v5.8B},     [x0], x2
-        uaddw           v27.8H, v31.8H, v3.8B
-        ld1             {v6.8B},     [x0], x2
-        uaddw           v28.8H, v31.8H, v4.8B
-        ld1             {v7.8B},     [x0], x2
-        uaddw           v29.8H, v31.8H, v5.8B
-        uaddw           v30.8H, v31.8H, v6.8B
-        uaddw           v31.8H, v31.8H, v7.8B
-        sqxtun          v0.8B,  v24.8H
-        sqxtun          v1.8B,  v25.8H
-        sqxtun          v2.8B,  v26.8H
-        sqxtun          v3.8B,  v27.8H
+        ld1             {v0.8b},  [x0], x2
+        srshr           v31.8h, v31.8h, #6
+        ld1             {v1.8b},     [x0], x2
+        ld1             {v2.8b},     [x0], x2
+        uaddw           v24.8h, v31.8h, v0.8b
+        ld1             {v3.8b},     [x0], x2
+        uaddw           v25.8h, v31.8h, v1.8b
+        ld1             {v4.8b},     [x0], x2
+        uaddw           v26.8h, v31.8h, v2.8b
+        ld1             {v5.8b},     [x0], x2
+        uaddw           v27.8h, v31.8h, v3.8b
+        ld1             {v6.8b},     [x0], x2
+        uaddw           v28.8h, v31.8h, v4.8b
+        ld1             {v7.8b},     [x0], x2
+        uaddw           v29.8h, v31.8h, v5.8b
+        uaddw           v30.8h, v31.8h, v6.8b
+        uaddw           v31.8h, v31.8h, v7.8b
+        sqxtun          v0.8b,  v24.8h
+        sqxtun          v1.8b,  v25.8h
+        sqxtun          v2.8b,  v26.8h
+        sqxtun          v3.8b,  v27.8h
         sub             x0,  x0,  x2, lsl #3
-        st1             {v0.8B},     [x0], x2
-        sqxtun          v4.8B,  v28.8H
-        st1             {v1.8B},     [x0], x2
-        sqxtun          v5.8B,  v29.8H
-        st1             {v2.8B},     [x0], x2
-        sqxtun          v6.8B,  v30.8H
-        st1             {v3.8B},     [x0], x2
-        sqxtun          v7.8B,  v31.8H
-        st1             {v4.8B},     [x0], x2
-        st1             {v5.8B},     [x0], x2
-        st1             {v6.8B},     [x0], x2
-        st1             {v7.8B},     [x0], x2
+        st1             {v0.8b},     [x0], x2
+        sqxtun          v4.8b,  v28.8h
+        st1             {v1.8b},     [x0], x2
+        sqxtun          v5.8b,  v29.8h
+        st1             {v2.8b},     [x0], x2
+        sqxtun          v6.8b,  v30.8h
+        st1             {v3.8b},     [x0], x2
+        sqxtun          v7.8b,  v31.8h
+        st1             {v4.8b},     [x0], x2
+        st1             {v5.8b},     [x0], x2
+        st1             {v6.8b},     [x0], x2
+        st1             {v7.8b},     [x0], x2
         ret
 endfunc
 
@@ -387,7 +387,7 @@  function ff_h264_idct8_add4_neon, export=1
         movrel          x14, .L_ff_h264_idct8_add_neon
 1:      ldrb            w9,  [x7], #4
         ldrsw           x0,  [x5], #16
-        ldrb            w9,  [x4, w9, UXTW]
+        ldrb            w9,  [x4, w9, uxtw]
         subs            w9,  w9,  #1
         b.lt            2f
         ldrsh           w11,  [x1]
@@ -401,7 +401,7 @@  function ff_h264_idct8_add4_neon, export=1
         ret             x12
 endfunc
 
-const   scan8
+const scan8
         .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
         .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
         .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index ea37689f34..f755806b59 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -153,8 +153,7 @@  function ff_pred16x16_plane_neon, export=1
         add             v1.8h,  v1.8h,  v0.8h
         add             v3.8h,  v3.8h,  v2.8h
         mov             w3,  #16
-1:
-        sqshrun         v0.8b,  v1.8h,  #5
+1:      sqshrun         v0.8b,  v1.8h,  #5
         add             v1.8h,  v1.8h,  v2.8h
         sqshrun2        v0.16b, v1.8h,  #5
         add             v1.8h,  v1.8h,  v3.8h
@@ -164,10 +163,10 @@  function ff_pred16x16_plane_neon, export=1
         ret
 endfunc
 
-const   p16weight, align=4
+const p16weight, align=4
         .short          1,2,3,4,5,6,7,8
 endconst
-const   p8weight, align=4
+const p8weight, align=4
         .short          1,2,3,4,1,2,3,4
 endconst
 
@@ -230,8 +229,7 @@  function ff_pred8x8_plane_neon, export=1
         dup             v2.8h,  v5.h[1]
         add             v1.8h,  v1.8h,  v0.8h
         mov             w3,  #8
-1:
-        sqshrun         v0.8b,  v1.8h,  #5
+1:      sqshrun         v0.8b,  v1.8h,  #5
         subs            w3,  w3,  #1
         add             v1.8h,  v1.8h,  v2.8h
         st1             {v0.8b},  [x0], x1
@@ -519,8 +517,7 @@  function ff_pred16x16_plane_neon_10, export=1
 
         mov             w3,      #16
         mvni            v4.8h,   #0xFC, lsl #8 // 1023 for clipping
-1:
-        sqshrun         v0.4h,  v16.4s, #5
+1:      sqshrun         v0.4h,  v16.4s, #5
         sqshrun2        v0.8h,  v17.4s, #5
         saddw           v16.4s, v16.4s, v2.4h
         saddw           v17.4s, v17.4s, v2.4h
@@ -603,8 +600,7 @@  function ff_pred8x8_plane_neon_10, export=1
         saddw2          v2.4s,  v2.4s,  v0.8h
         mov             w3,  #8
         mvni            v4.8h,  #0xFC,  lsl #8 // 1023 for clipping
-1:
-        sqshrun         v0.4h,  v1.4s,  #5
+1:      sqshrun         v0.4h,  v1.4s,  #5
         sqshrun2        v0.8h,  v2.4s,  #5
 
         saddw           v1.4s,  v1.4s,  v3.4h
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index 451fd8af24..3829f17bd1 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -24,130 +24,130 @@ 
 
         /* H.264 qpel MC */
 
-.macro  lowpass_const   r
+.macro  lowpass_const r
         movz            \r, #20, lsl #16
         movk            \r, #5
-        mov             v6.S[0], \r
+        mov             v6.s[0], \r
 .endm
 
 //trashes v0-v5
 .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
-        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
-        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
-        uaddl           v2.8H,      v2.8B,     v3.8B
-        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
-        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
-        uaddl           v4.8H,      v4.8B,     v5.8B
-        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
-        uaddl           \d0\().8H,  \r0\().8B, v1.8B
-        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
-        mla             \d0\().8H,  v2.8H,     v6.H[1]
-        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
-        uaddl           v0.8H,      v0.8B,     v1.8B
-        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
-        mls             \d0\().8H,  v4.8H,     v6.H[0]
-        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
-        uaddl           v1.8H,      v1.8B,     v3.8B
-        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
-        uaddl           \d1\().8H,  \r2\().8B, v2.8B
-        mla             \d1\().8H,  v0.8H,     v6.H[1]
-        mls             \d1\().8H,  v1.8H,     v6.H[0]
-  .if \narrow
-        sqrshrun        \d0\().8B,  \d0\().8H, #5
-        sqrshrun        \d1\().8B,  \d1\().8H, #5
-  .endif
+        ext             v2.8b,      \r0\().8b, \r1\().8b, #2
+        ext             v3.8b,      \r0\().8b, \r1\().8b, #3
+        uaddl           v2.8h,      v2.8b,     v3.8b
+        ext             v4.8b,      \r0\().8b, \r1\().8b, #1
+        ext             v5.8b,      \r0\().8b, \r1\().8b, #4
+        uaddl           v4.8h,      v4.8b,     v5.8b
+        ext             v1.8b,      \r0\().8b, \r1\().8b, #5
+        uaddl           \d0\().8h,  \r0\().8b, v1.8b
+        ext             v0.8b,      \r2\().8b, \r3\().8b, #2
+        mla             \d0\().8h,  v2.8h,     v6.h[1]
+        ext             v1.8b,      \r2\().8b, \r3\().8b, #3
+        uaddl           v0.8h,      v0.8b,     v1.8b
+        ext             v1.8b,      \r2\().8b, \r3\().8b, #1
+        mls             \d0\().8h,  v4.8h,     v6.h[0]
+        ext             v3.8b,      \r2\().8b, \r3\().8b, #4
+        uaddl           v1.8h,      v1.8b,     v3.8b
+        ext             v2.8b,      \r2\().8b, \r3\().8b, #5
+        uaddl           \d1\().8h,  \r2\().8b, v2.8b
+        mla             \d1\().8h,  v0.8h,     v6.h[1]
+        mls             \d1\().8h,  v1.8h,     v6.h[0]
+.if \narrow
+        sqrshrun        \d0\().8b,  \d0\().8h, #5
+        sqrshrun        \d1\().8b,  \d1\().8h, #5
+.endif
 .endm
 
 //trashes v0-v4
 .macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1
-        uaddl           v2.8H,      \r2\().8B, \r3\().8B
-        uaddl           v0.8H,      \r3\().8B, \r4\().8B
-        uaddl           v4.8H,      \r1\().8B, \r4\().8B
-        uaddl           v1.8H,      \r2\().8B, \r5\().8B
-        uaddl           \d0\().8H,  \r0\().8B, \r5\().8B
-        uaddl           \d1\().8H,  \r1\().8B, \r6\().8B
-        mla             \d0\().8H,  v2.8H,     v6.H[1]
-        mls             \d0\().8H,  v4.8H,     v6.H[0]
-        mla             \d1\().8H,  v0.8H,     v6.H[1]
-        mls             \d1\().8H,  v1.8H,     v6.H[0]
-  .if \narrow
-        sqrshrun        \d0\().8B,  \d0\().8H, #5
-        sqrshrun        \d1\().8B,  \d1\().8H, #5
-  .endif
+        uaddl           v2.8h,      \r2\().8b, \r3\().8b
+        uaddl           v0.8h,      \r3\().8b, \r4\().8b
+        uaddl           v4.8h,      \r1\().8b, \r4\().8b
+        uaddl           v1.8h,      \r2\().8b, \r5\().8b
+        uaddl           \d0\().8h,  \r0\().8b, \r5\().8b
+        uaddl           \d1\().8h,  \r1\().8b, \r6\().8b
+        mla             \d0\().8h,  v2.8h,     v6.h[1]
+        mls             \d0\().8h,  v4.8h,     v6.h[0]
+        mla             \d1\().8h,  v0.8h,     v6.h[1]
+        mls             \d1\().8h,  v1.8h,     v6.h[0]
+.if \narrow
+        sqrshrun        \d0\().8b,  \d0\().8h, #5
+        sqrshrun        \d1\().8b,  \d1\().8h, #5
+.endif
 .endm
 
 //trashes v0-v5, v7, v30-v31
 .macro  lowpass_8H      r0,  r1
-        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
-        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
-        uaddl           v0.8H,      v0.8B,      v1.8B
-        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
-        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
-        uaddl           v2.8H,      v2.8B,      v3.8B
-        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
-        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
-        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
-        mla             \r0\().8H,  v0.8H,      v6.H[1]
-        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
-        uaddl           v4.8H,      v4.8B,      v5.8B
-        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
-        mls             \r0\().8H,  v2.8H,      v6.H[0]
-        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
-        uaddl           v7.8H,      v7.8B,      v0.8B
-        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
-        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
-        mla             \r1\().8H,  v4.8H,      v6.H[1]
-        mls             \r1\().8H,  v7.8H,      v6.H[0]
+        ext             v0.16b,     \r0\().16b, \r0\().16b, #2
+        ext             v1.16b,     \r0\().16b, \r0\().16b, #3
+        uaddl           v0.8h,      v0.8b,      v1.8b
+        ext             v2.16b,     \r0\().16b, \r0\().16b, #1
+        ext             v3.16b,     \r0\().16b, \r0\().16b, #4
+        uaddl           v2.8h,      v2.8b,      v3.8b
+        ext             v30.16b,    \r0\().16b, \r0\().16b, #5
+        uaddl           \r0\().8h,  \r0\().8b,  v30.8b
+        ext             v4.16b,     \r1\().16b, \r1\().16b, #2
+        mla             \r0\().8h,  v0.8h,      v6.h[1]
+        ext             v5.16b,     \r1\().16b, \r1\().16b, #3
+        uaddl           v4.8h,      v4.8b,      v5.8b
+        ext             v7.16b,     \r1\().16b, \r1\().16b, #1
+        mls             \r0\().8h,  v2.8h,      v6.h[0]
+        ext             v0.16b,     \r1\().16b, \r1\().16b, #4
+        uaddl           v7.8h,      v7.8b,      v0.8b
+        ext             v31.16b,    \r1\().16b, \r1\().16b, #5
+        uaddl           \r1\().8h,  \r1\().8b,  v31.8b
+        mla             \r1\().8h,  v4.8h,      v6.h[1]
+        mls             \r1\().8h,  v7.8h,      v6.h[0]
 .endm
 
 // trashes v2-v5, v30
 .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
-        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
-        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
-        uaddl           v2.8H,     v2.8B,     v3.8B
-        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
-        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
-        uaddl           v4.8H,     v4.8B,     v5.8B
-        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
-        uaddl           \d0\().8H, \r0\().8B, v30.8B
-        mla             \d0\().8H, v2.8H,     v6.H[1]
-        mls             \d0\().8H, v4.8H,     v6.H[0]
-  .if \narrow
-        sqrshrun        \d0\().8B, \d0\().8H, #5
-  .endif
+        ext             v2.8b,     \r0\().8b, \r1\().8b, #2
+        ext             v3.8b,     \r0\().8b, \r1\().8b, #3
+        uaddl           v2.8h,     v2.8b,     v3.8b
+        ext             v4.8b,     \r0\().8b, \r1\().8b, #1
+        ext             v5.8b,     \r0\().8b, \r1\().8b, #4
+        uaddl           v4.8h,     v4.8b,     v5.8b
+        ext             v30.8b,    \r0\().8b, \r1\().8b, #5
+        uaddl           \d0\().8h, \r0\().8b, v30.8b
+        mla             \d0\().8h, v2.8h,     v6.h[1]
+        mls             \d0\().8h, v4.8h,     v6.h[0]
+.if \narrow
+        sqrshrun        \d0\().8b, \d0\().8h, #5
+.endif
 .endm
 
 // trashed v0-v7
 .macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5
-        saddl           v5.4S,      \r2\().4H,  \r3\().4H
-        saddl2          v1.4S,      \r2\().8H,  \r3\().8H
-        saddl           v6.4S,      \r1\().4H,  \r4\().4H
-        saddl2          v2.4S,      \r1\().8H,  \r4\().8H
-        saddl           v0.4S,      \r0\().4H,  \r5\().4H
-        saddl2          v4.4S,      \r0\().8H,  \r5\().8H
-
-        shl             v3.4S,  v5.4S,  #4
-        shl             v5.4S,  v5.4S,  #2
-        shl             v7.4S,  v6.4S,  #2
-        add             v5.4S,  v5.4S,  v3.4S
-        add             v6.4S,  v6.4S,  v7.4S
-
-        shl             v3.4S,  v1.4S,  #4
-        shl             v1.4S,  v1.4S,  #2
-        shl             v7.4S,  v2.4S,  #2
-        add             v1.4S,  v1.4S,  v3.4S
-        add             v2.4S,  v2.4S,  v7.4S
-
-        add             v5.4S,  v5.4S,  v0.4S
-        sub             v5.4S,  v5.4S,  v6.4S
-
-        add             v1.4S,  v1.4S,  v4.4S
-        sub             v1.4S,  v1.4S,  v2.4S
-
-        rshrn           v5.4H,  v5.4S,  #10
-        rshrn2          v5.8H,  v1.4S,  #10
-
-        sqxtun          \r0\().8B,  v5.8H
+        saddl           v5.4s,      \r2\().4h,  \r3\().4h
+        saddl2          v1.4s,      \r2\().8h,  \r3\().8h
+        saddl           v6.4s,      \r1\().4h,  \r4\().4h
+        saddl2          v2.4s,      \r1\().8h,  \r4\().8h
+        saddl           v0.4s,      \r0\().4h,  \r5\().4h
+        saddl2          v4.4s,      \r0\().8h,  \r5\().8h
+
+        shl             v3.4s,  v5.4s,  #4
+        shl             v5.4s,  v5.4s,  #2
+        shl             v7.4s,  v6.4s,  #2
+        add             v5.4s,  v5.4s,  v3.4s
+        add             v6.4s,  v6.4s,  v7.4s
+
+        shl             v3.4s,  v1.4s,  #4
+        shl             v1.4s,  v1.4s,  #2
+        shl             v7.4s,  v2.4s,  #2
+        add             v1.4s,  v1.4s,  v3.4s
+        add             v2.4s,  v2.4s,  v7.4s
+
+        add             v5.4s,  v5.4s,  v0.4s
+        sub             v5.4s,  v5.4s,  v6.4s
+
+        add             v1.4s,  v1.4s,  v4.4s
+        sub             v1.4s,  v1.4s,  v2.4s
+
+        rshrn           v5.4h,  v5.4s,  #10
+        rshrn2          v5.8h,  v1.4s,  #10
+
+        sqxtun          \r0\().8b,  v5.8h
 .endm
 
 function put_h264_qpel16_h_lowpass_neon_packed
@@ -176,19 +176,19 @@  function \type\()_h264_qpel16_h_lowpass_neon
 endfunc
 
 function \type\()_h264_qpel8_h_lowpass_neon
-1:      ld1             {v28.8B, v29.8B}, [x1], x2
-        ld1             {v16.8B, v17.8B}, [x1], x2
+1:      ld1             {v28.8b, v29.8b}, [x1], x2
+        ld1             {v16.8b, v17.8b}, [x1], x2
         subs            x12, x12, #2
         lowpass_8       v28, v29, v16, v17, v28, v16
-  .ifc \type,avg
-        ld1             {v2.8B},    [x0], x3
-        ld1             {v3.8B},    [x0]
-        urhadd          v28.8B, v28.8B,  v2.8B
-        urhadd          v16.8B, v16.8B, v3.8B
+.ifc \type,avg
+        ld1             {v2.8b},    [x0], x3
+        ld1             {v3.8b},    [x0]
+        urhadd          v28.8b, v28.8b,  v2.8b
+        urhadd          v16.8b, v16.8b, v3.8b
         sub             x0,  x0,  x3
-  .endif
-        st1             {v28.8B},    [x0], x3
-        st1             {v16.8B},    [x0], x3
+.endif
+        st1             {v28.8b},    [x0], x3
+        st1             {v16.8b},    [x0], x3
         b.ne            1b
         ret
 endfunc
@@ -213,23 +213,23 @@  function \type\()_h264_qpel16_h_lowpass_l2_neon
 endfunc
 
 function \type\()_h264_qpel8_h_lowpass_l2_neon
-1:      ld1             {v26.8B, v27.8B}, [x1], x2
-        ld1             {v16.8B, v17.8B}, [x1], x2
-        ld1             {v28.8B},     [x3], x2
-        ld1             {v29.8B},     [x3], x2
+1:      ld1             {v26.8b, v27.8b}, [x1], x2
+        ld1             {v16.8b, v17.8b}, [x1], x2
+        ld1             {v28.8b},     [x3], x2
+        ld1             {v29.8b},     [x3], x2
         subs            x12, x12, #2
         lowpass_8       v26, v27, v16, v17, v26, v27
-        urhadd          v26.8B, v26.8B, v28.8B
-        urhadd          v27.8B, v27.8B, v29.8B
-  .ifc \type,avg
-        ld1             {v2.8B},      [x0], x2
-        ld1             {v3.8B},      [x0]
-        urhadd          v26.8B, v26.8B, v2.8B
-        urhadd          v27.8B, v27.8B, v3.8B
+        urhadd          v26.8b, v26.8b, v28.8b
+        urhadd          v27.8b, v27.8b, v29.8b
+.ifc \type,avg
+        ld1             {v2.8b},      [x0], x2
+        ld1             {v3.8b},      [x0]
+        urhadd          v26.8b, v26.8b, v2.8b
+        urhadd          v27.8b, v27.8b, v3.8b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v26.8B},     [x0], x2
-        st1             {v27.8B},     [x0], x2
+.endif
+        st1             {v26.8b},     [x0], x2
+        st1             {v27.8b},     [x0], x2
         b.ne            1b
         ret
 endfunc
@@ -270,52 +270,52 @@  function \type\()_h264_qpel16_v_lowpass_neon
 endfunc
 
 function \type\()_h264_qpel8_v_lowpass_neon
-        ld1             {v16.8B}, [x1], x3
-        ld1             {v17.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v19.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v21.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v23.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v25.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v27.8B}, [x1], x3
-        ld1             {v28.8B}, [x1]
+        ld1             {v16.8b}, [x1], x3
+        ld1             {v17.8b}, [x1], x3
+        ld1             {v18.8b}, [x1], x3
+        ld1             {v19.8b}, [x1], x3
+        ld1             {v20.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
+        ld1             {v22.8b}, [x1], x3
+        ld1             {v23.8b}, [x1], x3
+        ld1             {v24.8b}, [x1], x3
+        ld1             {v25.8b}, [x1], x3
+        ld1             {v26.8b}, [x1], x3
+        ld1             {v27.8b}, [x1], x3
+        ld1             {v28.8b}, [x1]
 
         lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
         lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
         lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
         lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
-  .ifc \type,avg
-        ld1             {v24.8B},  [x0], x2
-        ld1             {v25.8B}, [x0], x2
-        ld1             {v26.8B}, [x0], x2
-        urhadd          v16.8B, v16.8B, v24.8B
-        ld1             {v27.8B}, [x0], x2
-        urhadd          v17.8B, v17.8B, v25.8B
-        ld1             {v28.8B}, [x0], x2
-        urhadd          v18.8B, v18.8B, v26.8B
-        ld1             {v29.8B}, [x0], x2
-        urhadd          v19.8B, v19.8B, v27.8B
-        ld1             {v30.8B}, [x0], x2
-        urhadd          v20.8B, v20.8B, v28.8B
-        ld1             {v31.8B}, [x0], x2
-        urhadd          v21.8B, v21.8B, v29.8B
-        urhadd          v22.8B, v22.8B, v30.8B
-        urhadd          v23.8B, v23.8B, v31.8B
+.ifc \type,avg
+        ld1             {v24.8b},  [x0], x2
+        ld1             {v25.8b}, [x0], x2
+        ld1             {v26.8b}, [x0], x2
+        urhadd          v16.8b, v16.8b, v24.8b
+        ld1             {v27.8b}, [x0], x2
+        urhadd          v17.8b, v17.8b, v25.8b
+        ld1             {v28.8b}, [x0], x2
+        urhadd          v18.8b, v18.8b, v26.8b
+        ld1             {v29.8b}, [x0], x2
+        urhadd          v19.8b, v19.8b, v27.8b
+        ld1             {v30.8b}, [x0], x2
+        urhadd          v20.8b, v20.8b, v28.8b
+        ld1             {v31.8b}, [x0], x2
+        urhadd          v21.8b, v21.8b, v29.8b
+        urhadd          v22.8b, v22.8b, v30.8b
+        urhadd          v23.8b, v23.8b, v31.8b
         sub             x0,  x0,  x2,  lsl #3
-  .endif
+.endif
 
-        st1             {v16.8B}, [x0], x2
-        st1             {v17.8B}, [x0], x2
-        st1             {v18.8B}, [x0], x2
-        st1             {v19.8B}, [x0], x2
-        st1             {v20.8B}, [x0], x2
-        st1             {v21.8B}, [x0], x2
-        st1             {v22.8B}, [x0], x2
-        st1             {v23.8B}, [x0], x2
+        st1             {v16.8b}, [x0], x2
+        st1             {v17.8b}, [x0], x2
+        st1             {v18.8b}, [x0], x2
+        st1             {v19.8b}, [x0], x2
+        st1             {v20.8b}, [x0], x2
+        st1             {v21.8b}, [x0], x2
+        st1             {v22.8b}, [x0], x2
+        st1             {v23.8b}, [x0], x2
 
         ret
 endfunc
@@ -343,70 +343,70 @@  function \type\()_h264_qpel16_v_lowpass_l2_neon
 endfunc
 
 function \type\()_h264_qpel8_v_lowpass_l2_neon
-        ld1             {v16.8B}, [x1], x3
-        ld1             {v17.8B}, [x1], x3
-        ld1             {v18.8B}, [x1], x3
-        ld1             {v19.8B}, [x1], x3
-        ld1             {v20.8B}, [x1], x3
-        ld1             {v21.8B}, [x1], x3
-        ld1             {v22.8B}, [x1], x3
-        ld1             {v23.8B}, [x1], x3
-        ld1             {v24.8B}, [x1], x3
-        ld1             {v25.8B}, [x1], x3
-        ld1             {v26.8B}, [x1], x3
-        ld1             {v27.8B}, [x1], x3
-        ld1             {v28.8B}, [x1]
+        ld1             {v16.8b}, [x1], x3
+        ld1             {v17.8b}, [x1], x3
+        ld1             {v18.8b}, [x1], x3
+        ld1             {v19.8b}, [x1], x3
+        ld1             {v20.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
+        ld1             {v22.8b}, [x1], x3
+        ld1             {v23.8b}, [x1], x3
+        ld1             {v24.8b}, [x1], x3
+        ld1             {v25.8b}, [x1], x3
+        ld1             {v26.8b}, [x1], x3
+        ld1             {v27.8b}, [x1], x3
+        ld1             {v28.8b}, [x1]
 
         lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17
         lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19
         lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21
         lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23
 
-        ld1             {v24.8B},  [x12], x2
-        ld1             {v25.8B},  [x12], x2
-        ld1             {v26.8B},  [x12], x2
-        ld1             {v27.8B},  [x12], x2
-        ld1             {v28.8B},  [x12], x2
-        urhadd          v16.8B, v24.8B, v16.8B
-        urhadd          v17.8B, v25.8B, v17.8B
-        ld1             {v29.8B},  [x12], x2
-        urhadd          v18.8B, v26.8B, v18.8B
-        urhadd          v19.8B, v27.8B, v19.8B
-        ld1             {v30.8B}, [x12], x2
-        urhadd          v20.8B, v28.8B, v20.8B
-        urhadd          v21.8B, v29.8B, v21.8B
-        ld1             {v31.8B}, [x12], x2
-        urhadd          v22.8B, v30.8B, v22.8B
-        urhadd          v23.8B, v31.8B, v23.8B
-
-  .ifc \type,avg
-        ld1             {v24.8B}, [x0], x3
-        ld1             {v25.8B}, [x0], x3
-        ld1             {v26.8B}, [x0], x3
-        urhadd          v16.8B, v16.8B, v24.8B
-        ld1             {v27.8B}, [x0], x3
-        urhadd          v17.8B, v17.8B, v25.8B
-        ld1             {v28.8B}, [x0], x3
-        urhadd          v18.8B, v18.8B, v26.8B
-        ld1             {v29.8B}, [x0], x3
-        urhadd          v19.8B, v19.8B, v27.8B
-        ld1             {v30.8B}, [x0], x3
-        urhadd          v20.8B, v20.8B, v28.8B
-        ld1             {v31.8B}, [x0], x3
-        urhadd          v21.8B, v21.8B, v29.8B
-        urhadd          v22.8B, v22.8B, v30.8B
-        urhadd          v23.8B, v23.8B, v31.8B
+        ld1             {v24.8b},  [x12], x2
+        ld1             {v25.8b},  [x12], x2
+        ld1             {v26.8b},  [x12], x2
+        ld1             {v27.8b},  [x12], x2
+        ld1             {v28.8b},  [x12], x2
+        urhadd          v16.8b, v24.8b, v16.8b
+        urhadd          v17.8b, v25.8b, v17.8b
+        ld1             {v29.8b},  [x12], x2
+        urhadd          v18.8b, v26.8b, v18.8b
+        urhadd          v19.8b, v27.8b, v19.8b
+        ld1             {v30.8b}, [x12], x2
+        urhadd          v20.8b, v28.8b, v20.8b
+        urhadd          v21.8b, v29.8b, v21.8b
+        ld1             {v31.8b}, [x12], x2
+        urhadd          v22.8b, v30.8b, v22.8b
+        urhadd          v23.8b, v31.8b, v23.8b
+
+.ifc \type,avg
+        ld1             {v24.8b}, [x0], x3
+        ld1             {v25.8b}, [x0], x3
+        ld1             {v26.8b}, [x0], x3
+        urhadd          v16.8b, v16.8b, v24.8b
+        ld1             {v27.8b}, [x0], x3
+        urhadd          v17.8b, v17.8b, v25.8b
+        ld1             {v28.8b}, [x0], x3
+        urhadd          v18.8b, v18.8b, v26.8b
+        ld1             {v29.8b}, [x0], x3
+        urhadd          v19.8b, v19.8b, v27.8b
+        ld1             {v30.8b}, [x0], x3
+        urhadd          v20.8b, v20.8b, v28.8b
+        ld1             {v31.8b}, [x0], x3
+        urhadd          v21.8b, v21.8b, v29.8b
+        urhadd          v22.8b, v22.8b, v30.8b
+        urhadd          v23.8b, v23.8b, v31.8b
         sub             x0,  x0,  x3,  lsl #3
-  .endif
+.endif
 
-        st1             {v16.8B}, [x0], x3
-        st1             {v17.8B}, [x0], x3
-        st1             {v18.8B}, [x0], x3
-        st1             {v19.8B}, [x0], x3
-        st1             {v20.8B}, [x0], x3
-        st1             {v21.8B}, [x0], x3
-        st1             {v22.8B}, [x0], x3
-        st1             {v23.8B}, [x0], x3
+        st1             {v16.8b}, [x0], x3
+        st1             {v17.8b}, [x0], x3
+        st1             {v18.8b}, [x0], x3
+        st1             {v19.8b}, [x0], x3
+        st1             {v20.8b}, [x0], x3
+        st1             {v21.8b}, [x0], x3
+        st1             {v22.8b}, [x0], x3
+        st1             {v23.8b}, [x0], x3
 
         ret
 endfunc
@@ -416,20 +416,20 @@  endfunc
         h264_qpel_v_lowpass_l2 avg
 
 function put_h264_qpel8_hv_lowpass_neon_top
-        lowpass_const   w12
-        ld1             {v16.8H}, [x1], x3
-        ld1             {v17.8H}, [x1], x3
-        ld1             {v18.8H}, [x1], x3
-        ld1             {v19.8H}, [x1], x3
-        ld1             {v20.8H}, [x1], x3
-        ld1             {v21.8H}, [x1], x3
-        ld1             {v22.8H}, [x1], x3
-        ld1             {v23.8H}, [x1], x3
-        ld1             {v24.8H}, [x1], x3
-        ld1             {v25.8H}, [x1], x3
-        ld1             {v26.8H}, [x1], x3
-        ld1             {v27.8H}, [x1], x3
-        ld1             {v28.8H}, [x1]
+        lowpass_const w12
+        ld1             {v16.8h}, [x1], x3
+        ld1             {v17.8h}, [x1], x3
+        ld1             {v18.8h}, [x1], x3
+        ld1             {v19.8h}, [x1], x3
+        ld1             {v20.8h}, [x1], x3
+        ld1             {v21.8h}, [x1], x3
+        ld1             {v22.8h}, [x1], x3
+        ld1             {v23.8h}, [x1], x3
+        ld1             {v24.8h}, [x1], x3
+        ld1             {v25.8h}, [x1], x3
+        ld1             {v26.8h}, [x1], x3
+        ld1             {v27.8h}, [x1], x3
+        ld1             {v28.8h}, [x1]
         lowpass_8H      v16, v17
         lowpass_8H      v18, v19
         lowpass_8H      v20, v21
@@ -457,34 +457,34 @@  endfunc
 function \type\()_h264_qpel8_hv_lowpass_neon
         mov             x10, x30
         bl              put_h264_qpel8_hv_lowpass_neon_top
-  .ifc \type,avg
-        ld1             {v0.8B},      [x0], x2
-        ld1             {v1.8B},      [x0], x2
-        ld1             {v2.8B},      [x0], x2
-        urhadd          v16.8B, v16.8B, v0.8B
-        ld1             {v3.8B},      [x0], x2
-        urhadd          v17.8B, v17.8B, v1.8B
-        ld1             {v4.8B},      [x0], x2
-        urhadd          v18.8B, v18.8B, v2.8B
-        ld1             {v5.8B},      [x0], x2
-        urhadd          v19.8B, v19.8B, v3.8B
-        ld1             {v6.8B},      [x0], x2
-        urhadd          v20.8B, v20.8B, v4.8B
-        ld1             {v7.8B},      [x0], x2
-        urhadd          v21.8B, v21.8B, v5.8B
-        urhadd          v22.8B, v22.8B, v6.8B
-        urhadd          v23.8B, v23.8B, v7.8B
+.ifc \type,avg
+        ld1             {v0.8b},      [x0], x2
+        ld1             {v1.8b},      [x0], x2
+        ld1             {v2.8b},      [x0], x2
+        urhadd          v16.8b, v16.8b, v0.8b
+        ld1             {v3.8b},      [x0], x2
+        urhadd          v17.8b, v17.8b, v1.8b
+        ld1             {v4.8b},      [x0], x2
+        urhadd          v18.8b, v18.8b, v2.8b
+        ld1             {v5.8b},      [x0], x2
+        urhadd          v19.8b, v19.8b, v3.8b
+        ld1             {v6.8b},      [x0], x2
+        urhadd          v20.8b, v20.8b, v4.8b
+        ld1             {v7.8b},      [x0], x2
+        urhadd          v21.8b, v21.8b, v5.8b
+        urhadd          v22.8b, v22.8b, v6.8b
+        urhadd          v23.8b, v23.8b, v7.8b
         sub             x0,  x0,  x2,  lsl #3
-  .endif
+.endif
 
-        st1             {v16.8B},     [x0], x2
-        st1             {v17.8B},     [x0], x2
-        st1             {v18.8B},     [x0], x2
-        st1             {v19.8B},     [x0], x2
-        st1             {v20.8B},     [x0], x2
-        st1             {v21.8B},     [x0], x2
-        st1             {v22.8B},     [x0], x2
-        st1             {v23.8B},     [x0], x2
+        st1             {v16.8b},     [x0], x2
+        st1             {v17.8b},     [x0], x2
+        st1             {v18.8b},     [x0], x2
+        st1             {v19.8b},     [x0], x2
+        st1             {v20.8b},     [x0], x2
+        st1             {v21.8b},     [x0], x2
+        st1             {v22.8b},     [x0], x2
+        st1             {v23.8b},     [x0], x2
 
         ret             x10
 endfunc
@@ -498,45 +498,45 @@  function \type\()_h264_qpel8_hv_lowpass_l2_neon
         mov             x10, x30
         bl              put_h264_qpel8_hv_lowpass_neon_top
 
-        ld1             {v0.8B, v1.8B},  [x2], #16
-        ld1             {v2.8B, v3.8B},  [x2], #16
-        urhadd          v0.8B,  v0.8B,  v16.8B
-        urhadd          v1.8B,  v1.8B,  v17.8B
-        ld1             {v4.8B, v5.8B},  [x2], #16
-        urhadd          v2.8B,  v2.8B,  v18.8B
-        urhadd          v3.8B,  v3.8B,  v19.8B
-        ld1             {v6.8B, v7.8B},  [x2], #16
-        urhadd          v4.8B,  v4.8B,  v20.8B
-        urhadd          v5.8B,  v5.8B,  v21.8B
-        urhadd          v6.8B,  v6.8B,  v22.8B
-        urhadd          v7.8B,  v7.8B,  v23.8B
-  .ifc \type,avg
-        ld1             {v16.8B},     [x0], x3
-        ld1             {v17.8B},     [x0], x3
-        ld1             {v18.8B},     [x0], x3
-        urhadd          v0.8B,  v0.8B,  v16.8B
-        ld1             {v19.8B},     [x0], x3
-        urhadd          v1.8B,  v1.8B,  v17.8B
-        ld1             {v20.8B},     [x0], x3
-        urhadd          v2.8B,  v2.8B,  v18.8B
-        ld1             {v21.8B},     [x0], x3
-        urhadd          v3.8B,  v3.8B,  v19.8B
-        ld1             {v22.8B},     [x0], x3
-        urhadd          v4.8B,  v4.8B,  v20.8B
-        ld1             {v23.8B},     [x0], x3
-        urhadd          v5.8B,  v5.8B,  v21.8B
-        urhadd          v6.8B,  v6.8B,  v22.8B
-        urhadd          v7.8B,  v7.8B,  v23.8B
+        ld1             {v0.8b, v1.8b},  [x2], #16
+        ld1             {v2.8b, v3.8b},  [x2], #16
+        urhadd          v0.8b,  v0.8b,  v16.8b
+        urhadd          v1.8b,  v1.8b,  v17.8b
+        ld1             {v4.8b, v5.8b},  [x2], #16
+        urhadd          v2.8b,  v2.8b,  v18.8b
+        urhadd          v3.8b,  v3.8b,  v19.8b
+        ld1             {v6.8b, v7.8b},  [x2], #16
+        urhadd          v4.8b,  v4.8b,  v20.8b
+        urhadd          v5.8b,  v5.8b,  v21.8b
+        urhadd          v6.8b,  v6.8b,  v22.8b
+        urhadd          v7.8b,  v7.8b,  v23.8b
+.ifc \type,avg
+        ld1             {v16.8b},     [x0], x3
+        ld1             {v17.8b},     [x0], x3
+        ld1             {v18.8b},     [x0], x3
+        urhadd          v0.8b,  v0.8b,  v16.8b
+        ld1             {v19.8b},     [x0], x3
+        urhadd          v1.8b,  v1.8b,  v17.8b
+        ld1             {v20.8b},     [x0], x3
+        urhadd          v2.8b,  v2.8b,  v18.8b
+        ld1             {v21.8b},     [x0], x3
+        urhadd          v3.8b,  v3.8b,  v19.8b
+        ld1             {v22.8b},     [x0], x3
+        urhadd          v4.8b,  v4.8b,  v20.8b
+        ld1             {v23.8b},     [x0], x3
+        urhadd          v5.8b,  v5.8b,  v21.8b
+        urhadd          v6.8b,  v6.8b,  v22.8b
+        urhadd          v7.8b,  v7.8b,  v23.8b
         sub             x0,  x0,  x3,  lsl #3
-  .endif
-        st1             {v0.8B},      [x0], x3
-        st1             {v1.8B},      [x0], x3
-        st1             {v2.8B},      [x0], x3
-        st1             {v3.8B},      [x0], x3
-        st1             {v4.8B},      [x0], x3
-        st1             {v5.8B},      [x0], x3
-        st1             {v6.8B},      [x0], x3
-        st1             {v7.8B},      [x0], x3
+.endif
+        st1             {v0.8b},      [x0], x3
+        st1             {v1.8b},      [x0], x3
+        st1             {v2.8b},      [x0], x3
+        st1             {v3.8b},      [x0], x3
+        st1             {v4.8b},      [x0], x3
+        st1             {v5.8b},      [x0], x3
+        st1             {v6.8b},      [x0], x3
+        st1             {v7.8b},      [x0], x3
 
         ret             x10
 endfunc
@@ -580,12 +580,12 @@  function \type\()_h264_qpel16_hv_lowpass_l2_neon
 endfunc
 .endm
 
-        h264_qpel16_hv put
-        h264_qpel16_hv avg
+        h264_qpel16_hv  put
+        h264_qpel16_hv  avg
 
 .macro  h264_qpel8      type
 function ff_\type\()_h264_qpel8_mc10_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         mov             x3,  x1
         sub             x1,  x1,  #2
         mov             x12, #8
@@ -593,7 +593,7 @@  function ff_\type\()_h264_qpel8_mc10_neon, export=1
 endfunc
 
 function ff_\type\()_h264_qpel8_mc20_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         sub             x1,  x1,  #2
         mov             x3,  x2
         mov             x12, #8
@@ -601,7 +601,7 @@  function ff_\type\()_h264_qpel8_mc20_neon, export=1
 endfunc
 
 function ff_\type\()_h264_qpel8_mc30_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         add             x3,  x1,  #1
         sub             x1,  x1,  #2
         mov             x12, #8
@@ -612,7 +612,7 @@  function ff_\type\()_h264_qpel8_mc01_neon, export=1
         mov             x14, x30
         mov             x12, x1
 \type\()_h264_qpel8_mc01:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x3,  x2
         sub             x1,  x1,  x2, lsl #1
         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
@@ -624,7 +624,7 @@  function ff_\type\()_h264_qpel8_mc11_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel8_mc11:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #64
         mov             x0,  sp
@@ -647,7 +647,7 @@  function ff_\type\()_h264_qpel8_mc21_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel8_mc21:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #(8*8+16*12)
         sub             x1,  x1,  #2
@@ -677,7 +677,7 @@  endfunc
 
 function ff_\type\()_h264_qpel8_mc02_neon, export=1
         mov             x14, x30
-        lowpass_const   w3
+        lowpass_const w3
         sub             x1,  x1,  x2, lsl #1
         mov             x3,  x2
         bl              \type\()_h264_qpel8_v_lowpass_neon
@@ -689,7 +689,7 @@  function ff_\type\()_h264_qpel8_mc12_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel8_mc12:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #(8*8+16*12)
         sub             x1,  x1,  x2, lsl #1
@@ -759,26 +759,26 @@  function ff_\type\()_h264_qpel8_mc33_neon, export=1
 endfunc
 .endm
 
-        h264_qpel8 put
-        h264_qpel8 avg
+        h264_qpel8      put
+        h264_qpel8      avg
 
 .macro  h264_qpel16     type
 function ff_\type\()_h264_qpel16_mc10_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         mov             x3,  x1
         sub             x1,  x1,  #2
         b               \type\()_h264_qpel16_h_lowpass_l2_neon
 endfunc
 
 function ff_\type\()_h264_qpel16_mc20_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         sub             x1,  x1,  #2
         mov             x3,  x2
         b               \type\()_h264_qpel16_h_lowpass_neon
 endfunc
 
 function ff_\type\()_h264_qpel16_mc30_neon, export=1
-        lowpass_const   w3
+        lowpass_const w3
         add             x3,  x1,  #1
         sub             x1,  x1,  #2
         b               \type\()_h264_qpel16_h_lowpass_l2_neon
@@ -788,7 +788,7 @@  function ff_\type\()_h264_qpel16_mc01_neon, export=1
         mov             x14, x30
         mov             x12, x1
 \type\()_h264_qpel16_mc01:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x3,  x2
         sub             x1,  x1,  x2, lsl #1
         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
@@ -800,7 +800,7 @@  function ff_\type\()_h264_qpel16_mc11_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel16_mc11:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #256
         mov             x0,  sp
@@ -822,7 +822,7 @@  function ff_\type\()_h264_qpel16_mc21_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel16_mc21:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #(16*16+16*12)
         sub             x1,  x1,  #2
@@ -849,7 +849,7 @@  endfunc
 
 function ff_\type\()_h264_qpel16_mc02_neon, export=1
         mov             x14, x30
-        lowpass_const   w3
+        lowpass_const w3
         sub             x1,  x1,  x2, lsl #1
         mov             x3,  x2
         bl              \type\()_h264_qpel16_v_lowpass_neon
@@ -861,7 +861,7 @@  function ff_\type\()_h264_qpel16_mc12_neon, export=1
         mov             x8,  x0
         mov             x9,  x1
 \type\()_h264_qpel16_mc12:
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             sp,  sp,  #(16*16+16*12)
         sub             x1,  x1,  x2, lsl #1
@@ -880,7 +880,7 @@  endfunc
 
 function ff_\type\()_h264_qpel16_mc22_neon, export=1
         mov             x14, x30
-        lowpass_const   w3
+        lowpass_const w3
         mov             x11, sp
         sub             x1,  x1,  x2, lsl #1
         sub             x1,  x1,  #2
@@ -931,5 +931,5 @@  function ff_\type\()_h264_qpel16_mc33_neon, export=1
 endfunc
 .endm
 
-        h264_qpel16 put
-        h264_qpel16 avg
+        h264_qpel16     put
+        h264_qpel16     avg
diff --git a/libavcodec/aarch64/hevcdsp_idct_neon.S b/libavcodec/aarch64/hevcdsp_idct_neon.S
index 0869431294..9931584bf3 100644
--- a/libavcodec/aarch64/hevcdsp_idct_neon.S
+++ b/libavcodec/aarch64/hevcdsp_idct_neon.S
@@ -27,21 +27,21 @@ 
 #include "libavutil/aarch64/asm.S"
 
 const trans, align=4
-        .short 64, 83, 64, 36
-        .short 89, 75, 50, 18
-        .short 90, 87, 80, 70
-        .short 57, 43, 25, 9
-        .short 90, 90, 88, 85
-        .short 82, 78, 73, 67
-        .short 61, 54, 46, 38
-        .short 31, 22, 13, 4
+        .short          64, 83, 64, 36
+        .short          89, 75, 50, 18
+        .short          90, 87, 80, 70
+        .short          57, 43, 25, 9
+        .short          90, 90, 88, 85
+        .short          82, 78, 73, 67
+        .short          61, 54, 46, 38
+        .short          31, 22, 13, 4
 endconst
 
 .macro clip10 in1, in2, c1, c2
-        smax        \in1, \in1, \c1
-        smax        \in2, \in2, \c1
-        smin        \in1, \in1, \c2
-        smin        \in2, \in2, \c2
+        smax            \in1, \in1, \c1
+        smax            \in2, \in2, \c1
+        smin            \in1, \in1, \c2
+        smin            \in2, \in2, \c2
 .endm
 
 function ff_hevc_add_residual_4x4_8_neon, export=1
@@ -50,13 +50,13 @@  function ff_hevc_add_residual_4x4_8_neon, export=1
         ld1             {v2.s}[1], [x0], x2
         ld1             {v2.s}[2], [x0], x2
         ld1             {v2.s}[3], [x0], x2
-        sub              x0,  x0,  x2, lsl #2
-        uxtl             v6.8h,  v2.8b
-        uxtl2            v7.8h,  v2.16b
-        sqadd            v0.8h,  v0.8h, v6.8h
-        sqadd            v1.8h,  v1.8h, v7.8h
-        sqxtun           v0.8b,  v0.8h
-        sqxtun2          v0.16b, v1.8h
+        sub             x0,  x0,  x2, lsl #2
+        uxtl            v6.8h,  v2.8b
+        uxtl2           v7.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h, v6.8h
+        sqadd           v1.8h,  v1.8h, v7.8h
+        sqxtun          v0.8b,  v0.8h
+        sqxtun2         v0.16b, v1.8h
         st1             {v0.s}[0], [x0], x2
         st1             {v0.s}[1], [x0], x2
         st1             {v0.s}[2], [x0], x2
@@ -70,12 +70,12 @@  function ff_hevc_add_residual_4x4_10_neon, export=1
         ld1             {v2.d}[0], [x12], x2
         ld1             {v2.d}[1], [x12], x2
         ld1             {v3.d}[0], [x12], x2
-        sqadd            v0.8h, v0.8h, v2.8h
+        sqadd           v0.8h, v0.8h, v2.8h
         ld1             {v3.d}[1], [x12], x2
-        movi             v4.8h, #0
-        sqadd            v1.8h, v1.8h, v3.8h
-        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
-        clip10           v0.8h, v1.8h, v4.8h, v5.8h
+        movi            v4.8h, #0
+        sqadd           v1.8h, v1.8h, v3.8h
+        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
         st1             {v0.d}[0],  [x0], x2
         st1             {v0.d}[1],  [x0], x2
         st1             {v1.d}[0],  [x0], x2
@@ -85,48 +85,48 @@  endfunc
 
 function ff_hevc_add_residual_8x8_8_neon, export=1
         add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #8
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+        mov             x3,  #8
+1:      subs            x3,  x3, #2
         ld1             {v2.d}[0],     [x0]
         ld1             {v2.d}[1],    [x12]
-        uxtl             v3.8h,  v2.8b
+        uxtl            v3.8h,  v2.8b
         ld1             {v0.8h-v1.8h}, [x1], #32
-        uxtl2            v2.8h,  v2.16b
-        sqadd            v0.8h,  v0.8h,   v3.8h
-        sqadd            v1.8h,  v1.8h,   v2.8h
-        sqxtun           v0.8b,  v0.8h
-        sqxtun2          v0.16b, v1.8h
+        uxtl2           v2.8h,  v2.16b
+        sqadd           v0.8h,  v0.8h,   v3.8h
+        sqadd           v1.8h,  v1.8h,   v2.8h
+        sqxtun          v0.8b,  v0.8h
+        sqxtun2         v0.16b, v1.8h
         st1             {v0.d}[0],     [x0], x2
         st1             {v0.d}[1],    [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_8x8_10_neon, export=1
         add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #8
-        movi             v4.8h, #0
-        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+        mov             x3,  #8
+        movi            v4.8h, #0
+        mvni            v5.8h, #0xFC, lsl #8 // movi #0x3FF
+1:      subs            x3,  x3, #2
         ld1             {v0.8h-v1.8h}, [x1], #32
         ld1             {v2.8h},       [x0]
-        sqadd            v0.8h, v0.8h, v2.8h
+        sqadd           v0.8h, v0.8h, v2.8h
         ld1             {v3.8h},      [x12]
-        sqadd            v1.8h, v1.8h, v3.8h
-        clip10           v0.8h, v1.8h, v4.8h, v5.8h
+        sqadd           v1.8h, v1.8h, v3.8h
+        clip10          v0.8h, v1.8h, v4.8h, v5.8h
         st1             {v0.8h},       [x0], x2
         st1             {v1.8h},      [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_16x16_8_neon, export=1
-        mov              x3,  #16
+        mov             x3,  #16
         add             x12, x0, x2
-        add              x2,  x2, x2
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
         ld1             {v16.16b},     [x0]
         ld1             {v0.8h-v3.8h}, [x1], #64
         ld1             {v19.16b},    [x12]
@@ -134,47 +134,47 @@  function ff_hevc_add_residual_16x16_8_neon, export=1
         uxtl2           v18.8h, v16.16b
         uxtl            v20.8h, v19.8b
         uxtl2           v21.8h, v19.16b
-        sqadd            v0.8h,  v0.8h, v17.8h
-        sqadd            v1.8h,  v1.8h, v18.8h
-        sqadd            v2.8h,  v2.8h, v20.8h
-        sqadd            v3.8h,  v3.8h, v21.8h
-        sqxtun           v0.8b,  v0.8h
+        sqadd           v0.8h,  v0.8h, v17.8h
+        sqadd           v1.8h,  v1.8h, v18.8h
+        sqadd           v2.8h,  v2.8h, v20.8h
+        sqadd           v3.8h,  v3.8h, v21.8h
+        sqxtun          v0.8b,  v0.8h
         sqxtun2         v0.16b,  v1.8h
-        sqxtun           v1.8b,  v2.8h
+        sqxtun          v1.8b,  v2.8h
         sqxtun2         v1.16b,  v3.8h
         st1             {v0.16b},     [x0], x2
         st1             {v1.16b},    [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_16x16_10_neon, export=1
-        mov              x3,  #16
+        mov             x3,  #16
         movi            v20.8h, #0
         mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
         add             x12,  x0, x2
-        add              x2,  x2, x2
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+1:      subs            x3,  x3, #2
         ld1             {v16.8h-v17.8h}, [x0]
         ld1             {v0.8h-v3.8h},  [x1], #64
-        sqadd            v0.8h, v0.8h, v16.8h
+        sqadd           v0.8h, v0.8h, v16.8h
         ld1             {v18.8h-v19.8h}, [x12]
-        sqadd            v1.8h, v1.8h, v17.8h
-        sqadd            v2.8h, v2.8h, v18.8h
-        sqadd            v3.8h, v3.8h, v19.8h
-        clip10           v0.8h, v1.8h, v20.8h, v21.8h
-        clip10           v2.8h, v3.8h, v20.8h, v21.8h
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v1.8h},   [x0], x2
         st1             {v2.8h-v3.8h},  [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_32x32_8_neon, export=1
         add             x12,  x0, x2
-        add              x2,  x2, x2
-        mov              x3,  #32
-1:      subs             x3,  x3, #2
+        add             x2,  x2, x2
+        mov             x3,  #32
+1:      subs            x3,  x3, #2
         ld1             {v20.16b, v21.16b}, [x0]
         uxtl            v16.8h,  v20.8b
         uxtl2           v17.8h,  v20.16b
@@ -187,78 +187,78 @@  function ff_hevc_add_residual_32x32_8_neon, export=1
         uxtl2           v21.8h,  v22.16b
         uxtl            v22.8h,  v23.8b
         uxtl2           v23.8h,  v23.16b
-        sqadd            v0.8h,  v0.8h,  v16.8h
-        sqadd            v1.8h,  v1.8h,  v17.8h
-        sqadd            v2.8h,  v2.8h,  v18.8h
-        sqadd            v3.8h,  v3.8h,  v19.8h
-        sqadd            v4.8h,  v4.8h,  v20.8h
-        sqadd            v5.8h,  v5.8h,  v21.8h
-        sqadd            v6.8h,  v6.8h,  v22.8h
-        sqadd            v7.8h,  v7.8h,  v23.8h
-        sqxtun           v0.8b,  v0.8h
+        sqadd           v0.8h,  v0.8h,  v16.8h
+        sqadd           v1.8h,  v1.8h,  v17.8h
+        sqadd           v2.8h,  v2.8h,  v18.8h
+        sqadd           v3.8h,  v3.8h,  v19.8h
+        sqadd           v4.8h,  v4.8h,  v20.8h
+        sqadd           v5.8h,  v5.8h,  v21.8h
+        sqadd           v6.8h,  v6.8h,  v22.8h
+        sqadd           v7.8h,  v7.8h,  v23.8h
+        sqxtun          v0.8b,  v0.8h
         sqxtun2         v0.16b,  v1.8h
-        sqxtun           v1.8b,  v2.8h
+        sqxtun          v1.8b,  v2.8h
         sqxtun2         v1.16b,  v3.8h
-        sqxtun           v2.8b,  v4.8h
+        sqxtun          v2.8b,  v4.8h
         sqxtun2         v2.16b,  v5.8h
         st1             {v0.16b, v1.16b},  [x0], x2
-        sqxtun           v3.8b,  v6.8h
+        sqxtun          v3.8b,  v6.8h
         sqxtun2         v3.16b,  v7.8h
         st1             {v2.16b, v3.16b}, [x12], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 function ff_hevc_add_residual_32x32_10_neon, export=1
-        mov              x3,  #32
+        mov             x3,  #32
         movi            v20.8h, #0
         mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
-1:      subs             x3,  x3, #1
+1:      subs            x3,  x3, #1
         ld1             {v0.8h-v3.8h},   [x1], #64
         ld1             {v16.8h-v19.8h}, [x0]
-        sqadd            v0.8h, v0.8h, v16.8h
-        sqadd            v1.8h, v1.8h, v17.8h
-        sqadd            v2.8h, v2.8h, v18.8h
-        sqadd            v3.8h, v3.8h, v19.8h
-        clip10           v0.8h, v1.8h, v20.8h, v21.8h
-        clip10           v2.8h, v3.8h, v20.8h, v21.8h
+        sqadd           v0.8h, v0.8h, v16.8h
+        sqadd           v1.8h, v1.8h, v17.8h
+        sqadd           v2.8h, v2.8h, v18.8h
+        sqadd           v3.8h, v3.8h, v19.8h
+        clip10          v0.8h, v1.8h, v20.8h, v21.8h
+        clip10          v2.8h, v3.8h, v20.8h, v21.8h
         st1             {v0.8h-v3.8h},   [x0], x2
-        bne              1b
+        bne             1b
         ret
 endfunc
 
 .macro sum_sub out, in, c, op, p
-  .ifc \op, +
+.ifc \op, +
         smlal\p         \out, \in, \c
-  .else
+.else
         smlsl\p         \out, \in, \c
-  .endif
+.endif
 .endm
 
 .macro fixsqrshrn d, dt, n, m
-  .ifc \dt, .8h
+.ifc \dt, .8h
         sqrshrn2        \d\dt, \n\().4s, \m
-  .else
+.else
         sqrshrn         \n\().4h, \n\().4s, \m
         mov             \d\().d[0], \n\().d[0]
-  .endif
+.endif
 .endm
 
 // uses and clobbers v28-v31 as temp registers
 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
-         sshll\p1       v28.4s, \in0, #6
-         mov            v29.16b, v28.16b
-         smull\p1       v30.4s, \in1, v0.h[1]
-         smull\p1       v31.4s, \in1, v0.h[3]
-         smlal\p2       v28.4s, \in2, v0.h[0] //e0
-         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
-         smlal\p2       v30.4s, \in3, v0.h[3] //o0
-         smlsl\p2       v31.4s, \in3, v0.h[1] //o1
-
-         add            \out0, v28.4s, v30.4s
-         add            \out1, v29.4s, v31.4s
-         sub            \out2, v29.4s, v31.4s
-         sub            \out3, v28.4s, v30.4s
+        sshll\p1        v28.4s, \in0, #6
+        mov             v29.16b, v28.16b
+        smull\p1        v30.4s, \in1, v0.h[1]
+        smull\p1        v31.4s, \in1, v0.h[3]
+        smlal\p2        v28.4s, \in2, v0.h[0] //e0
+        smlsl\p2        v29.4s, \in2, v0.h[0] //e1
+        smlal\p2        v30.4s, \in3, v0.h[3] //o0
+        smlsl\p2        v31.4s, \in3, v0.h[1] //o1
+
+        add             \out0, v28.4s, v30.4s
+        add             \out1, v29.4s, v31.4s
+        sub             \out2, v29.4s, v31.4s
+        sub             \out3, v28.4s, v30.4s
 .endm
 
 .macro transpose8_4x4 r0, r1, r2, r3
@@ -325,11 +325,11 @@  endfunc
 .macro idct_8x8 bitdepth
 function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 //x0 - coeffs
-        mov              x1,  x0
+        mov             x1,  x0
         ld1             {v16.8h-v19.8h}, [x1], #64
         ld1             {v20.8h-v23.8h}, [x1]
 
-        movrel           x1, trans
+        movrel          x1, trans
         ld1             {v0.8h}, [x1]
 
         tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@@ -342,7 +342,7 @@  function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
 
         transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23
 
-        mov              x1,  x0
+        mov             x1,  x0
         st1             {v16.8h-v19.8h}, [x1], #64
         st1             {v20.8h-v23.8h}, [x1]
 
@@ -351,8 +351,8 @@  endfunc
 .endm
 
 .macro butterfly e, o, tmp_p, tmp_m
-        add        \tmp_p, \e, \o
-        sub        \tmp_m, \e, \o
+        add             \tmp_p, \e, \o
+        sub             \tmp_m, \e, \o
 .endm
 
 .macro tr16_8x4 in0, in1, in2, in3, offset
@@ -381,7 +381,7 @@  endfunc
         butterfly       v25.4s, v29.4s, v17.4s, v22.4s
         butterfly       v26.4s, v30.4s, v18.4s, v21.4s
         butterfly       v27.4s, v31.4s, v19.4s, v20.4s
-        add              x4,  sp,  #\offset
+        add             x4,  sp,  #\offset
         st1             {v16.4s-v19.4s}, [x4], #64
         st1             {v20.4s-v23.4s}, [x4]
 .endm
@@ -398,14 +398,14 @@  endfunc
 .endm
 
 .macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
-        sum_sub v21.4s, \in, \t0, \op0, \p
-        sum_sub v22.4s, \in, \t1, \op1, \p
-        sum_sub v23.4s, \in, \t2, \op2, \p
-        sum_sub v24.4s, \in, \t3, \op3, \p
-        sum_sub v25.4s, \in, \t4, \op4, \p
-        sum_sub v26.4s, \in, \t5, \op5, \p
-        sum_sub v27.4s, \in, \t6, \op6, \p
-        sum_sub v28.4s, \in, \t7, \op7, \p
+        sum_sub         v21.4s, \in, \t0, \op0, \p
+        sum_sub         v22.4s, \in, \t1, \op1, \p
+        sum_sub         v23.4s, \in, \t2, \op2, \p
+        sum_sub         v24.4s, \in, \t3, \op3, \p
+        sum_sub         v25.4s, \in, \t4, \op4, \p
+        sum_sub         v26.4s, \in, \t5, \op5, \p
+        sum_sub         v27.4s, \in, \t6, \op6, \p
+        sum_sub         v28.4s, \in, \t7, \op7, \p
 .endm
 
 .macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@@ -473,20 +473,20 @@  endfunc
 
 .macro tr_16x4 name, shift, offset, step
 function func_tr_16x4_\name
-        mov              x1,  x5
-        add              x3,  x5, #(\step * 64)
-        mov              x2,  #(\step * 128)
+        mov             x1,  x5
+        add             x3,  x5, #(\step * 64)
+        mov             x2,  #(\step * 128)
         load16          v16.d, v17.d, v18.d, v19.d
-        movrel           x1,  trans
+        movrel          x1,  trans
         ld1             {v0.8h}, [x1]
 
         tr16_8x4        v16, v17, v18, v19, \offset
 
-        add              x1,  x5, #(\step * 32)
-        add              x3,  x5, #(\step * 3 *32)
-        mov              x2,  #(\step * 128)
+        add             x1,  x5, #(\step * 32)
+        add             x3,  x5, #(\step * 3 *32)
+        mov             x2,  #(\step * 128)
         load16          v20.d, v17.d, v18.d, v19.d
-        movrel           x1, trans, 16
+        movrel          x1, trans, 16
         ld1             {v1.8h}, [x1]
         smull           v21.4s, v20.4h, v1.h[0]
         smull           v22.4s, v20.4h, v1.h[1]
@@ -505,16 +505,16 @@  function func_tr_16x4_\name
         add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
         add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2
 
-        add              x4, sp, #\offset
+        add             x4, sp, #\offset
         ld1             {v16.4s-v19.4s}, [x4], #64
 
         butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
         scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
         transpose16_4x4_2 v29, v30, v31, v24
-        mov              x1,  x6
-        add              x3,  x6, #(24 +3*32)
-        mov              x2, #32
-        mov              x4, #-32
+        mov             x1,  x6
+        add             x3,  x6, #(24 +3*32)
+        mov             x2, #32
+        mov             x4, #-32
         store16         v29.d, v30.d, v31.d, v24.d, x4
 
         add             x4, sp, #(\offset + 64)
@@ -523,10 +523,10 @@  function func_tr_16x4_\name
         scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
         transpose16_4x4_2 v29, v30, v31, v20
 
-        add              x1,  x6, #8
-        add              x3,  x6, #(16 + 3 * 32)
-        mov              x2, #32
-        mov              x4, #-32
+        add             x1,  x6, #8
+        add             x3,  x6, #(16 + 3 * 32)
+        mov             x2, #32
+        mov             x4, #-32
         store16         v29.d, v30.d, v31.d, v20.d, x4
 
         ret
@@ -539,21 +539,21 @@  function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
         mov             x15, x30
 
         // allocate a temp buffer
-        sub              sp,  sp,  #640
+        sub             sp,  sp,  #640
 
 .irp i, 0, 1, 2, 3
-        add              x5,  x0, #(8 * \i)
-        add              x6,  sp, #(8 * \i * 16)
+        add             x5,  x0, #(8 * \i)
+        add             x6,  sp, #(8 * \i * 16)
         bl              func_tr_16x4_firstpass
 .endr
 
 .irp i, 0, 1, 2, 3
-        add              x5,  sp, #(8 * \i)
-        add              x6,  x0, #(8 * \i * 16)
+        add             x5,  sp, #(8 * \i)
+        add             x6,  x0, #(8 * \i * 16)
         bl              func_tr_16x4_secondpass_\bitdepth
 .endr
 
-        add              sp,  sp,  #640
+        add             sp,  sp,  #640
 
         mov             x30, x15
         ret
@@ -573,35 +573,34 @@  idct_16x16 10
 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
 .macro idct_dc size, bitdepth
 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
-        ld1r         {v4.8h}, [x0]
-        srshr         v4.8h,  v4.8h,  #1
-        srshr         v0.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr         v1.8h,  v4.8h,  #(14 - \bitdepth)
+        ld1r            {v4.8h}, [x0]
+        srshr           v4.8h,  v4.8h,  #1
+        srshr           v0.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v1.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 4
-        srshr         v2.8h,  v4.8h,  #(14 - \bitdepth)
-        srshr         v3.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v2.8h,  v4.8h,  #(14 - \bitdepth)
+        srshr           v3.8h,  v4.8h,  #(14 - \bitdepth)
 .if \size > 16 /* dc 32x32 */
-        mov              x2,  #4
-1:
-        subs             x2,  x2, #1
+        mov             x2,  #4
+1:      subs            x2,  x2, #1
 .endif
         add             x12,  x0, #64
         mov             x13,  #128
 .if \size > 8 /* dc 16x16 */
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
 .endif /* dc 8x8 */
-        st1            {v0.8h-v3.8h},  [x0], x13
-        st1            {v0.8h-v3.8h}, [x12], x13
+        st1             {v0.8h-v3.8h},  [x0], x13
+        st1             {v0.8h-v3.8h}, [x12], x13
 .if \size > 16 /* dc 32x32 */
         bne             1b
 .endif
 .else /* dc 4x4 */
-        st1            {v0.8h-v1.8h},  [x0]
+        st1             {v0.8h-v1.8h},  [x0]
 .endif
         ret
 endfunc
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d4decfde3b..2f9b398075 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -81,10 +81,10 @@  function ff_hevc_sao_band_filter_8x8_8_neon, export=1
 endfunc
 
 .Lsao_edge_pos:
-.word 1 // horizontal
-.word SAO_STRIDE // vertical
-.word SAO_STRIDE + 1 // 45 degree
-.word SAO_STRIDE - 1 // 135 degree
+        .word           1 // horizontal
+        .word           SAO_STRIDE // vertical
+        .word           SAO_STRIDE + 1 // 45 degree
+        .word           SAO_STRIDE - 1 // 135 degree
 
 // ff_hevc_sao_edge_filter_16x16_8_neon(char *dst, char *src, ptrdiff stride_dst,
 //                                      int16 *sao_offset_val, int eo, int width, int height)
diff --git a/libavcodec/aarch64/hpeldsp_neon.S b/libavcodec/aarch64/hpeldsp_neon.S
index a491c173bb..b81f70c662 100644
--- a/libavcodec/aarch64/hpeldsp_neon.S
+++ b/libavcodec/aarch64/hpeldsp_neon.S
@@ -23,336 +23,336 @@ 
 #include "libavutil/aarch64/asm.S"
 
 .macro  pixels16        rnd=1, avg=0
-  .if \avg
+.if \avg
         mov             x12, x0
-  .endif
-1:      ld1             {v0.16B},  [x1], x2
-        ld1             {v1.16B},  [x1], x2
-        ld1             {v2.16B},  [x1], x2
-        ld1             {v3.16B},  [x1], x2
-  .if \avg
-        ld1             {v4.16B},  [x12], x2
-        urhadd          v0.16B,  v0.16B,  v4.16B
-        ld1             {v5.16B},  [x12], x2
-        urhadd          v1.16B,  v1.16B,  v5.16B
-        ld1             {v6.16B},  [x12], x2
-        urhadd          v2.16B,  v2.16B,  v6.16B
-        ld1             {v7.16B},  [x12], x2
-        urhadd          v3.16B,  v3.16B,  v7.16B
-  .endif
+.endif
+1:      ld1             {v0.16b},  [x1], x2
+        ld1             {v1.16b},  [x1], x2
+        ld1             {v2.16b},  [x1], x2
+        ld1             {v3.16b},  [x1], x2
+.if \avg
+        ld1             {v4.16b},  [x12], x2
+        urhadd          v0.16b,  v0.16b,  v4.16b
+        ld1             {v5.16b},  [x12], x2
+        urhadd          v1.16b,  v1.16b,  v5.16b
+        ld1             {v6.16b},  [x12], x2
+        urhadd          v2.16b,  v2.16b,  v6.16b
+        ld1             {v7.16b},  [x12], x2
+        urhadd          v3.16b,  v3.16b,  v7.16b
+.endif
         subs            w3,  w3,  #4
-        st1             {v0.16B},  [x0], x2
-        st1             {v1.16B},  [x0], x2
-        st1             {v2.16B},  [x0], x2
-        st1             {v3.16B},  [x0], x2
+        st1             {v0.16b},  [x0], x2
+        st1             {v1.16b},  [x0], x2
+        st1             {v2.16b},  [x0], x2
+        st1             {v3.16b},  [x0], x2
         b.ne            1b
         ret
 .endm
 
 .macro  pixels16_x2     rnd=1, avg=0
-1:      ld1             {v0.16B, v1.16B}, [x1], x2
-        ld1             {v2.16B, v3.16B}, [x1], x2
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        ld1             {v2.16b, v3.16b}, [x1], x2
         subs            w3,  w3,  #2
-        ext             v1.16B,  v0.16B,  v1.16B,  #1
-        avg             v0.16B,  v0.16B,  v1.16B
-        ext             v3.16B,  v2.16B,  v3.16B,  #1
-        avg             v2.16B,  v2.16B,  v3.16B
-  .if \avg
-        ld1             {v1.16B}, [x0], x2
-        ld1             {v3.16B}, [x0]
-        urhadd          v0.16B,  v0.16B,  v1.16B
-        urhadd          v2.16B,  v2.16B,  v3.16B
+        ext             v1.16b,  v0.16b,  v1.16b,  #1
+        avg             v0.16b,  v0.16b,  v1.16b
+        ext             v3.16b,  v2.16b,  v3.16b,  #1
+        avg             v2.16b,  v2.16b,  v3.16b
+.if \avg
+        ld1             {v1.16b}, [x0], x2
+        ld1             {v3.16b}, [x0]
+        urhadd          v0.16b,  v0.16b,  v1.16b
+        urhadd          v2.16b,  v2.16b,  v3.16b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v0.16B}, [x0], x2
-        st1             {v2.16B}, [x0], x2
+.endif
+        st1             {v0.16b}, [x0], x2
+        st1             {v2.16b}, [x0], x2
         b.ne            1b
         ret
 .endm
 
 .macro  pixels16_y2     rnd=1, avg=0
         sub             w3,  w3,  #2
-        ld1             {v0.16B}, [x1], x2
-        ld1             {v1.16B}, [x1], x2
+        ld1             {v0.16b}, [x1], x2
+        ld1             {v1.16b}, [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v2.16B,  v0.16B,  v1.16B
-        ld1             {v0.16B}, [x1], x2
-        avg             v3.16B,  v0.16B,  v1.16B
-        ld1             {v1.16B}, [x1], x2
-  .if \avg
-        ld1             {v4.16B}, [x0], x2
-        ld1             {v5.16B}, [x0]
-        urhadd          v2.16B,  v2.16B,  v4.16B
-        urhadd          v3.16B,  v3.16B,  v5.16B
+        avg             v2.16b,  v0.16b,  v1.16b
+        ld1             {v0.16b}, [x1], x2
+        avg             v3.16b,  v0.16b,  v1.16b
+        ld1             {v1.16b}, [x1], x2
+.if \avg
+        ld1             {v4.16b}, [x0], x2
+        ld1             {v5.16b}, [x0]
+        urhadd          v2.16b,  v2.16b,  v4.16b
+        urhadd          v3.16b,  v3.16b,  v5.16b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v2.16B}, [x0], x2
-        st1             {v3.16B}, [x0], x2
+.endif
+        st1             {v2.16b}, [x0], x2
+        st1             {v3.16b}, [x0], x2
         b.ne            1b
 
-        avg             v2.16B,  v0.16B,  v1.16B
-        ld1             {v0.16B}, [x1], x2
-        avg             v3.16B,  v0.16B,  v1.16B
-  .if \avg
-        ld1             {v4.16B}, [x0], x2
-        ld1             {v5.16B}, [x0]
-        urhadd          v2.16B,  v2.16B,  v4.16B
-        urhadd          v3.16B,  v3.16B,  v5.16B
+        avg             v2.16b,  v0.16b,  v1.16b
+        ld1             {v0.16b}, [x1], x2
+        avg             v3.16b,  v0.16b,  v1.16b
+.if \avg
+        ld1             {v4.16b}, [x0], x2
+        ld1             {v5.16b}, [x0]
+        urhadd          v2.16b,  v2.16b,  v4.16b
+        urhadd          v3.16b,  v3.16b,  v5.16b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v2.16B},     [x0], x2
-        st1             {v3.16B},     [x0], x2
+.endif
+        st1             {v2.16b},     [x0], x2
+        st1             {v3.16b},     [x0], x2
 
         ret
 .endm
 
 .macro  pixels16_xy2    rnd=1, avg=0
         sub             w3,  w3,  #2
-        ld1             {v0.16B, v1.16B}, [x1], x2
-        ld1             {v4.16B, v5.16B}, [x1], x2
-NRND    movi            v26.8H, #1
-        ext             v1.16B,  v0.16B,  v1.16B,  #1
-        ext             v5.16B,  v4.16B,  v5.16B,  #1
-        uaddl           v16.8H,  v0.8B,   v1.8B
-        uaddl2          v20.8H,  v0.16B,  v1.16B
-        uaddl           v18.8H,  v4.8B,   v5.8B
-        uaddl2          v22.8H,  v4.16B,  v5.16B
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        ld1             {v4.16b, v5.16b}, [x1], x2
+NRND    movi            v26.8h, #1
+        ext             v1.16b,  v0.16b,  v1.16b,  #1
+        ext             v5.16b,  v4.16b,  v5.16b,  #1
+        uaddl           v16.8h,  v0.8b,   v1.8b
+        uaddl2          v20.8h,  v0.16b,  v1.16b
+        uaddl           v18.8h,  v4.8b,   v5.8b
+        uaddl2          v22.8h,  v4.16b,  v5.16b
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16B, v1.16B}, [x1], x2
-        add             v24.8H,  v16.8H,  v18.8H
-NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16B, v0.16B,  v1.16B,  #1
-        add             v1.8H,   v20.8H,  v22.8H
-        mshrn           v28.8B,  v24.8H,  #2
-NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16B, v1.8H,   #2
-  .if \avg
-        ld1             {v16.16B},        [x0]
-        urhadd          v28.16B, v28.16B, v16.16B
-  .endif
-        uaddl           v16.8H,  v0.8B,   v30.8B
-        ld1             {v2.16B, v3.16B}, [x1], x2
-        uaddl2          v20.8H,  v0.16B,  v30.16B
-        st1             {v28.16B},        [x0], x2
-        add             v24.8H,  v16.8H,  v18.8H
-NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v3.16B,  v2.16B,  v3.16B,  #1
-        add             v0.8H,   v20.8H,  v22.8H
-        mshrn           v30.8B,  v24.8H,  #2
-NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16B, v0.8H,   #2
-  .if \avg
-        ld1             {v18.16B},        [x0]
-        urhadd          v30.16B, v30.16B, v18.16B
-  .endif
-        uaddl           v18.8H,   v2.8B,  v3.8B
-        uaddl2          v22.8H,   v2.16B, v3.16B
-        st1             {v30.16B},        [x0], x2
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        add             v24.8h,  v16.8h,  v18.8h
+NRND    add             v24.8h,  v24.8h,  v26.8h
+        ext             v30.16b, v0.16b,  v1.16b,  #1
+        add             v1.8h,   v20.8h,  v22.8h
+        mshrn           v28.8b,  v24.8h,  #2
+NRND    add             v1.8h,   v1.8h,   v26.8h
+        mshrn2          v28.16b, v1.8h,   #2
+.if \avg
+        ld1             {v16.16b},        [x0]
+        urhadd          v28.16b, v28.16b, v16.16b
+.endif
+        uaddl           v16.8h,  v0.8b,   v30.8b
+        ld1             {v2.16b, v3.16b}, [x1], x2
+        uaddl2          v20.8h,  v0.16b,  v30.16b
+        st1             {v28.16b},        [x0], x2
+        add             v24.8h,  v16.8h,  v18.8h
+NRND    add             v24.8h,  v24.8h,  v26.8h
+        ext             v3.16b,  v2.16b,  v3.16b,  #1
+        add             v0.8h,   v20.8h,  v22.8h
+        mshrn           v30.8b,  v24.8h,  #2
+NRND    add             v0.8h,   v0.8h,   v26.8h
+        mshrn2          v30.16b, v0.8h,   #2
+.if \avg
+        ld1             {v18.16b},        [x0]
+        urhadd          v30.16b, v30.16b, v18.16b
+.endif
+        uaddl           v18.8h,   v2.8b,  v3.8b
+        uaddl2          v22.8h,   v2.16b, v3.16b
+        st1             {v30.16b},        [x0], x2
         b.gt            1b
 
-        ld1             {v0.16B, v1.16B}, [x1], x2
-        add             v24.8H,  v16.8H,  v18.8H
-NRND    add             v24.8H,  v24.8H,  v26.8H
-        ext             v30.16B, v0.16B,  v1.16B,  #1
-        add             v1.8H,   v20.8H,  v22.8H
-        mshrn           v28.8B,  v24.8H,  #2
-NRND    add             v1.8H,   v1.8H,   v26.8H
-        mshrn2          v28.16B, v1.8H,   #2
-  .if \avg
-        ld1             {v16.16B},        [x0]
-        urhadd          v28.16B, v28.16B, v16.16B
-  .endif
-        uaddl           v16.8H,  v0.8B,   v30.8B
-        uaddl2          v20.8H,  v0.16B,  v30.16B
-        st1             {v28.16B},        [x0], x2
-        add             v24.8H,  v16.8H,  v18.8H
-NRND    add             v24.8H,  v24.8H,  v26.8H
-        add             v0.8H,   v20.8H,  v22.8H
-        mshrn           v30.8B,  v24.8H,  #2
-NRND    add             v0.8H,   v0.8H,   v26.8H
-        mshrn2          v30.16B, v0.8H,   #2
-  .if \avg
-        ld1             {v18.16B},        [x0]
-        urhadd          v30.16B, v30.16B, v18.16B
-  .endif
-        st1             {v30.16B},        [x0], x2
+        ld1             {v0.16b, v1.16b}, [x1], x2
+        add             v24.8h,  v16.8h,  v18.8h
+NRND    add             v24.8h,  v24.8h,  v26.8h
+        ext             v30.16b, v0.16b,  v1.16b,  #1
+        add             v1.8h,   v20.8h,  v22.8h
+        mshrn           v28.8b,  v24.8h,  #2
+NRND    add             v1.8h,   v1.8h,   v26.8h
+        mshrn2          v28.16b, v1.8h,   #2
+.if \avg
+        ld1             {v16.16b},        [x0]
+        urhadd          v28.16b, v28.16b, v16.16b
+.endif
+        uaddl           v16.8h,  v0.8b,   v30.8b
+        uaddl2          v20.8h,  v0.16b,  v30.16b
+        st1             {v28.16b},        [x0], x2
+        add             v24.8h,  v16.8h,  v18.8h
+NRND    add             v24.8h,  v24.8h,  v26.8h
+        add             v0.8h,   v20.8h,  v22.8h
+        mshrn           v30.8b,  v24.8h,  #2
+NRND    add             v0.8h,   v0.8h,   v26.8h
+        mshrn2          v30.16b, v0.8h,   #2
+.if \avg
+        ld1             {v18.16b},        [x0]
+        urhadd          v30.16b, v30.16b, v18.16b
+.endif
+        st1             {v30.16b},        [x0], x2
 
         ret
 .endm
 
 .macro  pixels8         rnd=1, avg=0
-1:      ld1             {v0.8B}, [x1], x2
-        ld1             {v1.8B}, [x1], x2
-        ld1             {v2.8B}, [x1], x2
-        ld1             {v3.8B}, [x1], x2
-  .if \avg
-        ld1             {v4.8B}, [x0], x2
-        urhadd          v0.8B,  v0.8B,  v4.8B
-        ld1             {v5.8B}, [x0], x2
-        urhadd          v1.8B,  v1.8B,  v5.8B
-        ld1             {v6.8B}, [x0], x2
-        urhadd          v2.8B,  v2.8B,  v6.8B
-        ld1             {v7.8B}, [x0], x2
-        urhadd          v3.8B,  v3.8B,  v7.8B
+1:      ld1             {v0.8b}, [x1], x2
+        ld1             {v1.8b}, [x1], x2
+        ld1             {v2.8b}, [x1], x2
+        ld1             {v3.8b}, [x1], x2
+.if \avg
+        ld1             {v4.8b}, [x0], x2
+        urhadd          v0.8b,  v0.8b,  v4.8b
+        ld1             {v5.8b}, [x0], x2
+        urhadd          v1.8b,  v1.8b,  v5.8b
+        ld1             {v6.8b}, [x0], x2
+        urhadd          v2.8b,  v2.8b,  v6.8b
+        ld1             {v7.8b}, [x0], x2
+        urhadd          v3.8b,  v3.8b,  v7.8b
         sub             x0,  x0,  x2,  lsl #2
-  .endif
+.endif
         subs            w3,  w3,  #4
-        st1             {v0.8B}, [x0], x2
-        st1             {v1.8B}, [x0], x2
-        st1             {v2.8B}, [x0], x2
-        st1             {v3.8B}, [x0], x2
+        st1             {v0.8b}, [x0], x2
+        st1             {v1.8b}, [x0], x2
+        st1             {v2.8b}, [x0], x2
+        st1             {v3.8b}, [x0], x2
         b.ne            1b
         ret
 .endm
 
 .macro  pixels8_x2      rnd=1, avg=0
-1:      ld1             {v0.8B, v1.8B}, [x1], x2
-        ext             v1.8B,  v0.8B,  v1.8B,  #1
-        ld1             {v2.8B, v3.8B}, [x1], x2
-        ext             v3.8B,  v2.8B,  v3.8B,  #1
+1:      ld1             {v0.8b, v1.8b}, [x1], x2
+        ext             v1.8b,  v0.8b,  v1.8b,  #1
+        ld1             {v2.8b, v3.8b}, [x1], x2
+        ext             v3.8b,  v2.8b,  v3.8b,  #1
         subs            w3,  w3,  #2
-        avg             v0.8B,   v0.8B,   v1.8B
-        avg             v2.8B,   v2.8B,   v3.8B
-  .if \avg
-        ld1             {v4.8B},     [x0], x2
-        ld1             {v5.8B},     [x0]
-        urhadd          v0.8B,   v0.8B,   v4.8B
-        urhadd          v2.8B,   v2.8B,   v5.8B
+        avg             v0.8b,   v0.8b,   v1.8b
+        avg             v2.8b,   v2.8b,   v3.8b
+.if \avg
+        ld1             {v4.8b},     [x0], x2
+        ld1             {v5.8b},     [x0]
+        urhadd          v0.8b,   v0.8b,   v4.8b
+        urhadd          v2.8b,   v2.8b,   v5.8b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v0.8B}, [x0], x2
-        st1             {v2.8B}, [x0], x2
+.endif
+        st1             {v0.8b}, [x0], x2
+        st1             {v2.8b}, [x0], x2
         b.ne            1b
         ret
 .endm
 
 .macro  pixels8_y2      rnd=1, avg=0
         sub             w3,  w3,  #2
-        ld1             {v0.8B},  [x1], x2
-        ld1             {v1.8B},  [x1], x2
+        ld1             {v0.8b},  [x1], x2
+        ld1             {v1.8b},  [x1], x2
 1:      subs            w3,  w3,  #2
-        avg             v4.8B,  v0.8B,  v1.8B
-        ld1             {v0.8B},  [x1], x2
-        avg             v5.8B,  v0.8B,  v1.8B
-        ld1             {v1.8B},  [x1], x2
-  .if \avg
-        ld1             {v2.8B},     [x0], x2
-        ld1             {v3.8B},     [x0]
-        urhadd          v4.8B,  v4.8B,  v2.8B
-        urhadd          v5.8B,  v5.8B,  v3.8B
+        avg             v4.8b,  v0.8b,  v1.8b
+        ld1             {v0.8b},  [x1], x2
+        avg             v5.8b,  v0.8b,  v1.8b
+        ld1             {v1.8b},  [x1], x2
+.if \avg
+        ld1             {v2.8b},     [x0], x2
+        ld1             {v3.8b},     [x0]
+        urhadd          v4.8b,  v4.8b,  v2.8b
+        urhadd          v5.8b,  v5.8b,  v3.8b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v4.8B},     [x0], x2
-        st1             {v5.8B},     [x0], x2
+.endif
+        st1             {v4.8b},     [x0], x2
+        st1             {v5.8b},     [x0], x2
         b.ne            1b
 
-        avg             v4.8B,  v0.8B,  v1.8B
-        ld1             {v0.8B},  [x1], x2
-        avg             v5.8B,  v0.8B,  v1.8B
-  .if \avg
-        ld1             {v2.8B},     [x0], x2
-        ld1             {v3.8B},     [x0]
-        urhadd          v4.8B,  v4.8B,  v2.8B
-        urhadd          v5.8B,  v5.8B,  v3.8B
+        avg             v4.8b,  v0.8b,  v1.8b
+        ld1             {v0.8b},  [x1], x2
+        avg             v5.8b,  v0.8b,  v1.8b
+.if \avg
+        ld1             {v2.8b},     [x0], x2
+        ld1             {v3.8b},     [x0]
+        urhadd          v4.8b,  v4.8b,  v2.8b
+        urhadd          v5.8b,  v5.8b,  v3.8b
         sub             x0,  x0,  x2
-  .endif
-        st1             {v4.8B},     [x0], x2
-        st1             {v5.8B},     [x0], x2
+.endif
+        st1             {v4.8b},     [x0], x2
+        st1             {v5.8b},     [x0], x2
 
         ret
 .endm
 
 .macro  pixels8_xy2     rnd=1, avg=0
         sub             w3,  w3,  #2
-        ld1             {v0.16B},     [x1], x2
-        ld1             {v1.16B},     [x1], x2
-NRND    movi            v19.8H, #1
-        ext             v4.16B,  v0.16B,  v4.16B,  #1
-        ext             v6.16B,  v1.16B,  v6.16B,  #1
-        uaddl           v16.8H,  v0.8B,  v4.8B
-        uaddl           v17.8H,  v1.8B,  v6.8B
+        ld1             {v0.16b},     [x1], x2
+        ld1             {v1.16b},     [x1], x2
+NRND    movi            v19.8h, #1
+        ext             v4.16b,  v0.16b,  v4.16b,  #1
+        ext             v6.16b,  v1.16b,  v6.16b,  #1
+        uaddl           v16.8h,  v0.8b,  v4.8b
+        uaddl           v17.8h,  v1.8b,  v6.8b
 1:      subs            w3,  w3,  #2
-        ld1             {v0.16B},     [x1], x2
-        add             v18.8H, v16.8H,  v17.8H
-        ext             v4.16B,  v0.16B,  v4.16B,  #1
-NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8H,  v0.8B,  v4.8B
-        mshrn           v5.8B,  v18.8H, #2
-        ld1             {v1.16B},     [x1], x2
-        add             v18.8H, v16.8H,  v17.8H
-  .if \avg
-        ld1             {v7.8B},     [x0]
-        urhadd          v5.8B,  v5.8B,  v7.8B
-  .endif
-NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8B},     [x0], x2
-        mshrn           v7.8B,  v18.8H, #2
-  .if \avg
-        ld1             {v5.8B},     [x0]
-        urhadd          v7.8B,  v7.8B,  v5.8B
-  .endif
-        ext             v6.16B,  v1.16B,  v6.16B,  #1
-        uaddl           v17.8H,  v1.8B,   v6.8B
-        st1             {v7.8B},     [x0], x2
+        ld1             {v0.16b},     [x1], x2
+        add             v18.8h, v16.8h,  v17.8h
+        ext             v4.16b,  v0.16b,  v4.16b,  #1
+NRND    add             v18.8h, v18.8h, v19.8h
+        uaddl           v16.8h,  v0.8b,  v4.8b
+        mshrn           v5.8b,  v18.8h, #2
+        ld1             {v1.16b},     [x1], x2
+        add             v18.8h, v16.8h,  v17.8h
+.if \avg
+        ld1             {v7.8b},     [x0]
+        urhadd          v5.8b,  v5.8b,  v7.8b
+.endif
+NRND    add             v18.8h, v18.8h, v19.8h
+        st1             {v5.8b},     [x0], x2
+        mshrn           v7.8b,  v18.8h, #2
+.if \avg
+        ld1             {v5.8b},     [x0]
+        urhadd          v7.8b,  v7.8b,  v5.8b
+.endif
+        ext             v6.16b,  v1.16b,  v6.16b,  #1
+        uaddl           v17.8h,  v1.8b,   v6.8b
+        st1             {v7.8b},     [x0], x2
         b.gt            1b
 
-        ld1             {v0.16B},     [x1], x2
-        add             v18.8H, v16.8H, v17.8H
-        ext             v4.16B, v0.16B, v4.16B,  #1
-NRND    add             v18.8H, v18.8H, v19.8H
-        uaddl           v16.8H,  v0.8B, v4.8B
-        mshrn           v5.8B,  v18.8H, #2
-        add             v18.8H, v16.8H, v17.8H
-  .if \avg
-        ld1             {v7.8B},     [x0]
-        urhadd          v5.8B,  v5.8B,  v7.8B
-  .endif
-NRND    add             v18.8H, v18.8H, v19.8H
-        st1             {v5.8B},     [x0], x2
-        mshrn           v7.8B,  v18.8H, #2
-  .if \avg
-        ld1             {v5.8B},     [x0]
-        urhadd          v7.8B,  v7.8B,  v5.8B
-  .endif
-        st1             {v7.8B},     [x0], x2
+        ld1             {v0.16b},     [x1], x2
+        add             v18.8h, v16.8h, v17.8h
+        ext             v4.16b, v0.16b, v4.16b,  #1
+NRND    add             v18.8h, v18.8h, v19.8h
+        uaddl           v16.8h,  v0.8b, v4.8b
+        mshrn           v5.8b,  v18.8h, #2
+        add             v18.8h, v16.8h, v17.8h
+.if \avg
+        ld1             {v7.8b},     [x0]
+        urhadd          v5.8b,  v5.8b,  v7.8b
+.endif
+NRND    add             v18.8h, v18.8h, v19.8h
+        st1             {v5.8b},     [x0], x2
+        mshrn           v7.8b,  v18.8h, #2
+.if \avg
+        ld1             {v5.8b},     [x0]
+        urhadd          v7.8b,  v7.8b,  v5.8b
+.endif
+        st1             {v7.8b},     [x0], x2
 
         ret
 .endm
 
 .macro  pixfunc         pfx, name, suf, rnd=1, avg=0
-  .if \rnd
-    .macro avg  rd, rn, rm
+.if \rnd
+.macro avg  rd, rn, rm
         urhadd          \rd, \rn, \rm
-    .endm
-    .macro mshrn rd, rn, rm
+.endm
+.macro mshrn rd, rn, rm
         rshrn           \rd, \rn, \rm
-    .endm
-    .macro mshrn2 rd, rn, rm
+.endm
+.macro mshrn2 rd, rn, rm
         rshrn2          \rd, \rn, \rm
-    .endm
-    .macro NRND insn:vararg
-    .endm
-  .else
-    .macro avg  rd, rn, rm
+.endm
+.macro NRND insn:vararg
+.endm
+.else
+.macro avg  rd, rn, rm
         uhadd           \rd, \rn, \rm
-    .endm
-    .macro mshrn rd, rn, rm
+.endm
+.macro mshrn rd, rn, rm
         shrn            \rd, \rn, \rm
-    .endm
-    .macro mshrn2 rd, rn, rm
+.endm
+.macro mshrn2 rd, rn, rm
         shrn2           \rd, \rn, \rm
-    .endm
-    .macro NRND insn:vararg
+.endm
+.macro NRND insn:vararg
         \insn
-    .endm
-  .endif
+.endm
+.endif
 function ff_\pfx\name\suf\()_neon, export=1
         \name           \rnd, \avg
 endfunc
-        .purgem         avg
-        .purgem         mshrn
-        .purgem         mshrn2
-        .purgem         NRND
+.purgem         avg
+.purgem         mshrn
+.purgem         mshrn2
+.purgem         NRND
 .endm
 
 .macro  pixfunc2        pfx, name, avg=0
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index 6091e72022..f968407be9 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -43,8 +43,7 @@  function ff_imdct_half_neon, export=1
         ld2             {v2.2s,v3.2s},   [x4], #16 // d2=c0,c1 d3=s0,s2
         fmul            v6.2s,  v17.2s, v2.2s
         fmul            v7.2s,  v0.2s,  v2.2s
-1:
-        subs            x14, x14, #2
+1:      subs            x14, x14, #2
         ldr             w6,  [x3], #4
         fmul            v4.2s,  v0.2s,  v3.2s
         fmul            v5.2s,  v17.2s, v3.2s
@@ -64,8 +63,7 @@  function ff_imdct_half_neon, export=1
         st2             {v4.s,v5.s}[0], [x6]
         st2             {v4.s,v5.s}[1], [x8]
         b               1b
-2:
-        st2             {v4.s,v5.s}[0], [x6]
+2:      st2             {v4.s,v5.s}[0], [x6]
         st2             {v4.s,v5.s}[1], [x8]
 
         mov             x19, x0
@@ -90,8 +88,7 @@  function ff_imdct_half_neon, export=1
         ld2             {v0.2s,v1.2s},  [x3], x7 // d0 =i1,r1 d1 =i0,r0
         ld2             {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
         ld2             {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
-3:
-        subs            x14, x14, #2
+3:      subs            x14, x14, #2
         fmul            v7.2s,  v0.2s,  v17.2s
         ld2             {v18.2s,v19.2s},[x4], #16    // d17=c2,c3 d19=s2,s3
         fmul            v4.2s,  v1.2s,  v17.2s
@@ -114,8 +111,7 @@  function ff_imdct_half_neon, export=1
         st2             {v4.2s,v5.2s},  [x0], x7
         st2             {v6.2s,v7.2s},  [x8], #16
         b               3b
-4:
-        rev64           v5.2s,  v5.2s
+4:      rev64           v5.2s,  v5.2s
         rev64           v7.2s,  v7.2s
         st2             {v4.2s,v5.2s},  [x0]
         st2             {v6.2s,v7.2s},  [x8]
@@ -147,8 +143,7 @@  function ff_imdct_calc_neon, export=1
         sub             x2,  x1,  #16
         mov             x3,  #-16
         mov             x6,  #-8
-1:
-        ld1             {v0.4s}, [x2], x3
+1:      ld1             {v0.4s}, [x2], x3
         prfum           pldl1keep, [x0, #-16]
         rev64           v0.4s, v0.4s
         ld1             {v2.2s,v3.2s}, [x1], #16
@@ -207,8 +202,7 @@  function ff_mdct_calc_neon, export=1
         fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
         fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
         fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
-1:
-        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
+1:      fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
         ldr             w10, [x3, x13]
         fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
         ldr             w6,  [x3, #4]!
@@ -254,8 +248,7 @@  function ff_mdct_calc_neon, export=1
         st2             {v24.s,v25.s}[0], [x10]
         st2             {v24.s,v25.s}[1], [x6]
         b               1b
-1:
-        fneg            v7.2s,  v7.2s           //  R*s-I*c
+1:      fneg            v7.2s,  v7.2s           //  R*s-I*c
         ubfm            x12, x6,  #16, #31
         ubfm            x6,  x6,  #0,  #15
         add             x12, x1,  x12, lsl #3
@@ -291,8 +284,7 @@  function ff_mdct_calc_neon, export=1
         ld2             {v0.2s,v1.2s},   [x3], x7   // d0 =r1,i1 d1 =r0,i0
         ld2             {v20.2s,v21.2s}, [x6], #16  // d20=r2,i2 d21=r3,i3
         ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
-1:
-        subs            x14, x14, #2
+1:      subs            x14, x14, #2
         fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
         ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
         fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
@@ -317,8 +309,7 @@  function ff_mdct_calc_neon, export=1
         st2             {v4.2s,v5.2s},  [x0], x7
         st2             {v6.2s,v7.2s},  [x8], #16
         b               1b
-1:
-        rev64           v5.2s,  v5.2s
+1:      rev64           v5.2s,  v5.2s
         rev64           v7.2s,  v7.2s
         st2             {v4.2s,v5.2s},  [x0]
         st2             {v6.2s,v7.2s},  [x8]
diff --git a/libavcodec/aarch64/mpegaudiodsp_neon.S b/libavcodec/aarch64/mpegaudiodsp_neon.S
index b6ef131228..5df45b037c 100644
--- a/libavcodec/aarch64/mpegaudiodsp_neon.S
+++ b/libavcodec/aarch64/mpegaudiodsp_neon.S
@@ -24,11 +24,11 @@ 
 #define WFRAC_BITS  16   // fractional bits for window
 #define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
 
-const   tbl_rev128_s, align=4
+const tbl_rev128_s, align=4
         .byte           12, 13, 14, 15
-        .byte            8,  9, 10, 11
-        .byte            4,  5,  6,  7
-        .byte            0,  1,  2,  3
+        .byte           8,  9, 10, 11
+        .byte           4,  5,  6,  7
+        .byte           0,  1,  2,  3
 endconst
 
 .macro   apply_window   type, st
@@ -66,8 +66,7 @@  function ff_mpadsp_apply_window_\type\()_neon, export=1
         movi            v28.4s, #0
 .endif
         mov             x14, #4
-1:
-        mov             x8,  x0
+1:      mov             x8,  x0
         sub             x7,  x1,  #3<<2
         sub             x6,  x1,  x14, lsl #4
         add             x7,  x7,  x14, lsl #4
@@ -77,8 +76,7 @@  function ff_mpadsp_apply_window_\type\()_neon, export=1
         movi            v17.2d, #0
         movi            v18.2d, #0
         movi            v19.2d, #0
-2:
-        subs            x15, x15, #1
+2:      subs            x15, x15, #1
         ld1             {v0.4s},  [x8],  x9
         ld1             {v1.4s},  [x10], x9
         ld1             {v2.4s},  [x6],  x9
@@ -106,8 +104,7 @@  function ff_mpadsp_apply_window_\type\()_neon, export=1
 
         b.eq            4f
         round_sample    v19, 1, 1
-4:
-        round_sample    v16, 1, 0
+4:      round_sample    v16, 1, 0
         shrn            v16.2s, v16.2d,  #OUT_SHIFT
         round_sample    v19, 0, 0
         shrn            v19.2s, v19.2d,  #OUT_SHIFT
@@ -126,8 +123,7 @@  function ff_mpadsp_apply_window_\type\()_neon, export=1
         st1             {v16.\st\()}[0], [x3], x4
         b.eq            4f
         st1             {v18.\st\()}[1], [x5], x13
-4:
-        st1             {v16.\st\()}[1], [x3], x4
+4:      st1             {v16.\st\()}[1], [x3], x4
         st1             {v18.\st\()}[0], [x5], x13
         st1             {v16.\st\()}[2], [x3], x4
         st1             {v18.\st\()}[3], [x5], x13
@@ -187,11 +183,11 @@  endfunc
         and             v28.16b,  \r\().16b,  v31.16b
 .endif
 .if \idx != \next
-  .if \next == 0
+.if \next == 0
         ext             v28.16b, v28.16b, v29.16b, #8
-  .else
+.else
         ext             v28.16b, v29.16b, v28.16b, #8
-  .endif
+.endif
 .endif
 .endm
 .macro  MLA             d, s1, s2
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 1ad32c359d..2eb29848bf 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -17,146 +17,146 @@ 
  */
 
 .macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
-        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
-        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
-        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
-        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
-        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
-        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
-        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B
-
-        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
-        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
-        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
-        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
-        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
-        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
-        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
-        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H
-
-        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
-        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S
-
-        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
-        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S
-
-        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
-        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S
-
-        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
-        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
+        trn1            \r8\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \r9\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \r1\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \r3\().8b,  \r2\().8b,  \r3\().8b
+        trn1            \r0\().8b,  \r4\().8b,  \r5\().8b
+        trn2            \r5\().8b,  \r4\().8b,  \r5\().8b
+        trn1            \r2\().8b,  \r6\().8b,  \r7\().8b
+        trn2            \r7\().8b,  \r6\().8b,  \r7\().8b
+
+        trn1            \r4\().4h,  \r0\().4h,  \r2\().4h
+        trn2            \r2\().4h,  \r0\().4h,  \r2\().4h
+        trn1            \r6\().4h,  \r5\().4h,  \r7\().4h
+        trn2            \r7\().4h,  \r5\().4h,  \r7\().4h
+        trn1            \r5\().4h,  \r9\().4h,  \r3\().4h
+        trn2            \r9\().4h,  \r9\().4h,  \r3\().4h
+        trn1            \r3\().4h,  \r8\().4h,  \r1\().4h
+        trn2            \r8\().4h,  \r8\().4h,  \r1\().4h
+
+        trn1            \r0\().2s,  \r3\().2s,  \r4\().2s
+        trn2            \r4\().2s,  \r3\().2s,  \r4\().2s
+
+        trn1            \r1\().2s,  \r5\().2s,  \r6\().2s
+        trn2            \r5\().2s,  \r5\().2s,  \r6\().2s
+
+        trn2            \r6\().2s,  \r8\().2s,  \r2\().2s
+        trn1            \r2\().2s,  \r8\().2s,  \r2\().2s
+
+        trn1            \r3\().2s,  \r9\().2s,  \r7\().2s
+        trn2            \r7\().2s,  \r9\().2s,  \r7\().2s
 .endm
 
 .macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-        trn1            \t0\().16B, \r0\().16B, \r1\().16B
-        trn2            \t1\().16B, \r0\().16B, \r1\().16B
-        trn1            \r1\().16B, \r2\().16B, \r3\().16B
-        trn2            \r3\().16B, \r2\().16B, \r3\().16B
-        trn1            \r0\().16B, \r4\().16B, \r5\().16B
-        trn2            \r5\().16B, \r4\().16B, \r5\().16B
-        trn1            \r2\().16B, \r6\().16B, \r7\().16B
-        trn2            \r7\().16B, \r6\().16B, \r7\().16B
-
-        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
-        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
-        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
-        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
-        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
-        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
-        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
-        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H
-
-        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
-        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S
-
-        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
-        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S
-
-        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
-        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S
-
-        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
-        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
+        trn1            \t0\().16b, \r0\().16b, \r1\().16b
+        trn2            \t1\().16b, \r0\().16b, \r1\().16b
+        trn1            \r1\().16b, \r2\().16b, \r3\().16b
+        trn2            \r3\().16b, \r2\().16b, \r3\().16b
+        trn1            \r0\().16b, \r4\().16b, \r5\().16b
+        trn2            \r5\().16b, \r4\().16b, \r5\().16b
+        trn1            \r2\().16b, \r6\().16b, \r7\().16b
+        trn2            \r7\().16b, \r6\().16b, \r7\().16b
+
+        trn1            \r4\().8h,  \r0\().8h,  \r2\().8h
+        trn2            \r2\().8h,  \r0\().8h,  \r2\().8h
+        trn1            \r6\().8h,  \r5\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r5\().8h,  \r7\().8h
+        trn1            \r5\().8h,  \t1\().8h,  \r3\().8h
+        trn2            \t1\().8h,  \t1\().8h,  \r3\().8h
+        trn1            \r3\().8h,  \t0\().8h,  \r1\().8h
+        trn2            \t0\().8h,  \t0\().8h,  \r1\().8h
+
+        trn1            \r0\().4s,  \r3\().4s,  \r4\().4s
+        trn2            \r4\().4s,  \r3\().4s,  \r4\().4s
+
+        trn1            \r1\().4s,  \r5\().4s,  \r6\().4s
+        trn2            \r5\().4s,  \r5\().4s,  \r6\().4s
+
+        trn2            \r6\().4s,  \t0\().4s,  \r2\().4s
+        trn1            \r2\().4s,  \t0\().4s,  \r2\().4s
+
+        trn1            \r3\().4s,  \t1\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \t1\().4s,  \r7\().4s
 .endm
 
 .macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
-        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
-        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
-        trn2            \t7\().16B, \r2\().16B,  \r3\().16B
-
-        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
-        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
-        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
-        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
+        trn1            \t4\().16b, \r0\().16b,  \r1\().16b
+        trn2            \t5\().16b, \r0\().16b,  \r1\().16b
+        trn1            \t6\().16b, \r2\().16b,  \r3\().16b
+        trn2            \t7\().16b, \r2\().16b,  \r3\().16b
+
+        trn1            \r0\().8h,  \t4\().8h,  \t6\().8h
+        trn2            \r2\().8h,  \t4\().8h,  \t6\().8h
+        trn1            \r1\().8h,  \t5\().8h,  \t7\().8h
+        trn2            \r3\().8h,  \t5\().8h,  \t7\().8h
 .endm
 
 .macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
-        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
-        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
-        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B
-
-        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
-        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
-        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
-        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
+        trn1            \t4\().8b,  \r0\().8b,  \r1\().8b
+        trn2            \t5\().8b,  \r0\().8b,  \r1\().8b
+        trn1            \t6\().8b,  \r2\().8b,  \r3\().8b
+        trn2            \t7\().8b,  \r2\().8b,  \r3\().8b
+
+        trn1            \r0\().4h,  \t4\().4h,  \t6\().4h
+        trn2            \r2\().4h,  \t4\().4h,  \t6\().4h
+        trn1            \r1\().4h,  \t5\().4h,  \t7\().4h
+        trn2            \r3\().4h,  \t5\().4h,  \t7\().4h
 .endm
 
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
-        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
-        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
-        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
-        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
-
-        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
-        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
-        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
-        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
+        trn1            \r4\().4h,  \r0\().4h,  \r1\().4h
+        trn2            \r5\().4h,  \r0\().4h,  \r1\().4h
+        trn1            \r6\().4h,  \r2\().4h,  \r3\().4h
+        trn2            \r7\().4h,  \r2\().4h,  \r3\().4h
+
+        trn1            \r0\().2s,  \r4\().2s,  \r6\().2s
+        trn2            \r2\().2s,  \r4\().2s,  \r6\().2s
+        trn1            \r1\().2s,  \r5\().2s,  \r7\().2s
+        trn2            \r3\().2s,  \r5\().2s,  \r7\().2s
 .endm
 
 .macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
-        trn1            \t4\().8H,  \r0\().8H,  \r1\().8H
-        trn2            \t5\().8H,  \r0\().8H,  \r1\().8H
-        trn1            \t6\().8H,  \r2\().8H,  \r3\().8H
-        trn2            \t7\().8H,  \r2\().8H,  \r3\().8H
-
-        trn1            \r0\().4S,  \t4\().4S,  \t6\().4S
-        trn2            \r2\().4S,  \t4\().4S,  \t6\().4S
-        trn1            \r1\().4S,  \t5\().4S,  \t7\().4S
-        trn2            \r3\().4S,  \t5\().4S,  \t7\().4S
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
 .endm
 
 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
-        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
-        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
-        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
-        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
-        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
-        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
-        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
-        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H
-
-        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
-        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
-        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
-        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
-        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
-        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
-        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
-        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S
-
-        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
-        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D
-
-        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
-        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D
-
-        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
-        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D
-
-        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
-        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
+        trn1            \r8\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \r9\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
+        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
+        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
+        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+
+        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
+        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
+        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
+        trn1            \r5\().4s,  \r9\().4s,  \r3\().4s
+        trn2            \r9\().4s,  \r9\().4s,  \r3\().4s
+        trn1            \r3\().4s,  \r8\().4s,  \r1\().4s
+        trn2            \r8\().4s,  \r8\().4s,  \r1\().4s
+
+        trn1            \r0\().2d,  \r3\().2d,  \r4\().2d
+        trn2            \r4\().2d,  \r3\().2d,  \r4\().2d
+
+        trn1            \r1\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \r5\().2d,  \r5\().2d,  \r6\().2d
+
+        trn2            \r6\().2d,  \r8\().2d,  \r2\().2d
+        trn1            \r2\().2d,  \r8\().2d,  \r2\().2d
+
+        trn1            \r3\().2d,  \r9\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r9\().2d,  \r7\().2d
 
 .endm
diff --git a/libavcodec/aarch64/opusdsp_neon.S b/libavcodec/aarch64/opusdsp_neon.S
index 46c2be0874..3b2b89d068 100644
--- a/libavcodec/aarch64/opusdsp_neon.S
+++ b/libavcodec/aarch64/opusdsp_neon.S
@@ -20,93 +20,93 @@ 
 
            // 0.85..^1    0.85..^2    0.85..^3    0.85..^4
 const tab_st, align=4
-        .word 0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
+        .word           0x3f599a00, 0x3f38f671, 0x3f1d382a, 0x3f05a32f
 endconst
 const tab_x0, align=4
-        .word 0x0,        0x3f599a00, 0x3f38f671, 0x3f1d382a
+        .word           0x0,        0x3f599a00, 0x3f38f671, 0x3f1d382a
 endconst
 const tab_x1, align=4
-        .word 0x0,        0x0,        0x3f599a00, 0x3f38f671
+        .word           0x0,        0x0,        0x3f599a00, 0x3f38f671
 endconst
 const tab_x2, align=4
-        .word 0x0,        0x0,        0x0,        0x3f599a00
+        .word           0x0,        0x0,        0x0,        0x3f599a00
 endconst
 
 function ff_opus_deemphasis_neon, export=1
-        movrel  x4, tab_st
-        ld1    {v4.4s}, [x4]
-        movrel  x4, tab_x0
-        ld1    {v5.4s}, [x4]
-        movrel  x4, tab_x1
-        ld1    {v6.4s}, [x4]
-        movrel  x4, tab_x2
-        ld1    {v7.4s}, [x4]
+        movrel          x4, tab_st
+        ld1             {v4.4s}, [x4]
+        movrel          x4, tab_x0
+        ld1             {v5.4s}, [x4]
+        movrel          x4, tab_x1
+        ld1             {v6.4s}, [x4]
+        movrel          x4, tab_x2
+        ld1             {v7.4s}, [x4]
 
-        fmul v0.4s, v4.4s, v0.s[0]
+        fmul            v0.4s, v4.4s, v0.s[0]
 
-1:      ld1  {v1.4s, v2.4s}, [x1], #32
+1:      ld1             {v1.4s, v2.4s}, [x1], #32
 
-        fmla v0.4s, v5.4s, v1.s[0]
-        fmul v3.4s, v7.4s, v2.s[2]
+        fmla            v0.4s, v5.4s, v1.s[0]
+        fmul            v3.4s, v7.4s, v2.s[2]
 
-        fmla v0.4s, v6.4s, v1.s[1]
-        fmla v3.4s, v6.4s, v2.s[1]
+        fmla            v0.4s, v6.4s, v1.s[1]
+        fmla            v3.4s, v6.4s, v2.s[1]
 
-        fmla v0.4s, v7.4s, v1.s[2]
-        fmla v3.4s, v5.4s, v2.s[0]
+        fmla            v0.4s, v7.4s, v1.s[2]
+        fmla            v3.4s, v5.4s, v2.s[0]
 
-        fadd v1.4s, v1.4s, v0.4s
-        fadd v2.4s, v2.4s, v3.4s
+        fadd            v1.4s, v1.4s, v0.4s
+        fadd            v2.4s, v2.4s, v3.4s
 
-        fmla v2.4s, v4.4s, v1.s[3]
+        fmla            v2.4s, v4.4s, v1.s[3]
 
-        st1  {v1.4s, v2.4s}, [x0], #32
-        fmul v0.4s, v4.4s, v2.s[3]
+        st1             {v1.4s, v2.4s}, [x0], #32
+        fmul            v0.4s, v4.4s, v2.s[3]
 
-        subs w2, w2, #8
+        subs            w2, w2, #8
         b.gt 1b
 
-        mov s0, v2.s[3]
+        mov             s0, v2.s[3]
 
         ret
 endfunc
 
 function ff_opus_postfilter_neon, export=1
-        ld1 {v0.4s}, [x2]
-        dup v1.4s, v0.s[1]
-        dup v2.4s, v0.s[2]
-        dup v0.4s, v0.s[0]
+        ld1             {v0.4s}, [x2]
+        dup             v1.4s, v0.s[1]
+        dup             v2.4s, v0.s[2]
+        dup             v0.4s, v0.s[0]
 
-        add w1, w1, #2
-        sub x1, x0, x1, lsl #2
+        add             w1, w1, #2
+        sub             x1, x0, x1, lsl #2
 
-        ld1 {v3.4s}, [x1]
-        fmul v3.4s, v3.4s, v2.4s
+        ld1             {v3.4s}, [x1]
+        fmul            v3.4s, v3.4s, v2.4s
 
-1:      add x1, x1, #4
-        ld1 {v4.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v5.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v6.4s}, [x1]
-        add x1, x1, #4
-        ld1 {v7.4s}, [x1]
+1:      add             x1, x1, #4
+        ld1             {v4.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v5.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v6.4s}, [x1]
+        add             x1, x1, #4
+        ld1             {v7.4s}, [x1]
 
-        fmla v3.4s, v7.4s, v2.4s
-        fadd v6.4s, v6.4s, v4.4s
+        fmla            v3.4s, v7.4s, v2.4s
+        fadd            v6.4s, v6.4s, v4.4s
 
-        ld1 {v4.4s}, [x0]
-        fmla v4.4s, v5.4s, v0.4s
+        ld1             {v4.4s}, [x0]
+        fmla            v4.4s, v5.4s, v0.4s
 
-        fmul v6.4s, v6.4s, v1.4s
-        fadd v6.4s, v6.4s, v3.4s
+        fmul            v6.4s, v6.4s, v1.4s
+        fadd            v6.4s, v6.4s, v3.4s
 
-        fadd v4.4s, v4.4s, v6.4s
-        fmul v3.4s, v7.4s, v2.4s
+        fadd            v4.4s, v4.4s, v6.4s
+        fmul            v3.4s, v7.4s, v2.4s
 
-        st1  {v4.4s}, [x0], #16
+        st1             {v4.4s}, [x0], #16
 
-        subs w3, w3, #4
+        subs            w3, w3, #4
         b.gt 1b
 
         ret
diff --git a/libavcodec/aarch64/pixblockdsp_neon.S b/libavcodec/aarch64/pixblockdsp_neon.S
index 0277e0476d..6a463fe53f 100644
--- a/libavcodec/aarch64/pixblockdsp_neon.S
+++ b/libavcodec/aarch64/pixblockdsp_neon.S
@@ -22,8 +22,7 @@ 
 
 function ff_get_pixels_neon, export=1
         mov             w3,  #8
-1:
-        ld1             {v0.8b}, [x1], x2
+1:      ld1             {v0.8b}, [x1], x2
         subs            w3,  w3,  #2
         ld1             {v1.8b}, [x1], x2
         uxtl            v0.8h,   v0.8b
@@ -36,8 +35,7 @@  endfunc
 
 function ff_diff_pixels_neon, export=1
         mov             w4,  #8
-1:
-        ld1             {v0.8b}, [x1], x3
+1:      ld1             {v0.8b}, [x1], x3
         ld1             {v1.8b}, [x2], x3
         subs            w4,  w4,  #2
         ld1             {v2.8b}, [x1], x3
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
index d23717e760..e02db39846 100644
--- a/libavcodec/aarch64/sbrdsp_neon.S
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -19,25 +19,25 @@ 
 #include "libavutil/aarch64/asm.S"
 
 const factors, align=4
-        .float 1.0, -1.0, 1.0, -1.0
+        .float          1.0, -1.0, 1.0, -1.0
 endconst
 
 const phi_noise_0, align=4
-        .float 1.0, 0.0, 1.0, 0.0
+        .float          1.0, 0.0, 1.0, 0.0
 endconst
 
 const phi_noise_1, align=4
-        .float 0.0,  1.0,  0.0, -1.0
-        .float 0.0, -1.0,  0.0,  1.0
+        .float          0.0,  1.0,  0.0, -1.0
+        .float          0.0, -1.0,  0.0,  1.0
 endconst
 
 const phi_noise_2, align=4
-        .float -1.0, 0.0, -1.0, 0.0
+        .float          -1.0, 0.0, -1.0, 0.0
 endconst
 
 const phi_noise_3, align=4
-        .float 0.0, -1.0,  0.0,  1.0
-        .float 0.0,  1.0,  0.0, -1.0
+        .float          0.0, -1.0,  0.0,  1.0
+        .float          0.0,  1.0,  0.0, -1.0
 endconst
 
 function ff_sbr_sum64x5_neon, export=1
@@ -46,49 +46,49 @@  function ff_sbr_sum64x5_neon, export=1
         add             x3, x0, #192*4
         add             x4, x0, #256*4
         mov             x5, #64
-1:      ld1             {v0.4S}, [x0]
-        ld1             {v1.4S}, [x1], #16
-        fadd            v0.4S, v0.4S, v1.4S
-        ld1             {v2.4S}, [x2], #16
-        fadd            v0.4S, v0.4S, v2.4S
-        ld1             {v3.4S}, [x3], #16
-        fadd            v0.4S, v0.4S, v3.4S
-        ld1             {v4.4S}, [x4], #16
-        fadd            v0.4S, v0.4S, v4.4S
-        st1             {v0.4S}, [x0], #16
+1:      ld1             {v0.4s}, [x0]
+        ld1             {v1.4s}, [x1], #16
+        fadd            v0.4s, v0.4s, v1.4s
+        ld1             {v2.4s}, [x2], #16
+        fadd            v0.4s, v0.4s, v2.4s
+        ld1             {v3.4s}, [x3], #16
+        fadd            v0.4s, v0.4s, v3.4s
+        ld1             {v4.4s}, [x4], #16
+        fadd            v0.4s, v0.4s, v4.4s
+        st1             {v0.4s}, [x0], #16
         subs            x5, x5, #4
         b.gt            1b
         ret
 endfunc
 
 function ff_sbr_sum_square_neon, export=1
-        movi            v0.4S, #0
-1:      ld1             {v1.4S}, [x0], #16
-        fmla            v0.4S, v1.4S, v1.4S
+        movi            v0.4s, #0
+1:      ld1             {v1.4s}, [x0], #16
+        fmla            v0.4s, v1.4s, v1.4s
         subs            w1, w1, #2
         b.gt            1b
-        faddp           v0.4S, v0.4S, v0.4S
-        faddp           v0.4S, v0.4S, v0.4S
+        faddp           v0.4s, v0.4s, v0.4s
+        faddp           v0.4s, v0.4s, v0.4s
         ret
 endfunc
 
 function ff_sbr_neg_odd_64_neon, export=1
         mov             x1, x0
-        movi            v5.4S, #1<<7, lsl #24
-        ld2             {v0.4S, v1.4S}, [x0], #32
-        eor             v1.16B, v1.16B, v5.16B
-        ld2             {v2.4S, v3.4S}, [x0], #32
+        movi            v5.4s, #1<<7, lsl #24
+        ld2             {v0.4s, v1.4s}, [x0], #32
+        eor             v1.16b, v1.16b, v5.16b
+        ld2             {v2.4s, v3.4s}, [x0], #32
 .rept 3
-        st2             {v0.4S, v1.4S}, [x1], #32
-        eor             v3.16B, v3.16B, v5.16B
-        ld2             {v0.4S, v1.4S}, [x0], #32
-        st2             {v2.4S, v3.4S}, [x1], #32
-        eor             v1.16B, v1.16B, v5.16B
-        ld2             {v2.4S, v3.4S}, [x0], #32
+        st2             {v0.4s, v1.4s}, [x1], #32
+        eor             v3.16b, v3.16b, v5.16b
+        ld2             {v0.4s, v1.4s}, [x0], #32
+        st2             {v2.4s, v3.4s}, [x1], #32
+        eor             v1.16b, v1.16b, v5.16b
+        ld2             {v2.4s, v3.4s}, [x0], #32
 .endr
-        eor             v3.16B, v3.16B, v5.16B
-        st2             {v0.4S, v1.4S}, [x1], #32
-        st2             {v2.4S, v3.4S}, [x1], #32
+        eor             v3.16b, v3.16b, v5.16b
+        st2             {v0.4s, v1.4s}, [x1], #32
+        st2             {v2.4s, v3.4s}, [x1], #32
         ret
 endfunc
 
@@ -97,26 +97,26 @@  function ff_sbr_qmf_pre_shuffle_neon, export=1
         add             x2, x0, #64*4
         mov             x3, #-16
         mov             x4, #-4
-        movi            v6.4S, #1<<7, lsl #24
-        ld1             {v0.2S}, [x0], #8
-        st1             {v0.2S}, [x2], #8
+        movi            v6.4s, #1<<7, lsl #24
+        ld1             {v0.2s}, [x0], #8
+        st1             {v0.2s}, [x2], #8
 .rept 7
-        ld1             {v1.4S}, [x1], x3
-        ld1             {v2.4S}, [x0], #16
-        eor             v1.16B, v1.16B, v6.16B
-        rev64           v1.4S, v1.4S
-        ext             v1.16B, v1.16B, v1.16B, #8
-        st2             {v1.4S, v2.4S}, [x2], #32
+        ld1             {v1.4s}, [x1], x3
+        ld1             {v2.4s}, [x0], #16
+        eor             v1.16b, v1.16b, v6.16b
+        rev64           v1.4s, v1.4s
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st2             {v1.4s, v2.4s}, [x2], #32
 .endr
         add             x1, x1, #8
-        ld1             {v1.2S}, [x1], x4
-        ld1             {v2.2S}, [x0], #8
-        ld1             {v1.S}[3], [x1]
-        ld1             {v2.S}[2], [x0]
-        eor             v1.16B, v1.16B, v6.16B
-        rev64           v1.4S, v1.4S
-        st2             {v1.2S, v2.2S}, [x2], #16
-        st2             {v1.S, v2.S}[2], [x2]
+        ld1             {v1.2s}, [x1], x4
+        ld1             {v2.2s}, [x0], #8
+        ld1             {v1.s}[3], [x1]
+        ld1             {v2.s}[2], [x0]
+        eor             v1.16b, v1.16b, v6.16b
+        rev64           v1.4s, v1.4s
+        st2             {v1.2s, v2.2s}, [x2], #16
+        st2             {v1.s, v2.s}[2], [x2]
         ret
 endfunc
 
@@ -124,13 +124,13 @@  function ff_sbr_qmf_post_shuffle_neon, export=1
         add             x2, x1, #60*4
         mov             x3, #-16
         mov             x4, #32
-        movi            v6.4S, #1<<7, lsl #24
-1:      ld1             {v0.4S}, [x2], x3
-        ld1             {v1.4S}, [x1], #16
-        eor             v0.16B, v0.16B, v6.16B
-        rev64           v0.4S, v0.4S
-        ext             v0.16B, v0.16B, v0.16B, #8
-        st2             {v0.4S, v1.4S}, [x0], #32
+        movi            v6.4s, #1<<7, lsl #24
+1:      ld1             {v0.4s}, [x2], x3
+        ld1             {v1.4s}, [x1], #16
+        eor             v0.16b, v0.16b, v6.16b
+        rev64           v0.4s, v0.4s
+        ext             v0.16b, v0.16b, v0.16b, #8
+        st2             {v0.4s, v1.4s}, [x0], #32
         subs            x4, x4, #4
         b.gt            1b
         ret
@@ -141,13 +141,13 @@  function ff_sbr_qmf_deint_neg_neon, export=1
         add             x2, x0, #60*4
         mov             x3, #-32
         mov             x4, #32
-        movi            v2.4S, #1<<7, lsl #24
-1:      ld2             {v0.4S, v1.4S}, [x1], x3
-        eor             v0.16B, v0.16B, v2.16B
-        rev64           v1.4S, v1.4S
-        ext             v1.16B, v1.16B, v1.16B, #8
-        st1             {v0.4S}, [x2]
-        st1             {v1.4S}, [x0], #16
+        movi            v2.4s, #1<<7, lsl #24
+1:      ld2             {v0.4s, v1.4s}, [x1], x3
+        eor             v0.16b, v0.16b, v2.16b
+        rev64           v1.4s, v1.4s
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st1             {v0.4s}, [x2]
+        st1             {v1.4s}, [x0], #16
         sub             x2, x2, #16
         subs            x4, x4, #4
         b.gt            1b
@@ -159,16 +159,16 @@  function ff_sbr_qmf_deint_bfly_neon, export=1
         add             x3, x0, #124*4
         mov             x4, #64
         mov             x5, #-16
-1:      ld1             {v0.4S}, [x1], #16
-        ld1             {v1.4S}, [x2], x5
-        rev64           v2.4S, v0.4S
-        ext             v2.16B, v2.16B, v2.16B, #8
-        rev64           v3.4S, v1.4S
-        ext             v3.16B, v3.16B, v3.16B, #8
-        fadd            v1.4S, v1.4S, v2.4S
-        fsub            v0.4S, v0.4S, v3.4S
-        st1             {v0.4S}, [x0], #16
-        st1             {v1.4S}, [x3], x5
+1:      ld1             {v0.4s}, [x1], #16
+        ld1             {v1.4s}, [x2], x5
+        rev64           v2.4s, v0.4s
+        ext             v2.16b, v2.16b, v2.16b, #8
+        rev64           v3.4s, v1.4s
+        ext             v3.16b, v3.16b, v3.16b, #8
+        fadd            v1.4s, v1.4s, v2.4s
+        fsub            v0.4s, v0.4s, v3.4s
+        st1             {v0.4s}, [x0], #16
+        st1             {v1.4s}, [x3], x5
         subs            x4, x4, #4
         b.gt            1b
         ret
@@ -178,32 +178,32 @@  function ff_sbr_hf_gen_neon, export=1
         sxtw            x4, w4
         sxtw            x5, w5
         movrel          x6, factors
-        ld1             {v7.4S}, [x6]
-        dup             v1.4S, v0.S[0]
-        mov             v2.8B, v1.8B
-        mov             v2.S[2], v7.S[0]
-        mov             v2.S[3], v7.S[0]
-        fmul            v1.4S, v1.4S, v2.4S
-        ld1             {v0.D}[0], [x3]
-        ld1             {v0.D}[1], [x2]
-        fmul            v0.4S, v0.4S, v1.4S
-        fmul            v1.4S, v0.4S, v7.4S
-        rev64           v0.4S, v0.4S
+        ld1             {v7.4s}, [x6]
+        dup             v1.4s, v0.s[0]
+        mov             v2.8b, v1.8b
+        mov             v2.s[2], v7.s[0]
+        mov             v2.s[3], v7.s[0]
+        fmul            v1.4s, v1.4s, v2.4s
+        ld1             {v0.d}[0], [x3]
+        ld1             {v0.d}[1], [x2]
+        fmul            v0.4s, v0.4s, v1.4s
+        fmul            v1.4s, v0.4s, v7.4s
+        rev64           v0.4s, v0.4s
         sub             x7, x5, x4
         add             x0, x0, x4, lsl #3
         add             x1, x1, x4, lsl #3
         sub             x1, x1, #16
-1:      ld1             {v2.4S}, [x1], #16
-        ld1             {v3.2S}, [x1]
-        fmul            v4.4S, v2.4S, v1.4S
-        fmul            v5.4S, v2.4S, v0.4S
-        faddp           v4.4S, v4.4S, v4.4S
-        faddp           v5.4S, v5.4S, v5.4S
-        faddp           v4.4S, v4.4S, v4.4S
-        faddp           v5.4S, v5.4S, v5.4S
-        mov             v4.S[1], v5.S[0]
-        fadd            v4.2S, v4.2S, v3.2S
-        st1             {v4.2S}, [x0], #8
+1:      ld1             {v2.4s}, [x1], #16
+        ld1             {v3.2s}, [x1]
+        fmul            v4.4s, v2.4s, v1.4s
+        fmul            v5.4s, v2.4s, v0.4s
+        faddp           v4.4s, v4.4s, v4.4s
+        faddp           v5.4s, v5.4s, v5.4s
+        faddp           v4.4s, v4.4s, v4.4s
+        faddp           v5.4s, v5.4s, v5.4s
+        mov             v4.s[1], v5.s[0]
+        fadd            v4.2s, v4.2s, v3.2s
+        st1             {v4.2s}, [x0], #8
         sub             x1, x1, #8
         subs            x7, x7, #1
         b.gt            1b
@@ -215,10 +215,10 @@  function ff_sbr_hf_g_filt_neon, export=1
         sxtw            x4, w4
         mov             x5, #40*2*4
         add             x1, x1, x4, lsl #3
-1:      ld1             {v0.2S}, [x1], x5
-        ld1             {v1.S}[0], [x2], #4
-        fmul            v2.4S, v0.4S, v1.S[0]
-        st1             {v2.2S}, [x0], #8
+1:      ld1             {v0.2s}, [x1], x5
+        ld1             {v1.s}[0], [x2], #4
+        fmul            v2.4s, v0.4s, v1.s[0]
+        st1             {v2.2s}, [x0], #8
         subs            x3, x3, #1
         b.gt            1b
         ret
@@ -227,46 +227,46 @@  endfunc
 function ff_sbr_autocorrelate_neon, export=1
         mov             x2, #38
         movrel          x3, factors
-        ld1             {v0.4S}, [x3]
-        movi            v1.4S, #0
-        movi            v2.4S, #0
-        movi            v3.4S, #0
-        ld1             {v4.2S}, [x0], #8
-        ld1             {v5.2S}, [x0], #8
-        fmul            v16.2S, v4.2S, v4.2S
-        fmul            v17.2S, v5.2S, v4.S[0]
-        fmul            v18.2S, v5.2S, v4.S[1]
-1:      ld1             {v5.D}[1], [x0], #8
-        fmla            v1.2S, v4.2S, v4.2S
-        fmla            v2.4S, v5.4S, v4.S[0]
-        fmla            v3.4S, v5.4S, v4.S[1]
-        mov             v4.D[0], v5.D[0]
-        mov             v5.D[0], v5.D[1]
+        ld1             {v0.4s}, [x3]
+        movi            v1.4s, #0
+        movi            v2.4s, #0
+        movi            v3.4s, #0
+        ld1             {v4.2s}, [x0], #8
+        ld1             {v5.2s}, [x0], #8
+        fmul            v16.2s, v4.2s, v4.2s
+        fmul            v17.2s, v5.2s, v4.s[0]
+        fmul            v18.2s, v5.2s, v4.s[1]
+1:      ld1             {v5.d}[1], [x0], #8
+        fmla            v1.2s, v4.2s, v4.2s
+        fmla            v2.4s, v5.4s, v4.s[0]
+        fmla            v3.4s, v5.4s, v4.s[1]
+        mov             v4.d[0], v5.d[0]
+        mov             v5.d[0], v5.d[1]
         subs            x2, x2, #1
         b.gt            1b
-        fmul            v19.2S, v4.2S, v4.2S
-        fmul            v20.2S, v5.2S, v4.S[0]
-        fmul            v21.2S, v5.2S, v4.S[1]
-        fadd            v22.4S, v2.4S, v20.4S
-        fsub            v22.4S, v22.4S, v17.4S
-        fadd            v23.4S, v3.4S, v21.4S
-        fsub            v23.4S, v23.4S, v18.4S
-        rev64           v23.4S, v23.4S
-        fmul            v23.4S, v23.4S, v0.4S
-        fadd            v22.4S, v22.4S, v23.4S
-        st1             {v22.4S}, [x1], #16
-        fadd            v23.2S, v1.2S, v19.2S
-        fsub            v23.2S, v23.2S, v16.2S
-        faddp           v23.2S, v23.2S, v23.2S
-        st1             {v23.S}[0], [x1]
+        fmul            v19.2s, v4.2s, v4.2s
+        fmul            v20.2s, v5.2s, v4.s[0]
+        fmul            v21.2s, v5.2s, v4.s[1]
+        fadd            v22.4s, v2.4s, v20.4s
+        fsub            v22.4s, v22.4s, v17.4s
+        fadd            v23.4s, v3.4s, v21.4s
+        fsub            v23.4s, v23.4s, v18.4s
+        rev64           v23.4s, v23.4s
+        fmul            v23.4s, v23.4s, v0.4s
+        fadd            v22.4s, v22.4s, v23.4s
+        st1             {v22.4s}, [x1], #16
+        fadd            v23.2s, v1.2s, v19.2s
+        fsub            v23.2s, v23.2s, v16.2s
+        faddp           v23.2s, v23.2s, v23.2s
+        st1             {v23.s}[0], [x1]
         add             x1, x1, #8
-        rev64           v3.2S, v3.2S
-        fmul            v3.2S, v3.2S, v0.2S
-        fadd            v2.2S, v2.2S, v3.2S
-        st1             {v2.2S}, [x1]
+        rev64           v3.2s, v3.2s
+        fmul            v3.2s, v3.2s, v0.2s
+        fadd            v2.2s, v2.2s, v3.2s
+        st1             {v2.2s}, [x1]
         add             x1, x1, #16
-        faddp           v1.2S, v1.2S, v1.2S
-        st1             {v1.S}[0], [x1]
+        faddp           v1.2s, v1.2s, v1.2s
+        st1             {v1.s}[0], [x1]
         ret
 endfunc
 
@@ -278,25 +278,25 @@  endfunc
 1:      and             x3, x3, #0x1ff
         add             x8, x7, x3, lsl #3
         add             x3, x3, #2
-        ld1             {v2.4S}, [x0]
-        ld1             {v3.2S}, [x1], #8
-        ld1             {v4.2S}, [x2], #8
-        ld1             {v5.4S}, [x8]
-        mov             v6.16B, v2.16B
-        zip1            v3.4S, v3.4S, v3.4S
-        zip1            v4.4S, v4.4S, v4.4S
-        fmla            v6.4S, v1.4S, v3.4S
-        fmla            v2.4S, v5.4S, v4.4S
-        fcmeq           v7.4S, v3.4S, #0
-        bif             v2.16B, v6.16B, v7.16B
-        st1             {v2.4S}, [x0], #16
+        ld1             {v2.4s}, [x0]
+        ld1             {v3.2s}, [x1], #8
+        ld1             {v4.2s}, [x2], #8
+        ld1             {v5.4s}, [x8]
+        mov             v6.16b, v2.16b
+        zip1            v3.4s, v3.4s, v3.4s
+        zip1            v4.4s, v4.4s, v4.4s
+        fmla            v6.4s, v1.4s, v3.4s
+        fmla            v2.4s, v5.4s, v4.4s
+        fcmeq           v7.4s, v3.4s, #0
+        bif             v2.16b, v6.16b, v7.16b
+        st1             {v2.4s}, [x0], #16
         subs            x5, x5, #2
         b.gt            1b
 .endm
 
 function ff_sbr_hf_apply_noise_0_neon, export=1
         movrel          x9, phi_noise_0
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
         apply_noise_common
         ret
 endfunc
@@ -305,14 +305,14 @@  function ff_sbr_hf_apply_noise_1_neon, export=1
         movrel          x9, phi_noise_1
         and             x4, x4, #1
         add             x9, x9, x4, lsl #4
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
         apply_noise_common
         ret
 endfunc
 
 function ff_sbr_hf_apply_noise_2_neon, export=1
         movrel          x9, phi_noise_2
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
         apply_noise_common
         ret
 endfunc
@@ -321,7 +321,7 @@  function ff_sbr_hf_apply_noise_3_neon, export=1
         movrel          x9, phi_noise_3
         and             x4, x4, #1
         add             x9, x9, x4, lsl #4
-        ld1             {v1.4S}, [x9]
+        ld1             {v1.4s}, [x9]
         apply_noise_common
         ret
 endfunc
diff --git a/libavcodec/aarch64/simple_idct_neon.S b/libavcodec/aarch64/simple_idct_neon.S
index 210182ff21..e9deae5ced 100644
--- a/libavcodec/aarch64/simple_idct_neon.S
+++ b/libavcodec/aarch64/simple_idct_neon.S
@@ -37,24 +37,24 @@ 
 #define ROW_SHIFT 11
 #define COL_SHIFT 20
 
-#define z1 v0.H[0]
-#define z2 v0.H[1]
-#define z3 v0.H[2]
-#define z4 v0.H[3]
-#define z5 v0.H[4]
-#define z6 v0.H[5]
-#define z7 v0.H[6]
-#define z4c v0.H[7]
-
-const   idct_coeff_neon, align=4
-        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+#define z1 v0.h[0]
+#define z2 v0.h[1]
+#define z3 v0.h[2]
+#define z4 v0.h[3]
+#define z5 v0.h[4]
+#define z6 v0.h[5]
+#define z7 v0.h[6]
+#define z4c v0.h[7]
+
+const idct_coeff_neon, align=4
+        .short          Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
 endconst
 
 .macro idct_start data
         prfm            pldl1keep, [\data]
         mov             x10, x30
         movrel          x3, idct_coeff_neon
-        ld1             {v0.2D}, [x3]
+        ld1             {v0.2d}, [x3]
 .endm
 
 .macro idct_end
@@ -74,146 +74,146 @@  endconst
 .endm
 
 .macro idct_col4_top y1, y2, y3, y4, i, l
-        smull\i         v7.4S,  \y3\l, z2
-        smull\i         v16.4S, \y3\l, z6
-        smull\i         v17.4S, \y2\l, z1
-        add             v19.4S, v23.4S, v7.4S
-        smull\i         v18.4S, \y2\l, z3
-        add             v20.4S, v23.4S, v16.4S
-        smull\i         v5.4S,  \y2\l, z5
-        sub             v21.4S, v23.4S, v16.4S
-        smull\i         v6.4S,  \y2\l, z7
-        sub             v22.4S, v23.4S, v7.4S
-
-        smlal\i         v17.4S, \y4\l, z3
-        smlsl\i         v18.4S, \y4\l, z7
-        smlsl\i         v5.4S,  \y4\l, z1
-        smlsl\i         v6.4S,  \y4\l, z5
+        smull\i         v7.4s,  \y3\l, z2
+        smull\i         v16.4s, \y3\l, z6
+        smull\i         v17.4s, \y2\l, z1
+        add             v19.4s, v23.4s, v7.4s
+        smull\i         v18.4s, \y2\l, z3
+        add             v20.4s, v23.4s, v16.4s
+        smull\i         v5.4s,  \y2\l, z5
+        sub             v21.4s, v23.4s, v16.4s
+        smull\i         v6.4s,  \y2\l, z7
+        sub             v22.4s, v23.4s, v7.4s
+
+        smlal\i         v17.4s, \y4\l, z3
+        smlsl\i         v18.4s, \y4\l, z7
+        smlsl\i         v5.4s,  \y4\l, z1
+        smlsl\i         v6.4s,  \y4\l, z5
 .endm
 
 .macro idct_row4_neon y1, y2, y3, y4, pass
-        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
-        movi            v23.4S, #1<<2, lsl #8
-        orr             v5.16B, \y1\().16B, \y2\().16B
-        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
-        orr             v6.16B, \y3\().16B, \y4\().16B
-        orr             v5.16B, v5.16B, v6.16B
-        mov             x3, v5.D[1]
-        smlal           v23.4S, \y1\().4H, z4
+        ld1             {\y1\().2d,\y2\().2d}, [x2], #32
+        movi            v23.4s, #1<<2, lsl #8
+        orr             v5.16b, \y1\().16b, \y2\().16b
+        ld1             {\y3\().2d,\y4\().2d}, [x2], #32
+        orr             v6.16b, \y3\().16b, \y4\().16b
+        orr             v5.16b, v5.16b, v6.16b
+        mov             x3, v5.d[1]
+        smlal           v23.4s, \y1\().4h, z4
 
         idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
 
         cmp             x3, #0
         b.eq            \pass\()f
 
-        smull2          v7.4S, \y1\().8H, z4
-        smlal2          v17.4S, \y2\().8H, z5
-        smlsl2          v18.4S, \y2\().8H, z1
-        smull2          v16.4S, \y3\().8H, z2
-        smlal2          v5.4S, \y2\().8H, z7
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v7.4S
-        sub             v21.4S, v21.4S, v7.4S
-        add             v22.4S, v22.4S, v7.4S
-        smlal2          v6.4S, \y2\().8H, z3
-        smull2          v7.4S, \y3\().8H, z6
-        smlal2          v17.4S, \y4\().8H, z7
-        smlsl2          v18.4S, \y4\().8H, z5
-        smlal2          v5.4S, \y4\().8H, z3
-        smlsl2          v6.4S, \y4\().8H, z1
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v16.4S
-        add             v21.4S, v21.4S, v16.4S
-        sub             v22.4S, v22.4S, v7.4S
-
-\pass:  add             \y3\().4S, v19.4S, v17.4S
-        add             \y4\().4S, v20.4S, v18.4S
-        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
-        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
-        add             v7.4S, v21.4S, v5.4S
-        add             v16.4S, v22.4S, v6.4S
-        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
-        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
-        sub             v22.4S, v22.4S, v6.4S
-        sub             v19.4S, v19.4S, v17.4S
-        sub             v21.4S, v21.4S, v5.4S
-        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
-        sub             v20.4S, v20.4S, v18.4S
-        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
-        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
-        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
-
-        trn1            v16.8H, \y1\().8H, \y2\().8H
-        trn2            v17.8H, \y1\().8H, \y2\().8H
-        trn1            v18.8H, \y3\().8H, \y4\().8H
-        trn2            v19.8H, \y3\().8H, \y4\().8H
-        trn1            \y1\().4S, v16.4S, v18.4S
-        trn1            \y2\().4S, v17.4S, v19.4S
-        trn2            \y3\().4S, v16.4S, v18.4S
-        trn2            \y4\().4S, v17.4S, v19.4S
+        smull2          v7.4s, \y1\().8h, z4
+        smlal2          v17.4s, \y2\().8h, z5
+        smlsl2          v18.4s, \y2\().8h, z1
+        smull2          v16.4s, \y3\().8h, z2
+        smlal2          v5.4s, \y2\().8h, z7
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v7.4s
+        sub             v21.4s, v21.4s, v7.4s
+        add             v22.4s, v22.4s, v7.4s
+        smlal2          v6.4s, \y2\().8h, z3
+        smull2          v7.4s, \y3\().8h, z6
+        smlal2          v17.4s, \y4\().8h, z7
+        smlsl2          v18.4s, \y4\().8h, z5
+        smlal2          v5.4s, \y4\().8h, z3
+        smlsl2          v6.4s, \y4\().8h, z1
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v16.4s
+        add             v21.4s, v21.4s, v16.4s
+        sub             v22.4s, v22.4s, v7.4s
+
+\pass:  add             \y3\().4s, v19.4s, v17.4s
+        add             \y4\().4s, v20.4s, v18.4s
+        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
+        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
+        add             v7.4s, v21.4s, v5.4s
+        add             v16.4s, v22.4s, v6.4s
+        shrn            \y3\().4h, v7.4s, #ROW_SHIFT
+        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
+        sub             v22.4s, v22.4s, v6.4s
+        sub             v19.4s, v19.4s, v17.4s
+        sub             v21.4s, v21.4s, v5.4s
+        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
+        sub             v20.4s, v20.4s, v18.4s
+        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
+        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
+        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT
+
+        trn1            v16.8h, \y1\().8h, \y2\().8h
+        trn2            v17.8h, \y1\().8h, \y2\().8h
+        trn1            v18.8h, \y3\().8h, \y4\().8h
+        trn2            v19.8h, \y3\().8h, \y4\().8h
+        trn1            \y1\().4s, v16.4s, v18.4s
+        trn1            \y2\().4s, v17.4s, v19.4s
+        trn2            \y3\().4s, v16.4s, v18.4s
+        trn2            \y4\().4s, v17.4s, v19.4s
 .endm
 
 .macro declare_idct_col4_neon i, l
 function idct_col4_neon\i
-        dup             v23.4H, z4c
+        dup             v23.4h, z4c
 .if \i == 1
-        add             v23.4H, v23.4H, v24.4H
+        add             v23.4h, v23.4h, v24.4h
 .else
-        mov             v5.D[0], v24.D[1]
-        add             v23.4H, v23.4H, v5.4H
+        mov             v5.d[0], v24.d[1]
+        add             v23.4h, v23.4h, v5.4h
 .endif
-        smull           v23.4S, v23.4H, z4
+        smull           v23.4s, v23.4h, z4
 
         idct_col4_top   v24, v25, v26, v27, \i, \l
 
-        mov             x4, v28.D[\i - 1]
-        mov             x5, v29.D[\i - 1]
+        mov             x4, v28.d[\i - 1]
+        mov             x5, v29.d[\i - 1]
         cmp             x4, #0
         b.eq            1f
 
-        smull\i         v7.4S,  v28\l,  z4
-        add             v19.4S, v19.4S, v7.4S
-        sub             v20.4S, v20.4S, v7.4S
-        sub             v21.4S, v21.4S, v7.4S
-        add             v22.4S, v22.4S, v7.4S
+        smull\i         v7.4s,  v28\l,  z4
+        add             v19.4s, v19.4s, v7.4s
+        sub             v20.4s, v20.4s, v7.4s
+        sub             v21.4s, v21.4s, v7.4s
+        add             v22.4s, v22.4s, v7.4s
 
-1:      mov             x4, v30.D[\i - 1]
+1:      mov             x4, v30.d[\i - 1]
         cmp             x5, #0
         b.eq            2f
 
-        smlal\i         v17.4S, v29\l, z5
-        smlsl\i         v18.4S, v29\l, z1
-        smlal\i         v5.4S,  v29\l, z7
-        smlal\i         v6.4S,  v29\l, z3
+        smlal\i         v17.4s, v29\l, z5
+        smlsl\i         v18.4s, v29\l, z1
+        smlal\i         v5.4s,  v29\l, z7
+        smlal\i         v6.4s,  v29\l, z3
 
-2:      mov             x5, v31.D[\i - 1]
+2:      mov             x5, v31.d[\i - 1]
         cmp             x4, #0
         b.eq            3f
 
-        smull\i         v7.4S,  v30\l, z6
-        smull\i         v16.4S, v30\l, z2
-        add             v19.4S, v19.4S, v7.4S
-        sub             v22.4S, v22.4S, v7.4S
-        sub             v20.4S, v20.4S, v16.4S
-        add             v21.4S, v21.4S, v16.4S
+        smull\i         v7.4s,  v30\l, z6
+        smull\i         v16.4s, v30\l, z2
+        add             v19.4s, v19.4s, v7.4s
+        sub             v22.4s, v22.4s, v7.4s
+        sub             v20.4s, v20.4s, v16.4s
+        add             v21.4s, v21.4s, v16.4s
 
 3:      cmp             x5, #0
         b.eq            4f
 
-        smlal\i         v17.4S, v31\l, z7
-        smlsl\i         v18.4S, v31\l, z5
-        smlal\i         v5.4S,  v31\l, z3
-        smlsl\i         v6.4S,  v31\l, z1
+        smlal\i         v17.4s, v31\l, z7
+        smlsl\i         v18.4s, v31\l, z5
+        smlal\i         v5.4s,  v31\l, z3
+        smlsl\i         v6.4s,  v31\l, z1
 
-4:      addhn           v7.4H, v19.4S, v17.4S
-        addhn2          v7.8H, v20.4S, v18.4S
-        subhn           v18.4H, v20.4S, v18.4S
-        subhn2          v18.8H, v19.4S, v17.4S
+4:      addhn           v7.4h, v19.4s, v17.4s
+        addhn2          v7.8h, v20.4s, v18.4s
+        subhn           v18.4h, v20.4s, v18.4s
+        subhn2          v18.8h, v19.4s, v17.4s
 
-        addhn           v16.4H, v21.4S, v5.4S
-        addhn2          v16.8H, v22.4S, v6.4S
-        subhn           v17.4H, v22.4S, v6.4S
-        subhn2          v17.8H, v21.4S, v5.4S
+        addhn           v16.4h, v21.4s, v5.4s
+        addhn2          v16.8h, v22.4s, v6.4s
+        subhn           v17.4h, v22.4s, v6.4s
+        subhn2          v17.8h, v21.4s, v5.4s
 
         ret
 endfunc
@@ -229,33 +229,33 @@  function ff_simple_idct_put_neon, export=1
         idct_row4_neon  v28, v29, v30, v31, 2
         bl              idct_col4_neon1
 
-        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
-        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
-        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
-        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
+        sqshrun         v1.8b,  v7.8h, #COL_SHIFT-16
+        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
+        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
+        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16
 
         bl              idct_col4_neon2
 
-        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
-        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
-        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
-        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
+        sqshrun         v2.8b,  v7.8h, #COL_SHIFT-16
+        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
+        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
+        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16
 
-        zip1            v16.4S, v1.4S, v2.4S
-        zip2            v17.4S, v1.4S, v2.4S
+        zip1            v16.4s, v1.4s, v2.4s
+        zip2            v17.4s, v1.4s, v2.4s
 
-        st1             {v16.D}[0], [x0], x1
-        st1             {v16.D}[1], [x0], x1
+        st1             {v16.d}[0], [x0], x1
+        st1             {v16.d}[1], [x0], x1
 
-        zip1            v18.4S, v3.4S, v4.4S
-        zip2            v19.4S, v3.4S, v4.4S
+        zip1            v18.4s, v3.4s, v4.4s
+        zip2            v19.4s, v3.4s, v4.4s
 
-        st1             {v17.D}[0], [x0], x1
-        st1             {v17.D}[1], [x0], x1
-        st1             {v18.D}[0], [x0], x1
-        st1             {v18.D}[1], [x0], x1
-        st1             {v19.D}[0], [x0], x1
-        st1             {v19.D}[1], [x0], x1
+        st1             {v17.d}[0], [x0], x1
+        st1             {v17.d}[1], [x0], x1
+        st1             {v18.d}[0], [x0], x1
+        st1             {v18.d}[1], [x0], x1
+        st1             {v19.d}[0], [x0], x1
+        st1             {v19.d}[1], [x0], x1
 
         idct_end
 endfunc
@@ -267,59 +267,59 @@  function ff_simple_idct_add_neon, export=1
         idct_row4_neon  v28, v29, v30, v31, 2
         bl              idct_col4_neon1
 
-        sshr            v1.8H, v7.8H, #COL_SHIFT-16
-        sshr            v2.8H, v16.8H, #COL_SHIFT-16
-        sshr            v3.8H, v17.8H, #COL_SHIFT-16
-        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+        sshr            v1.8h, v7.8h, #COL_SHIFT-16
+        sshr            v2.8h, v16.8h, #COL_SHIFT-16
+        sshr            v3.8h, v17.8h, #COL_SHIFT-16
+        sshr            v4.8h, v18.8h, #COL_SHIFT-16
 
         bl              idct_col4_neon2
 
-        sshr            v7.8H, v7.8H, #COL_SHIFT-16
-        sshr            v16.8H, v16.8H, #COL_SHIFT-16
-        sshr            v17.8H, v17.8H, #COL_SHIFT-16
-        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+        sshr            v7.8h, v7.8h, #COL_SHIFT-16
+        sshr            v16.8h, v16.8h, #COL_SHIFT-16
+        sshr            v17.8h, v17.8h, #COL_SHIFT-16
+        sshr            v18.8h, v18.8h, #COL_SHIFT-16
 
         mov             x9,  x0
-        ld1             {v19.D}[0], [x0], x1
-        zip1            v23.2D, v1.2D, v7.2D
-        zip2            v24.2D, v1.2D, v7.2D
-        ld1             {v19.D}[1], [x0], x1
-        zip1            v25.2D, v2.2D, v16.2D
-        zip2            v26.2D, v2.2D, v16.2D
-        ld1             {v20.D}[0], [x0], x1
-        zip1            v27.2D, v3.2D, v17.2D
-        zip2            v28.2D, v3.2D, v17.2D
-        ld1             {v20.D}[1], [x0], x1
-        zip1            v29.2D, v4.2D, v18.2D
-        zip2            v30.2D, v4.2D, v18.2D
-        ld1             {v21.D}[0], [x0], x1
-        uaddw           v23.8H, v23.8H, v19.8B
-        uaddw2          v24.8H, v24.8H, v19.16B
-        ld1             {v21.D}[1], [x0], x1
-        sqxtun          v23.8B, v23.8H
-        sqxtun2         v23.16B, v24.8H
-        ld1             {v22.D}[0], [x0], x1
-        uaddw           v24.8H, v25.8H, v20.8B
-        uaddw2          v25.8H, v26.8H, v20.16B
-        ld1             {v22.D}[1], [x0], x1
-        sqxtun          v24.8B, v24.8H
-        sqxtun2         v24.16B, v25.8H
-        st1             {v23.D}[0], [x9], x1
-        uaddw           v25.8H, v27.8H, v21.8B
-        uaddw2          v26.8H, v28.8H, v21.16B
-        st1             {v23.D}[1], [x9], x1
-        sqxtun          v25.8B, v25.8H
-        sqxtun2         v25.16B, v26.8H
-        st1             {v24.D}[0], [x9], x1
-        uaddw           v26.8H, v29.8H, v22.8B
-        uaddw2          v27.8H, v30.8H, v22.16B
-        st1             {v24.D}[1], [x9], x1
-        sqxtun          v26.8B, v26.8H
-        sqxtun2         v26.16B, v27.8H
-        st1             {v25.D}[0], [x9], x1
-        st1             {v25.D}[1], [x9], x1
-        st1             {v26.D}[0], [x9], x1
-        st1             {v26.D}[1], [x9], x1
+        ld1             {v19.d}[0], [x0], x1
+        zip1            v23.2d, v1.2d, v7.2d
+        zip2            v24.2d, v1.2d, v7.2d
+        ld1             {v19.d}[1], [x0], x1
+        zip1            v25.2d, v2.2d, v16.2d
+        zip2            v26.2d, v2.2d, v16.2d
+        ld1             {v20.d}[0], [x0], x1
+        zip1            v27.2d, v3.2d, v17.2d
+        zip2            v28.2d, v3.2d, v17.2d
+        ld1             {v20.d}[1], [x0], x1
+        zip1            v29.2d, v4.2d, v18.2d
+        zip2            v30.2d, v4.2d, v18.2d
+        ld1             {v21.d}[0], [x0], x1
+        uaddw           v23.8h, v23.8h, v19.8b
+        uaddw2          v24.8h, v24.8h, v19.16b
+        ld1             {v21.d}[1], [x0], x1
+        sqxtun          v23.8b, v23.8h
+        sqxtun2         v23.16b, v24.8h
+        ld1             {v22.d}[0], [x0], x1
+        uaddw           v24.8h, v25.8h, v20.8b
+        uaddw2          v25.8h, v26.8h, v20.16b
+        ld1             {v22.d}[1], [x0], x1
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v25.8h
+        st1             {v23.d}[0], [x9], x1
+        uaddw           v25.8h, v27.8h, v21.8b
+        uaddw2          v26.8h, v28.8h, v21.16b
+        st1             {v23.d}[1], [x9], x1
+        sqxtun          v25.8b, v25.8h
+        sqxtun2         v25.16b, v26.8h
+        st1             {v24.d}[0], [x9], x1
+        uaddw           v26.8h, v29.8h, v22.8b
+        uaddw2          v27.8h, v30.8h, v22.16b
+        st1             {v24.d}[1], [x9], x1
+        sqxtun          v26.8b, v26.8h
+        sqxtun2         v26.16b, v27.8h
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x9], x1
+        st1             {v26.d}[0], [x9], x1
+        st1             {v26.d}[1], [x9], x1
 
         idct_end
 endfunc
@@ -333,30 +333,30 @@  function ff_simple_idct_neon, export=1
         sub             x2, x2, #128
         bl              idct_col4_neon1
 
-        sshr            v1.8H, v7.8H, #COL_SHIFT-16
-        sshr            v2.8H, v16.8H, #COL_SHIFT-16
-        sshr            v3.8H, v17.8H, #COL_SHIFT-16
-        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+        sshr            v1.8h, v7.8h, #COL_SHIFT-16
+        sshr            v2.8h, v16.8h, #COL_SHIFT-16
+        sshr            v3.8h, v17.8h, #COL_SHIFT-16
+        sshr            v4.8h, v18.8h, #COL_SHIFT-16
 
         bl              idct_col4_neon2
 
-        sshr            v7.8H, v7.8H, #COL_SHIFT-16
-        sshr            v16.8H, v16.8H, #COL_SHIFT-16
-        sshr            v17.8H, v17.8H, #COL_SHIFT-16
-        sshr            v18.8H, v18.8H, #COL_SHIFT-16
-
-        zip1            v23.2D, v1.2D, v7.2D
-        zip2            v24.2D, v1.2D, v7.2D
-        st1             {v23.2D,v24.2D}, [x2], #32
-        zip1            v25.2D, v2.2D, v16.2D
-        zip2            v26.2D, v2.2D, v16.2D
-        st1             {v25.2D,v26.2D}, [x2], #32
-        zip1            v27.2D, v3.2D, v17.2D
-        zip2            v28.2D, v3.2D, v17.2D
-        st1             {v27.2D,v28.2D}, [x2], #32
-        zip1            v29.2D, v4.2D, v18.2D
-        zip2            v30.2D, v4.2D, v18.2D
-        st1             {v29.2D,v30.2D}, [x2], #32
+        sshr            v7.8h, v7.8h, #COL_SHIFT-16
+        sshr            v16.8h, v16.8h, #COL_SHIFT-16
+        sshr            v17.8h, v17.8h, #COL_SHIFT-16
+        sshr            v18.8h, v18.8h, #COL_SHIFT-16
+
+        zip1            v23.2d, v1.2d, v7.2d
+        zip2            v24.2d, v1.2d, v7.2d
+        st1             {v23.2d,v24.2d}, [x2], #32
+        zip1            v25.2d, v2.2d, v16.2d
+        zip2            v26.2d, v2.2d, v16.2d
+        st1             {v25.2d,v26.2d}, [x2], #32
+        zip1            v27.2d, v3.2d, v17.2d
+        zip2            v28.2d, v3.2d, v17.2d
+        st1             {v27.2d,v28.2d}, [x2], #32
+        zip1            v29.2d, v4.2d, v18.2d
+        zip2            v30.2d, v4.2d, v18.2d
+        st1             {v29.2d,v30.2d}, [x2], #32
 
         idct_end
 endfunc
diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S
index ba79ba9686..9efc5c459d 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -72,8 +72,7 @@  function ff_synth_filter_float_neon, export=1
         add             x8,  x9,  #12*4
         mov             x15, #64*4
         mov             x1,  #4
-1:
-        add             x10, x9,  #16*4         // synth_buf
+1:      add             x10, x9,  #16*4         // synth_buf
         add             x11, x8,  #16*4
         add             x5,  x4,  #16*4         // window
         add             x6,  x4,  #32*4
@@ -85,8 +84,7 @@  function ff_synth_filter_float_neon, export=1
         movi            v7.4s,  #0              // d
 
         mov             x12, #512
-2:
-        sub             x12, x12, #64
+2:      sub             x12, x12, #64
         cmp             x12, x0
         inner_loop
         b.gt            2b
@@ -96,12 +94,10 @@  function ff_synth_filter_float_neon, export=1
         cbz             x12, 4f
         sub             x10, x10, #512*4
         sub             x11, x11, #512*4
-3:
-        subs            x12, x12, #64
+3:      subs            x12, x12, #64
         inner_loop
         b.gt            3b
-4:
-        subs            x1,  x1,  #1
+4:      subs            x1,  x1,  #1
         fmul            v4.4s,  v4.4s,  v0.s[0]
         fmul            v5.4s,  v5.4s,  v0.s[0]
         st1             {v6.4s},   [x2],  #16
@@ -115,7 +111,6 @@  function ff_synth_filter_float_neon, export=1
         sub             x8,  x8,  #4*4          // synth_buf
         b               1b
 
-10:
-        add             sp,  sp,  #64
+10:     add             sp,  sp,  #64
         ret
 endfunc
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index 9a96c2523c..7df48ea000 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -698,11 +698,11 @@  endfunc
 
 .align  5
 .Lcoeffs_it8:
-.quad   0x000F00090003
+        .quad           0x000F00090003
 .Lcoeffs_it4:
-.quad   0x0011000B0005
+        .quad           0x0011000B0005
 .Lcoeffs:
-.quad   0x00050002
+        .quad           0x00050002
 
 // VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
 // On entry:
@@ -1391,35 +1391,35 @@  function ff_vc1_unescape_buffer_helper_neon, export=1
         tst             w1, #32
         b.ne            1f
 
-          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
-          ext             v25.16b, v0.16b, v1.16b, #1
-          ext             v26.16b, v0.16b, v1.16b, #2
-          ext             v27.16b, v0.16b, v1.16b, #3
-          ext             v29.16b, v1.16b, v2.16b, #1
-          ext             v30.16b, v1.16b, v2.16b, #2
-          ext             v31.16b, v1.16b, v2.16b, #3
-          bic             v24.16b, v0.16b, v20.16b
-          bic             v25.16b, v25.16b, v20.16b
-          bic             v26.16b, v26.16b, v20.16b
-          bic             v27.16b, v27.16b, v20.16b
-          bic             v28.16b, v1.16b, v20.16b
-          bic             v29.16b, v29.16b, v20.16b
-          bic             v30.16b, v30.16b, v20.16b
-          bic             v31.16b, v31.16b, v20.16b
-          eor             v24.16b, v24.16b, v21.16b
-          eor             v25.16b, v25.16b, v21.16b
-          eor             v26.16b, v26.16b, v21.16b
-          eor             v27.16b, v27.16b, v21.16b
-          eor             v28.16b, v28.16b, v21.16b
-          eor             v29.16b, v29.16b, v21.16b
-          eor             v30.16b, v30.16b, v21.16b
-          eor             v31.16b, v31.16b, v21.16b
-          cmeq            v24.4s, v24.4s, #0
-          cmeq            v25.4s, v25.4s, #0
-          cmeq            v26.4s, v26.4s, #0
-          cmeq            v27.4s, v27.4s, #0
-          add             w1, w1, #32
-          b               3f
+        ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48
+        ext             v25.16b, v0.16b, v1.16b, #1
+        ext             v26.16b, v0.16b, v1.16b, #2
+        ext             v27.16b, v0.16b, v1.16b, #3
+        ext             v29.16b, v1.16b, v2.16b, #1
+        ext             v30.16b, v1.16b, v2.16b, #2
+        ext             v31.16b, v1.16b, v2.16b, #3
+        bic             v24.16b, v0.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v1.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        add             w1, w1, #32
+        b               3f
 
 1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48
         ext             v25.16b, v3.16b, v4.16b, #1
@@ -1449,8 +1449,8 @@  function ff_vc1_unescape_buffer_helper_neon, export=1
         cmeq            v26.4s, v26.4s, #0
         cmeq            v27.4s, v27.4s, #0
         // Drop through...
-2:        mov             v0.16b, v5.16b
-          ld1             {v1.16b, v2.16b}, [x0], #32
+2:      mov             v0.16b, v5.16b
+        ld1             {v1.16b, v2.16b}, [x0], #32
         cmeq            v28.4s, v28.4s, #0
         cmeq            v29.4s, v29.4s, #0
         cmeq            v30.4s, v30.4s, #0
@@ -1459,82 +1459,82 @@  function ff_vc1_unescape_buffer_helper_neon, export=1
         orr             v26.16b, v26.16b, v27.16b
         orr             v28.16b, v28.16b, v29.16b
         orr             v30.16b, v30.16b, v31.16b
-          ext             v25.16b, v0.16b, v1.16b, #1
+        ext             v25.16b, v0.16b, v1.16b, #1
         orr             v22.16b, v24.16b, v26.16b
-          ext             v26.16b, v0.16b, v1.16b, #2
-          ext             v27.16b, v0.16b, v1.16b, #3
-          ext             v29.16b, v1.16b, v2.16b, #1
+        ext             v26.16b, v0.16b, v1.16b, #2
+        ext             v27.16b, v0.16b, v1.16b, #3
+        ext             v29.16b, v1.16b, v2.16b, #1
         orr             v23.16b, v28.16b, v30.16b
-          ext             v30.16b, v1.16b, v2.16b, #2
-          ext             v31.16b, v1.16b, v2.16b, #3
-          bic             v24.16b, v0.16b, v20.16b
-          bic             v25.16b, v25.16b, v20.16b
-          bic             v26.16b, v26.16b, v20.16b
+        ext             v30.16b, v1.16b, v2.16b, #2
+        ext             v31.16b, v1.16b, v2.16b, #3
+        bic             v24.16b, v0.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
         orr             v22.16b, v22.16b, v23.16b
-          bic             v27.16b, v27.16b, v20.16b
-          bic             v28.16b, v1.16b, v20.16b
-          bic             v29.16b, v29.16b, v20.16b
-          bic             v30.16b, v30.16b, v20.16b
-          bic             v31.16b, v31.16b, v20.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v1.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
         addv            s22, v22.4s
-          eor             v24.16b, v24.16b, v21.16b
-          eor             v25.16b, v25.16b, v21.16b
-          eor             v26.16b, v26.16b, v21.16b
-          eor             v27.16b, v27.16b, v21.16b
-          eor             v28.16b, v28.16b, v21.16b
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
         mov             w3, v22.s[0]
-          eor             v29.16b, v29.16b, v21.16b
-          eor             v30.16b, v30.16b, v21.16b
-          eor             v31.16b, v31.16b, v21.16b
-          cmeq            v24.4s, v24.4s, #0
-          cmeq            v25.4s, v25.4s, #0
-          cmeq            v26.4s, v26.4s, #0
-          cmeq            v27.4s, v27.4s, #0
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
         cbnz            w3, 90f
         st1             {v3.16b, v4.16b}, [x2], #32
-3:          mov             v3.16b, v2.16b
-            ld1             {v4.16b, v5.16b}, [x0], #32
-          cmeq            v28.4s, v28.4s, #0
-          cmeq            v29.4s, v29.4s, #0
-          cmeq            v30.4s, v30.4s, #0
-          cmeq            v31.4s, v31.4s, #0
-          orr             v24.16b, v24.16b, v25.16b
-          orr             v26.16b, v26.16b, v27.16b
-          orr             v28.16b, v28.16b, v29.16b
-          orr             v30.16b, v30.16b, v31.16b
-            ext             v25.16b, v3.16b, v4.16b, #1
-          orr             v22.16b, v24.16b, v26.16b
-            ext             v26.16b, v3.16b, v4.16b, #2
-            ext             v27.16b, v3.16b, v4.16b, #3
-            ext             v29.16b, v4.16b, v5.16b, #1
-          orr             v23.16b, v28.16b, v30.16b
-            ext             v30.16b, v4.16b, v5.16b, #2
-            ext             v31.16b, v4.16b, v5.16b, #3
-            bic             v24.16b, v3.16b, v20.16b
-            bic             v25.16b, v25.16b, v20.16b
-            bic             v26.16b, v26.16b, v20.16b
-          orr             v22.16b, v22.16b, v23.16b
-            bic             v27.16b, v27.16b, v20.16b
-            bic             v28.16b, v4.16b, v20.16b
-            bic             v29.16b, v29.16b, v20.16b
-            bic             v30.16b, v30.16b, v20.16b
-            bic             v31.16b, v31.16b, v20.16b
-          addv            s22, v22.4s
-            eor             v24.16b, v24.16b, v21.16b
-            eor             v25.16b, v25.16b, v21.16b
-            eor             v26.16b, v26.16b, v21.16b
-            eor             v27.16b, v27.16b, v21.16b
-            eor             v28.16b, v28.16b, v21.16b
-          mov             w3, v22.s[0]
-            eor             v29.16b, v29.16b, v21.16b
-            eor             v30.16b, v30.16b, v21.16b
-            eor             v31.16b, v31.16b, v21.16b
-            cmeq            v24.4s, v24.4s, #0
-            cmeq            v25.4s, v25.4s, #0
-            cmeq            v26.4s, v26.4s, #0
-            cmeq            v27.4s, v27.4s, #0
-          cbnz            w3, 91f
-          st1             {v0.16b, v1.16b}, [x2], #32
+3:      mov             v3.16b, v2.16b
+        ld1             {v4.16b, v5.16b}, [x0], #32
+        cmeq            v28.4s, v28.4s, #0
+        cmeq            v29.4s, v29.4s, #0
+        cmeq            v30.4s, v30.4s, #0
+        cmeq            v31.4s, v31.4s, #0
+        orr             v24.16b, v24.16b, v25.16b
+        orr             v26.16b, v26.16b, v27.16b
+        orr             v28.16b, v28.16b, v29.16b
+        orr             v30.16b, v30.16b, v31.16b
+        ext             v25.16b, v3.16b, v4.16b, #1
+        orr             v22.16b, v24.16b, v26.16b
+        ext             v26.16b, v3.16b, v4.16b, #2
+        ext             v27.16b, v3.16b, v4.16b, #3
+        ext             v29.16b, v4.16b, v5.16b, #1
+        orr             v23.16b, v28.16b, v30.16b
+        ext             v30.16b, v4.16b, v5.16b, #2
+        ext             v31.16b, v4.16b, v5.16b, #3
+        bic             v24.16b, v3.16b, v20.16b
+        bic             v25.16b, v25.16b, v20.16b
+        bic             v26.16b, v26.16b, v20.16b
+        orr             v22.16b, v22.16b, v23.16b
+        bic             v27.16b, v27.16b, v20.16b
+        bic             v28.16b, v4.16b, v20.16b
+        bic             v29.16b, v29.16b, v20.16b
+        bic             v30.16b, v30.16b, v20.16b
+        bic             v31.16b, v31.16b, v20.16b
+        addv            s22, v22.4s
+        eor             v24.16b, v24.16b, v21.16b
+        eor             v25.16b, v25.16b, v21.16b
+        eor             v26.16b, v26.16b, v21.16b
+        eor             v27.16b, v27.16b, v21.16b
+        eor             v28.16b, v28.16b, v21.16b
+        mov             w3, v22.s[0]
+        eor             v29.16b, v29.16b, v21.16b
+        eor             v30.16b, v30.16b, v21.16b
+        eor             v31.16b, v31.16b, v21.16b
+        cmeq            v24.4s, v24.4s, #0
+        cmeq            v25.4s, v25.4s, #0
+        cmeq            v26.4s, v26.4s, #0
+        cmeq            v27.4s, v27.4s, #0
+        cbnz            w3, 91f
+        st1             {v0.16b, v1.16b}, [x2], #32
         subs            w1, w1, #64
         b.pl            2b
 
diff --git a/libavcodec/aarch64/videodsp.S b/libavcodec/aarch64/videodsp.S
index fe2da0658e..b51e82dc59 100644
--- a/libavcodec/aarch64/videodsp.S
+++ b/libavcodec/aarch64/videodsp.S
@@ -19,8 +19,7 @@ 
 #include "libavutil/aarch64/asm.S"
 
 function ff_prefetch_aarch64, export=1
-1:
-        subs            w2,  w2,  #2
+1:      subs            w2,  w2,  #2
         prfm            pldl1strm, [x0]
         prfm            pldl1strm, [x0,  x1]
         add             x0,  x0,  x1,  lsl #1
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 4bbf16d1a4..257afb5eb8 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -284,7 +284,7 @@  endfunc
 //   hev_thresh -> x5
 //
 .macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
-    .if \simple
+.if \simple
         uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
         uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
         uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
@@ -292,7 +292,7 @@  endfunc
         uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
         movi            v21.16b, #0x80
         cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
-    .else
+.else
         // calculate hev and normal_limit:
         uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
         uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
@@ -323,39 +323,39 @@  endfunc
         and             v16.16b, v16.16b, v19.16b
         movi            v21.16b, #0x80
         orr             v17.16b, v20.16b, v22.16b
-    .endif
+.endif
 
         // at this point:
         //   v16: normal_limit
         //   v17: hev
 
         // convert to signed value:
-        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
-        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
-
-        movi           v20.8h, #3
-        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
-        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
-        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
-        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
-        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
-        mul            v19.8h, v19.8h, v20.8h
-
-        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
-        movi           v22.16b, #4
-        movi           v23.16b, #3
-    .if \inner
-        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
-    .endif
-        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
-        saddw2         v19.8h,  v19.8h, v20.16b
-        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
-        sqxtn2         v18.16b, v19.8h
-    .if !\inner && !\simple
-        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
-    .endif
-        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit
+        eor             v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
+        eor             v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80
+
+        movi            v20.8h, #3
+        ssubl           v18.8h, v4.8b,  v3.8b             // QS0 - PS0
+        ssubl2          v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
+        eor             v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
+        eor             v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
+        mul             v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
+        mul             v19.8h, v19.8h, v20.8h
+
+        sqsub           v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
+        movi            v22.16b, #4
+        movi            v23.16b, #3
+.if \inner
+        and             v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
+.endif
+        saddw           v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
+        saddw2          v19.8h,  v19.8h, v20.16b
+        sqxtn           v18.8b,  v18.8h                   // narrow result back into v18
+        sqxtn2          v18.16b, v19.8h
+.if !\inner && !\simple
+        eor             v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
+.endif
+        and             v18.16b, v18.16b, v16.16b         // w &= normal_limit
 
         // registers used at this point..
         //   v0 -> P3  (don't corrupt)
@@ -374,45 +374,45 @@  endfunc
         //   Q0 = s2u(QS0 - c1);
         //   P0 = s2u(PS0 + c2);
 
-    .if \simple
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
-    .elseif \inner
+.if \simple
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+.elseif \inner
         // the !is4tap case of filter_common, only used for inner blocks
         //   c3 = ((c1&~hev) + 1) >> 1;
         //   Q1 = s2u(QS1 - c3);
         //   P1 = s2u(PS1 + c3);
-        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
-        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
-        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
-        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
-        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
-        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
-        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
-        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
-    .else
-        and            v20.16b, v18.16b, v17.16b           // w & hev
-        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
-        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
-        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
-        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
-        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
-        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
-        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        sqadd           v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
+        bic             v19.16b, v19.16b, v17.16b           // c1 & ~hev
+        eor             v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
+        srshr           v19.16b, v19.16b, #1                // c3 >>= 1
+        eor             v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
+        sqsub           v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
+        sqadd           v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
+        eor             v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
+.else
+        and             v20.16b, v18.16b, v17.16b           // w & hev
+        sqadd           v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
+        sqadd           v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
+        sshr            v19.16b, v19.16b, #3                // c1 >>= 3
+        sshr            v20.16b, v20.16b, #3                // c2 >>= 3
+        bic             v18.16b, v18.16b, v17.16b           // w &= ~hev
+        sqsub           v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
+        sqadd           v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
 
         // filter_mbedge:
         //   a = clamp((27*w + 63) >> 7);
@@ -424,36 +424,36 @@  endfunc
         //   a = clamp((9*w + 63) >> 7);
         //   Q2 = s2u(QS2 - a);
         //   P2 = s2u(PS2 + a);
-        movi           v17.8h,  #63
-        sshll          v22.8h,  v18.8b, #3
-        sshll2         v23.8h,  v18.16b, #3
-        saddw          v22.8h,  v22.8h, v18.8b
-        saddw2         v23.8h,  v23.8h, v18.16b
-        add            v16.8h,  v17.8h, v22.8h
-        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
-        add            v19.8h,  v16.8h, v22.8h
-        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
-        add            v22.8h,  v19.8h, v22.8h
-        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
-        sqshrn         v16.8b,  v16.8h,  #7
-        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
-        sqshrn         v19.8b,  v19.8h, #7
-        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
-        sqshrn         v22.8b,  v22.8h, #7
-        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
-        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
-        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
-        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
-        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
-        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
-        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
-        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
-        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
-        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
-        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
-        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
-        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
-    .endif
+        movi            v17.8h,  #63
+        sshll           v22.8h,  v18.8b, #3
+        sshll2          v23.8h,  v18.16b, #3
+        saddw           v22.8h,  v22.8h, v18.8b
+        saddw2          v23.8h,  v23.8h, v18.16b
+        add             v16.8h,  v17.8h, v22.8h
+        add             v17.8h,  v17.8h, v23.8h           //  9*w + 63
+        add             v19.8h,  v16.8h, v22.8h
+        add             v20.8h,  v17.8h, v23.8h           // 18*w + 63
+        add             v22.8h,  v19.8h, v22.8h
+        add             v23.8h,  v20.8h, v23.8h           // 27*w + 63
+        sqshrn          v16.8b,  v16.8h,  #7
+        sqshrn2         v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
+        sqshrn          v19.8b,  v19.8h, #7
+        sqshrn2         v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
+        sqshrn          v22.8b,  v22.8h, #7
+        sqshrn2         v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
+        sqadd           v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
+        sqsub           v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
+        sqadd           v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
+        sqsub           v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
+        sqadd           v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
+        sqsub           v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
+        eor             v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
+        eor             v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
+        eor             v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
+        eor             v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
+        eor             v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
+        eor             v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
+.endif
 .endm
 
 .macro  vp8_v_loop_filter16 name, inner=0, simple=0
@@ -461,38 +461,38 @@  function ff_vp8_v_loop_filter16\name\()_neon, export=1
         sub             x0,  x0,  x1,  lsl #1+!\simple
 
         // Load pixels:
-    .if !\simple
+.if !\simple
         ld1             {v0.16b},     [x0], x1 // P3
         ld1             {v1.16b},     [x0], x1 // P2
-    .endif
+.endif
         ld1             {v2.16b},     [x0], x1 // P1
         ld1             {v3.16b},     [x0], x1 // P0
         ld1             {v4.16b},     [x0], x1 // Q0
         ld1             {v5.16b},     [x0], x1 // Q1
-    .if !\simple
+.if !\simple
         ld1             {v6.16b},     [x0], x1 // Q2
         ld1             {v7.16b},     [x0]     // Q3
         dup             v23.16b, w3                 // flim_I
-    .endif
+.endif
         dup             v22.16b, w2                 // flim_E
 
         vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
 
         // back up to P2:  dst -= stride * 6
         sub             x0,  x0,  x1,  lsl #2
-    .if !\simple
+.if !\simple
         sub             x0,  x0,  x1,  lsl #1
 
         // Store pixels:
         st1             {v1.16b},     [x0], x1 // P2
-    .endif
+.endif
         st1             {v2.16b},     [x0], x1 // P1
         st1             {v3.16b},     [x0], x1 // P0
         st1             {v4.16b},     [x0], x1 // Q0
         st1             {v5.16b},     [x0], x1 // Q1
-    .if !\simple
+.if !\simple
         st1             {v6.16b},     [x0]     // Q2
-    .endif
+.endif
 
         ret
 endfunc
@@ -507,48 +507,48 @@  function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
         sub             x0,  x0,  x2,  lsl #2
         sub             x1,  x1,  x2,  lsl #2
         // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2  // P3
-        ld1          {v0.d}[1],     [x1], x2  // P3
-        ld1          {v1.d}[0],     [x0], x2  // P2
-        ld1          {v1.d}[1],     [x1], x2  // P2
-        ld1          {v2.d}[0],     [x0], x2  // P1
-        ld1          {v2.d}[1],     [x1], x2  // P1
-        ld1          {v3.d}[0],     [x0], x2  // P0
-        ld1          {v3.d}[1],     [x1], x2  // P0
-        ld1          {v4.d}[0],     [x0], x2  // Q0
-        ld1          {v4.d}[1],     [x1], x2  // Q0
-        ld1          {v5.d}[0],     [x0], x2  // Q1
-        ld1          {v5.d}[1],     [x1], x2  // Q1
-        ld1          {v6.d}[0],     [x0], x2  // Q2
-        ld1          {v6.d}[1],     [x1], x2  // Q2
-        ld1          {v7.d}[0],     [x0]      // Q3
-        ld1          {v7.d}[1],     [x1]      // Q3
-
-        dup          v22.16b, w3                 // flim_E
-        dup          v23.16b, w4                 // flim_I
+        ld1             {v0.d}[0],     [x0], x2  // P3
+        ld1             {v0.d}[1],     [x1], x2  // P3
+        ld1             {v1.d}[0],     [x0], x2  // P2
+        ld1             {v1.d}[1],     [x1], x2  // P2
+        ld1             {v2.d}[0],     [x0], x2  // P1
+        ld1             {v2.d}[1],     [x1], x2  // P1
+        ld1             {v3.d}[0],     [x0], x2  // P0
+        ld1             {v3.d}[1],     [x1], x2  // P0
+        ld1             {v4.d}[0],     [x0], x2  // Q0
+        ld1             {v4.d}[1],     [x1], x2  // Q0
+        ld1             {v5.d}[0],     [x0], x2  // Q1
+        ld1             {v5.d}[1],     [x1], x2  // Q1
+        ld1             {v6.d}[0],     [x0], x2  // Q2
+        ld1             {v6.d}[1],     [x1], x2  // Q2
+        ld1             {v7.d}[0],     [x0]      // Q3
+        ld1             {v7.d}[1],     [x1]      // Q3
+
+        dup             v22.16b, w3                 // flim_E
+        dup             v23.16b, w4                 // flim_I
 
         vp8_loop_filter inner=\inner, hev_thresh=w5
 
         // back up to P2:  u,v -= stride * 6
-        sub          x0,  x0,  x2,  lsl #2
-        sub          x1,  x1,  x2,  lsl #2
-        sub          x0,  x0,  x2,  lsl #1
-        sub          x1,  x1,  x2,  lsl #1
+        sub             x0,  x0,  x2,  lsl #2
+        sub             x1,  x1,  x2,  lsl #2
+        sub             x0,  x0,  x2,  lsl #1
+        sub             x1,  x1,  x2,  lsl #1
 
         // Store pixels:
 
-        st1          {v1.d}[0],     [x0], x2  // P2
-        st1          {v1.d}[1],     [x1], x2  // P2
-        st1          {v2.d}[0],     [x0], x2  // P1
-        st1          {v2.d}[1],     [x1], x2  // P1
-        st1          {v3.d}[0],     [x0], x2  // P0
-        st1          {v3.d}[1],     [x1], x2  // P0
-        st1          {v4.d}[0],     [x0], x2  // Q0
-        st1          {v4.d}[1],     [x1], x2  // Q0
-        st1          {v5.d}[0],     [x0], x2  // Q1
-        st1          {v5.d}[1],     [x1], x2  // Q1
-        st1          {v6.d}[0],     [x0]      // Q2
-        st1          {v6.d}[1],     [x1]      // Q2
+        st1             {v1.d}[0],     [x0], x2  // P2
+        st1             {v1.d}[1],     [x1], x2  // P2
+        st1             {v2.d}[0],     [x0], x2  // P1
+        st1             {v2.d}[1],     [x1], x2  // P1
+        st1             {v3.d}[0],     [x0], x2  // P0
+        st1             {v3.d}[1],     [x1], x2  // P0
+        st1             {v4.d}[0],     [x0], x2  // Q0
+        st1             {v4.d}[1],     [x1], x2  // Q0
+        st1             {v5.d}[0],     [x0], x2  // Q1
+        st1             {v5.d}[1],     [x1], x2  // Q1
+        st1             {v6.d}[0],     [x0]      // Q2
+        st1             {v6.d}[1],     [x1]      // Q2
 
         ret
 endfunc
@@ -579,18 +579,18 @@  function ff_vp8_h_loop_filter16\name\()_neon, export=1
         ld1             {v6.d}[1], [x0], x1
         ld1             {v7.d}[1], [x0], x1
 
-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         dup             v22.16b, w2                 // flim_E
-    .if !\simple
+.if !\simple
         dup             v23.16b, w3                 // flim_I
-    .endif
+.endif
 
         vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4
 
         sub             x0,  x0,  x1, lsl #4    // backup 16 rows
 
-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         // Store pixels:
         st1             {v0.d}[0], [x0], x1
@@ -624,24 +624,24 @@  function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
         sub             x1,  x1,  #4
 
         // Load pixels:
-        ld1          {v0.d}[0],     [x0], x2 // load u
-        ld1          {v0.d}[1],     [x1], x2 // load v
-        ld1          {v1.d}[0],     [x0], x2
-        ld1          {v1.d}[1],     [x1], x2
-        ld1          {v2.d}[0],     [x0], x2
-        ld1          {v2.d}[1],     [x1], x2
-        ld1          {v3.d}[0],     [x0], x2
-        ld1          {v3.d}[1],     [x1], x2
-        ld1          {v4.d}[0],     [x0], x2
-        ld1          {v4.d}[1],     [x1], x2
-        ld1          {v5.d}[0],     [x0], x2
-        ld1          {v5.d}[1],     [x1], x2
-        ld1          {v6.d}[0],     [x0], x2
-        ld1          {v6.d}[1],     [x1], x2
-        ld1          {v7.d}[0],     [x0], x2
-        ld1          {v7.d}[1],     [x1], x2
-
-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        ld1             {v0.d}[0],     [x0], x2 // load u
+        ld1             {v0.d}[1],     [x1], x2 // load v
+        ld1             {v1.d}[0],     [x0], x2
+        ld1             {v1.d}[1],     [x1], x2
+        ld1             {v2.d}[0],     [x0], x2
+        ld1             {v2.d}[1],     [x1], x2
+        ld1             {v3.d}[0],     [x0], x2
+        ld1             {v3.d}[1],     [x1], x2
+        ld1             {v4.d}[0],     [x0], x2
+        ld1             {v4.d}[1],     [x1], x2
+        ld1             {v5.d}[0],     [x0], x2
+        ld1             {v5.d}[1],     [x1], x2
+        ld1             {v6.d}[0],     [x0], x2
+        ld1             {v6.d}[1],     [x1], x2
+        ld1             {v7.d}[0],     [x0], x2
+        ld1             {v7.d}[1],     [x1], x2
+
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         dup             v22.16b, w3                 // flim_E
         dup             v23.16b, w4                 // flim_I
@@ -651,25 +651,25 @@  function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
         sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
         sub             x1,  x1,  x2, lsl #3    // backup v 8 rows
 
-        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
+        transpose_8x16B v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31
 
         // Store pixels:
-        st1          {v0.d}[0],     [x0], x2 // load u
-        st1          {v0.d}[1],     [x1], x2 // load v
-        st1          {v1.d}[0],     [x0], x2
-        st1          {v1.d}[1],     [x1], x2
-        st1          {v2.d}[0],     [x0], x2
-        st1          {v2.d}[1],     [x1], x2
-        st1          {v3.d}[0],     [x0], x2
-        st1          {v3.d}[1],     [x1], x2
-        st1          {v4.d}[0],     [x0], x2
-        st1          {v4.d}[1],     [x1], x2
-        st1          {v5.d}[0],     [x0], x2
-        st1          {v5.d}[1],     [x1], x2
-        st1          {v6.d}[0],     [x0], x2
-        st1          {v6.d}[1],     [x1], x2
-        st1          {v7.d}[0],     [x0]
-        st1          {v7.d}[1],     [x1]
+        st1             {v0.d}[0],     [x0], x2 // load u
+        st1             {v0.d}[1],     [x1], x2 // load v
+        st1             {v1.d}[0],     [x0], x2
+        st1             {v1.d}[1],     [x1], x2
+        st1             {v2.d}[0],     [x0], x2
+        st1             {v2.d}[1],     [x1], x2
+        st1             {v3.d}[0],     [x0], x2
+        st1             {v3.d}[1],     [x1], x2
+        st1             {v4.d}[0],     [x0], x2
+        st1             {v4.d}[1],     [x1], x2
+        st1             {v5.d}[0],     [x0], x2
+        st1             {v5.d}[1],     [x1], x2
+        st1             {v6.d}[0],     [x0], x2
+        st1             {v6.d}[1],     [x1], x2
+        st1             {v7.d}[0],     [x0]
+        st1             {v7.d}[1],     [x1]
 
         ret
 
@@ -681,8 +681,7 @@  vp8_h_loop_filter8uv _inner, inner=1
 
 
 function ff_put_vp8_pixels16_neon, export=1
-1:
-        subs            w4, w4, #4
+1:      subs            w4, w4, #4
         ld1             {v0.16b},     [x2], x3
         ld1             {v1.16b},     [x2], x3
         ld1             {v2.16b},     [x2], x3
@@ -696,8 +695,7 @@  function ff_put_vp8_pixels16_neon, export=1
 endfunc
 
 function ff_put_vp8_pixels8_neon, export=1
-1:
-        subs            w4, w4, #4
+1:      subs            w4, w4, #4
         ld1             {v0.8b},   [x2], x3
         ld1             {v0.d}[1], [x2], x3
         ld1             {v1.8b},   [x2], x3
@@ -837,14 +835,14 @@  endfunc
 
 // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
 // arithmetic can be used to apply filters
-const   subpel_filters, align=4
-        .short     0,   6, 123,  12,   1,   0,   0,   0
-        .short     2,  11, 108,  36,   8,   1,   0,   0
-        .short     0,   9,  93,  50,   6,   0,   0,   0
-        .short     3,  16,  77,  77,  16,   3,   0,   0
-        .short     0,   6,  50,  93,   9,   0,   0,   0
-        .short     1,   8,  36, 108,  11,   2,   0,   0
-        .short     0,   1,  12, 123,   6,   0,   0,   0
+const subpel_filters, align=4
+        .short          0,   6, 123,  12,   1,   0,   0,   0
+        .short          2,  11, 108,  36,   8,   1,   0,   0
+        .short          0,   9,  93,  50,   6,   0,   0,   0
+        .short          3,  16,  77,  77,  16,   3,   0,   0
+        .short          0,   6,  50,  93,   9,   0,   0,   0
+        .short          1,   8,  36, 108,  11,   2,   0,   0
+        .short          0,   1,  12, 123,   6,   0,   0,   0
 endconst
 
 function ff_put_vp8_epel16_v6_neon, export=1
@@ -855,8 +853,7 @@  function ff_put_vp8_epel16_v6_neon, export=1
         movrel          x17,  subpel_filters, -16
         add             x6,  x17,  x6, lsl #4  // y
         ld1             {v0.8h},     [x6]
-1:
-        ld1             {v1.1d - v2.1d},    [x2], x3
+1:      ld1             {v1.1d - v2.1d},    [x2], x3
         ld1             {v3.1d - v4.1d},    [x2], x3
         ld1             {v16.1d - v17.1d},  [x2], x3
         ld1             {v18.1d - v19.1d},  [x2], x3
@@ -884,8 +881,7 @@  function ff_put_vp8_epel16_h6_neon, export=1
         movrel          x17,  subpel_filters, -16
         add             x5,  x17,  x5, lsl #4 // x
         ld1             {v0.8h},  [x5]
-1:
-        ld1             {v1.16b, v2.16b}, [x2], x3
+1:      ld1             {v1.16b, v2.16b}, [x2], x3
         vp8_epel16_h6   v1, v1, v2
         st1             {v1.16b}, [x0], x1
 
@@ -909,8 +905,7 @@  function ff_put_vp8_epel16_h6v6_neon, export=1
         sxtw            x4,  w4
         add             x16, x4, #5   // h
         bic             x7,  x7,  #15
-1:
-        ld1             {v1.16b, v2.16b}, [x2], x3
+1:      ld1             {v1.16b, v2.16b}, [x2], x3
         vp8_epel16_h6   v1, v1, v2
         st1             {v1.16b}, [x7], #16
         subs            x16, x16, #1
@@ -923,8 +918,7 @@  function ff_put_vp8_epel16_h6v6_neon, export=1
         add             x7,  sp,  #15
         ld1             {v0.8h},     [x6]
         bic             x7,  x7,  #15
-2:
-        ld1             {v1.8b - v4.8b},    [x7], #32
+2:      ld1             {v1.8b - v4.8b},    [x7], #32
         ld1             {v16.8b - v19.8b},  [x7], #32
         ld1             {v20.8b - v23.8b},  [x7], #32
         ld1             {v24.8b - v25.8b},  [x7]
@@ -950,8 +944,7 @@  function ff_put_vp8_epel8_v6_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},  [x6]
-1:
-        ld1             {v2.8b},  [x2], x3
+1:      ld1             {v2.8b},  [x2], x3
         ld1             {v3.8b},  [x2], x3
         ld1             {v4.8b},  [x2], x3
         ld1             {v5.8b},  [x2], x3
@@ -977,8 +970,7 @@  function ff_put_vp8_epel8_h6_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x5,  x7,  w5, uxtw #4
         ld1             {v0.8h},        [x5]
-1:
-        ld1             {v2.8b, v3.8b}, [x2], x3
+1:      ld1             {v2.8b, v3.8b}, [x2], x3
 
         vp8_epel8_h6    v2,  v2,  v3
 
@@ -1003,8 +995,7 @@  function ff_put_vp8_epel8_h6v6_neon, export=1
         add             x7,  sp,  #15
         add             x16, x4,  #5   // h
         bic             x7,  x7,  #15
-1:
-        ld1             {v1.8b, v2.8b}, [x2], x3
+1:      ld1             {v1.8b, v2.8b}, [x2], x3
 
         vp8_epel8_h6    v1, v1, v2
 
@@ -1018,8 +1009,7 @@  function ff_put_vp8_epel8_h6v6_neon, export=1
         add             x7,  sp,   #15
         ld1             {v0.8h},   [x6]
         bic             x7,  x7,   #15
-2:
-        ld1             {v1.8b - v4.8b}, [x7], #32
+2:      ld1             {v1.8b - v4.8b}, [x7], #32
         ld1             {v5.8b - v7.8b}, [x7]
 
         sub             x7,  x7,  #16
@@ -1041,8 +1031,7 @@  function ff_put_vp8_epel8_v4_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},     [x6]
-1:
-        ld1             {v2.8b},     [x2], x3
+1:      ld1             {v2.8b},     [x2], x3
         ld1             {v3.8b},     [x2], x3
         ld1             {v4.8b},     [x2], x3
         ld1             {v5.8b},     [x2], x3
@@ -1065,8 +1054,7 @@  function ff_put_vp8_epel8_h4_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x5,  x7,  w5, uxtw #4
         ld1             {v0.8h},       [x5]
-1:
-        ld1             {v2.8b,v3.8b}, [x2], x3
+1:      ld1             {v2.8b,v3.8b}, [x2], x3
 
         vp8_epel8_h4    v2,  v2,  v3
 
@@ -1091,8 +1079,7 @@  function ff_put_vp8_epel8_h4v6_neon, export=1
         add             x7,  sp,  #15
         add             x16, x4, #5   // h
         bic             x7,  x7,  #15
-1:
-        ld1             {v1.8b, v2.8b}, [x2], x3
+1:      ld1             {v1.8b, v2.8b}, [x2], x3
 
         vp8_epel8_h4    v1, v1, v2
 
@@ -1106,8 +1093,7 @@  function ff_put_vp8_epel8_h4v6_neon, export=1
         add             x7,  sp,   #15
         ld1             {v0.8h},   [x6]
         bic             x7,  x7,   #15
-2:
-        ld1             {v1.8b - v4.8b}, [x7], #32
+2:      ld1             {v1.8b - v4.8b}, [x7], #32
         ld1             {v5.8b - v7.8b}, [x7]
 
         sub             x7,  x7,  #16
@@ -1138,8 +1124,7 @@  function ff_put_vp8_epel8_h4v4_neon, export=1
         add             x7,  sp,  #15
         add             x16, x4, #3   // h
         bic             x7,  x7,  #15
-1:
-        ld1             {v1.8b, v2.8b}, [x2], x3
+1:      ld1             {v1.8b, v2.8b}, [x2], x3
 
         vp8_epel8_h4    v1, v1, v2
 
@@ -1153,8 +1138,7 @@  function ff_put_vp8_epel8_h4v4_neon, export=1
         add             x7,  sp,   #15
         ld1             {v0.8h},   [x6]
         bic             x7,  x7,   #15
-2:
-        ld1             {v1.8b - v2.8b}, [x7], #16
+2:      ld1             {v1.8b - v2.8b}, [x7], #16
         ld1             {v3.8b - v5.8b}, [x7]
 
         vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
@@ -1183,8 +1167,7 @@  function ff_put_vp8_epel8_h6v4_neon, export=1
         add             x7,  sp,  #15
         add             x16, x4, #3   // h
         bic             x7,  x7,  #15
-1:
-        ld1             {v1.8b, v2.8b}, [x2], x3
+1:      ld1             {v1.8b, v2.8b}, [x2], x3
 
         vp8_epel8_h6    v1, v1, v2
 
@@ -1198,8 +1181,7 @@  function ff_put_vp8_epel8_h6v4_neon, export=1
         add             x7,  sp,   #15
         ld1             {v0.8h},   [x6]
         bic             x7,  x7,   #15
-2:
-        ld1             {v1.8b - v2.8b}, [x7], #16
+2:      ld1             {v1.8b - v2.8b}, [x7], #16
         ld1             {v3.8b - v5.8b}, [x7]
 
         vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5
@@ -1219,8 +1201,7 @@  function ff_put_vp8_epel4_v6_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},    [x6]
-1:
-        ld1r            {v2.2s},    [x2], x3
+1:      ld1r            {v2.2s},    [x2], x3
         ld1r            {v3.2s},    [x2], x3
         ld1r            {v4.2s},    [x2], x3
         ld1r            {v5.2s},    [x2], x3
@@ -1255,8 +1236,7 @@  function ff_put_vp8_epel4_h6_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x5,  x7,  w5, uxtw #4
         ld1             {v0.8h},       [x5]
-1:
-        ld1             {v2.8b,v3.8b}, [x2], x3
+1:      ld1             {v2.8b,v3.8b}, [x2], x3
         vp8_epel8_h6    v2,  v2,  v3
         st1             {v2.s}[0], [x0], x1
         subs            w4,  w4,  #1
@@ -1276,8 +1256,7 @@  function ff_put_vp8_epel4_h6v6_neon, export=1
         sub             sp,  sp,  #52
         add             w8,  w4,  #5
         mov             x9,  sp
-1:
-        ld1             {v2.8b,v3.8b}, [x2], x3
+1:      ld1             {v2.8b,v3.8b}, [x2], x3
         vp8_epel8_h6    v2,  v2,  v3
         st1             {v2.s}[0],     [x9], #4
         subs            w8,  w8,  #1
@@ -1286,8 +1265,7 @@  function ff_put_vp8_epel4_h6v6_neon, export=1
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},       [x6]
         mov             x9,  sp
-2:
-        ld1             {v2.8b,v3.8b}, [x9], #16
+2:      ld1             {v2.8b,v3.8b}, [x9], #16
         ld1             {v6.8b},       [x9], #8
         ld1r            {v28.2s},      [x9]
         sub             x9,  x9,  #16
@@ -1324,8 +1302,7 @@  function ff_put_vp8_epel4_h4v6_neon, export=1
         sub             sp,  sp,  #52
         add             w8,  w4,  #5
         mov             x9,  sp
-1:
-        ld1             {v2.8b},       [x2], x3
+1:      ld1             {v2.8b},       [x2], x3
         vp8_epel8_h4    v2,  v2,  v2
         st1             {v2.s}[0],     [x9], #4
         subs            w8,  w8,  #1
@@ -1334,8 +1311,7 @@  function ff_put_vp8_epel4_h4v6_neon, export=1
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},       [x6]
         mov             x9,  sp
-2:
-        ld1             {v2.8b,v3.8b}, [x9], #16
+2:      ld1             {v2.8b,v3.8b}, [x9], #16
         ld1             {v6.8b},       [x9], #8
         ld1r            {v28.2s},      [x9]
         sub             x9,  x9,  #16
@@ -1372,8 +1348,7 @@  function ff_put_vp8_epel4_h6v4_neon, export=1
         sub             sp,  sp,  #44
         add             w8,  w4,  #3
         mov             x9,  sp
-1:
-        ld1             {v2.8b,v3.8b}, [x2], x3
+1:      ld1             {v2.8b,v3.8b}, [x2], x3
         vp8_epel8_h6    v2, v2, v3
         st1             {v2.s}[0],     [x9], #4
         subs            w8,  w8,  #1
@@ -1382,8 +1357,7 @@  function ff_put_vp8_epel4_h6v4_neon, export=1
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},       [x6]
         mov             x9,  sp
-2:
-        ld1             {v2.8b,v3.8b}, [x9], #16
+2:      ld1             {v2.8b,v3.8b}, [x9], #16
         ld1r            {v6.2s},       [x9]
         sub             x9,  x9,  #8
         ld1             {v4.8b,v5.8b}, [x9], #16
@@ -1411,8 +1385,7 @@  function ff_put_vp8_epel4_h4_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x5,  x7,  w5, uxtw #4
         ld1             {v0.8h},    [x5]
-1:
-        ld1             {v2.8b},    [x2], x3
+1:      ld1             {v2.8b},    [x2], x3
         vp8_epel8_h4    v2,  v2,  v2
         st1             {v2.s}[0],  [x0], x1
         subs            w4,  w4,  #1
@@ -1427,8 +1400,7 @@  function ff_put_vp8_epel4_v4_neon, export=1
         movrel          x7,  subpel_filters, -16
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},   [x6]
-1:
-        ld1r            {v2.2s},   [x2], x3
+1:      ld1r            {v2.2s},   [x2], x3
         ld1r            {v3.2s},   [x2], x3
         ld1r            {v4.2s},   [x2], x3
         ld1r            {v5.2s},   [x2], x3
@@ -1464,8 +1436,7 @@  function ff_put_vp8_epel4_h4v4_neon, export=1
         sub             sp,  sp,  #44
         add             w8,  w4,  #3
         mov             x9,  sp
-1:
-        ld1             {v2.8b},       [x2], x3
+1:      ld1             {v2.8b},       [x2], x3
         vp8_epel8_h4    v2,  v2,  v3
         st1             {v2.s}[0],     [x9], #4
         subs            w8,  w8,  #1
@@ -1474,8 +1445,7 @@  function ff_put_vp8_epel4_h4v4_neon, export=1
         add             x6,  x7,  w6, uxtw #4
         ld1             {v0.8h},       [x6]
         mov             x9,  sp
-2:
-        ld1             {v2.8b,v3.8b}, [x9], #16
+2:      ld1             {v2.8b,v3.8b}, [x9], #16
         ld1r            {v6.2s},       [x9]
         sub             x9,  x9,  #8
         ld1             {v4.8b,v5.8b}, [x9], #16
@@ -1504,8 +1474,7 @@  function ff_put_vp8_bilin16_h_neon, export=1
         dup             v0.8b,  w5
         sub             w5,     w7,     w5
         dup             v1.8b,  w5
-1:
-        subs            w4,     w4,     #2
+1:      subs            w4,     w4,     #2
         ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
         ext             v5.8b,  v3.8b,  v4.8b,  #1
         ext             v4.8b,  v2.8b,  v3.8b,  #1
@@ -1538,8 +1507,7 @@  function ff_put_vp8_bilin16_v_neon, export=1
         dup             v1.16b, w6
 
         ld1             {v2.16b}, [x2], x3
-1:
-        subs            w4,     w4,     #2
+1:      subs            w4,     w4,     #2
         ld1             {v4.16b}, [x2], x3
         umull           v6.8h,  v2.8b,  v1.8b
         umlal           v6.8h,  v4.8b,  v0.8b
@@ -1580,8 +1548,7 @@  function ff_put_vp8_bilin16_hv_neon, export=1
         umlal           v18.8h,  v7.8b,  v0.8b
         rshrn           v4.8b,   v16.8h, #3
         rshrn2          v4.16b,  v18.8h, #3
-1:
-        subs            w4,  w4,  #2
+1:      subs            w4,  w4,  #2
         ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
         ext             v21.8b,  v19.8b, v20.8b, #1
         ext             v20.8b,  v18.8b, v19.8b, #1
@@ -1624,8 +1591,7 @@  function ff_put_vp8_bilin8_h_neon, export=1
         dup             v0.8b,  w5
         sub             w5,     w7,     w5
         dup             v1.8b,  w5
-1:
-        subs            w4,     w4,     #2
+1:      subs            w4,     w4,     #2
         ld1             {v2.8b,v3.8b},  [x2],  x3
         ext             v3.8b,  v2.8b,  v3.8b, #1
         umull           v4.8h,  v2.8b,  v1.8b
@@ -1650,8 +1616,7 @@  function ff_put_vp8_bilin8_v_neon, export=1
         dup             v1.8b,   w6
 
         ld1             {v2.8b}, [x2],  x3
-1:
-        subs            w4,      w4,    #2
+1:      subs            w4,      w4,    #2
         ld1             {v3.8b}, [x2],  x3
         umull           v4.8h,   v2.8b, v1.8b
         umlal           v4.8h,   v3.8b, v0.8b
@@ -1681,8 +1646,7 @@  function ff_put_vp8_bilin8_hv_neon, export=1
         umull           v18.8h, v4.8b,  v1.8b
         umlal           v18.8h, v5.8b,  v0.8b
         rshrn           v22.8b, v18.8h, #3
-1:
-        subs            w4,     w4,     #2
+1:      subs            w4,     w4,     #2
         ld1             {v6.8b,v7.8b},  [x2],  x3
         ext             v7.8b,  v6.8b,  v7.8b, #1
         umull           v16.8h, v6.8b,  v1.8b
@@ -1711,8 +1675,7 @@  function ff_put_vp8_bilin4_h_neon, export=1
         dup             v0.8b,   w5
         sub             w5,      w7,     w5
         dup             v1.8b,   w5
-1:
-        subs            w4,      w4,     #2
+1:      subs            w4,      w4,     #2
         ld1             {v2.8b}, [x2],   x3
         ext             v3.8b,   v2.8b,  v3.8b,  #1
         ld1             {v6.8b}, [x2],   x3
@@ -1736,8 +1699,7 @@  function ff_put_vp8_bilin4_v_neon, export=1
         dup             v1.8b,  w6
 
         ld1r            {v2.2s},    [x2], x3
-1:
-        ld1r            {v3.2s},   [x2]
+1:      ld1r            {v3.2s},   [x2]
         ld1             {v2.s}[1], [x2], x3
         ld1             {v3.s}[1], [x2], x3
         umull           v4.8h,  v2.8b,  v1.8b
@@ -1766,8 +1728,7 @@  function ff_put_vp8_bilin4_hv_neon, export=1
         umull           v18.8h,  v4.8b,  v1.8b
         umlal           v18.8h,  v5.8b,  v0.8b
         rshrn           v22.8b,  v18.8h, #3
-1:
-        subs            w4,      w4,     #2
+1:      subs            w4,      w4,     #2
         ld1             {v6.8b}, [x2],   x3
         ext             v7.8b,   v6.8b,  v6.8b,  #1
         ld1             {v4.8b}, [x2],   x3
diff --git a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
index c5f43d36a3..ff52a47881 100644
--- a/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -22,23 +22,23 @@ 
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
-        .short  11585, 0, 6270, 15137
+        .short          11585, 0, 6270, 15137
 iadst4_coeffs:
-        .short  5283, 15212, 9929, 13377
+        .short          5283, 15212, 9929, 13377
 endconst
 
 const iadst8_coeffs, align=4
-        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
-        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
-        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
-        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
-        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
-        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 .macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
@@ -392,8 +392,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
         b               2f
 .endif
 
-1:
-        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
+1:      ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
         st1             {v30.4s,v31.4s}, [x2], #32
 
 .ifc \txfm1,iwht
@@ -410,8 +409,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
         transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
 
         \txfm2\()4_\bpp v4,  v5,  v6,  v7
-2:
-        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+2:      mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
         ld1             {v0.4h},   [x0], x1
         ld1             {v1.4h},   [x0], x1
 .ifnc \txfm1,iwht
@@ -475,8 +473,7 @@  function idct8x8_dc_add_neon
         mov             x4,  #8
         mov             x3,  x0
         dup             v31.8h, w5
-1:
-        // Loop to add the constant from v2 into all 8x8 outputs
+1:      // Loop to add the constant from v2 into all 8x8 outputs
         subs            x4,  x4,  #2
         ld1             {v3.8h},  [x0], x1
         ld1             {v4.8h},  [x0], x1
@@ -575,8 +572,7 @@  function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
         movi            v6.4s, #0
         movi            v7.4s, #0
 
-1:
-        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
+1:      ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
         ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
         ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
         ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
@@ -604,8 +600,7 @@  function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
         \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
         \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
 .endif
-2:
-        mov             x3,  x0
+2:      mov             x3,  x0
         // Add into the destination
         ld1             {v0.8h},  [x0], x1
         srshr           v16.4s, v16.4s, #5
@@ -726,8 +721,7 @@  function idct16x16_dc_add_neon
         mov             x3, x0
         mov             x4, #16
         dup             v31.8h, w13
-1:
-        // Loop to add the constant from v2 into all 16x16 outputs
+1:      // Loop to add the constant from v2 into all 16x16 outputs
         subs            x4,  x4,  #2
         ld1             {v1.8h,v2.8h},  [x0], x1
         uaddw           v16.4s, v0.4s,  v1.4h
@@ -1041,8 +1035,7 @@  function \txfm\()16_1d_4x16_pass1_neon
         store           \i,  x0,  #16
 .endr
         ret             x14
-1:
-        // Special case: For the last input column (x1 == 12),
+1:      // Special case: For the last input column (x1 == 12),
         // which would be stored as the last row in the temp buffer,
         // don't store the first 4x4 block, but keep it in registers
         // for the first slice of the second pass (where it is the
@@ -1107,7 +1100,7 @@  itxfm16_1d_funcs iadst
 
 // This is the minimum eob value for each subpartition, in increments of 4
 const min_eob_idct_idct_16, align=4
-        .short  0, 10, 38, 89
+        .short          0, 10, 38, 89
 endconst
 
 .macro itxfm_func16x16 txfm1, txfm2
@@ -1177,15 +1170,13 @@  function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
 
 .ifc \txfm1\()_\txfm2,idct_idct
         b               3f
-1:
-        // Set v28-v31 to zero, for the in-register passthrough of
+1:      // Set v28-v31 to zero, for the in-register passthrough of
         // coefficients to pass 2.
         movi            v28.4s,  #0
         movi            v29.4s,  #0
         movi            v30.4s,  #0
         movi            v31.4s,  #0
-2:
-        subs            x1,  x1,  #1
+2:      subs            x1,  x1,  #1
 .rept 4
         st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
 .endr
@@ -1314,8 +1305,7 @@  function idct16_1d_4x16_pass1_half_neon
         store           \i,  x0,  #16
 .endr
         ret             x14
-1:
-        // Special case: For the second input column (r1 == 4),
+1:      // Special case: For the second input column (r1 == 4),
         // which would be stored as the second row in the temp buffer,
         // don't store the first 4x4 block, but keep it in registers
         // for the first slice of the second pass (where it is the
@@ -1418,8 +1408,7 @@  function idct32x32_dc_add_neon
         mov             x4,  #32
         sub             x1,  x1,  #32
         dup             v31.8h, w13
-1:
-        // Loop to add the constant v0 into all 32x32 outputs
+1:      // Loop to add the constant v0 into all 32x32 outputs
         subs            x4,  x4,  #1
         ld1             {v1.8h,v2.8h},  [x0], #32
         uaddw           v16.4s, v0.4s,  v1.4h
@@ -1858,7 +1847,7 @@  idct32_funcs _quarter
 idct32_funcs _half
 
 const min_eob_idct_idct_32, align=4
-        .short  0, 9, 34, 70, 135, 240, 336, 448
+        .short          0, 9, 34, 70, 135, 240, 336, 448
 endconst
 
 function vp9_idct_idct_32x32_add_16_neon
@@ -1916,14 +1905,12 @@  function vp9_idct_idct_32x32_add_16_neon
 .endr
         b               3f
 
-1:
-        // Write zeros to the temp buffer for pass 2
+1:      // Write zeros to the temp buffer for pass 2
         movi            v16.4s,  #0
         movi            v17.4s,  #0
         movi            v18.4s,  #0
         movi            v19.4s,  #0
-2:
-        subs            x1,  x1,  #1
+2:      subs            x1,  x1,  #1
 .rept 4
         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
@@ -1983,8 +1970,7 @@  function idct32x32_\size\()_add_16_neon
 .endif
         b               3f
 
-1:
-        // Write zeros to the temp buffer for pass 2
+1:      // Write zeros to the temp buffer for pass 2
         movi            v16.4s,  #0
         movi            v17.4s,  #0
         movi            v18.4s,  #0
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 03272eae82..9b4828b2a3 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,23 +22,23 @@ 
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
-        .short  11585, 0, 6270, 15137
+        .short          11585, 0, 6270, 15137
 iadst4_coeffs:
-        .short  5283, 15212, 9929, 13377
+        .short          5283, 15212, 9929, 13377
 endconst
 
 const iadst8_coeffs, align=4
-        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+        .short          16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
-        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
-        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
-        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
-        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+        .short          11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short          1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short          804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short          3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
 
 const iadst16_coeffs, align=4
-        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
-        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+        .short          16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short          14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
 endconst
 
 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
@@ -268,8 +268,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
         b               2f
 .endif
 
-1:
-        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
+1:      ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
         st1             {v31.8h}, [x2], #16
 
 .ifc \txfm1,iwht
@@ -286,8 +285,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
         transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
 
         \txfm2\()4      v4,  v5,  v6,  v7
-2:
-        ld1             {v0.s}[0],   [x0], x1
+2:      ld1             {v0.s}[0],   [x0], x1
         ld1             {v1.s}[0],   [x0], x1
 .ifnc \txfm1,iwht
         srshr           v4.4h,  v4.4h,  #4
@@ -410,8 +408,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
         mov             v23.16b, v16.16b
         b               2f
 .endif
-1:
-        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
+1:      ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
         ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
         sub             x2,  x2,  #128
         st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
@@ -423,8 +420,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
         \txfm2\()8
-2:
-        mov             x3,  x0
+2:      mov             x3,  x0
         // Add into the destination
         ld1             {v0.8b},  [x0], x1
         srshr           v16.8h, v16.8h, #5
@@ -497,8 +493,7 @@  function idct16x16_dc_add_neon
 
         mov             x3,  x0
         mov             x4,  #16
-1:
-        // Loop to add the constant from v2 into all 16x16 outputs
+1:      // Loop to add the constant from v2 into all 16x16 outputs
         subs            x4,  x4,  #2
         ld1             {v3.16b},  [x0], x1
         ld1             {v4.16b},  [x0], x1
@@ -788,8 +783,7 @@  function \txfm\()16_1d_8x16_pass1_neon
         store           \i,  x0,  #16
 .endr
         ret             x14
-1:
-        // Special case: For the last input column (x1 == 8),
+1:      // Special case: For the last input column (x1 == 8),
         // which would be stored as the last row in the temp buffer,
         // don't store the first 8x8 block, but keep it in registers
         // for the first slice of the second pass (where it is the
@@ -896,8 +890,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 
 .ifc \txfm1\()_\txfm2,idct_idct
         b               3f
-1:
-        // Set v24-v31 to zero, for the in-register passthrough of
+1:      // Set v24-v31 to zero, for the in-register passthrough of
         // coefficients to pass 2. Since we only do two slices, this can
         // only ever happen for the second slice. So we only need to store
         // zeros to the temp buffer for the second half of the buffer.
@@ -1063,8 +1056,7 @@  function idct32x32_dc_add_neon
 
         mov             x3,  x0
         mov             x4,  #32
-1:
-        // Loop to add the constant v0 into all 32x32 outputs
+1:      // Loop to add the constant v0 into all 32x32 outputs
         subs            x4,  x4,  #2
         ld1             {v1.16b,v2.16b},  [x0], x1
         uaddw           v16.8h, v0.8h,  v1.8b
@@ -1475,7 +1467,7 @@  idct32_funcs _quarter
 idct32_funcs _half
 
 const min_eob_idct_idct_32, align=4
-        .short  0, 34, 135, 336
+        .short          0, 34, 135, 336
 endconst
 
 function ff_vp9_idct_idct_32x32_add_neon, export=1
@@ -1522,14 +1514,12 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
 .endr
         b               3f
 
-1:
-        // Write zeros to the temp buffer for pass 2
+1:      // Write zeros to the temp buffer for pass 2
         movi            v16.8h,  #0
         movi            v17.8h,  #0
         movi            v18.8h,  #0
         movi            v19.8h,  #0
-2:
-        subs            x1,  x1,  #1
+2:      subs            x1,  x1,  #1
 .rept 4
         st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
 .endr
diff --git a/libavcodec/aarch64/vp9lpf_16bpp_neon.S b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
index a092617b92..77bbb2b704 100644
--- a/libavcodec/aarch64/vp9lpf_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -244,8 +244,7 @@ 
         bit             v26.16b, \tmp6\().16b,  v6.16b
 .endif
 .if \wd == 16
-6:
-        orr             v2.16b,  v6.16b,  v7.16b
+6:      orr             v2.16b,  v6.16b,  v7.16b
         mov             x11, v2.d[0]
         mov             x12, v2.d[1]
         adds            x11, x11, x12
@@ -262,8 +261,7 @@ 
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
         ret             x15
 
-1:
-        // flat8out
+1:      // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
         // to this section).
@@ -621,8 +619,7 @@  function vp9_loop_filter_v_8_8_16_neon
         sub             x0,  x0,  x1
 
         ret             x10
-6:
-        sub             x9,  x0,  x1, lsl #1
+6:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.8h}, [x9], x1
         st1             {v24.8h}, [x0], x1
         st1             {v23.8h}, [x9], x1
@@ -672,8 +669,7 @@  function vp9_loop_filter_h_8_8_16_neon
         add             x0,  x0,  #8
 
         ret             x10
-6:
-        // If we didn't need to do the flat8in part, we use the same writeback
+6:      // If we didn't need to do the flat8in part, we use the same writeback
         // as in loop_filter_h_4_8.
         add             x9,  x9,  #4
         add             x0,  x9,  x1, lsl #2
@@ -744,8 +740,7 @@  function vp9_loop_filter_v_16_8_16_neon
         add             x0,  x0,  x1
 
         ret             x10
-8:
-        add             x9,  x9,  x1, lsl #2
+8:      add             x9,  x9,  x1, lsl #2
         // If we didn't do the flat8out part, the output is left in the
         // input registers.
         st1             {v21.8h}, [x9], x1
@@ -757,8 +752,7 @@  function vp9_loop_filter_v_16_8_16_neon
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
         ret             x10
-7:
-        sub             x9,  x0,  x1, lsl #1
+7:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.8h}, [x9], x1
         st1             {v24.8h}, [x0], x1
         st1             {v23.8h}, [x9], x1
@@ -822,8 +816,7 @@  function vp9_loop_filter_h_16_8_16_neon
         sub             x0,  x0,  x1, lsl #3
 
         ret             x10
-8:
-        // The same writeback as in loop_filter_h_8_8
+8:      // The same writeback as in loop_filter_h_8_8
         sub             x9,  x0,  #8
         add             x0,  x9,  x1, lsl #2
         transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
@@ -839,8 +832,7 @@  function vp9_loop_filter_h_16_8_16_neon
         sub             x0,  x0,  x1, lsl #3
         add             x0,  x0,  #8
         ret             x10
-7:
-        // The same writeback as in loop_filter_h_4_8
+7:      // The same writeback as in loop_filter_h_4_8
         sub             x9,  x0,  #4
         add             x0,  x9,  x1, lsl #2
         transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 694ff8956f..9472cbeb43 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -387,8 +387,7 @@ 
         bit             v26\sz, \tmp6\sz,  v6\sz
 .endif
 .if \wd == 16
-6:
-        orr             v2\sz,  v6\sz,  v7\sz
+6:      orr             v2\sz,  v6\sz,  v7\sz
         mov             x5,  v2.d[0]
 .ifc \sz, .16b
         mov             x6,  v2.d[1]
@@ -413,8 +412,7 @@ 
         // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
         ret             x15
 
-1:
-        // flat8out
+1:      // flat8out
         // This writes all outputs into v2-v17 (skipping v6 and v16).
         // If this part is skipped, the output is read from v21-v26 (which is the input
         // to this section).
@@ -531,40 +529,33 @@ 
 function vp9_loop_filter_4
         loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
-9:
-        ret             x10
+9:      ret             x10
 endfunc
 
 function vp9_loop_filter_4_16b_mix_44
         loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
-9:
-        ret             x10
+9:      ret             x10
 endfunc
 
 function vp9_loop_filter_8
         loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
         ret
-6:
-        ret             x13
-9:
-        ret             x10
+6:      ret             x13
+9:      ret             x10
 endfunc
 
 function vp9_loop_filter_8_16b_mix
         loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
         ret
-6:
-        ret             x13
-9:
-        ret             x10
+6:      ret             x13
+9:      ret             x10
 endfunc
 
 function vp9_loop_filter_16
         loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
@@ -574,8 +565,7 @@  endfunc
 function vp9_loop_filter_16_16b
         loop_filter     16, .16b, 0,    v8,  v9,  v10, v11, v12, v13, v14, v15
         ret
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
@@ -794,8 +784,7 @@  function ff_vp9_loop_filter_v_8_8_neon, export=1
         st1             {v26.8b}, [x0], x1
 
         ret             x10
-6:
-        sub             x9,  x0,  x1, lsl #1
+6:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.8b}, [x9], x1
         st1             {v24.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
@@ -829,8 +818,7 @@  function ff_vp9_loop_filter_v_\mix\()_16_neon, export=1
         st1             {v26.16b}, [x0], x1
 
         ret             x10
-6:
-        sub             x9,  x0,  x1, lsl #1
+6:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.16b}, [x9], x1
         st1             {v24.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
@@ -877,8 +865,7 @@  function ff_vp9_loop_filter_h_8_8_neon, export=1
         st1             {v27.8b}, [x0], x1
 
         ret             x10
-6:
-        // If we didn't need to do the flat8in part, we use the same writeback
+6:      // If we didn't need to do the flat8in part, we use the same writeback
         // as in loop_filter_h_4_8.
         add             x9,  x9,  #2
         add             x0,  x0,  #2
@@ -943,8 +930,7 @@  function ff_vp9_loop_filter_h_\mix\()_16_neon, export=1
         st1             {v27.d}[1], [x0], x1
 
         ret             x10
-6:
-        add             x9,  x9,  #2
+6:      add             x9,  x9,  #2
         add             x0,  x0,  #2
         transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
         st1             {v22.s}[0], [x9], x1
@@ -1017,14 +1003,12 @@  function ff_vp9_loop_filter_v_16_8_neon, export=1
         st1             {v15.8b}, [x0], x1
         st1             {v9.8b},  [x9], x1
         st1             {v17.8b}, [x0], x1
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
         ret             x10
-8:
-        add             x9,  x9,  x1, lsl #2
+8:      add             x9,  x9,  x1, lsl #2
         // If we didn't do the flat8out part, the output is left in the
         // input registers.
         st1             {v21.8b}, [x9], x1
@@ -1034,8 +1018,7 @@  function ff_vp9_loop_filter_v_16_8_neon, export=1
         st1             {v23.8b}, [x9], x1
         st1             {v26.8b}, [x0], x1
         b               9b
-7:
-        sub             x9,  x0,  x1, lsl #1
+7:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.8b}, [x9], x1
         st1             {v24.8b}, [x0], x1
         st1             {v23.8b}, [x9], x1
@@ -1086,14 +1069,12 @@  function ff_vp9_loop_filter_v_16_16_neon, export=1
         st1             {v15.16b}, [x0], x1
         st1             {v9.16b},  [x9], x1
         st1             {v17.16b}, [x0], x1
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
         ret             x10
-8:
-        add             x9,  x9,  x1, lsl #2
+8:      add             x9,  x9,  x1, lsl #2
         st1             {v21.16b}, [x9], x1
         st1             {v24.16b}, [x0], x1
         st1             {v22.16b}, [x9], x1
@@ -1101,8 +1082,7 @@  function ff_vp9_loop_filter_v_16_16_neon, export=1
         st1             {v23.16b}, [x9], x1
         st1             {v26.16b}, [x0], x1
         b               9b
-7:
-        sub             x9,  x0,  x1, lsl #1
+7:      sub             x9,  x0,  x1, lsl #1
         st1             {v22.16b}, [x9], x1
         st1             {v24.16b}, [x0], x1
         st1             {v23.16b}, [x9], x1
@@ -1163,14 +1143,12 @@  function ff_vp9_loop_filter_h_16_8_neon, export=1
         st1             {v17.8b}, [x0], x1
         st1             {v9.8b},  [x9], x1
         st1             {v31.8b}, [x0], x1
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
         ret             x10
-8:
-        // The same writeback as in loop_filter_h_8_8
+8:      // The same writeback as in loop_filter_h_8_8
         sub             x9,  x0,  #4
         add             x0,  x9,  x1, lsl #2
         transpose_8x8B  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
@@ -1184,8 +1162,7 @@  function ff_vp9_loop_filter_h_16_8_neon, export=1
         st1             {v23.8b}, [x9], x1
         st1             {v27.8b}, [x0], x1
         b               9b
-7:
-        // The same writeback as in loop_filter_h_4_8
+7:      // The same writeback as in loop_filter_h_4_8
         sub             x9,  x0,  #2
         add             x0,  x9,  x1, lsl #2
         transpose_4x8B  v22, v23, v24, v25, v26, v27, v28, v29
@@ -1282,14 +1259,12 @@  function ff_vp9_loop_filter_h_16_16_neon, export=1
         st1             {v17.d}[1], [x0], x1
         st1             {v9.d}[1],  [x9], x1
         st1             {v31.d}[1], [x0], x1
-9:
-        ldp             d8,  d9,  [sp], 0x10
+9:      ldp             d8,  d9,  [sp], 0x10
         ldp             d10, d11, [sp], 0x10
         ldp             d12, d13, [sp], 0x10
         ldp             d14, d15, [sp], 0x10
         ret             x10
-8:
-        sub             x9,  x0,  #4
+8:      sub             x9,  x0,  #4
         add             x0,  x9,  x1, lsl #3
         transpose_8x16B v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
 
@@ -1310,8 +1285,7 @@  function ff_vp9_loop_filter_h_16_16_neon, export=1
         st1             {v27.8b},   [x9], x1
         st1             {v27.d}[1], [x0], x1
         b               9b
-7:
-        sub             x9,  x0,  #2
+7:      sub             x9,  x0,  #2
         add             x0,  x9,  x1, lsl #3
         transpose_4x16B v22, v23, v24, v25, v26, v27, v28, v29
         st1             {v22.s}[0], [x9], x1
diff --git a/libavcodec/aarch64/vp9mc_16bpp_neon.S b/libavcodec/aarch64/vp9mc_16bpp_neon.S
index 53b372c262..8ab988bb4c 100644
--- a/libavcodec/aarch64/vp9mc_16bpp_neon.S
+++ b/libavcodec/aarch64/vp9mc_16bpp_neon.S
@@ -29,8 +29,7 @@  function ff_vp9_avg64_16_neon, export=1
         mov             x5,  x0
         sub             x1,  x1,  #64
         sub             x3,  x3,  #64
-1:
-        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
+1:      ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], #64
         ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
         ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
         urhadd          v0.8h,  v0.8h,  v4.8h
@@ -51,8 +50,7 @@  endfunc
 
 function ff_vp9_avg32_16_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
+1:      ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x2], x3
         ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], x1
         ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
         urhadd          v0.8h,  v0.8h,  v4.8h
@@ -72,8 +70,7 @@  function ff_vp9_avg32_16_neon, export=1
 endfunc
 
 function ff_vp9_avg16_16_neon, export=1
-1:
-        ld1             {v2.8h, v3.8h},  [x2], x3
+1:      ld1             {v2.8h, v3.8h},  [x2], x3
         ld1             {v0.8h, v1.8h},  [x0]
         urhadd          v0.8h,  v0.8h,  v2.8h
         urhadd          v1.8h,  v1.8h,  v3.8h
@@ -85,8 +82,7 @@  endfunc
 
 function ff_vp9_avg8_16_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v2.8h},  [x2], x3
+1:      ld1             {v2.8h},  [x2], x3
         ld1             {v0.8h},  [x0], x1
         ld1             {v3.8h},  [x2], x3
         urhadd          v0.8h,  v0.8h,  v2.8h
@@ -101,8 +97,7 @@  endfunc
 
 function ff_vp9_avg4_16_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v2.4h},  [x2], x3
+1:      ld1             {v2.4h},  [x2], x3
         ld1             {v0.4h},  [x0], x1
         ld1             {v3.4h},  [x2], x3
         urhadd          v0.4h,  v0.4h,  v2.4h
@@ -263,8 +258,7 @@  function \type\()_8tap_\size\()h
         st1             {v1.4h},  [x0]
         st1             {v24.4h}, [x6]
 .endif
-3:
-        // Loop vertically
+3:      // Loop vertically
         add             x0,  x0,  x1
         add             x6,  x6,  x1
         add             x2,  x2,  x3
@@ -464,8 +458,7 @@  function \type\()_8tap_8v
         ld1             {v21.8h}, [x2], x3
         ld1             {v22.8h}, [x2], x3
         ld1             {v23.8h}, [x2], x3
-2:
-        ld1             {v24.8h}, [x2], x3
+2:      ld1             {v24.8h}, [x2], x3
         ld1             {v25.8h}, [x2], x3
         ld1             {v26.8h}, [x2], x3
         ld1             {v27.8h}, [x2], x3
@@ -499,8 +492,7 @@  function \type\()_8tap_8v
         subs            x6,  x6,  #4
         b.ne            2b
 
-8:
-        subs            x5,  x5,  #8
+8:      subs            x5,  x5,  #8
         b.eq            9f
         // x0 -= h * dst_stride
         msub            x0,  x1,  x4, x0
@@ -513,8 +505,7 @@  function \type\()_8tap_8v
         add             x2,  x2,  #16
         add             x0,  x0,  #16
         b               1b
-9:
-        ret
+9:      ret
 endfunc
 .endm
 
@@ -561,8 +552,7 @@  function \type\()_8tap_4v
         convolve4       v4,  v5,  v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
         do_store4       v2,  v3,  v4,  v5,  v16, v17, v18, v19, v1,  \type
 
-9:
-        ret
+9:      ret
 endfunc
 .endm
 
diff --git a/libavcodec/aarch64/vp9mc_aarch64.S b/libavcodec/aarch64/vp9mc_aarch64.S
index f17a8cf04a..e85945020a 100644
--- a/libavcodec/aarch64/vp9mc_aarch64.S
+++ b/libavcodec/aarch64/vp9mc_aarch64.S
@@ -26,8 +26,7 @@ 
 //                            int h, int mx, int my);
 
 function ff_vp9_copy128_aarch64, export=1
-1:
-        ldp             x5,  x6,  [x2]
+1:      ldp             x5,  x6,  [x2]
         ldp             x7,  x8,  [x2, #16]
         stp             x5,  x6,  [x0]
         ldp             x9,  x10, [x2, #32]
@@ -51,8 +50,7 @@  function ff_vp9_copy128_aarch64, export=1
 endfunc
 
 function ff_vp9_copy64_aarch64, export=1
-1:
-        ldp             x5,  x6,  [x2]
+1:      ldp             x5,  x6,  [x2]
         ldp             x7,  x8,  [x2, #16]
         stp             x5,  x6,  [x0]
         ldp             x9,  x10, [x2, #32]
@@ -68,8 +66,7 @@  function ff_vp9_copy64_aarch64, export=1
 endfunc
 
 function ff_vp9_copy32_aarch64, export=1
-1:
-        ldp             x5,  x6,  [x2]
+1:      ldp             x5,  x6,  [x2]
         ldp             x7,  x8,  [x2, #16]
         stp             x5,  x6,  [x0]
         subs            w4,  w4,  #1
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index abf2bae9db..36b0df635a 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -27,8 +27,7 @@ 
 
 function ff_vp9_avg64_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
+1:      ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
         urhadd          v0.16b,  v0.16b,  v4.16b
@@ -48,8 +47,7 @@  function ff_vp9_avg64_neon, export=1
 endfunc
 
 function ff_vp9_avg32_neon, export=1
-1:
-        ld1             {v2.16b, v3.16b},  [x2], x3
+1:      ld1             {v2.16b, v3.16b},  [x2], x3
         ld1             {v0.16b, v1.16b},  [x0]
         urhadd          v0.16b,  v0.16b,  v2.16b
         urhadd          v1.16b,  v1.16b,  v3.16b
@@ -64,8 +62,7 @@  function ff_vp9_copy16_neon, export=1
         lsl             x1,  x1,  #1
         add             x6,  x2,  x3
         lsl             x3,  x3,  #1
-1:
-        ld1             {v0.16b},  [x2], x3
+1:      ld1             {v0.16b},  [x2], x3
         ld1             {v1.16b},  [x6], x3
         ld1             {v2.16b},  [x2], x3
         ld1             {v3.16b},  [x6], x3
@@ -80,8 +77,7 @@  endfunc
 
 function ff_vp9_avg16_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v2.16b},  [x2], x3
+1:      ld1             {v2.16b},  [x2], x3
         ld1             {v0.16b},  [x0], x1
         ld1             {v3.16b},  [x2], x3
         urhadd          v0.16b,  v0.16b,  v2.16b
@@ -95,8 +91,7 @@  function ff_vp9_avg16_neon, export=1
 endfunc
 
 function ff_vp9_copy8_neon, export=1
-1:
-        ld1             {v0.8b},  [x2], x3
+1:      ld1             {v0.8b},  [x2], x3
         ld1             {v1.8b},  [x2], x3
         subs            w4,  w4,  #2
         st1             {v0.8b},  [x0], x1
@@ -107,8 +102,7 @@  endfunc
 
 function ff_vp9_avg8_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v2.8b},  [x2], x3
+1:      ld1             {v2.8b},  [x2], x3
         ld1             {v0.8b},  [x0], x1
         ld1             {v3.8b},  [x2], x3
         urhadd          v0.8b,  v0.8b,  v2.8b
@@ -122,8 +116,7 @@  function ff_vp9_avg8_neon, export=1
 endfunc
 
 function ff_vp9_copy4_neon, export=1
-1:
-        ld1             {v0.s}[0], [x2], x3
+1:      ld1             {v0.s}[0], [x2], x3
         ld1             {v1.s}[0], [x2], x3
         st1             {v0.s}[0], [x0], x1
         ld1             {v2.s}[0], [x2], x3
@@ -138,8 +131,7 @@  endfunc
 
 function ff_vp9_avg4_neon, export=1
         mov             x5,  x0
-1:
-        ld1             {v2.s}[0], [x2], x3
+1:      ld1             {v2.s}[0], [x2], x3
         ld1             {v0.s}[0], [x0], x1
         ld1             {v2.s}[1], [x2], x3
         ld1             {v0.s}[1], [x0], x1
@@ -328,8 +320,7 @@  function \type\()_8tap_\size\()h_\idx1\idx2
         st1             {v1.s}[0],  [x0]
         st1             {v24.s}[0], [x6]
 .endif
-3:
-        // Loop vertically
+3:      // Loop vertically
         add             x0,  x0,  x1
         add             x6,  x6,  x1
         add             x2,  x2,  x3
@@ -495,8 +486,7 @@  function \type\()_8tap_8v_\idx1\idx2
         loadl           v17, v18, v19
 
         loadl           v20, v21, v22, v23
-2:
-        loadl           v24, v25, v26, v27
+2:      loadl           v24, v25, v26, v27
         convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
         convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
         do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
@@ -520,8 +510,7 @@  function \type\()_8tap_8v_\idx1\idx2
         subs            x6,  x6,  #4
         b.ne            2b
 
-8:
-        subs            x5,  x5,  #8
+8:      subs            x5,  x5,  #8
         b.eq            9f
         // x0 -= h * dst_stride
         msub            x0,  x1,  x4, x0
@@ -534,8 +523,7 @@  function \type\()_8tap_8v_\idx1\idx2
         add             x2,  x2,  #8
         add             x0,  x0,  #8
         b               1b
-9:
-        ret
+9:      ret
 endfunc
 .endm
 
@@ -613,8 +601,7 @@  function \type\()_8tap_4v_\idx1\idx2
         convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
         do_store4       v1,  v2,  v5,  v6,  \type
 
-9:
-        ret
+9:      ret
 endfunc
 .endm