diff mbox series

[FFmpeg-devel,v2,2/2] lavc/aarch64: h264, add chroma loop filters for 10bit

Message ID 20210816094545.448283-2-mnitenko@gmail.com
State New
Headers show
Series [FFmpeg-devel,1/2] lavc/aarch64: move transpose_4x8H to neon.S | expand

Checks

Context Check Description
andriy/x86_make success Make finished
andriy/x86_make_fate success Make fate finished
andriy/PPC64_make success Make finished
andriy/PPC64_make_fate success Make fate finished

Commit Message

Mikhail Nitenko Aug. 16, 2021, 9:45 a.m. UTC
Benchmarks:                                             A53     A72
h264_h_loop_filter_chroma422_10bpp_c:                  277.5   114.2
h264_h_loop_filter_chroma422_10bpp_neon:               109.7    81.7
h264_h_loop_filter_chroma_10bpp_c:                     165.0    75.5
h264_h_loop_filter_chroma_10bpp_neon:                  121.2    74.7
h264_h_loop_filter_chroma_intra422_10bpp_c:            324.2   124.2
h264_h_loop_filter_chroma_intra422_10bpp_neon:         155.2    99.5
h264_h_loop_filter_chroma_intra_10bpp_c:               121.0    48.5
h264_h_loop_filter_chroma_intra_10bpp_neon:             79.5    52.7
h264_h_loop_filter_chroma_mbaff422_10bpp_c:            191.0    73.5
h264_h_loop_filter_chroma_mbaff422_10bpp_neon:         121.2    75.5
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c:      117.0    51.5
h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon:    79.5    53.7
h264_h_loop_filter_chroma_mbaff_intra_10bpp_c:          63.0    28.5
h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon:       48.7    33.2
h264_v_loop_filter_chroma_10bpp_c:                     260.2   135.5
h264_v_loop_filter_chroma_10bpp_neon:                   72.2    49.2
h264_v_loop_filter_chroma_intra_10bpp_c:               158.0    70.7
h264_v_loop_filter_chroma_intra_10bpp_neon:             48.7    32.0

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
---

removed leftover code, moved from 32bit and started loading with two
alternating registers, code became quite a bit faster!

 libavcodec/aarch64/h264dsp_init_aarch64.c |  37 ++++
 libavcodec/aarch64/h264dsp_neon.S         | 255 ++++++++++++++++++++++
 2 files changed, 292 insertions(+)

Comments

Martin Storsjö Aug. 17, 2021, 10:52 a.m. UTC | #1
On Mon, 16 Aug 2021, Mikhail Nitenko wrote:

> Benchmarks:                                             A53     A72
> h264_h_loop_filter_chroma422_10bpp_c:                  277.5   114.2
> h264_h_loop_filter_chroma422_10bpp_neon:               109.7    81.7
> h264_h_loop_filter_chroma_10bpp_c:                     165.0    75.5
> h264_h_loop_filter_chroma_10bpp_neon:                  121.2    74.7
> h264_h_loop_filter_chroma_intra422_10bpp_c:            324.2   124.2
> h264_h_loop_filter_chroma_intra422_10bpp_neon:         155.2    99.5
> h264_h_loop_filter_chroma_intra_10bpp_c:               121.0    48.5
> h264_h_loop_filter_chroma_intra_10bpp_neon:             79.5    52.7
> h264_h_loop_filter_chroma_mbaff422_10bpp_c:            191.0    73.5
> h264_h_loop_filter_chroma_mbaff422_10bpp_neon:         121.2    75.5
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_c:      117.0    51.5
> h264_h_loop_filter_chroma_mbaff_intra422_10bpp_neon:    79.5    53.7
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_c:          63.0    28.5
> h264_h_loop_filter_chroma_mbaff_intra_10bpp_neon:       48.7    33.2
> h264_v_loop_filter_chroma_10bpp_c:                     260.2   135.5
> h264_v_loop_filter_chroma_10bpp_neon:                   72.2    49.2
> h264_v_loop_filter_chroma_intra_10bpp_c:               158.0    70.7
> h264_v_loop_filter_chroma_intra_10bpp_neon:             48.7    32.0
>
> Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
> ---
>
> removed leftover code, moved from 32bit and started loading with two
> alternating registers, code became quite a bit faster!
>
> libavcodec/aarch64/h264dsp_init_aarch64.c |  37 ++++
> libavcodec/aarch64/h264dsp_neon.S         | 255 ++++++++++++++++++++++
> 2 files changed, 292 insertions(+)
>

> diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
> index 997082498f..80b7ed5ce1 100644
> --- a/libavcodec/aarch64/h264dsp_neon.S
> +++ b/libavcodec/aarch64/h264dsp_neon.S
> @@ -819,3 +819,258 @@ endfunc
>         weight_func     16
>         weight_func     8
>         weight_func     4
> +
> +.macro  h264_loop_filter_start_10
> +        cmp             w2,  #0
> +        ldr             w6,  [x4]
> +        ccmp            w3,  #0, #0, ne
> +        lsl             w2,  w2, #2
> +        mov             v24.S[0], w6
> +        lsl             w3,  w3, #2
> +        and             w8,  w6,  w6,  lsl #16

Nitpick: Align the third operand column on ccmp/lsl above with how it's 
done here for the 'and'. (Yes the existing code here seems to have the 
same misalignment.)

> +        b.eq            1f
> +        ands            w8,  w8,  w8,  lsl #8
> +        b.ge            2f
> +1:
> +        ret
> +2:
> +.endm
> +
> +.macro h264_loop_filter_start_intra_10
> +        orr             w4,  w2,  w3
> +        cbnz            w4,  1f
> +        ret
> +1:
> +        lsl             w2, w2, #2
> +        lsl             w3, w3, #2
> +        dup             v30.8h, w2              // alpha
> +        dup             v31.8h, w3              // beta
> +.endm
> +
> +.macro  h264_loop_filter_chroma_10
> +        dup             v22.8h, w2              // alpha
> +        dup             v23.8h, w3              // beta
> +        uxtl            v24.8h, v24.8b          // tc0
> +
> +        uabd            v26.8h, v16.8h, v0.8h   // abs(p0 - q0)
> +        uabd            v28.8h, v18.8h, v16.8h  // abs(p1 - p0)
> +        uabd            v30.8h, v2.8h,  v0.8h   // abs(q1 - q0)
> +        cmhi            v26.8h, v22.8h, v26.8h  // < alpha
> +        cmhi            v28.8h, v23.8h, v28.8h  // < beta
> +        cmhi            v30.8h, v23.8h, v30.8h  // < beta
> +
> +        and             v26.16b, v26.16b, v28.16b
> +        mov             v4.16b, v0.16b
> +        sub             v4.8h,  v4.8h,  v16.8h
> +        and             v26.16b, v26.16b, v30.16b
> +        shl             v4.8h,  v4.8h,  #2
> +        mov             x8, v26.d[0]
> +        mov             x9, v26.d[1]
> +        sli             v24.8H, v24.8H, #8
> +        uxtl            v24.8H, v24.8B
> +        add             v4.8h,  v4.8h,  v18.8h
> +        shl             v24.8h, v24.8h,  #2
> +
> +        adds            x8,  x8,  x9

I think it would be better for in-order cores to do this 'adds' maybe a 
couple instructions earlier (but the 'mov' from SIMD to GPR probably takes 
a couple cycles, so not too far earlier), maybe one instruction earlier?

> +        b.eq            9f
> +
> +        movi            v31.8h, #3              // (tc0 - 1) << (BIT_DEPTH - 8)) + 1

I guess this 'movi' could be done before the 'b.eq' too? If we branch out, 
we'd have run it in vain, but it's probably essentially free in that case 
anyway, and avoids having the next 'uqsub' stalling, waiting for it.

> +        uqsub           v24.8h, v24.8h,  v31.8h
> +        sub             v4.8h , v4.8h,  v2.8h
> +        srshr           v4.8h,  v4.8h,  #3
> +        smin            v4.8h,  v4.8h,  v24.8h
> +        neg             v25.8h, v24.8h
> +        smax            v4.8h,  v4.8h,  v25.8h
> +        and             v4.16B, v4.16B, v26.16B
> +        add             v16.8h,  v16.8h,  v4.8h
> +        sub             v0.8h,  v0.8h,  v4.8h

Nit: The vertical alignment is quite wobbly here, please try to make nice 
vertical lines if possible.

> +
> +        mvni            v4.8h,  #0xFC, lsl #8  // 1023 for clipping
> +        movi            v5.8h,  #0
> +        smin            v0.8h,  v0.8h,  v4.8h
> +        smax            v16.8h, v16.8h, v5.8h
> +        smax            v0.8h,  v0.8h,  v5.8h
> +        smin            v16.8h, v16.8h, v4.8h

I think it'd be (marginally) better pipelining, and more consistent, if 
you'd do first two smin v4, then two smax v5.

> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_neon_10, export=1
> +        h264_loop_filter_start_10
> +
> +        mov             x10,  x0
> +        sub             x0,  x0,  x1, lsl #1
> +        ld1             {v18.8h}, [x0 ], x1
> +        ld1             {v0.8h},  [x10], x1
> +        ld1             {v16.8h}, [x0 ], x1
> +        ld1             {v2.8h},  [x10]
> +
> +        h264_loop_filter_chroma_10
> +
> +        sub             x0,  x10,  x1, lsl #1
> +        st1             {v16.8h}, [x0], x1
> +        st1             {v0.8h},  [x0], x1
> +9:
> +        ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_neon_10, export=1
> +        h264_loop_filter_start_10
> +
> +        sub             x0,  x0,  #4 // access the 2nd left pixel
> +h_loop_filter_chroma420_10:
> +        add             x10,  x0,  x1,  lsl #2
> +        ld1             {v18.d}[0], [x0 ], x1
> +        ld1             {v18.d}[1], [x10], x1
> +        ld1             {v16.d}[0], [x0 ], x1
> +        ld1             {v16.d}[1], [x10], x1
> +        ld1             {v0.d}[0],  [x0 ], x1
> +        ld1             {v0.d}[1],  [x10], x1
> +        ld1             {v2.d}[0],  [x0 ], x1
> +        ld1             {v2.d}[1],  [x10], x1
> +
> +        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31
> +
> +        h264_loop_filter_chroma_10
> +
> +        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31
> +
> +        sub             x0,  x10,  x1, lsl #3
> +        st1             {v18.d}[0], [x0], x1
> +        st1             {v16.d}[0], [x0], x1
> +        st1             {v0.d}[0],  [x0], x1
> +        st1             {v2.d}[0],  [x0], x1
> +        st1             {v18.d}[1], [x0], x1
> +        st1             {v16.d}[1], [x0], x1
> +        st1             {v0.d}[1],  [x0], x1
> +        st1             {v2.d}[1],  [x0], x1
> +9:
> +        ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma422_neon_10, export=1
> +        h264_loop_filter_start_10
> +        add             x5,  x0,  x1
> +        sub             x0,  x0,  #4
> +        add             x1,  x1,  x1
> +        mov             x7,  x30
> +        bl              h_loop_filter_chroma420_10
> +        mov             x30, x7
> +        sub             x0,  x5,  #4
> +        mov             v24.s[0], w6
> +        b               h_loop_filter_chroma420_10
> +endfunc
> +
> +.macro h264_loop_filter_chroma_intra_10
> +       uabd            v26.8h, v16.8h, v17.8h  // abs(p0 - q0)
> +       uabd            v27.8h, v18.8h, v16.8h  // abs(p1 - p0)
> +       uabd            v28.8h, v19.8h, v17.8h  // abs(q1 - q0)
> +       cmhi            v26.8h, v30.8h, v26.8h  // < alpha
> +       cmhi            v27.8h, v31.8h, v27.8h  // < beta
> +       cmhi            v28.8h, v31.8h, v28.8h  // < beta
> +       and             v26.16b, v26.16b, v27.16b
> +       and             v26.16b, v26.16b, v28.16b
> +       mov             x2, v26.d[0]
> +       mov             x3, v26.d[1]
> +
> +       shl             v4.8h,  v18.8h,  #1
> +       shl             v6.8h,  v19.8h,  #1
> +
> +       adds            x2,  x2,  x3
> +       b.eq            9f
> +
> +       add             v20.8h,  v16.8h,  v19.8h
> +       add             v22.8h,  v17.8h,  v18.8h
> +       add             v20.8h,  v20.8h,  v4.8h
> +       add             v22.8h,  v22.8h,  v6.8h
> +       urshr           v24.8h,  v20.8h,  #2
> +       urshr           v25.8h,  v22.8h,  #2
> +       bit             v16.16b, v24.16b, v26.16b
> +       bit             v17.16b, v25.16b, v26.16b
> +.endm
> +
> +function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1
> +       h264_loop_filter_start_intra_10
> +       mov             x9,  x0
> +       sub             x0,  x0,  x1, lsl #1
> +       ld1             {v18.8h}, [x0], x1
> +       ld1             {v17.8h}, [x9], x1
> +       ld1             {v16.8h}, [x0], x1
> +       ld1             {v19.8h}, [x9]
> +
> +       h264_loop_filter_chroma_intra_10
> +
> +       sub             x0,  x9,  x1, lsl #1
> +       st1             {v16.8h}, [x0], x1
> +       st1             {v17.8h}, [x0], x1
> +
> +9:
> +       ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
> +       h264_loop_filter_start_intra_10
> +
> +       sub             x4,  x0,  #4
> +       sub             x0,  x0,  #2
> +       add             x9,  x4,  x1, lsl #1
> +       ld1             {v18.8h}, [x4], x1
> +       ld1             {v17.8h}, [x9], x1
> +       ld1             {v16.8h}, [x4], x1
> +       ld1             {v19.8h}, [x9], x1
> +
> +       transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
> +
> +       h264_loop_filter_chroma_intra_10
> +
> +       st2             {v16.h,v17.h}[0], [x0], x1
> +       st2             {v16.h,v17.h}[1], [x0], x1
> +       st2             {v16.h,v17.h}[2], [x0], x1
> +       st2             {v16.h,v17.h}[3], [x0], x1
> +
> +9:
> +       ret
> +endfunc
> +
> +function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1
> +       h264_loop_filter_start_intra_10
> +       sub             x4,  x0,  #4
> +       sub             x0,  x0,  #2
> +h_loop_filter_chroma420_intra_10:
> +       add             x9,  x4,  x1, lsl #2
> +       ld1             {v18.4h}, [x4], x1
> +       ld1             {v18.d}[1], [x9], x1
> +       ld1             {v16.4h}, [x4], x1
> +       ld1             {v16.d}[1], [x9], x1
> +       ld1             {v17.4h}, [x4], x1
> +       ld1             {v17.d}[1], [x9], x1
> +       ld1             {v19.4h}, [x4], x1
> +       ld1             {v19.d}[1], [x9], x1

Try to align things here to make this bit at least a bit more readable.

This looks pretty good overall now; just a bit of minor cosmetic cleanup and 
maybe some minor instruction scheduling is left.

If you have access to run and benchmark things on an in-order core (like 
Cortex A53), that'd be good to do while trying to do the instruction 
scheduling tweaks to avoid stalls.

// Martin
diff mbox series

Patch

diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
index cbaf8d31eb..6bf3ecb8a1 100644
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -83,6 +83,29 @@  void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                              int16_t *block, int stride,
                              const uint8_t nnzc[5 * 8]);
 
+void ff_h264_v_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                        int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                        int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                              int beta);
+void ff_h264_h_loop_filter_luma_intra_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                              int beta);
+void ff_h264_v_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                          int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                          int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_neon_10(uint8_t *pix, ptrdiff_t stride, int alpha,
+                                             int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+                                                int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+                                                int alpha, int beta);
+void ff_h264_h_loop_filter_chroma422_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+                                                   int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10(uint8_t *pix, ptrdiff_t stride,
+                                                      int alpha, int beta);
+
 av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
                                      const int chroma_format_idc)
 {
@@ -125,5 +148,19 @@  av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
         c->h264_idct8_add       = ff_h264_idct8_add_neon;
         c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
         c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
+    } else if (have_neon(cpu_flags) && bit_depth == 10) {
+        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon_10;
+        c->h264_v_loop_filter_chroma_intra = ff_h264_v_loop_filter_chroma_intra_neon_10;
+
+        if (chroma_format_idc <= 1) {
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon_10;
+            c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
+            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10;
+        } else {
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon_10;
+            c->h264_h_loop_filter_chroma_mbaff = ff_h264_h_loop_filter_chroma_neon_10;
+            c->h264_h_loop_filter_chroma_intra = ff_h264_h_loop_filter_chroma422_intra_neon_10;
+            c->h264_h_loop_filter_chroma_mbaff_intra = ff_h264_h_loop_filter_chroma_intra_neon_10;
+        }
     }
 }
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 997082498f..80b7ed5ce1 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -819,3 +819,258 @@  endfunc
         weight_func     16
         weight_func     8
         weight_func     4
+
+.macro  h264_loop_filter_start_10               // in: w2 = alpha, w3 = beta, x4 = tc0; out: w2/w3 shifted, v24.s[0] = tc0 bytes; clobbers w6, w8, flags
+        cmp             w2,  #0                 // alpha == 0?
+        ldr             w6,  [x4]               // w6 = the four tc0 bytes packed in one word
+        ccmp            w3,  #0,  #4,  ne       // alpha != 0: compare beta with 0; alpha == 0: force Z = 1
+        lsl             w2,  w2,  #2            // alpha << 2
+        mov             v24.s[0], w6            // hand the tc0 bytes to the filter macro
+        lsl             w3,  w3,  #2            // beta << 2
+        and             w8,  w6,  w6,  lsl #16  // first half of AND-ing the four tc0 bytes together
+        b.eq            1f                      // alpha == 0 or beta == 0: nothing to do
+        ands            w8,  w8,  w8,  lsl #8   // bit 31 = AND of the sign bits of all four tc0 bytes
+        b.ge            2f                      // at least one tc0 byte is non-negative: go on
+1:
+        ret                                     // returns from the function that expanded this macro
+2:
+.endm
+
+.macro h264_loop_filter_start_intra_10          // in: w2 = alpha, w3 = beta; out: v30 = alpha << 2, v31 = beta << 2; clobbers w4
+        orr             w4,  w2,  w3            // w4 = alpha | beta
+        cbnz            w4,  1f                 // continue unless alpha and beta are both zero
+        ret                                     // returns from the function that expanded this macro
+1:
+        lsl             w2, w2, #2              // alpha << 2
+        lsl             w3, w3, #2              // beta << 2
+        dup             v30.8h, w2              // alpha in every halfword lane
+        dup             v31.8h, w3              // beta in every halfword lane
+.endm
+
+// Normal chroma filter on 8 lanes: v18/v16/v0/v2 = p1/p0/q0/q1, w2/w3 = alpha/beta, v24.s[0] = tc0 bytes; writes v16, v0; exits via 9f.
+.macro  h264_loop_filter_chroma_10
+        dup             v22.8h,  w2                 // alpha
+        dup             v23.8h,  w3                 // beta
+        uxtl            v24.8h,  v24.8b             // tc0 bytes -> halfwords (zero-extended)
+
+        uabd            v26.8h,  v16.8h,  v0.8h     // abs(p0 - q0)
+        uabd            v28.8h,  v18.8h,  v16.8h    // abs(p1 - p0)
+        uabd            v30.8h,  v2.8h,   v0.8h     // abs(q1 - q0)
+        cmhi            v26.8h,  v22.8h,  v26.8h    // < alpha
+        cmhi            v28.8h,  v23.8h,  v28.8h    // < beta
+        cmhi            v30.8h,  v23.8h,  v30.8h    // < beta
+
+        and             v26.16b, v26.16b, v28.16b
+        sub             v4.8h,   v0.8h,   v16.8h    // q0 - p0
+        and             v26.16b, v26.16b, v30.16b   // v26 = lanes passing all three tests
+        shl             v4.8h,   v4.8h,   #2        // (q0 - p0) * 4
+        mov             x8,  v26.d[0]
+        mov             x9,  v26.d[1]
+        sli             v24.8h,  v24.8h,  #8        // copy each tc0 byte into the high byte of its halfword
+        uxtl            v24.8h,  v24.8b             // tc0[i] now fills lanes 2i and 2i+1
+        add             v4.8h,   v4.8h,   v18.8h    // + p1
+        adds            x8,  x8,  x9                // done early so the flags are ready at b.eq
+        shl             v24.8h,  v24.8h,  #2        // tc0 << (BIT_DEPTH - 8)
+        movi            v31.8h,  #3                 // hoisted above b.eq so uqsub need not wait for it
+
+        b.eq            9f                          // mask is all zero: nothing to filter
+
+        uqsub           v24.8h,  v24.8h,  v31.8h    // tc = ((tc0 - 1) << (BIT_DEPTH - 8)) + 1, floored at 0
+        sub             v4.8h,   v4.8h,   v2.8h     // - q1
+        srshr           v4.8h,   v4.8h,   #3        // delta = ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3
+        smin            v4.8h,   v4.8h,   v24.8h
+        neg             v25.8h,  v24.8h
+        smax            v4.8h,   v4.8h,   v25.8h    // clip delta to [-tc, tc]
+        and             v4.16b,  v4.16b,  v26.16b   // zero delta in lanes that failed the tests
+        add             v16.8h,  v16.8h,  v4.8h     // p0 += delta
+        sub             v0.8h,   v0.8h,   v4.8h     // q0 -= delta
+
+        mvni            v4.8h,   #0xFC, lsl #8      // 1023 for clipping
+        movi            v5.8h,   #0
+        smin            v0.8h,   v0.8h,   v4.8h
+        smin            v16.8h,  v16.8h,  v4.8h
+        smax            v0.8h,   v0.8h,   v5.8h
+        smax            v16.8h,  v16.8h,  v5.8h
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0
+        h264_loop_filter_start_10               // may return early; shifts alpha/beta, puts tc0 bytes in v24.s[0]
+
+        mov             x10,  x0                // x10 walks the rows at and below pix (q0, q1)
+        sub             x0,  x0,  x1, lsl #1    // x0 = pix - 2 * stride (p1 row)
+        ld1             {v18.8h}, [x0 ], x1     // p1
+        ld1             {v0.8h},  [x10], x1     // q0
+        ld1             {v16.8h}, [x0 ], x1     // p0
+        ld1             {v2.8h},  [x10]         // q1
+
+        h264_loop_filter_chroma_10              // updates v16 (p0) and v0 (q0); branches to 9f if nothing to filter
+
+        sub             x0,  x10,  x1, lsl #1   // x10 = pix + stride here, so x0 = pix - stride (p0 row)
+        st1             {v16.8h}, [x0], x1      // store p0
+        st1             {v0.8h},  [x0], x1      // store q0
+9:
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0
+        h264_loop_filter_start_10               // may return early; shifts alpha/beta, puts tc0 bytes in v24.s[0]
+
+        sub             x0,  x0,  #4 // x0 = pix - 4 bytes, i.e. the 2nd pixel to the left of the edge
+h_loop_filter_chroma420_10:                     // shared entry: x0 = first row - 4 bytes, x1 = row step, v24.s[0] = tc0 bytes
+        add             x10,  x0,  x1,  lsl #2  // x10 = row 4; x0 covers rows 0-3, x10 rows 4-7
+        ld1             {v18.d}[0], [x0 ], x1   // row 0 (4 halfwords)
+        ld1             {v18.d}[1], [x10], x1   // row 4
+        ld1             {v16.d}[0], [x0 ], x1   // row 1
+        ld1             {v16.d}[1], [x10], x1   // row 5
+        ld1             {v0.d}[0],  [x0 ], x1   // row 2
+        ld1             {v0.d}[1],  [x10], x1   // row 6
+        ld1             {v2.d}[0],  [x0 ], x1   // row 3
+        ld1             {v2.d}[1],  [x10], x1   // row 7
+
+        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31 // rows -> columns; the filter reads v18/v16/v0/v2 as p1/p0/q0/q1
+
+        h264_loop_filter_chroma_10              // updates v16 (p0) and v0 (q0); branches to 9f if nothing to filter
+
+        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31 // back to the row layout used by the loads
+
+        sub             x0,  x10,  x1, lsl #3   // x10 advanced to row 8; rewind 8 rows to row 0
+        st1             {v18.d}[0], [x0], x1    // row 0
+        st1             {v16.d}[0], [x0], x1    // row 1
+        st1             {v0.d}[0],  [x0], x1    // row 2
+        st1             {v2.d}[0],  [x0], x1    // row 3
+        st1             {v18.d}[1], [x0], x1    // row 4
+        st1             {v16.d}[1], [x0], x1    // row 5
+        st1             {v0.d}[1],  [x0], x1    // row 6
+        st1             {v2.d}[1],  [x0], x1    // row 7
+9:
+        ret                                     // also the return for callers that entered at h_loop_filter_chroma420_10
+endfunc
+
+function ff_h264_h_loop_filter_chroma422_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0
+        h264_loop_filter_start_10               // may return early; leaves the tc0 word in w6 and v24.s[0]
+        add             x5,  x0,  x1            // x5 = pix + stride (second row), kept for the second pass
+        sub             x0,  x0,  #4            // first pass starts 4 bytes left of pix
+        add             x1,  x1,  x1            // double the stride: each pass covers every other row
+        mov             x7,  x30                // save the return address, bl overwrites x30
+        bl              h_loop_filter_chroma420_10 // first pass: rows 0, 2, 4, ...
+        mov             x30, x7                 // restore the return address
+        sub             x0,  x5,  #4            // second pass starts one row down
+        mov             v24.s[0], w6            // v24 was consumed by the first pass; reload the tc0 bytes
+        b               h_loop_filter_chroma420_10 // second pass: rows 1, 3, 5, ...; its ret returns to our caller
+endfunc
+
+.macro h264_loop_filter_chroma_intra_10         // in: v18/v16/v17/v19 = p1/p0/q0/q1, v30 = alpha, v31 = beta; writes v16, v17; clobbers x2, x3, flags
+       uabd            v26.8h, v16.8h, v17.8h  // abs(p0 - q0)
+       uabd            v27.8h, v18.8h, v16.8h  // abs(p1 - p0)
+       uabd            v28.8h, v19.8h, v17.8h  // abs(q1 - q0)
+       cmhi            v26.8h, v30.8h, v26.8h  // < alpha
+       cmhi            v27.8h, v31.8h, v27.8h  // < beta
+       cmhi            v28.8h, v31.8h, v28.8h  // < beta
+       and             v26.16b, v26.16b, v27.16b
+       and             v26.16b, v26.16b, v28.16b // v26 = lanes passing all three tests
+       mov             x2, v26.d[0]            // copy both mask halves to GPRs for the early-out test
+       mov             x3, v26.d[1]
+
+       shl             v4.8h,  v18.8h,  #1     // 2 * p1
+       shl             v6.8h,  v19.8h,  #1     // 2 * q1
+
+       adds            x2,  x2,  x3            // Z set when the mask is all zero
+       b.eq            9f                      // label 9 must be provided by the code expanding this macro
+
+       add             v20.8h,  v16.8h,  v19.8h // p0 + q1
+       add             v22.8h,  v17.8h,  v18.8h // q0 + p1
+       add             v20.8h,  v20.8h,  v4.8h  // 2*p1 + p0 + q1
+       add             v22.8h,  v22.8h,  v6.8h  // 2*q1 + q0 + p1
+       urshr           v24.8h,  v20.8h,  #2     // new p0 = (2*p1 + p0 + q1 + 2) >> 2
+       urshr           v25.8h,  v22.8h,  #2     // new q0 = (2*q1 + q0 + p1 + 2) >> 2
+       bit             v16.16b, v24.16b, v26.16b // replace p0 only in lanes where the mask is set
+       bit             v17.16b, v25.16b, v26.16b // replace q0 only in lanes where the mask is set
+.endm
+
+function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta
+       h264_loop_filter_start_intra_10         // returns if alpha and beta are both 0; sets v30 = alpha << 2, v31 = beta << 2
+       mov             x9,  x0                 // x9 walks the rows at and below pix (q0, q1)
+       sub             x0,  x0,  x1, lsl #1    // x0 = pix - 2 * stride (p1 row)
+       ld1             {v18.8h}, [x0], x1      // p1
+       ld1             {v17.8h}, [x9], x1      // q0
+       ld1             {v16.8h}, [x0], x1      // p0
+       ld1             {v19.8h}, [x9]          // q1
+
+       h264_loop_filter_chroma_intra_10        // updates v16 (p0) and v17 (q0); branches to 9f if nothing to filter
+
+       sub             x0,  x9,  x1, lsl #1    // x9 = pix + stride here, so x0 = pix - stride (p0 row)
+       st1             {v16.8h}, [x0], x1      // store p0
+       st1             {v17.8h}, [x0], x1      // store q0
+
+9:
+       ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta
+       h264_loop_filter_start_intra_10         // returns if alpha and beta are both 0; sets v30 = alpha << 2, v31 = beta << 2
+
+       sub             x4,  x0,  #4            // load pointer: 4 bytes left of pix
+       sub             x0,  x0,  #2            // store pointer: 2 bytes left of pix
+       add             x9,  x4,  x1, lsl #1    // x9 = row 2; x4 covers rows 0-1, x9 rows 2-3
+       ld1             {v18.8h}, [x4], x1      // row 0; NOTE(review): reads 8 halfwords per row, only lanes 0-3 of the results are stored
+       ld1             {v17.8h}, [x9], x1      // row 2
+       ld1             {v16.8h}, [x4], x1      // row 1
+       ld1             {v19.8h}, [x9], x1      // row 3
+
+       transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 // rows -> columns; the filter reads v18/v16/v17/v19 as p1/p0/q0/q1
+
+       h264_loop_filter_chroma_intra_10        // updates v16 (p0) and v17 (q0); branches to 9f if nothing to filter
+
+       st2             {v16.h,v17.h}[0], [x0], x1 // row 0: lane 0 of v16 and v17 written as an adjacent pair
+       st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+       st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+       st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+
+9:
+       ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta
+       h264_loop_filter_start_intra_10         // returns if alpha and beta are both 0; sets v30 = alpha << 2, v31 = beta << 2
+       sub             x4,  x0,  #4            // load pointer: 4 bytes left of pix
+       sub             x0,  x0,  #2            // store pointer: 2 bytes left of pix
+h_loop_filter_chroma420_intra_10:               // shared entry: x4 = load pointer, x0 = store pointer, v30/v31 = alpha/beta
+       add             x9,  x4,  x1, lsl #2    // x9 = row 4; x4 covers rows 0-3, x9 rows 4-7
+       ld1             {v18.4h}, [x4], x1      // row 0 into the low half
+       ld1             {v18.d}[1], [x9], x1    // row 4 into the high half
+       ld1             {v16.4h}, [x4], x1      // row 1
+       ld1             {v16.d}[1], [x9], x1    // row 5
+       ld1             {v17.4h}, [x4], x1      // row 2
+       ld1             {v17.d}[1], [x9], x1    // row 6
+       ld1             {v19.4h}, [x4], x1      // row 3
+       ld1             {v19.d}[1], [x9], x1    // row 7; x9 now points 8 rows past the first row loaded
+
+       transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29 // rows -> columns; the filter reads v18/v16/v17/v19 as p1/p0/q0/q1
+
+       h264_loop_filter_chroma_intra_10        // updates v16 (p0) and v17 (q0); branches to 9f if nothing to filter
+
+       st2             {v16.h,v17.h}[0], [x0], x1 // row 0: lane 0 of v16 and v17 written as an adjacent pair
+       st2             {v16.h,v17.h}[1], [x0], x1 // row 1
+       st2             {v16.h,v17.h}[2], [x0], x1 // row 2
+       st2             {v16.h,v17.h}[3], [x0], x1 // row 3
+       st2             {v16.h,v17.h}[4], [x0], x1 // row 4
+       st2             {v16.h,v17.h}[5], [x0], x1 // row 5
+       st2             {v16.h,v17.h}[6], [x0], x1 // row 6
+       st2             {v16.h,v17.h}[7], [x0], x1 // row 7
+
+9:
+       ret                                     // also the return for callers that entered at h_loop_filter_chroma420_intra_10
+endfunc
+
+function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 // x0 = pix, x1 = stride, w2 = alpha, w3 = beta
+       h264_loop_filter_start_intra_10         // returns if alpha and beta are both 0; sets v30 = alpha << 2, v31 = beta << 2
+       sub             x4,  x0,  #4            // load pointer for the first 8 rows: 4 bytes left of pix
+       add             x5,  x0,  x1, lsl #3    // x5 = pix + 8 * stride, kept for the second pass
+       sub             x0,  x0,  #2            // store pointer for the first 8 rows: 2 bytes left of pix
+       mov             x7,  x30                // save the return address, bl overwrites x30
+       bl              h_loop_filter_chroma420_intra_10 // first pass: rows 0-7
+       mov             x4,  x9                 // the first pass left x9 at the load position of row 8
+       sub             x0,  x5,  #2            // store pointer for rows 8-15
+       mov             x30, x7                 // restore the return address
+       b               h_loop_filter_chroma420_intra_10 // second pass: rows 8-15; its ret returns to our caller
+endfunc