[FFmpeg-devel] lavc/aarch64: h264qpel, add lowpass_8 based functions

Message ID 20210819205318.593233-1-mnitenko@gmail.com
State New
Series [FFmpeg-devel] lavc/aarch64: h264qpel, add lowpass_8 based functions

Checks

Context                 Check    Description
andriy/x86_make         success  Make finished
andriy/x86_make_fate    success  Make fate finished
andriy/PPC64_make       success  Make finished
andriy/PPC64_make_fate  success  Make fate finished

Commit Message

Mikhail Nitenko Aug. 19, 2021, 8:53 p.m. UTC
Benchmarks:                        A53     A72
avg_h264_qpel_8_mc01_10_c:        932.7   638.5
avg_h264_qpel_8_mc01_10_neon:     397.7   212.2
avg_h264_qpel_8_mc02_10_c:        946.2   691.2
avg_h264_qpel_8_mc02_10_neon:     365.0   199.0
avg_h264_qpel_8_mc03_10_c:        932.7   639.5
avg_h264_qpel_8_mc03_10_neon:     399.2   214.0
avg_h264_qpel_8_mc10_10_c:       1441.7   810.2
avg_h264_qpel_8_mc10_10_neon:     341.7   156.0
avg_h264_qpel_8_mc11_10_c:       2158.0  1330.0
avg_h264_qpel_8_mc11_10_neon:     671.0   343.5
avg_h264_qpel_8_mc13_10_c:       2163.7  1327.7
avg_h264_qpel_8_mc13_10_neon:     673.0   335.0
avg_h264_qpel_8_mc20_10_c:       1434.0   769.5
avg_h264_qpel_8_mc20_10_neon:     309.7   140.5
avg_h264_qpel_8_mc30_10_c:       1448.2   802.0
avg_h264_qpel_8_mc30_10_neon:     357.7   156.7
avg_h264_qpel_8_mc31_10_c:       2188.5  1329.2
avg_h264_qpel_8_mc31_10_neon:     699.0   346.2
avg_h264_qpel_8_mc33_10_c:       2192.2  1337.5
avg_h264_qpel_8_mc33_10_neon:     700.0   349.0
avg_h264_qpel_16_mc01_10_c:      3768.5  2583.5
avg_h264_qpel_16_mc01_10_neon:   1572.5   854.5
avg_h264_qpel_16_mc02_10_c:      3783.0  2736.2
avg_h264_qpel_16_mc02_10_neon:   1442.7   796.7
avg_h264_qpel_16_mc03_10_c:      3789.5  2572.5
avg_h264_qpel_16_mc03_10_neon:   1574.0   854.2
avg_h264_qpel_16_mc10_10_c:      5879.0  3276.0
avg_h264_qpel_16_mc10_10_neon:   1331.5   611.0
avg_h264_qpel_16_mc11_10_c:      8711.7  5344.0
avg_h264_qpel_16_mc11_10_neon:   2634.0  1349.0
avg_h264_qpel_16_mc13_10_c:      8645.0  5309.2
avg_h264_qpel_16_mc13_10_neon:   2630.7  1356.5
avg_h264_qpel_16_mc20_10_c:      5722.5  3111.0
avg_h264_qpel_16_mc20_10_neon:   1203.5   561.0
avg_h264_qpel_16_mc30_10_c:      5926.0  3252.0
avg_h264_qpel_16_mc30_10_neon:   1395.5   613.5
avg_h264_qpel_16_mc31_10_c:      8722.2  5310.2
avg_h264_qpel_16_mc31_10_neon:   2739.7  1382.2
avg_h264_qpel_16_mc33_10_c:      8754.7  5312.7
avg_h264_qpel_16_mc33_10_neon:   2735.7  1402.7
put_h264_qpel_8_mc01_10_c:        854.7   589.0
put_h264_qpel_8_mc01_10_neon:     356.7   196.2
put_h264_qpel_8_mc02_10_c:        780.0   548.5
put_h264_qpel_8_mc02_10_neon:     324.0   181.2
put_h264_qpel_8_mc03_10_c:        854.7   591.7
put_h264_qpel_8_mc03_10_neon:     358.2   199.0
put_h264_qpel_8_mc10_10_c:       1364.7   754.2
put_h264_qpel_8_mc10_10_neon:     305.7   140.7
put_h264_qpel_8_mc11_10_c:       2079.0  1282.2
put_h264_qpel_8_mc11_10_neon:     630.0   328.2
put_h264_qpel_8_mc13_10_c:       2078.5  1279.0
put_h264_qpel_8_mc13_10_neon:     632.0   322.5
put_h264_qpel_8_mc20_10_c:       1221.5   683.7
put_h264_qpel_8_mc20_10_neon:     273.7   125.0
put_h264_qpel_8_mc30_10_c:       1377.2   758.0
put_h264_qpel_8_mc30_10_neon:     326.7   141.5
put_h264_qpel_8_mc31_10_c:       2107.0  1278.5
put_h264_qpel_8_mc31_10_neon:     658.0   331.2
put_h264_qpel_8_mc33_10_c:       2107.0  1285.0
put_h264_qpel_8_mc33_10_neon:     659.0   332.0
put_h264_qpel_16_mc01_10_c:      3529.7  2412.5
put_h264_qpel_16_mc01_10_neon:   1408.5   786.5
put_h264_qpel_16_mc02_10_c:      3151.5  2121.0
put_h264_qpel_16_mc02_10_neon:   1278.7   725.5
put_h264_qpel_16_mc03_10_c:      3546.5  2375.5
put_h264_qpel_16_mc03_10_neon:   1410.0   787.7
put_h264_qpel_16_mc10_10_c:      5511.5  2999.0
put_h264_qpel_16_mc10_10_neon:   1187.5   558.2
put_h264_qpel_16_mc11_10_c:      8424.2  5137.7
put_h264_qpel_16_mc11_10_neon:   2465.0  1277.7
put_h264_qpel_16_mc13_10_c:      8597.2  5127.7
put_h264_qpel_16_mc13_10_neon:   2466.7  1290.5
put_h264_qpel_16_mc20_10_c:      4894.5  2745.7
put_h264_qpel_16_mc20_10_neon:   1059.5   494.2
put_h264_qpel_16_mc30_10_c:      5576.5  3035.0
put_h264_qpel_16_mc30_10_neon:   1251.5   558.2
put_h264_qpel_16_mc31_10_c:      8695.5  5150.5
put_h264_qpel_16_mc31_10_neon:   2570.7  1320.5
put_h264_qpel_16_mc33_10_c:      8702.5  5131.2
put_h264_qpel_16_mc33_10_neon:   2571.7  1337.0

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
---
 libavcodec/aarch64/h264qpel_init_aarch64.c |  91 +++-
 libavcodec/aarch64/h264qpel_neon.S         | 515 +++++++++++++++++++++
 2 files changed, 604 insertions(+), 2 deletions(-)

Comments

Martin Storsjö Sept. 3, 2021, 9:53 a.m. UTC | #1
On Thu, 19 Aug 2021, Mikhail Nitenko wrote:

> diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
> index d27cfac494..eb18469b7f 100644
> --- a/libavcodec/aarch64/h264qpel_neon.S
> +++ b/libavcodec/aarch64/h264qpel_neon.S
> @@ -932,3 +932,518 @@ endfunc
>
>         h264_qpel16 put
>         h264_qpel16 avg
> +
> +//trashes v0-v5, v7
> +.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
> +        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
> +        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
> +        add             v2.8H,      v2.8H,       v3.8H
> +        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
> +        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
> +        add             v4.8H,      v4.8H,       v5.8H
> +        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
> +        uaddl2          \d1\().4S,  \r0\().8H,   v1.8H
> +        uaddl           \d0\().4S,  \r0\().4H,   v1.4H
> +        ext             v0.16B,      \r2\().16B, \r3\().16B, #4

Nit: Indentation is off for the center column

> +        umlal           \d0\().4S,  v2.4H,       v6.H[1]
> +        umlal2          \d1\().4S,  v2.8H,       v6.H[1]
> +        ext             v1.16B,     \r2\().16B, \r3\().16B, #6
> +        add             v0.8H,      v0.8H,       v1.8H
> +        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
> +        umlsl           \d0\().4S,  v4.4H,       v6.H[0]
> +        umlsl2          \d1\().4S,  v4.8H,       v6.H[0]

I see why you need to go to 32 bit here, but I think this could be kept in 
16 bit with this trick:

First add + mla of the two positive coefficients. This can go outside 
of the range of a signed 16 bit integer, so this must be considered 
unsigned 16 bit. Then do mul of the negative coefficient (corresponding to 
the umlsl here) into a separate register. We see this as a separate 
unsigned 16 bit value.

Then we do a uqsub of these two 16 bit values; the result is nonnegative, 
but still possibly larger than the signed 16 bit range. So then finally you do 
urshr instead of sqrshrun (and maybe also umin instead of smin).

Previously you had:
- 2 uaddl (16->32)
- 2 umlal (16->32)
- 2 umlsl (16->32)
- 2 sqrshrun (32->16)

With this, you'd get this down to:
- 1 add
- 1 mla
- 1 mul
- 1 uqsub
- 1 urshr

So 5 instructions instead of 8.

As there are fewer of each operation, it might be good to interleave it more 
with the second calculation if there are enough registers, to avoid stalling 
in a long sequential operation on one single register.
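
(For illustration — an editor's sketch of this sequence, not code from the 
patch or from Martin: it assumes v6.H[0] = 5 and v6.H[1] = 20 as set up by 
lowpass_const, reuses the pair sums that lowpass_8_10 already forms (v2 = 
centre pair, v4 = inner pair), and uses hypothetical v16/v17 for the two 
outermost taps and v5 = 1023 for the clip:

        add             v20.8H, v16.8H, v17.8H     // a+f, outermost taps
        mla             v20.8H, v2.8H,  v6.H[1]    // + 20*(c+d); at most 2046+40920 = 42966,
                                                   // past signed but within unsigned 16 bit
        mul             v1.8H,  v4.8H,  v6.H[0]    // 5*(b+e), at most 10230
        uqsub           v20.8H, v20.8H, v1.8H      // saturating, so pos-neg clamps at 0
        urshr           v20.8H, v20.8H, #5         // unsigned rounding shift, replaces sqrshrun
        umin            v20.8H, v20.8H, v5.8H      // clip to 1023, replaces smin

The register choices are placeholders; the real code would pick whatever is 
free at that point in the macro.)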

> +        sqrshrun        \d0\().4H,  \d0\().4S,   #5
> +        sqrshrun2       \d0\().8H,  \d1\().4S,   #5
> +        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
> +        add             v1.8H,      v1.8H,       v3.8H
> +        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
> +        uaddl           v3.4S,      \r2\().4H,   v2.4H
> +        uaddl2          v4.4S,      \r2\().8H,   v2.8H
> +        umlal           v3.4S,      v0.4H,       v6.H[1]
> +        umlal2          v4.4S,      v0.8H,       v6.H[1]
> +        umlsl           v3.4S,      v1.4H,       v6.H[0]
> +        umlsl2          v4.4S,      v1.8H,       v6.H[0]
> +        mvni            v5.8h,      #0xFC,       lsl #8 // 1023 for clipping
> +        sqrshrun        \d1\().4H,  v3.4S,       #5
> +        sqrshrun2       \d1\().8H,  v4.4S,       #5
> +        smin            \d0\().8H,  \d0\().8H,   v5.8h
> +        smin            \d1\().8H,  \d1\().8H,   v5.8h
> +.endm
> +
> +function put_h264_qpel16_h_lowpass_neon_packed_10
> +        mov             x4,  x30
> +        mov             x12, #32
> +        mov             x3,  #16
> +        bl              put_h264_qpel8_h_lowpass_neon_10
> +        sub             x1,  x1,  x2, lsl #4
> +        add             x1,  x1,  #16
> +        mov             x12, #32
> +        mov             x30, x4
> +        b               put_h264_qpel8_h_lowpass_neon_10
> +endfunc
> +
> +.macro  h264_qpel_h_lowpass_10 type
> +function \type\()_h264_qpel16_h_lowpass_neon_10
> +        mov             x13, x30
> +        mov             x12, #32
> +        bl              \type\()_h264_qpel8_h_lowpass_neon_10
> +        sub             x0,  x0,  x3, lsl #4
> +        sub             x1,  x1,  x2, lsl #4
> +        add             x0,  x0,  #16
> +        add             x1,  x1,  #16
> +        mov             x12, #32
> +        mov             x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_neon_10
> +1:      ld1             {v28.8H, v29.8H}, [x1], x2
> +        ld1             {v16.8H, v17.8H}, [x1], x2
> +        subs            x12, x12, #4
> +        lowpass_8_10    v28, v29, v16, v17, v28, v20
> +  .ifc \type,avg
> +        ld1             {v2.8H},    [x0], x3
> +        urhadd          v28.8H, v28.8H,  v2.8H
> +        ld1             {v3.8H},    [x0]
> +        urhadd          v20.8H, v20.8H, v3.8H
> +        sub             x0,  x0,  x3
> +  .endif
> +        st1             {v28.8H},    [x0], x3
> +        st1             {v20.8H},    [x0], x3
> +        b.ne            1b
> +        ret
> +endfunc
> +.endm
> +
> +        h264_qpel_h_lowpass_10 put
> +        h264_qpel_h_lowpass_10 avg
> +
> +.macro h264_qpel_h_lowpass_l2_10 type
> +function \type\()_h264_qpel16_h_lowpass_l2_neon_10
> +        mov             x13, x30
> +        mov             x12, #32
> +        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
> +        sub             x0,  x0,  x2, lsl #4
> +        sub             x1,  x1,  x2, lsl #4
> +        sub             x3,  x3,  x2, lsl #4
> +        add             x0,  x0,  #16
> +        add             x1,  x1,  #16
> +        add             x3,  x3,  #16
> +        mov             x12, #32
> +        mov             x30, x13
> +endfunc
> +
> +function \type\()_h264_qpel8_h_lowpass_l2_neon_10
> +1:      ld1             {v26.8H, v27.8H}, [x1], x2
> +        ld1             {v16.8H, v17.8H}, [x1], x2
> +        ld1             {v28.8H},     [x3], x2
> +        ld1             {v29.8H},     [x3], x2
> +        subs            x12, x12, #4
> +        lowpass_8_10    v26, v27, v16, v17, v26, v27
> +        urhadd          v26.8H, v26.8H, v28.8H
> +        urhadd          v27.8H, v27.8H, v29.8H
> +  .ifc \type,avg
> +        ld1             {v2.8H},      [x0], x2
> +        urhadd          v26.8H, v26.8H, v2.8H
> +        ld1             {v3.8H},      [x0]
> +        urhadd          v27.8H, v27.8H, v3.8H
> +        sub             x0,  x0,  x2
> +  .endif
> +        st1             {v26.8H},     [x0], x2
> +        st1             {v27.8H},     [x0], x2
> +        b.ne            1b
> +        ret
> +endfunc
> +.endm
> +
> +        h264_qpel_h_lowpass_l2_10 put
> +        h264_qpel_h_lowpass_l2_10 avg
> +
> +function put_h264_qpel16_v_lowpass_neon_packed_10
> +        mov             x4,  x30
> +        mov             x2,  #8
> +        bl              put_h264_qpel8_v_lowpass_neon
> +        sub             x1,  x1,  x3, lsl #2
> +        bl              put_h264_qpel8_v_lowpass_neon
> +        sub             x1,  x1,  x3, lsl #4
> +        sub             x1,  x1,  x3, lsl #2
> +        add             x1,  x1,  #8
> +        bl              put_h264_qpel8_v_lowpass_neon
> +        sub             x1,  x1,  x3, lsl #2
> +        mov             x30, x4
> +        b               put_h264_qpel8_v_lowpass_neon
> +endfunc
> +
> +.macro  h264_qpel_v_lowpass_10 type
> +function \type\()_h264_qpel16_v_lowpass_neon_10
> +        mov             x4,  x30
> +        bl              \type\()_h264_qpel8_v_lowpass_neon_10
> +        sub             x1,  x1,  x3, lsl #2
> +        bl              \type\()_h264_qpel8_v_lowpass_neon_10
> +        sub             x0,  x0,  x2, lsl #4
> +        add             x0,  x0,  #16
> +        sub             x1,  x1,  x3, lsl #4
> +        sub             x1,  x1,  x3, lsl #2
> +        add             x1,  x1,  #16
> +        bl              \type\()_h264_qpel8_v_lowpass_neon_10
> +        sub             x1,  x1,  x3, lsl #2
> +        mov             x30, x4
> +endfunc
> +
> +function \type\()_h264_qpel8_v_lowpass_neon_10
> +        ld1             {v16.8H}, [x1], x3
> +        ld1             {v18.8H}, [x1], x3
> +        ld1             {v20.8H}, [x1], x3
> +        ld1             {v22.8H}, [x1], x3
> +        ld1             {v24.8H}, [x1], x3
> +        ld1             {v26.8H}, [x1], x3
> +        ld1             {v28.8H}, [x1], x3
> +        ld1             {v30.8H}, [x1], x3
> +        ld1             {v17.8H}, [x1], x3
> +        ld1             {v19.8H}, [x1], x3
> +        ld1             {v21.8H}, [x1], x3
> +        ld1             {v23.8H}, [x1], x3
> +        ld1             {v25.8H}, [x1]
> +
> +        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
> +        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
> +        lowpass_8_10    v16, v17, v18, v19, v16, v17
> +        lowpass_8_10    v20, v21, v22, v23, v18, v19
> +        lowpass_8_10    v24, v25, v26, v27, v20, v21
> +        lowpass_8_10    v28, v29, v30, v31, v22, v23
> +        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

I'm a bit surprised by doing this kind of vertical filtering by 
transposing and doing it horizontally - when vertical filtering can be 
done so efficiently as-is without needing any extra 'ext' instructions and 
such. But I see that the existing code does it this way. I'll try making a 
PoC that rewrites the existing code for some case, to see how it behaves 
without the transposes.

// Martin
Martin Storsjö Sept. 3, 2021, 11:26 a.m. UTC | #2
On Fri, 3 Sep 2021, Martin Storsjö wrote:

>> +function \type\()_h264_qpel8_v_lowpass_neon_10
>> +        ld1             {v16.8H}, [x1], x3
>> +        ld1             {v18.8H}, [x1], x3
>> +        ld1             {v20.8H}, [x1], x3
>> +        ld1             {v22.8H}, [x1], x3
>> +        ld1             {v24.8H}, [x1], x3
>> +        ld1             {v26.8H}, [x1], x3
>> +        ld1             {v28.8H}, [x1], x3
>> +        ld1             {v30.8H}, [x1], x3
>> +        ld1             {v17.8H}, [x1], x3
>> +        ld1             {v19.8H}, [x1], x3
>> +        ld1             {v21.8H}, [x1], x3
>> +        ld1             {v23.8H}, [x1], x3
>> +        ld1             {v25.8H}, [x1]
>> +
>> +        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
>> +        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
>> +        lowpass_8_10    v16, v17, v18, v19, v16, v17
>> +        lowpass_8_10    v20, v21, v22, v23, v18, v19
>> +        lowpass_8_10    v24, v25, v26, v27, v20, v21
>> +        lowpass_8_10    v28, v29, v30, v31, v22, v23
>> +        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
>
> I'm a bit surprised by doing this kind of vertical filtering by transposing 
> and doing it horizontally - when vertical filtering can be done so 
> efficiently as-is without needing any extra 'ext' instructions and such. But 
> I see that the existing code does it this way. I'll try making a PoC that 
> rewrites the existing code for some case, to see how it behaves without the 
> transposes.

The potential speedups for the vertical filters are huge actually; I've 
sent a patch that IMO simplifies this (getting rid of all transposes). I'd 
appreciate it if you'd remodel your patch according to it.
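
(For illustration — an editor's sketch of what one output row of such a 
transpose-free vertical 6-tap could look like, not the code from Martin's 
patch: it assumes six consecutive input rows in v16-v21, v6.H[0] = 5 and 
v6.H[1] = 20 from lowpass_const, v5 = 1023, and hypothetical scratch and 
destination registers:

        add             v2.8H,  v18.8H, v19.8H     // centre pair: row2 + row3
        add             v3.8H,  v17.8H, v20.8H     // inner pair:  row1 + row4
        uaddl           v0.4S,  v16.4H, v21.4H     // outer pair:  row0 + row5, widened
        uaddl2          v1.4S,  v16.8H, v21.8H
        umlal           v0.4S,  v2.4H,  v6.H[1]    // + 20 * centre
        umlal2          v1.4S,  v2.8H,  v6.H[1]
        umlsl           v0.4S,  v3.4H,  v6.H[0]    // -  5 * inner
        umlsl2          v1.4S,  v3.8H,  v6.H[0]
        sqrshrun        v22.4H, v0.4S,  #5         // narrow with rounding
        sqrshrun2       v22.8H, v1.4S,  #5
        smin            v22.8H, v22.8H, v5.8H      // clip to 1023

The rows themselves act as the filter taps, so no ext or transpose is needed; 
the next output row reuses five of the six row registers and loads a single 
new row.)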

// Martin

Patch

diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 77f41d9a21..93fa5246c4 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -95,12 +95,55 @@  void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
 void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 
+void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
 av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
 {
-    const int high_bit_depth = bit_depth > 8;
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_neon(cpu_flags) && !high_bit_depth) {
+    if (have_neon(cpu_flags) && bit_depth <= 8) {
         c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
         c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
         c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@@ -168,5 +211,49 @@  av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
         c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
         c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+    } else if (have_neon(cpu_flags) && bit_depth == 10) {
+        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
+
+        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
     }
 }
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index d27cfac494..eb18469b7f 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -932,3 +932,518 @@  endfunc
 
         h264_qpel16 put
         h264_qpel16 avg
+
+//trashes v0-v5, v7
+.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
+        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
+        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
+        add             v2.8H,      v2.8H,       v3.8H
+        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
+        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
+        add             v4.8H,      v4.8H,       v5.8H
+        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
+        uaddl2          \d1\().4S,  \r0\().8H,   v1.8H
+        uaddl           \d0\().4S,  \r0\().4H,   v1.4H
+        ext             v0.16B,      \r2\().16B, \r3\().16B, #4
+        umlal           \d0\().4S,  v2.4H,       v6.H[1]
+        umlal2          \d1\().4S,  v2.8H,       v6.H[1]
+        ext             v1.16B,     \r2\().16B, \r3\().16B, #6
+        add             v0.8H,      v0.8H,       v1.8H
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
+        umlsl           \d0\().4S,  v4.4H,       v6.H[0]
+        umlsl2          \d1\().4S,  v4.8H,       v6.H[0]
+        sqrshrun        \d0\().4H,  \d0\().4S,   #5
+        sqrshrun2       \d0\().8H,  \d1\().4S,   #5
+        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
+        add             v1.8H,      v1.8H,       v3.8H
+        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
+        uaddl           v3.4S,      \r2\().4H,   v2.4H
+        uaddl2          v4.4S,      \r2\().8H,   v2.8H
+        umlal           v3.4S,      v0.4H,       v6.H[1]
+        umlal2          v4.4S,      v0.8H,       v6.H[1]
+        umlsl           v3.4S,      v1.4H,       v6.H[0]
+        umlsl2          v4.4S,      v1.8H,       v6.H[0]
+        mvni            v5.8h,      #0xFC,       lsl #8 // 1023 for clipping
+        sqrshrun        \d1\().4H,  v3.4S,       #5
+        sqrshrun2       \d1\().8H,  v4.4S,       #5
+        smin            \d0\().8H,  \d0\().8H,   v5.8h
+        smin            \d1\().8H,  \d1\().8H,   v5.8h
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x12, #32
+        mov             x3,  #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8H, v29.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8H},    [x0], x3
+        urhadd          v28.8H, v28.8H,  v2.8H
+        ld1             {v3.8H},    [x0]
+        urhadd          v20.8H, v20.8H, v3.8H
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8H},    [x0], x3
+        st1             {v20.8H},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        add             x3,  x3,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8H, v27.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        ld1             {v28.8H},     [x3], x2
+        ld1             {v29.8H},     [x3], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8H, v26.8H, v28.8H
+        urhadd          v27.8H, v27.8H, v29.8H
+  .ifc \type,avg
+        ld1             {v2.8H},      [x0], x2
+        urhadd          v26.8H, v26.8H, v2.8H
+        ld1             {v3.8H},      [x0]
+        urhadd          v27.8H, v27.8H, v3.8H
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8H},     [x0], x2
+        st1             {v27.8H},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x2,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+  .ifc \type,avg
+        ld1             {v24.8H},  [x0], x2
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v25.8H}, [x0], x2
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v26.8H}, [x0], x2
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v27.8H}, [x0], x2
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v28.8H}, [x0], x2
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v29.8H}, [x0], x2
+        urhadd          v21.8H, v21.8H, v29.8H
+        ld1             {v30.8H}, [x0], x2
+        urhadd          v22.8H, v22.8H, v30.8H
+        ld1             {v31.8H}, [x0], x2
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x2,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x2
+        st1             {v17.8H}, [x0], x2
+        st1             {v18.8H}, [x0], x2
+        st1             {v19.8H}, [x0], x2
+        st1             {v20.8H}, [x0], x2
+        st1             {v21.8H}, [x0], x2
+        st1             {v22.8H}, [x0], x2
+        st1             {v23.8H}, [x0], x2
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_10 put
+        h264_qpel_v_lowpass_10 avg
+
+.macro  h264_qpel_v_lowpass_l2_10 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x12, x12, x2, lsl #4
+        add             x0,  x0,  #16
+        add             x12, x12, #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+        ld1             {v24.8H},  [x12], x2
+        ld1             {v25.8H},  [x12], x2
+        ld1             {v26.8H},  [x12], x2
+        ld1             {v27.8H},  [x12], x2
+        ld1             {v28.8H},  [x12], x2
+        urhadd          v16.8H, v24.8H, v16.8H
+        urhadd          v17.8H, v25.8H, v17.8H
+        ld1             {v29.8H},  [x12], x2
+        urhadd          v18.8H, v26.8H, v18.8H
+        urhadd          v19.8H, v27.8H, v19.8H
+        ld1             {v30.8H}, [x12], x2
+        urhadd          v20.8H, v28.8H, v20.8H
+        urhadd          v21.8H, v29.8H, v21.8H
+        ld1             {v31.8H}, [x12], x2
+        urhadd          v22.8H, v30.8H, v22.8H
+        urhadd          v23.8H, v31.8H, v23.8H
+
+  .ifc \type,avg
+        ld1             {v24.8H}, [x0], x3
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v25.8H}, [x0], x3
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v26.8H}, [x0], x3
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v27.8H}, [x0], x3
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v28.8H}, [x0], x3
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v29.8H}, [x0], x3
+        urhadd          v21.8H, v21.8H, v29.8H
+        ld1             {v30.8H}, [x0], x3
+        urhadd          v22.8H, v22.8H, v30.8H
+        ld1             {v31.8H}, [x0], x3
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x3,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x3
+        st1             {v17.8H}, [x0], x3
+        st1             {v18.8H}, [x0], x3
+        st1             {v19.8H}, [x0], x3
+        st1             {v20.8H}, [x0], x3
+        st1             {v21.8H}, [x0], x3
+        st1             {v22.8H}, [x0], x3
+        st1             {v23.8H}, [x0], x3
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_l2_10 put
+        h264_qpel_v_lowpass_l2_10 avg
+
+.macro  h264_qpel8_10   type
+function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel8_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #128
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #16
+        mov             x12, #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel8_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+.endm
+
+        h264_qpel8_10 put
+        h264_qpel8_10 avg
+
+.macro  h264_qpel16_10     type
+function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        b               \type\()_h264_qpel16_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel16_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #512
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #32
+        bl              put_h264_qpel16_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #32
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel16_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+.endm
+
+        h264_qpel16_10 put
+        h264_qpel16_10 avg